aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks202
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/audit.c18
-rw-r--r--kernel/audit_watch.c2
-rw-r--r--kernel/auditsc.c6
-rw-r--r--kernel/capability.c15
-rw-r--r--kernel/cgroup.c1130
-rw-r--r--kernel/cgroup_debug.c105
-rw-r--r--kernel/cgroup_freezer.c15
-rw-r--r--kernel/cpuset.c93
-rw-r--r--kernel/cred.c19
-rw-r--r--kernel/exit.c179
-rw-r--r--kernel/fork.c51
-rw-r--r--kernel/futex.c160
-rw-r--r--kernel/gcov/Kconfig2
-rw-r--r--kernel/hrtimer.c55
-rw-r--r--kernel/hung_task.c6
-rw-r--r--kernel/hw_breakpoint.c423
-rw-r--r--kernel/irq/chip.c6
-rw-r--r--kernel/irq/handle.c1
-rw-r--r--kernel/irq/proc.c40
-rw-r--r--kernel/irq/spurious.c16
-rw-r--r--kernel/kallsyms.c1
-rw-r--r--kernel/kgdb.c2
-rw-r--r--kernel/kmod.c21
-rw-r--r--kernel/kprobes.c76
-rw-r--r--kernel/kthread.c23
-rw-r--r--kernel/lockdep.c22
-rw-r--r--kernel/module.c166
-rw-r--r--kernel/mutex-debug.c1
-rw-r--r--kernel/mutex.c4
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/ns_cgroup.c16
-rw-r--r--kernel/panic.c3
-rw-r--r--kernel/params.c24
-rw-r--r--kernel/perf_event.c967
-rw-r--r--kernel/pid_namespace.c2
-rw-r--r--kernel/power/hibernate.c11
-rw-r--r--kernel/power/suspend_test.c5
-rw-r--r--kernel/power/swap.c44
-rw-r--r--kernel/printk.c7
-rw-r--r--kernel/ptrace.c11
-rw-r--r--kernel/rcupdate.c260
-rw-r--r--kernel/rcutiny.c282
-rw-r--r--kernel/rcutorture.c69
-rw-r--r--kernel/rcutree.c787
-rw-r--r--kernel/rcutree.h160
-rw-r--r--kernel/rcutree_plugin.h442
-rw-r--r--kernel/rcutree_trace.c26
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/res_counter.c3
-rw-r--r--kernel/sched.c404
-rw-r--r--kernel/sched_clock.c4
-rw-r--r--kernel/sched_debug.c4
-rw-r--r--kernel/sched_fair.c143
-rw-r--r--kernel/sched_rt.c61
-rw-r--r--kernel/signal.c241
-rw-r--r--kernel/slow-work-debugfs.c227
-rw-r--r--kernel/slow-work.c524
-rw-r--r--kernel/slow-work.h72
-rw-r--r--kernel/smp.c63
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/softlockup.c4
-rw-r--r--kernel/spinlock.c310
-rw-r--r--kernel/srcu.c74
-rw-r--r--kernel/sys.c58
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c116
-rw-r--r--kernel/sysctl_check.c2
-rw-r--r--kernel/time.c30
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/clocksource.c4
-rw-r--r--kernel/time/tick-sched.c9
-rw-r--r--kernel/time/timeconv.c127
-rw-r--r--kernel/time/timekeeping.c1
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/time/timer_stats.c2
-rw-r--r--kernel/trace/Kconfig38
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/blktrace.c39
-rw-r--r--kernel/trace/ftrace.c437
-rw-r--r--kernel/trace/kmemtrace.c2
-rw-r--r--kernel/trace/ring_buffer.c38
-rw-r--r--kernel/trace/ring_buffer_benchmark.c85
-rw-r--r--kernel/trace/trace.c64
-rw-r--r--kernel/trace/trace.h80
-rw-r--r--kernel/trace/trace_branch.c8
-rw-r--r--kernel/trace/trace_clock.c8
-rw-r--r--kernel/trace/trace_entries.h16
-rw-r--r--kernel/trace/trace_event_profile.c50
-rw-r--r--kernel/trace/trace_events.c198
-rw-r--r--kernel/trace/trace_events_filter.c426
-rw-r--r--kernel/trace/trace_export.c43
-rw-r--r--kernel/trace/trace_hw_branches.c8
-rw-r--r--kernel/trace/trace_kprobe.c1523
-rw-r--r--kernel/trace/trace_ksym.c550
-rw-r--r--kernel/trace/trace_output.c23
-rw-r--r--kernel/trace/trace_selftest.c55
-rw-r--r--kernel/trace/trace_stack.c4
-rw-r--r--kernel/trace/trace_syscalls.c231
-rw-r--r--kernel/uid16.c1
-rw-r--r--kernel/user.c2
-rw-r--r--kernel/utsname_sysctl.c4
-rw-r--r--kernel/workqueue.c35
104 files changed, 9468 insertions, 2977 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
new file mode 100644
index 000000000000..88c92fb44618
--- /dev/null
+++ b/kernel/Kconfig.locks
@@ -0,0 +1,202 @@
1#
2# The ARCH_INLINE foo is necessary because select ignores "depends on"
3#
4config ARCH_INLINE_SPIN_TRYLOCK
5 bool
6
7config ARCH_INLINE_SPIN_TRYLOCK_BH
8 bool
9
10config ARCH_INLINE_SPIN_LOCK
11 bool
12
13config ARCH_INLINE_SPIN_LOCK_BH
14 bool
15
16config ARCH_INLINE_SPIN_LOCK_IRQ
17 bool
18
19config ARCH_INLINE_SPIN_LOCK_IRQSAVE
20 bool
21
22config ARCH_INLINE_SPIN_UNLOCK
23 bool
24
25config ARCH_INLINE_SPIN_UNLOCK_BH
26 bool
27
28config ARCH_INLINE_SPIN_UNLOCK_IRQ
29 bool
30
31config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
32 bool
33
34
35config ARCH_INLINE_READ_TRYLOCK
36 bool
37
38config ARCH_INLINE_READ_LOCK
39 bool
40
41config ARCH_INLINE_READ_LOCK_BH
42 bool
43
44config ARCH_INLINE_READ_LOCK_IRQ
45 bool
46
47config ARCH_INLINE_READ_LOCK_IRQSAVE
48 bool
49
50config ARCH_INLINE_READ_UNLOCK
51 bool
52
53config ARCH_INLINE_READ_UNLOCK_BH
54 bool
55
56config ARCH_INLINE_READ_UNLOCK_IRQ
57 bool
58
59config ARCH_INLINE_READ_UNLOCK_IRQRESTORE
60 bool
61
62
63config ARCH_INLINE_WRITE_TRYLOCK
64 bool
65
66config ARCH_INLINE_WRITE_LOCK
67 bool
68
69config ARCH_INLINE_WRITE_LOCK_BH
70 bool
71
72config ARCH_INLINE_WRITE_LOCK_IRQ
73 bool
74
75config ARCH_INLINE_WRITE_LOCK_IRQSAVE
76 bool
77
78config ARCH_INLINE_WRITE_UNLOCK
79 bool
80
81config ARCH_INLINE_WRITE_UNLOCK_BH
82 bool
83
84config ARCH_INLINE_WRITE_UNLOCK_IRQ
85 bool
86
87config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
88 bool
89
90#
91# lock_* functions are inlined when:
92# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
93#
94# trylock_* functions are inlined when:
95# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
96#
97# unlock and unlock_irq functions are inlined when:
98# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
99# or
100# - DEBUG_SPINLOCK=n and PREEMPT=n
101#
102# unlock_bh and unlock_irqrestore functions are inlined when:
103# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
104#
105
106config INLINE_SPIN_TRYLOCK
107 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
108
109config INLINE_SPIN_TRYLOCK_BH
110 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
111
112config INLINE_SPIN_LOCK
113 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
114
115config INLINE_SPIN_LOCK_BH
116 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
117 ARCH_INLINE_SPIN_LOCK_BH
118
119config INLINE_SPIN_LOCK_IRQ
120 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
121 ARCH_INLINE_SPIN_LOCK_IRQ
122
123config INLINE_SPIN_LOCK_IRQSAVE
124 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
125 ARCH_INLINE_SPIN_LOCK_IRQSAVE
126
127config INLINE_SPIN_UNLOCK
128 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
129
130config INLINE_SPIN_UNLOCK_BH
131 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
132
133config INLINE_SPIN_UNLOCK_IRQ
134 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH)
135
136config INLINE_SPIN_UNLOCK_IRQRESTORE
137 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
138
139
140config INLINE_READ_TRYLOCK
141 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
142
143config INLINE_READ_LOCK
144 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
145
146config INLINE_READ_LOCK_BH
147 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
148 ARCH_INLINE_READ_LOCK_BH
149
150config INLINE_READ_LOCK_IRQ
151 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
152 ARCH_INLINE_READ_LOCK_IRQ
153
154config INLINE_READ_LOCK_IRQSAVE
155 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
156 ARCH_INLINE_READ_LOCK_IRQSAVE
157
158config INLINE_READ_UNLOCK
159 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
160
161config INLINE_READ_UNLOCK_BH
162 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
163
164config INLINE_READ_UNLOCK_IRQ
165 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH)
166
167config INLINE_READ_UNLOCK_IRQRESTORE
168 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
169
170
171config INLINE_WRITE_TRYLOCK
172 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
173
174config INLINE_WRITE_LOCK
175 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
176
177config INLINE_WRITE_LOCK_BH
178 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
179 ARCH_INLINE_WRITE_LOCK_BH
180
181config INLINE_WRITE_LOCK_IRQ
182 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
183 ARCH_INLINE_WRITE_LOCK_IRQ
184
185config INLINE_WRITE_LOCK_IRQSAVE
186 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
187 ARCH_INLINE_WRITE_LOCK_IRQSAVE
188
189config INLINE_WRITE_UNLOCK
190 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
191
192config INLINE_WRITE_UNLOCK_BH
193 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
194
195config INLINE_WRITE_UNLOCK_IRQ
196 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH)
197
198config INLINE_WRITE_UNLOCK_IRQRESTORE
199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
200
201config MUTEX_SPIN_ON_OWNER
202 def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
diff --git a/kernel/Makefile b/kernel/Makefile
index 187c89b4783d..982c50e2ce53 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 21CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_perf_event.o = -pg
24endif 25endif
25 26
26obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
@@ -58,7 +59,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
58obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 59obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
59obj-$(CONFIG_COMPAT) += compat.o 60obj-$(CONFIG_COMPAT) += compat.o
60obj-$(CONFIG_CGROUPS) += cgroup.o 61obj-$(CONFIG_CGROUPS) += cgroup.o
61obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
62obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 62obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
63obj-$(CONFIG_CPUSETS) += cpuset.o 63obj-$(CONFIG_CPUSETS) += cpuset.o
64obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 64obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
@@ -83,6 +83,7 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
83obj-$(CONFIG_TREE_RCU) += rcutree.o 83obj-$(CONFIG_TREE_RCU) += rcutree.o
84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o 84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
86obj-$(CONFIG_TINY_RCU) += rcutiny.o
86obj-$(CONFIG_RELAY) += relay.o 87obj-$(CONFIG_RELAY) += relay.o
87obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 88obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
88obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 89obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -95,7 +96,9 @@ obj-$(CONFIG_X86_DS) += trace/
95obj-$(CONFIG_RING_BUFFER) += trace/ 96obj-$(CONFIG_RING_BUFFER) += trace/
96obj-$(CONFIG_SMP) += sched_cpupri.o 97obj-$(CONFIG_SMP) += sched_cpupri.o
97obj-$(CONFIG_SLOW_WORK) += slow-work.o 98obj-$(CONFIG_SLOW_WORK) += slow-work.o
99obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
98obj-$(CONFIG_PERF_EVENTS) += perf_event.o 100obj-$(CONFIG_PERF_EVENTS) += perf_event.o
101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
99 102
100ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 103ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
101# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 104# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
index defc2e6f1e3b..5feed232be9d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
855 break; 855 break;
856 } 856 }
857 case AUDIT_SIGNAL_INFO: 857 case AUDIT_SIGNAL_INFO:
858 err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); 858 len = 0;
859 if (err) 859 if (audit_sig_sid) {
860 return err; 860 err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
861 if (err)
862 return err;
863 }
861 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); 864 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
862 if (!sig_data) { 865 if (!sig_data) {
863 security_release_secctx(ctx, len); 866 if (audit_sig_sid)
867 security_release_secctx(ctx, len);
864 return -ENOMEM; 868 return -ENOMEM;
865 } 869 }
866 sig_data->uid = audit_sig_uid; 870 sig_data->uid = audit_sig_uid;
867 sig_data->pid = audit_sig_pid; 871 sig_data->pid = audit_sig_pid;
868 memcpy(sig_data->ctx, ctx, len); 872 if (audit_sig_sid) {
869 security_release_secctx(ctx, len); 873 memcpy(sig_data->ctx, ctx, len);
874 security_release_secctx(ctx, len);
875 }
870 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 876 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
871 0, 0, sig_data, sizeof(*sig_data) + len); 877 0, 0, sig_data, sizeof(*sig_data) + len);
872 kfree(sig_data); 878 kfree(sig_data);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 0e96dbc60ea9..cc7e87936cbc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -45,8 +45,8 @@
45 45
46struct audit_watch { 46struct audit_watch {
47 atomic_t count; /* reference count */ 47 atomic_t count; /* reference count */
48 char *path; /* insertion path */
49 dev_t dev; /* associated superblock device */ 48 dev_t dev; /* associated superblock device */
49 char *path; /* insertion path */
50 unsigned long ino; /* associated inode number */ 50 unsigned long ino; /* associated inode number */
51 struct audit_parent *parent; /* associated parent */ 51 struct audit_parent *parent; /* associated parent */
52 struct list_head wlist; /* entry in parent->watches list */ 52 struct list_head wlist; /* entry in parent->watches list */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 68d3c6a0ecd6..267e484f0198 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -168,12 +168,12 @@ struct audit_context {
168 int in_syscall; /* 1 if task is in a syscall */ 168 int in_syscall; /* 1 if task is in a syscall */
169 enum audit_state state, current_state; 169 enum audit_state state, current_state;
170 unsigned int serial; /* serial number for record */ 170 unsigned int serial; /* serial number for record */
171 struct timespec ctime; /* time of syscall entry */
172 int major; /* syscall number */ 171 int major; /* syscall number */
172 struct timespec ctime; /* time of syscall entry */
173 unsigned long argv[4]; /* syscall arguments */ 173 unsigned long argv[4]; /* syscall arguments */
174 int return_valid; /* return code is valid */
175 long return_code;/* syscall return code */ 174 long return_code;/* syscall return code */
176 u64 prio; 175 u64 prio;
176 int return_valid; /* return code is valid */
177 int name_count; 177 int name_count;
178 struct audit_names names[AUDIT_NAMES]; 178 struct audit_names names[AUDIT_NAMES];
179 char * filterkey; /* key for rule that triggered record */ 179 char * filterkey; /* key for rule that triggered record */
@@ -198,8 +198,8 @@ struct audit_context {
198 char target_comm[TASK_COMM_LEN]; 198 char target_comm[TASK_COMM_LEN];
199 199
200 struct audit_tree_refs *trees, *first_trees; 200 struct audit_tree_refs *trees, *first_trees;
201 int tree_count;
202 struct list_head killed_trees; 201 struct list_head killed_trees;
202 int tree_count;
203 203
204 int type; 204 int type;
205 union { 205 union {
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e17041963f5..7f876e60521f 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,7 +29,6 @@ EXPORT_SYMBOL(__cap_empty_set);
29EXPORT_SYMBOL(__cap_full_set); 29EXPORT_SYMBOL(__cap_full_set);
30EXPORT_SYMBOL(__cap_init_eff_set); 30EXPORT_SYMBOL(__cap_init_eff_set);
31 31
32#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
33int file_caps_enabled = 1; 32int file_caps_enabled = 1;
34 33
35static int __init file_caps_disable(char *str) 34static int __init file_caps_disable(char *str)
@@ -38,7 +37,6 @@ static int __init file_caps_disable(char *str)
38 return 1; 37 return 1;
39} 38}
40__setup("no_file_caps", file_caps_disable); 39__setup("no_file_caps", file_caps_disable);
41#endif
42 40
43/* 41/*
44 * More recent versions of libcap are available from: 42 * More recent versions of libcap are available from:
@@ -169,8 +167,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
169 kernel_cap_t pE, pI, pP; 167 kernel_cap_t pE, pI, pP;
170 168
171 ret = cap_validate_magic(header, &tocopy); 169 ret = cap_validate_magic(header, &tocopy);
172 if (ret != 0) 170 if ((dataptr == NULL) || (ret != 0))
173 return ret; 171 return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret;
174 172
175 if (get_user(pid, &header->pid)) 173 if (get_user(pid, &header->pid))
176 return -EFAULT; 174 return -EFAULT;
@@ -238,7 +236,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
238SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) 236SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
239{ 237{
240 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 238 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
241 unsigned i, tocopy; 239 unsigned i, tocopy, copybytes;
242 kernel_cap_t inheritable, permitted, effective; 240 kernel_cap_t inheritable, permitted, effective;
243 struct cred *new; 241 struct cred *new;
244 int ret; 242 int ret;
@@ -255,8 +253,11 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
255 if (pid != 0 && pid != task_pid_vnr(current)) 253 if (pid != 0 && pid != task_pid_vnr(current))
256 return -EPERM; 254 return -EPERM;
257 255
258 if (copy_from_user(&kdata, data, 256 copybytes = tocopy * sizeof(struct __user_cap_data_struct);
259 tocopy * sizeof(struct __user_cap_data_struct))) 257 if (copybytes > sizeof(kdata))
258 return -EFAULT;
259
260 if (copy_from_user(&kdata, data, copybytes))
260 return -EFAULT; 261 return -EFAULT;
261 262
262 for (i = 0; i < tocopy; i++) { 263 for (i = 0; i < tocopy; i++) {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cd83d9933b6b..0249f4be9b5c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/cgroup.h> 25#include <linux/cgroup.h>
26#include <linux/ctype.h>
26#include <linux/errno.h> 27#include <linux/errno.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
@@ -48,6 +49,8 @@
48#include <linux/namei.h> 49#include <linux/namei.h>
49#include <linux/smp_lock.h> 50#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h> 51#include <linux/pid_namespace.h>
52#include <linux/idr.h>
53#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
51 54
52#include <asm/atomic.h> 55#include <asm/atomic.h>
53 56
@@ -60,6 +63,8 @@ static struct cgroup_subsys *subsys[] = {
60#include <linux/cgroup_subsys.h> 63#include <linux/cgroup_subsys.h>
61}; 64};
62 65
66#define MAX_CGROUP_ROOT_NAMELEN 64
67
63/* 68/*
64 * A cgroupfs_root represents the root of a cgroup hierarchy, 69 * A cgroupfs_root represents the root of a cgroup hierarchy,
65 * and may be associated with a superblock to form an active 70 * and may be associated with a superblock to form an active
@@ -74,6 +79,9 @@ struct cgroupfs_root {
74 */ 79 */
75 unsigned long subsys_bits; 80 unsigned long subsys_bits;
76 81
82 /* Unique id for this hierarchy. */
83 int hierarchy_id;
84
77 /* The bitmask of subsystems currently attached to this hierarchy */ 85 /* The bitmask of subsystems currently attached to this hierarchy */
78 unsigned long actual_subsys_bits; 86 unsigned long actual_subsys_bits;
79 87
@@ -94,6 +102,9 @@ struct cgroupfs_root {
94 102
95 /* The path to use for release notifications. */ 103 /* The path to use for release notifications. */
96 char release_agent_path[PATH_MAX]; 104 char release_agent_path[PATH_MAX];
105
106 /* The name for this hierarchy - may be empty */
107 char name[MAX_CGROUP_ROOT_NAMELEN];
97}; 108};
98 109
99/* 110/*
@@ -141,6 +152,10 @@ struct css_id {
141static LIST_HEAD(roots); 152static LIST_HEAD(roots);
142static int root_count; 153static int root_count;
143 154
155static DEFINE_IDA(hierarchy_ida);
156static int next_hierarchy_id;
157static DEFINE_SPINLOCK(hierarchy_id_lock);
158
144/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 159/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
145#define dummytop (&rootnode.top_cgroup) 160#define dummytop (&rootnode.top_cgroup)
146 161
@@ -201,6 +216,7 @@ struct cg_cgroup_link {
201 * cgroup, anchored on cgroup->css_sets 216 * cgroup, anchored on cgroup->css_sets
202 */ 217 */
203 struct list_head cgrp_link_list; 218 struct list_head cgrp_link_list;
219 struct cgroup *cgrp;
204 /* 220 /*
205 * List running through cg_cgroup_links pointing at a 221 * List running through cg_cgroup_links pointing at a
206 * single css_set object, anchored on css_set->cg_links 222 * single css_set object, anchored on css_set->cg_links
@@ -227,8 +243,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
227static DEFINE_RWLOCK(css_set_lock); 243static DEFINE_RWLOCK(css_set_lock);
228static int css_set_count; 244static int css_set_count;
229 245
230/* hash table for cgroup groups. This improves the performance to 246/*
231 * find an existing css_set */ 247 * hash table for cgroup groups. This improves the performance to find
248 * an existing css_set. This hash doesn't (currently) take into
249 * account cgroups in empty hierarchies.
250 */
232#define CSS_SET_HASH_BITS 7 251#define CSS_SET_HASH_BITS 7
233#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) 252#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
234static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; 253static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
@@ -248,48 +267,22 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
248 return &css_set_table[index]; 267 return &css_set_table[index];
249} 268}
250 269
270static void free_css_set_rcu(struct rcu_head *obj)
271{
272 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
273 kfree(cg);
274}
275
251/* We don't maintain the lists running through each css_set to its 276/* We don't maintain the lists running through each css_set to its
252 * task until after the first call to cgroup_iter_start(). This 277 * task until after the first call to cgroup_iter_start(). This
253 * reduces the fork()/exit() overhead for people who have cgroups 278 * reduces the fork()/exit() overhead for people who have cgroups
254 * compiled into their kernel but not actually in use */ 279 * compiled into their kernel but not actually in use */
255static int use_task_css_set_links __read_mostly; 280static int use_task_css_set_links __read_mostly;
256 281
257/* When we create or destroy a css_set, the operation simply 282static void __put_css_set(struct css_set *cg, int taskexit)
258 * takes/releases a reference count on all the cgroups referenced
259 * by subsystems in this css_set. This can end up multiple-counting
260 * some cgroups, but that's OK - the ref-count is just a
261 * busy/not-busy indicator; ensuring that we only count each cgroup
262 * once would require taking a global lock to ensure that no
263 * subsystems moved between hierarchies while we were doing so.
264 *
265 * Possible TODO: decide at boot time based on the number of
266 * registered subsystems and the number of CPUs or NUMA nodes whether
267 * it's better for performance to ref-count every subsystem, or to
268 * take a global lock and only add one ref count to each hierarchy.
269 */
270
271/*
272 * unlink a css_set from the list and free it
273 */
274static void unlink_css_set(struct css_set *cg)
275{ 283{
276 struct cg_cgroup_link *link; 284 struct cg_cgroup_link *link;
277 struct cg_cgroup_link *saved_link; 285 struct cg_cgroup_link *saved_link;
278
279 hlist_del(&cg->hlist);
280 css_set_count--;
281
282 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
283 cg_link_list) {
284 list_del(&link->cg_link_list);
285 list_del(&link->cgrp_link_list);
286 kfree(link);
287 }
288}
289
290static void __put_css_set(struct css_set *cg, int taskexit)
291{
292 int i;
293 /* 286 /*
294 * Ensure that the refcount doesn't hit zero while any readers 287 * Ensure that the refcount doesn't hit zero while any readers
295 * can see it. Similar to atomic_dec_and_lock(), but for an 288 * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -302,21 +295,28 @@ static void __put_css_set(struct css_set *cg, int taskexit)
302 write_unlock(&css_set_lock); 295 write_unlock(&css_set_lock);
303 return; 296 return;
304 } 297 }
305 unlink_css_set(cg);
306 write_unlock(&css_set_lock);
307 298
308 rcu_read_lock(); 299 /* This css_set is dead. unlink it and release cgroup refcounts */
309 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 300 hlist_del(&cg->hlist);
310 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); 301 css_set_count--;
302
303 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
304 cg_link_list) {
305 struct cgroup *cgrp = link->cgrp;
306 list_del(&link->cg_link_list);
307 list_del(&link->cgrp_link_list);
311 if (atomic_dec_and_test(&cgrp->count) && 308 if (atomic_dec_and_test(&cgrp->count) &&
312 notify_on_release(cgrp)) { 309 notify_on_release(cgrp)) {
313 if (taskexit) 310 if (taskexit)
314 set_bit(CGRP_RELEASABLE, &cgrp->flags); 311 set_bit(CGRP_RELEASABLE, &cgrp->flags);
315 check_for_release(cgrp); 312 check_for_release(cgrp);
316 } 313 }
314
315 kfree(link);
317 } 316 }
318 rcu_read_unlock(); 317
319 kfree(cg); 318 write_unlock(&css_set_lock);
319 call_rcu(&cg->rcu_head, free_css_set_rcu);
320} 320}
321 321
322/* 322/*
@@ -338,6 +338,78 @@ static inline void put_css_set_taskexit(struct css_set *cg)
338} 338}
339 339
340/* 340/*
341 * compare_css_sets - helper function for find_existing_css_set().
342 * @cg: candidate css_set being tested
343 * @old_cg: existing css_set for a task
344 * @new_cgrp: cgroup that's being entered by the task
345 * @template: desired set of css pointers in css_set (pre-calculated)
346 *
347 * Returns true if "cg" matches "old_cg" except for the hierarchy
348 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
349 */
350static bool compare_css_sets(struct css_set *cg,
351 struct css_set *old_cg,
352 struct cgroup *new_cgrp,
353 struct cgroup_subsys_state *template[])
354{
355 struct list_head *l1, *l2;
356
357 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
358 /* Not all subsystems matched */
359 return false;
360 }
361
362 /*
363 * Compare cgroup pointers in order to distinguish between
364 * different cgroups in heirarchies with no subsystems. We
365 * could get by with just this check alone (and skip the
366 * memcmp above) but on most setups the memcmp check will
367 * avoid the need for this more expensive check on almost all
368 * candidates.
369 */
370
371 l1 = &cg->cg_links;
372 l2 = &old_cg->cg_links;
373 while (1) {
374 struct cg_cgroup_link *cgl1, *cgl2;
375 struct cgroup *cg1, *cg2;
376
377 l1 = l1->next;
378 l2 = l2->next;
379 /* See if we reached the end - both lists are equal length. */
380 if (l1 == &cg->cg_links) {
381 BUG_ON(l2 != &old_cg->cg_links);
382 break;
383 } else {
384 BUG_ON(l2 == &old_cg->cg_links);
385 }
386 /* Locate the cgroups associated with these links. */
387 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
388 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
389 cg1 = cgl1->cgrp;
390 cg2 = cgl2->cgrp;
391 /* Hierarchies should be linked in the same order. */
392 BUG_ON(cg1->root != cg2->root);
393
394 /*
395 * If this hierarchy is the hierarchy of the cgroup
396 * that's changing, then we need to check that this
397 * css_set points to the new cgroup; if it's any other
398 * hierarchy, then this css_set should point to the
399 * same cgroup as the old css_set.
400 */
401 if (cg1->root == new_cgrp->root) {
402 if (cg1 != new_cgrp)
403 return false;
404 } else {
405 if (cg1 != cg2)
406 return false;
407 }
408 }
409 return true;
410}
411
412/*
341 * find_existing_css_set() is a helper for 413 * find_existing_css_set() is a helper for
342 * find_css_set(), and checks to see whether an existing 414 * find_css_set(), and checks to see whether an existing
343 * css_set is suitable. 415 * css_set is suitable.
@@ -378,10 +450,11 @@ static struct css_set *find_existing_css_set(
378 450
379 hhead = css_set_hash(template); 451 hhead = css_set_hash(template);
380 hlist_for_each_entry(cg, node, hhead, hlist) { 452 hlist_for_each_entry(cg, node, hhead, hlist) {
381 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { 453 if (!compare_css_sets(cg, oldcg, cgrp, template))
382 /* All subsystems matched */ 454 continue;
383 return cg; 455
384 } 456 /* This css_set matches what we need */
457 return cg;
385 } 458 }
386 459
387 /* No existing cgroup group matched */ 460 /* No existing cgroup group matched */
@@ -435,8 +508,14 @@ static void link_css_set(struct list_head *tmp_cg_links,
435 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, 508 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
436 cgrp_link_list); 509 cgrp_link_list);
437 link->cg = cg; 510 link->cg = cg;
511 link->cgrp = cgrp;
512 atomic_inc(&cgrp->count);
438 list_move(&link->cgrp_link_list, &cgrp->css_sets); 513 list_move(&link->cgrp_link_list, &cgrp->css_sets);
439 list_add(&link->cg_link_list, &cg->cg_links); 514 /*
515 * Always add links to the tail of the list so that the list
516 * is sorted by order of hierarchy creation
517 */
518 list_add_tail(&link->cg_link_list, &cg->cg_links);
440} 519}
441 520
442/* 521/*
@@ -451,11 +530,11 @@ static struct css_set *find_css_set(
451{ 530{
452 struct css_set *res; 531 struct css_set *res;
453 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 532 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
454 int i;
455 533
456 struct list_head tmp_cg_links; 534 struct list_head tmp_cg_links;
457 535
458 struct hlist_head *hhead; 536 struct hlist_head *hhead;
537 struct cg_cgroup_link *link;
459 538
460 /* First see if we already have a cgroup group that matches 539 /* First see if we already have a cgroup group that matches
461 * the desired set */ 540 * the desired set */
@@ -489,20 +568,12 @@ static struct css_set *find_css_set(
489 568
490 write_lock(&css_set_lock); 569 write_lock(&css_set_lock);
491 /* Add reference counts and links from the new css_set. */ 570 /* Add reference counts and links from the new css_set. */
492 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 571 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
493 struct cgroup *cgrp = res->subsys[i]->cgroup; 572 struct cgroup *c = link->cgrp;
494 struct cgroup_subsys *ss = subsys[i]; 573 if (c->root == cgrp->root)
495 atomic_inc(&cgrp->count); 574 c = cgrp;
496 /* 575 link_css_set(&tmp_cg_links, res, c);
497 * We want to add a link once per cgroup, so we
498 * only do it for the first subsystem in each
499 * hierarchy
500 */
501 if (ss->root->subsys_list.next == &ss->sibling)
502 link_css_set(&tmp_cg_links, res, cgrp);
503 } 576 }
504 if (list_empty(&rootnode.subsys_list))
505 link_css_set(&tmp_cg_links, res, dummytop);
506 577
507 BUG_ON(!list_empty(&tmp_cg_links)); 578 BUG_ON(!list_empty(&tmp_cg_links));
508 579
@@ -518,6 +589,41 @@ static struct css_set *find_css_set(
518} 589}
519 590
520/* 591/*
592 * Return the cgroup for "task" from the given hierarchy. Must be
593 * called with cgroup_mutex held.
594 */
595static struct cgroup *task_cgroup_from_root(struct task_struct *task,
596 struct cgroupfs_root *root)
597{
598 struct css_set *css;
599 struct cgroup *res = NULL;
600
601 BUG_ON(!mutex_is_locked(&cgroup_mutex));
602 read_lock(&css_set_lock);
603 /*
604 * No need to lock the task - since we hold cgroup_mutex the
605 * task can't change groups, so the only thing that can happen
606 * is that it exits and its css is set back to init_css_set.
607 */
608 css = task->cgroups;
609 if (css == &init_css_set) {
610 res = &root->top_cgroup;
611 } else {
612 struct cg_cgroup_link *link;
613 list_for_each_entry(link, &css->cg_links, cg_link_list) {
614 struct cgroup *c = link->cgrp;
615 if (c->root == root) {
616 res = c;
617 break;
618 }
619 }
620 }
621 read_unlock(&css_set_lock);
622 BUG_ON(!res);
623 return res;
624}
625
626/*
521 * There is one global cgroup mutex. We also require taking 627 * There is one global cgroup mutex. We also require taking
522 * task_lock() when dereferencing a task's cgroup subsys pointers. 628 * task_lock() when dereferencing a task's cgroup subsys pointers.
523 * See "The task_lock() exception", at the end of this comment. 629 * See "The task_lock() exception", at the end of this comment.
@@ -597,7 +703,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
597static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 703static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
598static int cgroup_populate_dir(struct cgroup *cgrp); 704static int cgroup_populate_dir(struct cgroup *cgrp);
599static const struct inode_operations cgroup_dir_inode_operations; 705static const struct inode_operations cgroup_dir_inode_operations;
600static struct file_operations proc_cgroupstats_operations; 706static const struct file_operations proc_cgroupstats_operations;
601 707
602static struct backing_dev_info cgroup_backing_dev_info = { 708static struct backing_dev_info cgroup_backing_dev_info = {
603 .name = "cgroup", 709 .name = "cgroup",
@@ -677,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
677 */ 783 */
678 deactivate_super(cgrp->root->sb); 784 deactivate_super(cgrp->root->sb);
679 785
786 /*
787 * if we're getting rid of the cgroup, refcount should ensure
788 * that there are no pidlists left.
789 */
790 BUG_ON(!list_empty(&cgrp->pidlists));
791
680 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 792 call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
681 } 793 }
682 iput(inode); 794 iput(inode);
@@ -841,6 +953,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
841 seq_puts(seq, ",noprefix"); 953 seq_puts(seq, ",noprefix");
842 if (strlen(root->release_agent_path)) 954 if (strlen(root->release_agent_path))
843 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 955 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
956 if (strlen(root->name))
957 seq_printf(seq, ",name=%s", root->name);
844 mutex_unlock(&cgroup_mutex); 958 mutex_unlock(&cgroup_mutex);
845 return 0; 959 return 0;
846} 960}
@@ -849,6 +963,12 @@ struct cgroup_sb_opts {
849 unsigned long subsys_bits; 963 unsigned long subsys_bits;
850 unsigned long flags; 964 unsigned long flags;
851 char *release_agent; 965 char *release_agent;
966 char *name;
967 /* User explicitly requested empty subsystem */
968 bool none;
969
970 struct cgroupfs_root *new_root;
971
852}; 972};
853 973
854/* Convert a hierarchy specifier into a bitmask of subsystems and 974/* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -863,9 +983,7 @@ static int parse_cgroupfs_options(char *data,
863 mask = ~(1UL << cpuset_subsys_id); 983 mask = ~(1UL << cpuset_subsys_id);
864#endif 984#endif
865 985
866 opts->subsys_bits = 0; 986 memset(opts, 0, sizeof(*opts));
867 opts->flags = 0;
868 opts->release_agent = NULL;
869 987
870 while ((token = strsep(&o, ",")) != NULL) { 988 while ((token = strsep(&o, ",")) != NULL) {
871 if (!*token) 989 if (!*token)
@@ -879,17 +997,42 @@ static int parse_cgroupfs_options(char *data,
879 if (!ss->disabled) 997 if (!ss->disabled)
880 opts->subsys_bits |= 1ul << i; 998 opts->subsys_bits |= 1ul << i;
881 } 999 }
1000 } else if (!strcmp(token, "none")) {
1001 /* Explicitly have no subsystems */
1002 opts->none = true;
882 } else if (!strcmp(token, "noprefix")) { 1003 } else if (!strcmp(token, "noprefix")) {
883 set_bit(ROOT_NOPREFIX, &opts->flags); 1004 set_bit(ROOT_NOPREFIX, &opts->flags);
884 } else if (!strncmp(token, "release_agent=", 14)) { 1005 } else if (!strncmp(token, "release_agent=", 14)) {
885 /* Specifying two release agents is forbidden */ 1006 /* Specifying two release agents is forbidden */
886 if (opts->release_agent) 1007 if (opts->release_agent)
887 return -EINVAL; 1008 return -EINVAL;
888 opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); 1009 opts->release_agent =
1010 kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
889 if (!opts->release_agent) 1011 if (!opts->release_agent)
890 return -ENOMEM; 1012 return -ENOMEM;
891 strncpy(opts->release_agent, token + 14, PATH_MAX - 1); 1013 } else if (!strncmp(token, "name=", 5)) {
892 opts->release_agent[PATH_MAX - 1] = 0; 1014 int i;
1015 const char *name = token + 5;
1016 /* Can't specify an empty name */
1017 if (!strlen(name))
1018 return -EINVAL;
1019 /* Must match [\w.-]+ */
1020 for (i = 0; i < strlen(name); i++) {
1021 char c = name[i];
1022 if (isalnum(c))
1023 continue;
1024 if ((c == '.') || (c == '-') || (c == '_'))
1025 continue;
1026 return -EINVAL;
1027 }
1028 /* Specifying two names is forbidden */
1029 if (opts->name)
1030 return -EINVAL;
1031 opts->name = kstrndup(name,
1032 MAX_CGROUP_ROOT_NAMELEN,
1033 GFP_KERNEL);
1034 if (!opts->name)
1035 return -ENOMEM;
893 } else { 1036 } else {
894 struct cgroup_subsys *ss; 1037 struct cgroup_subsys *ss;
895 int i; 1038 int i;
@@ -906,6 +1049,8 @@ static int parse_cgroupfs_options(char *data,
906 } 1049 }
907 } 1050 }
908 1051
1052 /* Consistency checks */
1053
909 /* 1054 /*
910 * Option noprefix was introduced just for backward compatibility 1055 * Option noprefix was introduced just for backward compatibility
911 * with the old cpuset, so we allow noprefix only if mounting just 1056 * with the old cpuset, so we allow noprefix only if mounting just
@@ -915,8 +1060,16 @@ static int parse_cgroupfs_options(char *data,
915 (opts->subsys_bits & mask)) 1060 (opts->subsys_bits & mask))
916 return -EINVAL; 1061 return -EINVAL;
917 1062
918 /* We can't have an empty hierarchy */ 1063
919 if (!opts->subsys_bits) 1064 /* Can't specify "none" and some subsystems */
1065 if (opts->subsys_bits && opts->none)
1066 return -EINVAL;
1067
1068 /*
1069 * We either have to specify by name or by subsystems. (So all
1070 * empty hierarchies must have a name).
1071 */
1072 if (!opts->subsys_bits && !opts->name)
920 return -EINVAL; 1073 return -EINVAL;
921 1074
922 return 0; 1075 return 0;
@@ -944,6 +1097,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
944 goto out_unlock; 1097 goto out_unlock;
945 } 1098 }
946 1099
1100 /* Don't allow name to change at remount */
1101 if (opts.name && strcmp(opts.name, root->name)) {
1102 ret = -EINVAL;
1103 goto out_unlock;
1104 }
1105
947 ret = rebind_subsystems(root, opts.subsys_bits); 1106 ret = rebind_subsystems(root, opts.subsys_bits);
948 if (ret) 1107 if (ret)
949 goto out_unlock; 1108 goto out_unlock;
@@ -955,6 +1114,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
955 strcpy(root->release_agent_path, opts.release_agent); 1114 strcpy(root->release_agent_path, opts.release_agent);
956 out_unlock: 1115 out_unlock:
957 kfree(opts.release_agent); 1116 kfree(opts.release_agent);
1117 kfree(opts.name);
958 mutex_unlock(&cgroup_mutex); 1118 mutex_unlock(&cgroup_mutex);
959 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1119 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
960 unlock_kernel(); 1120 unlock_kernel();
@@ -974,9 +1134,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
974 INIT_LIST_HEAD(&cgrp->children); 1134 INIT_LIST_HEAD(&cgrp->children);
975 INIT_LIST_HEAD(&cgrp->css_sets); 1135 INIT_LIST_HEAD(&cgrp->css_sets);
976 INIT_LIST_HEAD(&cgrp->release_list); 1136 INIT_LIST_HEAD(&cgrp->release_list);
977 INIT_LIST_HEAD(&cgrp->pids_list); 1137 INIT_LIST_HEAD(&cgrp->pidlists);
978 init_rwsem(&cgrp->pids_mutex); 1138 mutex_init(&cgrp->pidlist_mutex);
979} 1139}
1140
980static void init_cgroup_root(struct cgroupfs_root *root) 1141static void init_cgroup_root(struct cgroupfs_root *root)
981{ 1142{
982 struct cgroup *cgrp = &root->top_cgroup; 1143 struct cgroup *cgrp = &root->top_cgroup;
@@ -988,33 +1149,106 @@ static void init_cgroup_root(struct cgroupfs_root *root)
988 init_cgroup_housekeeping(cgrp); 1149 init_cgroup_housekeeping(cgrp);
989} 1150}
990 1151
1152static bool init_root_id(struct cgroupfs_root *root)
1153{
1154 int ret = 0;
1155
1156 do {
1157 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
1158 return false;
1159 spin_lock(&hierarchy_id_lock);
1160 /* Try to allocate the next unused ID */
1161 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
1162 &root->hierarchy_id);
1163 if (ret == -ENOSPC)
1164 /* Try again starting from 0 */
1165 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
1166 if (!ret) {
1167 next_hierarchy_id = root->hierarchy_id + 1;
1168 } else if (ret != -EAGAIN) {
1169 /* Can only get here if the 31-bit IDR is full ... */
1170 BUG_ON(ret);
1171 }
1172 spin_unlock(&hierarchy_id_lock);
1173 } while (ret);
1174 return true;
1175}
1176
991static int cgroup_test_super(struct super_block *sb, void *data) 1177static int cgroup_test_super(struct super_block *sb, void *data)
992{ 1178{
993 struct cgroupfs_root *new = data; 1179 struct cgroup_sb_opts *opts = data;
994 struct cgroupfs_root *root = sb->s_fs_info; 1180 struct cgroupfs_root *root = sb->s_fs_info;
995 1181
996 /* First check subsystems */ 1182 /* If we asked for a name then it must match */
997 if (new->subsys_bits != root->subsys_bits) 1183 if (opts->name && strcmp(opts->name, root->name))
998 return 0; 1184 return 0;
999 1185
1000 /* Next check flags */ 1186 /*
1001 if (new->flags != root->flags) 1187 * If we asked for subsystems (or explicitly for no
1188 * subsystems) then they must match
1189 */
1190 if ((opts->subsys_bits || opts->none)
1191 && (opts->subsys_bits != root->subsys_bits))
1002 return 0; 1192 return 0;
1003 1193
1004 return 1; 1194 return 1;
1005} 1195}
1006 1196
1197static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1198{
1199 struct cgroupfs_root *root;
1200
1201 if (!opts->subsys_bits && !opts->none)
1202 return NULL;
1203
1204 root = kzalloc(sizeof(*root), GFP_KERNEL);
1205 if (!root)
1206 return ERR_PTR(-ENOMEM);
1207
1208 if (!init_root_id(root)) {
1209 kfree(root);
1210 return ERR_PTR(-ENOMEM);
1211 }
1212 init_cgroup_root(root);
1213
1214 root->subsys_bits = opts->subsys_bits;
1215 root->flags = opts->flags;
1216 if (opts->release_agent)
1217 strcpy(root->release_agent_path, opts->release_agent);
1218 if (opts->name)
1219 strcpy(root->name, opts->name);
1220 return root;
1221}
1222
1223static void cgroup_drop_root(struct cgroupfs_root *root)
1224{
1225 if (!root)
1226 return;
1227
1228 BUG_ON(!root->hierarchy_id);
1229 spin_lock(&hierarchy_id_lock);
1230 ida_remove(&hierarchy_ida, root->hierarchy_id);
1231 spin_unlock(&hierarchy_id_lock);
1232 kfree(root);
1233}
1234
1007static int cgroup_set_super(struct super_block *sb, void *data) 1235static int cgroup_set_super(struct super_block *sb, void *data)
1008{ 1236{
1009 int ret; 1237 int ret;
1010 struct cgroupfs_root *root = data; 1238 struct cgroup_sb_opts *opts = data;
1239
1240 /* If we don't have a new root, we can't set up a new sb */
1241 if (!opts->new_root)
1242 return -EINVAL;
1243
1244 BUG_ON(!opts->subsys_bits && !opts->none);
1011 1245
1012 ret = set_anon_super(sb, NULL); 1246 ret = set_anon_super(sb, NULL);
1013 if (ret) 1247 if (ret)
1014 return ret; 1248 return ret;
1015 1249
1016 sb->s_fs_info = root; 1250 sb->s_fs_info = opts->new_root;
1017 root->sb = sb; 1251 opts->new_root->sb = sb;
1018 1252
1019 sb->s_blocksize = PAGE_CACHE_SIZE; 1253 sb->s_blocksize = PAGE_CACHE_SIZE;
1020 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1254 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
@@ -1051,48 +1285,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1051 void *data, struct vfsmount *mnt) 1285 void *data, struct vfsmount *mnt)
1052{ 1286{
1053 struct cgroup_sb_opts opts; 1287 struct cgroup_sb_opts opts;
1288 struct cgroupfs_root *root;
1054 int ret = 0; 1289 int ret = 0;
1055 struct super_block *sb; 1290 struct super_block *sb;
1056 struct cgroupfs_root *root; 1291 struct cgroupfs_root *new_root;
1057 struct list_head tmp_cg_links;
1058 1292
1059 /* First find the desired set of subsystems */ 1293 /* First find the desired set of subsystems */
1060 ret = parse_cgroupfs_options(data, &opts); 1294 ret = parse_cgroupfs_options(data, &opts);
1061 if (ret) { 1295 if (ret)
1062 kfree(opts.release_agent); 1296 goto out_err;
1063 return ret;
1064 }
1065
1066 root = kzalloc(sizeof(*root), GFP_KERNEL);
1067 if (!root) {
1068 kfree(opts.release_agent);
1069 return -ENOMEM;
1070 }
1071 1297
1072 init_cgroup_root(root); 1298 /*
1073 root->subsys_bits = opts.subsys_bits; 1299 * Allocate a new cgroup root. We may not need it if we're
1074 root->flags = opts.flags; 1300 * reusing an existing hierarchy.
1075 if (opts.release_agent) { 1301 */
1076 strcpy(root->release_agent_path, opts.release_agent); 1302 new_root = cgroup_root_from_opts(&opts);
1077 kfree(opts.release_agent); 1303 if (IS_ERR(new_root)) {
1304 ret = PTR_ERR(new_root);
1305 goto out_err;
1078 } 1306 }
1307 opts.new_root = new_root;
1079 1308
1080 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); 1309 /* Locate an existing or new sb for this hierarchy */
1081 1310 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
1082 if (IS_ERR(sb)) { 1311 if (IS_ERR(sb)) {
1083 kfree(root); 1312 ret = PTR_ERR(sb);
1084 return PTR_ERR(sb); 1313 cgroup_drop_root(opts.new_root);
1314 goto out_err;
1085 } 1315 }
1086 1316
1087 if (sb->s_fs_info != root) { 1317 root = sb->s_fs_info;
1088 /* Reusing an existing superblock */ 1318 BUG_ON(!root);
1089 BUG_ON(sb->s_root == NULL); 1319 if (root == opts.new_root) {
1090 kfree(root); 1320 /* We used the new root structure, so this is a new hierarchy */
1091 root = NULL; 1321 struct list_head tmp_cg_links;
1092 } else {
1093 /* New superblock */
1094 struct cgroup *root_cgrp = &root->top_cgroup; 1322 struct cgroup *root_cgrp = &root->top_cgroup;
1095 struct inode *inode; 1323 struct inode *inode;
1324 struct cgroupfs_root *existing_root;
1096 int i; 1325 int i;
1097 1326
1098 BUG_ON(sb->s_root != NULL); 1327 BUG_ON(sb->s_root != NULL);
@@ -1105,6 +1334,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1105 mutex_lock(&inode->i_mutex); 1334 mutex_lock(&inode->i_mutex);
1106 mutex_lock(&cgroup_mutex); 1335 mutex_lock(&cgroup_mutex);
1107 1336
1337 if (strlen(root->name)) {
1338 /* Check for name clashes with existing mounts */
1339 for_each_active_root(existing_root) {
1340 if (!strcmp(existing_root->name, root->name)) {
1341 ret = -EBUSY;
1342 mutex_unlock(&cgroup_mutex);
1343 mutex_unlock(&inode->i_mutex);
1344 goto drop_new_super;
1345 }
1346 }
1347 }
1348
1108 /* 1349 /*
1109 * We're accessing css_set_count without locking 1350 * We're accessing css_set_count without locking
1110 * css_set_lock here, but that's OK - it can only be 1351 * css_set_lock here, but that's OK - it can only be
@@ -1123,7 +1364,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1123 if (ret == -EBUSY) { 1364 if (ret == -EBUSY) {
1124 mutex_unlock(&cgroup_mutex); 1365 mutex_unlock(&cgroup_mutex);
1125 mutex_unlock(&inode->i_mutex); 1366 mutex_unlock(&inode->i_mutex);
1126 goto free_cg_links; 1367 free_cg_links(&tmp_cg_links);
1368 goto drop_new_super;
1127 } 1369 }
1128 1370
1129 /* EBUSY should be the only error here */ 1371 /* EBUSY should be the only error here */
@@ -1155,17 +1397,27 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1155 BUG_ON(root->number_of_cgroups != 1); 1397 BUG_ON(root->number_of_cgroups != 1);
1156 1398
1157 cgroup_populate_dir(root_cgrp); 1399 cgroup_populate_dir(root_cgrp);
1158 mutex_unlock(&inode->i_mutex);
1159 mutex_unlock(&cgroup_mutex); 1400 mutex_unlock(&cgroup_mutex);
1401 mutex_unlock(&inode->i_mutex);
1402 } else {
1403 /*
1404 * We re-used an existing hierarchy - the new root (if
1405 * any) is not needed
1406 */
1407 cgroup_drop_root(opts.new_root);
1160 } 1408 }
1161 1409
1162 simple_set_mnt(mnt, sb); 1410 simple_set_mnt(mnt, sb);
1411 kfree(opts.release_agent);
1412 kfree(opts.name);
1163 return 0; 1413 return 0;
1164 1414
1165 free_cg_links:
1166 free_cg_links(&tmp_cg_links);
1167 drop_new_super: 1415 drop_new_super:
1168 deactivate_locked_super(sb); 1416 deactivate_locked_super(sb);
1417 out_err:
1418 kfree(opts.release_agent);
1419 kfree(opts.name);
1420
1169 return ret; 1421 return ret;
1170} 1422}
1171 1423
@@ -1211,7 +1463,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1211 mutex_unlock(&cgroup_mutex); 1463 mutex_unlock(&cgroup_mutex);
1212 1464
1213 kill_litter_super(sb); 1465 kill_litter_super(sb);
1214 kfree(root); 1466 cgroup_drop_root(root);
1215} 1467}
1216 1468
1217static struct file_system_type cgroup_fs_type = { 1469static struct file_system_type cgroup_fs_type = {
@@ -1276,27 +1528,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1276 return 0; 1528 return 0;
1277} 1529}
1278 1530
1279/*
1280 * Return the first subsystem attached to a cgroup's hierarchy, and
1281 * its subsystem id.
1282 */
1283
1284static void get_first_subsys(const struct cgroup *cgrp,
1285 struct cgroup_subsys_state **css, int *subsys_id)
1286{
1287 const struct cgroupfs_root *root = cgrp->root;
1288 const struct cgroup_subsys *test_ss;
1289 BUG_ON(list_empty(&root->subsys_list));
1290 test_ss = list_entry(root->subsys_list.next,
1291 struct cgroup_subsys, sibling);
1292 if (css) {
1293 *css = cgrp->subsys[test_ss->subsys_id];
1294 BUG_ON(!*css);
1295 }
1296 if (subsys_id)
1297 *subsys_id = test_ss->subsys_id;
1298}
1299
1300/** 1531/**
1301 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1532 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1302 * @cgrp: the cgroup the task is attaching to 1533 * @cgrp: the cgroup the task is attaching to
@@ -1313,18 +1544,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1313 struct css_set *cg; 1544 struct css_set *cg;
1314 struct css_set *newcg; 1545 struct css_set *newcg;
1315 struct cgroupfs_root *root = cgrp->root; 1546 struct cgroupfs_root *root = cgrp->root;
1316 int subsys_id;
1317
1318 get_first_subsys(cgrp, NULL, &subsys_id);
1319 1547
1320 /* Nothing to do if the task is already in that cgroup */ 1548 /* Nothing to do if the task is already in that cgroup */
1321 oldcgrp = task_cgroup(tsk, subsys_id); 1549 oldcgrp = task_cgroup_from_root(tsk, root);
1322 if (cgrp == oldcgrp) 1550 if (cgrp == oldcgrp)
1323 return 0; 1551 return 0;
1324 1552
1325 for_each_subsys(root, ss) { 1553 for_each_subsys(root, ss) {
1326 if (ss->can_attach) { 1554 if (ss->can_attach) {
1327 retval = ss->can_attach(ss, cgrp, tsk); 1555 retval = ss->can_attach(ss, cgrp, tsk, false);
1328 if (retval) 1556 if (retval)
1329 return retval; 1557 return retval;
1330 } 1558 }
@@ -1362,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1362 1590
1363 for_each_subsys(root, ss) { 1591 for_each_subsys(root, ss) {
1364 if (ss->attach) 1592 if (ss->attach)
1365 ss->attach(ss, cgrp, oldcgrp, tsk); 1593 ss->attach(ss, cgrp, oldcgrp, tsk, false);
1366 } 1594 }
1367 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1595 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1368 synchronize_rcu(); 1596 synchronize_rcu();
@@ -1423,15 +1651,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1423 return ret; 1651 return ret;
1424} 1652}
1425 1653
1426/* The various types of files and directories in a cgroup file system */
1427enum cgroup_filetype {
1428 FILE_ROOT,
1429 FILE_DIR,
1430 FILE_TASKLIST,
1431 FILE_NOTIFY_ON_RELEASE,
1432 FILE_RELEASE_AGENT,
1433};
1434
1435/** 1654/**
1436 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 1655 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
1437 * @cgrp: the cgroup to be checked for liveness 1656 * @cgrp: the cgroup to be checked for liveness
@@ -1491,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1491 return -EFAULT; 1710 return -EFAULT;
1492 1711
1493 buffer[nbytes] = 0; /* nul-terminate */ 1712 buffer[nbytes] = 0; /* nul-terminate */
1494 strstrip(buffer);
1495 if (cft->write_u64) { 1713 if (cft->write_u64) {
1496 u64 val = simple_strtoull(buffer, &end, 0); 1714 u64 val = simple_strtoull(strstrip(buffer), &end, 0);
1497 if (*end) 1715 if (*end)
1498 return -EINVAL; 1716 return -EINVAL;
1499 retval = cft->write_u64(cgrp, cft, val); 1717 retval = cft->write_u64(cgrp, cft, val);
1500 } else { 1718 } else {
1501 s64 val = simple_strtoll(buffer, &end, 0); 1719 s64 val = simple_strtoll(strstrip(buffer), &end, 0);
1502 if (*end) 1720 if (*end)
1503 return -EINVAL; 1721 return -EINVAL;
1504 retval = cft->write_s64(cgrp, cft, val); 1722 retval = cft->write_s64(cgrp, cft, val);
@@ -1534,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
1534 } 1752 }
1535 1753
1536 buffer[nbytes] = 0; /* nul-terminate */ 1754 buffer[nbytes] = 0; /* nul-terminate */
1537 strstrip(buffer); 1755 retval = cft->write_string(cgrp, cft, strstrip(buffer));
1538 retval = cft->write_string(cgrp, cft, buffer);
1539 if (!retval) 1756 if (!retval)
1540 retval = nbytes; 1757 retval = nbytes;
1541out: 1758out:
@@ -1644,7 +1861,7 @@ static int cgroup_seqfile_release(struct inode *inode, struct file *file)
1644 return single_release(inode, file); 1861 return single_release(inode, file);
1645} 1862}
1646 1863
1647static struct file_operations cgroup_seqfile_operations = { 1864static const struct file_operations cgroup_seqfile_operations = {
1648 .read = seq_read, 1865 .read = seq_read,
1649 .write = cgroup_file_write, 1866 .write = cgroup_file_write,
1650 .llseek = seq_lseek, 1867 .llseek = seq_lseek,
@@ -1703,7 +1920,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
1703 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 1920 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1704} 1921}
1705 1922
1706static struct file_operations cgroup_file_operations = { 1923static const struct file_operations cgroup_file_operations = {
1707 .read = cgroup_file_read, 1924 .read = cgroup_file_read,
1708 .write = cgroup_file_write, 1925 .write = cgroup_file_write,
1709 .llseek = generic_file_llseek, 1926 .llseek = generic_file_llseek,
@@ -1876,7 +2093,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
1876 * the start of a css_set 2093 * the start of a css_set
1877 */ 2094 */
1878static void cgroup_advance_iter(struct cgroup *cgrp, 2095static void cgroup_advance_iter(struct cgroup *cgrp,
1879 struct cgroup_iter *it) 2096 struct cgroup_iter *it)
1880{ 2097{
1881 struct list_head *l = it->cg_link; 2098 struct list_head *l = it->cg_link;
1882 struct cg_cgroup_link *link; 2099 struct cg_cgroup_link *link;
@@ -2129,7 +2346,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2129} 2346}
2130 2347
2131/* 2348/*
2132 * Stuff for reading the 'tasks' file. 2349 * Stuff for reading the 'tasks'/'procs' files.
2133 * 2350 *
2134 * Reading this file can return large amounts of data if a cgroup has 2351 * Reading this file can return large amounts of data if a cgroup has
2135 * *lots* of attached tasks. So it may need several calls to read(), 2352 * *lots* of attached tasks. So it may need several calls to read(),
@@ -2139,27 +2356,196 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2139 */ 2356 */
2140 2357
2141/* 2358/*
2142 * Load into 'pidarray' up to 'npids' of the tasks using cgroup 2359 * The following two functions "fix" the issue where there are more pids
2143 * 'cgrp'. Return actual number of pids loaded. No need to 2360 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
2144 * task_lock(p) when reading out p->cgroup, since we're in an RCU 2361 * TODO: replace with a kernel-wide solution to this problem
2145 * read section, so the css_set can't go away, and is 2362 */
2146 * immutable after creation. 2363#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
2364static void *pidlist_allocate(int count)
2365{
2366 if (PIDLIST_TOO_LARGE(count))
2367 return vmalloc(count * sizeof(pid_t));
2368 else
2369 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
2370}
2371static void pidlist_free(void *p)
2372{
2373 if (is_vmalloc_addr(p))
2374 vfree(p);
2375 else
2376 kfree(p);
2377}
2378static void *pidlist_resize(void *p, int newcount)
2379{
2380 void *newlist;
2381 /* note: if new alloc fails, old p will still be valid either way */
2382 if (is_vmalloc_addr(p)) {
2383 newlist = vmalloc(newcount * sizeof(pid_t));
2384 if (!newlist)
2385 return NULL;
2386 memcpy(newlist, p, newcount * sizeof(pid_t));
2387 vfree(p);
2388 } else {
2389 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
2390 }
2391 return newlist;
2392}
2393
2394/*
2395 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
2396 * If the new stripped list is sufficiently smaller and there's enough memory
2397 * to allocate a new buffer, will let go of the unneeded memory. Returns the
2398 * number of unique elements.
2399 */
2400/* is the size difference enough that we should re-allocate the array? */
2401#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
2402static int pidlist_uniq(pid_t **p, int length)
2403{
2404 int src, dest = 1;
2405 pid_t *list = *p;
2406 pid_t *newlist;
2407
2408 /*
2409 * we presume the 0th element is unique, so i starts at 1. trivial
2410 * edge cases first; no work needs to be done for either
2411 */
2412 if (length == 0 || length == 1)
2413 return length;
2414 /* src and dest walk down the list; dest counts unique elements */
2415 for (src = 1; src < length; src++) {
2416 /* find next unique element */
2417 while (list[src] == list[src-1]) {
2418 src++;
2419 if (src == length)
2420 goto after;
2421 }
2422 /* dest always points to where the next unique element goes */
2423 list[dest] = list[src];
2424 dest++;
2425 }
2426after:
2427 /*
2428 * if the length difference is large enough, we want to allocate a
2429 * smaller buffer to save memory. if this fails due to out of memory,
2430 * we'll just stay with what we've got.
2431 */
2432 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
2433 newlist = pidlist_resize(list, dest);
2434 if (newlist)
2435 *p = newlist;
2436 }
2437 return dest;
2438}
2439
2440static int cmppid(const void *a, const void *b)
2441{
2442 return *(pid_t *)a - *(pid_t *)b;
2443}
2444
2445/*
2446 * find the appropriate pidlist for our purpose (given procs vs tasks)
2447 * returns with the lock on that pidlist already held, and takes care
2448 * of the use count, or returns NULL with no locks held if we're out of
2449 * memory.
2147 */ 2450 */
2148static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2451static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2452 enum cgroup_filetype type)
2149{ 2453{
2150 int n = 0, pid; 2454 struct cgroup_pidlist *l;
2455 /* don't need task_nsproxy() if we're looking at ourself */
2456 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
2457 /*
2458 * We can't drop the pidlist_mutex before taking the l->mutex in case
2459 * the last ref-holder is trying to remove l from the list at the same
2460 * time. Holding the pidlist_mutex precludes somebody taking whichever
2461 * list we find out from under us - compare release_pid_array().
2462 */
2463 mutex_lock(&cgrp->pidlist_mutex);
2464 list_for_each_entry(l, &cgrp->pidlists, links) {
2465 if (l->key.type == type && l->key.ns == ns) {
2466 /* found a matching list - drop the extra refcount */
2467 put_pid_ns(ns);
2468 /* make sure l doesn't vanish out from under us */
2469 down_write(&l->mutex);
2470 mutex_unlock(&cgrp->pidlist_mutex);
2471 l->use_count++;
2472 return l;
2473 }
2474 }
2475 /* entry not found; create a new one */
2476 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2477 if (!l) {
2478 mutex_unlock(&cgrp->pidlist_mutex);
2479 put_pid_ns(ns);
2480 return l;
2481 }
2482 init_rwsem(&l->mutex);
2483 down_write(&l->mutex);
2484 l->key.type = type;
2485 l->key.ns = ns;
2486 l->use_count = 0; /* don't increment here */
2487 l->list = NULL;
2488 l->owner = cgrp;
2489 list_add(&l->links, &cgrp->pidlists);
2490 mutex_unlock(&cgrp->pidlist_mutex);
2491 return l;
2492}
2493
2494/*
2495 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
2496 */
2497static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
2498 struct cgroup_pidlist **lp)
2499{
2500 pid_t *array;
2501 int length;
2502 int pid, n = 0; /* used for populating the array */
2151 struct cgroup_iter it; 2503 struct cgroup_iter it;
2152 struct task_struct *tsk; 2504 struct task_struct *tsk;
2505 struct cgroup_pidlist *l;
2506
2507 /*
2508 * If cgroup gets more users after we read count, we won't have
2509 * enough space - tough. This race is indistinguishable to the
2510 * caller from the case that the additional cgroup users didn't
2511 * show up until sometime later on.
2512 */
2513 length = cgroup_task_count(cgrp);
2514 array = pidlist_allocate(length);
2515 if (!array)
2516 return -ENOMEM;
2517 /* now, populate the array */
2153 cgroup_iter_start(cgrp, &it); 2518 cgroup_iter_start(cgrp, &it);
2154 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2519 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2155 if (unlikely(n == npids)) 2520 if (unlikely(n == length))
2156 break; 2521 break;
2157 pid = task_pid_vnr(tsk); 2522 /* get tgid or pid for procs or tasks file respectively */
2158 if (pid > 0) 2523 if (type == CGROUP_FILE_PROCS)
2159 pidarray[n++] = pid; 2524 pid = task_tgid_vnr(tsk);
2525 else
2526 pid = task_pid_vnr(tsk);
2527 if (pid > 0) /* make sure to only use valid results */
2528 array[n++] = pid;
2160 } 2529 }
2161 cgroup_iter_end(cgrp, &it); 2530 cgroup_iter_end(cgrp, &it);
2162 return n; 2531 length = n;
2532 /* now sort & (if procs) strip out duplicates */
2533 sort(array, length, sizeof(pid_t), cmppid, NULL);
2534 if (type == CGROUP_FILE_PROCS)
2535 length = pidlist_uniq(&array, length);
2536 l = cgroup_pidlist_find(cgrp, type);
2537 if (!l) {
2538 pidlist_free(array);
2539 return -ENOMEM;
2540 }
2541 /* store array, freeing old if necessary - lock already held */
2542 pidlist_free(l->list);
2543 l->list = array;
2544 l->length = length;
2545 l->use_count++;
2546 up_write(&l->mutex);
2547 *lp = l;
2548 return 0;
2163} 2549}
2164 2550
2165/** 2551/**
@@ -2216,37 +2602,14 @@ err:
2216 return ret; 2602 return ret;
2217} 2603}
2218 2604
2219/*
2220 * Cache pids for all threads in the same pid namespace that are
2221 * opening the same "tasks" file.
2222 */
2223struct cgroup_pids {
2224 /* The node in cgrp->pids_list */
2225 struct list_head list;
2226 /* The cgroup those pids belong to */
2227 struct cgroup *cgrp;
2228 /* The namepsace those pids belong to */
2229 struct pid_namespace *ns;
2230 /* Array of process ids in the cgroup */
2231 pid_t *tasks_pids;
2232 /* How many files are using the this tasks_pids array */
2233 int use_count;
2234 /* Length of the current tasks_pids array */
2235 int length;
2236};
2237
2238static int cmppid(const void *a, const void *b)
2239{
2240 return *(pid_t *)a - *(pid_t *)b;
2241}
2242 2605
2243/* 2606/*
2244 * seq_file methods for the "tasks" file. The seq_file position is the 2607 * seq_file methods for the tasks/procs files. The seq_file position is the
2245 * next pid to display; the seq_file iterator is a pointer to the pid 2608 * next pid to display; the seq_file iterator is a pointer to the pid
2246 * in the cgroup->tasks_pids array. 2609 * in the cgroup->l->list array.
2247 */ 2610 */
2248 2611
2249static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) 2612static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
2250{ 2613{
2251 /* 2614 /*
2252 * Initially we receive a position value that corresponds to 2615 * Initially we receive a position value that corresponds to
@@ -2254,48 +2617,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2254 * after a seek to the start). Use a binary-search to find the 2617 * after a seek to the start). Use a binary-search to find the
2255 * next pid to display, if any 2618 * next pid to display, if any
2256 */ 2619 */
2257 struct cgroup_pids *cp = s->private; 2620 struct cgroup_pidlist *l = s->private;
2258 struct cgroup *cgrp = cp->cgrp;
2259 int index = 0, pid = *pos; 2621 int index = 0, pid = *pos;
2260 int *iter; 2622 int *iter;
2261 2623
2262 down_read(&cgrp->pids_mutex); 2624 down_read(&l->mutex);
2263 if (pid) { 2625 if (pid) {
2264 int end = cp->length; 2626 int end = l->length;
2265 2627
2266 while (index < end) { 2628 while (index < end) {
2267 int mid = (index + end) / 2; 2629 int mid = (index + end) / 2;
2268 if (cp->tasks_pids[mid] == pid) { 2630 if (l->list[mid] == pid) {
2269 index = mid; 2631 index = mid;
2270 break; 2632 break;
2271 } else if (cp->tasks_pids[mid] <= pid) 2633 } else if (l->list[mid] <= pid)
2272 index = mid + 1; 2634 index = mid + 1;
2273 else 2635 else
2274 end = mid; 2636 end = mid;
2275 } 2637 }
2276 } 2638 }
2277 /* If we're off the end of the array, we're done */ 2639 /* If we're off the end of the array, we're done */
2278 if (index >= cp->length) 2640 if (index >= l->length)
2279 return NULL; 2641 return NULL;
2280 /* Update the abstract position to be the actual pid that we found */ 2642 /* Update the abstract position to be the actual pid that we found */
2281 iter = cp->tasks_pids + index; 2643 iter = l->list + index;
2282 *pos = *iter; 2644 *pos = *iter;
2283 return iter; 2645 return iter;
2284} 2646}
2285 2647
2286static void cgroup_tasks_stop(struct seq_file *s, void *v) 2648static void cgroup_pidlist_stop(struct seq_file *s, void *v)
2287{ 2649{
2288 struct cgroup_pids *cp = s->private; 2650 struct cgroup_pidlist *l = s->private;
2289 struct cgroup *cgrp = cp->cgrp; 2651 up_read(&l->mutex);
2290 up_read(&cgrp->pids_mutex);
2291} 2652}
2292 2653
2293static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2654static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
2294{ 2655{
2295 struct cgroup_pids *cp = s->private; 2656 struct cgroup_pidlist *l = s->private;
2296 int *p = v; 2657 pid_t *p = v;
2297 int *end = cp->tasks_pids + cp->length; 2658 pid_t *end = l->list + l->length;
2298
2299 /* 2659 /*
2300 * Advance to the next pid in the array. If this goes off the 2660 * Advance to the next pid in the array. If this goes off the
2301 * end, we're done 2661 * end, we're done
@@ -2309,124 +2669,107 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2309 } 2669 }
2310} 2670}
2311 2671
2312static int cgroup_tasks_show(struct seq_file *s, void *v) 2672static int cgroup_pidlist_show(struct seq_file *s, void *v)
2313{ 2673{
2314 return seq_printf(s, "%d\n", *(int *)v); 2674 return seq_printf(s, "%d\n", *(int *)v);
2315} 2675}
2316 2676
2317static const struct seq_operations cgroup_tasks_seq_operations = { 2677/*
2318 .start = cgroup_tasks_start, 2678 * seq_operations functions for iterating on pidlists through seq_file -
2319 .stop = cgroup_tasks_stop, 2679 * independent of whether it's tasks or procs
2320 .next = cgroup_tasks_next, 2680 */
2321 .show = cgroup_tasks_show, 2681static const struct seq_operations cgroup_pidlist_seq_operations = {
2682 .start = cgroup_pidlist_start,
2683 .stop = cgroup_pidlist_stop,
2684 .next = cgroup_pidlist_next,
2685 .show = cgroup_pidlist_show,
2322}; 2686};
2323 2687
2324static void release_cgroup_pid_array(struct cgroup_pids *cp) 2688static void cgroup_release_pid_array(struct cgroup_pidlist *l)
2325{ 2689{
2326 struct cgroup *cgrp = cp->cgrp; 2690 /*
2327 2691 * the case where we're the last user of this particular pidlist will
2328 down_write(&cgrp->pids_mutex); 2692 * have us remove it from the cgroup's list, which entails taking the
2329 BUG_ON(!cp->use_count); 2693 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
2330 if (!--cp->use_count) { 2694 * pidlist_mutex, we have to take pidlist_mutex first.
2331 list_del(&cp->list); 2695 */
2332 put_pid_ns(cp->ns); 2696 mutex_lock(&l->owner->pidlist_mutex);
2333 kfree(cp->tasks_pids); 2697 down_write(&l->mutex);
2334 kfree(cp); 2698 BUG_ON(!l->use_count);
2699 if (!--l->use_count) {
2700 /* we're the last user if refcount is 0; remove and free */
2701 list_del(&l->links);
2702 mutex_unlock(&l->owner->pidlist_mutex);
2703 pidlist_free(l->list);
2704 put_pid_ns(l->key.ns);
2705 up_write(&l->mutex);
2706 kfree(l);
2707 return;
2335 } 2708 }
2336 up_write(&cgrp->pids_mutex); 2709 mutex_unlock(&l->owner->pidlist_mutex);
2710 up_write(&l->mutex);
2337} 2711}
2338 2712
2339static int cgroup_tasks_release(struct inode *inode, struct file *file) 2713static int cgroup_pidlist_release(struct inode *inode, struct file *file)
2340{ 2714{
2341 struct seq_file *seq; 2715 struct cgroup_pidlist *l;
2342 struct cgroup_pids *cp;
2343
2344 if (!(file->f_mode & FMODE_READ)) 2716 if (!(file->f_mode & FMODE_READ))
2345 return 0; 2717 return 0;
2346 2718 /*
2347 seq = file->private_data; 2719 * the seq_file will only be initialized if the file was opened for
2348 cp = seq->private; 2720 * reading; hence we check if it's not null only in that case.
2349 2721 */
2350 release_cgroup_pid_array(cp); 2722 l = ((struct seq_file *)file->private_data)->private;
2723 cgroup_release_pid_array(l);
2351 return seq_release(inode, file); 2724 return seq_release(inode, file);
2352} 2725}
2353 2726
2354static struct file_operations cgroup_tasks_operations = { 2727static const struct file_operations cgroup_pidlist_operations = {
2355 .read = seq_read, 2728 .read = seq_read,
2356 .llseek = seq_lseek, 2729 .llseek = seq_lseek,
2357 .write = cgroup_file_write, 2730 .write = cgroup_file_write,
2358 .release = cgroup_tasks_release, 2731 .release = cgroup_pidlist_release,
2359}; 2732};
2360 2733
2361/* 2734/*
2362 * Handle an open on 'tasks' file. Prepare an array containing the 2735 * The following functions handle opens on a file that displays a pidlist
2363 * process id's of tasks currently attached to the cgroup being opened. 2736 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
2737 * in the cgroup.
2364 */ 2738 */
2365 2739/* helper function for the two below it */
2366static int cgroup_tasks_open(struct inode *unused, struct file *file) 2740static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
2367{ 2741{
2368 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2742 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2369 struct pid_namespace *ns = current->nsproxy->pid_ns; 2743 struct cgroup_pidlist *l;
2370 struct cgroup_pids *cp;
2371 pid_t *pidarray;
2372 int npids;
2373 int retval; 2744 int retval;
2374 2745
2375 /* Nothing to do for write-only files */ 2746 /* Nothing to do for write-only files */
2376 if (!(file->f_mode & FMODE_READ)) 2747 if (!(file->f_mode & FMODE_READ))
2377 return 0; 2748 return 0;
2378 2749
2379 /* 2750 /* have the array populated */
2380 * If cgroup gets more users after we read count, we won't have 2751 retval = pidlist_array_load(cgrp, type, &l);
2381 * enough space - tough. This race is indistinguishable to the 2752 if (retval)
2382 * caller from the case that the additional cgroup users didn't 2753 return retval;
2383 * show up until sometime later on. 2754 /* configure file information */
2384 */ 2755 file->f_op = &cgroup_pidlist_operations;
2385 npids = cgroup_task_count(cgrp);
2386 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
2387 if (!pidarray)
2388 return -ENOMEM;
2389 npids = pid_array_load(pidarray, npids, cgrp);
2390 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2391
2392 /*
2393 * Store the array in the cgroup, freeing the old
2394 * array if necessary
2395 */
2396 down_write(&cgrp->pids_mutex);
2397
2398 list_for_each_entry(cp, &cgrp->pids_list, list) {
2399 if (ns == cp->ns)
2400 goto found;
2401 }
2402
2403 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2404 if (!cp) {
2405 up_write(&cgrp->pids_mutex);
2406 kfree(pidarray);
2407 return -ENOMEM;
2408 }
2409 cp->cgrp = cgrp;
2410 cp->ns = ns;
2411 get_pid_ns(ns);
2412 list_add(&cp->list, &cgrp->pids_list);
2413found:
2414 kfree(cp->tasks_pids);
2415 cp->tasks_pids = pidarray;
2416 cp->length = npids;
2417 cp->use_count++;
2418 up_write(&cgrp->pids_mutex);
2419
2420 file->f_op = &cgroup_tasks_operations;
2421 2756
2422 retval = seq_open(file, &cgroup_tasks_seq_operations); 2757 retval = seq_open(file, &cgroup_pidlist_seq_operations);
2423 if (retval) { 2758 if (retval) {
2424 release_cgroup_pid_array(cp); 2759 cgroup_release_pid_array(l);
2425 return retval; 2760 return retval;
2426 } 2761 }
2427 ((struct seq_file *)file->private_data)->private = cp; 2762 ((struct seq_file *)file->private_data)->private = l;
2428 return 0; 2763 return 0;
2429} 2764}
2765static int cgroup_tasks_open(struct inode *unused, struct file *file)
2766{
2767 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
2768}
2769static int cgroup_procs_open(struct inode *unused, struct file *file)
2770{
2771 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
2772}
2430 2773
2431static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 2774static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2432 struct cftype *cft) 2775 struct cftype *cft)
@@ -2449,21 +2792,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2449/* 2792/*
2450 * for the common functions, 'private' gives the type of file 2793 * for the common functions, 'private' gives the type of file
2451 */ 2794 */
2795/* for hysterical raisins, we can't put this on the older files */
2796#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
2452static struct cftype files[] = { 2797static struct cftype files[] = {
2453 { 2798 {
2454 .name = "tasks", 2799 .name = "tasks",
2455 .open = cgroup_tasks_open, 2800 .open = cgroup_tasks_open,
2456 .write_u64 = cgroup_tasks_write, 2801 .write_u64 = cgroup_tasks_write,
2457 .release = cgroup_tasks_release, 2802 .release = cgroup_pidlist_release,
2458 .private = FILE_TASKLIST,
2459 .mode = S_IRUGO | S_IWUSR, 2803 .mode = S_IRUGO | S_IWUSR,
2460 }, 2804 },
2461 2805 {
2806 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
2807 .open = cgroup_procs_open,
2808 /* .write_u64 = cgroup_procs_write, TODO */
2809 .release = cgroup_pidlist_release,
2810 .mode = S_IRUGO,
2811 },
2462 { 2812 {
2463 .name = "notify_on_release", 2813 .name = "notify_on_release",
2464 .read_u64 = cgroup_read_notify_on_release, 2814 .read_u64 = cgroup_read_notify_on_release,
2465 .write_u64 = cgroup_write_notify_on_release, 2815 .write_u64 = cgroup_write_notify_on_release,
2466 .private = FILE_NOTIFY_ON_RELEASE,
2467 }, 2816 },
2468}; 2817};
2469 2818
@@ -2472,7 +2821,6 @@ static struct cftype cft_release_agent = {
2472 .read_seq_string = cgroup_release_agent_show, 2821 .read_seq_string = cgroup_release_agent_show,
2473 .write_string = cgroup_release_agent_write, 2822 .write_string = cgroup_release_agent_write,
2474 .max_write_len = PATH_MAX, 2823 .max_write_len = PATH_MAX,
2475 .private = FILE_RELEASE_AGENT,
2476}; 2824};
2477 2825
2478static int cgroup_populate_dir(struct cgroup *cgrp) 2826static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -2879,6 +3227,7 @@ int __init cgroup_init_early(void)
2879 init_task.cgroups = &init_css_set; 3227 init_task.cgroups = &init_css_set;
2880 3228
2881 init_css_set_link.cg = &init_css_set; 3229 init_css_set_link.cg = &init_css_set;
3230 init_css_set_link.cgrp = dummytop;
2882 list_add(&init_css_set_link.cgrp_link_list, 3231 list_add(&init_css_set_link.cgrp_link_list,
2883 &rootnode.top_cgroup.css_sets); 3232 &rootnode.top_cgroup.css_sets);
2884 list_add(&init_css_set_link.cg_link_list, 3233 list_add(&init_css_set_link.cg_link_list,
@@ -2933,7 +3282,7 @@ int __init cgroup_init(void)
2933 /* Add init_css_set to the hash table */ 3282 /* Add init_css_set to the hash table */
2934 hhead = css_set_hash(init_css_set.subsys); 3283 hhead = css_set_hash(init_css_set.subsys);
2935 hlist_add_head(&init_css_set.hlist, hhead); 3284 hlist_add_head(&init_css_set.hlist, hhead);
2936 3285 BUG_ON(!init_root_id(&rootnode));
2937 err = register_filesystem(&cgroup_fs_type); 3286 err = register_filesystem(&cgroup_fs_type);
2938 if (err < 0) 3287 if (err < 0)
2939 goto out; 3288 goto out;
@@ -2986,15 +3335,16 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2986 for_each_active_root(root) { 3335 for_each_active_root(root) {
2987 struct cgroup_subsys *ss; 3336 struct cgroup_subsys *ss;
2988 struct cgroup *cgrp; 3337 struct cgroup *cgrp;
2989 int subsys_id;
2990 int count = 0; 3338 int count = 0;
2991 3339
2992 seq_printf(m, "%lu:", root->subsys_bits); 3340 seq_printf(m, "%d:", root->hierarchy_id);
2993 for_each_subsys(root, ss) 3341 for_each_subsys(root, ss)
2994 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 3342 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
3343 if (strlen(root->name))
3344 seq_printf(m, "%sname=%s", count ? "," : "",
3345 root->name);
2995 seq_putc(m, ':'); 3346 seq_putc(m, ':');
2996 get_first_subsys(&root->top_cgroup, NULL, &subsys_id); 3347 cgrp = task_cgroup_from_root(tsk, root);
2997 cgrp = task_cgroup(tsk, subsys_id);
2998 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 3348 retval = cgroup_path(cgrp, buf, PAGE_SIZE);
2999 if (retval < 0) 3349 if (retval < 0)
3000 goto out_unlock; 3350 goto out_unlock;
@@ -3017,7 +3367,7 @@ static int cgroup_open(struct inode *inode, struct file *file)
3017 return single_open(file, proc_cgroup_show, pid); 3367 return single_open(file, proc_cgroup_show, pid);
3018} 3368}
3019 3369
3020struct file_operations proc_cgroup_operations = { 3370const struct file_operations proc_cgroup_operations = {
3021 .open = cgroup_open, 3371 .open = cgroup_open,
3022 .read = seq_read, 3372 .read = seq_read,
3023 .llseek = seq_lseek, 3373 .llseek = seq_lseek,
@@ -3033,8 +3383,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3033 mutex_lock(&cgroup_mutex); 3383 mutex_lock(&cgroup_mutex);
3034 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3384 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3035 struct cgroup_subsys *ss = subsys[i]; 3385 struct cgroup_subsys *ss = subsys[i];
3036 seq_printf(m, "%s\t%lu\t%d\t%d\n", 3386 seq_printf(m, "%s\t%d\t%d\t%d\n",
3037 ss->name, ss->root->subsys_bits, 3387 ss->name, ss->root->hierarchy_id,
3038 ss->root->number_of_cgroups, !ss->disabled); 3388 ss->root->number_of_cgroups, !ss->disabled);
3039 } 3389 }
3040 mutex_unlock(&cgroup_mutex); 3390 mutex_unlock(&cgroup_mutex);
@@ -3046,7 +3396,7 @@ static int cgroupstats_open(struct inode *inode, struct file *file)
3046 return single_open(file, proc_cgroupstats_show, NULL); 3396 return single_open(file, proc_cgroupstats_show, NULL);
3047} 3397}
3048 3398
3049static struct file_operations proc_cgroupstats_operations = { 3399static const struct file_operations proc_cgroupstats_operations = {
3050 .open = cgroupstats_open, 3400 .open = cgroupstats_open,
3051 .read = seq_read, 3401 .read = seq_read,
3052 .llseek = seq_lseek, 3402 .llseek = seq_lseek,
@@ -3320,13 +3670,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
3320{ 3670{
3321 int ret; 3671 int ret;
3322 struct cgroup *target; 3672 struct cgroup *target;
3323 int subsys_id;
3324 3673
3325 if (cgrp == dummytop) 3674 if (cgrp == dummytop)
3326 return 1; 3675 return 1;
3327 3676
3328 get_first_subsys(cgrp, NULL, &subsys_id); 3677 target = task_cgroup_from_root(task, cgrp->root);
3329 target = task_cgroup(task, subsys_id);
3330 while (cgrp != target && cgrp!= cgrp->top_cgroup) 3678 while (cgrp != target && cgrp!= cgrp->top_cgroup)
3331 cgrp = cgrp->parent; 3679 cgrp = cgrp->parent;
3332 ret = (cgrp == target); 3680 ret = (cgrp == target);
@@ -3358,8 +3706,10 @@ static void check_for_release(struct cgroup *cgrp)
3358void __css_put(struct cgroup_subsys_state *css) 3706void __css_put(struct cgroup_subsys_state *css)
3359{ 3707{
3360 struct cgroup *cgrp = css->cgroup; 3708 struct cgroup *cgrp = css->cgroup;
3709 int val;
3361 rcu_read_lock(); 3710 rcu_read_lock();
3362 if (atomic_dec_return(&css->refcnt) == 1) { 3711 val = atomic_dec_return(&css->refcnt);
3712 if (val == 1) {
3363 if (notify_on_release(cgrp)) { 3713 if (notify_on_release(cgrp)) {
3364 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3714 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3365 check_for_release(cgrp); 3715 check_for_release(cgrp);
@@ -3367,6 +3717,7 @@ void __css_put(struct cgroup_subsys_state *css)
3367 cgroup_wakeup_rmdir_waiter(cgrp); 3717 cgroup_wakeup_rmdir_waiter(cgrp);
3368 } 3718 }
3369 rcu_read_unlock(); 3719 rcu_read_unlock();
3720 WARN_ON_ONCE(val < 1);
3370} 3721}
3371 3722
3372/* 3723/*
@@ -3693,3 +4044,154 @@ css_get_next(struct cgroup_subsys *ss, int id,
3693 return ret; 4044 return ret;
3694} 4045}
3695 4046
4047#ifdef CONFIG_CGROUP_DEBUG
4048static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4049 struct cgroup *cont)
4050{
4051 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
4052
4053 if (!css)
4054 return ERR_PTR(-ENOMEM);
4055
4056 return css;
4057}
4058
4059static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
4060{
4061 kfree(cont->subsys[debug_subsys_id]);
4062}
4063
4064static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
4065{
4066 return atomic_read(&cont->count);
4067}
4068
4069static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
4070{
4071 return cgroup_task_count(cont);
4072}
4073
4074static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
4075{
4076 return (u64)(unsigned long)current->cgroups;
4077}
4078
4079static u64 current_css_set_refcount_read(struct cgroup *cont,
4080 struct cftype *cft)
4081{
4082 u64 count;
4083
4084 rcu_read_lock();
4085 count = atomic_read(&current->cgroups->refcount);
4086 rcu_read_unlock();
4087 return count;
4088}
4089
4090static int current_css_set_cg_links_read(struct cgroup *cont,
4091 struct cftype *cft,
4092 struct seq_file *seq)
4093{
4094 struct cg_cgroup_link *link;
4095 struct css_set *cg;
4096
4097 read_lock(&css_set_lock);
4098 rcu_read_lock();
4099 cg = rcu_dereference(current->cgroups);
4100 list_for_each_entry(link, &cg->cg_links, cg_link_list) {
4101 struct cgroup *c = link->cgrp;
4102 const char *name;
4103
4104 if (c->dentry)
4105 name = c->dentry->d_name.name;
4106 else
4107 name = "?";
4108 seq_printf(seq, "Root %d group %s\n",
4109 c->root->hierarchy_id, name);
4110 }
4111 rcu_read_unlock();
4112 read_unlock(&css_set_lock);
4113 return 0;
4114}
4115
4116#define MAX_TASKS_SHOWN_PER_CSS 25
4117static int cgroup_css_links_read(struct cgroup *cont,
4118 struct cftype *cft,
4119 struct seq_file *seq)
4120{
4121 struct cg_cgroup_link *link;
4122
4123 read_lock(&css_set_lock);
4124 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
4125 struct css_set *cg = link->cg;
4126 struct task_struct *task;
4127 int count = 0;
4128 seq_printf(seq, "css_set %p\n", cg);
4129 list_for_each_entry(task, &cg->tasks, cg_list) {
4130 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
4131 seq_puts(seq, " ...\n");
4132 break;
4133 } else {
4134 seq_printf(seq, " task %d\n",
4135 task_pid_vnr(task));
4136 }
4137 }
4138 }
4139 read_unlock(&css_set_lock);
4140 return 0;
4141}
4142
4143static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
4144{
4145 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
4146}
4147
4148static struct cftype debug_files[] = {
4149 {
4150 .name = "cgroup_refcount",
4151 .read_u64 = cgroup_refcount_read,
4152 },
4153 {
4154 .name = "taskcount",
4155 .read_u64 = debug_taskcount_read,
4156 },
4157
4158 {
4159 .name = "current_css_set",
4160 .read_u64 = current_css_set_read,
4161 },
4162
4163 {
4164 .name = "current_css_set_refcount",
4165 .read_u64 = current_css_set_refcount_read,
4166 },
4167
4168 {
4169 .name = "current_css_set_cg_links",
4170 .read_seq_string = current_css_set_cg_links_read,
4171 },
4172
4173 {
4174 .name = "cgroup_css_links",
4175 .read_seq_string = cgroup_css_links_read,
4176 },
4177
4178 {
4179 .name = "releasable",
4180 .read_u64 = releasable_read,
4181 },
4182};
4183
4184static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
4185{
4186 return cgroup_add_files(cont, ss, debug_files,
4187 ARRAY_SIZE(debug_files));
4188}
4189
4190struct cgroup_subsys debug_subsys = {
4191 .name = "debug",
4192 .create = debug_create,
4193 .destroy = debug_destroy,
4194 .populate = debug_populate,
4195 .subsys_id = debug_subsys_id,
4196};
4197#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
deleted file mode 100644
index 0c92d797baa6..000000000000
--- a/kernel/cgroup_debug.c
+++ /dev/null
@@ -1,105 +0,0 @@
1/*
2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info
4 *
5 * Copyright (C) Google Inc, 2007
6 *
7 * Developed by Paul Menage (menage@google.com)
8 *
9 */
10
11#include <linux/cgroup.h>
12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/rcupdate.h>
15
16#include <asm/atomic.h>
17
18static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
19 struct cgroup *cont)
20{
21 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
22
23 if (!css)
24 return ERR_PTR(-ENOMEM);
25
26 return css;
27}
28
29static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
30{
31 kfree(cont->subsys[debug_subsys_id]);
32}
33
34static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
35{
36 return atomic_read(&cont->count);
37}
38
39static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
40{
41 u64 count;
42
43 count = cgroup_task_count(cont);
44 return count;
45}
46
47static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
48{
49 return (u64)(long)current->cgroups;
50}
51
52static u64 current_css_set_refcount_read(struct cgroup *cont,
53 struct cftype *cft)
54{
55 u64 count;
56
57 rcu_read_lock();
58 count = atomic_read(&current->cgroups->refcount);
59 rcu_read_unlock();
60 return count;
61}
62
63static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
64{
65 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
66}
67
68static struct cftype files[] = {
69 {
70 .name = "cgroup_refcount",
71 .read_u64 = cgroup_refcount_read,
72 },
73 {
74 .name = "taskcount",
75 .read_u64 = taskcount_read,
76 },
77
78 {
79 .name = "current_css_set",
80 .read_u64 = current_css_set_read,
81 },
82
83 {
84 .name = "current_css_set_refcount",
85 .read_u64 = current_css_set_refcount_read,
86 },
87
88 {
89 .name = "releasable",
90 .read_u64 = releasable_read,
91 },
92};
93
94static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
95{
96 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
97}
98
99struct cgroup_subsys debug_subsys = {
100 .name = "debug",
101 .create = debug_create,
102 .destroy = debug_destroy,
103 .populate = debug_populate,
104 .subsys_id = debug_subsys_id,
105};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fb249e2bcada..59e9ef6aab40 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
159 */ 159 */
160static int freezer_can_attach(struct cgroup_subsys *ss, 160static int freezer_can_attach(struct cgroup_subsys *ss,
161 struct cgroup *new_cgroup, 161 struct cgroup *new_cgroup,
162 struct task_struct *task) 162 struct task_struct *task, bool threadgroup)
163{ 163{
164 struct freezer *freezer; 164 struct freezer *freezer;
165 165
@@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
177 if (freezer->state == CGROUP_FROZEN) 177 if (freezer->state == CGROUP_FROZEN)
178 return -EBUSY; 178 return -EBUSY;
179 179
180 if (threadgroup) {
181 struct task_struct *c;
182
183 rcu_read_lock();
184 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
185 if (is_task_frozen_enough(c)) {
186 rcu_read_unlock();
187 return -EBUSY;
188 }
189 }
190 rcu_read_unlock();
191 }
192
180 return 0; 193 return 0;
181} 194}
182 195
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7e75a41bd508..3cf2183b472d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
537 * element of the partition (one sched domain) to be passed to 537 * element of the partition (one sched domain) to be passed to
538 * partition_sched_domains(). 538 * partition_sched_domains().
539 */ 539 */
540/* FIXME: see the FIXME in partition_sched_domains() */ 540static int generate_sched_domains(cpumask_var_t **domains,
541static int generate_sched_domains(struct cpumask **domains,
542 struct sched_domain_attr **attributes) 541 struct sched_domain_attr **attributes)
543{ 542{
544 LIST_HEAD(q); /* queue of cpusets to be scanned */ 543 LIST_HEAD(q); /* queue of cpusets to be scanned */
@@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains,
546 struct cpuset **csa; /* array of all cpuset ptrs */ 545 struct cpuset **csa; /* array of all cpuset ptrs */
547 int csn; /* how many cpuset ptrs in csa so far */ 546 int csn; /* how many cpuset ptrs in csa so far */
548 int i, j, k; /* indices for partition finding loops */ 547 int i, j, k; /* indices for partition finding loops */
549 struct cpumask *doms; /* resulting partition; i.e. sched domains */ 548 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
550 struct sched_domain_attr *dattr; /* attributes for custom domains */ 549 struct sched_domain_attr *dattr; /* attributes for custom domains */
551 int ndoms = 0; /* number of sched domains in result */ 550 int ndoms = 0; /* number of sched domains in result */
552 int nslot; /* next empty doms[] struct cpumask slot */ 551 int nslot; /* next empty doms[] struct cpumask slot */
@@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains,
557 556
558 /* Special case for the 99% of systems with one, full, sched domain */ 557 /* Special case for the 99% of systems with one, full, sched domain */
559 if (is_sched_load_balance(&top_cpuset)) { 558 if (is_sched_load_balance(&top_cpuset)) {
560 doms = kmalloc(cpumask_size(), GFP_KERNEL); 559 ndoms = 1;
560 doms = alloc_sched_domains(ndoms);
561 if (!doms) 561 if (!doms)
562 goto done; 562 goto done;
563 563
@@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains,
566 *dattr = SD_ATTR_INIT; 566 *dattr = SD_ATTR_INIT;
567 update_domain_attr_tree(dattr, &top_cpuset); 567 update_domain_attr_tree(dattr, &top_cpuset);
568 } 568 }
569 cpumask_copy(doms, top_cpuset.cpus_allowed); 569 cpumask_copy(doms[0], top_cpuset.cpus_allowed);
570 570
571 ndoms = 1;
572 goto done; 571 goto done;
573 } 572 }
574 573
@@ -636,7 +635,7 @@ restart:
636 * Now we know how many domains to create. 635 * Now we know how many domains to create.
637 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 636 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
638 */ 637 */
639 doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); 638 doms = alloc_sched_domains(ndoms);
640 if (!doms) 639 if (!doms)
641 goto done; 640 goto done;
642 641
@@ -656,7 +655,7 @@ restart:
656 continue; 655 continue;
657 } 656 }
658 657
659 dp = doms + nslot; 658 dp = doms[nslot];
660 659
661 if (nslot == ndoms) { 660 if (nslot == ndoms) {
662 static int warnings = 10; 661 static int warnings = 10;
@@ -718,7 +717,7 @@ done:
718static void do_rebuild_sched_domains(struct work_struct *unused) 717static void do_rebuild_sched_domains(struct work_struct *unused)
719{ 718{
720 struct sched_domain_attr *attr; 719 struct sched_domain_attr *attr;
721 struct cpumask *doms; 720 cpumask_var_t *doms;
722 int ndoms; 721 int ndoms;
723 722
724 get_online_cpus(); 723 get_online_cpus();
@@ -1324,9 +1323,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1324static cpumask_var_t cpus_attach; 1323static cpumask_var_t cpus_attach;
1325 1324
1326/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1325/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1327static int cpuset_can_attach(struct cgroup_subsys *ss, 1326static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1328 struct cgroup *cont, struct task_struct *tsk) 1327 struct task_struct *tsk, bool threadgroup)
1329{ 1328{
1329 int ret;
1330 struct cpuset *cs = cgroup_cs(cont); 1330 struct cpuset *cs = cgroup_cs(cont);
1331 1331
1332 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1332 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1343,18 +1343,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1343 if (tsk->flags & PF_THREAD_BOUND) 1343 if (tsk->flags & PF_THREAD_BOUND)
1344 return -EINVAL; 1344 return -EINVAL;
1345 1345
1346 return security_task_setscheduler(tsk, 0, NULL); 1346 ret = security_task_setscheduler(tsk, 0, NULL);
1347 if (ret)
1348 return ret;
1349 if (threadgroup) {
1350 struct task_struct *c;
1351
1352 rcu_read_lock();
1353 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1354 ret = security_task_setscheduler(c, 0, NULL);
1355 if (ret) {
1356 rcu_read_unlock();
1357 return ret;
1358 }
1359 }
1360 rcu_read_unlock();
1361 }
1362 return 0;
1347} 1363}
1348 1364
1349static void cpuset_attach(struct cgroup_subsys *ss, 1365static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1350 struct cgroup *cont, struct cgroup *oldcont, 1366 struct cpuset *cs)
1351 struct task_struct *tsk) 1367{
1368 int err;
1369 /*
1370 * can_attach beforehand should guarantee that this doesn't fail.
1371 * TODO: have a better way to handle failure here
1372 */
1373 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1374 WARN_ON_ONCE(err);
1375
1376 task_lock(tsk);
1377 cpuset_change_task_nodemask(tsk, to);
1378 task_unlock(tsk);
1379 cpuset_update_task_spread_flag(cs, tsk);
1380
1381}
1382
1383static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1384 struct cgroup *oldcont, struct task_struct *tsk,
1385 bool threadgroup)
1352{ 1386{
1353 nodemask_t from, to; 1387 nodemask_t from, to;
1354 struct mm_struct *mm; 1388 struct mm_struct *mm;
1355 struct cpuset *cs = cgroup_cs(cont); 1389 struct cpuset *cs = cgroup_cs(cont);
1356 struct cpuset *oldcs = cgroup_cs(oldcont); 1390 struct cpuset *oldcs = cgroup_cs(oldcont);
1357 int err;
1358 1391
1359 if (cs == &top_cpuset) { 1392 if (cs == &top_cpuset) {
1360 cpumask_copy(cpus_attach, cpu_possible_mask); 1393 cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1363,15 +1396,19 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1363 guarantee_online_cpus(cs, cpus_attach); 1396 guarantee_online_cpus(cs, cpus_attach);
1364 guarantee_online_mems(cs, &to); 1397 guarantee_online_mems(cs, &to);
1365 } 1398 }
1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1367 if (err)
1368 return;
1369 1399
1370 task_lock(tsk); 1400 /* do per-task migration stuff possibly for each in the threadgroup */
1371 cpuset_change_task_nodemask(tsk, &to); 1401 cpuset_attach_task(tsk, &to, cs);
1372 task_unlock(tsk); 1402 if (threadgroup) {
1373 cpuset_update_task_spread_flag(cs, tsk); 1403 struct task_struct *c;
1404 rcu_read_lock();
1405 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1406 cpuset_attach_task(c, &to, cs);
1407 }
1408 rcu_read_unlock();
1409 }
1374 1410
1411 /* change mm; only needs to be done once even if threadgroup */
1375 from = oldcs->mems_allowed; 1412 from = oldcs->mems_allowed;
1376 to = cs->mems_allowed; 1413 to = cs->mems_allowed;
1377 mm = get_task_mm(tsk); 1414 mm = get_task_mm(tsk);
@@ -2014,7 +2051,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2014 unsigned long phase, void *unused_cpu) 2051 unsigned long phase, void *unused_cpu)
2015{ 2052{
2016 struct sched_domain_attr *attr; 2053 struct sched_domain_attr *attr;
2017 struct cpumask *doms; 2054 cpumask_var_t *doms;
2018 int ndoms; 2055 int ndoms;
2019 2056
2020 switch (phase) { 2057 switch (phase) {
@@ -2499,15 +2536,9 @@ const struct file_operations proc_cpuset_operations = {
2499}; 2536};
2500#endif /* CONFIG_PROC_PID_CPUSET */ 2537#endif /* CONFIG_PROC_PID_CPUSET */
2501 2538
2502/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ 2539/* Display task mems_allowed in /proc/<pid>/status file. */
2503void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2540void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2504{ 2541{
2505 seq_printf(m, "Cpus_allowed:\t");
2506 seq_cpumask(m, &task->cpus_allowed);
2507 seq_printf(m, "\n");
2508 seq_printf(m, "Cpus_allowed_list:\t");
2509 seq_cpumask_list(m, &task->cpus_allowed);
2510 seq_printf(m, "\n");
2511 seq_printf(m, "Mems_allowed:\t"); 2542 seq_printf(m, "Mems_allowed:\t");
2512 seq_nodemask(m, &task->mems_allowed); 2543 seq_nodemask(m, &task->mems_allowed);
2513 seq_printf(m, "\n"); 2544 seq_printf(m, "\n");
diff --git a/kernel/cred.c b/kernel/cred.c
index d7f7a01082eb..dd76cfe5f5b0 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -782,6 +782,25 @@ EXPORT_SYMBOL(set_create_files_as);
782 782
783#ifdef CONFIG_DEBUG_CREDENTIALS 783#ifdef CONFIG_DEBUG_CREDENTIALS
784 784
785bool creds_are_invalid(const struct cred *cred)
786{
787 if (cred->magic != CRED_MAGIC)
788 return true;
789 if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
790 return true;
791#ifdef CONFIG_SECURITY_SELINUX
792 if (selinux_is_enabled()) {
793 if ((unsigned long) cred->security < PAGE_SIZE)
794 return true;
795 if ((*(u32 *)cred->security & 0xffffff00) ==
796 (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
797 return true;
798 }
799#endif
800 return false;
801}
802EXPORT_SYMBOL(creds_are_invalid);
803
785/* 804/*
786 * dump invalid credentials 805 * dump invalid credentials
787 */ 806 */
diff --git a/kernel/exit.c b/kernel/exit.c
index 60d6fdcc9265..80ae941cfd2e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,6 +49,7 @@
49#include <linux/init_task.h> 49#include <linux/init_task.h>
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/unistd.h> 55#include <asm/unistd.h>
@@ -110,9 +111,9 @@ static void __exit_signal(struct task_struct *tsk)
110 * We won't ever get here for the group leader, since it 111 * We won't ever get here for the group leader, since it
111 * will have been the last reference on the signal_struct. 112 * will have been the last reference on the signal_struct.
112 */ 113 */
113 sig->utime = cputime_add(sig->utime, task_utime(tsk)); 114 sig->utime = cputime_add(sig->utime, tsk->utime);
114 sig->stime = cputime_add(sig->stime, task_stime(tsk)); 115 sig->stime = cputime_add(sig->stime, tsk->stime);
115 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); 116 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
116 sig->min_flt += tsk->min_flt; 117 sig->min_flt += tsk->min_flt;
117 sig->maj_flt += tsk->maj_flt; 118 sig->maj_flt += tsk->maj_flt;
118 sig->nvcsw += tsk->nvcsw; 119 sig->nvcsw += tsk->nvcsw;
@@ -359,10 +360,8 @@ void __set_special_pids(struct pid *pid)
359{ 360{
360 struct task_struct *curr = current->group_leader; 361 struct task_struct *curr = current->group_leader;
361 362
362 if (task_session(curr) != pid) { 363 if (task_session(curr) != pid)
363 change_pid(curr, PIDTYPE_SID, pid); 364 change_pid(curr, PIDTYPE_SID, pid);
364 proc_sid_connector(curr);
365 }
366 365
367 if (task_pgrp(curr) != pid) 366 if (task_pgrp(curr) != pid)
368 change_pid(curr, PIDTYPE_PGID, pid); 367 change_pid(curr, PIDTYPE_PGID, pid);
@@ -976,12 +975,14 @@ NORET_TYPE void do_exit(long code)
976 disassociate_ctty(1); 975 disassociate_ctty(1);
977 976
978 module_put(task_thread_info(tsk)->exec_domain->module); 977 module_put(task_thread_info(tsk)->exec_domain->module);
979 if (tsk->binfmt)
980 module_put(tsk->binfmt->module);
981 978
982 proc_exit_connector(tsk); 979 proc_exit_connector(tsk);
983 980
984 /* 981 /*
982 * FIXME: do that only when needed, using sched_exit tracepoint
983 */
984 flush_ptrace_hw_breakpoint(tsk);
985 /*
985 * Flush inherited counters to the parent - before the parent 986 * Flush inherited counters to the parent - before the parent
986 * gets woken up by child-exit notifications. 987 * gets woken up by child-exit notifications.
987 */ 988 */
@@ -993,8 +994,6 @@ NORET_TYPE void do_exit(long code)
993 tsk->mempolicy = NULL; 994 tsk->mempolicy = NULL;
994#endif 995#endif
995#ifdef CONFIG_FUTEX 996#ifdef CONFIG_FUTEX
996 if (unlikely(!list_empty(&tsk->pi_state_list)))
997 exit_pi_state_list(tsk);
998 if (unlikely(current->pi_state_cache)) 997 if (unlikely(current->pi_state_cache))
999 kfree(current->pi_state_cache); 998 kfree(current->pi_state_cache);
1000#endif 999#endif
@@ -1097,28 +1096,28 @@ struct wait_opts {
1097 int __user *wo_stat; 1096 int __user *wo_stat;
1098 struct rusage __user *wo_rusage; 1097 struct rusage __user *wo_rusage;
1099 1098
1099 wait_queue_t child_wait;
1100 int notask_error; 1100 int notask_error;
1101}; 1101};
1102 1102
1103static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1103static inline
1104struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1104{ 1105{
1105 struct pid *pid = NULL; 1106 if (type != PIDTYPE_PID)
1106 if (type == PIDTYPE_PID) 1107 task = task->group_leader;
1107 pid = task->pids[type].pid; 1108 return task->pids[type].pid;
1108 else if (type < PIDTYPE_MAX)
1109 pid = task->group_leader->pids[type].pid;
1110 return pid;
1111} 1109}
1112 1110
1113static int eligible_child(struct wait_opts *wo, struct task_struct *p) 1111static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1114{ 1112{
1115 int err; 1113 return wo->wo_type == PIDTYPE_MAX ||
1116 1114 task_pid_type(p, wo->wo_type) == wo->wo_pid;
1117 if (wo->wo_type < PIDTYPE_MAX) { 1115}
1118 if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
1119 return 0;
1120 }
1121 1116
1117static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1118{
1119 if (!eligible_pid(wo, p))
1120 return 0;
1122 /* Wait for all children (clone and not) if __WALL is set; 1121 /* Wait for all children (clone and not) if __WALL is set;
1123 * otherwise, wait for clone children *only* if __WCLONE is 1122 * otherwise, wait for clone children *only* if __WCLONE is
1124 * set; otherwise, wait for non-clone children *only*. (Note: 1123 * set; otherwise, wait for non-clone children *only*. (Note:
@@ -1128,10 +1127,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1128 && !(wo->wo_flags & __WALL)) 1127 && !(wo->wo_flags & __WALL))
1129 return 0; 1128 return 0;
1130 1129
1131 err = security_task_wait(p);
1132 if (err)
1133 return err;
1134
1135 return 1; 1130 return 1;
1136} 1131}
1137 1132
@@ -1144,18 +1139,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1144 1139
1145 put_task_struct(p); 1140 put_task_struct(p);
1146 infop = wo->wo_info; 1141 infop = wo->wo_info;
1147 if (!retval) 1142 if (infop) {
1148 retval = put_user(SIGCHLD, &infop->si_signo); 1143 if (!retval)
1149 if (!retval) 1144 retval = put_user(SIGCHLD, &infop->si_signo);
1150 retval = put_user(0, &infop->si_errno); 1145 if (!retval)
1151 if (!retval) 1146 retval = put_user(0, &infop->si_errno);
1152 retval = put_user((short)why, &infop->si_code); 1147 if (!retval)
1153 if (!retval) 1148 retval = put_user((short)why, &infop->si_code);
1154 retval = put_user(pid, &infop->si_pid); 1149 if (!retval)
1155 if (!retval) 1150 retval = put_user(pid, &infop->si_pid);
1156 retval = put_user(uid, &infop->si_uid); 1151 if (!retval)
1157 if (!retval) 1152 retval = put_user(uid, &infop->si_uid);
1158 retval = put_user(status, &infop->si_status); 1153 if (!retval)
1154 retval = put_user(status, &infop->si_status);
1155 }
1159 if (!retval) 1156 if (!retval)
1160 retval = pid; 1157 retval = pid;
1161 return retval; 1158 return retval;
@@ -1213,6 +1210,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1213 struct signal_struct *psig; 1210 struct signal_struct *psig;
1214 struct signal_struct *sig; 1211 struct signal_struct *sig;
1215 unsigned long maxrss; 1212 unsigned long maxrss;
1213 cputime_t tgutime, tgstime;
1216 1214
1217 /* 1215 /*
1218 * The resource counters for the group leader are in its 1216 * The resource counters for the group leader are in its
@@ -1228,20 +1226,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1228 * need to protect the access to parent->signal fields, 1226 * need to protect the access to parent->signal fields,
1229 * as other threads in the parent group can be right 1227 * as other threads in the parent group can be right
1230 * here reaping other children at the same time. 1228 * here reaping other children at the same time.
1229 *
1230 * We use thread_group_times() to get times for the thread
1231 * group, which consolidates times for all threads in the
1232 * group including the group leader.
1231 */ 1233 */
1234 thread_group_times(p, &tgutime, &tgstime);
1232 spin_lock_irq(&p->real_parent->sighand->siglock); 1235 spin_lock_irq(&p->real_parent->sighand->siglock);
1233 psig = p->real_parent->signal; 1236 psig = p->real_parent->signal;
1234 sig = p->signal; 1237 sig = p->signal;
1235 psig->cutime = 1238 psig->cutime =
1236 cputime_add(psig->cutime, 1239 cputime_add(psig->cutime,
1237 cputime_add(p->utime, 1240 cputime_add(tgutime,
1238 cputime_add(sig->utime, 1241 sig->cutime));
1239 sig->cutime)));
1240 psig->cstime = 1242 psig->cstime =
1241 cputime_add(psig->cstime, 1243 cputime_add(psig->cstime,
1242 cputime_add(p->stime, 1244 cputime_add(tgstime,
1243 cputime_add(sig->stime, 1245 sig->cstime));
1244 sig->cstime)));
1245 psig->cgtime = 1246 psig->cgtime =
1246 cputime_add(psig->cgtime, 1247 cputime_add(psig->cgtime,
1247 cputime_add(p->gtime, 1248 cputime_add(p->gtime,
@@ -1485,13 +1486,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1485 * then ->notask_error is 0 if @p is an eligible child, 1486 * then ->notask_error is 0 if @p is an eligible child,
1486 * or another error from security_task_wait(), or still -ECHILD. 1487 * or another error from security_task_wait(), or still -ECHILD.
1487 */ 1488 */
1488static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, 1489static int wait_consider_task(struct wait_opts *wo, int ptrace,
1489 int ptrace, struct task_struct *p) 1490 struct task_struct *p)
1490{ 1491{
1491 int ret = eligible_child(wo, p); 1492 int ret = eligible_child(wo, p);
1492 if (!ret) 1493 if (!ret)
1493 return ret; 1494 return ret;
1494 1495
1496 ret = security_task_wait(p);
1495 if (unlikely(ret < 0)) { 1497 if (unlikely(ret < 0)) {
1496 /* 1498 /*
1497 * If we have not yet seen any eligible child, 1499 * If we have not yet seen any eligible child,
@@ -1553,7 +1555,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1553 * Do not consider detached threads. 1555 * Do not consider detached threads.
1554 */ 1556 */
1555 if (!task_detached(p)) { 1557 if (!task_detached(p)) {
1556 int ret = wait_consider_task(wo, tsk, 0, p); 1558 int ret = wait_consider_task(wo, 0, p);
1557 if (ret) 1559 if (ret)
1558 return ret; 1560 return ret;
1559 } 1561 }
@@ -1567,7 +1569,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1567 struct task_struct *p; 1569 struct task_struct *p;
1568 1570
1569 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1571 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1570 int ret = wait_consider_task(wo, tsk, 1, p); 1572 int ret = wait_consider_task(wo, 1, p);
1571 if (ret) 1573 if (ret)
1572 return ret; 1574 return ret;
1573 } 1575 }
@@ -1575,15 +1577,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1575 return 0; 1577 return 0;
1576} 1578}
1577 1579
1580static int child_wait_callback(wait_queue_t *wait, unsigned mode,
1581 int sync, void *key)
1582{
1583 struct wait_opts *wo = container_of(wait, struct wait_opts,
1584 child_wait);
1585 struct task_struct *p = key;
1586
1587 if (!eligible_pid(wo, p))
1588 return 0;
1589
1590 if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1591 return 0;
1592
1593 return default_wake_function(wait, mode, sync, key);
1594}
1595
1596void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1597{
1598 __wake_up_sync_key(&parent->signal->wait_chldexit,
1599 TASK_INTERRUPTIBLE, 1, p);
1600}
1601
1578static long do_wait(struct wait_opts *wo) 1602static long do_wait(struct wait_opts *wo)
1579{ 1603{
1580 DECLARE_WAITQUEUE(wait, current);
1581 struct task_struct *tsk; 1604 struct task_struct *tsk;
1582 int retval; 1605 int retval;
1583 1606
1584 trace_sched_process_wait(wo->wo_pid); 1607 trace_sched_process_wait(wo->wo_pid);
1585 1608
1586 add_wait_queue(&current->signal->wait_chldexit,&wait); 1609 init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1610 wo->child_wait.private = current;
1611 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1587repeat: 1612repeat:
1588 /* 1613 /*
1589 * If there is nothing that can match our critiera just get out. 1614 * If there is nothing that can match our critiera just get out.
@@ -1624,32 +1649,7 @@ notask:
1624 } 1649 }
1625end: 1650end:
1626 __set_current_state(TASK_RUNNING); 1651 __set_current_state(TASK_RUNNING);
1627 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1652 remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1628 if (wo->wo_info) {
1629 struct siginfo __user *infop = wo->wo_info;
1630
1631 if (retval > 0)
1632 retval = 0;
1633 else {
1634 /*
1635 * For a WNOHANG return, clear out all the fields
1636 * we would set so the user can easily tell the
1637 * difference.
1638 */
1639 if (!retval)
1640 retval = put_user(0, &infop->si_signo);
1641 if (!retval)
1642 retval = put_user(0, &infop->si_errno);
1643 if (!retval)
1644 retval = put_user(0, &infop->si_code);
1645 if (!retval)
1646 retval = put_user(0, &infop->si_pid);
1647 if (!retval)
1648 retval = put_user(0, &infop->si_uid);
1649 if (!retval)
1650 retval = put_user(0, &infop->si_status);
1651 }
1652 }
1653 return retval; 1653 return retval;
1654} 1654}
1655 1655
@@ -1694,6 +1694,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1694 wo.wo_stat = NULL; 1694 wo.wo_stat = NULL;
1695 wo.wo_rusage = ru; 1695 wo.wo_rusage = ru;
1696 ret = do_wait(&wo); 1696 ret = do_wait(&wo);
1697
1698 if (ret > 0) {
1699 ret = 0;
1700 } else if (infop) {
1701 /*
1702 * For a WNOHANG return, clear out all the fields
1703 * we would set so the user can easily tell the
1704 * difference.
1705 */
1706 if (!ret)
1707 ret = put_user(0, &infop->si_signo);
1708 if (!ret)
1709 ret = put_user(0, &infop->si_errno);
1710 if (!ret)
1711 ret = put_user(0, &infop->si_code);
1712 if (!ret)
1713 ret = put_user(0, &infop->si_pid);
1714 if (!ret)
1715 ret = put_user(0, &infop->si_uid);
1716 if (!ret)
1717 ret = put_user(0, &infop->si_status);
1718 }
1719
1697 put_pid(pid); 1720 put_pid(pid);
1698 1721
1699 /* avoid REGPARM breakage on x86: */ 1722 /* avoid REGPARM breakage on x86: */
diff --git a/kernel/fork.c b/kernel/fork.c
index 51ad0b0b7266..3d6f121bbe8a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -91,7 +91,7 @@ int nr_processes(void)
91 int cpu; 91 int cpu;
92 int total = 0; 92 int total = 0;
93 93
94 for_each_online_cpu(cpu) 94 for_each_possible_cpu(cpu)
95 total += per_cpu(process_counts, cpu); 95 total += per_cpu(process_counts, cpu);
96 96
97 return total; 97 return total;
@@ -434,6 +434,14 @@ __setup("coredump_filter=", coredump_filter_setup);
434 434
435#include <linux/init_task.h> 435#include <linux/init_task.h>
436 436
437static void mm_init_aio(struct mm_struct *mm)
438{
439#ifdef CONFIG_AIO
440 spin_lock_init(&mm->ioctx_lock);
441 INIT_HLIST_HEAD(&mm->ioctx_list);
442#endif
443}
444
437static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 445static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
438{ 446{
439 atomic_set(&mm->mm_users, 1); 447 atomic_set(&mm->mm_users, 1);
@@ -447,10 +455,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
447 set_mm_counter(mm, file_rss, 0); 455 set_mm_counter(mm, file_rss, 0);
448 set_mm_counter(mm, anon_rss, 0); 456 set_mm_counter(mm, anon_rss, 0);
449 spin_lock_init(&mm->page_table_lock); 457 spin_lock_init(&mm->page_table_lock);
450 spin_lock_init(&mm->ioctx_lock);
451 INIT_HLIST_HEAD(&mm->ioctx_list);
452 mm->free_area_cache = TASK_UNMAPPED_BASE; 458 mm->free_area_cache = TASK_UNMAPPED_BASE;
453 mm->cached_hole_size = ~0UL; 459 mm->cached_hole_size = ~0UL;
460 mm_init_aio(mm);
454 mm_init_owner(mm, p); 461 mm_init_owner(mm, p);
455 462
456 if (likely(!mm_alloc_pgd(mm))) { 463 if (likely(!mm_alloc_pgd(mm))) {
@@ -511,6 +518,8 @@ void mmput(struct mm_struct *mm)
511 spin_unlock(&mmlist_lock); 518 spin_unlock(&mmlist_lock);
512 } 519 }
513 put_swap_token(mm); 520 put_swap_token(mm);
521 if (mm->binfmt)
522 module_put(mm->binfmt->module);
514 mmdrop(mm); 523 mmdrop(mm);
515 } 524 }
516} 525}
@@ -561,12 +570,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
561 570
562 /* Get rid of any futexes when releasing the mm */ 571 /* Get rid of any futexes when releasing the mm */
563#ifdef CONFIG_FUTEX 572#ifdef CONFIG_FUTEX
564 if (unlikely(tsk->robust_list)) 573 if (unlikely(tsk->robust_list)) {
565 exit_robust_list(tsk); 574 exit_robust_list(tsk);
575 tsk->robust_list = NULL;
576 }
566#ifdef CONFIG_COMPAT 577#ifdef CONFIG_COMPAT
567 if (unlikely(tsk->compat_robust_list)) 578 if (unlikely(tsk->compat_robust_list)) {
568 compat_exit_robust_list(tsk); 579 compat_exit_robust_list(tsk);
580 tsk->compat_robust_list = NULL;
581 }
569#endif 582#endif
583 if (unlikely(!list_empty(&tsk->pi_state_list)))
584 exit_pi_state_list(tsk);
570#endif 585#endif
571 586
572 /* Get rid of any cached register state */ 587 /* Get rid of any cached register state */
@@ -636,9 +651,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
636 mm->hiwater_rss = get_mm_rss(mm); 651 mm->hiwater_rss = get_mm_rss(mm);
637 mm->hiwater_vm = mm->total_vm; 652 mm->hiwater_vm = mm->total_vm;
638 653
654 if (mm->binfmt && !try_module_get(mm->binfmt->module))
655 goto free_pt;
656
639 return mm; 657 return mm;
640 658
641free_pt: 659free_pt:
660 /* don't put binfmt in mmput, we haven't got module yet */
661 mm->binfmt = NULL;
642 mmput(mm); 662 mmput(mm);
643 663
644fail_nomem: 664fail_nomem:
@@ -864,6 +884,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
864 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 884 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
865 sig->gtime = cputime_zero; 885 sig->gtime = cputime_zero;
866 sig->cgtime = cputime_zero; 886 sig->cgtime = cputime_zero;
887#ifndef CONFIG_VIRT_CPU_ACCOUNTING
888 sig->prev_utime = sig->prev_stime = cputime_zero;
889#endif
867 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 890 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
868 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 891 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
869 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 892 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
@@ -979,6 +1002,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
979 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) 1002 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
980 return ERR_PTR(-EINVAL); 1003 return ERR_PTR(-EINVAL);
981 1004
1005 /*
1006 * Siblings of global init remain as zombies on exit since they are
1007 * not reaped by their parent (swapper). To solve this and to avoid
1008 * multi-rooted process trees, prevent global and container-inits
1009 * from creating siblings.
1010 */
1011 if ((clone_flags & CLONE_PARENT) &&
1012 current->signal->flags & SIGNAL_UNKILLABLE)
1013 return ERR_PTR(-EINVAL);
1014
982 retval = security_task_create(clone_flags); 1015 retval = security_task_create(clone_flags);
983 if (retval) 1016 if (retval)
984 goto fork_out; 1017 goto fork_out;
@@ -1020,9 +1053,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1020 if (!try_module_get(task_thread_info(p)->exec_domain->module)) 1053 if (!try_module_get(task_thread_info(p)->exec_domain->module))
1021 goto bad_fork_cleanup_count; 1054 goto bad_fork_cleanup_count;
1022 1055
1023 if (p->binfmt && !try_module_get(p->binfmt->module))
1024 goto bad_fork_cleanup_put_domain;
1025
1026 p->did_exec = 0; 1056 p->did_exec = 0;
1027 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1057 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1028 copy_flags(clone_flags, p); 1058 copy_flags(clone_flags, p);
@@ -1039,8 +1069,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1039 p->gtime = cputime_zero; 1069 p->gtime = cputime_zero;
1040 p->utimescaled = cputime_zero; 1070 p->utimescaled = cputime_zero;
1041 p->stimescaled = cputime_zero; 1071 p->stimescaled = cputime_zero;
1072#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1042 p->prev_utime = cputime_zero; 1073 p->prev_utime = cputime_zero;
1043 p->prev_stime = cputime_zero; 1074 p->prev_stime = cputime_zero;
1075#endif
1044 1076
1045 p->default_timer_slack_ns = current->timer_slack_ns; 1077 p->default_timer_slack_ns = current->timer_slack_ns;
1046 1078
@@ -1310,9 +1342,6 @@ bad_fork_cleanup_cgroup:
1310#endif 1342#endif
1311 cgroup_exit(p, cgroup_callbacks_done); 1343 cgroup_exit(p, cgroup_callbacks_done);
1312 delayacct_tsk_free(p); 1344 delayacct_tsk_free(p);
1313 if (p->binfmt)
1314 module_put(p->binfmt->module);
1315bad_fork_cleanup_put_domain:
1316 module_put(task_thread_info(p)->exec_domain->module); 1345 module_put(task_thread_info(p)->exec_domain->module);
1317bad_fork_cleanup_count: 1346bad_fork_cleanup_count:
1318 atomic_dec(&p->cred->user->processes); 1347 atomic_dec(&p->cred->user->processes);
diff --git a/kernel/futex.c b/kernel/futex.c
index 248dd119a86e..fb65e822fc41 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -89,36 +89,36 @@ struct futex_pi_state {
89 union futex_key key; 89 union futex_key key;
90}; 90};
91 91
92/* 92/**
93 * We use this hashed waitqueue instead of a normal wait_queue_t, so 93 * struct futex_q - The hashed futex queue entry, one per waiting task
94 * @task: the task waiting on the futex
95 * @lock_ptr: the hash bucket lock
96 * @key: the key the futex is hashed on
97 * @pi_state: optional priority inheritance state
98 * @rt_waiter: rt_waiter storage for use with requeue_pi
99 * @requeue_pi_key: the requeue_pi target futex key
100 * @bitset: bitset for the optional bitmasked wakeup
101 *
102 * We use this hashed waitqueue, instead of a normal wait_queue_t, so
94 * we can wake only the relevant ones (hashed queues may be shared). 103 * we can wake only the relevant ones (hashed queues may be shared).
95 * 104 *
96 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 105 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
97 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 106 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
98 * The order of wakup is always to make the first condition true, then 107 * The order of wakup is always to make the first condition true, then
99 * wake up q->waiter, then make the second condition true. 108 * the second.
109 *
110 * PI futexes are typically woken before they are removed from the hash list via
111 * the rt_mutex code. See unqueue_me_pi().
100 */ 112 */
101struct futex_q { 113struct futex_q {
102 struct plist_node list; 114 struct plist_node list;
103 /* Waiter reference */
104 struct task_struct *task;
105 115
106 /* Which hash list lock to use: */ 116 struct task_struct *task;
107 spinlock_t *lock_ptr; 117 spinlock_t *lock_ptr;
108
109 /* Key which the futex is hashed on: */
110 union futex_key key; 118 union futex_key key;
111
112 /* Optional priority inheritance state: */
113 struct futex_pi_state *pi_state; 119 struct futex_pi_state *pi_state;
114
115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter; 120 struct rt_mutex_waiter *rt_waiter;
117
118 /* The expected requeue pi target futex key: */
119 union futex_key *requeue_pi_key; 121 union futex_key *requeue_pi_key;
120
121 /* Bitset for the optional bitmasked wakeup */
122 u32 bitset; 122 u32 bitset;
123}; 123};
124 124
@@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
150 */ 150 */
151static inline int match_futex(union futex_key *key1, union futex_key *key2) 151static inline int match_futex(union futex_key *key1, union futex_key *key2)
152{ 152{
153 return (key1->both.word == key2->both.word 153 return (key1 && key2
154 && key1->both.word == key2->both.word
154 && key1->both.ptr == key2->both.ptr 155 && key1->both.ptr == key2->both.ptr
155 && key1->both.offset == key2->both.offset); 156 && key1->both.offset == key2->both.offset);
156} 157}
@@ -198,11 +199,12 @@ static void drop_futex_key_refs(union futex_key *key)
198} 199}
199 200
200/** 201/**
201 * get_futex_key - Get parameters which are the keys for a futex. 202 * get_futex_key() - Get parameters which are the keys for a futex
202 * @uaddr: virtual address of the futex 203 * @uaddr: virtual address of the futex
203 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
204 * @key: address where result is stored. 205 * @key: address where result is stored.
205 * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE) 206 * @rw: mapping needs to be read/write (values: VERIFY_READ,
207 * VERIFY_WRITE)
206 * 208 *
207 * Returns a negative error code or 0 209 * Returns a negative error code or 0
208 * The key words are stored in *key on success. 210 * The key words are stored in *key on success.
@@ -288,8 +290,8 @@ void put_futex_key(int fshared, union futex_key *key)
288 drop_futex_key_refs(key); 290 drop_futex_key_refs(key);
289} 291}
290 292
291/* 293/**
292 * fault_in_user_writeable - fault in user address and verify RW access 294 * fault_in_user_writeable() - Fault in user address and verify RW access
293 * @uaddr: pointer to faulting user space address 295 * @uaddr: pointer to faulting user space address
294 * 296 *
295 * Slow path to fixup the fault we just took in the atomic write 297 * Slow path to fixup the fault we just took in the atomic write
@@ -309,8 +311,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
309 311
310/** 312/**
311 * futex_top_waiter() - Return the highest priority waiter on a futex 313 * futex_top_waiter() - Return the highest priority waiter on a futex
312 * @hb: the hash bucket the futex_q's reside in 314 * @hb: the hash bucket the futex_q's reside in
313 * @key: the futex key (to distinguish it from other futex futex_q's) 315 * @key: the futex key (to distinguish it from other futex futex_q's)
314 * 316 *
315 * Must be called with the hb lock held. 317 * Must be called with the hb lock held.
316 */ 318 */
@@ -588,7 +590,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
588} 590}
589 591
590/** 592/**
591 * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex 593 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
592 * @uaddr: the pi futex user address 594 * @uaddr: the pi futex user address
593 * @hb: the pi futex hash bucket 595 * @hb: the pi futex hash bucket
594 * @key: the futex key associated with uaddr and hb 596 * @key: the futex key associated with uaddr and hb
@@ -915,8 +917,8 @@ retry:
915 hb1 = hash_futex(&key1); 917 hb1 = hash_futex(&key1);
916 hb2 = hash_futex(&key2); 918 hb2 = hash_futex(&key2);
917 919
918 double_lock_hb(hb1, hb2);
919retry_private: 920retry_private:
921 double_lock_hb(hb1, hb2);
920 op_ret = futex_atomic_op_inuser(op, uaddr2); 922 op_ret = futex_atomic_op_inuser(op, uaddr2);
921 if (unlikely(op_ret < 0)) { 923 if (unlikely(op_ret < 0)) {
922 924
@@ -1011,9 +1013,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1011 1013
1012/** 1014/**
1013 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1015 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1014 * q: the futex_q 1016 * @q: the futex_q
1015 * key: the key of the requeue target futex 1017 * @key: the key of the requeue target futex
1016 * hb: the hash_bucket of the requeue target futex 1018 * @hb: the hash_bucket of the requeue target futex
1017 * 1019 *
1018 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1020 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1019 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1021 * target futex if it is uncontended or via a lock steal. Set the futex_q key
@@ -1027,7 +1029,6 @@ static inline
1027void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, 1029void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1028 struct futex_hash_bucket *hb) 1030 struct futex_hash_bucket *hb)
1029{ 1031{
1030 drop_futex_key_refs(&q->key);
1031 get_futex_key_refs(key); 1032 get_futex_key_refs(key);
1032 q->key = *key; 1033 q->key = *key;
1033 1034
@@ -1225,6 +1226,7 @@ retry_private:
1225 */ 1226 */
1226 if (ret == 1) { 1227 if (ret == 1) {
1227 WARN_ON(pi_state); 1228 WARN_ON(pi_state);
1229 drop_count++;
1228 task_count++; 1230 task_count++;
1229 ret = get_futex_value_locked(&curval2, uaddr2); 1231 ret = get_futex_value_locked(&curval2, uaddr2);
1230 if (!ret) 1232 if (!ret)
@@ -1303,6 +1305,7 @@ retry_private:
1303 if (ret == 1) { 1305 if (ret == 1) {
1304 /* We got the lock. */ 1306 /* We got the lock. */
1305 requeue_pi_wake_futex(this, &key2, hb2); 1307 requeue_pi_wake_futex(this, &key2, hb2);
1308 drop_count++;
1306 continue; 1309 continue;
1307 } else if (ret) { 1310 } else if (ret) {
1308 /* -EDEADLK */ 1311 /* -EDEADLK */
@@ -1350,6 +1353,25 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1350 return hb; 1353 return hb;
1351} 1354}
1352 1355
1356static inline void
1357queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1358{
1359 spin_unlock(&hb->lock);
1360 drop_futex_key_refs(&q->key);
1361}
1362
1363/**
1364 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1365 * @q: The futex_q to enqueue
1366 * @hb: The destination hash bucket
1367 *
1368 * The hb->lock must be held by the caller, and is released here. A call to
1369 * queue_me() is typically paired with exactly one call to unqueue_me(). The
1370 * exceptions involve the PI related operations, which may use unqueue_me_pi()
1371 * or nothing if the unqueue is done as part of the wake process and the unqueue
1372 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
1373 * an example).
1374 */
1353static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1375static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1354{ 1376{
1355 int prio; 1377 int prio;
@@ -1373,19 +1395,17 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1373 spin_unlock(&hb->lock); 1395 spin_unlock(&hb->lock);
1374} 1396}
1375 1397
1376static inline void 1398/**
1377queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1399 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1378{ 1400 * @q: The futex_q to unqueue
1379 spin_unlock(&hb->lock); 1401 *
1380 drop_futex_key_refs(&q->key); 1402 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1381} 1403 * be paired with exactly one earlier call to queue_me().
1382 1404 *
1383/* 1405 * Returns:
1384 * queue_me and unqueue_me must be called as a pair, each 1406 * 1 - if the futex_q was still queued (and we removed unqueued it)
1385 * exactly once. They are called with the hashed spinlock held. 1407 * 0 - if the futex_q was already removed by the waking thread
1386 */ 1408 */
1387
1388/* Return 1 if we were still queued (ie. 0 means we were woken) */
1389static int unqueue_me(struct futex_q *q) 1409static int unqueue_me(struct futex_q *q)
1390{ 1410{
1391 spinlock_t *lock_ptr; 1411 spinlock_t *lock_ptr;
@@ -1638,17 +1658,14 @@ out:
1638static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, 1658static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1639 struct hrtimer_sleeper *timeout) 1659 struct hrtimer_sleeper *timeout)
1640{ 1660{
1641 queue_me(q, hb);
1642
1643 /* 1661 /*
1644 * There might have been scheduling since the queue_me(), as we 1662 * The task state is guaranteed to be set before another task can
1645 * cannot hold a spinlock across the get_user() in case it 1663 * wake it. set_current_state() is implemented using set_mb() and
1646 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1664 * queue_me() calls spin_unlock() upon completion, both serializing
1647 * queueing ourselves into the futex hash. This code thus has to 1665 * access to the hash list and forcing another memory barrier.
1648 * rely on the futex_wake() code removing us from hash when it
1649 * wakes us up.
1650 */ 1666 */
1651 set_current_state(TASK_INTERRUPTIBLE); 1667 set_current_state(TASK_INTERRUPTIBLE);
1668 queue_me(q, hb);
1652 1669
1653 /* Arm the timer */ 1670 /* Arm the timer */
1654 if (timeout) { 1671 if (timeout) {
@@ -1658,8 +1675,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1658 } 1675 }
1659 1676
1660 /* 1677 /*
1661 * !plist_node_empty() is safe here without any lock. 1678 * If we have been removed from the hash list, then another task
1662 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1679 * has tried to wake us, and we can skip the call to schedule().
1663 */ 1680 */
1664 if (likely(!plist_node_empty(&q->list))) { 1681 if (likely(!plist_node_empty(&q->list))) {
1665 /* 1682 /*
@@ -1776,6 +1793,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1776 current->timer_slack_ns); 1793 current->timer_slack_ns);
1777 } 1794 }
1778 1795
1796retry:
1779 /* Prepare to wait on uaddr. */ 1797 /* Prepare to wait on uaddr. */
1780 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1798 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1781 if (ret) 1799 if (ret)
@@ -1793,9 +1811,14 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1793 goto out_put_key; 1811 goto out_put_key;
1794 1812
1795 /* 1813 /*
1796 * We expect signal_pending(current), but another thread may 1814 * We expect signal_pending(current), but we might be the
1797 * have handled it for us already. 1815 * victim of a spurious wakeup as well.
1798 */ 1816 */
1817 if (!signal_pending(current)) {
1818 put_futex_key(fshared, &q.key);
1819 goto retry;
1820 }
1821
1799 ret = -ERESTARTSYS; 1822 ret = -ERESTARTSYS;
1800 if (!abs_time) 1823 if (!abs_time)
1801 goto out_put_key; 1824 goto out_put_key;
@@ -2102,11 +2125,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2102 * Unqueue the futex_q and determine which it was. 2125 * Unqueue the futex_q and determine which it was.
2103 */ 2126 */
2104 plist_del(&q->list, &q->list.plist); 2127 plist_del(&q->list, &q->list.plist);
2105 drop_futex_key_refs(&q->key);
2106 2128
2129 /* Handle spurious wakeups gracefully */
2130 ret = -EWOULDBLOCK;
2107 if (timeout && !timeout->task) 2131 if (timeout && !timeout->task)
2108 ret = -ETIMEDOUT; 2132 ret = -ETIMEDOUT;
2109 else 2133 else if (signal_pending(current))
2110 ret = -ERESTARTNOINTR; 2134 ret = -ERESTARTNOINTR;
2111 } 2135 }
2112 return ret; 2136 return ret;
@@ -2114,12 +2138,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2114 2138
2115/** 2139/**
2116 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2140 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2117 * @uaddr: the futex we initialyl wait on (non-pi) 2141 * @uaddr: the futex we initially wait on (non-pi)
2118 * @fshared: whether the futexes are shared (1) or not (0). They must be 2142 * @fshared: whether the futexes are shared (1) or not (0). They must be
2119 * the same type, no requeueing from private to shared, etc. 2143 * the same type, no requeueing from private to shared, etc.
2120 * @val: the expected value of uaddr 2144 * @val: the expected value of uaddr
2121 * @abs_time: absolute timeout 2145 * @abs_time: absolute timeout
2122 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. 2146 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
2123 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) 2147 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2124 * @uaddr2: the pi futex we will take prior to returning to user-space 2148 * @uaddr2: the pi futex we will take prior to returning to user-space
2125 * 2149 *
@@ -2246,7 +2270,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2246 res = fixup_owner(uaddr2, fshared, &q, !ret); 2270 res = fixup_owner(uaddr2, fshared, &q, !ret);
2247 /* 2271 /*
2248 * If fixup_owner() returned an error, proprogate that. If it 2272 * If fixup_owner() returned an error, proprogate that. If it
2249 * acquired the lock, clear our -ETIMEDOUT or -EINTR. 2273 * acquired the lock, clear -ETIMEDOUT or -EINTR.
2250 */ 2274 */
2251 if (res) 2275 if (res)
2252 ret = (res < 0) ? res : 0; 2276 ret = (res < 0) ? res : 0;
@@ -2302,9 +2326,9 @@ out:
2302 */ 2326 */
2303 2327
2304/** 2328/**
2305 * sys_set_robust_list - set the robust-futex list head of a task 2329 * sys_set_robust_list() - Set the robust-futex list head of a task
2306 * @head: pointer to the list-head 2330 * @head: pointer to the list-head
2307 * @len: length of the list-head, as userspace expects 2331 * @len: length of the list-head, as userspace expects
2308 */ 2332 */
2309SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, 2333SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2310 size_t, len) 2334 size_t, len)
@@ -2323,10 +2347,10 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2323} 2347}
2324 2348
2325/** 2349/**
2326 * sys_get_robust_list - get the robust-futex list head of a task 2350 * sys_get_robust_list() - Get the robust-futex list head of a task
2327 * @pid: pid of the process [zero for current task] 2351 * @pid: pid of the process [zero for current task]
2328 * @head_ptr: pointer to a list-head pointer, the kernel fills it in 2352 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
2329 * @len_ptr: pointer to a length field, the kernel fills in the header size 2353 * @len_ptr: pointer to a length field, the kernel fills in the header size
2330 */ 2354 */
2331SYSCALL_DEFINE3(get_robust_list, int, pid, 2355SYSCALL_DEFINE3(get_robust_list, int, pid,
2332 struct robust_list_head __user * __user *, head_ptr, 2356 struct robust_list_head __user * __user *, head_ptr,
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 654efd09f6a9..70a298d6da71 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
34config GCOV_PROFILE_ALL 34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel" 35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL 36 depends on GCOV_KERNEL
37 depends on S390 || X86 || (PPC && EXPERIMENTAL) 37 depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
38 default n 38 default n
39 ---help--- 39 ---help---
40 This options activates profiling for the entire kernel. 40 This options activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index e5d98ce50f89..3e1c36e7998f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -509,13 +509,14 @@ static inline int hrtimer_hres_active(void)
509 * next event 509 * next event
510 * Called with interrupts disabled and base->lock held 510 * Called with interrupts disabled and base->lock held
511 */ 511 */
512static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) 512static void
513hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
513{ 514{
514 int i; 515 int i;
515 struct hrtimer_clock_base *base = cpu_base->clock_base; 516 struct hrtimer_clock_base *base = cpu_base->clock_base;
516 ktime_t expires; 517 ktime_t expires, expires_next;
517 518
518 cpu_base->expires_next.tv64 = KTIME_MAX; 519 expires_next.tv64 = KTIME_MAX;
519 520
520 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 521 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
521 struct hrtimer *timer; 522 struct hrtimer *timer;
@@ -531,10 +532,15 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
531 */ 532 */
532 if (expires.tv64 < 0) 533 if (expires.tv64 < 0)
533 expires.tv64 = 0; 534 expires.tv64 = 0;
534 if (expires.tv64 < cpu_base->expires_next.tv64) 535 if (expires.tv64 < expires_next.tv64)
535 cpu_base->expires_next = expires; 536 expires_next = expires;
536 } 537 }
537 538
539 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
540 return;
541
542 cpu_base->expires_next.tv64 = expires_next.tv64;
543
538 if (cpu_base->expires_next.tv64 != KTIME_MAX) 544 if (cpu_base->expires_next.tv64 != KTIME_MAX)
539 tick_program_event(cpu_base->expires_next, 1); 545 tick_program_event(cpu_base->expires_next, 1);
540} 546}
@@ -617,7 +623,7 @@ static void retrigger_next_event(void *arg)
617 base->clock_base[CLOCK_REALTIME].offset = 623 base->clock_base[CLOCK_REALTIME].offset =
618 timespec_to_ktime(realtime_offset); 624 timespec_to_ktime(realtime_offset);
619 625
620 hrtimer_force_reprogram(base); 626 hrtimer_force_reprogram(base, 0);
621 spin_unlock(&base->lock); 627 spin_unlock(&base->lock);
622} 628}
623 629
@@ -720,8 +726,6 @@ static int hrtimer_switch_to_hres(void)
720 /* "Retrigger" the interrupt to get things going */ 726 /* "Retrigger" the interrupt to get things going */
721 retrigger_next_event(NULL); 727 retrigger_next_event(NULL);
722 local_irq_restore(flags); 728 local_irq_restore(flags);
723 printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n",
724 smp_processor_id());
725 return 1; 729 return 1;
726} 730}
727 731
@@ -730,7 +734,8 @@ static int hrtimer_switch_to_hres(void)
730static inline int hrtimer_hres_active(void) { return 0; } 734static inline int hrtimer_hres_active(void) { return 0; }
731static inline int hrtimer_is_hres_enabled(void) { return 0; } 735static inline int hrtimer_is_hres_enabled(void) { return 0; }
732static inline int hrtimer_switch_to_hres(void) { return 0; } 736static inline int hrtimer_switch_to_hres(void) { return 0; }
733static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } 737static inline void
738hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
734static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 739static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
735 struct hrtimer_clock_base *base, 740 struct hrtimer_clock_base *base,
736 int wakeup) 741 int wakeup)
@@ -873,19 +878,29 @@ static void __remove_hrtimer(struct hrtimer *timer,
873 struct hrtimer_clock_base *base, 878 struct hrtimer_clock_base *base,
874 unsigned long newstate, int reprogram) 879 unsigned long newstate, int reprogram)
875{ 880{
876 if (timer->state & HRTIMER_STATE_ENQUEUED) { 881 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
877 /* 882 goto out;
878 * Remove the timer from the rbtree and replace the 883
879 * first entry pointer if necessary. 884 /*
880 */ 885 * Remove the timer from the rbtree and replace the first
881 if (base->first == &timer->node) { 886 * entry pointer if necessary.
882 base->first = rb_next(&timer->node); 887 */
883 /* Reprogram the clock event device. if enabled */ 888 if (base->first == &timer->node) {
884 if (reprogram && hrtimer_hres_active()) 889 base->first = rb_next(&timer->node);
885 hrtimer_force_reprogram(base->cpu_base); 890#ifdef CONFIG_HIGH_RES_TIMERS
891 /* Reprogram the clock event device. if enabled */
892 if (reprogram && hrtimer_hres_active()) {
893 ktime_t expires;
894
895 expires = ktime_sub(hrtimer_get_expires(timer),
896 base->offset);
897 if (base->cpu_base->expires_next.tv64 == expires.tv64)
898 hrtimer_force_reprogram(base->cpu_base, 1);
886 } 899 }
887 rb_erase(&timer->node, &base->active); 900#endif
888 } 901 }
902 rb_erase(&timer->node, &base->active);
903out:
889 timer->state = newstate; 904 timer->state = newstate;
890} 905}
891 906
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 022a4927b785..0c642d51aac2 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -144,7 +144,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
144 144
145 rcu_read_lock(); 145 rcu_read_lock();
146 do_each_thread(g, t) { 146 do_each_thread(g, t) {
147 if (!--max_count) 147 if (!max_count--)
148 goto unlock; 148 goto unlock;
149 if (!--batch_count) { 149 if (!--batch_count) {
150 batch_count = HUNG_TASK_BATCHING; 150 batch_count = HUNG_TASK_BATCHING;
@@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout)
171 * Process updating of timeout sysctl 171 * Process updating of timeout sysctl
172 */ 172 */
173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, 173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
174 struct file *filp, void __user *buffer, 174 void __user *buffer,
175 size_t *lenp, loff_t *ppos) 175 size_t *lenp, loff_t *ppos)
176{ 176{
177 int ret; 177 int ret;
178 178
179 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 179 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
180 180
181 if (ret || !write) 181 if (ret || !write)
182 goto out; 182 goto out;
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..cf5ee1628411
--- /dev/null
+++ b/kernel/hw_breakpoint.c
@@ -0,0 +1,423 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) IBM Corporation, 2009
18 * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Thanks to Ingo Molnar for his many suggestions.
21 *
22 * Authors: Alan Stern <stern@rowland.harvard.edu>
23 * K.Prasad <prasad@linux.vnet.ibm.com>
24 * Frederic Weisbecker <fweisbec@gmail.com>
25 */
26
27/*
28 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
29 * using the CPU's debug registers.
30 * This file contains the arch-independent routines.
31 */
32
33#include <linux/irqflags.h>
34#include <linux/kallsyms.h>
35#include <linux/notifier.h>
36#include <linux/kprobes.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/percpu.h>
41#include <linux/sched.h>
42#include <linux/init.h>
43#include <linux/smp.h>
44
45#include <linux/hw_breakpoint.h>
46
47/*
48 * Constraints data
49 */
50
51/* Number of pinned cpu breakpoints in a cpu */
52static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
53
54/* Number of pinned task breakpoints in a cpu */
55static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]);
56
57/* Number of non-pinned cpu/task breakpoints in a cpu */
58static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
59
60/* Gather the number of total pinned and un-pinned bp in a cpuset */
61struct bp_busy_slots {
62 unsigned int pinned;
63 unsigned int flexible;
64};
65
66/* Serialize accesses to the above constraints */
67static DEFINE_MUTEX(nr_bp_mutex);
68
69/*
70 * Report the maximum number of pinned breakpoints a task
71 * have in this cpu
72 */
73static unsigned int max_task_bp_pinned(int cpu)
74{
75 int i;
76 unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu);
77
78 for (i = HBP_NUM -1; i >= 0; i--) {
79 if (tsk_pinned[i] > 0)
80 return i + 1;
81 }
82
83 return 0;
84}
85
86/*
87 * Report the number of pinned/un-pinned breakpoints we have in
88 * a given cpu (cpu > -1) or in all of them (cpu = -1).
89 */
90static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
91{
92 if (cpu >= 0) {
93 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
94 slots->pinned += max_task_bp_pinned(cpu);
95 slots->flexible = per_cpu(nr_bp_flexible, cpu);
96
97 return;
98 }
99
100 for_each_online_cpu(cpu) {
101 unsigned int nr;
102
103 nr = per_cpu(nr_cpu_bp_pinned, cpu);
104 nr += max_task_bp_pinned(cpu);
105
106 if (nr > slots->pinned)
107 slots->pinned = nr;
108
109 nr = per_cpu(nr_bp_flexible, cpu);
110
111 if (nr > slots->flexible)
112 slots->flexible = nr;
113 }
114}
115
116/*
117 * Add a pinned breakpoint for the given task in our constraint table
118 */
119static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
120{
121 int count = 0;
122 struct perf_event *bp;
123 struct perf_event_context *ctx = tsk->perf_event_ctxp;
124 unsigned int *tsk_pinned;
125 struct list_head *list;
126 unsigned long flags;
127
128 if (WARN_ONCE(!ctx, "No perf context for this task"))
129 return;
130
131 list = &ctx->event_list;
132
133 spin_lock_irqsave(&ctx->lock, flags);
134
135 /*
136 * The current breakpoint counter is not included in the list
137 * at the open() callback time
138 */
139 list_for_each_entry(bp, list, event_entry) {
140 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
141 count++;
142 }
143
144 spin_unlock_irqrestore(&ctx->lock, flags);
145
146 if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list"))
147 return;
148
149 tsk_pinned = per_cpu(task_bp_pinned, cpu);
150 if (enable) {
151 tsk_pinned[count]++;
152 if (count > 0)
153 tsk_pinned[count-1]--;
154 } else {
155 tsk_pinned[count]--;
156 if (count > 0)
157 tsk_pinned[count-1]++;
158 }
159}
160
161/*
162 * Add/remove the given breakpoint in our constraint table
163 */
164static void toggle_bp_slot(struct perf_event *bp, bool enable)
165{
166 int cpu = bp->cpu;
167 struct task_struct *tsk = bp->ctx->task;
168
169 /* Pinned counter task profiling */
170 if (tsk) {
171 if (cpu >= 0) {
172 toggle_bp_task_slot(tsk, cpu, enable);
173 return;
174 }
175
176 for_each_online_cpu(cpu)
177 toggle_bp_task_slot(tsk, cpu, enable);
178 return;
179 }
180
181 /* Pinned counter cpu profiling */
182 if (enable)
183 per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
184 else
185 per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
186}
187
188/*
189 * Contraints to check before allowing this new breakpoint counter:
190 *
191 * == Non-pinned counter == (Considered as pinned for now)
192 *
193 * - If attached to a single cpu, check:
194 *
195 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
196 * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM
197 *
198 * -> If there are already non-pinned counters in this cpu, it means
199 * there is already a free slot for them.
200 * Otherwise, we check that the maximum number of per task
201 * breakpoints (for this cpu) plus the number of per cpu breakpoint
202 * (for this cpu) doesn't cover every registers.
203 *
204 * - If attached to every cpus, check:
205 *
206 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
207 * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM
208 *
209 * -> This is roughly the same, except we check the number of per cpu
210 * bp for every cpu and we keep the max one. Same for the per tasks
211 * breakpoints.
212 *
213 *
214 * == Pinned counter ==
215 *
216 * - If attached to a single cpu, check:
217 *
218 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
219 * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM
220 *
221 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
222 * one register at least (or they will never be fed).
223 *
224 * - If attached to every cpus, check:
225 *
226 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
227 * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM
228 */
229int reserve_bp_slot(struct perf_event *bp)
230{
231 struct bp_busy_slots slots = {0};
232 int ret = 0;
233
234 mutex_lock(&nr_bp_mutex);
235
236 fetch_bp_busy_slots(&slots, bp->cpu);
237
238 /* Flexible counters need to keep at least one slot */
239 if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
240 ret = -ENOSPC;
241 goto end;
242 }
243
244 toggle_bp_slot(bp, true);
245
246end:
247 mutex_unlock(&nr_bp_mutex);
248
249 return ret;
250}
251
252void release_bp_slot(struct perf_event *bp)
253{
254 mutex_lock(&nr_bp_mutex);
255
256 toggle_bp_slot(bp, false);
257
258 mutex_unlock(&nr_bp_mutex);
259}
260
261
262int __register_perf_hw_breakpoint(struct perf_event *bp)
263{
264 int ret;
265
266 ret = reserve_bp_slot(bp);
267 if (ret)
268 return ret;
269
270 /*
271 * Ptrace breakpoints can be temporary perf events only
272 * meant to reserve a slot. In this case, it is created disabled and
273 * we don't want to check the params right now (as we put a null addr)
274 * But perf tools create events as disabled and we want to check
275 * the params for them.
276 * This is a quick hack that will be removed soon, once we remove
277 * the tmp breakpoints from ptrace
278 */
279 if (!bp->attr.disabled || bp->callback == perf_bp_event)
280 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
281
282 return ret;
283}
284
285int register_perf_hw_breakpoint(struct perf_event *bp)
286{
287 bp->callback = perf_bp_event;
288
289 return __register_perf_hw_breakpoint(bp);
290}
291
292/**
293 * register_user_hw_breakpoint - register a hardware breakpoint for user space
294 * @attr: breakpoint attributes
295 * @triggered: callback to trigger when we hit the breakpoint
296 * @tsk: pointer to 'task_struct' of the process to which the address belongs
297 */
298struct perf_event *
299register_user_hw_breakpoint(struct perf_event_attr *attr,
300 perf_callback_t triggered,
301 struct task_struct *tsk)
302{
303 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
304}
305EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
306
307/**
308 * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
309 * @bp: the breakpoint structure to modify
310 * @attr: new breakpoint attributes
311 * @triggered: callback to trigger when we hit the breakpoint
312 * @tsk: pointer to 'task_struct' of the process to which the address belongs
313 */
314struct perf_event *
315modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr,
316 perf_callback_t triggered,
317 struct task_struct *tsk)
318{
319 /*
320 * FIXME: do it without unregistering
321 * - We don't want to lose our slot
322 * - If the new bp is incorrect, don't lose the older one
323 */
324 unregister_hw_breakpoint(bp);
325
326 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
327}
328EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
329
330/**
331 * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
332 * @bp: the breakpoint structure to unregister
333 */
334void unregister_hw_breakpoint(struct perf_event *bp)
335{
336 if (!bp)
337 return;
338 perf_event_release_kernel(bp);
339}
340EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
341
342/**
343 * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
344 * @attr: breakpoint attributes
345 * @triggered: callback to trigger when we hit the breakpoint
346 *
347 * @return a set of per_cpu pointers to perf events
348 */
349struct perf_event **
350register_wide_hw_breakpoint(struct perf_event_attr *attr,
351 perf_callback_t triggered)
352{
353 struct perf_event **cpu_events, **pevent, *bp;
354 long err;
355 int cpu;
356
357 cpu_events = alloc_percpu(typeof(*cpu_events));
358 if (!cpu_events)
359 return ERR_PTR(-ENOMEM);
360
361 for_each_possible_cpu(cpu) {
362 pevent = per_cpu_ptr(cpu_events, cpu);
363 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
364
365 *pevent = bp;
366
367 if (IS_ERR(bp)) {
368 err = PTR_ERR(bp);
369 goto fail;
370 }
371 }
372
373 return cpu_events;
374
375fail:
376 for_each_possible_cpu(cpu) {
377 pevent = per_cpu_ptr(cpu_events, cpu);
378 if (IS_ERR(*pevent))
379 break;
380 unregister_hw_breakpoint(*pevent);
381 }
382 free_percpu(cpu_events);
383 /* return the error if any */
384 return ERR_PTR(err);
385}
386EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
387
388/**
389 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
390 * @cpu_events: the per cpu set of events to unregister
391 */
392void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
393{
394 int cpu;
395 struct perf_event **pevent;
396
397 for_each_possible_cpu(cpu) {
398 pevent = per_cpu_ptr(cpu_events, cpu);
399 unregister_hw_breakpoint(*pevent);
400 }
401 free_percpu(cpu_events);
402}
403EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
404
405static struct notifier_block hw_breakpoint_exceptions_nb = {
406 .notifier_call = hw_breakpoint_exceptions_notify,
407 /* we need to be notified first */
408 .priority = 0x7fffffff
409};
410
411static int __init init_hw_breakpoint(void)
412{
413 return register_die_notifier(&hw_breakpoint_exceptions_nb);
414}
415core_initcall(init_hw_breakpoint);
416
417
418struct pmu perf_ops_bp = {
419 .enable = arch_install_hw_breakpoint,
420 .disable = arch_uninstall_hw_breakpoint,
421 .read = hw_breakpoint_pmu_read,
422 .unthrottle = hw_breakpoint_pmu_unthrottle
423};
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c1660194d115..ba566c261adc 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -166,11 +166,11 @@ int set_irq_data(unsigned int irq, void *data)
166EXPORT_SYMBOL(set_irq_data); 166EXPORT_SYMBOL(set_irq_data);
167 167
168/** 168/**
169 * set_irq_data - set irq type data for an irq 169 * set_irq_msi - set MSI descriptor data for an irq
170 * @irq: Interrupt number 170 * @irq: Interrupt number
171 * @entry: Pointer to MSI descriptor data 171 * @entry: Pointer to MSI descriptor data
172 * 172 *
173 * Set the hardware irq controller data for an irq 173 * Set the MSI descriptor entry for an irq
174 */ 174 */
175int set_irq_msi(unsigned int irq, struct msi_desc *entry) 175int set_irq_msi(unsigned int irq, struct msi_desc *entry)
176{ 176{
@@ -590,7 +590,7 @@ out_unlock:
590} 590}
591 591
592/** 592/**
593 * handle_percpu_IRQ - Per CPU local irq handler 593 * handle_percpu_irq - Per CPU local irq handler
594 * @irq: the interrupt number 594 * @irq: the interrupt number
595 * @desc: the interrupt description structure for this irq 595 * @desc: the interrupt description structure for this irq
596 * 596 *
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a81cf80554db..17c71bb565c6 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/sched.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/random.h> 17#include <linux/random.h>
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 692363dd591f..0832145fea97 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -136,7 +136,7 @@ out:
136 136
137static int default_affinity_open(struct inode *inode, struct file *file) 137static int default_affinity_open(struct inode *inode, struct file *file)
138{ 138{
139 return single_open(file, default_affinity_show, NULL); 139 return single_open(file, default_affinity_show, PDE(inode)->data);
140} 140}
141 141
142static const struct file_operations default_affinity_proc_fops = { 142static const struct file_operations default_affinity_proc_fops = {
@@ -148,18 +148,28 @@ static const struct file_operations default_affinity_proc_fops = {
148}; 148};
149#endif 149#endif
150 150
151static int irq_spurious_read(char *page, char **start, off_t off, 151static int irq_spurious_proc_show(struct seq_file *m, void *v)
152 int count, int *eof, void *data)
153{ 152{
154 struct irq_desc *desc = irq_to_desc((long) data); 153 struct irq_desc *desc = irq_to_desc((long) m->private);
155 return sprintf(page, "count %u\n" 154
156 "unhandled %u\n" 155 seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n",
157 "last_unhandled %u ms\n", 156 desc->irq_count, desc->irqs_unhandled,
158 desc->irq_count, 157 jiffies_to_msecs(desc->last_unhandled));
159 desc->irqs_unhandled, 158 return 0;
160 jiffies_to_msecs(desc->last_unhandled)); 159}
160
161static int irq_spurious_proc_open(struct inode *inode, struct file *file)
162{
163 return single_open(file, irq_spurious_proc_show, NULL);
161} 164}
162 165
166static const struct file_operations irq_spurious_proc_fops = {
167 .open = irq_spurious_proc_open,
168 .read = seq_read,
169 .llseek = seq_lseek,
170 .release = single_release,
171};
172
163#define MAX_NAMELEN 128 173#define MAX_NAMELEN 128
164 174
165static int name_unique(unsigned int irq, struct irqaction *new_action) 175static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -204,7 +214,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
204void register_irq_proc(unsigned int irq, struct irq_desc *desc) 214void register_irq_proc(unsigned int irq, struct irq_desc *desc)
205{ 215{
206 char name [MAX_NAMELEN]; 216 char name [MAX_NAMELEN];
207 struct proc_dir_entry *entry;
208 217
209 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) 218 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
210 return; 219 return;
@@ -214,6 +223,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
214 223
215 /* create /proc/irq/1234 */ 224 /* create /proc/irq/1234 */
216 desc->dir = proc_mkdir(name, root_irq_dir); 225 desc->dir = proc_mkdir(name, root_irq_dir);
226 if (!desc->dir)
227 return;
217 228
218#ifdef CONFIG_SMP 229#ifdef CONFIG_SMP
219 /* create /proc/irq/<irq>/smp_affinity */ 230 /* create /proc/irq/<irq>/smp_affinity */
@@ -221,11 +232,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
221 &irq_affinity_proc_fops, (void *)(long)irq); 232 &irq_affinity_proc_fops, (void *)(long)irq);
222#endif 233#endif
223 234
224 entry = create_proc_entry("spurious", 0444, desc->dir); 235 proc_create_data("spurious", 0444, desc->dir,
225 if (entry) { 236 &irq_spurious_proc_fops, (void *)(long)irq);
226 entry->data = (void *)(long)irq;
227 entry->read_proc = irq_spurious_read;
228 }
229} 237}
230 238
231#undef MAX_NAMELEN 239#undef MAX_NAMELEN
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 114e704760fe..22b0a6eedf24 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
104 return ok; 104 return ok;
105} 105}
106 106
107static void poll_all_shared_irqs(void) 107static void poll_spurious_irqs(unsigned long dummy)
108{ 108{
109 struct irq_desc *desc; 109 struct irq_desc *desc;
110 int i; 110 int i;
@@ -121,25 +121,15 @@ static void poll_all_shared_irqs(void)
121 if (!(status & IRQ_SPURIOUS_DISABLED)) 121 if (!(status & IRQ_SPURIOUS_DISABLED))
122 continue; 122 continue;
123 123
124 local_irq_disable();
124 try_one_irq(i, desc); 125 try_one_irq(i, desc);
126 local_irq_enable();
125 } 127 }
126}
127
128static void poll_spurious_irqs(unsigned long dummy)
129{
130 poll_all_shared_irqs();
131 128
132 mod_timer(&poll_spurious_irq_timer, 129 mod_timer(&poll_spurious_irq_timer,
133 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 130 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
134} 131}
135 132
136#ifdef CONFIG_DEBUG_SHIRQ
137void debug_poll_all_shared_irqs(void)
138{
139 poll_all_shared_irqs();
140}
141#endif
142
143/* 133/*
144 * If 99,900 of the previous 100,000 interrupts have not been handled 134 * If 99,900 of the previous 100,000 interrupts have not been handled
145 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 135 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8b6b8b697c68..8e5288a8a355 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name)
181 } 181 }
182 return module_kallsyms_lookup_name(name); 182 return module_kallsyms_lookup_name(name);
183} 183}
184EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
184 185
185int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, 186int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
186 unsigned long), 187 unsigned long),
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 9147a3190c9d..7d7014634022 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -870,7 +870,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
870 870
871 /* 871 /*
872 * All threads that don't have debuggerinfo should be 872 * All threads that don't have debuggerinfo should be
873 * in __schedule() sleeping, since all other CPUs 873 * in schedule() sleeping, since all other CPUs
874 * are in kgdb_wait, and thus have debuggerinfo. 874 * are in kgdb_wait, and thus have debuggerinfo.
875 */ 875 */
876 if (local_debuggerinfo) { 876 if (local_debuggerinfo) {
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 689d20f39305..25b103190364 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -80,16 +80,16 @@ int __request_module(bool wait, const char *fmt, ...)
80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
81 static int kmod_loop_msg; 81 static int kmod_loop_msg;
82 82
83 ret = security_kernel_module_request();
84 if (ret)
85 return ret;
86
87 va_start(args, fmt); 83 va_start(args, fmt);
88 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 84 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
89 va_end(args); 85 va_end(args);
90 if (ret >= MODULE_NAME_LEN) 86 if (ret >= MODULE_NAME_LEN)
91 return -ENAMETOOLONG; 87 return -ENAMETOOLONG;
92 88
89 ret = security_kernel_module_request(module_name);
90 if (ret)
91 return ret;
92
93 /* If modprobe needs a service that is in a module, we get a recursive 93 /* If modprobe needs a service that is in a module, we get a recursive
94 * loop. Limit the number of running kmod threads to max_threads/2 or 94 * loop. Limit the number of running kmod threads to max_threads/2 or
95 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method 95 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
@@ -143,7 +143,6 @@ struct subprocess_info {
143static int ____call_usermodehelper(void *data) 143static int ____call_usermodehelper(void *data)
144{ 144{
145 struct subprocess_info *sub_info = data; 145 struct subprocess_info *sub_info = data;
146 enum umh_wait wait = sub_info->wait;
147 int retval; 146 int retval;
148 147
149 BUG_ON(atomic_read(&sub_info->cred->usage) != 1); 148 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
@@ -185,14 +184,10 @@ static int ____call_usermodehelper(void *data)
185 */ 184 */
186 set_user_nice(current, 0); 185 set_user_nice(current, 0);
187 186
188 if (wait == UMH_WAIT_EXEC)
189 complete(sub_info->complete);
190
191 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); 187 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
192 188
193 /* Exec failed? */ 189 /* Exec failed? */
194 if (wait != UMH_WAIT_EXEC) 190 sub_info->retval = retval;
195 sub_info->retval = retval;
196 do_exit(0); 191 do_exit(0);
197} 192}
198 193
@@ -271,14 +266,16 @@ static void __call_usermodehelper(struct work_struct *work)
271 266
272 switch (wait) { 267 switch (wait) {
273 case UMH_NO_WAIT: 268 case UMH_NO_WAIT:
274 case UMH_WAIT_EXEC:
275 break; 269 break;
276 270
277 case UMH_WAIT_PROC: 271 case UMH_WAIT_PROC:
278 if (pid > 0) 272 if (pid > 0)
279 break; 273 break;
280 sub_info->retval = pid; 274 sub_info->retval = pid;
281 break; 275 /* FALLTHROUGH */
276
277 case UMH_WAIT_EXEC:
278 complete(sub_info->complete);
282 } 279 }
283} 280}
284 281
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index cfadc1291d0b..e5342a344c43 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
90 */ 90 */
91static struct kprobe_blackpoint kprobe_blacklist[] = { 91static struct kprobe_blackpoint kprobe_blacklist[] = {
92 {"preempt_schedule",}, 92 {"preempt_schedule",},
93 {"native_get_debugreg",},
94 {"irq_entries_start",},
95 {"common_interrupt",},
93 {NULL} /* Terminator */ 96 {NULL} /* Terminator */
94}; 97};
95 98
@@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
673 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 676 return (kprobe_opcode_t *)(((char *)addr) + p->offset);
674} 677}
675 678
679/* Check passed kprobe is valid and return kprobe in kprobe_table. */
680static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
681{
682 struct kprobe *old_p, *list_p;
683
684 old_p = get_kprobe(p->addr);
685 if (unlikely(!old_p))
686 return NULL;
687
688 if (p != old_p) {
689 list_for_each_entry_rcu(list_p, &old_p->list, list)
690 if (list_p == p)
691 /* kprobe p is a valid probe */
692 goto valid;
693 return NULL;
694 }
695valid:
696 return old_p;
697}
698
699/* Return error if the kprobe is being re-registered */
700static inline int check_kprobe_rereg(struct kprobe *p)
701{
702 int ret = 0;
703 struct kprobe *old_p;
704
705 mutex_lock(&kprobe_mutex);
706 old_p = __get_valid_kprobe(p);
707 if (old_p)
708 ret = -EINVAL;
709 mutex_unlock(&kprobe_mutex);
710 return ret;
711}
712
676int __kprobes register_kprobe(struct kprobe *p) 713int __kprobes register_kprobe(struct kprobe *p)
677{ 714{
678 int ret = 0; 715 int ret = 0;
@@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p)
685 return -EINVAL; 722 return -EINVAL;
686 p->addr = addr; 723 p->addr = addr;
687 724
725 ret = check_kprobe_rereg(p);
726 if (ret)
727 return ret;
728
688 preempt_disable(); 729 preempt_disable();
689 if (!kernel_text_address((unsigned long) p->addr) || 730 if (!kernel_text_address((unsigned long) p->addr) ||
690 in_kprobes_functions((unsigned long) p->addr)) { 731 in_kprobes_functions((unsigned long) p->addr)) {
@@ -754,26 +795,6 @@ out:
754} 795}
755EXPORT_SYMBOL_GPL(register_kprobe); 796EXPORT_SYMBOL_GPL(register_kprobe);
756 797
757/* Check passed kprobe is valid and return kprobe in kprobe_table. */
758static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
759{
760 struct kprobe *old_p, *list_p;
761
762 old_p = get_kprobe(p->addr);
763 if (unlikely(!old_p))
764 return NULL;
765
766 if (p != old_p) {
767 list_for_each_entry_rcu(list_p, &old_p->list, list)
768 if (list_p == p)
769 /* kprobe p is a valid probe */
770 goto valid;
771 return NULL;
772 }
773valid:
774 return old_p;
775}
776
777/* 798/*
778 * Unregister a kprobe without a scheduler synchronization. 799 * Unregister a kprobe without a scheduler synchronization.
779 */ 800 */
@@ -1014,9 +1035,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1014 /* Pre-allocate memory for max kretprobe instances */ 1035 /* Pre-allocate memory for max kretprobe instances */
1015 if (rp->maxactive <= 0) { 1036 if (rp->maxactive <= 0) {
1016#ifdef CONFIG_PREEMPT 1037#ifdef CONFIG_PREEMPT
1017 rp->maxactive = max(10, 2 * NR_CPUS); 1038 rp->maxactive = max(10, 2 * num_possible_cpus());
1018#else 1039#else
1019 rp->maxactive = NR_CPUS; 1040 rp->maxactive = num_possible_cpus();
1020#endif 1041#endif
1021 } 1042 }
1022 spin_lock_init(&rp->lock); 1043 spin_lock_init(&rp->lock);
@@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1141 arch_remove_kprobe(p); 1162 arch_remove_kprobe(p);
1142} 1163}
1143 1164
1165void __kprobes dump_kprobe(struct kprobe *kp)
1166{
1167 printk(KERN_WARNING "Dumping kprobe:\n");
1168 printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
1169 kp->symbol_name, kp->addr, kp->offset);
1170}
1171
1144/* Module notifier call back, checking kprobes on the module */ 1172/* Module notifier call back, checking kprobes on the module */
1145static int __kprobes kprobes_module_callback(struct notifier_block *nb, 1173static int __kprobes kprobes_module_callback(struct notifier_block *nb,
1146 unsigned long val, void *data) 1174 unsigned long val, void *data)
@@ -1333,7 +1361,7 @@ static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
1333 return seq_open(filp, &kprobes_seq_ops); 1361 return seq_open(filp, &kprobes_seq_ops);
1334} 1362}
1335 1363
1336static struct file_operations debugfs_kprobes_operations = { 1364static const struct file_operations debugfs_kprobes_operations = {
1337 .open = kprobes_open, 1365 .open = kprobes_open,
1338 .read = seq_read, 1366 .read = seq_read,
1339 .llseek = seq_lseek, 1367 .llseek = seq_lseek,
@@ -1515,7 +1543,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
1515 return count; 1543 return count;
1516} 1544}
1517 1545
1518static struct file_operations fops_kp = { 1546static const struct file_operations fops_kp = {
1519 .read = read_enabled_file_bool, 1547 .read = read_enabled_file_bool,
1520 .write = write_enabled_file_bool, 1548 .write = write_enabled_file_bool,
1521}; 1549};
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5fe709982caa..ab7ae57773e1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -150,29 +150,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
150EXPORT_SYMBOL(kthread_create); 150EXPORT_SYMBOL(kthread_create);
151 151
152/** 152/**
153 * kthread_bind - bind a just-created kthread to a cpu.
154 * @k: thread created by kthread_create().
155 * @cpu: cpu (might not be online, must be possible) for @k to run on.
156 *
157 * Description: This function is equivalent to set_cpus_allowed(),
158 * except that @cpu doesn't need to be online, and the thread must be
159 * stopped (i.e., just returned from kthread_create()).
160 */
161void kthread_bind(struct task_struct *k, unsigned int cpu)
162{
163 /* Must have done schedule() in kthread() before we set_task_cpu */
164 if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) {
165 WARN_ON(1);
166 return;
167 }
168 set_task_cpu(k, cpu);
169 k->cpus_allowed = cpumask_of_cpu(cpu);
170 k->rt.nr_cpus_allowed = 1;
171 k->flags |= PF_THREAD_BOUND;
172}
173EXPORT_SYMBOL(kthread_bind);
174
175/**
176 * kthread_stop - stop a thread created by kthread_create(). 153 * kthread_stop - stop a thread created by kthread_create().
177 * @k: thread created by kthread_create(). 154 * @k: thread created by kthread_create().
178 * 155 *
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3815ac1d58b2..f5dcd36d3151 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -49,7 +49,7 @@
49#include "lockdep_internals.h" 49#include "lockdep_internals.h"
50 50
51#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
52#include <trace/events/lockdep.h> 52#include <trace/events/lock.h>
53 53
54#ifdef CONFIG_PROVE_LOCKING 54#ifdef CONFIG_PROVE_LOCKING
55int prove_locking = 1; 55int prove_locking = 1;
@@ -142,6 +142,11 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
142#ifdef CONFIG_LOCK_STAT 142#ifdef CONFIG_LOCK_STAT
143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
144 144
145static inline u64 lockstat_clock(void)
146{
147 return cpu_clock(smp_processor_id());
148}
149
145static int lock_point(unsigned long points[], unsigned long ip) 150static int lock_point(unsigned long points[], unsigned long ip)
146{ 151{
147 int i; 152 int i;
@@ -158,7 +163,7 @@ static int lock_point(unsigned long points[], unsigned long ip)
158 return i; 163 return i;
159} 164}
160 165
161static void lock_time_inc(struct lock_time *lt, s64 time) 166static void lock_time_inc(struct lock_time *lt, u64 time)
162{ 167{
163 if (time > lt->max) 168 if (time > lt->max)
164 lt->max = time; 169 lt->max = time;
@@ -234,12 +239,12 @@ static void put_lock_stats(struct lock_class_stats *stats)
234static void lock_release_holdtime(struct held_lock *hlock) 239static void lock_release_holdtime(struct held_lock *hlock)
235{ 240{
236 struct lock_class_stats *stats; 241 struct lock_class_stats *stats;
237 s64 holdtime; 242 u64 holdtime;
238 243
239 if (!lock_stat) 244 if (!lock_stat)
240 return; 245 return;
241 246
242 holdtime = sched_clock() - hlock->holdtime_stamp; 247 holdtime = lockstat_clock() - hlock->holdtime_stamp;
243 248
244 stats = get_lock_stats(hlock_class(hlock)); 249 stats = get_lock_stats(hlock_class(hlock));
245 if (hlock->read) 250 if (hlock->read)
@@ -2792,7 +2797,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2792 hlock->references = references; 2797 hlock->references = references;
2793#ifdef CONFIG_LOCK_STAT 2798#ifdef CONFIG_LOCK_STAT
2794 hlock->waittime_stamp = 0; 2799 hlock->waittime_stamp = 0;
2795 hlock->holdtime_stamp = sched_clock(); 2800 hlock->holdtime_stamp = lockstat_clock();
2796#endif 2801#endif
2797 2802
2798 if (check == 2 && !mark_irqflags(curr, hlock)) 2803 if (check == 2 && !mark_irqflags(curr, hlock))
@@ -3322,7 +3327,7 @@ found_it:
3322 if (hlock->instance != lock) 3327 if (hlock->instance != lock)
3323 return; 3328 return;
3324 3329
3325 hlock->waittime_stamp = sched_clock(); 3330 hlock->waittime_stamp = lockstat_clock();
3326 3331
3327 contention_point = lock_point(hlock_class(hlock)->contention_point, ip); 3332 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
3328 contending_point = lock_point(hlock_class(hlock)->contending_point, 3333 contending_point = lock_point(hlock_class(hlock)->contending_point,
@@ -3345,8 +3350,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3345 struct held_lock *hlock, *prev_hlock; 3350 struct held_lock *hlock, *prev_hlock;
3346 struct lock_class_stats *stats; 3351 struct lock_class_stats *stats;
3347 unsigned int depth; 3352 unsigned int depth;
3348 u64 now; 3353 u64 now, waittime = 0;
3349 s64 waittime = 0;
3350 int i, cpu; 3354 int i, cpu;
3351 3355
3352 depth = curr->lockdep_depth; 3356 depth = curr->lockdep_depth;
@@ -3374,7 +3378,7 @@ found_it:
3374 3378
3375 cpu = smp_processor_id(); 3379 cpu = smp_processor_id();
3376 if (hlock->waittime_stamp) { 3380 if (hlock->waittime_stamp) {
3377 now = sched_clock(); 3381 now = lockstat_clock();
3378 waittime = now - hlock->waittime_stamp; 3382 waittime = now - hlock->waittime_stamp;
3379 hlock->holdtime_stamp = now; 3383 hlock->holdtime_stamp = now;
3380 } 3384 }
diff --git a/kernel/module.c b/kernel/module.c
index e6bc4b28aa62..5842a71cf052 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1187,7 +1187,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1187 1187
1188 /* Count loaded sections and allocate structures */ 1188 /* Count loaded sections and allocate structures */
1189 for (i = 0; i < nsect; i++) 1189 for (i = 0; i < nsect; i++)
1190 if (sechdrs[i].sh_flags & SHF_ALLOC) 1190 if (sechdrs[i].sh_flags & SHF_ALLOC
1191 && sechdrs[i].sh_size)
1191 nloaded++; 1192 nloaded++;
1192 size[0] = ALIGN(sizeof(*sect_attrs) 1193 size[0] = ALIGN(sizeof(*sect_attrs)
1193 + nloaded * sizeof(sect_attrs->attrs[0]), 1194 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1207,6 +1208,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1207 for (i = 0; i < nsect; i++) { 1208 for (i = 0; i < nsect; i++) {
1208 if (! (sechdrs[i].sh_flags & SHF_ALLOC)) 1209 if (! (sechdrs[i].sh_flags & SHF_ALLOC))
1209 continue; 1210 continue;
1211 if (!sechdrs[i].sh_size)
1212 continue;
1210 sattr->address = sechdrs[i].sh_addr; 1213 sattr->address = sechdrs[i].sh_addr;
1211 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1214 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
1212 GFP_KERNEL); 1215 GFP_KERNEL);
@@ -1797,6 +1800,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1797 } 1800 }
1798} 1801}
1799 1802
1803static void free_modinfo(struct module *mod)
1804{
1805 struct module_attribute *attr;
1806 int i;
1807
1808 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1809 if (attr->free)
1810 attr->free(mod);
1811 }
1812}
1813
1800#ifdef CONFIG_KALLSYMS 1814#ifdef CONFIG_KALLSYMS
1801 1815
1802/* lookup symbol in given range of kernel_symbols */ 1816/* lookup symbol in given range of kernel_symbols */
@@ -1862,13 +1876,93 @@ static char elf_type(const Elf_Sym *sym,
1862 return '?'; 1876 return '?';
1863} 1877}
1864 1878
1879static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
1880 unsigned int shnum)
1881{
1882 const Elf_Shdr *sec;
1883
1884 if (src->st_shndx == SHN_UNDEF
1885 || src->st_shndx >= shnum
1886 || !src->st_name)
1887 return false;
1888
1889 sec = sechdrs + src->st_shndx;
1890 if (!(sec->sh_flags & SHF_ALLOC)
1891#ifndef CONFIG_KALLSYMS_ALL
1892 || !(sec->sh_flags & SHF_EXECINSTR)
1893#endif
1894 || (sec->sh_entsize & INIT_OFFSET_MASK))
1895 return false;
1896
1897 return true;
1898}
1899
1900static unsigned long layout_symtab(struct module *mod,
1901 Elf_Shdr *sechdrs,
1902 unsigned int symindex,
1903 unsigned int strindex,
1904 const Elf_Ehdr *hdr,
1905 const char *secstrings,
1906 unsigned long *pstroffs,
1907 unsigned long *strmap)
1908{
1909 unsigned long symoffs;
1910 Elf_Shdr *symsect = sechdrs + symindex;
1911 Elf_Shdr *strsect = sechdrs + strindex;
1912 const Elf_Sym *src;
1913 const char *strtab;
1914 unsigned int i, nsrc, ndst;
1915
1916 /* Put symbol section at end of init part of module. */
1917 symsect->sh_flags |= SHF_ALLOC;
1918 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
1919 symindex) | INIT_OFFSET_MASK;
1920 DEBUGP("\t%s\n", secstrings + symsect->sh_name);
1921
1922 src = (void *)hdr + symsect->sh_offset;
1923 nsrc = symsect->sh_size / sizeof(*src);
1924 strtab = (void *)hdr + strsect->sh_offset;
1925 for (ndst = i = 1; i < nsrc; ++i, ++src)
1926 if (is_core_symbol(src, sechdrs, hdr->e_shnum)) {
1927 unsigned int j = src->st_name;
1928
1929 while(!__test_and_set_bit(j, strmap) && strtab[j])
1930 ++j;
1931 ++ndst;
1932 }
1933
1934 /* Append room for core symbols at end of core part. */
1935 symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
1936 mod->core_size = symoffs + ndst * sizeof(Elf_Sym);
1937
1938 /* Put string table section at end of init part of module. */
1939 strsect->sh_flags |= SHF_ALLOC;
1940 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
1941 strindex) | INIT_OFFSET_MASK;
1942 DEBUGP("\t%s\n", secstrings + strsect->sh_name);
1943
1944 /* Append room for core symbols' strings at end of core part. */
1945 *pstroffs = mod->core_size;
1946 __set_bit(0, strmap);
1947 mod->core_size += bitmap_weight(strmap, strsect->sh_size);
1948
1949 return symoffs;
1950}
1951
1865static void add_kallsyms(struct module *mod, 1952static void add_kallsyms(struct module *mod,
1866 Elf_Shdr *sechdrs, 1953 Elf_Shdr *sechdrs,
1954 unsigned int shnum,
1867 unsigned int symindex, 1955 unsigned int symindex,
1868 unsigned int strindex, 1956 unsigned int strindex,
1869 const char *secstrings) 1957 unsigned long symoffs,
1958 unsigned long stroffs,
1959 const char *secstrings,
1960 unsigned long *strmap)
1870{ 1961{
1871 unsigned int i; 1962 unsigned int i, ndst;
1963 const Elf_Sym *src;
1964 Elf_Sym *dst;
1965 char *s;
1872 1966
1873 mod->symtab = (void *)sechdrs[symindex].sh_addr; 1967 mod->symtab = (void *)sechdrs[symindex].sh_addr;
1874 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1968 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
@@ -1878,13 +1972,46 @@ static void add_kallsyms(struct module *mod,
1878 for (i = 0; i < mod->num_symtab; i++) 1972 for (i = 0; i < mod->num_symtab; i++)
1879 mod->symtab[i].st_info 1973 mod->symtab[i].st_info
1880 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); 1974 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
1975
1976 mod->core_symtab = dst = mod->module_core + symoffs;
1977 src = mod->symtab;
1978 *dst = *src;
1979 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
1980 if (!is_core_symbol(src, sechdrs, shnum))
1981 continue;
1982 dst[ndst] = *src;
1983 dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name);
1984 ++ndst;
1985 }
1986 mod->core_num_syms = ndst;
1987
1988 mod->core_strtab = s = mod->module_core + stroffs;
1989 for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i)
1990 if (test_bit(i, strmap))
1991 *++s = mod->strtab[i];
1881} 1992}
1882#else 1993#else
1994static inline unsigned long layout_symtab(struct module *mod,
1995 Elf_Shdr *sechdrs,
1996 unsigned int symindex,
1997 unsigned int strindex,
1998 const Elf_Ehdr *hdr,
1999 const char *secstrings,
2000 unsigned long *pstroffs,
2001 unsigned long *strmap)
2002{
2003 return 0;
2004}
2005
1883static inline void add_kallsyms(struct module *mod, 2006static inline void add_kallsyms(struct module *mod,
1884 Elf_Shdr *sechdrs, 2007 Elf_Shdr *sechdrs,
2008 unsigned int shnum,
1885 unsigned int symindex, 2009 unsigned int symindex,
1886 unsigned int strindex, 2010 unsigned int strindex,
1887 const char *secstrings) 2011 unsigned long symoffs,
2012 unsigned long stroffs,
2013 const char *secstrings,
2014 const unsigned long *strmap)
1888{ 2015{
1889} 2016}
1890#endif /* CONFIG_KALLSYMS */ 2017#endif /* CONFIG_KALLSYMS */
@@ -1959,6 +2086,8 @@ static noinline struct module *load_module(void __user *umod,
1959 struct module *mod; 2086 struct module *mod;
1960 long err = 0; 2087 long err = 0;
1961 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 2088 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
2089 unsigned long symoffs, stroffs, *strmap;
2090
1962 mm_segment_t old_fs; 2091 mm_segment_t old_fs;
1963 2092
1964 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 2093 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2040,11 +2169,6 @@ static noinline struct module *load_module(void __user *umod,
2040 /* Don't keep modinfo and version sections. */ 2169 /* Don't keep modinfo and version sections. */
2041 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2170 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2042 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2171 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2043#ifdef CONFIG_KALLSYMS
2044 /* Keep symbol and string tables for decoding later. */
2045 sechdrs[symindex].sh_flags |= SHF_ALLOC;
2046 sechdrs[strindex].sh_flags |= SHF_ALLOC;
2047#endif
2048 2172
2049 /* Check module struct version now, before we try to use module. */ 2173 /* Check module struct version now, before we try to use module. */
2050 if (!check_modstruct_version(sechdrs, versindex, mod)) { 2174 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -2080,6 +2204,13 @@ static noinline struct module *load_module(void __user *umod,
2080 goto free_hdr; 2204 goto free_hdr;
2081 } 2205 }
2082 2206
2207 strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size)
2208 * sizeof(long), GFP_KERNEL);
2209 if (!strmap) {
2210 err = -ENOMEM;
2211 goto free_mod;
2212 }
2213
2083 if (find_module(mod->name)) { 2214 if (find_module(mod->name)) {
2084 err = -EEXIST; 2215 err = -EEXIST;
2085 goto free_mod; 2216 goto free_mod;
@@ -2109,6 +2240,8 @@ static noinline struct module *load_module(void __user *umod,
2109 this is done generically; there doesn't appear to be any 2240 this is done generically; there doesn't appear to be any
2110 special cases for the architectures. */ 2241 special cases for the architectures. */
2111 layout_sections(mod, hdr, sechdrs, secstrings); 2242 layout_sections(mod, hdr, sechdrs, secstrings);
2243 symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr,
2244 secstrings, &stroffs, strmap);
2112 2245
2113 /* Do the allocs. */ 2246 /* Do the allocs. */
2114 ptr = module_alloc_update_bounds(mod->core_size); 2247 ptr = module_alloc_update_bounds(mod->core_size);
@@ -2313,7 +2446,10 @@ static noinline struct module *load_module(void __user *umod,
2313 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2446 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
2314 sechdrs[pcpuindex].sh_size); 2447 sechdrs[pcpuindex].sh_size);
2315 2448
2316 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 2449 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
2450 symoffs, stroffs, secstrings, strmap);
2451 kfree(strmap);
2452 strmap = NULL;
2317 2453
2318 if (!mod->taints) { 2454 if (!mod->taints) {
2319 struct _ddebug *debug; 2455 struct _ddebug *debug;
@@ -2385,13 +2521,14 @@ static noinline struct module *load_module(void __user *umod,
2385 synchronize_sched(); 2521 synchronize_sched();
2386 module_arch_cleanup(mod); 2522 module_arch_cleanup(mod);
2387 cleanup: 2523 cleanup:
2524 free_modinfo(mod);
2388 kobject_del(&mod->mkobj.kobj); 2525 kobject_del(&mod->mkobj.kobj);
2389 kobject_put(&mod->mkobj.kobj); 2526 kobject_put(&mod->mkobj.kobj);
2390 free_unload: 2527 free_unload:
2391 module_unload_free(mod); 2528 module_unload_free(mod);
2392#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2529#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2393 free_init:
2394 percpu_modfree(mod->refptr); 2530 percpu_modfree(mod->refptr);
2531 free_init:
2395#endif 2532#endif
2396 module_free(mod, mod->module_init); 2533 module_free(mod, mod->module_init);
2397 free_core: 2534 free_core:
@@ -2402,6 +2539,7 @@ static noinline struct module *load_module(void __user *umod,
2402 percpu_modfree(percpu); 2539 percpu_modfree(percpu);
2403 free_mod: 2540 free_mod:
2404 kfree(args); 2541 kfree(args);
2542 kfree(strmap);
2405 free_hdr: 2543 free_hdr:
2406 vfree(hdr); 2544 vfree(hdr);
2407 return ERR_PTR(err); 2545 return ERR_PTR(err);
@@ -2491,6 +2629,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2491 /* Drop initial reference. */ 2629 /* Drop initial reference. */
2492 module_put(mod); 2630 module_put(mod);
2493 trim_init_extable(mod); 2631 trim_init_extable(mod);
2632#ifdef CONFIG_KALLSYMS
2633 mod->num_symtab = mod->core_num_syms;
2634 mod->symtab = mod->core_symtab;
2635 mod->strtab = mod->core_strtab;
2636#endif
2494 module_free(mod, mod->module_init); 2637 module_free(mod, mod->module_init);
2495 mod->module_init = NULL; 2638 mod->module_init = NULL;
2496 mod->init_size = 0; 2639 mod->init_size = 0;
@@ -2952,7 +3095,6 @@ void module_layout(struct module *mod,
2952 struct modversion_info *ver, 3095 struct modversion_info *ver,
2953 struct kernel_param *kp, 3096 struct kernel_param *kp,
2954 struct kernel_symbol *ks, 3097 struct kernel_symbol *ks,
2955 struct marker *marker,
2956 struct tracepoint *tp) 3098 struct tracepoint *tp)
2957{ 3099{
2958} 3100}
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 50d022e5a560..ec815a960b5d 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,6 +16,7 @@
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/poison.h> 18#include <linux/poison.h>
19#include <linux/sched.h>
19#include <linux/spinlock.h> 20#include <linux/spinlock.h>
20#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
21#include <linux/interrupt.h> 22#include <linux/interrupt.h>
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 947b3ad551f8..632f04c57d82 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -148,8 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
148 148
149 preempt_disable(); 149 preempt_disable();
150 mutex_acquire(&lock->dep_map, subclass, 0, ip); 150 mutex_acquire(&lock->dep_map, subclass, 0, ip);
151#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \ 151
152 !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES) 152#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
153 /* 153 /*
154 * Optimistic spinning. 154 * Optimistic spinning.
155 * 155 *
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 61d5aa5eced3..acd24e7643eb 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
558 558
559static ATOMIC_NOTIFIER_HEAD(die_chain); 559static ATOMIC_NOTIFIER_HEAD(die_chain);
560 560
561int notrace notify_die(enum die_val val, const char *str, 561int notrace __kprobes notify_die(enum die_val val, const char *str,
562 struct pt_regs *regs, long err, int trap, int sig) 562 struct pt_regs *regs, long err, int trap, int sig)
563{ 563{
564 struct die_args args = { 564 struct die_args args = {
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 5aa854f9e5ae..2a5dfec8efe0 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
42 * (hence either you are in the same cgroup as task, or in an 42 * (hence either you are in the same cgroup as task, or in an
43 * ancestor cgroup thereof) 43 * ancestor cgroup thereof)
44 */ 44 */
45static int ns_can_attach(struct cgroup_subsys *ss, 45static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
46 struct cgroup *new_cgroup, struct task_struct *task) 46 struct task_struct *task, bool threadgroup)
47{ 47{
48 if (current != task) { 48 if (current != task) {
49 if (!capable(CAP_SYS_ADMIN)) 49 if (!capable(CAP_SYS_ADMIN))
@@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss,
56 if (!cgroup_is_descendant(new_cgroup, task)) 56 if (!cgroup_is_descendant(new_cgroup, task))
57 return -EPERM; 57 return -EPERM;
58 58
59 if (threadgroup) {
60 struct task_struct *c;
61 rcu_read_lock();
62 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
63 if (!cgroup_is_descendant(new_cgroup, c)) {
64 rcu_read_unlock();
65 return -EPERM;
66 }
67 }
68 rcu_read_unlock();
69 }
70
59 return 0; 71 return 0;
60} 72}
61 73
diff --git a/kernel/panic.c b/kernel/panic.c
index bcdef26e3332..96b45d0b4ba5 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -90,6 +90,8 @@ NORET_TYPE void panic(const char * fmt, ...)
90 90
91 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 91 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
92 92
93 bust_spinlocks(0);
94
93 if (!panic_blink) 95 if (!panic_blink)
94 panic_blink = no_blink; 96 panic_blink = no_blink;
95 97
@@ -136,7 +138,6 @@ NORET_TYPE void panic(const char * fmt, ...)
136 mdelay(1); 138 mdelay(1);
137 i++; 139 i++;
138 } 140 }
139 bust_spinlocks(0);
140} 141}
141 142
142EXPORT_SYMBOL(panic); 143EXPORT_SYMBOL(panic);
diff --git a/kernel/params.c b/kernel/params.c
index 7f6912ced2ba..d656c276508d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -23,6 +23,7 @@
23#include <linux/device.h> 23#include <linux/device.h>
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h>
26 27
27#if 0 28#if 0
28#define DEBUGP printk 29#define DEBUGP printk
@@ -87,7 +88,7 @@ static char *next_arg(char *args, char **param, char **val)
87 } 88 }
88 89
89 for (i = 0; args[i]; i++) { 90 for (i = 0; args[i]; i++) {
90 if (args[i] == ' ' && !in_quote) 91 if (isspace(args[i]) && !in_quote)
91 break; 92 break;
92 if (equals == 0) { 93 if (equals == 0) {
93 if (args[i] == '=') 94 if (args[i] == '=')
@@ -121,7 +122,7 @@ static char *next_arg(char *args, char **param, char **val)
121 next = args + i; 122 next = args + i;
122 123
123 /* Chew up trailing spaces. */ 124 /* Chew up trailing spaces. */
124 while (*next == ' ') 125 while (isspace(*next))
125 next++; 126 next++;
126 return next; 127 return next;
127} 128}
@@ -138,7 +139,7 @@ int parse_args(const char *name,
138 DEBUGP("Parsing ARGS: %s\n", args); 139 DEBUGP("Parsing ARGS: %s\n", args);
139 140
140 /* Chew leading spaces */ 141 /* Chew leading spaces */
141 while (*args == ' ') 142 while (isspace(*args))
142 args++; 143 args++;
143 144
144 while (*args) { 145 while (*args) {
@@ -217,15 +218,11 @@ int param_set_charp(const char *val, struct kernel_param *kp)
217 return -ENOSPC; 218 return -ENOSPC;
218 } 219 }
219 220
220 if (kp->flags & KPARAM_KMALLOCED)
221 kfree(*(char **)kp->arg);
222
223 /* This is a hack. We can't need to strdup in early boot, and we 221 /* This is a hack. We can't need to strdup in early boot, and we
224 * don't need to; this mangled commandline is preserved. */ 222 * don't need to; this mangled commandline is preserved. */
225 if (slab_is_available()) { 223 if (slab_is_available()) {
226 kp->flags |= KPARAM_KMALLOCED;
227 *(char **)kp->arg = kstrdup(val, GFP_KERNEL); 224 *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
228 if (!kp->arg) 225 if (!*(char **)kp->arg)
229 return -ENOMEM; 226 return -ENOMEM;
230 } else 227 } else
231 *(const char **)kp->arg = val; 228 *(const char **)kp->arg = val;
@@ -303,6 +300,7 @@ static int param_array(const char *name,
303 unsigned int min, unsigned int max, 300 unsigned int min, unsigned int max,
304 void *elem, int elemsize, 301 void *elem, int elemsize,
305 int (*set)(const char *, struct kernel_param *kp), 302 int (*set)(const char *, struct kernel_param *kp),
303 u16 flags,
306 unsigned int *num) 304 unsigned int *num)
307{ 305{
308 int ret; 306 int ret;
@@ -312,6 +310,7 @@ static int param_array(const char *name,
312 /* Get the name right for errors. */ 310 /* Get the name right for errors. */
313 kp.name = name; 311 kp.name = name;
314 kp.arg = elem; 312 kp.arg = elem;
313 kp.flags = flags;
315 314
316 /* No equals sign? */ 315 /* No equals sign? */
317 if (!val) { 316 if (!val) {
@@ -357,7 +356,8 @@ int param_array_set(const char *val, struct kernel_param *kp)
357 unsigned int temp_num; 356 unsigned int temp_num;
358 357
359 return param_array(kp->name, val, 1, arr->max, arr->elem, 358 return param_array(kp->name, val, 1, arr->max, arr->elem,
360 arr->elemsize, arr->set, arr->num ?: &temp_num); 359 arr->elemsize, arr->set, kp->flags,
360 arr->num ?: &temp_num);
361} 361}
362 362
363int param_array_get(char *buffer, struct kernel_param *kp) 363int param_array_get(char *buffer, struct kernel_param *kp)
@@ -604,11 +604,7 @@ void module_param_sysfs_remove(struct module *mod)
604 604
605void destroy_params(const struct kernel_param *params, unsigned num) 605void destroy_params(const struct kernel_param *params, unsigned num)
606{ 606{
607 unsigned int i; 607 /* FIXME: This should free kmalloced charp parameters. It doesn't. */
608
609 for (i = 0; i < num; i++)
610 if (params[i].flags & KPARAM_KMALLOCED)
611 kfree(*(char **)params[i].arg);
612} 608}
613 609
614static void __init kernel_add_sysfs_param(const char *name, 610static void __init kernel_add_sysfs_param(const char *name,
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 76ac4db405e9..6b7ddba1dd64 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -20,6 +20,7 @@
20#include <linux/percpu.h> 20#include <linux/percpu.h>
21#include <linux/ptrace.h> 21#include <linux/ptrace.h>
22#include <linux/vmstat.h> 22#include <linux/vmstat.h>
23#include <linux/vmalloc.h>
23#include <linux/hardirq.h> 24#include <linux/hardirq.h>
24#include <linux/rculist.h> 25#include <linux/rculist.h>
25#include <linux/uaccess.h> 26#include <linux/uaccess.h>
@@ -27,6 +28,8 @@
27#include <linux/anon_inodes.h> 28#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h> 29#include <linux/kernel_stat.h>
29#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/ftrace_event.h>
32#include <linux/hw_breakpoint.h>
30 33
31#include <asm/irq_regs.h> 34#include <asm/irq_regs.h>
32 35
@@ -243,6 +246,49 @@ static void perf_unpin_context(struct perf_event_context *ctx)
243 put_ctx(ctx); 246 put_ctx(ctx);
244} 247}
245 248
249static inline u64 perf_clock(void)
250{
251 return cpu_clock(smp_processor_id());
252}
253
254/*
255 * Update the record of the current time in a context.
256 */
257static void update_context_time(struct perf_event_context *ctx)
258{
259 u64 now = perf_clock();
260
261 ctx->time += now - ctx->timestamp;
262 ctx->timestamp = now;
263}
264
265/*
266 * Update the total_time_enabled and total_time_running fields for a event.
267 */
268static void update_event_times(struct perf_event *event)
269{
270 struct perf_event_context *ctx = event->ctx;
271 u64 run_end;
272
273 if (event->state < PERF_EVENT_STATE_INACTIVE ||
274 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
275 return;
276
277 if (ctx->is_active)
278 run_end = ctx->time;
279 else
280 run_end = event->tstamp_stopped;
281
282 event->total_time_enabled = run_end - event->tstamp_enabled;
283
284 if (event->state == PERF_EVENT_STATE_INACTIVE)
285 run_end = event->tstamp_stopped;
286 else
287 run_end = ctx->time;
288
289 event->total_time_running = run_end - event->tstamp_running;
290}
291
246/* 292/*
247 * Add a event from the lists for its context. 293 * Add a event from the lists for its context.
248 * Must be called with ctx->mutex and ctx->lock held. 294 * Must be called with ctx->mutex and ctx->lock held.
@@ -291,6 +337,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
291 if (event->group_leader != event) 337 if (event->group_leader != event)
292 event->group_leader->nr_siblings--; 338 event->group_leader->nr_siblings--;
293 339
340 update_event_times(event);
341
342 /*
343 * If event was in error state, then keep it
344 * that way, otherwise bogus counts will be
345 * returned on read(). The only way to get out
346 * of error state is by explicit re-enabling
347 * of the event
348 */
349 if (event->state > PERF_EVENT_STATE_OFF)
350 event->state = PERF_EVENT_STATE_OFF;
351
294 /* 352 /*
295 * If this was a group event with sibling events then 353 * If this was a group event with sibling events then
296 * upgrade the siblings to singleton events by adding them 354 * upgrade the siblings to singleton events by adding them
@@ -444,50 +502,11 @@ retry:
444 * can remove the event safely, if the call above did not 502 * can remove the event safely, if the call above did not
445 * succeed. 503 * succeed.
446 */ 504 */
447 if (!list_empty(&event->group_entry)) { 505 if (!list_empty(&event->group_entry))
448 list_del_event(event, ctx); 506 list_del_event(event, ctx);
449 }
450 spin_unlock_irq(&ctx->lock); 507 spin_unlock_irq(&ctx->lock);
451} 508}
452 509
453static inline u64 perf_clock(void)
454{
455 return cpu_clock(smp_processor_id());
456}
457
458/*
459 * Update the record of the current time in a context.
460 */
461static void update_context_time(struct perf_event_context *ctx)
462{
463 u64 now = perf_clock();
464
465 ctx->time += now - ctx->timestamp;
466 ctx->timestamp = now;
467}
468
469/*
470 * Update the total_time_enabled and total_time_running fields for a event.
471 */
472static void update_event_times(struct perf_event *event)
473{
474 struct perf_event_context *ctx = event->ctx;
475 u64 run_end;
476
477 if (event->state < PERF_EVENT_STATE_INACTIVE ||
478 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
479 return;
480
481 event->total_time_enabled = ctx->time - event->tstamp_enabled;
482
483 if (event->state == PERF_EVENT_STATE_INACTIVE)
484 run_end = event->tstamp_stopped;
485 else
486 run_end = ctx->time;
487
488 event->total_time_running = run_end - event->tstamp_running;
489}
490
491/* 510/*
492 * Update total_time_enabled and total_time_running for all events in a group. 511 * Update total_time_enabled and total_time_running for all events in a group.
493 */ 512 */
@@ -1031,12 +1050,8 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1031 1050
1032 perf_disable(); 1051 perf_disable();
1033 if (ctx->nr_active) { 1052 if (ctx->nr_active) {
1034 list_for_each_entry(event, &ctx->group_list, group_entry) { 1053 list_for_each_entry(event, &ctx->group_list, group_entry)
1035 if (event != event->group_leader) 1054 group_sched_out(event, cpuctx, ctx);
1036 event_sched_out(event, cpuctx, ctx);
1037 else
1038 group_sched_out(event, cpuctx, ctx);
1039 }
1040 } 1055 }
1041 perf_enable(); 1056 perf_enable();
1042 out: 1057 out:
@@ -1062,8 +1077,6 @@ static int context_equiv(struct perf_event_context *ctx1,
1062 && !ctx1->pin_count && !ctx2->pin_count; 1077 && !ctx1->pin_count && !ctx2->pin_count;
1063} 1078}
1064 1079
1065static void __perf_event_read(void *event);
1066
1067static void __perf_event_sync_stat(struct perf_event *event, 1080static void __perf_event_sync_stat(struct perf_event *event,
1068 struct perf_event *next_event) 1081 struct perf_event *next_event)
1069{ 1082{
@@ -1081,8 +1094,8 @@ static void __perf_event_sync_stat(struct perf_event *event,
1081 */ 1094 */
1082 switch (event->state) { 1095 switch (event->state) {
1083 case PERF_EVENT_STATE_ACTIVE: 1096 case PERF_EVENT_STATE_ACTIVE:
1084 __perf_event_read(event); 1097 event->pmu->read(event);
1085 break; 1098 /* fall-through */
1086 1099
1087 case PERF_EVENT_STATE_INACTIVE: 1100 case PERF_EVENT_STATE_INACTIVE:
1088 update_event_times(event); 1101 update_event_times(event);
@@ -1121,6 +1134,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1121 if (!ctx->nr_stat) 1134 if (!ctx->nr_stat)
1122 return; 1135 return;
1123 1136
1137 update_context_time(ctx);
1138
1124 event = list_first_entry(&ctx->event_list, 1139 event = list_first_entry(&ctx->event_list,
1125 struct perf_event, event_entry); 1140 struct perf_event, event_entry);
1126 1141
@@ -1164,8 +1179,6 @@ void perf_event_task_sched_out(struct task_struct *task,
1164 if (likely(!ctx || !cpuctx->task_ctx)) 1179 if (likely(!ctx || !cpuctx->task_ctx))
1165 return; 1180 return;
1166 1181
1167 update_context_time(ctx);
1168
1169 rcu_read_lock(); 1182 rcu_read_lock();
1170 parent = rcu_dereference(ctx->parent_ctx); 1183 parent = rcu_dereference(ctx->parent_ctx);
1171 next_ctx = next->perf_event_ctxp; 1184 next_ctx = next->perf_event_ctxp;
@@ -1258,12 +1271,8 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1258 if (event->cpu != -1 && event->cpu != cpu) 1271 if (event->cpu != -1 && event->cpu != cpu)
1259 continue; 1272 continue;
1260 1273
1261 if (event != event->group_leader) 1274 if (group_can_go_on(event, cpuctx, 1))
1262 event_sched_in(event, cpuctx, ctx, cpu); 1275 group_sched_in(event, cpuctx, ctx, cpu);
1263 else {
1264 if (group_can_go_on(event, cpuctx, 1))
1265 group_sched_in(event, cpuctx, ctx, cpu);
1266 }
1267 1276
1268 /* 1277 /*
1269 * If this pinned group hasn't been scheduled, 1278 * If this pinned group hasn't been scheduled,
@@ -1291,15 +1300,9 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1291 if (event->cpu != -1 && event->cpu != cpu) 1300 if (event->cpu != -1 && event->cpu != cpu)
1292 continue; 1301 continue;
1293 1302
1294 if (event != event->group_leader) { 1303 if (group_can_go_on(event, cpuctx, can_add_hw))
1295 if (event_sched_in(event, cpuctx, ctx, cpu)) 1304 if (group_sched_in(event, cpuctx, ctx, cpu))
1296 can_add_hw = 0; 1305 can_add_hw = 0;
1297 } else {
1298 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1299 if (group_sched_in(event, cpuctx, ctx, cpu))
1300 can_add_hw = 0;
1301 }
1302 }
1303 } 1306 }
1304 perf_enable(); 1307 perf_enable();
1305 out: 1308 out:
@@ -1368,7 +1371,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1368 u64 interrupts, freq; 1371 u64 interrupts, freq;
1369 1372
1370 spin_lock(&ctx->lock); 1373 spin_lock(&ctx->lock);
1371 list_for_each_entry(event, &ctx->group_list, group_entry) { 1374 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1372 if (event->state != PERF_EVENT_STATE_ACTIVE) 1375 if (event->state != PERF_EVENT_STATE_ACTIVE)
1373 continue; 1376 continue;
1374 1377
@@ -1528,7 +1531,6 @@ static void __perf_event_read(void *info)
1528 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1531 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1529 struct perf_event *event = info; 1532 struct perf_event *event = info;
1530 struct perf_event_context *ctx = event->ctx; 1533 struct perf_event_context *ctx = event->ctx;
1531 unsigned long flags;
1532 1534
1533 /* 1535 /*
1534 * If this is a task context, we need to check whether it is 1536 * If this is a task context, we need to check whether it is
@@ -1540,12 +1542,12 @@ static void __perf_event_read(void *info)
1540 if (ctx->task && cpuctx->task_ctx != ctx) 1542 if (ctx->task && cpuctx->task_ctx != ctx)
1541 return; 1543 return;
1542 1544
1543 local_irq_save(flags); 1545 spin_lock(&ctx->lock);
1544 if (ctx->is_active) 1546 update_context_time(ctx);
1545 update_context_time(ctx);
1546 event->pmu->read(event);
1547 update_event_times(event); 1547 update_event_times(event);
1548 local_irq_restore(flags); 1548 spin_unlock(&ctx->lock);
1549
1550 event->pmu->read(event);
1549} 1551}
1550 1552
1551static u64 perf_event_read(struct perf_event *event) 1553static u64 perf_event_read(struct perf_event *event)
@@ -1558,7 +1560,13 @@ static u64 perf_event_read(struct perf_event *event)
1558 smp_call_function_single(event->oncpu, 1560 smp_call_function_single(event->oncpu,
1559 __perf_event_read, event, 1); 1561 __perf_event_read, event, 1);
1560 } else if (event->state == PERF_EVENT_STATE_INACTIVE) { 1562 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1563 struct perf_event_context *ctx = event->ctx;
1564 unsigned long flags;
1565
1566 spin_lock_irqsave(&ctx->lock, flags);
1567 update_context_time(ctx);
1561 update_event_times(event); 1568 update_event_times(event);
1569 spin_unlock_irqrestore(&ctx->lock, flags);
1562 } 1570 }
1563 1571
1564 return atomic64_read(&event->count); 1572 return atomic64_read(&event->count);
@@ -1671,6 +1679,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1671 return ERR_PTR(err); 1679 return ERR_PTR(err);
1672} 1680}
1673 1681
1682static void perf_event_free_filter(struct perf_event *event);
1683
1674static void free_event_rcu(struct rcu_head *head) 1684static void free_event_rcu(struct rcu_head *head)
1675{ 1685{
1676 struct perf_event *event; 1686 struct perf_event *event;
@@ -1678,6 +1688,7 @@ static void free_event_rcu(struct rcu_head *head)
1678 event = container_of(head, struct perf_event, rcu_head); 1688 event = container_of(head, struct perf_event, rcu_head);
1679 if (event->ns) 1689 if (event->ns)
1680 put_pid_ns(event->ns); 1690 put_pid_ns(event->ns);
1691 perf_event_free_filter(event);
1681 kfree(event); 1692 kfree(event);
1682} 1693}
1683 1694
@@ -1709,16 +1720,10 @@ static void free_event(struct perf_event *event)
1709 call_rcu(&event->rcu_head, free_event_rcu); 1720 call_rcu(&event->rcu_head, free_event_rcu);
1710} 1721}
1711 1722
1712/* 1723int perf_event_release_kernel(struct perf_event *event)
1713 * Called when the last reference to the file is gone.
1714 */
1715static int perf_release(struct inode *inode, struct file *file)
1716{ 1724{
1717 struct perf_event *event = file->private_data;
1718 struct perf_event_context *ctx = event->ctx; 1725 struct perf_event_context *ctx = event->ctx;
1719 1726
1720 file->private_data = NULL;
1721
1722 WARN_ON_ONCE(ctx->parent_ctx); 1727 WARN_ON_ONCE(ctx->parent_ctx);
1723 mutex_lock(&ctx->mutex); 1728 mutex_lock(&ctx->mutex);
1724 perf_event_remove_from_context(event); 1729 perf_event_remove_from_context(event);
@@ -1733,6 +1738,19 @@ static int perf_release(struct inode *inode, struct file *file)
1733 1738
1734 return 0; 1739 return 0;
1735} 1740}
1741EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1742
1743/*
1744 * Called when the last reference to the file is gone.
1745 */
1746static int perf_release(struct inode *inode, struct file *file)
1747{
1748 struct perf_event *event = file->private_data;
1749
1750 file->private_data = NULL;
1751
1752 return perf_event_release_kernel(event);
1753}
1736 1754
1737static int perf_event_read_size(struct perf_event *event) 1755static int perf_event_read_size(struct perf_event *event)
1738{ 1756{
@@ -1759,91 +1777,94 @@ static int perf_event_read_size(struct perf_event *event)
1759 return size; 1777 return size;
1760} 1778}
1761 1779
1762static u64 perf_event_read_value(struct perf_event *event) 1780u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1763{ 1781{
1764 struct perf_event *child; 1782 struct perf_event *child;
1765 u64 total = 0; 1783 u64 total = 0;
1766 1784
1785 *enabled = 0;
1786 *running = 0;
1787
1788 mutex_lock(&event->child_mutex);
1767 total += perf_event_read(event); 1789 total += perf_event_read(event);
1768 list_for_each_entry(child, &event->child_list, child_list) 1790 *enabled += event->total_time_enabled +
1791 atomic64_read(&event->child_total_time_enabled);
1792 *running += event->total_time_running +
1793 atomic64_read(&event->child_total_time_running);
1794
1795 list_for_each_entry(child, &event->child_list, child_list) {
1769 total += perf_event_read(child); 1796 total += perf_event_read(child);
1797 *enabled += child->total_time_enabled;
1798 *running += child->total_time_running;
1799 }
1800 mutex_unlock(&event->child_mutex);
1770 1801
1771 return total; 1802 return total;
1772} 1803}
1773 1804EXPORT_SYMBOL_GPL(perf_event_read_value);
1774static int perf_event_read_entry(struct perf_event *event,
1775 u64 read_format, char __user *buf)
1776{
1777 int n = 0, count = 0;
1778 u64 values[2];
1779
1780 values[n++] = perf_event_read_value(event);
1781 if (read_format & PERF_FORMAT_ID)
1782 values[n++] = primary_event_id(event);
1783
1784 count = n * sizeof(u64);
1785
1786 if (copy_to_user(buf, values, count))
1787 return -EFAULT;
1788
1789 return count;
1790}
1791 1805
1792static int perf_event_read_group(struct perf_event *event, 1806static int perf_event_read_group(struct perf_event *event,
1793 u64 read_format, char __user *buf) 1807 u64 read_format, char __user *buf)
1794{ 1808{
1795 struct perf_event *leader = event->group_leader, *sub; 1809 struct perf_event *leader = event->group_leader, *sub;
1796 int n = 0, size = 0, err = -EFAULT; 1810 int n = 0, size = 0, ret = -EFAULT;
1797 u64 values[3]; 1811 struct perf_event_context *ctx = leader->ctx;
1812 u64 values[5];
1813 u64 count, enabled, running;
1814
1815 mutex_lock(&ctx->mutex);
1816 count = perf_event_read_value(leader, &enabled, &running);
1798 1817
1799 values[n++] = 1 + leader->nr_siblings; 1818 values[n++] = 1 + leader->nr_siblings;
1800 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1819 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1801 values[n++] = leader->total_time_enabled + 1820 values[n++] = enabled;
1802 atomic64_read(&leader->child_total_time_enabled); 1821 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1803 } 1822 values[n++] = running;
1804 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 1823 values[n++] = count;
1805 values[n++] = leader->total_time_running + 1824 if (read_format & PERF_FORMAT_ID)
1806 atomic64_read(&leader->child_total_time_running); 1825 values[n++] = primary_event_id(leader);
1807 }
1808 1826
1809 size = n * sizeof(u64); 1827 size = n * sizeof(u64);
1810 1828
1811 if (copy_to_user(buf, values, size)) 1829 if (copy_to_user(buf, values, size))
1812 return -EFAULT; 1830 goto unlock;
1813
1814 err = perf_event_read_entry(leader, read_format, buf + size);
1815 if (err < 0)
1816 return err;
1817 1831
1818 size += err; 1832 ret = size;
1819 1833
1820 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 1834 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1821 err = perf_event_read_entry(sub, read_format, 1835 n = 0;
1822 buf + size); 1836
1823 if (err < 0) 1837 values[n++] = perf_event_read_value(sub, &enabled, &running);
1824 return err; 1838 if (read_format & PERF_FORMAT_ID)
1839 values[n++] = primary_event_id(sub);
1840
1841 size = n * sizeof(u64);
1842
1843 if (copy_to_user(buf + ret, values, size)) {
1844 ret = -EFAULT;
1845 goto unlock;
1846 }
1825 1847
1826 size += err; 1848 ret += size;
1827 } 1849 }
1850unlock:
1851 mutex_unlock(&ctx->mutex);
1828 1852
1829 return size; 1853 return ret;
1830} 1854}
1831 1855
1832static int perf_event_read_one(struct perf_event *event, 1856static int perf_event_read_one(struct perf_event *event,
1833 u64 read_format, char __user *buf) 1857 u64 read_format, char __user *buf)
1834{ 1858{
1859 u64 enabled, running;
1835 u64 values[4]; 1860 u64 values[4];
1836 int n = 0; 1861 int n = 0;
1837 1862
1838 values[n++] = perf_event_read_value(event); 1863 values[n++] = perf_event_read_value(event, &enabled, &running);
1839 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1864 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1840 values[n++] = event->total_time_enabled + 1865 values[n++] = enabled;
1841 atomic64_read(&event->child_total_time_enabled); 1866 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1842 } 1867 values[n++] = running;
1843 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1844 values[n++] = event->total_time_running +
1845 atomic64_read(&event->child_total_time_running);
1846 }
1847 if (read_format & PERF_FORMAT_ID) 1868 if (read_format & PERF_FORMAT_ID)
1848 values[n++] = primary_event_id(event); 1869 values[n++] = primary_event_id(event);
1849 1870
@@ -1874,12 +1895,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1874 return -ENOSPC; 1895 return -ENOSPC;
1875 1896
1876 WARN_ON_ONCE(event->ctx->parent_ctx); 1897 WARN_ON_ONCE(event->ctx->parent_ctx);
1877 mutex_lock(&event->child_mutex);
1878 if (read_format & PERF_FORMAT_GROUP) 1898 if (read_format & PERF_FORMAT_GROUP)
1879 ret = perf_event_read_group(event, read_format, buf); 1899 ret = perf_event_read_group(event, read_format, buf);
1880 else 1900 else
1881 ret = perf_event_read_one(event, read_format, buf); 1901 ret = perf_event_read_one(event, read_format, buf);
1882 mutex_unlock(&event->child_mutex);
1883 1902
1884 return ret; 1903 return ret;
1885} 1904}
@@ -1987,7 +2006,8 @@ unlock:
1987 return ret; 2006 return ret;
1988} 2007}
1989 2008
1990int perf_event_set_output(struct perf_event *event, int output_fd); 2009static int perf_event_set_output(struct perf_event *event, int output_fd);
2010static int perf_event_set_filter(struct perf_event *event, void __user *arg);
1991 2011
1992static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 2012static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1993{ 2013{
@@ -2015,6 +2035,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2015 case PERF_EVENT_IOC_SET_OUTPUT: 2035 case PERF_EVENT_IOC_SET_OUTPUT:
2016 return perf_event_set_output(event, arg); 2036 return perf_event_set_output(event, arg);
2017 2037
2038 case PERF_EVENT_IOC_SET_FILTER:
2039 return perf_event_set_filter(event, (void __user *)arg);
2040
2018 default: 2041 default:
2019 return -ENOTTY; 2042 return -ENOTTY;
2020 } 2043 }
@@ -2105,49 +2128,31 @@ unlock:
2105 rcu_read_unlock(); 2128 rcu_read_unlock();
2106} 2129}
2107 2130
2108static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2131static unsigned long perf_data_size(struct perf_mmap_data *data)
2109{ 2132{
2110 struct perf_event *event = vma->vm_file->private_data; 2133 return data->nr_pages << (PAGE_SHIFT + data->data_order);
2111 struct perf_mmap_data *data; 2134}
2112 int ret = VM_FAULT_SIGBUS;
2113
2114 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2115 if (vmf->pgoff == 0)
2116 ret = 0;
2117 return ret;
2118 }
2119
2120 rcu_read_lock();
2121 data = rcu_dereference(event->data);
2122 if (!data)
2123 goto unlock;
2124
2125 if (vmf->pgoff == 0) {
2126 vmf->page = virt_to_page(data->user_page);
2127 } else {
2128 int nr = vmf->pgoff - 1;
2129
2130 if ((unsigned)nr > data->nr_pages)
2131 goto unlock;
2132 2135
2133 if (vmf->flags & FAULT_FLAG_WRITE) 2136#ifndef CONFIG_PERF_USE_VMALLOC
2134 goto unlock;
2135 2137
2136 vmf->page = virt_to_page(data->data_pages[nr]); 2138/*
2137 } 2139 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
2140 */
2138 2141
2139 get_page(vmf->page); 2142static struct page *
2140 vmf->page->mapping = vma->vm_file->f_mapping; 2143perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2141 vmf->page->index = vmf->pgoff; 2144{
2145 if (pgoff > data->nr_pages)
2146 return NULL;
2142 2147
2143 ret = 0; 2148 if (pgoff == 0)
2144unlock: 2149 return virt_to_page(data->user_page);
2145 rcu_read_unlock();
2146 2150
2147 return ret; 2151 return virt_to_page(data->data_pages[pgoff - 1]);
2148} 2152}
2149 2153
2150static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2154static struct perf_mmap_data *
2155perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2151{ 2156{
2152 struct perf_mmap_data *data; 2157 struct perf_mmap_data *data;
2153 unsigned long size; 2158 unsigned long size;
@@ -2172,19 +2177,10 @@ static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2172 goto fail_data_pages; 2177 goto fail_data_pages;
2173 } 2178 }
2174 2179
2180 data->data_order = 0;
2175 data->nr_pages = nr_pages; 2181 data->nr_pages = nr_pages;
2176 atomic_set(&data->lock, -1);
2177 2182
2178 if (event->attr.watermark) { 2183 return data;
2179 data->watermark = min_t(long, PAGE_SIZE * nr_pages,
2180 event->attr.wakeup_watermark);
2181 }
2182 if (!data->watermark)
2183 data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
2184
2185 rcu_assign_pointer(event->data, data);
2186
2187 return 0;
2188 2184
2189fail_data_pages: 2185fail_data_pages:
2190 for (i--; i >= 0; i--) 2186 for (i--; i >= 0; i--)
@@ -2196,7 +2192,7 @@ fail_user_page:
2196 kfree(data); 2192 kfree(data);
2197 2193
2198fail: 2194fail:
2199 return -ENOMEM; 2195 return NULL;
2200} 2196}
2201 2197
2202static void perf_mmap_free_page(unsigned long addr) 2198static void perf_mmap_free_page(unsigned long addr)
@@ -2207,28 +2203,170 @@ static void perf_mmap_free_page(unsigned long addr)
2207 __free_page(page); 2203 __free_page(page);
2208} 2204}
2209 2205
2210static void __perf_mmap_data_free(struct rcu_head *rcu_head) 2206static void perf_mmap_data_free(struct perf_mmap_data *data)
2211{ 2207{
2212 struct perf_mmap_data *data;
2213 int i; 2208 int i;
2214 2209
2215 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2216
2217 perf_mmap_free_page((unsigned long)data->user_page); 2210 perf_mmap_free_page((unsigned long)data->user_page);
2218 for (i = 0; i < data->nr_pages; i++) 2211 for (i = 0; i < data->nr_pages; i++)
2219 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2212 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2213 kfree(data);
2214}
2215
2216#else
2217
2218/*
2219 * Back perf_mmap() with vmalloc memory.
2220 *
2221 * Required for architectures that have d-cache aliasing issues.
2222 */
2223
2224static struct page *
2225perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2226{
2227 if (pgoff > (1UL << data->data_order))
2228 return NULL;
2229
2230 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2231}
2232
2233static void perf_mmap_unmark_page(void *addr)
2234{
2235 struct page *page = vmalloc_to_page(addr);
2236
2237 page->mapping = NULL;
2238}
2239
2240static void perf_mmap_data_free_work(struct work_struct *work)
2241{
2242 struct perf_mmap_data *data;
2243 void *base;
2244 int i, nr;
2245
2246 data = container_of(work, struct perf_mmap_data, work);
2247 nr = 1 << data->data_order;
2248
2249 base = data->user_page;
2250 for (i = 0; i < nr + 1; i++)
2251 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2252
2253 vfree(base);
2254 kfree(data);
2255}
2256
2257static void perf_mmap_data_free(struct perf_mmap_data *data)
2258{
2259 schedule_work(&data->work);
2260}
2261
2262static struct perf_mmap_data *
2263perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2264{
2265 struct perf_mmap_data *data;
2266 unsigned long size;
2267 void *all_buf;
2268
2269 WARN_ON(atomic_read(&event->mmap_count));
2270
2271 size = sizeof(struct perf_mmap_data);
2272 size += sizeof(void *);
2273
2274 data = kzalloc(size, GFP_KERNEL);
2275 if (!data)
2276 goto fail;
2220 2277
2278 INIT_WORK(&data->work, perf_mmap_data_free_work);
2279
2280 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2281 if (!all_buf)
2282 goto fail_all_buf;
2283
2284 data->user_page = all_buf;
2285 data->data_pages[0] = all_buf + PAGE_SIZE;
2286 data->data_order = ilog2(nr_pages);
2287 data->nr_pages = 1;
2288
2289 return data;
2290
2291fail_all_buf:
2221 kfree(data); 2292 kfree(data);
2293
2294fail:
2295 return NULL;
2296}
2297
2298#endif
2299
2300static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2301{
2302 struct perf_event *event = vma->vm_file->private_data;
2303 struct perf_mmap_data *data;
2304 int ret = VM_FAULT_SIGBUS;
2305
2306 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2307 if (vmf->pgoff == 0)
2308 ret = 0;
2309 return ret;
2310 }
2311
2312 rcu_read_lock();
2313 data = rcu_dereference(event->data);
2314 if (!data)
2315 goto unlock;
2316
2317 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2318 goto unlock;
2319
2320 vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2321 if (!vmf->page)
2322 goto unlock;
2323
2324 get_page(vmf->page);
2325 vmf->page->mapping = vma->vm_file->f_mapping;
2326 vmf->page->index = vmf->pgoff;
2327
2328 ret = 0;
2329unlock:
2330 rcu_read_unlock();
2331
2332 return ret;
2222} 2333}
2223 2334
2224static void perf_mmap_data_free(struct perf_event *event) 2335static void
2336perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2337{
2338 long max_size = perf_data_size(data);
2339
2340 atomic_set(&data->lock, -1);
2341
2342 if (event->attr.watermark) {
2343 data->watermark = min_t(long, max_size,
2344 event->attr.wakeup_watermark);
2345 }
2346
2347 if (!data->watermark)
2348 data->watermark = max_size / 2;
2349
2350
2351 rcu_assign_pointer(event->data, data);
2352}
2353
2354static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2355{
2356 struct perf_mmap_data *data;
2357
2358 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2359 perf_mmap_data_free(data);
2360}
2361
2362static void perf_mmap_data_release(struct perf_event *event)
2225{ 2363{
2226 struct perf_mmap_data *data = event->data; 2364 struct perf_mmap_data *data = event->data;
2227 2365
2228 WARN_ON(atomic_read(&event->mmap_count)); 2366 WARN_ON(atomic_read(&event->mmap_count));
2229 2367
2230 rcu_assign_pointer(event->data, NULL); 2368 rcu_assign_pointer(event->data, NULL);
2231 call_rcu(&data->rcu_head, __perf_mmap_data_free); 2369 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2232} 2370}
2233 2371
2234static void perf_mmap_open(struct vm_area_struct *vma) 2372static void perf_mmap_open(struct vm_area_struct *vma)
@@ -2244,16 +2382,17 @@ static void perf_mmap_close(struct vm_area_struct *vma)
2244 2382
2245 WARN_ON_ONCE(event->ctx->parent_ctx); 2383 WARN_ON_ONCE(event->ctx->parent_ctx);
2246 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2384 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2385 unsigned long size = perf_data_size(event->data);
2247 struct user_struct *user = current_user(); 2386 struct user_struct *user = current_user();
2248 2387
2249 atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm); 2388 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2250 vma->vm_mm->locked_vm -= event->data->nr_locked; 2389 vma->vm_mm->locked_vm -= event->data->nr_locked;
2251 perf_mmap_data_free(event); 2390 perf_mmap_data_release(event);
2252 mutex_unlock(&event->mmap_mutex); 2391 mutex_unlock(&event->mmap_mutex);
2253 } 2392 }
2254} 2393}
2255 2394
2256static struct vm_operations_struct perf_mmap_vmops = { 2395static const struct vm_operations_struct perf_mmap_vmops = {
2257 .open = perf_mmap_open, 2396 .open = perf_mmap_open,
2258 .close = perf_mmap_close, 2397 .close = perf_mmap_close,
2259 .fault = perf_mmap_fault, 2398 .fault = perf_mmap_fault,
@@ -2266,6 +2405,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2266 unsigned long user_locked, user_lock_limit; 2405 unsigned long user_locked, user_lock_limit;
2267 struct user_struct *user = current_user(); 2406 struct user_struct *user = current_user();
2268 unsigned long locked, lock_limit; 2407 unsigned long locked, lock_limit;
2408 struct perf_mmap_data *data;
2269 unsigned long vma_size; 2409 unsigned long vma_size;
2270 unsigned long nr_pages; 2410 unsigned long nr_pages;
2271 long user_extra, extra; 2411 long user_extra, extra;
@@ -2328,10 +2468,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2328 } 2468 }
2329 2469
2330 WARN_ON(event->data); 2470 WARN_ON(event->data);
2331 ret = perf_mmap_data_alloc(event, nr_pages); 2471
2332 if (ret) 2472 data = perf_mmap_data_alloc(event, nr_pages);
2473 ret = -ENOMEM;
2474 if (!data)
2333 goto unlock; 2475 goto unlock;
2334 2476
2477 ret = 0;
2478 perf_mmap_data_init(event, data);
2479
2335 atomic_set(&event->mmap_count, 1); 2480 atomic_set(&event->mmap_count, 1);
2336 atomic_long_add(user_extra, &user->locked_vm); 2481 atomic_long_add(user_extra, &user->locked_vm);
2337 vma->vm_mm->locked_vm += extra; 2482 vma->vm_mm->locked_vm += extra;
@@ -2519,7 +2664,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2519 if (!data->writable) 2664 if (!data->writable)
2520 return true; 2665 return true;
2521 2666
2522 mask = (data->nr_pages << PAGE_SHIFT) - 1; 2667 mask = perf_data_size(data) - 1;
2523 2668
2524 offset = (offset - tail) & mask; 2669 offset = (offset - tail) & mask;
2525 head = (head - tail) & mask; 2670 head = (head - tail) & mask;
@@ -2558,20 +2703,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2558static void perf_output_lock(struct perf_output_handle *handle) 2703static void perf_output_lock(struct perf_output_handle *handle)
2559{ 2704{
2560 struct perf_mmap_data *data = handle->data; 2705 struct perf_mmap_data *data = handle->data;
2561 int cpu; 2706 int cur, cpu = get_cpu();
2562 2707
2563 handle->locked = 0; 2708 handle->locked = 0;
2564 2709
2565 local_irq_save(handle->flags); 2710 for (;;) {
2566 cpu = smp_processor_id(); 2711 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2567 2712 if (cur == -1) {
2568 if (in_nmi() && atomic_read(&data->lock) == cpu) 2713 handle->locked = 1;
2569 return; 2714 break;
2715 }
2716 if (cur == cpu)
2717 break;
2570 2718
2571 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2572 cpu_relax(); 2719 cpu_relax();
2573 2720 }
2574 handle->locked = 1;
2575} 2721}
2576 2722
2577static void perf_output_unlock(struct perf_output_handle *handle) 2723static void perf_output_unlock(struct perf_output_handle *handle)
@@ -2617,14 +2763,14 @@ again:
2617 if (atomic_xchg(&data->wakeup, 0)) 2763 if (atomic_xchg(&data->wakeup, 0))
2618 perf_output_wakeup(handle); 2764 perf_output_wakeup(handle);
2619out: 2765out:
2620 local_irq_restore(handle->flags); 2766 put_cpu();
2621} 2767}
2622 2768
2623void perf_output_copy(struct perf_output_handle *handle, 2769void perf_output_copy(struct perf_output_handle *handle,
2624 const void *buf, unsigned int len) 2770 const void *buf, unsigned int len)
2625{ 2771{
2626 unsigned int pages_mask; 2772 unsigned int pages_mask;
2627 unsigned int offset; 2773 unsigned long offset;
2628 unsigned int size; 2774 unsigned int size;
2629 void **pages; 2775 void **pages;
2630 2776
@@ -2633,12 +2779,14 @@ void perf_output_copy(struct perf_output_handle *handle,
2633 pages = handle->data->data_pages; 2779 pages = handle->data->data_pages;
2634 2780
2635 do { 2781 do {
2636 unsigned int page_offset; 2782 unsigned long page_offset;
2783 unsigned long page_size;
2637 int nr; 2784 int nr;
2638 2785
2639 nr = (offset >> PAGE_SHIFT) & pages_mask; 2786 nr = (offset >> PAGE_SHIFT) & pages_mask;
2640 page_offset = offset & (PAGE_SIZE - 1); 2787 page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2641 size = min_t(unsigned int, PAGE_SIZE - page_offset, len); 2788 page_offset = offset & (page_size - 1);
2789 size = min_t(unsigned int, page_size - page_offset, len);
2642 2790
2643 memcpy(pages[nr] + page_offset, buf, size); 2791 memcpy(pages[nr] + page_offset, buf, size);
2644 2792
@@ -3126,15 +3274,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3126{ 3274{
3127 struct perf_event *event; 3275 struct perf_event *event;
3128 3276
3129 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3130 return;
3131
3132 rcu_read_lock();
3133 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3277 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3134 if (perf_event_task_match(event)) 3278 if (perf_event_task_match(event))
3135 perf_event_task_output(event, task_event); 3279 perf_event_task_output(event, task_event);
3136 } 3280 }
3137 rcu_read_unlock();
3138} 3281}
3139 3282
3140static void perf_event_task_event(struct perf_task_event *task_event) 3283static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3142,11 +3285,11 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3142 struct perf_cpu_context *cpuctx; 3285 struct perf_cpu_context *cpuctx;
3143 struct perf_event_context *ctx = task_event->task_ctx; 3286 struct perf_event_context *ctx = task_event->task_ctx;
3144 3287
3288 rcu_read_lock();
3145 cpuctx = &get_cpu_var(perf_cpu_context); 3289 cpuctx = &get_cpu_var(perf_cpu_context);
3146 perf_event_task_ctx(&cpuctx->ctx, task_event); 3290 perf_event_task_ctx(&cpuctx->ctx, task_event);
3147 put_cpu_var(perf_cpu_context); 3291 put_cpu_var(perf_cpu_context);
3148 3292
3149 rcu_read_lock();
3150 if (!ctx) 3293 if (!ctx)
3151 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3294 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3152 if (ctx) 3295 if (ctx)
@@ -3238,15 +3381,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx,
3238{ 3381{
3239 struct perf_event *event; 3382 struct perf_event *event;
3240 3383
3241 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3242 return;
3243
3244 rcu_read_lock();
3245 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3384 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3246 if (perf_event_comm_match(event)) 3385 if (perf_event_comm_match(event))
3247 perf_event_comm_output(event, comm_event); 3386 perf_event_comm_output(event, comm_event);
3248 } 3387 }
3249 rcu_read_unlock();
3250} 3388}
3251 3389
3252static void perf_event_comm_event(struct perf_comm_event *comm_event) 3390static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3257,7 +3395,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3257 char comm[TASK_COMM_LEN]; 3395 char comm[TASK_COMM_LEN];
3258 3396
3259 memset(comm, 0, sizeof(comm)); 3397 memset(comm, 0, sizeof(comm));
3260 strncpy(comm, comm_event->task->comm, sizeof(comm)); 3398 strlcpy(comm, comm_event->task->comm, sizeof(comm));
3261 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3399 size = ALIGN(strlen(comm)+1, sizeof(u64));
3262 3400
3263 comm_event->comm = comm; 3401 comm_event->comm = comm;
@@ -3265,11 +3403,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3265 3403
3266 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3404 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3267 3405
3406 rcu_read_lock();
3268 cpuctx = &get_cpu_var(perf_cpu_context); 3407 cpuctx = &get_cpu_var(perf_cpu_context);
3269 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3408 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3270 put_cpu_var(perf_cpu_context); 3409 put_cpu_var(perf_cpu_context);
3271 3410
3272 rcu_read_lock();
3273 /* 3411 /*
3274 * doesn't really matter which of the child contexts the 3412 * doesn't really matter which of the child contexts the
3275 * events ends up in. 3413 * events ends up in.
@@ -3362,15 +3500,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3362{ 3500{
3363 struct perf_event *event; 3501 struct perf_event *event;
3364 3502
3365 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3366 return;
3367
3368 rcu_read_lock();
3369 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3503 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3370 if (perf_event_mmap_match(event, mmap_event)) 3504 if (perf_event_mmap_match(event, mmap_event))
3371 perf_event_mmap_output(event, mmap_event); 3505 perf_event_mmap_output(event, mmap_event);
3372 } 3506 }
3373 rcu_read_unlock();
3374} 3507}
3375 3508
3376static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 3509static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3426,11 +3559,11 @@ got_name:
3426 3559
3427 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 3560 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3428 3561
3562 rcu_read_lock();
3429 cpuctx = &get_cpu_var(perf_cpu_context); 3563 cpuctx = &get_cpu_var(perf_cpu_context);
3430 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3564 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3431 put_cpu_var(perf_cpu_context); 3565 put_cpu_var(perf_cpu_context);
3432 3566
3433 rcu_read_lock();
3434 /* 3567 /*
3435 * doesn't really matter which of the child contexts the 3568 * doesn't really matter which of the child contexts the
3436 * events ends up in. 3569 * events ends up in.
@@ -3569,7 +3702,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3569 perf_event_disable(event); 3702 perf_event_disable(event);
3570 } 3703 }
3571 3704
3572 perf_event_output(event, nmi, data, regs); 3705 if (event->overflow_handler)
3706 event->overflow_handler(event, nmi, data, regs);
3707 else
3708 perf_event_output(event, nmi, data, regs);
3709
3573 return ret; 3710 return ret;
3574} 3711}
3575 3712
@@ -3614,16 +3751,16 @@ again:
3614 return nr; 3751 return nr;
3615} 3752}
3616 3753
3617static void perf_swevent_overflow(struct perf_event *event, 3754static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3618 int nmi, struct perf_sample_data *data, 3755 int nmi, struct perf_sample_data *data,
3619 struct pt_regs *regs) 3756 struct pt_regs *regs)
3620{ 3757{
3621 struct hw_perf_event *hwc = &event->hw; 3758 struct hw_perf_event *hwc = &event->hw;
3622 int throttle = 0; 3759 int throttle = 0;
3623 u64 overflow;
3624 3760
3625 data->period = event->hw.last_period; 3761 data->period = event->hw.last_period;
3626 overflow = perf_swevent_set_period(event); 3762 if (!overflow)
3763 overflow = perf_swevent_set_period(event);
3627 3764
3628 if (hwc->interrupts == MAX_INTERRUPTS) 3765 if (hwc->interrupts == MAX_INTERRUPTS)
3629 return; 3766 return;
@@ -3656,14 +3793,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3656 3793
3657 atomic64_add(nr, &event->count); 3794 atomic64_add(nr, &event->count);
3658 3795
3796 if (!regs)
3797 return;
3798
3659 if (!hwc->sample_period) 3799 if (!hwc->sample_period)
3660 return; 3800 return;
3661 3801
3662 if (!regs) 3802 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3803 return perf_swevent_overflow(event, 1, nmi, data, regs);
3804
3805 if (atomic64_add_negative(nr, &hwc->period_left))
3663 return; 3806 return;
3664 3807
3665 if (!atomic64_add_negative(nr, &hwc->period_left)) 3808 perf_swevent_overflow(event, 0, nmi, data, regs);
3666 perf_swevent_overflow(event, nmi, data, regs);
3667} 3809}
3668 3810
3669static int perf_swevent_is_counting(struct perf_event *event) 3811static int perf_swevent_is_counting(struct perf_event *event)
@@ -3696,25 +3838,44 @@ static int perf_swevent_is_counting(struct perf_event *event)
3696 return 1; 3838 return 1;
3697} 3839}
3698 3840
3841static int perf_tp_event_match(struct perf_event *event,
3842 struct perf_sample_data *data);
3843
3844static int perf_exclude_event(struct perf_event *event,
3845 struct pt_regs *regs)
3846{
3847 if (regs) {
3848 if (event->attr.exclude_user && user_mode(regs))
3849 return 1;
3850
3851 if (event->attr.exclude_kernel && !user_mode(regs))
3852 return 1;
3853 }
3854
3855 return 0;
3856}
3857
3699static int perf_swevent_match(struct perf_event *event, 3858static int perf_swevent_match(struct perf_event *event,
3700 enum perf_type_id type, 3859 enum perf_type_id type,
3701 u32 event_id, struct pt_regs *regs) 3860 u32 event_id,
3861 struct perf_sample_data *data,
3862 struct pt_regs *regs)
3702{ 3863{
3703 if (!perf_swevent_is_counting(event)) 3864 if (!perf_swevent_is_counting(event))
3704 return 0; 3865 return 0;
3705 3866
3706 if (event->attr.type != type) 3867 if (event->attr.type != type)
3707 return 0; 3868 return 0;
3869
3708 if (event->attr.config != event_id) 3870 if (event->attr.config != event_id)
3709 return 0; 3871 return 0;
3710 3872
3711 if (regs) { 3873 if (perf_exclude_event(event, regs))
3712 if (event->attr.exclude_user && user_mode(regs)) 3874 return 0;
3713 return 0;
3714 3875
3715 if (event->attr.exclude_kernel && !user_mode(regs)) 3876 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3716 return 0; 3877 !perf_tp_event_match(event, data))
3717 } 3878 return 0;
3718 3879
3719 return 1; 3880 return 1;
3720} 3881}
@@ -3727,49 +3888,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3727{ 3888{
3728 struct perf_event *event; 3889 struct perf_event *event;
3729 3890
3730 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3731 return;
3732
3733 rcu_read_lock();
3734 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3891 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3735 if (perf_swevent_match(event, type, event_id, regs)) 3892 if (perf_swevent_match(event, type, event_id, data, regs))
3736 perf_swevent_add(event, nr, nmi, data, regs); 3893 perf_swevent_add(event, nr, nmi, data, regs);
3737 } 3894 }
3738 rcu_read_unlock();
3739} 3895}
3740 3896
3741static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) 3897int perf_swevent_get_recursion_context(void)
3742{ 3898{
3899 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3900 int rctx;
3901
3743 if (in_nmi()) 3902 if (in_nmi())
3744 return &cpuctx->recursion[3]; 3903 rctx = 3;
3904 else if (in_irq())
3905 rctx = 2;
3906 else if (in_softirq())
3907 rctx = 1;
3908 else
3909 rctx = 0;
3745 3910
3746 if (in_irq()) 3911 if (cpuctx->recursion[rctx]) {
3747 return &cpuctx->recursion[2]; 3912 put_cpu_var(perf_cpu_context);
3913 return -1;
3914 }
3748 3915
3749 if (in_softirq()) 3916 cpuctx->recursion[rctx]++;
3750 return &cpuctx->recursion[1]; 3917 barrier();
3751 3918
3752 return &cpuctx->recursion[0]; 3919 return rctx;
3920}
3921EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
3922
3923void perf_swevent_put_recursion_context(int rctx)
3924{
3925 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3926 barrier();
3927 cpuctx->recursion[rctx]--;
3928 put_cpu_var(perf_cpu_context);
3753} 3929}
3930EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
3754 3931
3755static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 3932static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3756 u64 nr, int nmi, 3933 u64 nr, int nmi,
3757 struct perf_sample_data *data, 3934 struct perf_sample_data *data,
3758 struct pt_regs *regs) 3935 struct pt_regs *regs)
3759{ 3936{
3760 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 3937 struct perf_cpu_context *cpuctx;
3761 int *recursion = perf_swevent_recursion_context(cpuctx);
3762 struct perf_event_context *ctx; 3938 struct perf_event_context *ctx;
3763 3939
3764 if (*recursion) 3940 cpuctx = &__get_cpu_var(perf_cpu_context);
3765 goto out; 3941 rcu_read_lock();
3766
3767 (*recursion)++;
3768 barrier();
3769
3770 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, 3942 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3771 nr, nmi, data, regs); 3943 nr, nmi, data, regs);
3772 rcu_read_lock();
3773 /* 3944 /*
3774 * doesn't really matter which of the child contexts the 3945 * doesn't really matter which of the child contexts the
3775 * events ends up in. 3946 * events ends up in.
@@ -3778,23 +3949,24 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3778 if (ctx) 3949 if (ctx)
3779 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); 3950 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3780 rcu_read_unlock(); 3951 rcu_read_unlock();
3781
3782 barrier();
3783 (*recursion)--;
3784
3785out:
3786 put_cpu_var(perf_cpu_context);
3787} 3952}
3788 3953
3789void __perf_sw_event(u32 event_id, u64 nr, int nmi, 3954void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3790 struct pt_regs *regs, u64 addr) 3955 struct pt_regs *regs, u64 addr)
3791{ 3956{
3792 struct perf_sample_data data = { 3957 struct perf_sample_data data;
3793 .addr = addr, 3958 int rctx;
3794 };
3795 3959
3796 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, 3960 rctx = perf_swevent_get_recursion_context();
3797 &data, regs); 3961 if (rctx < 0)
3962 return;
3963
3964 data.addr = addr;
3965 data.raw = NULL;
3966
3967 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
3968
3969 perf_swevent_put_recursion_context(rctx);
3798} 3970}
3799 3971
3800static void perf_swevent_read(struct perf_event *event) 3972static void perf_swevent_read(struct perf_event *event)
@@ -3839,6 +4011,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3839 event->pmu->read(event); 4011 event->pmu->read(event);
3840 4012
3841 data.addr = 0; 4013 data.addr = 0;
4014 data.period = event->hw.last_period;
3842 regs = get_irq_regs(); 4015 regs = get_irq_regs();
3843 /* 4016 /*
3844 * In case we exclude kernel IPs or are somehow not in interrupt 4017 * In case we exclude kernel IPs or are somehow not in interrupt
@@ -3849,8 +4022,9 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3849 regs = task_pt_regs(current); 4022 regs = task_pt_regs(current);
3850 4023
3851 if (regs) { 4024 if (regs) {
3852 if (perf_event_overflow(event, 0, &data, regs)) 4025 if (!(event->attr.exclude_idle && current->pid == 0))
3853 ret = HRTIMER_NORESTART; 4026 if (perf_event_overflow(event, 0, &data, regs))
4027 ret = HRTIMER_NORESTART;
3854 } 4028 }
3855 4029
3856 period = max_t(u64, 10000, event->hw.sample_period); 4030 period = max_t(u64, 10000, event->hw.sample_period);
@@ -3859,6 +4033,42 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3859 return ret; 4033 return ret;
3860} 4034}
3861 4035
4036static void perf_swevent_start_hrtimer(struct perf_event *event)
4037{
4038 struct hw_perf_event *hwc = &event->hw;
4039
4040 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4041 hwc->hrtimer.function = perf_swevent_hrtimer;
4042 if (hwc->sample_period) {
4043 u64 period;
4044
4045 if (hwc->remaining) {
4046 if (hwc->remaining < 0)
4047 period = 10000;
4048 else
4049 period = hwc->remaining;
4050 hwc->remaining = 0;
4051 } else {
4052 period = max_t(u64, 10000, hwc->sample_period);
4053 }
4054 __hrtimer_start_range_ns(&hwc->hrtimer,
4055 ns_to_ktime(period), 0,
4056 HRTIMER_MODE_REL, 0);
4057 }
4058}
4059
4060static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4061{
4062 struct hw_perf_event *hwc = &event->hw;
4063
4064 if (hwc->sample_period) {
4065 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4066 hwc->remaining = ktime_to_ns(remaining);
4067
4068 hrtimer_cancel(&hwc->hrtimer);
4069 }
4070}
4071
3862/* 4072/*
3863 * Software event: cpu wall time clock 4073 * Software event: cpu wall time clock
3864 */ 4074 */
@@ -3881,22 +4091,14 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
3881 int cpu = raw_smp_processor_id(); 4091 int cpu = raw_smp_processor_id();
3882 4092
3883 atomic64_set(&hwc->prev_count, cpu_clock(cpu)); 4093 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3884 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 4094 perf_swevent_start_hrtimer(event);
3885 hwc->hrtimer.function = perf_swevent_hrtimer;
3886 if (hwc->sample_period) {
3887 u64 period = max_t(u64, 10000, hwc->sample_period);
3888 __hrtimer_start_range_ns(&hwc->hrtimer,
3889 ns_to_ktime(period), 0,
3890 HRTIMER_MODE_REL, 0);
3891 }
3892 4095
3893 return 0; 4096 return 0;
3894} 4097}
3895 4098
3896static void cpu_clock_perf_event_disable(struct perf_event *event) 4099static void cpu_clock_perf_event_disable(struct perf_event *event)
3897{ 4100{
3898 if (event->hw.sample_period) 4101 perf_swevent_cancel_hrtimer(event);
3899 hrtimer_cancel(&event->hw.hrtimer);
3900 cpu_clock_perf_event_update(event); 4102 cpu_clock_perf_event_update(event);
3901} 4103}
3902 4104
@@ -3933,22 +4135,15 @@ static int task_clock_perf_event_enable(struct perf_event *event)
3933 now = event->ctx->time; 4135 now = event->ctx->time;
3934 4136
3935 atomic64_set(&hwc->prev_count, now); 4137 atomic64_set(&hwc->prev_count, now);
3936 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 4138
3937 hwc->hrtimer.function = perf_swevent_hrtimer; 4139 perf_swevent_start_hrtimer(event);
3938 if (hwc->sample_period) {
3939 u64 period = max_t(u64, 10000, hwc->sample_period);
3940 __hrtimer_start_range_ns(&hwc->hrtimer,
3941 ns_to_ktime(period), 0,
3942 HRTIMER_MODE_REL, 0);
3943 }
3944 4140
3945 return 0; 4141 return 0;
3946} 4142}
3947 4143
3948static void task_clock_perf_event_disable(struct perf_event *event) 4144static void task_clock_perf_event_disable(struct perf_event *event)
3949{ 4145{
3950 if (event->hw.sample_period) 4146 perf_swevent_cancel_hrtimer(event);
3951 hrtimer_cancel(&event->hw.hrtimer);
3952 task_clock_perf_event_update(event, event->ctx->time); 4147 task_clock_perf_event_update(event, event->ctx->time);
3953 4148
3954} 4149}
@@ -3976,6 +4171,7 @@ static const struct pmu perf_ops_task_clock = {
3976}; 4171};
3977 4172
3978#ifdef CONFIG_EVENT_PROFILE 4173#ifdef CONFIG_EVENT_PROFILE
4174
3979void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4175void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
3980 int entry_size) 4176 int entry_size)
3981{ 4177{
@@ -3994,13 +4190,21 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
3994 if (!regs) 4190 if (!regs)
3995 regs = task_pt_regs(current); 4191 regs = task_pt_regs(current);
3996 4192
4193 /* Trace events already protected against recursion */
3997 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4194 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
3998 &data, regs); 4195 &data, regs);
3999} 4196}
4000EXPORT_SYMBOL_GPL(perf_tp_event); 4197EXPORT_SYMBOL_GPL(perf_tp_event);
4001 4198
4002extern int ftrace_profile_enable(int); 4199static int perf_tp_event_match(struct perf_event *event,
4003extern void ftrace_profile_disable(int); 4200 struct perf_sample_data *data)
4201{
4202 void *record = data->raw->data;
4203
4204 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4205 return 1;
4206 return 0;
4207}
4004 4208
4005static void tp_perf_event_destroy(struct perf_event *event) 4209static void tp_perf_event_destroy(struct perf_event *event)
4006{ 4210{
@@ -4025,11 +4229,99 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4025 4229
4026 return &perf_ops_generic; 4230 return &perf_ops_generic;
4027} 4231}
4232
4233static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4234{
4235 char *filter_str;
4236 int ret;
4237
4238 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4239 return -EINVAL;
4240
4241 filter_str = strndup_user(arg, PAGE_SIZE);
4242 if (IS_ERR(filter_str))
4243 return PTR_ERR(filter_str);
4244
4245 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4246
4247 kfree(filter_str);
4248 return ret;
4249}
4250
4251static void perf_event_free_filter(struct perf_event *event)
4252{
4253 ftrace_profile_free_filter(event);
4254}
4255
4028#else 4256#else
4257
4258static int perf_tp_event_match(struct perf_event *event,
4259 struct perf_sample_data *data)
4260{
4261 return 1;
4262}
4263
4029static const struct pmu *tp_perf_event_init(struct perf_event *event) 4264static const struct pmu *tp_perf_event_init(struct perf_event *event)
4030{ 4265{
4031 return NULL; 4266 return NULL;
4032} 4267}
4268
4269static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4270{
4271 return -ENOENT;
4272}
4273
4274static void perf_event_free_filter(struct perf_event *event)
4275{
4276}
4277
4278#endif /* CONFIG_EVENT_PROFILE */
4279
4280#ifdef CONFIG_HAVE_HW_BREAKPOINT
4281static void bp_perf_event_destroy(struct perf_event *event)
4282{
4283 release_bp_slot(event);
4284}
4285
4286static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4287{
4288 int err;
4289 /*
4290 * The breakpoint is already filled if we haven't created the counter
4291 * through perf syscall
4292 * FIXME: manage to get trigerred to NULL if it comes from syscalls
4293 */
4294 if (!bp->callback)
4295 err = register_perf_hw_breakpoint(bp);
4296 else
4297 err = __register_perf_hw_breakpoint(bp);
4298 if (err)
4299 return ERR_PTR(err);
4300
4301 bp->destroy = bp_perf_event_destroy;
4302
4303 return &perf_ops_bp;
4304}
4305
4306void perf_bp_event(struct perf_event *bp, void *data)
4307{
4308 struct perf_sample_data sample;
4309 struct pt_regs *regs = data;
4310
4311 sample.addr = bp->attr.bp_addr;
4312
4313 if (!perf_exclude_event(bp, regs))
4314 perf_swevent_add(bp, 1, 1, &sample, regs);
4315}
4316#else
4317static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4318{
4319 return NULL;
4320}
4321
4322void perf_bp_event(struct perf_event *bp, void *regs)
4323{
4324}
4033#endif 4325#endif
4034 4326
4035atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4327atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
@@ -4076,6 +4368,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4076 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 4368 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4077 case PERF_COUNT_SW_CONTEXT_SWITCHES: 4369 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4078 case PERF_COUNT_SW_CPU_MIGRATIONS: 4370 case PERF_COUNT_SW_CPU_MIGRATIONS:
4371 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4372 case PERF_COUNT_SW_EMULATION_FAULTS:
4079 if (!event->parent) { 4373 if (!event->parent) {
4080 atomic_inc(&perf_swevent_enabled[event_id]); 4374 atomic_inc(&perf_swevent_enabled[event_id]);
4081 event->destroy = sw_perf_event_destroy; 4375 event->destroy = sw_perf_event_destroy;
@@ -4096,6 +4390,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4096 struct perf_event_context *ctx, 4390 struct perf_event_context *ctx,
4097 struct perf_event *group_leader, 4391 struct perf_event *group_leader,
4098 struct perf_event *parent_event, 4392 struct perf_event *parent_event,
4393 perf_callback_t callback,
4099 gfp_t gfpflags) 4394 gfp_t gfpflags)
4100{ 4395{
4101 const struct pmu *pmu; 4396 const struct pmu *pmu;
@@ -4138,6 +4433,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4138 4433
4139 event->state = PERF_EVENT_STATE_INACTIVE; 4434 event->state = PERF_EVENT_STATE_INACTIVE;
4140 4435
4436 if (!callback && parent_event)
4437 callback = parent_event->callback;
4438
4439 event->callback = callback;
4440
4141 if (attr->disabled) 4441 if (attr->disabled)
4142 event->state = PERF_EVENT_STATE_OFF; 4442 event->state = PERF_EVENT_STATE_OFF;
4143 4443
@@ -4172,6 +4472,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4172 pmu = tp_perf_event_init(event); 4472 pmu = tp_perf_event_init(event);
4173 break; 4473 break;
4174 4474
4475 case PERF_TYPE_BREAKPOINT:
4476 pmu = bp_perf_event_init(event);
4477 break;
4478
4479
4175 default: 4480 default:
4176 break; 4481 break;
4177 } 4482 }
@@ -4284,7 +4589,7 @@ err_size:
4284 goto out; 4589 goto out;
4285} 4590}
4286 4591
4287int perf_event_set_output(struct perf_event *event, int output_fd) 4592static int perf_event_set_output(struct perf_event *event, int output_fd)
4288{ 4593{
4289 struct perf_event *output_event = NULL; 4594 struct perf_event *output_event = NULL;
4290 struct file *output_file = NULL; 4595 struct file *output_file = NULL;
@@ -4414,7 +4719,7 @@ SYSCALL_DEFINE5(perf_event_open,
4414 } 4719 }
4415 4720
4416 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 4721 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4417 NULL, GFP_KERNEL); 4722 NULL, NULL, GFP_KERNEL);
4418 err = PTR_ERR(event); 4723 err = PTR_ERR(event);
4419 if (IS_ERR(event)) 4724 if (IS_ERR(event))
4420 goto err_put_context; 4725 goto err_put_context;
@@ -4462,6 +4767,60 @@ err_put_context:
4462 return err; 4767 return err;
4463} 4768}
4464 4769
4770/**
4771 * perf_event_create_kernel_counter
4772 *
4773 * @attr: attributes of the counter to create
4774 * @cpu: cpu in which the counter is bound
4775 * @pid: task to profile
4776 */
4777struct perf_event *
4778perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4779 pid_t pid, perf_callback_t callback)
4780{
4781 struct perf_event *event;
4782 struct perf_event_context *ctx;
4783 int err;
4784
4785 /*
4786 * Get the target context (task or percpu):
4787 */
4788
4789 ctx = find_get_context(pid, cpu);
4790 if (IS_ERR(ctx)) {
4791 err = PTR_ERR(ctx);
4792 goto err_exit;
4793 }
4794
4795 event = perf_event_alloc(attr, cpu, ctx, NULL,
4796 NULL, callback, GFP_KERNEL);
4797 if (IS_ERR(event)) {
4798 err = PTR_ERR(event);
4799 goto err_put_context;
4800 }
4801
4802 event->filp = NULL;
4803 WARN_ON_ONCE(ctx->parent_ctx);
4804 mutex_lock(&ctx->mutex);
4805 perf_install_in_context(ctx, event, cpu);
4806 ++ctx->generation;
4807 mutex_unlock(&ctx->mutex);
4808
4809 event->owner = current;
4810 get_task_struct(current);
4811 mutex_lock(&current->perf_event_mutex);
4812 list_add_tail(&event->owner_entry, &current->perf_event_list);
4813 mutex_unlock(&current->perf_event_mutex);
4814
4815 return event;
4816
4817 err_put_context:
4818 put_ctx(ctx);
4819 err_exit:
4820 return ERR_PTR(err);
4821}
4822EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4823
4465/* 4824/*
4466 * inherit a event from parent task to child task: 4825 * inherit a event from parent task to child task:
4467 */ 4826 */
@@ -4487,7 +4846,7 @@ inherit_event(struct perf_event *parent_event,
4487 child_event = perf_event_alloc(&parent_event->attr, 4846 child_event = perf_event_alloc(&parent_event->attr,
4488 parent_event->cpu, child_ctx, 4847 parent_event->cpu, child_ctx,
4489 group_leader, parent_event, 4848 group_leader, parent_event,
4490 GFP_KERNEL); 4849 NULL, GFP_KERNEL);
4491 if (IS_ERR(child_event)) 4850 if (IS_ERR(child_event))
4492 return child_event; 4851 return child_event;
4493 get_ctx(child_ctx); 4852 get_ctx(child_ctx);
@@ -4505,6 +4864,8 @@ inherit_event(struct perf_event *parent_event,
4505 if (parent_event->attr.freq) 4864 if (parent_event->attr.freq)
4506 child_event->hw.sample_period = parent_event->hw.sample_period; 4865 child_event->hw.sample_period = parent_event->hw.sample_period;
4507 4866
4867 child_event->overflow_handler = parent_event->overflow_handler;
4868
4508 /* 4869 /*
4509 * Link it up in the child's context: 4870 * Link it up in the child's context:
4510 */ 4871 */
@@ -4594,7 +4955,6 @@ __perf_event_exit_task(struct perf_event *child_event,
4594{ 4955{
4595 struct perf_event *parent_event; 4956 struct perf_event *parent_event;
4596 4957
4597 update_event_times(child_event);
4598 perf_event_remove_from_context(child_event); 4958 perf_event_remove_from_context(child_event);
4599 4959
4600 parent_event = child_event->parent; 4960 parent_event = child_event->parent;
@@ -4646,6 +5006,7 @@ void perf_event_exit_task(struct task_struct *child)
4646 * the events from it. 5006 * the events from it.
4647 */ 5007 */
4648 unclone_ctx(child_ctx); 5008 unclone_ctx(child_ctx);
5009 update_context_time(child_ctx);
4649 spin_unlock_irqrestore(&child_ctx->lock, flags); 5010 spin_unlock_irqrestore(&child_ctx->lock, flags);
4650 5011
4651 /* 5012 /*
@@ -4781,9 +5142,7 @@ int perf_event_init_task(struct task_struct *child)
4781 * We dont have to disable NMIs - we are only looking at 5142 * We dont have to disable NMIs - we are only looking at
4782 * the list, not manipulating it: 5143 * the list, not manipulating it:
4783 */ 5144 */
4784 list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) { 5145 list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
4785 if (event != event->group_leader)
4786 continue;
4787 5146
4788 if (!event->attr.inherit) { 5147 if (!event->attr.inherit) {
4789 inherited_all = 0; 5148 inherited_all = 0;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 821722ae58a7..86b3796b0436 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
118{ 118{
119 if (!(flags & CLONE_NEWPID)) 119 if (!(flags & CLONE_NEWPID))
120 return get_pid_ns(old_ns); 120 return get_pid_ns(old_ns);
121 if (flags & CLONE_THREAD) 121 if (flags & (CLONE_THREAD|CLONE_PARENT))
122 return ERR_PTR(-EINVAL); 122 return ERR_PTR(-EINVAL);
123 return create_pid_namespace(old_ns); 123 return create_pid_namespace(old_ns);
124} 124}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 04b3a83d686f..04a9e90d248f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -693,21 +693,22 @@ static int software_resume(void)
693 /* The snapshot device should not be opened while we're running */ 693 /* The snapshot device should not be opened while we're running */
694 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 694 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
695 error = -EBUSY; 695 error = -EBUSY;
696 swsusp_close(FMODE_READ);
696 goto Unlock; 697 goto Unlock;
697 } 698 }
698 699
699 pm_prepare_console(); 700 pm_prepare_console();
700 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 701 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
701 if (error) 702 if (error)
702 goto Finish; 703 goto close_finish;
703 704
704 error = usermodehelper_disable(); 705 error = usermodehelper_disable();
705 if (error) 706 if (error)
706 goto Finish; 707 goto close_finish;
707 708
708 error = create_basic_memory_bitmaps(); 709 error = create_basic_memory_bitmaps();
709 if (error) 710 if (error)
710 goto Finish; 711 goto close_finish;
711 712
712 pr_debug("PM: Preparing processes for restore.\n"); 713 pr_debug("PM: Preparing processes for restore.\n");
713 error = prepare_processes(); 714 error = prepare_processes();
@@ -719,6 +720,7 @@ static int software_resume(void)
719 pr_debug("PM: Reading hibernation image.\n"); 720 pr_debug("PM: Reading hibernation image.\n");
720 721
721 error = swsusp_read(&flags); 722 error = swsusp_read(&flags);
723 swsusp_close(FMODE_READ);
722 if (!error) 724 if (!error)
723 hibernation_restore(flags & SF_PLATFORM_MODE); 725 hibernation_restore(flags & SF_PLATFORM_MODE);
724 726
@@ -737,6 +739,9 @@ static int software_resume(void)
737 mutex_unlock(&pm_mutex); 739 mutex_unlock(&pm_mutex);
738 pr_debug("PM: Resume from disk failed.\n"); 740 pr_debug("PM: Resume from disk failed.\n");
739 return error; 741 return error;
742close_finish:
743 swsusp_close(FMODE_READ);
744 goto Finish;
740} 745}
741 746
742late_initcall(software_resume); 747late_initcall(software_resume);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 17d8bb1acf9c..25596e450ac7 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -19,7 +19,7 @@
19 * The time it takes is system-specific though, so when we test this 19 * The time it takes is system-specific though, so when we test this
20 * during system bootup we allow a LOT of time. 20 * during system bootup we allow a LOT of time.
21 */ 21 */
22#define TEST_SUSPEND_SECONDS 5 22#define TEST_SUSPEND_SECONDS 10
23 23
24static unsigned long suspend_test_start_time; 24static unsigned long suspend_test_start_time;
25 25
@@ -49,7 +49,8 @@ void suspend_test_finish(const char *label)
49 * has some performance issues. The stack dump of a WARN_ON 49 * has some performance issues. The stack dump of a WARN_ON
50 * is more likely to get the right attention than a printk... 50 * is more likely to get the right attention than a printk...
51 */ 51 */
52 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); 52 WARN(msec > (TEST_SUSPEND_SECONDS * 1000),
53 "Component: %s, time: %u\n", label, msec);
53} 54}
54 55
55/* 56/*
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8ba052c86d48..890f6b11b1d3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -13,7 +13,6 @@
13 13
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/utsname.h>
17#include <linux/delay.h> 16#include <linux/delay.h>
18#include <linux/bitops.h> 17#include <linux/bitops.h>
19#include <linux/genhd.h> 18#include <linux/genhd.h>
@@ -315,7 +314,6 @@ static int save_image(struct swap_map_handle *handle,
315{ 314{
316 unsigned int m; 315 unsigned int m;
317 int ret; 316 int ret;
318 int error = 0;
319 int nr_pages; 317 int nr_pages;
320 int err2; 318 int err2;
321 struct bio *bio; 319 struct bio *bio;
@@ -330,26 +328,27 @@ static int save_image(struct swap_map_handle *handle,
330 nr_pages = 0; 328 nr_pages = 0;
331 bio = NULL; 329 bio = NULL;
332 do_gettimeofday(&start); 330 do_gettimeofday(&start);
333 do { 331 while (1) {
334 ret = snapshot_read_next(snapshot, PAGE_SIZE); 332 ret = snapshot_read_next(snapshot, PAGE_SIZE);
335 if (ret > 0) { 333 if (ret <= 0)
336 error = swap_write_page(handle, data_of(*snapshot), 334 break;
337 &bio); 335 ret = swap_write_page(handle, data_of(*snapshot), &bio);
338 if (error) 336 if (ret)
339 break; 337 break;
340 if (!(nr_pages % m)) 338 if (!(nr_pages % m))
341 printk("\b\b\b\b%3d%%", nr_pages / m); 339 printk("\b\b\b\b%3d%%", nr_pages / m);
342 nr_pages++; 340 nr_pages++;
343 } 341 }
344 } while (ret > 0);
345 err2 = wait_on_bio_chain(&bio); 342 err2 = wait_on_bio_chain(&bio);
346 do_gettimeofday(&stop); 343 do_gettimeofday(&stop);
347 if (!error) 344 if (!ret)
348 error = err2; 345 ret = err2;
349 if (!error) 346 if (!ret)
350 printk("\b\b\b\bdone\n"); 347 printk("\b\b\b\bdone\n");
348 else
349 printk("\n");
351 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 350 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
352 return error; 351 return ret;
353} 352}
354 353
355/** 354/**
@@ -537,7 +536,8 @@ static int load_image(struct swap_map_handle *handle,
537 snapshot_write_finalize(snapshot); 536 snapshot_write_finalize(snapshot);
538 if (!snapshot_image_loaded(snapshot)) 537 if (!snapshot_image_loaded(snapshot))
539 error = -ENODATA; 538 error = -ENODATA;
540 } 539 } else
540 printk("\n");
541 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 541 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
542 return error; 542 return error;
543} 543}
@@ -573,8 +573,6 @@ int swsusp_read(unsigned int *flags_p)
573 error = load_image(&handle, &snapshot, header->pages - 1); 573 error = load_image(&handle, &snapshot, header->pages - 1);
574 release_swap_reader(&handle); 574 release_swap_reader(&handle);
575 575
576 blkdev_put(resume_bdev, FMODE_READ);
577
578 if (!error) 576 if (!error)
579 pr_debug("PM: Image successfully loaded\n"); 577 pr_debug("PM: Image successfully loaded\n");
580 else 578 else
@@ -597,7 +595,7 @@ int swsusp_check(void)
597 error = bio_read_page(swsusp_resume_block, 595 error = bio_read_page(swsusp_resume_block,
598 swsusp_header, NULL); 596 swsusp_header, NULL);
599 if (error) 597 if (error)
600 return error; 598 goto put;
601 599
602 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 600 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
603 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 601 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
@@ -605,8 +603,10 @@ int swsusp_check(void)
605 error = bio_write_page(swsusp_resume_block, 603 error = bio_write_page(swsusp_resume_block,
606 swsusp_header, NULL); 604 swsusp_header, NULL);
607 } else { 605 } else {
608 return -EINVAL; 606 error = -EINVAL;
609 } 607 }
608
609put:
610 if (error) 610 if (error)
611 blkdev_put(resume_bdev, FMODE_READ); 611 blkdev_put(resume_bdev, FMODE_READ);
612 else 612 else
diff --git a/kernel/printk.c b/kernel/printk.c
index f38b07f78a4e..b5ac4d99c667 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,6 +33,7 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h>
36 37
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
38 39
@@ -1376,11 +1377,11 @@ late_initcall(disable_boot_consoles);
1376 */ 1377 */
1377DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); 1378DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
1378 1379
1379int printk_ratelimit(void) 1380int __printk_ratelimit(const char *func)
1380{ 1381{
1381 return __ratelimit(&printk_ratelimit_state); 1382 return ___ratelimit(&printk_ratelimit_state, func);
1382} 1383}
1383EXPORT_SYMBOL(printk_ratelimit); 1384EXPORT_SYMBOL(__printk_ratelimit);
1384 1385
1385/** 1386/**
1386 * printk_timed_ratelimit - caller-controlled printk ratelimiting 1387 * printk_timed_ratelimit - caller-controlled printk ratelimiting
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 307c285af59e..23bd09cd042e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh)
266 * or self-reaping. Do notification now if it would have happened earlier. 266 * or self-reaping. Do notification now if it would have happened earlier.
267 * If it should reap itself, return true. 267 * If it should reap itself, return true.
268 * 268 *
269 * If it's our own child, there is no notification to do. 269 * If it's our own child, there is no notification to do. But if our normal
270 * But if our normal children self-reap, then this child 270 * children self-reap, then this child was prevented by ptrace and we must
271 * was prevented by ptrace and we must reap it now. 271 * reap it now, in that case we must also wake up sub-threads sleeping in
272 * do_wait().
272 */ 273 */
273static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) 274static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
274{ 275{
@@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
278 if (!task_detached(p) && thread_group_empty(p)) { 279 if (!task_detached(p) && thread_group_empty(p)) {
279 if (!same_thread_group(p->real_parent, tracer)) 280 if (!same_thread_group(p->real_parent, tracer))
280 do_notify_parent(p, p->exit_signal); 281 do_notify_parent(p, p->exit_signal);
281 else if (ignoring_children(tracer->sighand)) 282 else if (ignoring_children(tracer->sighand)) {
283 __wake_up_parent(p, tracer);
282 p->exit_signal = -1; 284 p->exit_signal = -1;
285 }
283 } 286 }
284 if (task_detached(p)) { 287 if (task_detached(p)) {
285 /* Mark it as in the process of being reaped. */ 288 /* Mark it as in the process of being reaped. */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 37ac45483082..9b7fd4723878 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,23 +44,13 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
48 47
49enum rcu_barrier { 48#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 RCU_BARRIER_STD, 49static struct lock_class_key rcu_lock_key;
51 RCU_BARRIER_BH, 50struct lockdep_map rcu_lock_map =
52 RCU_BARRIER_SCHED, 51 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
53}; 52EXPORT_SYMBOL_GPL(rcu_lock_map);
54 53#endif
55static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
56static atomic_t rcu_barrier_cpu_count;
57static DEFINE_MUTEX(rcu_barrier_mutex);
58static struct completion rcu_barrier_completion;
59int rcu_scheduler_active __read_mostly;
60
61static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
62static struct rcu_head rcu_migrate_head[3];
63static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
64 54
65/* 55/*
66 * Awaken the corresponding synchronize_rcu() instance now that a 56 * Awaken the corresponding synchronize_rcu() instance now that a
@@ -73,241 +63,3 @@ void wakeme_after_rcu(struct rcu_head *head)
73 rcu = container_of(head, struct rcu_synchronize, head); 63 rcu = container_of(head, struct rcu_synchronize, head);
74 complete(&rcu->completion); 64 complete(&rcu->completion);
75} 65}
76
77#ifdef CONFIG_TREE_PREEMPT_RCU
78
79/**
80 * synchronize_rcu - wait until a grace period has elapsed.
81 *
82 * Control will return to the caller some time after a full grace
83 * period has elapsed, in other words after all currently executing RCU
84 * read-side critical sections have completed. RCU read-side critical
85 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
86 * and may be nested.
87 */
88void synchronize_rcu(void)
89{
90 struct rcu_synchronize rcu;
91
92 if (!rcu_scheduler_active)
93 return;
94
95 init_completion(&rcu.completion);
96 /* Will wake me after RCU finished. */
97 call_rcu(&rcu.head, wakeme_after_rcu);
98 /* Wait for it. */
99 wait_for_completion(&rcu.completion);
100}
101EXPORT_SYMBOL_GPL(synchronize_rcu);
102
103#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
104
105/**
106 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
107 *
108 * Control will return to the caller some time after a full rcu-sched
109 * grace period has elapsed, in other words after all currently executing
110 * rcu-sched read-side critical sections have completed. These read-side
111 * critical sections are delimited by rcu_read_lock_sched() and
112 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
113 * local_irq_disable(), and so on may be used in place of
114 * rcu_read_lock_sched().
115 *
116 * This means that all preempt_disable code sequences, including NMI and
117 * hardware-interrupt handlers, in progress on entry will have completed
118 * before this primitive returns. However, this does not guarantee that
119 * softirq handlers will have completed, since in some kernels, these
120 * handlers can run in process context, and can block.
121 *
122 * This primitive provides the guarantees made by the (now removed)
123 * synchronize_kernel() API. In contrast, synchronize_rcu() only
124 * guarantees that rcu_read_lock() sections will have completed.
125 * In "classic RCU", these two guarantees happen to be one and
126 * the same, but can differ in realtime RCU implementations.
127 */
128void synchronize_sched(void)
129{
130 struct rcu_synchronize rcu;
131
132 if (rcu_blocking_is_gp())
133 return;
134
135 init_completion(&rcu.completion);
136 /* Will wake me after RCU finished. */
137 call_rcu_sched(&rcu.head, wakeme_after_rcu);
138 /* Wait for it. */
139 wait_for_completion(&rcu.completion);
140}
141EXPORT_SYMBOL_GPL(synchronize_sched);
142
143/**
144 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
145 *
146 * Control will return to the caller some time after a full rcu_bh grace
147 * period has elapsed, in other words after all currently executing rcu_bh
148 * read-side critical sections have completed. RCU read-side critical
149 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
150 * and may be nested.
151 */
152void synchronize_rcu_bh(void)
153{
154 struct rcu_synchronize rcu;
155
156 if (rcu_blocking_is_gp())
157 return;
158
159 init_completion(&rcu.completion);
160 /* Will wake me after RCU finished. */
161 call_rcu_bh(&rcu.head, wakeme_after_rcu);
162 /* Wait for it. */
163 wait_for_completion(&rcu.completion);
164}
165EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
166
167static void rcu_barrier_callback(struct rcu_head *notused)
168{
169 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
170 complete(&rcu_barrier_completion);
171}
172
173/*
174 * Called with preemption disabled, and from cross-cpu IRQ context.
175 */
176static void rcu_barrier_func(void *type)
177{
178 int cpu = smp_processor_id();
179 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
180
181 atomic_inc(&rcu_barrier_cpu_count);
182 switch ((enum rcu_barrier)type) {
183 case RCU_BARRIER_STD:
184 call_rcu(head, rcu_barrier_callback);
185 break;
186 case RCU_BARRIER_BH:
187 call_rcu_bh(head, rcu_barrier_callback);
188 break;
189 case RCU_BARRIER_SCHED:
190 call_rcu_sched(head, rcu_barrier_callback);
191 break;
192 }
193}
194
195static inline void wait_migrated_callbacks(void)
196{
197 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
198 smp_mb(); /* In case we didn't sleep. */
199}
200
201/*
202 * Orchestrate the specified type of RCU barrier, waiting for all
203 * RCU callbacks of the specified type to complete.
204 */
205static void _rcu_barrier(enum rcu_barrier type)
206{
207 BUG_ON(in_interrupt());
208 /* Take cpucontrol mutex to protect against CPU hotplug */
209 mutex_lock(&rcu_barrier_mutex);
210 init_completion(&rcu_barrier_completion);
211 /*
212 * Initialize rcu_barrier_cpu_count to 1, then invoke
213 * rcu_barrier_func() on each CPU, so that each CPU also has
214 * incremented rcu_barrier_cpu_count. Only then is it safe to
215 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
216 * might complete its grace period before all of the other CPUs
217 * did their increment, causing this function to return too
218 * early.
219 */
220 atomic_set(&rcu_barrier_cpu_count, 1);
221 on_each_cpu(rcu_barrier_func, (void *)type, 1);
222 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
223 complete(&rcu_barrier_completion);
224 wait_for_completion(&rcu_barrier_completion);
225 mutex_unlock(&rcu_barrier_mutex);
226 wait_migrated_callbacks();
227}
228
229/**
230 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
231 */
232void rcu_barrier(void)
233{
234 _rcu_barrier(RCU_BARRIER_STD);
235}
236EXPORT_SYMBOL_GPL(rcu_barrier);
237
238/**
239 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
240 */
241void rcu_barrier_bh(void)
242{
243 _rcu_barrier(RCU_BARRIER_BH);
244}
245EXPORT_SYMBOL_GPL(rcu_barrier_bh);
246
247/**
248 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
249 */
250void rcu_barrier_sched(void)
251{
252 _rcu_barrier(RCU_BARRIER_SCHED);
253}
254EXPORT_SYMBOL_GPL(rcu_barrier_sched);
255
256static void rcu_migrate_callback(struct rcu_head *notused)
257{
258 if (atomic_dec_and_test(&rcu_migrate_type_count))
259 wake_up(&rcu_migrate_wq);
260}
261
262extern int rcu_cpu_notify(struct notifier_block *self,
263 unsigned long action, void *hcpu);
264
265static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
266 unsigned long action, void *hcpu)
267{
268 rcu_cpu_notify(self, action, hcpu);
269 if (action == CPU_DYING) {
270 /*
271 * preempt_disable() in on_each_cpu() prevents stop_machine(),
272 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
273 * returns, all online cpus have queued rcu_barrier_func(),
274 * and the dead cpu(if it exist) queues rcu_migrate_callback()s.
275 *
276 * These callbacks ensure _rcu_barrier() waits for all
277 * RCU callbacks of the specified type to complete.
278 */
279 atomic_set(&rcu_migrate_type_count, 3);
280 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
281 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
282 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
283 } else if (action == CPU_DOWN_PREPARE) {
284 /* Don't need to wait until next removal operation. */
285 /* rcu_migrate_head is protected by cpu_add_remove_lock */
286 wait_migrated_callbacks();
287 }
288
289 return NOTIFY_OK;
290}
291
292void __init rcu_init(void)
293{
294 int i;
295
296 __rcu_init();
297 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
298
299 /*
300 * We don't need protection against CPU-hotplug here because
301 * this is called early in boot, before either interrupts
302 * or the scheduler are operational.
303 */
304 for_each_online_cpu(i)
305 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
306}
307
308void rcu_scheduler_starting(void)
309{
310 WARN_ON(num_online_cpus() != 1);
311 WARN_ON(nr_context_switches() > 0);
312 rcu_scheduler_active = 1;
313}
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
new file mode 100644
index 000000000000..9f6d9ff2572c
--- /dev/null
+++ b/kernel/rcutiny.c
@@ -0,0 +1,282 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 */
25#include <linux/moduleparam.h>
26#include <linux/completion.h>
27#include <linux/interrupt.h>
28#include <linux/notifier.h>
29#include <linux/rcupdate.h>
30#include <linux/kernel.h>
31#include <linux/module.h>
32#include <linux/mutex.h>
33#include <linux/sched.h>
34#include <linux/types.h>
35#include <linux/init.h>
36#include <linux/time.h>
37#include <linux/cpu.h>
38
39/* Global control variables for rcupdate callback mechanism. */
40struct rcu_ctrlblk {
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
43 struct rcu_head **curtail; /* ->next pointer of last CB. */
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_NO_HZ
58
59static long rcu_dynticks_nesting = 1;
60
61/*
62 * Enter dynticks-idle mode, which is an extended quiescent state
63 * if we have fully entered that mode (i.e., if the new value of
64 * dynticks_nesting is zero).
65 */
66void rcu_enter_nohz(void)
67{
68 if (--rcu_dynticks_nesting == 0)
69 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
70}
71
72/*
73 * Exit dynticks-idle mode, so that we are no longer in an extended
74 * quiescent state.
75 */
76void rcu_exit_nohz(void)
77{
78 rcu_dynticks_nesting++;
79}
80
81#endif /* #ifdef CONFIG_NO_HZ */
82
83/*
84 * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc().
85 * Also disable irqs to avoid confusion due to interrupt handlers
86 * invoking call_rcu().
87 */
88static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
89{
90 unsigned long flags;
91
92 local_irq_save(flags);
93 if (rcp->rcucblist != NULL &&
94 rcp->donetail != rcp->curtail) {
95 rcp->donetail = rcp->curtail;
96 local_irq_restore(flags);
97 return 1;
98 }
99 local_irq_restore(flags);
100
101 return 0;
102}
103
104/*
105 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
106 * are at it, given that any rcu quiescent state is also an rcu_bh
107 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
108 */
109void rcu_sched_qs(int cpu)
110{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ);
113}
114
115/*
116 * Record an rcu_bh quiescent state.
117 */
118void rcu_bh_qs(int cpu)
119{
120 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
121 raise_softirq(RCU_SOFTIRQ);
122}
123
124/*
125 * Check to see if the scheduling-clock interrupt came from an extended
126 * quiescent state, and, if so, tell RCU about it.
127 */
128void rcu_check_callbacks(int cpu, int user)
129{
130 if (user ||
131 (idle_cpu(cpu) &&
132 !in_softirq() &&
133 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
134 rcu_sched_qs(cpu);
135 else if (!in_softirq())
136 rcu_bh_qs(cpu);
137}
138
139/*
140 * Helper function for rcu_process_callbacks() that operates on the
141 * specified rcu_ctrlkblk structure.
142 */
143static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
144{
145 struct rcu_head *next, *list;
146 unsigned long flags;
147
148 /* If no RCU callbacks ready to invoke, just return. */
149 if (&rcp->rcucblist == rcp->donetail)
150 return;
151
152 /* Move the ready-to-invoke callbacks to a local list. */
153 local_irq_save(flags);
154 list = rcp->rcucblist;
155 rcp->rcucblist = *rcp->donetail;
156 *rcp->donetail = NULL;
157 if (rcp->curtail == rcp->donetail)
158 rcp->curtail = &rcp->rcucblist;
159 rcp->donetail = &rcp->rcucblist;
160 local_irq_restore(flags);
161
162 /* Invoke the callbacks on the local list. */
163 while (list) {
164 next = list->next;
165 prefetch(next);
166 list->func(list);
167 list = next;
168 }
169}
170
171/*
172 * Invoke any callbacks whose grace period has completed.
173 */
174static void rcu_process_callbacks(struct softirq_action *unused)
175{
176 __rcu_process_callbacks(&rcu_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178}
179
180/*
181 * Wait for a grace period to elapse. But it is illegal to invoke
182 * synchronize_sched() from within an RCU read-side critical section.
183 * Therefore, any legal call to synchronize_sched() is a quiescent
184 * state, and so on a UP system, synchronize_sched() need do nothing.
185 * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the
186 * benefits of doing might_sleep() to reduce latency.)
187 *
188 * Cool, huh? (Due to Josh Triplett.)
189 *
190 * But we want to make this a static inline later.
191 */
192void synchronize_sched(void)
193{
194 cond_resched();
195}
196EXPORT_SYMBOL_GPL(synchronize_sched);
197
198void synchronize_rcu_bh(void)
199{
200 synchronize_sched();
201}
202EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/*
205 * Helper function for call_rcu() and call_rcu_bh().
206 */
207static void __call_rcu(struct rcu_head *head,
208 void (*func)(struct rcu_head *rcu),
209 struct rcu_ctrlblk *rcp)
210{
211 unsigned long flags;
212
213 head->func = func;
214 head->next = NULL;
215
216 local_irq_save(flags);
217 *rcp->curtail = head;
218 rcp->curtail = &head->next;
219 local_irq_restore(flags);
220}
221
222/*
223 * Post an RCU callback to be invoked after the end of an RCU grace
224 * period. But since we have but one CPU, that would be after any
225 * quiescent state.
226 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{
229 __call_rcu(head, func, &rcu_ctrlblk);
230}
231EXPORT_SYMBOL_GPL(call_rcu);
232
233/*
234 * Post an RCU bottom-half callback to be invoked after any subsequent
235 * quiescent state.
236 */
237void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
238{
239 __call_rcu(head, func, &rcu_bh_ctrlblk);
240}
241EXPORT_SYMBOL_GPL(call_rcu_bh);
242
243void rcu_barrier(void)
244{
245 struct rcu_synchronize rcu;
246
247 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */
251 wait_for_completion(&rcu.completion);
252}
253EXPORT_SYMBOL_GPL(rcu_barrier);
254
255void rcu_barrier_bh(void)
256{
257 struct rcu_synchronize rcu;
258
259 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */
263 wait_for_completion(&rcu.completion);
264}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266
267void rcu_barrier_sched(void)
268{
269 struct rcu_synchronize rcu;
270
271 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */
275 wait_for_completion(&rcu.completion);
276}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278
279void __init rcu_init(void)
280{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282}
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 233768f21f97..a621a67ef4e3 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -327,6 +327,11 @@ rcu_torture_cb(struct rcu_head *p)
327 cur_ops->deferred_free(rp); 327 cur_ops->deferred_free(rp);
328} 328}
329 329
330static int rcu_no_completed(void)
331{
332 return 0;
333}
334
330static void rcu_torture_deferred_free(struct rcu_torture *p) 335static void rcu_torture_deferred_free(struct rcu_torture *p)
331{ 336{
332 call_rcu(&p->rtort_rcu, rcu_torture_cb); 337 call_rcu(&p->rtort_rcu, rcu_torture_cb);
@@ -388,6 +393,21 @@ static struct rcu_torture_ops rcu_sync_ops = {
388 .name = "rcu_sync" 393 .name = "rcu_sync"
389}; 394};
390 395
396static struct rcu_torture_ops rcu_expedited_ops = {
397 .init = rcu_sync_torture_init,
398 .cleanup = NULL,
399 .readlock = rcu_torture_read_lock,
400 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
401 .readunlock = rcu_torture_read_unlock,
402 .completed = rcu_no_completed,
403 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL,
406 .stats = NULL,
407 .irq_capable = 1,
408 .name = "rcu_expedited"
409};
410
391/* 411/*
392 * Definitions for rcu_bh torture testing. 412 * Definitions for rcu_bh torture testing.
393 */ 413 */
@@ -547,6 +567,25 @@ static struct rcu_torture_ops srcu_ops = {
547 .name = "srcu" 567 .name = "srcu"
548}; 568};
549 569
570static void srcu_torture_synchronize_expedited(void)
571{
572 synchronize_srcu_expedited(&srcu_ctl);
573}
574
575static struct rcu_torture_ops srcu_expedited_ops = {
576 .init = srcu_torture_init,
577 .cleanup = srcu_torture_cleanup,
578 .readlock = srcu_torture_read_lock,
579 .read_delay = srcu_read_delay,
580 .readunlock = srcu_torture_read_unlock,
581 .completed = srcu_torture_completed,
582 .deferred_free = rcu_sync_torture_deferred_free,
583 .sync = srcu_torture_synchronize_expedited,
584 .cb_barrier = NULL,
585 .stats = srcu_torture_stats,
586 .name = "srcu_expedited"
587};
588
550/* 589/*
551 * Definitions for sched torture testing. 590 * Definitions for sched torture testing.
552 */ 591 */
@@ -562,11 +601,6 @@ static void sched_torture_read_unlock(int idx)
562 preempt_enable(); 601 preempt_enable();
563} 602}
564 603
565static int sched_torture_completed(void)
566{
567 return 0;
568}
569
570static void rcu_sched_torture_deferred_free(struct rcu_torture *p) 604static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
571{ 605{
572 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 606 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
@@ -583,7 +617,7 @@ static struct rcu_torture_ops sched_ops = {
583 .readlock = sched_torture_read_lock, 617 .readlock = sched_torture_read_lock,
584 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 618 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
585 .readunlock = sched_torture_read_unlock, 619 .readunlock = sched_torture_read_unlock,
586 .completed = sched_torture_completed, 620 .completed = rcu_no_completed,
587 .deferred_free = rcu_sched_torture_deferred_free, 621 .deferred_free = rcu_sched_torture_deferred_free,
588 .sync = sched_torture_synchronize, 622 .sync = sched_torture_synchronize,
589 .cb_barrier = rcu_barrier_sched, 623 .cb_barrier = rcu_barrier_sched,
@@ -592,13 +626,13 @@ static struct rcu_torture_ops sched_ops = {
592 .name = "sched" 626 .name = "sched"
593}; 627};
594 628
595static struct rcu_torture_ops sched_ops_sync = { 629static struct rcu_torture_ops sched_sync_ops = {
596 .init = rcu_sync_torture_init, 630 .init = rcu_sync_torture_init,
597 .cleanup = NULL, 631 .cleanup = NULL,
598 .readlock = sched_torture_read_lock, 632 .readlock = sched_torture_read_lock,
599 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 633 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
600 .readunlock = sched_torture_read_unlock, 634 .readunlock = sched_torture_read_unlock,
601 .completed = sched_torture_completed, 635 .completed = rcu_no_completed,
602 .deferred_free = rcu_sync_torture_deferred_free, 636 .deferred_free = rcu_sync_torture_deferred_free,
603 .sync = sched_torture_synchronize, 637 .sync = sched_torture_synchronize,
604 .cb_barrier = NULL, 638 .cb_barrier = NULL,
@@ -606,15 +640,13 @@ static struct rcu_torture_ops sched_ops_sync = {
606 .name = "sched_sync" 640 .name = "sched_sync"
607}; 641};
608 642
609extern int rcu_expedited_torture_stats(char *page);
610
611static struct rcu_torture_ops sched_expedited_ops = { 643static struct rcu_torture_ops sched_expedited_ops = {
612 .init = rcu_sync_torture_init, 644 .init = rcu_sync_torture_init,
613 .cleanup = NULL, 645 .cleanup = NULL,
614 .readlock = sched_torture_read_lock, 646 .readlock = sched_torture_read_lock,
615 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 647 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
616 .readunlock = sched_torture_read_unlock, 648 .readunlock = sched_torture_read_unlock,
617 .completed = sched_torture_completed, 649 .completed = rcu_no_completed,
618 .deferred_free = rcu_sync_torture_deferred_free, 650 .deferred_free = rcu_sync_torture_deferred_free,
619 .sync = synchronize_sched_expedited, 651 .sync = synchronize_sched_expedited,
620 .cb_barrier = NULL, 652 .cb_barrier = NULL,
@@ -650,7 +682,7 @@ rcu_torture_writer(void *arg)
650 old_rp = rcu_torture_current; 682 old_rp = rcu_torture_current;
651 rp->rtort_mbtest = 1; 683 rp->rtort_mbtest = 1;
652 rcu_assign_pointer(rcu_torture_current, rp); 684 rcu_assign_pointer(rcu_torture_current, rp);
653 smp_wmb(); 685 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
654 if (old_rp) { 686 if (old_rp) {
655 i = old_rp->rtort_pipe_count; 687 i = old_rp->rtort_pipe_count;
656 if (i > RCU_TORTURE_PIPE_LEN) 688 if (i > RCU_TORTURE_PIPE_LEN)
@@ -1099,9 +1131,10 @@ rcu_torture_init(void)
1099 int cpu; 1131 int cpu;
1100 int firsterr = 0; 1132 int firsterr = 0;
1101 static struct rcu_torture_ops *torture_ops[] = 1133 static struct rcu_torture_ops *torture_ops[] =
1102 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1134 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1103 &sched_expedited_ops, 1135 &rcu_bh_ops, &rcu_bh_sync_ops,
1104 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1136 &srcu_ops, &srcu_expedited_ops,
1137 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1105 1138
1106 mutex_lock(&fullstop_mutex); 1139 mutex_lock(&fullstop_mutex);
1107 1140
@@ -1112,8 +1145,12 @@ rcu_torture_init(void)
1112 break; 1145 break;
1113 } 1146 }
1114 if (i == ARRAY_SIZE(torture_ops)) { 1147 if (i == ARRAY_SIZE(torture_ops)) {
1115 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1148 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
1116 torture_type); 1149 torture_type);
1150 printk(KERN_ALERT "rcu-torture types:");
1151 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1152 printk(KERN_ALERT " %s", torture_ops[i]->name);
1153 printk(KERN_ALERT "\n");
1117 mutex_unlock(&fullstop_mutex); 1154 mutex_unlock(&fullstop_mutex);
1118 return -EINVAL; 1155 return -EINVAL;
1119 } 1156 }
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 52b06f6e158c..53ae9598f798 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,30 +46,30 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
49 50
50#include "rcutree.h" 51#include "rcutree.h"
51 52
52#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key rcu_lock_key;
54struct lockdep_map rcu_lock_map =
55 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59/* Data structures. */ 53/* Data structures. */
60 54
55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
56
61#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(name) { \
62 .level = { &name.node[0] }, \ 58 .level = { &name.node[0] }, \
63 .levelcnt = { \ 59 .levelcnt = { \
64 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
65 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
66 NUM_RCU_LVL_2, \ 62 NUM_RCU_LVL_2, \
67 NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ 63 NUM_RCU_LVL_3, \
64 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
68 }, \ 65 }, \
69 .signaled = RCU_SIGNAL_INIT, \ 66 .signaled = RCU_GP_IDLE, \
70 .gpnum = -300, \ 67 .gpnum = -300, \
71 .completed = -300, \ 68 .completed = -300, \
72 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &name.orphan_cbs_list, \
72 .orphan_qlen = 0, \
73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ 73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
74 .n_force_qs = 0, \ 74 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
@@ -81,24 +81,18 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
83 83
84extern long rcu_batches_completed_sched(void); 84static int rcu_scheduler_active __read_mostly;
85static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
86static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
87 struct rcu_node *rnp, unsigned long flags);
88static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
89#ifdef CONFIG_HOTPLUG_CPU
90static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
91#endif /* #ifdef CONFIG_HOTPLUG_CPU */
92static void __rcu_process_callbacks(struct rcu_state *rsp,
93 struct rcu_data *rdp);
94static void __call_rcu(struct rcu_head *head,
95 void (*func)(struct rcu_head *rcu),
96 struct rcu_state *rsp);
97static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
98static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
99 int preemptable);
100 85
101#include "rcutree_plugin.h" 86
87/*
88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
89 * permit this function to be invoked without holding the root rcu_node
90 * structure's ->lock, but of course results can be subject to change.
91 */
92static int rcu_gp_in_progress(struct rcu_state *rsp)
93{
94 return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
95}
102 96
103/* 97/*
104 * Note a quiescent state. Because we do not need to know 98 * Note a quiescent state. Because we do not need to know
@@ -110,7 +104,7 @@ void rcu_sched_qs(int cpu)
110 struct rcu_data *rdp; 104 struct rcu_data *rdp;
111 105
112 rdp = &per_cpu(rcu_sched_data, cpu); 106 rdp = &per_cpu(rcu_sched_data, cpu);
113 rdp->passed_quiesc_completed = rdp->completed; 107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
114 barrier(); 108 barrier();
115 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
116 rcu_preempt_note_context_switch(cpu); 110 rcu_preempt_note_context_switch(cpu);
@@ -121,7 +115,7 @@ void rcu_bh_qs(int cpu)
121 struct rcu_data *rdp; 115 struct rcu_data *rdp;
122 116
123 rdp = &per_cpu(rcu_bh_data, cpu); 117 rdp = &per_cpu(rcu_bh_data, cpu);
124 rdp->passed_quiesc_completed = rdp->completed; 118 rdp->passed_quiesc_completed = rdp->gpnum - 1;
125 barrier(); 119 barrier();
126 rdp->passed_quiesc = 1; 120 rdp->passed_quiesc = 1;
127} 121}
@@ -137,6 +131,10 @@ static int blimit = 10; /* Maximum callbacks per softirq. */
137static int qhimark = 10000; /* If this many pending, ignore blimit. */ 131static int qhimark = 10000; /* If this many pending, ignore blimit. */
138static int qlowmark = 100; /* Once only this many pending, use blimit. */ 132static int qlowmark = 100; /* Once only this many pending, use blimit. */
139 133
134module_param(blimit, int, 0);
135module_param(qhimark, int, 0);
136module_param(qlowmark, int, 0);
137
140static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 138static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
141static int rcu_pending(int cpu); 139static int rcu_pending(int cpu);
142 140
@@ -173,9 +171,7 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
173static int 171static int
174cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 172cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
175{ 173{
176 /* ACCESS_ONCE() because we are accessing outside of lock. */ 174 return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp);
177 return *rdp->nxttail[RCU_DONE_TAIL] &&
178 ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
179} 175}
180 176
181/* 177/*
@@ -345,31 +341,12 @@ void rcu_irq_exit(void)
345 set_need_resched(); 341 set_need_resched();
346} 342}
347 343
348/*
349 * Record the specified "completed" value, which is later used to validate
350 * dynticks counter manipulations. Specify "rsp->completed - 1" to
351 * unconditionally invalidate any future dynticks manipulations (which is
352 * useful at the beginning of a grace period).
353 */
354static void dyntick_record_completed(struct rcu_state *rsp, long comp)
355{
356 rsp->dynticks_completed = comp;
357}
358
359#ifdef CONFIG_SMP 344#ifdef CONFIG_SMP
360 345
361/* 346/*
362 * Recall the previously recorded value of the completion for dynticks.
363 */
364static long dyntick_recall_completed(struct rcu_state *rsp)
365{
366 return rsp->dynticks_completed;
367}
368
369/*
370 * Snapshot the specified CPU's dynticks counter so that we can later 347 * Snapshot the specified CPU's dynticks counter so that we can later
371 * credit them with an implicit quiescent state. Return 1 if this CPU 348 * credit them with an implicit quiescent state. Return 1 if this CPU
372 * is already in a quiescent state courtesy of dynticks idle mode. 349 * is in dynticks idle mode, which is an extended quiescent state.
373 */ 350 */
374static int dyntick_save_progress_counter(struct rcu_data *rdp) 351static int dyntick_save_progress_counter(struct rcu_data *rdp)
375{ 352{
@@ -429,24 +406,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
429 406
430#else /* #ifdef CONFIG_NO_HZ */ 407#else /* #ifdef CONFIG_NO_HZ */
431 408
432static void dyntick_record_completed(struct rcu_state *rsp, long comp)
433{
434}
435
436#ifdef CONFIG_SMP 409#ifdef CONFIG_SMP
437 410
438/*
439 * If there are no dynticks, then the only way that a CPU can passively
440 * be in a quiescent state is to be offline. Unlike dynticks idle, which
441 * is a point in time during the prior (already finished) grace period,
442 * an offline CPU is always in a quiescent state, and thus can be
443 * unconditionally applied. So just return the current value of completed.
444 */
445static long dyntick_recall_completed(struct rcu_state *rsp)
446{
447 return rsp->completed;
448}
449
450static int dyntick_save_progress_counter(struct rcu_data *rdp) 411static int dyntick_save_progress_counter(struct rcu_data *rdp)
451{ 412{
452 return 0; 413 return 0;
@@ -475,30 +436,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
475 long delta; 436 long delta;
476 unsigned long flags; 437 unsigned long flags;
477 struct rcu_node *rnp = rcu_get_root(rsp); 438 struct rcu_node *rnp = rcu_get_root(rsp);
478 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
479 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
480 439
481 /* Only let one CPU complain about others per time interval. */ 440 /* Only let one CPU complain about others per time interval. */
482 441
483 spin_lock_irqsave(&rnp->lock, flags); 442 spin_lock_irqsave(&rnp->lock, flags);
484 delta = jiffies - rsp->jiffies_stall; 443 delta = jiffies - rsp->jiffies_stall;
485 if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) { 444 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
486 spin_unlock_irqrestore(&rnp->lock, flags); 445 spin_unlock_irqrestore(&rnp->lock, flags);
487 return; 446 return;
488 } 447 }
489 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 448 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
449
450 /*
451 * Now rat on any tasks that got kicked up to the root rcu_node
452 * due to CPU offlining.
453 */
454 rcu_print_task_stall(rnp);
490 spin_unlock_irqrestore(&rnp->lock, flags); 455 spin_unlock_irqrestore(&rnp->lock, flags);
491 456
492 /* OK, time to rat on our buddy... */ 457 /* OK, time to rat on our buddy... */
493 458
494 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 459 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
495 for (; rnp_cur < rnp_end; rnp_cur++) { 460 rcu_for_each_leaf_node(rsp, rnp) {
496 rcu_print_task_stall(rnp); 461 rcu_print_task_stall(rnp);
497 if (rnp_cur->qsmask == 0) 462 if (rnp->qsmask == 0)
498 continue; 463 continue;
499 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) 464 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
500 if (rnp_cur->qsmask & (1UL << cpu)) 465 if (rnp->qsmask & (1UL << cpu))
501 printk(" %d", rnp_cur->grplo + cpu); 466 printk(" %d", rnp->grplo + cpu);
502 } 467 }
503 printk(" (detected by %d, t=%ld jiffies)\n", 468 printk(" (detected by %d, t=%ld jiffies)\n",
504 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 469 smp_processor_id(), (long)(jiffies - rsp->gp_start));
@@ -537,8 +502,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
537 /* We haven't checked in, so go dump stack. */ 502 /* We haven't checked in, so go dump stack. */
538 print_cpu_stall(rsp); 503 print_cpu_stall(rsp);
539 504
540 } else if (rsp->gpnum != rsp->completed && 505 } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) {
541 delta >= RCU_STALL_RAT_DELAY) {
542 506
543 /* They had two time units to dump stack, so complain. */ 507 /* They had two time units to dump stack, so complain. */
544 print_other_cpu_stall(rsp); 508 print_other_cpu_stall(rsp);
@@ -560,13 +524,33 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
560/* 524/*
561 * Update CPU-local rcu_data state to record the newly noticed grace period. 525 * Update CPU-local rcu_data state to record the newly noticed grace period.
562 * This is used both when we started the grace period and when we notice 526 * This is used both when we started the grace period and when we notice
563 * that someone else started the grace period. 527 * that someone else started the grace period. The caller must hold the
528 * ->lock of the leaf rcu_node structure corresponding to the current CPU,
529 * and must have irqs disabled.
564 */ 530 */
531static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
532{
533 if (rdp->gpnum != rnp->gpnum) {
534 rdp->qs_pending = 1;
535 rdp->passed_quiesc = 0;
536 rdp->gpnum = rnp->gpnum;
537 }
538}
539
565static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) 540static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
566{ 541{
567 rdp->qs_pending = 1; 542 unsigned long flags;
568 rdp->passed_quiesc = 0; 543 struct rcu_node *rnp;
569 rdp->gpnum = rsp->gpnum; 544
545 local_irq_save(flags);
546 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
549 local_irq_restore(flags);
550 return;
551 }
552 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags);
570} 554}
571 555
572/* 556/*
@@ -590,6 +574,79 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
590} 574}
591 575
592/* 576/*
577 * Advance this CPU's callbacks, but only if the current grace period
578 * has ended. This may be called only from the CPU to whom the rdp
579 * belongs. In addition, the corresponding leaf rcu_node structure's
580 * ->lock must be held by the caller, with irqs disabled.
581 */
582static void
583__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
584{
585 /* Did another grace period end? */
586 if (rdp->completed != rnp->completed) {
587
588 /* Advance callbacks. No harm if list empty. */
589 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
590 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
591 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
592
593 /* Remember that we saw this grace-period completion. */
594 rdp->completed = rnp->completed;
595 }
596}
597
598/*
599 * Advance this CPU's callbacks, but only if the current grace period
600 * has ended. This may be called only from the CPU to whom the rdp
601 * belongs.
602 */
603static void
604rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
605{
606 unsigned long flags;
607 struct rcu_node *rnp;
608
609 local_irq_save(flags);
610 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
613 local_irq_restore(flags);
614 return;
615 }
616 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags);
618}
619
620/*
621 * Do per-CPU grace-period initialization for running CPU. The caller
622 * must hold the lock of the leaf rcu_node structure corresponding to
623 * this CPU.
624 */
625static void
626rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
627{
628 /* Prior grace period ended, so advance callbacks for current CPU. */
629 __rcu_process_gp_end(rsp, rnp, rdp);
630
631 /*
632 * Because this CPU just now started the new grace period, we know
633 * that all of its callbacks will be covered by this upcoming grace
634 * period, even the ones that were registered arbitrarily recently.
635 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
636 *
637 * Other CPUs cannot be sure exactly when the grace period started.
638 * Therefore, their recently registered callbacks must pass through
639 * an additional RCU_NEXT_READY stage, so that they will be handled
640 * by the next RCU grace period.
641 */
642 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
643 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
644
645 /* Set state so that this CPU will detect the next quiescent state. */
646 __note_new_gpnum(rsp, rnp, rdp);
647}
648
649/*
593 * Start a new RCU grace period if warranted, re-initializing the hierarchy 650 * Start a new RCU grace period if warranted, re-initializing the hierarchy
594 * in preparation for detecting the next grace period. The caller must hold 651 * in preparation for detecting the next grace period. The caller must hold
595 * the root node's ->lock, which is released before return. Hard irqs must 652 * the root node's ->lock, which is released before return. Hard irqs must
@@ -603,7 +660,23 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
603 struct rcu_node *rnp = rcu_get_root(rsp); 660 struct rcu_node *rnp = rcu_get_root(rsp);
604 661
605 if (!cpu_needs_another_gp(rsp, rdp)) { 662 if (!cpu_needs_another_gp(rsp, rdp)) {
606 spin_unlock_irqrestore(&rnp->lock, flags); 663 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags);
665 return;
666 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */
668
669 /*
670 * Propagate new ->completed value to rcu_node structures
671 * so that other CPUs don't have to wait until the start
672 * of the next grace period to process their callbacks.
673 */
674 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 }
679 local_irq_restore(flags);
607 return; 680 return;
608 } 681 }
609 682
@@ -613,23 +686,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
613 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 686 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
614 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 687 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
615 record_gp_stall_check_time(rsp); 688 record_gp_stall_check_time(rsp);
616 dyntick_record_completed(rsp, rsp->completed - 1);
617 note_new_gpnum(rsp, rdp);
618
619 /*
620 * Because we are first, we know that all our callbacks will
621 * be covered by this upcoming grace period, even the ones
622 * that were registered arbitrarily recently.
623 */
624 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
625 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
626 689
627 /* Special-case the common single-level case. */ 690 /* Special-case the common single-level case. */
628 if (NUM_RCU_NODES == 1) { 691 if (NUM_RCU_NODES == 1) {
629 rcu_preempt_check_blocked_tasks(rnp); 692 rcu_preempt_check_blocked_tasks(rnp);
630 rnp->qsmask = rnp->qsmaskinit; 693 rnp->qsmask = rnp->qsmaskinit;
631 rnp->gpnum = rsp->gpnum; 694 rnp->gpnum = rsp->gpnum;
695 rnp->completed = rsp->completed;
632 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp);
633 spin_unlock_irqrestore(&rnp->lock, flags); 698 spin_unlock_irqrestore(&rnp->lock, flags);
634 return; 699 return;
635 } 700 }
@@ -657,70 +722,51 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
657 * one corresponding to this CPU, due to the fact that we have 722 * one corresponding to this CPU, due to the fact that we have
658 * irqs disabled. 723 * irqs disabled.
659 */ 724 */
660 for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) { 725 rcu_for_each_node_breadth_first(rsp, rnp) {
661 spin_lock(&rnp->lock); /* irqs already disabled. */ 726 spin_lock(&rnp->lock); /* irqs already disabled. */
662 rcu_preempt_check_blocked_tasks(rnp); 727 rcu_preempt_check_blocked_tasks(rnp);
663 rnp->qsmask = rnp->qsmaskinit; 728 rnp->qsmask = rnp->qsmaskinit;
664 rnp->gpnum = rsp->gpnum; 729 rnp->gpnum = rsp->gpnum;
665 spin_unlock(&rnp->lock); /* irqs already disabled. */ 730 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp);
733 spin_unlock(&rnp->lock); /* irqs remain disabled. */
666 } 734 }
667 735
736 rnp = rcu_get_root(rsp);
737 spin_lock(&rnp->lock); /* irqs already disabled. */
668 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 738 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
739 spin_unlock(&rnp->lock); /* irqs remain disabled. */
669 spin_unlock_irqrestore(&rsp->onofflock, flags); 740 spin_unlock_irqrestore(&rsp->onofflock, flags);
670} 741}
671 742
672/* 743/*
673 * Advance this CPU's callbacks, but only if the current grace period 744 * Report a full set of quiescent states to the specified rcu_state
674 * has ended. This may be called only from the CPU to whom the rdp 745 * data structure. This involves cleaning up after the prior grace
675 * belongs. 746 * period and letting rcu_start_gp() start up the next grace period
676 */ 747 * if one is needed. Note that the caller must hold rnp->lock, as
677static void 748 * required by rcu_start_gp(), which will release it.
678rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
679{
680 long completed_snap;
681 unsigned long flags;
682
683 local_irq_save(flags);
684 completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
685
686 /* Did another grace period end? */
687 if (rdp->completed != completed_snap) {
688
689 /* Advance callbacks. No harm if list empty. */
690 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
691 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
692 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
693
694 /* Remember that we saw this grace-period completion. */
695 rdp->completed = completed_snap;
696 }
697 local_irq_restore(flags);
698}
699
700/*
701 * Clean up after the prior grace period and let rcu_start_gp() start up
702 * the next grace period if one is needed. Note that the caller must
703 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
704 */ 749 */
705static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) 750static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
706 __releases(rnp->lock) 751 __releases(rcu_get_root(rsp)->lock)
707{ 752{
708 WARN_ON_ONCE(rsp->completed == rsp->gpnum); 753 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
709 rsp->completed = rsp->gpnum; 754 rsp->completed = rsp->gpnum;
710 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); 755 rsp->signaled = RCU_GP_IDLE;
711 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 756 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
712} 757}
713 758
714/* 759/*
715 * Similar to cpu_quiet(), for which it is a helper function. Allows 760 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
716 * a group of CPUs to be quieted at one go, though all the CPUs in the 761 * Allows quiescent states for a group of CPUs to be reported at one go
717 * group must be represented by the same leaf rcu_node structure. 762 * to the specified rcu_node structure, though all the CPUs in the group
718 * That structure's lock must be held upon entry, and it is released 763 * must be represented by the same rcu_node structure (which need not be
719 * before return. 764 * a leaf rcu_node structure, though it often will be). That structure's
765 * lock must be held upon entry, and it is released before return.
720 */ 766 */
721static void 767static void
722cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, 768rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
723 unsigned long flags) 769 struct rcu_node *rnp, unsigned long flags)
724 __releases(rnp->lock) 770 __releases(rnp->lock)
725{ 771{
726 struct rcu_node *rnp_c; 772 struct rcu_node *rnp_c;
@@ -756,21 +802,23 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
756 802
757 /* 803 /*
758 * Get here if we are the last CPU to pass through a quiescent 804 * Get here if we are the last CPU to pass through a quiescent
759 * state for this grace period. Invoke cpu_quiet_msk_finish() 805 * state for this grace period. Invoke rcu_report_qs_rsp()
760 * to clean up and start the next grace period if one is needed. 806 * to clean up and start the next grace period if one is needed.
761 */ 807 */
762 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */ 808 rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
763} 809}
764 810
765/* 811/*
766 * Record a quiescent state for the specified CPU, which must either be 812 * Record a quiescent state for the specified CPU to that CPU's rcu_data
767 * the current CPU. The lastcomp argument is used to make sure we are 813 * structure. This must be either called from the specified CPU, or
768 * still in the grace period of interest. We don't want to end the current 814 * called when the specified CPU is known to be offline (and when it is
769 * grace period based on quiescent states detected in an earlier grace 815 * also known that no other CPU is concurrently trying to help the offline
770 * period! 816 * CPU). The lastcomp argument is used to make sure we are still in the
817 * grace period of interest. We don't want to end the current grace period
818 * based on quiescent states detected in an earlier grace period!
771 */ 819 */
772static void 820static void
773cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 821rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
774{ 822{
775 unsigned long flags; 823 unsigned long flags;
776 unsigned long mask; 824 unsigned long mask;
@@ -778,15 +826,15 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
778 826
779 rnp = rdp->mynode; 827 rnp = rdp->mynode;
780 spin_lock_irqsave(&rnp->lock, flags); 828 spin_lock_irqsave(&rnp->lock, flags);
781 if (lastcomp != ACCESS_ONCE(rsp->completed)) { 829 if (lastcomp != rnp->completed) {
782 830
783 /* 831 /*
784 * Someone beat us to it for this grace period, so leave. 832 * Someone beat us to it for this grace period, so leave.
785 * The race with GP start is resolved by the fact that we 833 * The race with GP start is resolved by the fact that we
786 * hold the leaf rcu_node lock, so that the per-CPU bits 834 * hold the leaf rcu_node lock, so that the per-CPU bits
787 * cannot yet be initialized -- so we would simply find our 835 * cannot yet be initialized -- so we would simply find our
788 * CPU's bit already cleared in cpu_quiet_msk() if this race 836 * CPU's bit already cleared in rcu_report_qs_rnp() if this
789 * occurred. 837 * race occurred.
790 */ 838 */
791 rdp->passed_quiesc = 0; /* try again later! */ 839 rdp->passed_quiesc = 0; /* try again later! */
792 spin_unlock_irqrestore(&rnp->lock, flags); 840 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -804,7 +852,7 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
804 */ 852 */
805 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 853 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
806 854
807 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ 855 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
808 } 856 }
809} 857}
810 858
@@ -835,24 +883,73 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
835 if (!rdp->passed_quiesc) 883 if (!rdp->passed_quiesc)
836 return; 884 return;
837 885
838 /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */ 886 /*
839 cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); 887 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
888 * judge of that).
889 */
890 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
840} 891}
841 892
842#ifdef CONFIG_HOTPLUG_CPU 893#ifdef CONFIG_HOTPLUG_CPU
843 894
844/* 895/*
896 * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
897 * specified flavor of RCU. The callbacks will be adopted by the next
898 * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
899 * comes first. Because this is invoked from the CPU_DYING notifier,
900 * irqs are already disabled.
901 */
902static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
903{
904 int i;
905 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
906
907 if (rdp->nxtlist == NULL)
908 return; /* irqs disabled, so comparison is stable. */
909 spin_lock(&rsp->onofflock); /* irqs already disabled. */
910 *rsp->orphan_cbs_tail = rdp->nxtlist;
911 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
912 rdp->nxtlist = NULL;
913 for (i = 0; i < RCU_NEXT_SIZE; i++)
914 rdp->nxttail[i] = &rdp->nxtlist;
915 rsp->orphan_qlen += rdp->qlen;
916 rdp->qlen = 0;
917 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
918}
919
920/*
921 * Adopt previously orphaned RCU callbacks.
922 */
923static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
924{
925 unsigned long flags;
926 struct rcu_data *rdp;
927
928 spin_lock_irqsave(&rsp->onofflock, flags);
929 rdp = rsp->rda[smp_processor_id()];
930 if (rsp->orphan_cbs_list == NULL) {
931 spin_unlock_irqrestore(&rsp->onofflock, flags);
932 return;
933 }
934 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
935 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
936 rdp->qlen += rsp->orphan_qlen;
937 rsp->orphan_cbs_list = NULL;
938 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
939 rsp->orphan_qlen = 0;
940 spin_unlock_irqrestore(&rsp->onofflock, flags);
941}
942
943/*
845 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 944 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
846 * and move all callbacks from the outgoing CPU to the current one. 945 * and move all callbacks from the outgoing CPU to the current one.
847 */ 946 */
848static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 947static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
849{ 948{
850 int i;
851 unsigned long flags; 949 unsigned long flags;
852 long lastcomp;
853 unsigned long mask; 950 unsigned long mask;
951 int need_report = 0;
854 struct rcu_data *rdp = rsp->rda[cpu]; 952 struct rcu_data *rdp = rsp->rda[cpu];
855 struct rcu_data *rdp_me;
856 struct rcu_node *rnp; 953 struct rcu_node *rnp;
857 954
858 /* Exclude any attempts to start a new grace period. */ 955 /* Exclude any attempts to start a new grace period. */
@@ -865,42 +962,34 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
865 spin_lock(&rnp->lock); /* irqs already disabled. */ 962 spin_lock(&rnp->lock); /* irqs already disabled. */
866 rnp->qsmaskinit &= ~mask; 963 rnp->qsmaskinit &= ~mask;
867 if (rnp->qsmaskinit != 0) { 964 if (rnp->qsmaskinit != 0) {
868 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 965 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */
869 break; 967 break;
870 } 968 }
871 rcu_preempt_offline_tasks(rsp, rnp, rdp); 969 if (rnp == rdp->mynode)
970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
971 else
972 spin_unlock(&rnp->lock); /* irqs remain disabled. */
872 mask = rnp->grpmask; 973 mask = rnp->grpmask;
873 spin_unlock(&rnp->lock); /* irqs remain disabled. */
874 rnp = rnp->parent; 974 rnp = rnp->parent;
875 } while (rnp != NULL); 975 } while (rnp != NULL);
876 lastcomp = rsp->completed;
877
878 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
879 976
880 /* 977 /*
881 * Move callbacks from the outgoing CPU to the running CPU. 978 * We still hold the leaf rcu_node structure lock here, and
882 * Note that the outgoing CPU is now quiscent, so it is now 979 * irqs are still disabled. The reason for this subterfuge is
883 * (uncharacteristically) safe to access its rcu_data structure. 980 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
884 * Note also that we must carefully retain the order of the 981 * held leads to deadlock.
885 * outgoing CPU's callbacks in order for rcu_barrier() to work
886 * correctly. Finally, note that we start all the callbacks
887 * afresh, even those that have passed through a grace period
888 * and are therefore ready to invoke. The theory is that hotplug
889 * events are rare, and that if they are frequent enough to
890 * indefinitely delay callbacks, you have far worse things to
891 * be worrying about.
892 */ 982 */
893 rdp_me = rsp->rda[smp_processor_id()]; 983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
894 if (rdp->nxtlist != NULL) { 984 rnp = rdp->mynode;
895 *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; 985 if (need_report & RCU_OFL_TASKS_NORM_GP)
896 rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 986 rcu_report_unblock_qs_rnp(rnp, flags);
897 rdp->nxtlist = NULL; 987 else
898 for (i = 0; i < RCU_NEXT_SIZE; i++) 988 spin_unlock_irqrestore(&rnp->lock, flags);
899 rdp->nxttail[i] = &rdp->nxtlist; 989 if (need_report & RCU_OFL_TASKS_EXP_GP)
900 rdp_me->qlen += rdp->qlen; 990 rcu_report_exp_rnp(rsp, rnp);
901 rdp->qlen = 0; 991
902 } 992 rcu_adopt_orphan_cbs(rsp);
903 local_irq_restore(flags);
904} 993}
905 994
906/* 995/*
@@ -918,6 +1007,14 @@ static void rcu_offline_cpu(int cpu)
918 1007
919#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1008#else /* #ifdef CONFIG_HOTPLUG_CPU */
920 1009
1010static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
1011{
1012}
1013
1014static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1015{
1016}
1017
921static void rcu_offline_cpu(int cpu) 1018static void rcu_offline_cpu(int cpu)
922{ 1019{
923} 1020}
@@ -928,7 +1025,7 @@ static void rcu_offline_cpu(int cpu)
928 * Invoke any RCU callbacks that have made it to the end of their grace 1025 * Invoke any RCU callbacks that have made it to the end of their grace
929 * period. Thottle as specified by rdp->blimit. 1026 * period. Thottle as specified by rdp->blimit.
930 */ 1027 */
931static void rcu_do_batch(struct rcu_data *rdp) 1028static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
932{ 1029{
933 unsigned long flags; 1030 unsigned long flags;
934 struct rcu_head *next, *list, **tail; 1031 struct rcu_head *next, *list, **tail;
@@ -981,6 +1078,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
981 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 1078 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
982 rdp->blimit = blimit; 1079 rdp->blimit = blimit;
983 1080
1081 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
1082 if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
1083 rdp->qlen_last_fqs_check = 0;
1084 rdp->n_force_qs_snap = rsp->n_force_qs;
1085 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
1086 rdp->qlen_last_fqs_check = rdp->qlen;
1087
984 local_irq_restore(flags); 1088 local_irq_restore(flags);
985 1089
986 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1090 /* Re-raise the RCU softirq if there are callbacks remaining. */
@@ -1050,33 +1154,32 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1050 int cpu; 1154 int cpu;
1051 unsigned long flags; 1155 unsigned long flags;
1052 unsigned long mask; 1156 unsigned long mask;
1053 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; 1157 struct rcu_node *rnp;
1054 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
1055 1158
1056 for (; rnp_cur < rnp_end; rnp_cur++) { 1159 rcu_for_each_leaf_node(rsp, rnp) {
1057 mask = 0; 1160 mask = 0;
1058 spin_lock_irqsave(&rnp_cur->lock, flags); 1161 spin_lock_irqsave(&rnp->lock, flags);
1059 if (rsp->completed != lastcomp) { 1162 if (rnp->completed != lastcomp) {
1060 spin_unlock_irqrestore(&rnp_cur->lock, flags); 1163 spin_unlock_irqrestore(&rnp->lock, flags);
1061 return 1; 1164 return 1;
1062 } 1165 }
1063 if (rnp_cur->qsmask == 0) { 1166 if (rnp->qsmask == 0) {
1064 spin_unlock_irqrestore(&rnp_cur->lock, flags); 1167 spin_unlock_irqrestore(&rnp->lock, flags);
1065 continue; 1168 continue;
1066 } 1169 }
1067 cpu = rnp_cur->grplo; 1170 cpu = rnp->grplo;
1068 bit = 1; 1171 bit = 1;
1069 for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) { 1172 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
1070 if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1071 mask |= bit; 1174 mask |= bit;
1072 } 1175 }
1073 if (mask != 0 && rsp->completed == lastcomp) { 1176 if (mask != 0 && rnp->completed == lastcomp) {
1074 1177
1075 /* cpu_quiet_msk() releases rnp_cur->lock. */ 1178 /* rcu_report_qs_rnp() releases rnp->lock. */
1076 cpu_quiet_msk(mask, rsp, rnp_cur, flags); 1179 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1077 continue; 1180 continue;
1078 } 1181 }
1079 spin_unlock_irqrestore(&rnp_cur->lock, flags); 1182 spin_unlock_irqrestore(&rnp->lock, flags);
1080 } 1183 }
1081 return 0; 1184 return 0;
1082} 1185}
@@ -1091,8 +1194,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1091 long lastcomp; 1194 long lastcomp;
1092 struct rcu_node *rnp = rcu_get_root(rsp); 1195 struct rcu_node *rnp = rcu_get_root(rsp);
1093 u8 signaled; 1196 u8 signaled;
1197 u8 forcenow;
1094 1198
1095 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) 1199 if (!rcu_gp_in_progress(rsp))
1096 return; /* No grace period in progress, nothing to force. */ 1200 return; /* No grace period in progress, nothing to force. */
1097 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { 1201 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
1098 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1202 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
@@ -1103,19 +1207,20 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1103 goto unlock_ret; /* no emergency and done recently. */ 1207 goto unlock_ret; /* no emergency and done recently. */
1104 rsp->n_force_qs++; 1208 rsp->n_force_qs++;
1105 spin_lock(&rnp->lock); 1209 spin_lock(&rnp->lock);
1106 lastcomp = rsp->completed; 1210 lastcomp = rsp->gpnum - 1;
1107 signaled = rsp->signaled; 1211 signaled = rsp->signaled;
1108 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1109 if (lastcomp == rsp->gpnum) { 1213 if(!rcu_gp_in_progress(rsp)) {
1110 rsp->n_force_qs_ngp++; 1214 rsp->n_force_qs_ngp++;
1111 spin_unlock(&rnp->lock); 1215 spin_unlock(&rnp->lock);
1112 goto unlock_ret; /* no GP in progress, time updated. */ 1216 goto unlock_ret; /* no GP in progress, time updated. */
1113 } 1217 }
1114 spin_unlock(&rnp->lock); 1218 spin_unlock(&rnp->lock);
1115 switch (signaled) { 1219 switch (signaled) {
1220 case RCU_GP_IDLE:
1116 case RCU_GP_INIT: 1221 case RCU_GP_INIT:
1117 1222
1118 break; /* grace period still initializing, ignore. */ 1223 break; /* grace period idle or initializing, ignore. */
1119 1224
1120 case RCU_SAVE_DYNTICK: 1225 case RCU_SAVE_DYNTICK:
1121 1226
@@ -1126,20 +1231,29 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1126 if (rcu_process_dyntick(rsp, lastcomp, 1231 if (rcu_process_dyntick(rsp, lastcomp,
1127 dyntick_save_progress_counter)) 1232 dyntick_save_progress_counter))
1128 goto unlock_ret; 1233 goto unlock_ret;
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1129 1237
1130 /* Update state, record completion counter. */ 1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1131 spin_lock(&rnp->lock); 1240 spin_lock(&rnp->lock);
1132 if (lastcomp == rsp->completed) { 1241 if (lastcomp + 1 == rsp->gpnum &&
1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1133 rsp->signaled = RCU_FORCE_QS; 1244 rsp->signaled = RCU_FORCE_QS;
1134 dyntick_record_completed(rsp, lastcomp); 1245 rsp->completed_fqs = lastcomp;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1135 } 1247 }
1136 spin_unlock(&rnp->lock); 1248 spin_unlock(&rnp->lock);
1137 break; 1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1138 1252
1139 case RCU_FORCE_QS: 1253 case RCU_FORCE_QS:
1140 1254
1141 /* Check dyntick-idle state, send IPI to laggarts. */ 1255 /* Check dyntick-idle state, send IPI to laggarts. */
1142 if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp), 1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs,
1143 rcu_implicit_dynticks_qs)) 1257 rcu_implicit_dynticks_qs))
1144 goto unlock_ret; 1258 goto unlock_ret;
1145 1259
@@ -1195,7 +1309,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1195 } 1309 }
1196 1310
1197 /* If there are callbacks ready, invoke them. */ 1311 /* If there are callbacks ready, invoke them. */
1198 rcu_do_batch(rdp); 1312 rcu_do_batch(rsp, rdp);
1199} 1313}
1200 1314
1201/* 1315/*
@@ -1251,7 +1365,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1251 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1365 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1252 1366
1253 /* Start a new grace period if one not already started. */ 1367 /* Start a new grace period if one not already started. */
1254 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) { 1368 if (!rcu_gp_in_progress(rsp)) {
1255 unsigned long nestflag; 1369 unsigned long nestflag;
1256 struct rcu_node *rnp_root = rcu_get_root(rsp); 1370 struct rcu_node *rnp_root = rcu_get_root(rsp);
1257 1371
@@ -1259,10 +1373,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1259 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ 1373 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1260 } 1374 }
1261 1375
1262 /* Force the grace period if too many callbacks or too long waiting. */ 1376 /*
1263 if (unlikely(++rdp->qlen > qhimark)) { 1377 * Force the grace period if too many callbacks or too long waiting.
1378 * Enforce hysteresis, and don't invoke force_quiescent_state()
1379 * if some other CPU has recently done so. Also, don't bother
1380 * invoking force_quiescent_state() if the newly enqueued callback
1381 * is the only one waiting for a grace period to complete.
1382 */
1383 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1264 rdp->blimit = LONG_MAX; 1384 rdp->blimit = LONG_MAX;
1265 force_quiescent_state(rsp, 0); 1385 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1386 *rdp->nxttail[RCU_DONE_TAIL] != head)
1387 force_quiescent_state(rsp, 0);
1388 rdp->n_force_qs_snap = rsp->n_force_qs;
1389 rdp->qlen_last_fqs_check = rdp->qlen;
1266 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1390 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
1267 force_quiescent_state(rsp, 1); 1391 force_quiescent_state(rsp, 1);
1268 local_irq_restore(flags); 1392 local_irq_restore(flags);
@@ -1286,6 +1410,68 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1286} 1410}
1287EXPORT_SYMBOL_GPL(call_rcu_bh); 1411EXPORT_SYMBOL_GPL(call_rcu_bh);
1288 1412
1413/**
1414 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
1415 *
1416 * Control will return to the caller some time after a full rcu-sched
1417 * grace period has elapsed, in other words after all currently executing
1418 * rcu-sched read-side critical sections have completed. These read-side
1419 * critical sections are delimited by rcu_read_lock_sched() and
1420 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
1421 * local_irq_disable(), and so on may be used in place of
1422 * rcu_read_lock_sched().
1423 *
1424 * This means that all preempt_disable code sequences, including NMI and
1425 * hardware-interrupt handlers, in progress on entry will have completed
1426 * before this primitive returns. However, this does not guarantee that
1427 * softirq handlers will have completed, since in some kernels, these
1428 * handlers can run in process context, and can block.
1429 *
1430 * This primitive provides the guarantees made by the (now removed)
1431 * synchronize_kernel() API. In contrast, synchronize_rcu() only
1432 * guarantees that rcu_read_lock() sections will have completed.
1433 * In "classic RCU", these two guarantees happen to be one and
1434 * the same, but can differ in realtime RCU implementations.
1435 */
1436void synchronize_sched(void)
1437{
1438 struct rcu_synchronize rcu;
1439
1440 if (rcu_blocking_is_gp())
1441 return;
1442
1443 init_completion(&rcu.completion);
1444 /* Will wake me after RCU finished. */
1445 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1446 /* Wait for it. */
1447 wait_for_completion(&rcu.completion);
1448}
1449EXPORT_SYMBOL_GPL(synchronize_sched);
1450
1451/**
1452 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
1453 *
1454 * Control will return to the caller some time after a full rcu_bh grace
1455 * period has elapsed, in other words after all currently executing rcu_bh
1456 * read-side critical sections have completed. RCU read-side critical
1457 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
1458 * and may be nested.
1459 */
1460void synchronize_rcu_bh(void)
1461{
1462 struct rcu_synchronize rcu;
1463
1464 if (rcu_blocking_is_gp())
1465 return;
1466
1467 init_completion(&rcu.completion);
1468 /* Will wake me after RCU finished. */
1469 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1470 /* Wait for it. */
1471 wait_for_completion(&rcu.completion);
1472}
1473EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1474
1289/* 1475/*
1290 * Check to see if there is any immediate RCU-related work to be done 1476 * Check to see if there is any immediate RCU-related work to be done
1291 * by the current CPU, for the specified type of RCU, returning 1 if so. 1477 * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -1295,6 +1481,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1295 */ 1481 */
1296static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) 1482static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1297{ 1483{
1484 struct rcu_node *rnp = rdp->mynode;
1485
1298 rdp->n_rcu_pending++; 1486 rdp->n_rcu_pending++;
1299 1487
1300 /* Check for CPU stalls, if enabled. */ 1488 /* Check for CPU stalls, if enabled. */
@@ -1319,19 +1507,19 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1319 } 1507 }
1320 1508
1321 /* Has another RCU grace period completed? */ 1509 /* Has another RCU grace period completed? */
1322 if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ 1510 if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
1323 rdp->n_rp_gp_completed++; 1511 rdp->n_rp_gp_completed++;
1324 return 1; 1512 return 1;
1325 } 1513 }
1326 1514
1327 /* Has a new RCU grace period started? */ 1515 /* Has a new RCU grace period started? */
1328 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ 1516 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
1329 rdp->n_rp_gp_started++; 1517 rdp->n_rp_gp_started++;
1330 return 1; 1518 return 1;
1331 } 1519 }
1332 1520
1333 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1521 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1334 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && 1522 if (rcu_gp_in_progress(rsp) &&
1335 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { 1523 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
1336 rdp->n_rp_need_fqs++; 1524 rdp->n_rp_need_fqs++;
1337 return 1; 1525 return 1;
@@ -1369,6 +1557,97 @@ int rcu_needs_cpu(int cpu)
1369} 1557}
1370 1558
1371/* 1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1575static atomic_t rcu_barrier_cpu_count;
1576static DEFINE_MUTEX(rcu_barrier_mutex);
1577static struct completion rcu_barrier_completion;
1578
1579static void rcu_barrier_callback(struct rcu_head *notused)
1580{
1581 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1582 complete(&rcu_barrier_completion);
1583}
1584
1585/*
1586 * Called with preemption disabled, and from cross-cpu IRQ context.
1587 */
1588static void rcu_barrier_func(void *type)
1589{
1590 int cpu = smp_processor_id();
1591 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
1592 void (*call_rcu_func)(struct rcu_head *head,
1593 void (*func)(struct rcu_head *head));
1594
1595 atomic_inc(&rcu_barrier_cpu_count);
1596 call_rcu_func = type;
1597 call_rcu_func(head, rcu_barrier_callback);
1598}
1599
1600/*
1601 * Orchestrate the specified type of RCU barrier, waiting for all
1602 * RCU callbacks of the specified type to complete.
1603 */
1604static void _rcu_barrier(struct rcu_state *rsp,
1605 void (*call_rcu_func)(struct rcu_head *head,
1606 void (*func)(struct rcu_head *head)))
1607{
1608 BUG_ON(in_interrupt());
1609 /* Take mutex to serialize concurrent rcu_barrier() requests. */
1610 mutex_lock(&rcu_barrier_mutex);
1611 init_completion(&rcu_barrier_completion);
1612 /*
1613 * Initialize rcu_barrier_cpu_count to 1, then invoke
1614 * rcu_barrier_func() on each CPU, so that each CPU also has
1615 * incremented rcu_barrier_cpu_count. Only then is it safe to
1616 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1617 * might complete its grace period before all of the other CPUs
1618 * did their increment, causing this function to return too
1619 * early.
1620 */
1621 atomic_set(&rcu_barrier_cpu_count, 1);
1622 preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1623 rcu_adopt_orphan_cbs(rsp);
1624 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1625 preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1626 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1627 complete(&rcu_barrier_completion);
1628 wait_for_completion(&rcu_barrier_completion);
1629 mutex_unlock(&rcu_barrier_mutex);
1630}
1631
1632/**
1633 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
1634 */
1635void rcu_barrier_bh(void)
1636{
1637 _rcu_barrier(&rcu_bh_state, call_rcu_bh);
1638}
1639EXPORT_SYMBOL_GPL(rcu_barrier_bh);
1640
1641/**
1642 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
1643 */
1644void rcu_barrier_sched(void)
1645{
1646 _rcu_barrier(&rcu_sched_state, call_rcu_sched);
1647}
1648EXPORT_SYMBOL_GPL(rcu_barrier_sched);
1649
1650/*
1372 * Do boot-time initialization of a CPU's per-CPU RCU data. 1651 * Do boot-time initialization of a CPU's per-CPU RCU data.
1373 */ 1652 */
1374static void __init 1653static void __init
@@ -1403,21 +1682,18 @@ static void __cpuinit
1403rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 1682rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1404{ 1683{
1405 unsigned long flags; 1684 unsigned long flags;
1406 long lastcomp;
1407 unsigned long mask; 1685 unsigned long mask;
1408 struct rcu_data *rdp = rsp->rda[cpu]; 1686 struct rcu_data *rdp = rsp->rda[cpu];
1409 struct rcu_node *rnp = rcu_get_root(rsp); 1687 struct rcu_node *rnp = rcu_get_root(rsp);
1410 1688
1411 /* Set up local state, ensuring consistent view of global state. */ 1689 /* Set up local state, ensuring consistent view of global state. */
1412 spin_lock_irqsave(&rnp->lock, flags); 1690 spin_lock_irqsave(&rnp->lock, flags);
1413 lastcomp = rsp->completed;
1414 rdp->completed = lastcomp;
1415 rdp->gpnum = lastcomp;
1416 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1417 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1418 rdp->beenonline = 1; /* We have now been online. */ 1693 rdp->beenonline = 1; /* We have now been online. */
1419 rdp->preemptable = preemptable; 1694 rdp->preemptable = preemptable;
1420 rdp->passed_quiesc_completed = lastcomp - 1; 1695 rdp->qlen_last_fqs_check = 0;
1696 rdp->n_force_qs_snap = rsp->n_force_qs;
1421 rdp->blimit = blimit; 1697 rdp->blimit = blimit;
1422 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1698 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1423 1699
@@ -1437,6 +1713,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1437 spin_lock(&rnp->lock); /* irqs already disabled. */ 1713 spin_lock(&rnp->lock); /* irqs already disabled. */
1438 rnp->qsmaskinit |= mask; 1714 rnp->qsmaskinit |= mask;
1439 mask = rnp->grpmask; 1715 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) {
1717 rdp->gpnum = rnp->completed; /* if GP in progress... */
1718 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 }
1440 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1721 spin_unlock(&rnp->lock); /* irqs already disabled. */
1441 rnp = rnp->parent; 1722 rnp = rnp->parent;
1442 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
@@ -1454,8 +1735,8 @@ static void __cpuinit rcu_online_cpu(int cpu)
1454/* 1735/*
1455 * Handle CPU online/offline notification events. 1736 * Handle CPU online/offline notification events.
1456 */ 1737 */
1457int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1738static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1458 unsigned long action, void *hcpu) 1739 unsigned long action, void *hcpu)
1459{ 1740{
1460 long cpu = (long)hcpu; 1741 long cpu = (long)hcpu;
1461 1742
@@ -1464,6 +1745,22 @@ int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1464 case CPU_UP_PREPARE_FROZEN: 1745 case CPU_UP_PREPARE_FROZEN:
1465 rcu_online_cpu(cpu); 1746 rcu_online_cpu(cpu);
1466 break; 1747 break;
1748 case CPU_DYING:
1749 case CPU_DYING_FROZEN:
1750 /*
1751 * preempt_disable() in _rcu_barrier() prevents stop_machine(),
1752 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
1753 * returns, all online cpus have queued rcu_barrier_func().
1754 * The dying CPU clears its cpu_online_mask bit and
1755 * moves all of its RCU callbacks to ->orphan_cbs_list
1756 * in the context of stop_machine(), so subsequent calls
1757 * to _rcu_barrier() will adopt these callbacks and only
1758 * then queue rcu_barrier_func() on all remaining CPUs.
1759 */
1760 rcu_send_cbs_to_orphanage(&rcu_bh_state);
1761 rcu_send_cbs_to_orphanage(&rcu_sched_state);
1762 rcu_preempt_send_cbs_to_orphanage();
1763 break;
1467 case CPU_DEAD: 1764 case CPU_DEAD:
1468 case CPU_DEAD_FROZEN: 1765 case CPU_DEAD_FROZEN:
1469 case CPU_UP_CANCELED: 1766 case CPU_UP_CANCELED:
@@ -1527,6 +1824,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1527 rnp = rsp->level[i]; 1824 rnp = rsp->level[i];
1528 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1529 spin_lock_init(&rnp->lock); 1826 spin_lock_init(&rnp->lock);
1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]);
1530 rnp->gpnum = 0; 1828 rnp->gpnum = 0;
1531 rnp->qsmask = 0; 1829 rnp->qsmask = 0;
1532 rnp->qsmaskinit = 0; 1830 rnp->qsmaskinit = 0;
@@ -1547,6 +1845,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1547 rnp->level = i; 1845 rnp->level = i;
1548 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 1846 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1549 INIT_LIST_HEAD(&rnp->blocked_tasks[1]); 1847 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1848 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1550 } 1850 }
1551 } 1851 }
1552} 1852}
@@ -1558,6 +1858,10 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1558 */ 1858 */
1559#define RCU_INIT_FLAVOR(rsp, rcu_data) \ 1859#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1560do { \ 1860do { \
1861 int i; \
1862 int j; \
1863 struct rcu_node *rnp; \
1864 \
1561 rcu_init_one(rsp); \ 1865 rcu_init_one(rsp); \
1562 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ 1866 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1563 j = 0; \ 1867 j = 0; \
@@ -1570,41 +1874,30 @@ do { \
1570 } \ 1874 } \
1571} while (0) 1875} while (0)
1572 1876
1573#ifdef CONFIG_TREE_PREEMPT_RCU 1877void __init rcu_init(void)
1574
1575void __init __rcu_init_preempt(void)
1576{
1577 int i; /* All used by RCU_INIT_FLAVOR(). */
1578 int j;
1579 struct rcu_node *rnp;
1580
1581 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
1582}
1583
1584#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1585
1586void __init __rcu_init_preempt(void)
1587{ 1878{
1588} 1879 int i;
1589
1590#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1591
1592void __init __rcu_init(void)
1593{
1594 int i; /* All used by RCU_INIT_FLAVOR(). */
1595 int j;
1596 struct rcu_node *rnp;
1597 1880
1598 rcu_bootup_announce(); 1881 rcu_bootup_announce();
1599#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1600 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1601#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1602 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1603 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1604 __rcu_init_preempt(); 1890 __rcu_init_preempt();
1605 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1891 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1892
1893 /*
1894 * We don't need protection against CPU-hotplug here because
1895 * this is called early in boot, before either interrupts
1896 * or the scheduler are operational.
1897 */
1898 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i);
1606} 1901}
1607 1902
1608module_param(blimit, int, 0); 1903#include "rcutree_plugin.h"
1609module_param(qhimark, int, 0);
1610module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 8e8287a983c2..d2a0046f63b2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -34,10 +34,11 @@
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere. 35 * bug somewhere.
36 */ 36 */
37#define MAX_RCU_LVLS 3 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT)
41 42
42#if NR_CPUS <= RCU_FANOUT 43#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1 44# define NUM_RCU_LVLS 1
@@ -45,23 +46,33 @@
45# define NUM_RCU_LVL_1 (NR_CPUS) 46# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0 47# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0 48# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0
48#elif NR_CPUS <= RCU_FANOUT_SQ 50#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2 51# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1 52# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) 53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS) 54# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0 55# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE 57#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3 58# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1 59# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) 60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) 61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
59# define NUM_RCU_LVL_3 NR_CPUS 62# define NUM_RCU_LVL_3 NR_CPUS
63# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH
65# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
70# define NUM_RCU_LVL_4 NR_CPUS
60#else 71#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 73#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63 74
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) 75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
66 77
67/* 78/*
@@ -79,24 +90,67 @@ struct rcu_dynticks {
79 * Definition for node within the RCU grace-period-detection hierarchy. 90 * Definition for node within the RCU grace-period-detection hierarchy.
80 */ 91 */
81struct rcu_node { 92struct rcu_node {
82 spinlock_t lock; 93 spinlock_t lock; /* Root rcu_node's lock protects some */
94 /* rcu_state fields as well as following. */
83 long gpnum; /* Current grace period for this node. */ 95 long gpnum; /* Current grace period for this node. */
84 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
85 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */
99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */
86 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
87 /* order for current grace period to proceed.*/ 102 /* order for current grace period to proceed.*/
103 /* In leaf rcu_node, each bit corresponds to */
104 /* an rcu_data structure, otherwise, each */
105 /* bit corresponds to a child rcu_node */
106 /* structure. */
107 unsigned long expmask; /* Groups that have ->blocked_tasks[] */
108 /* elements that need to drain to allow the */
109 /* current expedited grace period to */
110 /* complete (only for TREE_PREEMPT_RCU). */
88 unsigned long qsmaskinit; 111 unsigned long qsmaskinit;
89 /* Per-GP initialization for qsmask. */ 112 /* Per-GP initial value for qsmask & expmask. */
90 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 113 unsigned long grpmask; /* Mask to apply to parent qsmask. */
114 /* Only one bit will be set in this mask. */
91 int grplo; /* lowest-numbered CPU or group here. */ 115 int grplo; /* lowest-numbered CPU or group here. */
92 int grphi; /* highest-numbered CPU or group here. */ 116 int grphi; /* highest-numbered CPU or group here. */
93 u8 grpnum; /* CPU/group number for next level up. */ 117 u8 grpnum; /* CPU/group number for next level up. */
94 u8 level; /* root is at level 0. */ 118 u8 level; /* root is at level 0. */
95 struct rcu_node *parent; 119 struct rcu_node *parent;
96 struct list_head blocked_tasks[2]; 120 struct list_head blocked_tasks[4];
97 /* Tasks blocked in RCU read-side critsect. */ 121 /* Tasks blocked in RCU read-side critsect. */
122 /* Grace period number (->gpnum) x blocked */
123 /* by tasks on the (x & 0x1) element of the */
124 /* blocked_tasks[] array. */
98} ____cacheline_internodealigned_in_smp; 125} ____cacheline_internodealigned_in_smp;
99 126
127/*
128 * Do a full breadth-first scan of the rcu_node structures for the
129 * specified rcu_state structure.
130 */
131#define rcu_for_each_node_breadth_first(rsp, rnp) \
132 for ((rnp) = &(rsp)->node[0]; \
133 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
134
135/*
136 * Do a breadth-first scan of the non-leaf rcu_node structures for the
137 * specified rcu_state structure. Note that if there is a singleton
138 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
139 */
140#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
141 for ((rnp) = &(rsp)->node[0]; \
142 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
143
144/*
145 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
146 * structure. Note that if there is a singleton rcu_node tree with but
147 * one rcu_node structure, this loop -will- visit the rcu_node structure.
148 * It is still a leaf node, even if it is also the root node.
149 */
150#define rcu_for_each_leaf_node(rsp, rnp) \
151 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
152 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
153
100/* Index values for nxttail array in struct rcu_data. */ 154/* Index values for nxttail array in struct rcu_data. */
101#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ 155#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
102#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ 156#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
@@ -126,23 +180,30 @@ struct rcu_data {
126 * Any of the partitions might be empty, in which case the 180 * Any of the partitions might be empty, in which case the
127 * pointer to that partition will be equal to the pointer for 181 * pointer to that partition will be equal to the pointer for
128 * the following partition. When the list is empty, all of 182 * the following partition. When the list is empty, all of
129 * the nxttail elements point to nxtlist, which is NULL. 183 * the nxttail elements point to the ->nxtlist pointer itself,
184 * which in that case is NULL.
130 * 185 *
131 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
132 * Entries that might have arrived after current GP ended
133 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
134 * Entries known to have arrived before current GP ended
135 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
136 * Entries that batch # <= ->completed - 1: waiting for current GP
137 * [nxtlist, *nxttail[RCU_DONE_TAIL]): 186 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
138 * Entries that batch # <= ->completed 187 * Entries that batch # <= ->completed
139 * The grace period for these entries has completed, and 188 * The grace period for these entries has completed, and
140 * the other grace-period-completed entries may be moved 189 * the other grace-period-completed entries may be moved
141 * here temporarily in rcu_process_callbacks(). 190 * here temporarily in rcu_process_callbacks().
191 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
192 * Entries that batch # <= ->completed - 1: waiting for current GP
193 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
194 * Entries known to have arrived before current GP ended
195 * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
196 * Entries that might have arrived after current GP ended
197 * Note that the value of *nxttail[RCU_NEXT_TAIL] will
198 * always be NULL, as this is the end of the list.
142 */ 199 */
143 struct rcu_head *nxtlist; 200 struct rcu_head *nxtlist;
144 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 201 struct rcu_head **nxttail[RCU_NEXT_SIZE];
145 long qlen; /* # of queued callbacks */ 202 long qlen; /* # of queued callbacks */
203 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */
205 unsigned long n_force_qs_snap;
206 /* did other CPU force QS recently? */
146 long blimit; /* Upper limit on a processed batch */ 207 long blimit; /* Upper limit on a processed batch */
147 208
148#ifdef CONFIG_NO_HZ 209#ifdef CONFIG_NO_HZ
@@ -173,13 +234,15 @@ struct rcu_data {
173}; 234};
174 235
175/* Values for signaled field in struct rcu_state. */ 236/* Values for signaled field in struct rcu_state. */
176#define RCU_GP_INIT 0 /* Grace period being initialized. */ 237#define RCU_GP_IDLE 0 /* No grace period in progress. */
177#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ 238#define RCU_GP_INIT 1 /* Grace period being initialized. */
178#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ 239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
179#ifdef CONFIG_NO_HZ 242#ifdef CONFIG_NO_HZ
180#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
181#else /* #ifdef CONFIG_NO_HZ */ 244#else /* #ifdef CONFIG_NO_HZ */
182#define RCU_SIGNAL_INIT RCU_FORCE_QS 245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED
183#endif /* #else #ifdef CONFIG_NO_HZ */ 246#endif /* #else #ifdef CONFIG_NO_HZ */
184 247
185#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
@@ -216,10 +279,23 @@ struct rcu_state {
216 /* Force QS state. */ 279 /* Force QS state. */
217 long gpnum; /* Current gp number. */ 280 long gpnum; /* Current gp number. */
218 long completed; /* # of last completed gp. */ 281 long completed; /* # of last completed gp. */
282
283 /* End of fields guarded by root rcu_node's lock. */
284
219 spinlock_t onofflock; /* exclude on/offline and */ 285 spinlock_t onofflock; /* exclude on/offline and */
220 /* starting new GP. */ 286 /* starting new GP. Also */
287 /* protects the following */
288 /* orphan_cbs fields. */
289 struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
290 /* orphaned by all CPUs in */
291 /* a given leaf rcu_node */
292 /* going offline. */
293 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
294 long orphan_qlen; /* Number of orphaned cbs. */
221 spinlock_t fqslock; /* Only one task forcing */ 295 spinlock_t fqslock; /* Only one task forcing */
222 /* quiescent states. */ 296 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
223 unsigned long jiffies_force_qs; /* Time at which to invoke */ 299 unsigned long jiffies_force_qs; /* Time at which to invoke */
224 /* force_quiescent_state(). */ 300 /* force_quiescent_state(). */
225 unsigned long n_force_qs; /* Number of calls to */ 301 unsigned long n_force_qs; /* Number of calls to */
@@ -234,11 +310,15 @@ struct rcu_state {
234 unsigned long jiffies_stall; /* Time at which to check */ 310 unsigned long jiffies_stall; /* Time at which to check */
235 /* for CPU stalls. */ 311 /* for CPU stalls. */
236#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 312#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237#ifdef CONFIG_NO_HZ
238 long dynticks_completed; /* Value of completed @ snap. */
239#endif /* #ifdef CONFIG_NO_HZ */
240}; 313};
241 314
315/* Return values for rcu_preempt_offline_tasks(). */
316
317#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
318 /* GP were moved to root. */
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */
321
242#ifdef RCU_TREE_NONCORE 322#ifdef RCU_TREE_NONCORE
243 323
244/* 324/*
@@ -255,5 +335,37 @@ extern struct rcu_state rcu_preempt_state;
255DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 335DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
256#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 336#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
257 337
258#endif /* #ifdef RCU_TREE_NONCORE */ 338#else /* #ifdef RCU_TREE_NONCORE */
339
340/* Forward declarations for rcutree_plugin.h */
341static void rcu_bootup_announce(void);
342long rcu_batches_completed(void);
343static void rcu_preempt_note_context_switch(int cpu);
344static int rcu_preempted_readers(struct rcu_node *rnp);
345#ifdef CONFIG_HOTPLUG_CPU
346static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */
349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
350static void rcu_print_task_stall(struct rcu_node *rnp);
351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
352static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
353#ifdef CONFIG_HOTPLUG_CPU
354static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
355 struct rcu_node *rnp,
356 struct rcu_data *rdp);
357static void rcu_preempt_offline_cpu(int cpu);
358#endif /* #ifdef CONFIG_HOTPLUG_CPU */
359static void rcu_preempt_check_callbacks(int cpu);
360static void rcu_preempt_process_callbacks(void);
361void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
362#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
363static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
364#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
365static int rcu_preempt_pending(int cpu);
366static int rcu_preempt_needs_cpu(int cpu);
367static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
368static void rcu_preempt_send_cbs_to_orphanage(void);
369static void __init __rcu_init_preempt(void);
259 370
371#endif /* #else #ifdef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 1cee04f627eb..37fbccdf41d5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -24,16 +24,19 @@
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> 24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */ 25 */
26 26
27#include <linux/delay.h>
27 28
28#ifdef CONFIG_TREE_PREEMPT_RCU 29#ifdef CONFIG_TREE_PREEMPT_RCU
29 30
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 32DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32 33
34static int rcu_preempted_readers_exp(struct rcu_node *rnp);
35
33/* 36/*
34 * Tell them what RCU they are running. 37 * Tell them what RCU they are running.
35 */ 38 */
36static inline void rcu_bootup_announce(void) 39static void __init rcu_bootup_announce(void)
37{ 40{
38 printk(KERN_INFO 41 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n"); 42 "Experimental preemptable hierarchical RCU implementation.\n");
@@ -67,7 +70,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
67static void rcu_preempt_qs(int cpu) 70static void rcu_preempt_qs(int cpu)
68{ 71{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 72 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc_completed = rdp->completed; 73 rdp->passed_quiesc_completed = rdp->gpnum - 1;
71 barrier(); 74 barrier();
72 rdp->passed_quiesc = 1; 75 rdp->passed_quiesc = 1;
73} 76}
@@ -150,11 +153,65 @@ void __rcu_read_lock(void)
150} 153}
151EXPORT_SYMBOL_GPL(__rcu_read_lock); 154EXPORT_SYMBOL_GPL(__rcu_read_lock);
152 155
156/*
157 * Check for preempted RCU readers blocking the current grace period
158 * for the specified rcu_node structure. If the caller needs a reliable
159 * answer, it must hold the rcu_node's ->lock.
160 */
161static int rcu_preempted_readers(struct rcu_node *rnp)
162{
163 int phase = rnp->gpnum & 0x1;
164
165 return !list_empty(&rnp->blocked_tasks[phase]) ||
166 !list_empty(&rnp->blocked_tasks[phase + 2]);
167}
168
169/*
170 * Record a quiescent state for all tasks that were previously queued
171 * on the specified rcu_node structure and that were blocking the current
172 * RCU grace period. The caller must hold the specified rnp->lock with
173 * irqs disabled, and this lock is released upon return, but irqs remain
174 * disabled.
175 */
176static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
177 __releases(rnp->lock)
178{
179 unsigned long mask;
180 struct rcu_node *rnp_p;
181
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */
185 }
186
187 rnp_p = rnp->parent;
188 if (rnp_p == NULL) {
189 /*
190 * Either there is only one rcu_node in the tree,
191 * or tasks were kicked up to root rcu_node due to
192 * CPUs going offline.
193 */
194 rcu_report_qs_rsp(&rcu_preempt_state, flags);
195 return;
196 }
197
198 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
203}
204
205/*
206 * Handle special cases during rcu_read_unlock(), such as needing to
207 * notify RCU core processing or task having blocked during the RCU
208 * read-side critical section.
209 */
153static void rcu_read_unlock_special(struct task_struct *t) 210static void rcu_read_unlock_special(struct task_struct *t)
154{ 211{
155 int empty; 212 int empty;
213 int empty_exp;
156 unsigned long flags; 214 unsigned long flags;
157 unsigned long mask;
158 struct rcu_node *rnp; 215 struct rcu_node *rnp;
159 int special; 216 int special;
160 217
@@ -196,37 +253,31 @@ static void rcu_read_unlock_special(struct task_struct *t)
196 break; 253 break;
197 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 254 spin_unlock(&rnp->lock); /* irqs remain disabled. */
198 } 255 }
199 empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); 256 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp);
258 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
200 list_del_init(&t->rcu_node_entry); 259 list_del_init(&t->rcu_node_entry);
201 t->rcu_blocked_node = NULL; 260 t->rcu_blocked_node = NULL;
202 261
203 /* 262 /*
204 * If this was the last task on the current list, and if 263 * If this was the last task on the current list, and if
205 * we aren't waiting on any CPUs, report the quiescent state. 264 * we aren't waiting on any CPUs, report the quiescent state.
206 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk() 265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
207 * drop rnp->lock and restore irq.
208 */ 266 */
209 if (!empty && rnp->qsmask == 0 && 267 if (empty)
210 list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
211 struct rcu_node *rnp_p;
212
213 if (rnp->parent == NULL) {
214 /* Only one rcu_node in the tree. */
215 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
216 return;
217 }
218 /* Report up the rest of the hierarchy. */
219 mask = rnp->grpmask;
220 spin_unlock_irqrestore(&rnp->lock, flags); 268 spin_unlock_irqrestore(&rnp->lock, flags);
221 rnp_p = rnp->parent; 269 else
222 spin_lock_irqsave(&rnp_p->lock, flags); 270 rcu_report_unblock_qs_rnp(rnp, flags);
223 WARN_ON_ONCE(rnp->qsmask); 271
224 cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); 272 /*
225 return; 273 * If this was the last task on the expedited lists,
226 } 274 * then we need to report up the rcu_node hierarchy.
227 spin_unlock(&rnp->lock); 275 */
276 if (!empty_exp && !rcu_preempted_readers_exp(rnp))
277 rcu_report_exp_rnp(&rcu_preempt_state, rnp);
278 } else {
279 local_irq_restore(flags);
228 } 280 }
229 local_irq_restore(flags);
230} 281}
231 282
232/* 283/*
@@ -257,12 +308,12 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
257{ 308{
258 unsigned long flags; 309 unsigned long flags;
259 struct list_head *lp; 310 struct list_head *lp;
260 int phase = rnp->gpnum & 0x1; 311 int phase;
261 struct task_struct *t; 312 struct task_struct *t;
262 313
263 if (!list_empty(&rnp->blocked_tasks[phase])) { 314 if (rcu_preempted_readers(rnp)) {
264 spin_lock_irqsave(&rnp->lock, flags); 315 spin_lock_irqsave(&rnp->lock, flags);
265 phase = rnp->gpnum & 0x1; /* re-read under lock. */ 316 phase = rnp->gpnum & 0x1;
266 lp = &rnp->blocked_tasks[phase]; 317 lp = &rnp->blocked_tasks[phase];
267 list_for_each_entry(t, lp, rcu_node_entry) 318 list_for_each_entry(t, lp, rcu_node_entry)
268 printk(" P%d", t->pid); 319 printk(" P%d", t->pid);
@@ -281,20 +332,10 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
281 */ 332 */
282static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 333static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
283{ 334{
284 WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])); 335 WARN_ON_ONCE(rcu_preempted_readers(rnp));
285 WARN_ON_ONCE(rnp->qsmask); 336 WARN_ON_ONCE(rnp->qsmask);
286} 337}
287 338
288/*
289 * Check for preempted RCU readers for the specified rcu_node structure.
290 * If the caller needs a reliable answer, it must hold the rcu_node's
291 * >lock.
292 */
293static int rcu_preempted_readers(struct rcu_node *rnp)
294{
295 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
296}
297
298#ifdef CONFIG_HOTPLUG_CPU 339#ifdef CONFIG_HOTPLUG_CPU
299 340
300/* 341/*
@@ -303,26 +344,34 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
303 * rcu_node. The reason for not just moving them to the immediate 344 * rcu_node. The reason for not just moving them to the immediate
304 * parent is to remove the need for rcu_read_unlock_special() to 345 * parent is to remove the need for rcu_read_unlock_special() to
305 * make more than two attempts to acquire the target rcu_node's lock. 346 * make more than two attempts to acquire the target rcu_node's lock.
347 * Returns true if there were tasks blocking the current RCU grace
348 * period.
349 *
350 * Returns 1 if there was previously a task blocking the current grace
351 * period on the specified rcu_node structure.
306 * 352 *
307 * The caller must hold rnp->lock with irqs disabled. 353 * The caller must hold rnp->lock with irqs disabled.
308 */ 354 */
309static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 355static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
310 struct rcu_node *rnp, 356 struct rcu_node *rnp,
311 struct rcu_data *rdp) 357 struct rcu_data *rdp)
312{ 358{
313 int i; 359 int i;
314 struct list_head *lp; 360 struct list_head *lp;
315 struct list_head *lp_root; 361 struct list_head *lp_root;
362 int retval = 0;
316 struct rcu_node *rnp_root = rcu_get_root(rsp); 363 struct rcu_node *rnp_root = rcu_get_root(rsp);
317 struct task_struct *tp; 364 struct task_struct *tp;
318 365
319 if (rnp == rnp_root) { 366 if (rnp == rnp_root) {
320 WARN_ONCE(1, "Last CPU thought to be offlined?"); 367 WARN_ONCE(1, "Last CPU thought to be offlined?");
321 return; /* Shouldn't happen: at least one CPU online. */ 368 return 0; /* Shouldn't happen: at least one CPU online. */
322 } 369 }
323 WARN_ON_ONCE(rnp != rdp->mynode && 370 WARN_ON_ONCE(rnp != rdp->mynode &&
324 (!list_empty(&rnp->blocked_tasks[0]) || 371 (!list_empty(&rnp->blocked_tasks[0]) ||
325 !list_empty(&rnp->blocked_tasks[1]))); 372 !list_empty(&rnp->blocked_tasks[1]) ||
373 !list_empty(&rnp->blocked_tasks[2]) ||
374 !list_empty(&rnp->blocked_tasks[3])));
326 375
327 /* 376 /*
328 * Move tasks up to root rcu_node. Rely on the fact that the 377 * Move tasks up to root rcu_node. Rely on the fact that the
@@ -330,7 +379,11 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
330 * rcu_nodes in terms of gp_num value. This fact allows us to 379 * rcu_nodes in terms of gp_num value. This fact allows us to
331 * move the blocked_tasks[] array directly, element by element. 380 * move the blocked_tasks[] array directly, element by element.
332 */ 381 */
333 for (i = 0; i < 2; i++) { 382 if (rcu_preempted_readers(rnp))
383 retval |= RCU_OFL_TASKS_NORM_GP;
384 if (rcu_preempted_readers_exp(rnp))
385 retval |= RCU_OFL_TASKS_EXP_GP;
386 for (i = 0; i < 4; i++) {
334 lp = &rnp->blocked_tasks[i]; 387 lp = &rnp->blocked_tasks[i];
335 lp_root = &rnp_root->blocked_tasks[i]; 388 lp_root = &rnp_root->blocked_tasks[i];
336 while (!list_empty(lp)) { 389 while (!list_empty(lp)) {
@@ -342,6 +395,7 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
342 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
343 } 396 }
344 } 397 }
398 return retval;
345} 399}
346 400
347/* 401/*
@@ -392,6 +446,186 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
392} 446}
393EXPORT_SYMBOL_GPL(call_rcu); 447EXPORT_SYMBOL_GPL(call_rcu);
394 448
449/**
450 * synchronize_rcu - wait until a grace period has elapsed.
451 *
452 * Control will return to the caller some time after a full grace
453 * period has elapsed, in other words after all currently executing RCU
454 * read-side critical sections have completed. RCU read-side critical
455 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
456 * and may be nested.
457 */
458void synchronize_rcu(void)
459{
460 struct rcu_synchronize rcu;
461
462 if (!rcu_scheduler_active)
463 return;
464
465 init_completion(&rcu.completion);
466 /* Will wake me after RCU finished. */
467 call_rcu(&rcu.head, wakeme_after_rcu);
468 /* Wait for it. */
469 wait_for_completion(&rcu.completion);
470}
471EXPORT_SYMBOL_GPL(synchronize_rcu);
472
473static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
474static long sync_rcu_preempt_exp_count;
475static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
476
477/*
478 * Return non-zero if there are any tasks in RCU read-side critical
479 * sections blocking the current preemptible-RCU expedited grace period.
480 * If there is no preemptible-RCU expedited grace period currently in
481 * progress, returns zero unconditionally.
482 */
483static int rcu_preempted_readers_exp(struct rcu_node *rnp)
484{
485 return !list_empty(&rnp->blocked_tasks[2]) ||
486 !list_empty(&rnp->blocked_tasks[3]);
487}
488
489/*
490 * return non-zero if there is no RCU expedited grace period in progress
491 * for the specified rcu_node structure, in other words, if all CPUs and
492 * tasks covered by the specified rcu_node structure have done their bit
493 * for the current expedited grace period. Works only for preemptible
494 * RCU -- other RCU implementation use other means.
495 *
496 * Caller must hold sync_rcu_preempt_exp_mutex.
497 */
498static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
499{
500 return !rcu_preempted_readers_exp(rnp) &&
501 ACCESS_ONCE(rnp->expmask) == 0;
502}
503
504/*
505 * Report the exit from RCU read-side critical section for the last task
506 * that queued itself during or before the current expedited preemptible-RCU
507 * grace period. This event is reported either to the rcu_node structure on
508 * which the task was queued or to one of that rcu_node structure's ancestors,
509 * recursively up the tree. (Calm down, calm down, we do the recursion
510 * iteratively!)
511 *
512 * Caller must hold sync_rcu_preempt_exp_mutex.
513 */
514static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
515{
516 unsigned long flags;
517 unsigned long mask;
518
519 spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp))
522 break;
523 if (rnp->parent == NULL) {
524 wake_up(&sync_rcu_preempt_exp_wq);
525 break;
526 }
527 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask;
532 }
533 spin_unlock_irqrestore(&rnp->lock, flags);
534}
535
536/*
537 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
538 * grace period for the specified rcu_node structure. If there are no such
539 * tasks, report it up the rcu_node hierarchy.
540 *
541 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
542 */
543static void
544sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{
546 int must_wait;
547
548 spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp);
555}
556
557/*
558 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
559 * is to invoke synchronize_sched_expedited() to push all the tasks to
560 * the ->blocked_tasks[] lists, move all entries from the first set of
561 * ->blocked_tasks[] lists to the second set, and finally wait for this
562 * second set to drain.
563 */
564void synchronize_rcu_expedited(void)
565{
566 unsigned long flags;
567 struct rcu_node *rnp;
568 struct rcu_state *rsp = &rcu_preempt_state;
569 long snap;
570 int trycount = 0;
571
572 smp_mb(); /* Caller's modifications seen first by other CPUs. */
573 snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
574 smp_mb(); /* Above access cannot bleed into critical section. */
575
576 /*
577 * Acquire lock, falling back to synchronize_rcu() if too many
578 * lock-acquisition failures. Of course, if someone does the
579 * expedited grace period for us, just leave.
580 */
581 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
582 if (trycount++ < 10)
583 udelay(trycount * num_online_cpus());
584 else {
585 synchronize_rcu();
586 return;
587 }
588 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
589 goto mb_ret; /* Others did our work for us. */
590 }
591 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
592 goto unlock_mb_ret; /* Others did our work for us. */
593
594 /* force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited();
596
597 spin_lock_irqsave(&rsp->onofflock, flags);
598
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 }
605
606 /* Snapshot current state of ->blocked_tasks[] lists. */
607 rcu_for_each_leaf_node(rsp, rnp)
608 sync_rcu_preempt_exp_init(rsp, rnp);
609 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611
612 spin_unlock_irqrestore(&rsp->onofflock, flags);
613
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp);
616 wait_event(sync_rcu_preempt_exp_wq,
617 sync_rcu_preempt_exp_done(rnp));
618
619 /* Clean up and exit. */
620 smp_mb(); /* ensure expedited GP seen before counter increment. */
621 ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
622unlock_mb_ret:
623 mutex_unlock(&sync_rcu_preempt_exp_mutex);
624mb_ret:
625 smp_mb(); /* ensure subsequent action seen after grace period. */
626}
627EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
628
395/* 629/*
396 * Check to see if there is any immediate preemptable-RCU-related work 630 * Check to see if there is any immediate preemptable-RCU-related work
397 * to be done. 631 * to be done.
@@ -410,6 +644,15 @@ static int rcu_preempt_needs_cpu(int cpu)
410 return !!per_cpu(rcu_preempt_data, cpu).nxtlist; 644 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
411} 645}
412 646
647/**
648 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
649 */
650void rcu_barrier(void)
651{
652 _rcu_barrier(&rcu_preempt_state, call_rcu);
653}
654EXPORT_SYMBOL_GPL(rcu_barrier);
655
413/* 656/*
414 * Initialize preemptable RCU's per-CPU data. 657 * Initialize preemptable RCU's per-CPU data.
415 */ 658 */
@@ -419,6 +662,22 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
419} 662}
420 663
421/* 664/*
665 * Move preemptable RCU's callbacks to ->orphan_cbs_list.
666 */
667static void rcu_preempt_send_cbs_to_orphanage(void)
668{
669 rcu_send_cbs_to_orphanage(&rcu_preempt_state);
670}
671
672/*
673 * Initialize preemptable RCU's state structures.
674 */
675static void __init __rcu_init_preempt(void)
676{
677 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
678}
679
680/*
422 * Check for a task exiting while in a preemptable-RCU read-side 681 * Check for a task exiting while in a preemptable-RCU read-side
423 * critical section, clean up if so. No need to issue warnings, 682 * critical section, clean up if so. No need to issue warnings,
424 * as debug_check_no_locks_held() already does this if lockdep 683 * as debug_check_no_locks_held() already does this if lockdep
@@ -439,7 +698,7 @@ void exit_rcu(void)
439/* 698/*
440 * Tell them what RCU they are running. 699 * Tell them what RCU they are running.
441 */ 700 */
442static inline void rcu_bootup_announce(void) 701static void __init rcu_bootup_announce(void)
443{ 702{
444 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 703 printk(KERN_INFO "Hierarchical RCU implementation.\n");
445} 704}
@@ -461,6 +720,25 @@ static void rcu_preempt_note_context_switch(int cpu)
461{ 720{
462} 721}
463 722
723/*
724 * Because preemptable RCU does not exist, there are never any preempted
725 * RCU readers.
726 */
727static int rcu_preempted_readers(struct rcu_node *rnp)
728{
729 return 0;
730}
731
732#ifdef CONFIG_HOTPLUG_CPU
733
734/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{
737 spin_unlock_irqrestore(&rnp->lock, flags);
738}
739
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */
741
464#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 742#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
465 743
466/* 744/*
@@ -483,25 +761,19 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
483 WARN_ON_ONCE(rnp->qsmask); 761 WARN_ON_ONCE(rnp->qsmask);
484} 762}
485 763
486/*
487 * Because preemptable RCU does not exist, there are never any preempted
488 * RCU readers.
489 */
490static int rcu_preempted_readers(struct rcu_node *rnp)
491{
492 return 0;
493}
494
495#ifdef CONFIG_HOTPLUG_CPU 764#ifdef CONFIG_HOTPLUG_CPU
496 765
497/* 766/*
498 * Because preemptable RCU does not exist, it never needs to migrate 767 * Because preemptable RCU does not exist, it never needs to migrate
499 * tasks that were blocked within RCU read-side critical sections. 768 * tasks that were blocked within RCU read-side critical sections, and
769 * such non-existent tasks cannot possibly have been blocking the current
770 * grace period.
500 */ 771 */
501static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 772static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
502 struct rcu_node *rnp, 773 struct rcu_node *rnp,
503 struct rcu_data *rdp) 774 struct rcu_data *rdp)
504{ 775{
776 return 0;
505} 777}
506 778
507/* 779/*
@@ -518,7 +790,7 @@ static void rcu_preempt_offline_cpu(int cpu)
518 * Because preemptable RCU does not exist, it never has any callbacks 790 * Because preemptable RCU does not exist, it never has any callbacks
519 * to check. 791 * to check.
520 */ 792 */
521void rcu_preempt_check_callbacks(int cpu) 793static void rcu_preempt_check_callbacks(int cpu)
522{ 794{
523} 795}
524 796
@@ -526,7 +798,7 @@ void rcu_preempt_check_callbacks(int cpu)
526 * Because preemptable RCU does not exist, it never has any callbacks 798 * Because preemptable RCU does not exist, it never has any callbacks
527 * to process. 799 * to process.
528 */ 800 */
529void rcu_preempt_process_callbacks(void) 801static void rcu_preempt_process_callbacks(void)
530{ 802{
531} 803}
532 804
@@ -540,6 +812,30 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
540EXPORT_SYMBOL_GPL(call_rcu); 812EXPORT_SYMBOL_GPL(call_rcu);
541 813
542/* 814/*
815 * Wait for an rcu-preempt grace period, but make it happen quickly.
816 * But because preemptable RCU does not exist, map to rcu-sched.
817 */
818void synchronize_rcu_expedited(void)
819{
820 synchronize_sched_expedited();
821}
822EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
823
824#ifdef CONFIG_HOTPLUG_CPU
825
826/*
827 * Because preemptable RCU does not exist, there is never any need to
828 * report on tasks preempted in RCU read-side critical sections during
829 * expedited RCU grace periods.
830 */
831static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
832{
833 return;
834}
835
836#endif /* #ifdef CONFIG_HOTPLUG_CPU */
837
838/*
543 * Because preemptable RCU does not exist, it never has any work to do. 839 * Because preemptable RCU does not exist, it never has any work to do.
544 */ 840 */
545static int rcu_preempt_pending(int cpu) 841static int rcu_preempt_pending(int cpu)
@@ -556,6 +852,16 @@ static int rcu_preempt_needs_cpu(int cpu)
556} 852}
557 853
558/* 854/*
855 * Because preemptable RCU does not exist, rcu_barrier() is just
856 * another name for rcu_barrier_sched().
857 */
858void rcu_barrier(void)
859{
860 rcu_barrier_sched();
861}
862EXPORT_SYMBOL_GPL(rcu_barrier);
863
864/*
559 * Because preemptable RCU does not exist, there is no per-CPU 865 * Because preemptable RCU does not exist, there is no per-CPU
560 * data to initialize. 866 * data to initialize.
561 */ 867 */
@@ -563,4 +869,18 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
563{ 869{
564} 870}
565 871
872/*
873 * Because there is no preemptable RCU, there are no callbacks to move.
874 */
875static void rcu_preempt_send_cbs_to_orphanage(void)
876{
877}
878
879/*
880 * Because preemptable RCU does not exist, it need not be initialized.
881 */
882static void __init __rcu_init_preempt(void)
883{
884}
885
566#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 886#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index c89f5e9fd173..9d2c88423b31 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -93,7 +93,7 @@ static int rcudata_open(struct inode *inode, struct file *file)
93 return single_open(file, show_rcudata, NULL); 93 return single_open(file, show_rcudata, NULL);
94} 94}
95 95
96static struct file_operations rcudata_fops = { 96static const struct file_operations rcudata_fops = {
97 .owner = THIS_MODULE, 97 .owner = THIS_MODULE,
98 .open = rcudata_open, 98 .open = rcudata_open,
99 .read = seq_read, 99 .read = seq_read,
@@ -145,7 +145,7 @@ static int rcudata_csv_open(struct inode *inode, struct file *file)
145 return single_open(file, show_rcudata_csv, NULL); 145 return single_open(file, show_rcudata_csv, NULL);
146} 146}
147 147
148static struct file_operations rcudata_csv_fops = { 148static const struct file_operations rcudata_csv_fops = {
149 .owner = THIS_MODULE, 149 .owner = THIS_MODULE,
150 .open = rcudata_csv_open, 150 .open = rcudata_csv_open,
151 .read = seq_read, 151 .read = seq_read,
@@ -155,24 +155,32 @@ static struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum;
158 int level = 0; 159 int level = 0;
160 int phase;
159 struct rcu_node *rnp; 161 struct rcu_node *rnp;
160 162
163 gpnum = rsp->gpnum;
161 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
162 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
163 rsp->completed, rsp->gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
164 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
165 (int)(jiffies & 0xffff), 168 (int)(jiffies & 0xffff),
166 rsp->n_force_qs, rsp->n_force_qs_ngp, 169 rsp->n_force_qs, rsp->n_force_qs_ngp,
167 rsp->n_force_qs - rsp->n_force_qs_ngp, 170 rsp->n_force_qs - rsp->n_force_qs_ngp,
168 rsp->n_force_qs_lh); 171 rsp->n_force_qs_lh, rsp->orphan_qlen);
169 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 172 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
170 if (rnp->level != level) { 173 if (rnp->level != level) {
171 seq_puts(m, "\n"); 174 seq_puts(m, "\n");
172 level = rnp->level; 175 level = rnp->level;
173 } 176 }
174 seq_printf(m, "%lx/%lx %d:%d ^%d ", 177 phase = gpnum & 0x1;
178 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
175 rnp->qsmask, rnp->qsmaskinit, 179 rnp->qsmask, rnp->qsmaskinit,
180 "T."[list_empty(&rnp->blocked_tasks[phase])],
181 "E."[list_empty(&rnp->blocked_tasks[phase + 2])],
182 "T."[list_empty(&rnp->blocked_tasks[!phase])],
183 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
176 rnp->grplo, rnp->grphi, rnp->grpnum); 184 rnp->grplo, rnp->grphi, rnp->grpnum);
177 } 185 }
178 seq_puts(m, "\n"); 186 seq_puts(m, "\n");
@@ -196,7 +204,7 @@ static int rcuhier_open(struct inode *inode, struct file *file)
196 return single_open(file, show_rcuhier, NULL); 204 return single_open(file, show_rcuhier, NULL);
197} 205}
198 206
199static struct file_operations rcuhier_fops = { 207static const struct file_operations rcuhier_fops = {
200 .owner = THIS_MODULE, 208 .owner = THIS_MODULE,
201 .open = rcuhier_open, 209 .open = rcuhier_open,
202 .read = seq_read, 210 .read = seq_read,
@@ -222,7 +230,7 @@ static int rcugp_open(struct inode *inode, struct file *file)
222 return single_open(file, show_rcugp, NULL); 230 return single_open(file, show_rcugp, NULL);
223} 231}
224 232
225static struct file_operations rcugp_fops = { 233static const struct file_operations rcugp_fops = {
226 .owner = THIS_MODULE, 234 .owner = THIS_MODULE,
227 .open = rcugp_open, 235 .open = rcugp_open,
228 .read = seq_read, 236 .read = seq_read,
@@ -276,7 +284,7 @@ static int rcu_pending_open(struct inode *inode, struct file *file)
276 return single_open(file, show_rcu_pending, NULL); 284 return single_open(file, show_rcu_pending, NULL);
277} 285}
278 286
279static struct file_operations rcu_pending_fops = { 287static const struct file_operations rcu_pending_fops = {
280 .owner = THIS_MODULE, 288 .owner = THIS_MODULE,
281 .open = rcu_pending_open, 289 .open = rcu_pending_open,
282 .read = seq_read, 290 .read = seq_read,
diff --git a/kernel/relay.c b/kernel/relay.c
index bc188549788f..760c26209a3c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -60,7 +60,7 @@ static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
60/* 60/*
61 * vm_ops for relay file mappings. 61 * vm_ops for relay file mappings.
62 */ 62 */
63static struct vm_operations_struct relay_file_mmap_ops = { 63static const struct vm_operations_struct relay_file_mmap_ops = {
64 .fault = relay_buf_fault, 64 .fault = relay_buf_fault,
65 .close = relay_file_mmap_close, 65 .close = relay_file_mmap_close,
66}; 66};
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index e1338f074314..bcdabf37c40b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = RESOURCE_MAX; 21 counter->limit = RESOURCE_MAX;
22 counter->soft_limit = RESOURCE_MAX;
22 counter->parent = parent; 23 counter->parent = parent;
23} 24}
24 25
@@ -101,6 +102,8 @@ res_counter_member(struct res_counter *counter, int member)
101 return &counter->limit; 102 return &counter->limit;
102 case RES_FAILCNT: 103 case RES_FAILCNT:
103 return &counter->failcnt; 104 return &counter->failcnt;
105 case RES_SOFT_LIMIT:
106 return &counter->soft_limit;
104 }; 107 };
105 108
106 BUG(); 109 BUG();
diff --git a/kernel/sched.c b/kernel/sched.c
index 2f76e06bea58..aa31244caa9f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
309 */ 309 */
310static DEFINE_SPINLOCK(task_group_lock); 310static DEFINE_SPINLOCK(task_group_lock);
311 311
312#ifdef CONFIG_FAIR_GROUP_SCHED
313
312#ifdef CONFIG_SMP 314#ifdef CONFIG_SMP
313static int root_task_group_empty(void) 315static int root_task_group_empty(void)
314{ 316{
@@ -316,7 +318,6 @@ static int root_task_group_empty(void)
316} 318}
317#endif 319#endif
318 320
319#ifdef CONFIG_FAIR_GROUP_SCHED
320#ifdef CONFIG_USER_SCHED 321#ifdef CONFIG_USER_SCHED
321# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
322#else /* !CONFIG_USER_SCHED */ 323#else /* !CONFIG_USER_SCHED */
@@ -534,14 +535,12 @@ struct rq {
534 #define CPU_LOAD_IDX_MAX 5 535 #define CPU_LOAD_IDX_MAX 5
535 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 536 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
536#ifdef CONFIG_NO_HZ 537#ifdef CONFIG_NO_HZ
537 unsigned long last_tick_seen;
538 unsigned char in_nohz_recently; 538 unsigned char in_nohz_recently;
539#endif 539#endif
540 /* capture load from *all* tasks on this cpu: */ 540 /* capture load from *all* tasks on this cpu: */
541 struct load_weight load; 541 struct load_weight load;
542 unsigned long nr_load_updates; 542 unsigned long nr_load_updates;
543 u64 nr_switches; 543 u64 nr_switches;
544 u64 nr_migrations_in;
545 544
546 struct cfs_rq cfs; 545 struct cfs_rq cfs;
547 struct rt_rq rt; 546 struct rt_rq rt;
@@ -590,6 +589,8 @@ struct rq {
590 589
591 u64 rt_avg; 590 u64 rt_avg;
592 u64 age_stamp; 591 u64 age_stamp;
592 u64 idle_stamp;
593 u64 avg_idle;
593#endif 594#endif
594 595
595 /* calc_load related fields */ 596 /* calc_load related fields */
@@ -676,6 +677,7 @@ inline void update_rq_clock(struct rq *rq)
676 677
677/** 678/**
678 * runqueue_is_locked 679 * runqueue_is_locked
680 * @cpu: the processor in question.
679 * 681 *
680 * Returns true if the current cpu runqueue is locked. 682 * Returns true if the current cpu runqueue is locked.
681 * This interface allows printk to be called with the runqueue lock 683 * This interface allows printk to be called with the runqueue lock
@@ -770,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
770 if (!sched_feat_names[i]) 772 if (!sched_feat_names[i])
771 return -EINVAL; 773 return -EINVAL;
772 774
773 filp->f_pos += cnt; 775 *ppos += cnt;
774 776
775 return cnt; 777 return cnt;
776} 778}
@@ -780,7 +782,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp)
780 return single_open(filp, sched_feat_show, NULL); 782 return single_open(filp, sched_feat_show, NULL);
781} 783}
782 784
783static struct file_operations sched_feat_fops = { 785static const struct file_operations sched_feat_fops = {
784 .open = sched_feat_open, 786 .open = sched_feat_open,
785 .write = sched_feat_write, 787 .write = sched_feat_write,
786 .read = seq_read, 788 .read = seq_read,
@@ -1563,11 +1565,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1563 1565
1564#ifdef CONFIG_FAIR_GROUP_SCHED 1566#ifdef CONFIG_FAIR_GROUP_SCHED
1565 1567
1566struct update_shares_data { 1568static __read_mostly unsigned long *update_shares_data;
1567 unsigned long rq_weight[NR_CPUS];
1568};
1569
1570static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
1571 1569
1572static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1570static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1573 1571
@@ -1577,12 +1575,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1577static void update_group_shares_cpu(struct task_group *tg, int cpu, 1575static void update_group_shares_cpu(struct task_group *tg, int cpu,
1578 unsigned long sd_shares, 1576 unsigned long sd_shares,
1579 unsigned long sd_rq_weight, 1577 unsigned long sd_rq_weight,
1580 struct update_shares_data *usd) 1578 unsigned long *usd_rq_weight)
1581{ 1579{
1582 unsigned long shares, rq_weight; 1580 unsigned long shares, rq_weight;
1583 int boost = 0; 1581 int boost = 0;
1584 1582
1585 rq_weight = usd->rq_weight[cpu]; 1583 rq_weight = usd_rq_weight[cpu];
1586 if (!rq_weight) { 1584 if (!rq_weight) {
1587 boost = 1; 1585 boost = 1;
1588 rq_weight = NICE_0_LOAD; 1586 rq_weight = NICE_0_LOAD;
@@ -1617,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1617static int tg_shares_up(struct task_group *tg, void *data) 1615static int tg_shares_up(struct task_group *tg, void *data)
1618{ 1616{
1619 unsigned long weight, rq_weight = 0, shares = 0; 1617 unsigned long weight, rq_weight = 0, shares = 0;
1620 struct update_shares_data *usd; 1618 unsigned long *usd_rq_weight;
1621 struct sched_domain *sd = data; 1619 struct sched_domain *sd = data;
1622 unsigned long flags; 1620 unsigned long flags;
1623 int i; 1621 int i;
@@ -1626,11 +1624,11 @@ static int tg_shares_up(struct task_group *tg, void *data)
1626 return 0; 1624 return 0;
1627 1625
1628 local_irq_save(flags); 1626 local_irq_save(flags);
1629 usd = &__get_cpu_var(update_shares_data); 1627 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1630 1628
1631 for_each_cpu(i, sched_domain_span(sd)) { 1629 for_each_cpu(i, sched_domain_span(sd)) {
1632 weight = tg->cfs_rq[i]->load.weight; 1630 weight = tg->cfs_rq[i]->load.weight;
1633 usd->rq_weight[i] = weight; 1631 usd_rq_weight[i] = weight;
1634 1632
1635 /* 1633 /*
1636 * If there are currently no tasks on the cpu pretend there 1634 * If there are currently no tasks on the cpu pretend there
@@ -1651,7 +1649,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1651 shares = tg->shares; 1649 shares = tg->shares;
1652 1650
1653 for_each_cpu(i, sched_domain_span(sd)) 1651 for_each_cpu(i, sched_domain_span(sd))
1654 update_group_shares_cpu(tg, i, shares, rq_weight, usd); 1652 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1655 1653
1656 local_irq_restore(flags); 1654 local_irq_restore(flags);
1657 1655
@@ -1995,6 +1993,39 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1995 p->sched_class->prio_changed(rq, p, oldprio, running); 1993 p->sched_class->prio_changed(rq, p, oldprio, running);
1996} 1994}
1997 1995
1996/**
1997 * kthread_bind - bind a just-created kthread to a cpu.
1998 * @p: thread created by kthread_create().
1999 * @cpu: cpu (might not be online, must be possible) for @k to run on.
2000 *
2001 * Description: This function is equivalent to set_cpus_allowed(),
2002 * except that @cpu doesn't need to be online, and the thread must be
2003 * stopped (i.e., just returned from kthread_create()).
2004 *
2005 * Function lives here instead of kthread.c because it messes with
2006 * scheduler internals which require locking.
2007 */
2008void kthread_bind(struct task_struct *p, unsigned int cpu)
2009{
2010 struct rq *rq = cpu_rq(cpu);
2011 unsigned long flags;
2012
2013 /* Must have done schedule() in kthread() before we set_task_cpu */
2014 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
2015 WARN_ON(1);
2016 return;
2017 }
2018
2019 spin_lock_irqsave(&rq->lock, flags);
2020 update_rq_clock(rq);
2021 set_task_cpu(p, cpu);
2022 p->cpus_allowed = cpumask_of_cpu(cpu);
2023 p->rt.nr_cpus_allowed = 1;
2024 p->flags |= PF_THREAD_BOUND;
2025 spin_unlock_irqrestore(&rq->lock, flags);
2026}
2027EXPORT_SYMBOL(kthread_bind);
2028
1998#ifdef CONFIG_SMP 2029#ifdef CONFIG_SMP
1999/* 2030/*
2000 * Is this task likely cache-hot: 2031 * Is this task likely cache-hot:
@@ -2007,7 +2038,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2007 /* 2038 /*
2008 * Buddy candidates are cache hot: 2039 * Buddy candidates are cache hot:
2009 */ 2040 */
2010 if (sched_feat(CACHE_HOT_BUDDY) && 2041 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2011 (&p->se == cfs_rq_of(&p->se)->next || 2042 (&p->se == cfs_rq_of(&p->se)->next ||
2012 &p->se == cfs_rq_of(&p->se)->last)) 2043 &p->se == cfs_rq_of(&p->se)->last))
2013 return 1; 2044 return 1;
@@ -2048,7 +2079,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2048#endif 2079#endif
2049 if (old_cpu != new_cpu) { 2080 if (old_cpu != new_cpu) {
2050 p->se.nr_migrations++; 2081 p->se.nr_migrations++;
2051 new_rq->nr_migrations_in++;
2052#ifdef CONFIG_SCHEDSTATS 2082#ifdef CONFIG_SCHEDSTATS
2053 if (task_hot(p, old_rq->clock, NULL)) 2083 if (task_hot(p, old_rq->clock, NULL))
2054 schedstat_inc(p, se.nr_forced2_migrations); 2084 schedstat_inc(p, se.nr_forced2_migrations);
@@ -2085,6 +2115,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2085 * it is sufficient to simply update the task's cpu field. 2115 * it is sufficient to simply update the task's cpu field.
2086 */ 2116 */
2087 if (!p->se.on_rq && !task_running(rq, p)) { 2117 if (!p->se.on_rq && !task_running(rq, p)) {
2118 update_rq_clock(rq);
2088 set_task_cpu(p, dest_cpu); 2119 set_task_cpu(p, dest_cpu);
2089 return 0; 2120 return 0;
2090 } 2121 }
@@ -2311,7 +2342,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2311{ 2342{
2312 int cpu, orig_cpu, this_cpu, success = 0; 2343 int cpu, orig_cpu, this_cpu, success = 0;
2313 unsigned long flags; 2344 unsigned long flags;
2314 struct rq *rq; 2345 struct rq *rq, *orig_rq;
2315 2346
2316 if (!sched_feat(SYNC_WAKEUPS)) 2347 if (!sched_feat(SYNC_WAKEUPS))
2317 wake_flags &= ~WF_SYNC; 2348 wake_flags &= ~WF_SYNC;
@@ -2319,7 +2350,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2319 this_cpu = get_cpu(); 2350 this_cpu = get_cpu();
2320 2351
2321 smp_wmb(); 2352 smp_wmb();
2322 rq = task_rq_lock(p, &flags); 2353 rq = orig_rq = task_rq_lock(p, &flags);
2323 update_rq_clock(rq); 2354 update_rq_clock(rq);
2324 if (!(p->state & state)) 2355 if (!(p->state & state))
2325 goto out; 2356 goto out;
@@ -2346,10 +2377,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2346 task_rq_unlock(rq, &flags); 2377 task_rq_unlock(rq, &flags);
2347 2378
2348 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2379 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2349 if (cpu != orig_cpu) 2380 if (cpu != orig_cpu) {
2381 local_irq_save(flags);
2382 rq = cpu_rq(cpu);
2383 update_rq_clock(rq);
2350 set_task_cpu(p, cpu); 2384 set_task_cpu(p, cpu);
2351 2385 local_irq_restore(flags);
2386 }
2352 rq = task_rq_lock(p, &flags); 2387 rq = task_rq_lock(p, &flags);
2388
2353 WARN_ON(p->state != TASK_WAKING); 2389 WARN_ON(p->state != TASK_WAKING);
2354 cpu = task_cpu(p); 2390 cpu = task_cpu(p);
2355 2391
@@ -2406,6 +2442,17 @@ out_running:
2406#ifdef CONFIG_SMP 2442#ifdef CONFIG_SMP
2407 if (p->sched_class->task_wake_up) 2443 if (p->sched_class->task_wake_up)
2408 p->sched_class->task_wake_up(rq, p); 2444 p->sched_class->task_wake_up(rq, p);
2445
2446 if (unlikely(rq->idle_stamp)) {
2447 u64 delta = rq->clock - rq->idle_stamp;
2448 u64 max = 2*sysctl_sched_migration_cost;
2449
2450 if (delta > max)
2451 rq->avg_idle = max;
2452 else
2453 update_avg(&rq->avg_idle, delta);
2454 rq->idle_stamp = 0;
2455 }
2409#endif 2456#endif
2410out: 2457out:
2411 task_rq_unlock(rq, &flags); 2458 task_rq_unlock(rq, &flags);
@@ -2511,26 +2558,22 @@ static void __sched_fork(struct task_struct *p)
2511void sched_fork(struct task_struct *p, int clone_flags) 2558void sched_fork(struct task_struct *p, int clone_flags)
2512{ 2559{
2513 int cpu = get_cpu(); 2560 int cpu = get_cpu();
2561 unsigned long flags;
2514 2562
2515 __sched_fork(p); 2563 __sched_fork(p);
2516 2564
2517 /* 2565 /*
2518 * Make sure we do not leak PI boosting priority to the child.
2519 */
2520 p->prio = current->normal_prio;
2521
2522 /*
2523 * Revert to default priority/policy on fork if requested. 2566 * Revert to default priority/policy on fork if requested.
2524 */ 2567 */
2525 if (unlikely(p->sched_reset_on_fork)) { 2568 if (unlikely(p->sched_reset_on_fork)) {
2526 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) 2569 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2527 p->policy = SCHED_NORMAL; 2570 p->policy = SCHED_NORMAL;
2528 2571 p->normal_prio = p->static_prio;
2529 if (p->normal_prio < DEFAULT_PRIO) 2572 }
2530 p->prio = DEFAULT_PRIO;
2531 2573
2532 if (PRIO_TO_NICE(p->static_prio) < 0) { 2574 if (PRIO_TO_NICE(p->static_prio) < 0) {
2533 p->static_prio = NICE_TO_PRIO(0); 2575 p->static_prio = NICE_TO_PRIO(0);
2576 p->normal_prio = p->static_prio;
2534 set_load_weight(p); 2577 set_load_weight(p);
2535 } 2578 }
2536 2579
@@ -2541,13 +2584,21 @@ void sched_fork(struct task_struct *p, int clone_flags)
2541 p->sched_reset_on_fork = 0; 2584 p->sched_reset_on_fork = 0;
2542 } 2585 }
2543 2586
2587 /*
2588 * Make sure we do not leak PI boosting priority to the child.
2589 */
2590 p->prio = current->normal_prio;
2591
2544 if (!rt_prio(p->prio)) 2592 if (!rt_prio(p->prio))
2545 p->sched_class = &fair_sched_class; 2593 p->sched_class = &fair_sched_class;
2546 2594
2547#ifdef CONFIG_SMP 2595#ifdef CONFIG_SMP
2548 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); 2596 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2549#endif 2597#endif
2598 local_irq_save(flags);
2599 update_rq_clock(cpu_rq(cpu));
2550 set_task_cpu(p, cpu); 2600 set_task_cpu(p, cpu);
2601 local_irq_restore(flags);
2551 2602
2552#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2603#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2553 if (likely(sched_info_on())) 2604 if (likely(sched_info_on()))
@@ -2581,8 +2632,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2581 BUG_ON(p->state != TASK_RUNNING); 2632 BUG_ON(p->state != TASK_RUNNING);
2582 update_rq_clock(rq); 2633 update_rq_clock(rq);
2583 2634
2584 p->prio = effective_prio(p);
2585
2586 if (!p->sched_class->task_new || !current->se.on_rq) { 2635 if (!p->sched_class->task_new || !current->se.on_rq) {
2587 activate_task(rq, p, 0); 2636 activate_task(rq, p, 0);
2588 } else { 2637 } else {
@@ -2816,14 +2865,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2816 */ 2865 */
2817 arch_start_context_switch(prev); 2866 arch_start_context_switch(prev);
2818 2867
2819 if (unlikely(!mm)) { 2868 if (likely(!mm)) {
2820 next->active_mm = oldmm; 2869 next->active_mm = oldmm;
2821 atomic_inc(&oldmm->mm_count); 2870 atomic_inc(&oldmm->mm_count);
2822 enter_lazy_tlb(oldmm, next); 2871 enter_lazy_tlb(oldmm, next);
2823 } else 2872 } else
2824 switch_mm(oldmm, mm, next); 2873 switch_mm(oldmm, mm, next);
2825 2874
2826 if (unlikely(!prev->mm)) { 2875 if (likely(!prev->mm)) {
2827 prev->active_mm = NULL; 2876 prev->active_mm = NULL;
2828 rq->prev_mm = oldmm; 2877 rq->prev_mm = oldmm;
2829 } 2878 }
@@ -2986,15 +3035,6 @@ static void calc_load_account_active(struct rq *this_rq)
2986} 3035}
2987 3036
2988/* 3037/*
2989 * Externally visible per-cpu scheduler statistics:
2990 * cpu_nr_migrations(cpu) - number of migrations into that cpu
2991 */
2992u64 cpu_nr_migrations(int cpu)
2993{
2994 return cpu_rq(cpu)->nr_migrations_in;
2995}
2996
2997/*
2998 * Update rq->cpu_load[] statistics. This function is usually called every 3038 * Update rq->cpu_load[] statistics. This function is usually called every
2999 * scheduler tick (TICK_NSEC). 3039 * scheduler tick (TICK_NSEC).
3000 */ 3040 */
@@ -3658,6 +3698,7 @@ static void update_group_power(struct sched_domain *sd, int cpu)
3658 3698
3659/** 3699/**
3660 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3700 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3701 * @sd: The sched_domain whose statistics are to be updated.
3661 * @group: sched_group whose statistics are to be updated. 3702 * @group: sched_group whose statistics are to be updated.
3662 * @this_cpu: Cpu for which load balance is currently performed. 3703 * @this_cpu: Cpu for which load balance is currently performed.
3663 * @idle: Idle status of this_cpu 3704 * @idle: Idle status of this_cpu
@@ -4093,7 +4134,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4093 unsigned long flags; 4134 unsigned long flags;
4094 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4135 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4095 4136
4096 cpumask_setall(cpus); 4137 cpumask_copy(cpus, cpu_online_mask);
4097 4138
4098 /* 4139 /*
4099 * When power savings policy is enabled for the parent domain, idle 4140 * When power savings policy is enabled for the parent domain, idle
@@ -4256,7 +4297,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4256 int all_pinned = 0; 4297 int all_pinned = 0;
4257 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4298 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4258 4299
4259 cpumask_setall(cpus); 4300 cpumask_copy(cpus, cpu_online_mask);
4260 4301
4261 /* 4302 /*
4262 * When power savings policy is enabled for the parent domain, idle 4303 * When power savings policy is enabled for the parent domain, idle
@@ -4396,6 +4437,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4396 int pulled_task = 0; 4437 int pulled_task = 0;
4397 unsigned long next_balance = jiffies + HZ; 4438 unsigned long next_balance = jiffies + HZ;
4398 4439
4440 this_rq->idle_stamp = this_rq->clock;
4441
4442 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4443 return;
4444
4399 for_each_domain(this_cpu, sd) { 4445 for_each_domain(this_cpu, sd) {
4400 unsigned long interval; 4446 unsigned long interval;
4401 4447
@@ -4410,8 +4456,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4410 interval = msecs_to_jiffies(sd->balance_interval); 4456 interval = msecs_to_jiffies(sd->balance_interval);
4411 if (time_after(next_balance, sd->last_balance + interval)) 4457 if (time_after(next_balance, sd->last_balance + interval))
4412 next_balance = sd->last_balance + interval; 4458 next_balance = sd->last_balance + interval;
4413 if (pulled_task) 4459 if (pulled_task) {
4460 this_rq->idle_stamp = 0;
4414 break; 4461 break;
4462 }
4415 } 4463 }
4416 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 4464 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4417 /* 4465 /*
@@ -5013,8 +5061,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
5013 p->gtime = cputime_add(p->gtime, cputime); 5061 p->gtime = cputime_add(p->gtime, cputime);
5014 5062
5015 /* Add guest time to cpustat. */ 5063 /* Add guest time to cpustat. */
5016 cpustat->user = cputime64_add(cpustat->user, tmp); 5064 if (TASK_NICE(p) > 0) {
5017 cpustat->guest = cputime64_add(cpustat->guest, tmp); 5065 cpustat->nice = cputime64_add(cpustat->nice, tmp);
5066 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
5067 } else {
5068 cpustat->user = cputime64_add(cpustat->user, tmp);
5069 cpustat->guest = cputime64_add(cpustat->guest, tmp);
5070 }
5018} 5071}
5019 5072
5020/* 5073/*
@@ -5129,60 +5182,86 @@ void account_idle_ticks(unsigned long ticks)
5129 * Use precise platform statistics if available: 5182 * Use precise platform statistics if available:
5130 */ 5183 */
5131#ifdef CONFIG_VIRT_CPU_ACCOUNTING 5184#ifdef CONFIG_VIRT_CPU_ACCOUNTING
5132cputime_t task_utime(struct task_struct *p) 5185void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5133{ 5186{
5134 return p->utime; 5187 *ut = p->utime;
5188 *st = p->stime;
5135} 5189}
5136 5190
5137cputime_t task_stime(struct task_struct *p) 5191void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5138{ 5192{
5139 return p->stime; 5193 struct task_cputime cputime;
5194
5195 thread_group_cputime(p, &cputime);
5196
5197 *ut = cputime.utime;
5198 *st = cputime.stime;
5140} 5199}
5141#else 5200#else
5142cputime_t task_utime(struct task_struct *p) 5201
5202#ifndef nsecs_to_cputime
5203# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
5204#endif
5205
5206void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5143{ 5207{
5144 clock_t utime = cputime_to_clock_t(p->utime), 5208 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
5145 total = utime + cputime_to_clock_t(p->stime);
5146 u64 temp;
5147 5209
5148 /* 5210 /*
5149 * Use CFS's precise accounting: 5211 * Use CFS's precise accounting:
5150 */ 5212 */
5151 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); 5213 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
5152 5214
5153 if (total) { 5215 if (total) {
5154 temp *= utime; 5216 u64 temp;
5217
5218 temp = (u64)(rtime * utime);
5155 do_div(temp, total); 5219 do_div(temp, total);
5156 } 5220 utime = (cputime_t)temp;
5157 utime = (clock_t)temp; 5221 } else
5222 utime = rtime;
5158 5223
5159 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); 5224 /*
5160 return p->prev_utime; 5225 * Compare with previous values, to keep monotonicity:
5226 */
5227 p->prev_utime = max(p->prev_utime, utime);
5228 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
5229
5230 *ut = p->prev_utime;
5231 *st = p->prev_stime;
5161} 5232}
5162 5233
5163cputime_t task_stime(struct task_struct *p) 5234/*
5235 * Must be called with siglock held.
5236 */
5237void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5164{ 5238{
5165 clock_t stime; 5239 struct signal_struct *sig = p->signal;
5240 struct task_cputime cputime;
5241 cputime_t rtime, utime, total;
5166 5242
5167 /* 5243 thread_group_cputime(p, &cputime);
5168 * Use CFS's precise accounting. (we subtract utime from
5169 * the total, to make sure the total observed by userspace
5170 * grows monotonically - apps rely on that):
5171 */
5172 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5173 cputime_to_clock_t(task_utime(p));
5174 5244
5175 if (stime >= 0) 5245 total = cputime_add(cputime.utime, cputime.stime);
5176 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); 5246 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
5177 5247
5178 return p->prev_stime; 5248 if (total) {
5179} 5249 u64 temp;
5180#endif
5181 5250
5182inline cputime_t task_gtime(struct task_struct *p) 5251 temp = (u64)(rtime * cputime.utime);
5183{ 5252 do_div(temp, total);
5184 return p->gtime; 5253 utime = (cputime_t)temp;
5254 } else
5255 utime = rtime;
5256
5257 sig->prev_utime = max(sig->prev_utime, utime);
5258 sig->prev_stime = max(sig->prev_stime,
5259 cputime_sub(rtime, sig->prev_utime));
5260
5261 *ut = sig->prev_utime;
5262 *st = sig->prev_stime;
5185} 5263}
5264#endif
5186 5265
5187/* 5266/*
5188 * This function gets called by the timer code, with HZ frequency. 5267 * This function gets called by the timer code, with HZ frequency.
@@ -5448,7 +5527,7 @@ need_resched_nonpreemptible:
5448} 5527}
5449EXPORT_SYMBOL(schedule); 5528EXPORT_SYMBOL(schedule);
5450 5529
5451#ifdef CONFIG_SMP 5530#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
5452/* 5531/*
5453 * Look out! "owner" is an entirely speculative pointer 5532 * Look out! "owner" is an entirely speculative pointer
5454 * access and not reliable. 5533 * access and not reliable.
@@ -6142,22 +6221,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6142 BUG_ON(p->se.on_rq); 6221 BUG_ON(p->se.on_rq);
6143 6222
6144 p->policy = policy; 6223 p->policy = policy;
6145 switch (p->policy) {
6146 case SCHED_NORMAL:
6147 case SCHED_BATCH:
6148 case SCHED_IDLE:
6149 p->sched_class = &fair_sched_class;
6150 break;
6151 case SCHED_FIFO:
6152 case SCHED_RR:
6153 p->sched_class = &rt_sched_class;
6154 break;
6155 }
6156
6157 p->rt_priority = prio; 6224 p->rt_priority = prio;
6158 p->normal_prio = normal_prio(p); 6225 p->normal_prio = normal_prio(p);
6159 /* we are holding p->pi_lock already */ 6226 /* we are holding p->pi_lock already */
6160 p->prio = rt_mutex_getprio(p); 6227 p->prio = rt_mutex_getprio(p);
6228 if (rt_prio(p->prio))
6229 p->sched_class = &rt_sched_class;
6230 else
6231 p->sched_class = &fair_sched_class;
6161 set_load_weight(p); 6232 set_load_weight(p);
6162} 6233}
6163 6234
@@ -6720,9 +6791,6 @@ EXPORT_SYMBOL(yield);
6720/* 6791/*
6721 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 6792 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6722 * that process accounting knows that this is a task in IO wait state. 6793 * that process accounting knows that this is a task in IO wait state.
6723 *
6724 * But don't do that if it is a deliberate, throttling IO wait (this task
6725 * has set its backing_dev_info: the queue against which it should throttle)
6726 */ 6794 */
6727void __sched io_schedule(void) 6795void __sched io_schedule(void)
6728{ 6796{
@@ -6905,7 +6973,7 @@ void show_state_filter(unsigned long state_filter)
6905 /* 6973 /*
6906 * Only show locks if all tasks are dumped: 6974 * Only show locks if all tasks are dumped:
6907 */ 6975 */
6908 if (state_filter == -1) 6976 if (!state_filter)
6909 debug_show_all_locks(); 6977 debug_show_all_locks();
6910} 6978}
6911 6979
@@ -7710,6 +7778,16 @@ early_initcall(migration_init);
7710 7778
7711#ifdef CONFIG_SCHED_DEBUG 7779#ifdef CONFIG_SCHED_DEBUG
7712 7780
7781static __read_mostly int sched_domain_debug_enabled;
7782
7783static int __init sched_domain_debug_setup(char *str)
7784{
7785 sched_domain_debug_enabled = 1;
7786
7787 return 0;
7788}
7789early_param("sched_debug", sched_domain_debug_setup);
7790
7713static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 7791static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7714 struct cpumask *groupmask) 7792 struct cpumask *groupmask)
7715{ 7793{
@@ -7796,6 +7874,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
7796 cpumask_var_t groupmask; 7874 cpumask_var_t groupmask;
7797 int level = 0; 7875 int level = 0;
7798 7876
7877 if (!sched_domain_debug_enabled)
7878 return;
7879
7799 if (!sd) { 7880 if (!sd) {
7800 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 7881 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7801 return; 7882 return;
@@ -7875,6 +7956,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7875 7956
7876static void free_rootdomain(struct root_domain *rd) 7957static void free_rootdomain(struct root_domain *rd)
7877{ 7958{
7959 synchronize_sched();
7960
7878 cpupri_cleanup(&rd->cpupri); 7961 cpupri_cleanup(&rd->cpupri);
7879 7962
7880 free_cpumask_var(rd->rto_mask); 7963 free_cpumask_var(rd->rto_mask);
@@ -8015,6 +8098,7 @@ static cpumask_var_t cpu_isolated_map;
8015/* Setup the mask of cpus configured for isolated domains */ 8098/* Setup the mask of cpus configured for isolated domains */
8016static int __init isolated_cpu_setup(char *str) 8099static int __init isolated_cpu_setup(char *str)
8017{ 8100{
8101 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8018 cpulist_parse(str, cpu_isolated_map); 8102 cpulist_parse(str, cpu_isolated_map);
8019 return 1; 8103 return 1;
8020} 8104}
@@ -8851,7 +8935,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
8851 return __build_sched_domains(cpu_map, NULL); 8935 return __build_sched_domains(cpu_map, NULL);
8852} 8936}
8853 8937
8854static struct cpumask *doms_cur; /* current sched domains */ 8938static cpumask_var_t *doms_cur; /* current sched domains */
8855static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 8939static int ndoms_cur; /* number of sched domains in 'doms_cur' */
8856static struct sched_domain_attr *dattr_cur; 8940static struct sched_domain_attr *dattr_cur;
8857 /* attribues of custom domains in 'doms_cur' */ 8941 /* attribues of custom domains in 'doms_cur' */
@@ -8873,6 +8957,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
8873 return 0; 8957 return 0;
8874} 8958}
8875 8959
8960cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
8961{
8962 int i;
8963 cpumask_var_t *doms;
8964
8965 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
8966 if (!doms)
8967 return NULL;
8968 for (i = 0; i < ndoms; i++) {
8969 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
8970 free_sched_domains(doms, i);
8971 return NULL;
8972 }
8973 }
8974 return doms;
8975}
8976
8977void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
8978{
8979 unsigned int i;
8980 for (i = 0; i < ndoms; i++)
8981 free_cpumask_var(doms[i]);
8982 kfree(doms);
8983}
8984
8876/* 8985/*
8877 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 8986 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
8878 * For now this just excludes isolated cpus, but could be used to 8987 * For now this just excludes isolated cpus, but could be used to
@@ -8884,12 +8993,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
8884 8993
8885 arch_update_cpu_topology(); 8994 arch_update_cpu_topology();
8886 ndoms_cur = 1; 8995 ndoms_cur = 1;
8887 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); 8996 doms_cur = alloc_sched_domains(ndoms_cur);
8888 if (!doms_cur) 8997 if (!doms_cur)
8889 doms_cur = fallback_doms; 8998 doms_cur = &fallback_doms;
8890 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); 8999 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
8891 dattr_cur = NULL; 9000 dattr_cur = NULL;
8892 err = build_sched_domains(doms_cur); 9001 err = build_sched_domains(doms_cur[0]);
8893 register_sched_domain_sysctl(); 9002 register_sched_domain_sysctl();
8894 9003
8895 return err; 9004 return err;
@@ -8939,19 +9048,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8939 * doms_new[] to the current sched domain partitioning, doms_cur[]. 9048 * doms_new[] to the current sched domain partitioning, doms_cur[].
8940 * It destroys each deleted domain and builds each new domain. 9049 * It destroys each deleted domain and builds each new domain.
8941 * 9050 *
8942 * 'doms_new' is an array of cpumask's of length 'ndoms_new'. 9051 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
8943 * The masks don't intersect (don't overlap.) We should setup one 9052 * The masks don't intersect (don't overlap.) We should setup one
8944 * sched domain for each mask. CPUs not in any of the cpumasks will 9053 * sched domain for each mask. CPUs not in any of the cpumasks will
8945 * not be load balanced. If the same cpumask appears both in the 9054 * not be load balanced. If the same cpumask appears both in the
8946 * current 'doms_cur' domains and in the new 'doms_new', we can leave 9055 * current 'doms_cur' domains and in the new 'doms_new', we can leave
8947 * it as it is. 9056 * it as it is.
8948 * 9057 *
8949 * The passed in 'doms_new' should be kmalloc'd. This routine takes 9058 * The passed in 'doms_new' should be allocated using
8950 * ownership of it and will kfree it when done with it. If the caller 9059 * alloc_sched_domains. This routine takes ownership of it and will
8951 * failed the kmalloc call, then it can pass in doms_new == NULL && 9060 * free_sched_domains it when done with it. If the caller failed the
8952 * ndoms_new == 1, and partition_sched_domains() will fallback to 9061 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
8953 * the single partition 'fallback_doms', it also forces the domains 9062 * and partition_sched_domains() will fallback to the single partition
8954 * to be rebuilt. 9063 * 'fallback_doms', it also forces the domains to be rebuilt.
8955 * 9064 *
8956 * If doms_new == NULL it will be replaced with cpu_online_mask. 9065 * If doms_new == NULL it will be replaced with cpu_online_mask.
8957 * ndoms_new == 0 is a special case for destroying existing domains, 9066 * ndoms_new == 0 is a special case for destroying existing domains,
@@ -8959,8 +9068,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8959 * 9068 *
8960 * Call with hotplug lock held 9069 * Call with hotplug lock held
8961 */ 9070 */
8962/* FIXME: Change to struct cpumask *doms_new[] */ 9071void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
8963void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8964 struct sched_domain_attr *dattr_new) 9072 struct sched_domain_attr *dattr_new)
8965{ 9073{
8966 int i, j, n; 9074 int i, j, n;
@@ -8979,40 +9087,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8979 /* Destroy deleted domains */ 9087 /* Destroy deleted domains */
8980 for (i = 0; i < ndoms_cur; i++) { 9088 for (i = 0; i < ndoms_cur; i++) {
8981 for (j = 0; j < n && !new_topology; j++) { 9089 for (j = 0; j < n && !new_topology; j++) {
8982 if (cpumask_equal(&doms_cur[i], &doms_new[j]) 9090 if (cpumask_equal(doms_cur[i], doms_new[j])
8983 && dattrs_equal(dattr_cur, i, dattr_new, j)) 9091 && dattrs_equal(dattr_cur, i, dattr_new, j))
8984 goto match1; 9092 goto match1;
8985 } 9093 }
8986 /* no match - a current sched domain not in new doms_new[] */ 9094 /* no match - a current sched domain not in new doms_new[] */
8987 detach_destroy_domains(doms_cur + i); 9095 detach_destroy_domains(doms_cur[i]);
8988match1: 9096match1:
8989 ; 9097 ;
8990 } 9098 }
8991 9099
8992 if (doms_new == NULL) { 9100 if (doms_new == NULL) {
8993 ndoms_cur = 0; 9101 ndoms_cur = 0;
8994 doms_new = fallback_doms; 9102 doms_new = &fallback_doms;
8995 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); 9103 cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map);
8996 WARN_ON_ONCE(dattr_new); 9104 WARN_ON_ONCE(dattr_new);
8997 } 9105 }
8998 9106
8999 /* Build new domains */ 9107 /* Build new domains */
9000 for (i = 0; i < ndoms_new; i++) { 9108 for (i = 0; i < ndoms_new; i++) {
9001 for (j = 0; j < ndoms_cur && !new_topology; j++) { 9109 for (j = 0; j < ndoms_cur && !new_topology; j++) {
9002 if (cpumask_equal(&doms_new[i], &doms_cur[j]) 9110 if (cpumask_equal(doms_new[i], doms_cur[j])
9003 && dattrs_equal(dattr_new, i, dattr_cur, j)) 9111 && dattrs_equal(dattr_new, i, dattr_cur, j))
9004 goto match2; 9112 goto match2;
9005 } 9113 }
9006 /* no match - add a new doms_new */ 9114 /* no match - add a new doms_new */
9007 __build_sched_domains(doms_new + i, 9115 __build_sched_domains(doms_new[i],
9008 dattr_new ? dattr_new + i : NULL); 9116 dattr_new ? dattr_new + i : NULL);
9009match2: 9117match2:
9010 ; 9118 ;
9011 } 9119 }
9012 9120
9013 /* Remember the new sched domains */ 9121 /* Remember the new sched domains */
9014 if (doms_cur != fallback_doms) 9122 if (doms_cur != &fallback_doms)
9015 kfree(doms_cur); 9123 free_sched_domains(doms_cur, ndoms_cur);
9016 kfree(dattr_cur); /* kfree(NULL) is safe */ 9124 kfree(dattr_cur); /* kfree(NULL) is safe */
9017 doms_cur = doms_new; 9125 doms_cur = doms_new;
9018 dattr_cur = dattr_new; 9126 dattr_cur = dattr_new;
@@ -9334,10 +9442,6 @@ void __init sched_init(void)
9334#ifdef CONFIG_CPUMASK_OFFSTACK 9442#ifdef CONFIG_CPUMASK_OFFSTACK
9335 alloc_size += num_possible_cpus() * cpumask_size(); 9443 alloc_size += num_possible_cpus() * cpumask_size();
9336#endif 9444#endif
9337 /*
9338 * As sched_init() is called before page_alloc is setup,
9339 * we use alloc_bootmem().
9340 */
9341 if (alloc_size) { 9445 if (alloc_size) {
9342 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 9446 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
9343 9447
@@ -9406,6 +9510,10 @@ void __init sched_init(void)
9406#endif /* CONFIG_USER_SCHED */ 9510#endif /* CONFIG_USER_SCHED */
9407#endif /* CONFIG_GROUP_SCHED */ 9511#endif /* CONFIG_GROUP_SCHED */
9408 9512
9513#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9514 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
9515 __alignof__(unsigned long));
9516#endif
9409 for_each_possible_cpu(i) { 9517 for_each_possible_cpu(i) {
9410 struct rq *rq; 9518 struct rq *rq;
9411 9519
@@ -9488,6 +9596,8 @@ void __init sched_init(void)
9488 rq->cpu = i; 9596 rq->cpu = i;
9489 rq->online = 0; 9597 rq->online = 0;
9490 rq->migration_thread = NULL; 9598 rq->migration_thread = NULL;
9599 rq->idle_stamp = 0;
9600 rq->avg_idle = 2*sysctl_sched_migration_cost;
9491 INIT_LIST_HEAD(&rq->migration_queue); 9601 INIT_LIST_HEAD(&rq->migration_queue);
9492 rq_attach_root(rq, &def_root_domain); 9602 rq_attach_root(rq, &def_root_domain);
9493#endif 9603#endif
@@ -9531,13 +9641,15 @@ void __init sched_init(void)
9531 current->sched_class = &fair_sched_class; 9641 current->sched_class = &fair_sched_class;
9532 9642
9533 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 9643 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
9534 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 9644 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
9535#ifdef CONFIG_SMP 9645#ifdef CONFIG_SMP
9536#ifdef CONFIG_NO_HZ 9646#ifdef CONFIG_NO_HZ
9537 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 9647 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9538 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 9648 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9539#endif 9649#endif
9540 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9650 /* May be allocated at isolcpus cmdline parse time */
9651 if (cpu_isolated_map == NULL)
9652 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9541#endif /* SMP */ 9653#endif /* SMP */
9542 9654
9543 perf_event_init(); 9655 perf_event_init();
@@ -10312,7 +10424,7 @@ static int sched_rt_global_constraints(void)
10312#endif /* CONFIG_RT_GROUP_SCHED */ 10424#endif /* CONFIG_RT_GROUP_SCHED */
10313 10425
10314int sched_rt_handler(struct ctl_table *table, int write, 10426int sched_rt_handler(struct ctl_table *table, int write,
10315 struct file *filp, void __user *buffer, size_t *lenp, 10427 void __user *buffer, size_t *lenp,
10316 loff_t *ppos) 10428 loff_t *ppos)
10317{ 10429{
10318 int ret; 10430 int ret;
@@ -10323,7 +10435,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
10323 old_period = sysctl_sched_rt_period; 10435 old_period = sysctl_sched_rt_period;
10324 old_runtime = sysctl_sched_rt_runtime; 10436 old_runtime = sysctl_sched_rt_runtime;
10325 10437
10326 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); 10438 ret = proc_dointvec(table, write, buffer, lenp, ppos);
10327 10439
10328 if (!ret && write) { 10440 if (!ret && write) {
10329 ret = sched_rt_global_constraints(); 10441 ret = sched_rt_global_constraints();
@@ -10377,8 +10489,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
10377} 10489}
10378 10490
10379static int 10491static int
10380cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10492cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
10381 struct task_struct *tsk)
10382{ 10493{
10383#ifdef CONFIG_RT_GROUP_SCHED 10494#ifdef CONFIG_RT_GROUP_SCHED
10384 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 10495 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
@@ -10388,15 +10499,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10388 if (tsk->sched_class != &fair_sched_class) 10499 if (tsk->sched_class != &fair_sched_class)
10389 return -EINVAL; 10500 return -EINVAL;
10390#endif 10501#endif
10502 return 0;
10503}
10391 10504
10505static int
10506cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10507 struct task_struct *tsk, bool threadgroup)
10508{
10509 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
10510 if (retval)
10511 return retval;
10512 if (threadgroup) {
10513 struct task_struct *c;
10514 rcu_read_lock();
10515 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10516 retval = cpu_cgroup_can_attach_task(cgrp, c);
10517 if (retval) {
10518 rcu_read_unlock();
10519 return retval;
10520 }
10521 }
10522 rcu_read_unlock();
10523 }
10392 return 0; 10524 return 0;
10393} 10525}
10394 10526
10395static void 10527static void
10396cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10528cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10397 struct cgroup *old_cont, struct task_struct *tsk) 10529 struct cgroup *old_cont, struct task_struct *tsk,
10530 bool threadgroup)
10398{ 10531{
10399 sched_move_task(tsk); 10532 sched_move_task(tsk);
10533 if (threadgroup) {
10534 struct task_struct *c;
10535 rcu_read_lock();
10536 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10537 sched_move_task(c);
10538 }
10539 rcu_read_unlock();
10540 }
10400} 10541}
10401 10542
10402#ifdef CONFIG_FAIR_GROUP_SCHED 10543#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -10838,6 +10979,7 @@ void synchronize_sched_expedited(void)
10838 spin_unlock_irqrestore(&rq->lock, flags); 10979 spin_unlock_irqrestore(&rq->lock, flags);
10839 } 10980 }
10840 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10981 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10982 synchronize_sched_expedited_count++;
10841 mutex_unlock(&rcu_sched_expedited_mutex); 10983 mutex_unlock(&rcu_sched_expedited_mutex);
10842 put_online_cpus(); 10984 put_online_cpus();
10843 if (need_full_sync) 10985 if (need_full_sync)
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index ac2e1dc708bd..479ce5682d7c 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -127,7 +127,7 @@ again:
127 clock = wrap_max(clock, min_clock); 127 clock = wrap_max(clock, min_clock);
128 clock = wrap_min(clock, max_clock); 128 clock = wrap_min(clock, max_clock);
129 129
130 if (cmpxchg(&scd->clock, old_clock, clock) != old_clock) 130 if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
131 goto again; 131 goto again;
132 132
133 return clock; 133 return clock;
@@ -163,7 +163,7 @@ again:
163 val = remote_clock; 163 val = remote_clock;
164 } 164 }
165 165
166 if (cmpxchg(ptr, old_val, val) != old_val) 166 if (cmpxchg64(ptr, old_val, val) != old_val)
167 goto again; 167 goto again;
168 168
169 return val; 169 return val;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index efb84409bc43..6988cf08f705 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -285,12 +285,16 @@ static void print_cpu(struct seq_file *m, int cpu)
285 285
286#ifdef CONFIG_SCHEDSTATS 286#ifdef CONFIG_SCHEDSTATS
287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); 287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
288#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
288 289
289 P(yld_count); 290 P(yld_count);
290 291
291 P(sched_switch); 292 P(sched_switch);
292 P(sched_count); 293 P(sched_count);
293 P(sched_goidle); 294 P(sched_goidle);
295#ifdef CONFIG_SMP
296 P64(avg_idle);
297#endif
294 298
295 P(ttwu_count); 299 P(ttwu_count);
296 P(ttwu_local); 300 P(ttwu_local);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ecc637a0d591..f61837ad336d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
384 384
385#ifdef CONFIG_SCHED_DEBUG 385#ifdef CONFIG_SCHED_DEBUG
386int sched_nr_latency_handler(struct ctl_table *table, int write, 386int sched_nr_latency_handler(struct ctl_table *table, int write,
387 struct file *filp, void __user *buffer, size_t *lenp, 387 void __user *buffer, size_t *lenp,
388 loff_t *ppos) 388 loff_t *ppos)
389{ 389{
390 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 390 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
391 391
392 if (ret || !write) 392 if (ret || !write)
393 return ret; 393 return ret;
@@ -822,6 +822,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
822 * re-elected due to buddy favours. 822 * re-elected due to buddy favours.
823 */ 823 */
824 clear_buddies(cfs_rq, curr); 824 clear_buddies(cfs_rq, curr);
825 return;
826 }
827
828 /*
829 * Ensure that a task that missed wakeup preemption by a
830 * narrow margin doesn't have to wait for a full slice.
831 * This also mitigates buddy induced latencies under load.
832 */
833 if (!sched_feat(WAKEUP_PREEMPT))
834 return;
835
836 if (delta_exec < sysctl_sched_min_granularity)
837 return;
838
839 if (cfs_rq->nr_running > 1) {
840 struct sched_entity *se = __pick_next_entity(cfs_rq);
841 s64 delta = curr->vruntime - se->vruntime;
842
843 if (delta > ideal_runtime)
844 resched_task(rq_of(cfs_rq)->curr);
825 } 845 }
826} 846}
827 847
@@ -861,12 +881,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
861static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 881static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
862{ 882{
863 struct sched_entity *se = __pick_next_entity(cfs_rq); 883 struct sched_entity *se = __pick_next_entity(cfs_rq);
884 struct sched_entity *left = se;
864 885
865 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) 886 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
866 return cfs_rq->next; 887 se = cfs_rq->next;
867 888
868 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) 889 /*
869 return cfs_rq->last; 890 * Prefer last buddy, try to return the CPU to a preempted task.
891 */
892 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
893 se = cfs_rq->last;
894
895 clear_buddies(cfs_rq, se);
870 896
871 return se; 897 return se;
872} 898}
@@ -1319,6 +1345,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1319} 1345}
1320 1346
1321/* 1347/*
1348 * Try and locate an idle CPU in the sched_domain.
1349 */
1350static int
1351select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1352{
1353 int cpu = smp_processor_id();
1354 int prev_cpu = task_cpu(p);
1355 int i;
1356
1357 /*
1358 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
1359 * test in select_task_rq_fair) and the prev_cpu is idle then that's
1360 * always a better target than the current cpu.
1361 */
1362 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
1363 return prev_cpu;
1364
1365 /*
1366 * Otherwise, iterate the domain and find an elegible idle cpu.
1367 */
1368 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1369 if (!cpu_rq(i)->cfs.nr_running) {
1370 target = i;
1371 break;
1372 }
1373 }
1374
1375 return target;
1376}
1377
1378/*
1322 * sched_balance_self: balance the current task (running on cpu) in domains 1379 * sched_balance_self: balance the current task (running on cpu) in domains
1323 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 1380 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1324 * SD_BALANCE_EXEC. 1381 * SD_BALANCE_EXEC.
@@ -1372,11 +1429,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1372 want_sd = 0; 1429 want_sd = 0;
1373 } 1430 }
1374 1431
1375 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 1432 /*
1376 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { 1433 * While iterating the domains looking for a spanning
1434 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
1435 * in cache sharing domains along the way.
1436 */
1437 if (want_affine) {
1438 int target = -1;
1377 1439
1378 affine_sd = tmp; 1440 /*
1379 want_affine = 0; 1441 * If both cpu and prev_cpu are part of this domain,
1442 * cpu is a valid SD_WAKE_AFFINE target.
1443 */
1444 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1445 target = cpu;
1446
1447 /*
1448 * If there's an idle sibling in this domain, make that
1449 * the wake_affine target instead of the current cpu.
1450 */
1451 if (tmp->flags & SD_PREFER_SIBLING)
1452 target = select_idle_sibling(p, tmp, target);
1453
1454 if (target >= 0) {
1455 if (tmp->flags & SD_WAKE_AFFINE) {
1456 affine_sd = tmp;
1457 want_affine = 0;
1458 }
1459 cpu = target;
1460 }
1380 } 1461 }
1381 1462
1382 if (!want_sd && !want_affine) 1463 if (!want_sd && !want_affine)
@@ -1568,6 +1649,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1568 struct sched_entity *se = &curr->se, *pse = &p->se; 1649 struct sched_entity *se = &curr->se, *pse = &p->se;
1569 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1650 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1570 int sync = wake_flags & WF_SYNC; 1651 int sync = wake_flags & WF_SYNC;
1652 int scale = cfs_rq->nr_running >= sched_nr_latency;
1571 1653
1572 update_curr(cfs_rq); 1654 update_curr(cfs_rq);
1573 1655
@@ -1582,18 +1664,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1582 if (unlikely(se == pse)) 1664 if (unlikely(se == pse))
1583 return; 1665 return;
1584 1666
1585 /* 1667 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
1586 * Only set the backward buddy when the current task is still on the
1587 * rq. This can happen when a wakeup gets interleaved with schedule on
1588 * the ->pre_schedule() or idle_balance() point, either of which can
1589 * drop the rq lock.
1590 *
1591 * Also, during early boot the idle thread is in the fair class, for
1592 * obvious reasons its a bad idea to schedule back to the idle thread.
1593 */
1594 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1595 set_last_buddy(se);
1596 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
1597 set_next_buddy(pse); 1668 set_next_buddy(pse);
1598 1669
1599 /* 1670 /*
@@ -1639,8 +1710,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1639 1710
1640 BUG_ON(!pse); 1711 BUG_ON(!pse);
1641 1712
1642 if (wakeup_preempt_entity(se, pse) == 1) 1713 if (wakeup_preempt_entity(se, pse) == 1) {
1643 resched_task(curr); 1714 resched_task(curr);
1715 /*
1716 * Only set the backward buddy when the current task is still
1717 * on the rq. This can happen when a wakeup gets interleaved
1718 * with schedule on the ->pre_schedule() or idle_balance()
1719 * point, either of which can * drop the rq lock.
1720 *
1721 * Also, during early boot the idle thread is in the fair class,
1722 * for obvious reasons its a bad idea to schedule back to it.
1723 */
1724 if (unlikely(!se->on_rq || curr == rq->idle))
1725 return;
1726 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1727 set_last_buddy(se);
1728 }
1644} 1729}
1645 1730
1646static struct task_struct *pick_next_task_fair(struct rq *rq) 1731static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1649,21 +1734,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1649 struct cfs_rq *cfs_rq = &rq->cfs; 1734 struct cfs_rq *cfs_rq = &rq->cfs;
1650 struct sched_entity *se; 1735 struct sched_entity *se;
1651 1736
1652 if (unlikely(!cfs_rq->nr_running)) 1737 if (!cfs_rq->nr_running)
1653 return NULL; 1738 return NULL;
1654 1739
1655 do { 1740 do {
1656 se = pick_next_entity(cfs_rq); 1741 se = pick_next_entity(cfs_rq);
1657 /*
1658 * If se was a buddy, clear it so that it will have to earn
1659 * the favour again.
1660 *
1661 * If se was not a buddy, clear the buddies because neither
1662 * was elegible to run, let them earn it again.
1663 *
1664 * IOW. unconditionally clear buddies.
1665 */
1666 __clear_buddies(cfs_rq, NULL);
1667 set_next_entity(cfs_rq, se); 1742 set_next_entity(cfs_rq, se);
1668 cfs_rq = group_cfs_rq(se); 1743 cfs_rq = group_cfs_rq(se);
1669 } while (cfs_rq); 1744 } while (cfs_rq);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a4d790cddb19..5c5fef378415 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1153,29 +1153,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1153 1153
1154static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1154static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1155 1155
1156static inline int pick_optimal_cpu(int this_cpu,
1157 const struct cpumask *mask)
1158{
1159 int first;
1160
1161 /* "this_cpu" is cheaper to preempt than a remote processor */
1162 if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
1163 return this_cpu;
1164
1165 first = cpumask_first(mask);
1166 if (first < nr_cpu_ids)
1167 return first;
1168
1169 return -1;
1170}
1171
1172static int find_lowest_rq(struct task_struct *task) 1156static int find_lowest_rq(struct task_struct *task)
1173{ 1157{
1174 struct sched_domain *sd; 1158 struct sched_domain *sd;
1175 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); 1159 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1176 int this_cpu = smp_processor_id(); 1160 int this_cpu = smp_processor_id();
1177 int cpu = task_cpu(task); 1161 int cpu = task_cpu(task);
1178 cpumask_var_t domain_mask;
1179 1162
1180 if (task->rt.nr_cpus_allowed == 1) 1163 if (task->rt.nr_cpus_allowed == 1)
1181 return -1; /* No other targets possible */ 1164 return -1; /* No other targets possible */
@@ -1198,28 +1181,26 @@ static int find_lowest_rq(struct task_struct *task)
1198 * Otherwise, we consult the sched_domains span maps to figure 1181 * Otherwise, we consult the sched_domains span maps to figure
1199 * out which cpu is logically closest to our hot cache data. 1182 * out which cpu is logically closest to our hot cache data.
1200 */ 1183 */
1201 if (this_cpu == cpu) 1184 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1202 this_cpu = -1; /* Skip this_cpu opt if the same */ 1185 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1203
1204 if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
1205 for_each_domain(cpu, sd) {
1206 if (sd->flags & SD_WAKE_AFFINE) {
1207 int best_cpu;
1208 1186
1209 cpumask_and(domain_mask, 1187 for_each_domain(cpu, sd) {
1210 sched_domain_span(sd), 1188 if (sd->flags & SD_WAKE_AFFINE) {
1211 lowest_mask); 1189 int best_cpu;
1212 1190
1213 best_cpu = pick_optimal_cpu(this_cpu, 1191 /*
1214 domain_mask); 1192 * "this_cpu" is cheaper to preempt than a
1215 1193 * remote processor.
1216 if (best_cpu != -1) { 1194 */
1217 free_cpumask_var(domain_mask); 1195 if (this_cpu != -1 &&
1218 return best_cpu; 1196 cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
1219 } 1197 return this_cpu;
1220 } 1198
1199 best_cpu = cpumask_first_and(lowest_mask,
1200 sched_domain_span(sd));
1201 if (best_cpu < nr_cpu_ids)
1202 return best_cpu;
1221 } 1203 }
1222 free_cpumask_var(domain_mask);
1223 } 1204 }
1224 1205
1225 /* 1206 /*
@@ -1227,7 +1208,13 @@ static int find_lowest_rq(struct task_struct *task)
1227 * just give the caller *something* to work with from the compatible 1208 * just give the caller *something* to work with from the compatible
1228 * locations. 1209 * locations.
1229 */ 1210 */
1230 return pick_optimal_cpu(this_cpu, lowest_mask); 1211 if (this_cpu != -1)
1212 return this_cpu;
1213
1214 cpu = cpumask_any(lowest_mask);
1215 if (cpu < nr_cpu_ids)
1216 return cpu;
1217 return -1;
1231} 1218}
1232 1219
1233/* Will lock the rq it finds */ 1220/* Will lock the rq it finds */
diff --git a/kernel/signal.c b/kernel/signal.c
index 64c5deeaca5d..6b982f2cf524 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,12 +22,14 @@
22#include <linux/ptrace.h> 22#include <linux/ptrace.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/signalfd.h> 24#include <linux/signalfd.h>
25#include <linux/ratelimit.h>
25#include <linux/tracehook.h> 26#include <linux/tracehook.h>
26#include <linux/capability.h> 27#include <linux/capability.h>
27#include <linux/freezer.h> 28#include <linux/freezer.h>
28#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
29#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
30#include <trace/events/sched.h> 31#define CREATE_TRACE_POINTS
32#include <trace/events/signal.h>
31 33
32#include <asm/param.h> 34#include <asm/param.h>
33#include <asm/uaccess.h> 35#include <asm/uaccess.h>
@@ -41,6 +43,8 @@
41 43
42static struct kmem_cache *sigqueue_cachep; 44static struct kmem_cache *sigqueue_cachep;
43 45
46int print_fatal_signals __read_mostly;
47
44static void __user *sig_handler(struct task_struct *t, int sig) 48static void __user *sig_handler(struct task_struct *t, int sig)
45{ 49{
46 return t->sighand->action[sig - 1].sa.sa_handler; 50 return t->sighand->action[sig - 1].sa.sa_handler;
@@ -159,7 +163,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
159{ 163{
160 unsigned long i, *s, *m, x; 164 unsigned long i, *s, *m, x;
161 int sig = 0; 165 int sig = 0;
162 166
163 s = pending->signal.sig; 167 s = pending->signal.sig;
164 m = mask->sig; 168 m = mask->sig;
165 switch (_NSIG_WORDS) { 169 switch (_NSIG_WORDS) {
@@ -184,17 +188,31 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
184 sig = ffz(~x) + 1; 188 sig = ffz(~x) + 1;
185 break; 189 break;
186 } 190 }
187 191
188 return sig; 192 return sig;
189} 193}
190 194
195static inline void print_dropped_signal(int sig)
196{
197 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
198
199 if (!print_fatal_signals)
200 return;
201
202 if (!__ratelimit(&ratelimit_state))
203 return;
204
205 printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
206 current->comm, current->pid, sig);
207}
208
191/* 209/*
192 * allocate a new signal queue record 210 * allocate a new signal queue record
193 * - this may be called without locks if and only if t == current, otherwise an 211 * - this may be called without locks if and only if t == current, otherwise an
194 * appopriate lock must be held to stop the target task from exiting 212 * appopriate lock must be held to stop the target task from exiting
195 */ 213 */
196static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, 214static struct sigqueue *
197 int override_rlimit) 215__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
198{ 216{
199 struct sigqueue *q = NULL; 217 struct sigqueue *q = NULL;
200 struct user_struct *user; 218 struct user_struct *user;
@@ -207,10 +225,15 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
207 */ 225 */
208 user = get_uid(__task_cred(t)->user); 226 user = get_uid(__task_cred(t)->user);
209 atomic_inc(&user->sigpending); 227 atomic_inc(&user->sigpending);
228
210 if (override_rlimit || 229 if (override_rlimit ||
211 atomic_read(&user->sigpending) <= 230 atomic_read(&user->sigpending) <=
212 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) 231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) {
213 q = kmem_cache_alloc(sigqueue_cachep, flags); 232 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else {
234 print_dropped_signal(sig);
235 }
236
214 if (unlikely(q == NULL)) { 237 if (unlikely(q == NULL)) {
215 atomic_dec(&user->sigpending); 238 atomic_dec(&user->sigpending);
216 free_uid(user); 239 free_uid(user);
@@ -705,7 +728,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
705 728
706 if (why) { 729 if (why) {
707 /* 730 /*
708 * The first thread which returns from finish_stop() 731 * The first thread which returns from do_signal_stop()
709 * will take ->siglock, notice SIGNAL_CLD_MASK, and 732 * will take ->siglock, notice SIGNAL_CLD_MASK, and
710 * notify its parent. See get_signal_to_deliver(). 733 * notify its parent. See get_signal_to_deliver().
711 */ 734 */
@@ -834,7 +857,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
834 struct sigqueue *q; 857 struct sigqueue *q;
835 int override_rlimit; 858 int override_rlimit;
836 859
837 trace_sched_signal_send(sig, t); 860 trace_signal_generate(sig, info, t);
838 861
839 assert_spin_locked(&t->sighand->siglock); 862 assert_spin_locked(&t->sighand->siglock);
840 863
@@ -869,7 +892,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
869 else 892 else
870 override_rlimit = 0; 893 override_rlimit = 0;
871 894
872 q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, 895 q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
873 override_rlimit); 896 override_rlimit);
874 if (q) { 897 if (q) {
875 list_add_tail(&q->list, &pending->list); 898 list_add_tail(&q->list, &pending->list);
@@ -896,12 +919,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
896 break; 919 break;
897 } 920 }
898 } else if (!is_si_special(info)) { 921 } else if (!is_si_special(info)) {
899 if (sig >= SIGRTMIN && info->si_code != SI_USER) 922 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
900 /* 923 /*
901 * Queue overflow, abort. We may abort if the signal was rt 924 * Queue overflow, abort. We may abort if the
902 * and sent by user using something other than kill(). 925 * signal was rt and sent by user using something
903 */ 926 * other than kill().
927 */
928 trace_signal_overflow_fail(sig, group, info);
904 return -EAGAIN; 929 return -EAGAIN;
930 } else {
931 /*
932 * This is a silent loss of information. We still
933 * send the signal, but the *info bits are lost.
934 */
935 trace_signal_lose_info(sig, group, info);
936 }
905 } 937 }
906 938
907out_set: 939out_set:
@@ -925,8 +957,6 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
925 return __send_signal(sig, info, t, group, from_ancestor_ns); 957 return __send_signal(sig, info, t, group, from_ancestor_ns);
926} 958}
927 959
928int print_fatal_signals;
929
930static void print_fatal_signal(struct pt_regs *regs, int signr) 960static void print_fatal_signal(struct pt_regs *regs, int signr)
931{ 961{
932 printk("%s/%d: potentially unexpected fatal signal %d.\n", 962 printk("%s/%d: potentially unexpected fatal signal %d.\n",
@@ -971,6 +1001,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
971 return send_signal(sig, info, t, 0); 1001 return send_signal(sig, info, t, 0);
972} 1002}
973 1003
1004int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
1005 bool group)
1006{
1007 unsigned long flags;
1008 int ret = -ESRCH;
1009
1010 if (lock_task_sighand(p, &flags)) {
1011 ret = send_signal(sig, info, p, group);
1012 unlock_task_sighand(p, &flags);
1013 }
1014
1015 return ret;
1016}
1017
974/* 1018/*
975 * Force a signal that the process can't ignore: if necessary 1019 * Force a signal that the process can't ignore: if necessary
976 * we unblock the signal and change any SIG_IGN to SIG_DFL. 1020 * we unblock the signal and change any SIG_IGN to SIG_DFL.
@@ -1036,12 +1080,6 @@ void zap_other_threads(struct task_struct *p)
1036 } 1080 }
1037} 1081}
1038 1082
1039int __fatal_signal_pending(struct task_struct *tsk)
1040{
1041 return sigismember(&tsk->pending.signal, SIGKILL);
1042}
1043EXPORT_SYMBOL(__fatal_signal_pending);
1044
1045struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1083struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
1046{ 1084{
1047 struct sighand_struct *sighand; 1085 struct sighand_struct *sighand;
@@ -1068,18 +1106,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1068 */ 1106 */
1069int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1107int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1070{ 1108{
1071 unsigned long flags; 1109 int ret = check_kill_permission(sig, info, p);
1072 int ret;
1073
1074 ret = check_kill_permission(sig, info, p);
1075 1110
1076 if (!ret && sig) { 1111 if (!ret && sig)
1077 ret = -ESRCH; 1112 ret = do_send_sig_info(sig, info, p, true);
1078 if (lock_task_sighand(p, &flags)) {
1079 ret = __group_send_sig_info(sig, info, p);
1080 unlock_task_sighand(p, &flags);
1081 }
1082 }
1083 1113
1084 return ret; 1114 return ret;
1085} 1115}
@@ -1224,15 +1254,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1224 * These are for backward compatibility with the rest of the kernel source. 1254 * These are for backward compatibility with the rest of the kernel source.
1225 */ 1255 */
1226 1256
1227/*
1228 * The caller must ensure the task can't exit.
1229 */
1230int 1257int
1231send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1258send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1232{ 1259{
1233 int ret;
1234 unsigned long flags;
1235
1236 /* 1260 /*
1237 * Make sure legacy kernel users don't send in bad values 1261 * Make sure legacy kernel users don't send in bad values
1238 * (normal paths check this in check_kill_permission). 1262 * (normal paths check this in check_kill_permission).
@@ -1240,10 +1264,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1240 if (!valid_signal(sig)) 1264 if (!valid_signal(sig))
1241 return -EINVAL; 1265 return -EINVAL;
1242 1266
1243 spin_lock_irqsave(&p->sighand->siglock, flags); 1267 return do_send_sig_info(sig, info, p, false);
1244 ret = specific_send_sig_info(sig, info, p);
1245 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1246 return ret;
1247} 1268}
1248 1269
1249#define __si_special(priv) \ 1270#define __si_special(priv) \
@@ -1302,19 +1323,19 @@ EXPORT_SYMBOL(kill_pid);
1302 * These functions support sending signals using preallocated sigqueue 1323 * These functions support sending signals using preallocated sigqueue
1303 * structures. This is needed "because realtime applications cannot 1324 * structures. This is needed "because realtime applications cannot
1304 * afford to lose notifications of asynchronous events, like timer 1325 * afford to lose notifications of asynchronous events, like timer
1305 * expirations or I/O completions". In the case of Posix Timers 1326 * expirations or I/O completions". In the case of Posix Timers
1306 * we allocate the sigqueue structure from the timer_create. If this 1327 * we allocate the sigqueue structure from the timer_create. If this
1307 * allocation fails we are able to report the failure to the application 1328 * allocation fails we are able to report the failure to the application
1308 * with an EAGAIN error. 1329 * with an EAGAIN error.
1309 */ 1330 */
1310
1311struct sigqueue *sigqueue_alloc(void) 1331struct sigqueue *sigqueue_alloc(void)
1312{ 1332{
1313 struct sigqueue *q; 1333 struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
1314 1334
1315 if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) 1335 if (q)
1316 q->flags |= SIGQUEUE_PREALLOC; 1336 q->flags |= SIGQUEUE_PREALLOC;
1317 return(q); 1337
1338 return q;
1318} 1339}
1319 1340
1320void sigqueue_free(struct sigqueue *q) 1341void sigqueue_free(struct sigqueue *q)
@@ -1383,15 +1404,6 @@ ret:
1383} 1404}
1384 1405
1385/* 1406/*
1386 * Wake up any threads in the parent blocked in wait* syscalls.
1387 */
1388static inline void __wake_up_parent(struct task_struct *p,
1389 struct task_struct *parent)
1390{
1391 wake_up_interruptible_sync(&parent->signal->wait_chldexit);
1392}
1393
1394/*
1395 * Let a parent know about the death of a child. 1407 * Let a parent know about the death of a child.
1396 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1408 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1397 * 1409 *
@@ -1673,29 +1685,6 @@ void ptrace_notify(int exit_code)
1673 spin_unlock_irq(&current->sighand->siglock); 1685 spin_unlock_irq(&current->sighand->siglock);
1674} 1686}
1675 1687
1676static void
1677finish_stop(int stop_count)
1678{
1679 /*
1680 * If there are no other threads in the group, or if there is
1681 * a group stop in progress and we are the last to stop,
1682 * report to the parent. When ptraced, every thread reports itself.
1683 */
1684 if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
1685 read_lock(&tasklist_lock);
1686 do_notify_parent_cldstop(current, CLD_STOPPED);
1687 read_unlock(&tasklist_lock);
1688 }
1689
1690 do {
1691 schedule();
1692 } while (try_to_freeze());
1693 /*
1694 * Now we don't run again until continued.
1695 */
1696 current->exit_code = 0;
1697}
1698
1699/* 1688/*
1700 * This performs the stopping for SIGSTOP and other stop signals. 1689 * This performs the stopping for SIGSTOP and other stop signals.
1701 * We have to stop all threads in the thread group. 1690 * We have to stop all threads in the thread group.
@@ -1705,15 +1694,9 @@ finish_stop(int stop_count)
1705static int do_signal_stop(int signr) 1694static int do_signal_stop(int signr)
1706{ 1695{
1707 struct signal_struct *sig = current->signal; 1696 struct signal_struct *sig = current->signal;
1708 int stop_count; 1697 int notify;
1709 1698
1710 if (sig->group_stop_count > 0) { 1699 if (!sig->group_stop_count) {
1711 /*
1712 * There is a group stop in progress. We don't need to
1713 * start another one.
1714 */
1715 stop_count = --sig->group_stop_count;
1716 } else {
1717 struct task_struct *t; 1700 struct task_struct *t;
1718 1701
1719 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1702 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
@@ -1725,7 +1708,7 @@ static int do_signal_stop(int signr)
1725 */ 1708 */
1726 sig->group_exit_code = signr; 1709 sig->group_exit_code = signr;
1727 1710
1728 stop_count = 0; 1711 sig->group_stop_count = 1;
1729 for (t = next_thread(current); t != current; t = next_thread(t)) 1712 for (t = next_thread(current); t != current; t = next_thread(t))
1730 /* 1713 /*
1731 * Setting state to TASK_STOPPED for a group 1714 * Setting state to TASK_STOPPED for a group
@@ -1734,19 +1717,44 @@ static int do_signal_stop(int signr)
1734 */ 1717 */
1735 if (!(t->flags & PF_EXITING) && 1718 if (!(t->flags & PF_EXITING) &&
1736 !task_is_stopped_or_traced(t)) { 1719 !task_is_stopped_or_traced(t)) {
1737 stop_count++; 1720 sig->group_stop_count++;
1738 signal_wake_up(t, 0); 1721 signal_wake_up(t, 0);
1739 } 1722 }
1740 sig->group_stop_count = stop_count;
1741 } 1723 }
1724 /*
1725 * If there are no other threads in the group, or if there is
1726 * a group stop in progress and we are the last to stop, report
1727 * to the parent. When ptraced, every thread reports itself.
1728 */
1729 notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0;
1730 notify = tracehook_notify_jctl(notify, CLD_STOPPED);
1731 /*
1732 * tracehook_notify_jctl() can drop and reacquire siglock, so
1733 * we keep ->group_stop_count != 0 before the call. If SIGCONT
1734 * or SIGKILL comes in between ->group_stop_count == 0.
1735 */
1736 if (sig->group_stop_count) {
1737 if (!--sig->group_stop_count)
1738 sig->flags = SIGNAL_STOP_STOPPED;
1739 current->exit_code = sig->group_exit_code;
1740 __set_current_state(TASK_STOPPED);
1741 }
1742 spin_unlock_irq(&current->sighand->siglock);
1742 1743
1743 if (stop_count == 0) 1744 if (notify) {
1744 sig->flags = SIGNAL_STOP_STOPPED; 1745 read_lock(&tasklist_lock);
1745 current->exit_code = sig->group_exit_code; 1746 do_notify_parent_cldstop(current, notify);
1746 __set_current_state(TASK_STOPPED); 1747 read_unlock(&tasklist_lock);
1748 }
1749
1750 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1751 do {
1752 schedule();
1753 } while (try_to_freeze());
1754
1755 tracehook_finish_jctl();
1756 current->exit_code = 0;
1747 1757
1748 spin_unlock_irq(&current->sighand->siglock);
1749 finish_stop(stop_count);
1750 return 1; 1758 return 1;
1751} 1759}
1752 1760
@@ -1815,14 +1823,15 @@ relock:
1815 int why = (signal->flags & SIGNAL_STOP_CONTINUED) 1823 int why = (signal->flags & SIGNAL_STOP_CONTINUED)
1816 ? CLD_CONTINUED : CLD_STOPPED; 1824 ? CLD_CONTINUED : CLD_STOPPED;
1817 signal->flags &= ~SIGNAL_CLD_MASK; 1825 signal->flags &= ~SIGNAL_CLD_MASK;
1818 spin_unlock_irq(&sighand->siglock);
1819 1826
1820 if (unlikely(!tracehook_notify_jctl(1, why))) 1827 why = tracehook_notify_jctl(why, CLD_CONTINUED);
1821 goto relock; 1828 spin_unlock_irq(&sighand->siglock);
1822 1829
1823 read_lock(&tasklist_lock); 1830 if (why) {
1824 do_notify_parent_cldstop(current->group_leader, why); 1831 read_lock(&tasklist_lock);
1825 read_unlock(&tasklist_lock); 1832 do_notify_parent_cldstop(current->group_leader, why);
1833 read_unlock(&tasklist_lock);
1834 }
1826 goto relock; 1835 goto relock;
1827 } 1836 }
1828 1837
@@ -1860,6 +1869,9 @@ relock:
1860 ka = &sighand->action[signr-1]; 1869 ka = &sighand->action[signr-1];
1861 } 1870 }
1862 1871
1872 /* Trace actually delivered signals. */
1873 trace_signal_deliver(signr, info, ka);
1874
1863 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1875 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1864 continue; 1876 continue;
1865 if (ka->sa.sa_handler != SIG_DFL) { 1877 if (ka->sa.sa_handler != SIG_DFL) {
@@ -1987,14 +1999,14 @@ void exit_signals(struct task_struct *tsk)
1987 if (unlikely(tsk->signal->group_stop_count) && 1999 if (unlikely(tsk->signal->group_stop_count) &&
1988 !--tsk->signal->group_stop_count) { 2000 !--tsk->signal->group_stop_count) {
1989 tsk->signal->flags = SIGNAL_STOP_STOPPED; 2001 tsk->signal->flags = SIGNAL_STOP_STOPPED;
1990 group_stop = 1; 2002 group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
1991 } 2003 }
1992out: 2004out:
1993 spin_unlock_irq(&tsk->sighand->siglock); 2005 spin_unlock_irq(&tsk->sighand->siglock);
1994 2006
1995 if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { 2007 if (unlikely(group_stop)) {
1996 read_lock(&tasklist_lock); 2008 read_lock(&tasklist_lock);
1997 do_notify_parent_cldstop(tsk, CLD_STOPPED); 2009 do_notify_parent_cldstop(tsk, group_stop);
1998 read_unlock(&tasklist_lock); 2010 read_unlock(&tasklist_lock);
1999 } 2011 }
2000} 2012}
@@ -2290,7 +2302,6 @@ static int
2290do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) 2302do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2291{ 2303{
2292 struct task_struct *p; 2304 struct task_struct *p;
2293 unsigned long flags;
2294 int error = -ESRCH; 2305 int error = -ESRCH;
2295 2306
2296 rcu_read_lock(); 2307 rcu_read_lock();
@@ -2300,14 +2311,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2300 /* 2311 /*
2301 * The null signal is a permissions and process existence 2312 * The null signal is a permissions and process existence
2302 * probe. No signal is actually delivered. 2313 * probe. No signal is actually delivered.
2303 *
2304 * If lock_task_sighand() fails we pretend the task dies
2305 * after receiving the signal. The window is tiny, and the
2306 * signal is private anyway.
2307 */ 2314 */
2308 if (!error && sig && lock_task_sighand(p, &flags)) { 2315 if (!error && sig) {
2309 error = specific_send_sig_info(sig, info, p); 2316 error = do_send_sig_info(sig, info, p, false);
2310 unlock_task_sighand(p, &flags); 2317 /*
2318 * If lock_task_sighand() failed we pretend the task
2319 * dies after receiving the signal. The window is tiny,
2320 * and the signal is private anyway.
2321 */
2322 if (unlikely(error == -ESRCH))
2323 error = 0;
2311 } 2324 }
2312 } 2325 }
2313 rcu_read_unlock(); 2326 rcu_read_unlock();
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
new file mode 100644
index 000000000000..e45c43645298
--- /dev/null
+++ b/kernel/slow-work-debugfs.c
@@ -0,0 +1,227 @@
1/* Slow work debugging
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/slow-work.h>
14#include <linux/fs.h>
15#include <linux/time.h>
16#include <linux/seq_file.h>
17#include "slow-work.h"
18
19#define ITERATOR_SHIFT (BITS_PER_LONG - 4)
20#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT)
21#define ITERATOR_COUNTER (~ITERATOR_SELECTOR)
22
23void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
24{
25 seq_puts(m, "Slow-work: New thread");
26}
27
28/*
29 * Render the time mark field on a work item into a 5-char time with units plus
30 * a space
31 */
32static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
33{
34 struct timespec now, diff;
35
36 now = CURRENT_TIME;
37 diff = timespec_sub(now, work->mark);
38
39 if (diff.tv_sec < 0)
40 seq_puts(m, " -ve ");
41 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
42 seq_printf(m, "%3luns ", diff.tv_nsec);
43 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
44 seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
45 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
46 seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
47 else if (diff.tv_sec <= 1)
48 seq_puts(m, " 1s ");
49 else if (diff.tv_sec < 60)
50 seq_printf(m, "%4lus ", diff.tv_sec);
51 else if (diff.tv_sec < 60 * 60)
52 seq_printf(m, "%4lum ", diff.tv_sec / 60);
53 else if (diff.tv_sec < 60 * 60 * 24)
54 seq_printf(m, "%4luh ", diff.tv_sec / 3600);
55 else
56 seq_puts(m, "exces ");
57}
58
59/*
60 * Describe a slow work item for debugfs
61 */
62static int slow_work_runqueue_show(struct seq_file *m, void *v)
63{
64 struct slow_work *work;
65 struct list_head *p = v;
66 unsigned long id;
67
68 switch ((unsigned long) v) {
69 case 1:
70 seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n");
71 return 0;
72 case 2:
73 seq_puts(m, "=== ===== ================ == ===== ==========\n");
74 return 0;
75
76 case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
77 id = (unsigned long) v - 3;
78
79 read_lock(&slow_work_execs_lock);
80 work = slow_work_execs[id];
81 if (work) {
82 smp_read_barrier_depends();
83
84 seq_printf(m, "%3lu %5d %16p %2lx ",
85 id, slow_work_pids[id], work, work->flags);
86 slow_work_print_mark(m, work);
87
88 if (work->ops->desc)
89 work->ops->desc(work, m);
90 seq_putc(m, '\n');
91 }
92 read_unlock(&slow_work_execs_lock);
93 return 0;
94
95 default:
96 work = list_entry(p, struct slow_work, link);
97 seq_printf(m, "%3s - %16p %2lx ",
98 work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
99 work, work->flags);
100 slow_work_print_mark(m, work);
101
102 if (work->ops->desc)
103 work->ops->desc(work, m);
104 seq_putc(m, '\n');
105 return 0;
106 }
107}
108
109/*
110 * map the iterator to a work item
111 */
112static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
113{
114 struct list_head *p;
115 unsigned long count, id;
116
117 switch (*_pos >> ITERATOR_SHIFT) {
118 case 0x0:
119 if (*_pos == 0)
120 *_pos = 1;
121 if (*_pos < 3)
122 return (void *)(unsigned long) *_pos;
123 if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
124 for (id = *_pos - 3;
125 id < SLOW_WORK_THREAD_LIMIT;
126 id++, (*_pos)++)
127 if (slow_work_execs[id])
128 return (void *)(unsigned long) *_pos;
129 *_pos = 0x1UL << ITERATOR_SHIFT;
130
131 case 0x1:
132 count = *_pos & ITERATOR_COUNTER;
133 list_for_each(p, &slow_work_queue) {
134 if (count == 0)
135 return p;
136 count--;
137 }
138 *_pos = 0x2UL << ITERATOR_SHIFT;
139
140 case 0x2:
141 count = *_pos & ITERATOR_COUNTER;
142 list_for_each(p, &vslow_work_queue) {
143 if (count == 0)
144 return p;
145 count--;
146 }
147 *_pos = 0x3UL << ITERATOR_SHIFT;
148
149 default:
150 return NULL;
151 }
152}
153
154/*
155 * set up the iterator to start reading from the first line
156 */
157static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
158{
159 spin_lock_irq(&slow_work_queue_lock);
160 return slow_work_runqueue_index(m, _pos);
161}
162
163/*
164 * move to the next line
165 */
166static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
167{
168 struct list_head *p = v;
169 unsigned long selector = *_pos >> ITERATOR_SHIFT;
170
171 (*_pos)++;
172 switch (selector) {
173 case 0x0:
174 return slow_work_runqueue_index(m, _pos);
175
176 case 0x1:
177 if (*_pos >> ITERATOR_SHIFT == 0x1) {
178 p = p->next;
179 if (p != &slow_work_queue)
180 return p;
181 }
182 *_pos = 0x2UL << ITERATOR_SHIFT;
183 p = &vslow_work_queue;
184
185 case 0x2:
186 if (*_pos >> ITERATOR_SHIFT == 0x2) {
187 p = p->next;
188 if (p != &vslow_work_queue)
189 return p;
190 }
191 *_pos = 0x3UL << ITERATOR_SHIFT;
192
193 default:
194 return NULL;
195 }
196}
197
198/*
199 * clean up after reading
200 */
201static void slow_work_runqueue_stop(struct seq_file *m, void *v)
202{
203 spin_unlock_irq(&slow_work_queue_lock);
204}
205
206static const struct seq_operations slow_work_runqueue_ops = {
207 .start = slow_work_runqueue_start,
208 .stop = slow_work_runqueue_stop,
209 .next = slow_work_runqueue_next,
210 .show = slow_work_runqueue_show,
211};
212
213/*
214 * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
215 */
216static int slow_work_runqueue_open(struct inode *inode, struct file *file)
217{
218 return seq_open(file, &slow_work_runqueue_ops);
219}
220
221const struct file_operations slow_work_runqueue_fops = {
222 .owner = THIS_MODULE,
223 .open = slow_work_runqueue_open,
224 .read = seq_read,
225 .llseek = seq_lseek,
226 .release = seq_release,
227};
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 09d7519557d3..00889bd3c590 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -16,20 +16,17 @@
16#include <linux/kthread.h> 16#include <linux/kthread.h>
17#include <linux/freezer.h> 17#include <linux/freezer.h>
18#include <linux/wait.h> 18#include <linux/wait.h>
19 19#include <linux/debugfs.h>
20#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of 20#include "slow-work.h"
21 * things to do */
22#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
23 * OOM */
24 21
25static void slow_work_cull_timeout(unsigned long); 22static void slow_work_cull_timeout(unsigned long);
26static void slow_work_oom_timeout(unsigned long); 23static void slow_work_oom_timeout(unsigned long);
27 24
28#ifdef CONFIG_SYSCTL 25#ifdef CONFIG_SYSCTL
29static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, 26static int slow_work_min_threads_sysctl(struct ctl_table *, int,
30 void __user *, size_t *, loff_t *); 27 void __user *, size_t *, loff_t *);
31 28
32static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, 29static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
33 void __user *, size_t *, loff_t *); 30 void __user *, size_t *, loff_t *);
34#endif 31#endif
35 32
@@ -46,7 +43,7 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process
46 43
47#ifdef CONFIG_SYSCTL 44#ifdef CONFIG_SYSCTL
48static const int slow_work_min_min_threads = 2; 45static const int slow_work_min_min_threads = 2;
49static int slow_work_max_max_threads = 255; 46static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
50static const int slow_work_min_vslow = 1; 47static const int slow_work_min_vslow = 1;
51static const int slow_work_max_vslow = 99; 48static const int slow_work_max_vslow = 99;
52 49
@@ -98,6 +95,56 @@ static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
98static struct slow_work slow_work_new_thread; /* new thread starter */ 95static struct slow_work slow_work_new_thread; /* new thread starter */
99 96
100/* 97/*
98 * slow work ID allocation (use slow_work_queue_lock)
99 */
100static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
101
102/*
103 * Unregistration tracking to prevent put_ref() from disappearing during module
104 * unload
105 */
106#ifdef CONFIG_MODULES
107static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
108static struct module *slow_work_unreg_module;
109static struct slow_work *slow_work_unreg_work_item;
110static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
111static DEFINE_MUTEX(slow_work_unreg_sync_lock);
112
113static void slow_work_set_thread_processing(int id, struct slow_work *work)
114{
115 if (work)
116 slow_work_thread_processing[id] = work->owner;
117}
118static void slow_work_done_thread_processing(int id, struct slow_work *work)
119{
120 struct module *module = slow_work_thread_processing[id];
121
122 slow_work_thread_processing[id] = NULL;
123 smp_mb();
124 if (slow_work_unreg_work_item == work ||
125 slow_work_unreg_module == module)
126 wake_up_all(&slow_work_unreg_wq);
127}
128static void slow_work_clear_thread_processing(int id)
129{
130 slow_work_thread_processing[id] = NULL;
131}
132#else
133static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
134static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
135static void slow_work_clear_thread_processing(int id) {}
136#endif
137
138/*
139 * Data for tracking currently executing items for indication through /proc
140 */
141#ifdef CONFIG_SLOW_WORK_DEBUG
142struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
143pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
144DEFINE_RWLOCK(slow_work_execs_lock);
145#endif
146
147/*
101 * The queues of work items and the lock governing access to them. These are 148 * The queues of work items and the lock governing access to them. These are
102 * shared between all the CPUs. It doesn't make sense to have per-CPU queues 149 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
103 * as the number of threads bears no relation to the number of CPUs. 150 * as the number of threads bears no relation to the number of CPUs.
@@ -105,9 +152,18 @@ static struct slow_work slow_work_new_thread; /* new thread starter */
105 * There are two queues of work items: one for slow work items, and one for 152 * There are two queues of work items: one for slow work items, and one for
106 * very slow work items. 153 * very slow work items.
107 */ 154 */
108static LIST_HEAD(slow_work_queue); 155LIST_HEAD(slow_work_queue);
109static LIST_HEAD(vslow_work_queue); 156LIST_HEAD(vslow_work_queue);
110static DEFINE_SPINLOCK(slow_work_queue_lock); 157DEFINE_SPINLOCK(slow_work_queue_lock);
158
159/*
160 * The following are two wait queues that get pinged when a work item is placed
161 * on an empty queue. These allow work items that are hogging a thread by
162 * sleeping in a way that could be deferred to yield their thread and enqueue
163 * themselves.
164 */
165static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
166static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
111 167
112/* 168/*
113 * The thread controls. A variable used to signal to the threads that they 169 * The thread controls. A variable used to signal to the threads that they
@@ -126,6 +182,20 @@ static DECLARE_COMPLETION(slow_work_last_thread_exited);
126static int slow_work_user_count; 182static int slow_work_user_count;
127static DEFINE_MUTEX(slow_work_user_lock); 183static DEFINE_MUTEX(slow_work_user_lock);
128 184
185static inline int slow_work_get_ref(struct slow_work *work)
186{
187 if (work->ops->get_ref)
188 return work->ops->get_ref(work);
189
190 return 0;
191}
192
193static inline void slow_work_put_ref(struct slow_work *work)
194{
195 if (work->ops->put_ref)
196 work->ops->put_ref(work);
197}
198
129/* 199/*
130 * Calculate the maximum number of active threads in the pool that are 200 * Calculate the maximum number of active threads in the pool that are
131 * permitted to process very slow work items. 201 * permitted to process very slow work items.
@@ -149,7 +219,7 @@ static unsigned slow_work_calc_vsmax(void)
149 * Attempt to execute stuff queued on a slow thread. Return true if we managed 219 * Attempt to execute stuff queued on a slow thread. Return true if we managed
150 * it, false if there was nothing to do. 220 * it, false if there was nothing to do.
151 */ 221 */
152static bool slow_work_execute(void) 222static noinline bool slow_work_execute(int id)
153{ 223{
154 struct slow_work *work = NULL; 224 struct slow_work *work = NULL;
155 unsigned vsmax; 225 unsigned vsmax;
@@ -186,6 +256,13 @@ static bool slow_work_execute(void)
186 } else { 256 } else {
187 very_slow = false; /* avoid the compiler warning */ 257 very_slow = false; /* avoid the compiler warning */
188 } 258 }
259
260 slow_work_set_thread_processing(id, work);
261 if (work) {
262 slow_work_mark_time(work);
263 slow_work_begin_exec(id, work);
264 }
265
189 spin_unlock_irq(&slow_work_queue_lock); 266 spin_unlock_irq(&slow_work_queue_lock);
190 267
191 if (!work) 268 if (!work)
@@ -194,12 +271,19 @@ static bool slow_work_execute(void)
194 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) 271 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
195 BUG(); 272 BUG();
196 273
197 work->ops->execute(work); 274 /* don't execute if the work is in the process of being cancelled */
275 if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
276 work->ops->execute(work);
198 277
199 if (very_slow) 278 if (very_slow)
200 atomic_dec(&vslow_work_executing_count); 279 atomic_dec(&vslow_work_executing_count);
201 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); 280 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
202 281
282 /* wake up anyone waiting for this work to be complete */
283 wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
284
285 slow_work_end_exec(id, work);
286
203 /* if someone tried to enqueue the item whilst we were executing it, 287 /* if someone tried to enqueue the item whilst we were executing it,
204 * then it'll be left unenqueued to avoid multiple threads trying to 288 * then it'll be left unenqueued to avoid multiple threads trying to
205 * execute it simultaneously 289 * execute it simultaneously
@@ -219,7 +303,10 @@ static bool slow_work_execute(void)
219 spin_unlock_irq(&slow_work_queue_lock); 303 spin_unlock_irq(&slow_work_queue_lock);
220 } 304 }
221 305
222 work->ops->put_ref(work); 306 /* sort out the race between module unloading and put_ref() */
307 slow_work_put_ref(work);
308 slow_work_done_thread_processing(id, work);
309
223 return true; 310 return true;
224 311
225auto_requeue: 312auto_requeue:
@@ -227,15 +314,61 @@ auto_requeue:
227 * - we transfer our ref on the item back to the appropriate queue 314 * - we transfer our ref on the item back to the appropriate queue
228 * - don't wake another thread up as we're awake already 315 * - don't wake another thread up as we're awake already
229 */ 316 */
317 slow_work_mark_time(work);
230 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) 318 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
231 list_add_tail(&work->link, &vslow_work_queue); 319 list_add_tail(&work->link, &vslow_work_queue);
232 else 320 else
233 list_add_tail(&work->link, &slow_work_queue); 321 list_add_tail(&work->link, &slow_work_queue);
234 spin_unlock_irq(&slow_work_queue_lock); 322 spin_unlock_irq(&slow_work_queue_lock);
323 slow_work_clear_thread_processing(id);
235 return true; 324 return true;
236} 325}
237 326
238/** 327/**
328 * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
329 * work: The work item under execution that wants to sleep
330 * _timeout: Scheduler sleep timeout
331 *
332 * Allow a requeueable work item to sleep on a slow-work processor thread until
333 * that thread is needed to do some other work or the sleep is interrupted by
334 * some other event.
335 *
336 * The caller must set up a wake up event before calling this and must have set
337 * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
338 * condition before calling this function as no test is made here.
339 *
340 * False is returned if there is nothing on the queue; true is returned if the
341 * work item should be requeued
342 */
343bool slow_work_sleep_till_thread_needed(struct slow_work *work,
344 signed long *_timeout)
345{
346 wait_queue_head_t *wfo_wq;
347 struct list_head *queue;
348
349 DEFINE_WAIT(wait);
350
351 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
352 wfo_wq = &vslow_work_queue_waits_for_occupation;
353 queue = &vslow_work_queue;
354 } else {
355 wfo_wq = &slow_work_queue_waits_for_occupation;
356 queue = &slow_work_queue;
357 }
358
359 if (!list_empty(queue))
360 return true;
361
362 add_wait_queue_exclusive(wfo_wq, &wait);
363 if (list_empty(queue))
364 *_timeout = schedule_timeout(*_timeout);
365 finish_wait(wfo_wq, &wait);
366
367 return !list_empty(queue);
368}
369EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
370
371/**
239 * slow_work_enqueue - Schedule a slow work item for processing 372 * slow_work_enqueue - Schedule a slow work item for processing
240 * @work: The work item to queue 373 * @work: The work item to queue
241 * 374 *
@@ -260,16 +393,22 @@ auto_requeue:
260 * allowed to pick items to execute. This ensures that very slow items won't 393 * allowed to pick items to execute. This ensures that very slow items won't
261 * overly block ones that are just ordinarily slow. 394 * overly block ones that are just ordinarily slow.
262 * 395 *
263 * Returns 0 if successful, -EAGAIN if not. 396 * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is
397 * attempted queued)
264 */ 398 */
265int slow_work_enqueue(struct slow_work *work) 399int slow_work_enqueue(struct slow_work *work)
266{ 400{
401 wait_queue_head_t *wfo_wq;
402 struct list_head *queue;
267 unsigned long flags; 403 unsigned long flags;
404 int ret;
405
406 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
407 return -ECANCELED;
268 408
269 BUG_ON(slow_work_user_count <= 0); 409 BUG_ON(slow_work_user_count <= 0);
270 BUG_ON(!work); 410 BUG_ON(!work);
271 BUG_ON(!work->ops); 411 BUG_ON(!work->ops);
272 BUG_ON(!work->ops->get_ref);
273 412
274 /* when honouring an enqueue request, we only promise that we will run 413 /* when honouring an enqueue request, we only promise that we will run
275 * the work function in the future; we do not promise to run it once 414 * the work function in the future; we do not promise to run it once
@@ -280,8 +419,19 @@ int slow_work_enqueue(struct slow_work *work)
280 * maintaining our promise 419 * maintaining our promise
281 */ 420 */
282 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { 421 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
422 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
423 wfo_wq = &vslow_work_queue_waits_for_occupation;
424 queue = &vslow_work_queue;
425 } else {
426 wfo_wq = &slow_work_queue_waits_for_occupation;
427 queue = &slow_work_queue;
428 }
429
283 spin_lock_irqsave(&slow_work_queue_lock, flags); 430 spin_lock_irqsave(&slow_work_queue_lock, flags);
284 431
432 if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
433 goto cancelled;
434
285 /* we promise that we will not attempt to execute the work 435 /* we promise that we will not attempt to execute the work
286 * function in more than one thread simultaneously 436 * function in more than one thread simultaneously
287 * 437 *
@@ -299,25 +449,221 @@ int slow_work_enqueue(struct slow_work *work)
299 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { 449 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
300 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); 450 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
301 } else { 451 } else {
302 if (work->ops->get_ref(work) < 0) 452 ret = slow_work_get_ref(work);
303 goto cant_get_ref; 453 if (ret < 0)
304 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) 454 goto failed;
305 list_add_tail(&work->link, &vslow_work_queue); 455 slow_work_mark_time(work);
306 else 456 list_add_tail(&work->link, queue);
307 list_add_tail(&work->link, &slow_work_queue);
308 wake_up(&slow_work_thread_wq); 457 wake_up(&slow_work_thread_wq);
458
459 /* if someone who could be requeued is sleeping on a
460 * thread, then ask them to yield their thread */
461 if (work->link.prev == queue)
462 wake_up(wfo_wq);
309 } 463 }
310 464
311 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 465 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
312 } 466 }
313 return 0; 467 return 0;
314 468
315cant_get_ref: 469cancelled:
470 ret = -ECANCELED;
471failed:
316 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 472 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
317 return -EAGAIN; 473 return ret;
318} 474}
319EXPORT_SYMBOL(slow_work_enqueue); 475EXPORT_SYMBOL(slow_work_enqueue);
320 476
477static int slow_work_wait(void *word)
478{
479 schedule();
480 return 0;
481}
482
483/**
484 * slow_work_cancel - Cancel a slow work item
485 * @work: The work item to cancel
486 *
487 * This function will cancel a previously enqueued work item. If we cannot
488 * cancel the work item, it is guarenteed to have run when this function
489 * returns.
490 */
491void slow_work_cancel(struct slow_work *work)
492{
493 bool wait = true, put = false;
494
495 set_bit(SLOW_WORK_CANCELLING, &work->flags);
496 smp_mb();
497
498 /* if the work item is a delayed work item with an active timer, we
499 * need to wait for the timer to finish _before_ getting the spinlock,
500 * lest we deadlock against the timer routine
501 *
502 * the timer routine will leave DELAYED set if it notices the
503 * CANCELLING flag in time
504 */
505 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
506 struct delayed_slow_work *dwork =
507 container_of(work, struct delayed_slow_work, work);
508 del_timer_sync(&dwork->timer);
509 }
510
511 spin_lock_irq(&slow_work_queue_lock);
512
513 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
514 /* the timer routine aborted or never happened, so we are left
515 * holding the timer's reference on the item and should just
516 * drop the pending flag and wait for any ongoing execution to
517 * finish */
518 struct delayed_slow_work *dwork =
519 container_of(work, struct delayed_slow_work, work);
520
521 BUG_ON(timer_pending(&dwork->timer));
522 BUG_ON(!list_empty(&work->link));
523
524 clear_bit(SLOW_WORK_DELAYED, &work->flags);
525 put = true;
526 clear_bit(SLOW_WORK_PENDING, &work->flags);
527
528 } else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
529 !list_empty(&work->link)) {
530 /* the link in the pending queue holds a reference on the item
531 * that we will need to release */
532 list_del_init(&work->link);
533 wait = false;
534 put = true;
535 clear_bit(SLOW_WORK_PENDING, &work->flags);
536
537 } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
538 /* the executor is holding our only reference on the item, so
539 * we merely need to wait for it to finish executing */
540 clear_bit(SLOW_WORK_PENDING, &work->flags);
541 }
542
543 spin_unlock_irq(&slow_work_queue_lock);
544
545 /* the EXECUTING flag is set by the executor whilst the spinlock is set
546 * and before the item is dequeued - so assuming the above doesn't
547 * actually dequeue it, simply waiting for the EXECUTING flag to be
548 * released here should be sufficient */
549 if (wait)
550 wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
551 TASK_UNINTERRUPTIBLE);
552
553 clear_bit(SLOW_WORK_CANCELLING, &work->flags);
554 if (put)
555 slow_work_put_ref(work);
556}
557EXPORT_SYMBOL(slow_work_cancel);
558
559/*
560 * Handle expiry of the delay timer, indicating that a delayed slow work item
561 * should now be queued if not cancelled
562 */
563static void delayed_slow_work_timer(unsigned long data)
564{
565 wait_queue_head_t *wfo_wq;
566 struct list_head *queue;
567 struct slow_work *work = (struct slow_work *) data;
568 unsigned long flags;
569 bool queued = false, put = false, first = false;
570
571 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
572 wfo_wq = &vslow_work_queue_waits_for_occupation;
573 queue = &vslow_work_queue;
574 } else {
575 wfo_wq = &slow_work_queue_waits_for_occupation;
576 queue = &slow_work_queue;
577 }
578
579 spin_lock_irqsave(&slow_work_queue_lock, flags);
580 if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
581 clear_bit(SLOW_WORK_DELAYED, &work->flags);
582
583 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
584 /* we discard the reference the timer was holding in
585 * favour of the one the executor holds */
586 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
587 put = true;
588 } else {
589 slow_work_mark_time(work);
590 list_add_tail(&work->link, queue);
591 queued = true;
592 if (work->link.prev == queue)
593 first = true;
594 }
595 }
596
597 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
598 if (put)
599 slow_work_put_ref(work);
600 if (first)
601 wake_up(wfo_wq);
602 if (queued)
603 wake_up(&slow_work_thread_wq);
604}
605
606/**
607 * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
608 * @dwork: The delayed work item to queue
609 * @delay: When to start executing the work, in jiffies from now
610 *
611 * This is similar to slow_work_enqueue(), but it adds a delay before the work
612 * is actually queued for processing.
613 *
614 * The item can have delayed processing requested on it whilst it is being
615 * executed. The delay will begin immediately, and if it expires before the
616 * item finishes executing, the item will be placed back on the queue when it
617 * has done executing.
618 */
619int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
620 unsigned long delay)
621{
622 struct slow_work *work = &dwork->work;
623 unsigned long flags;
624 int ret;
625
626 if (delay == 0)
627 return slow_work_enqueue(&dwork->work);
628
629 BUG_ON(slow_work_user_count <= 0);
630 BUG_ON(!work);
631 BUG_ON(!work->ops);
632
633 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
634 return -ECANCELED;
635
636 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
637 spin_lock_irqsave(&slow_work_queue_lock, flags);
638
639 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
640 goto cancelled;
641
642 /* the timer holds a reference whilst it is pending */
643 ret = work->ops->get_ref(work);
644 if (ret < 0)
645 goto cant_get_ref;
646
647 if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
648 BUG();
649 dwork->timer.expires = jiffies + delay;
650 dwork->timer.data = (unsigned long) work;
651 dwork->timer.function = delayed_slow_work_timer;
652 add_timer(&dwork->timer);
653
654 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
655 }
656
657 return 0;
658
659cancelled:
660 ret = -ECANCELED;
661cant_get_ref:
662 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
663 return ret;
664}
665EXPORT_SYMBOL(delayed_slow_work_enqueue);
666
321/* 667/*
322 * Schedule a cull of the thread pool at some time in the near future 668 * Schedule a cull of the thread pool at some time in the near future
323 */ 669 */
@@ -368,13 +714,23 @@ static inline bool slow_work_available(int vsmax)
368 */ 714 */
369static int slow_work_thread(void *_data) 715static int slow_work_thread(void *_data)
370{ 716{
371 int vsmax; 717 int vsmax, id;
372 718
373 DEFINE_WAIT(wait); 719 DEFINE_WAIT(wait);
374 720
375 set_freezable(); 721 set_freezable();
376 set_user_nice(current, -5); 722 set_user_nice(current, -5);
377 723
724 /* allocate ourselves an ID */
725 spin_lock_irq(&slow_work_queue_lock);
726 id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
727 BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
728 __set_bit(id, slow_work_ids);
729 slow_work_set_thread_pid(id, current->pid);
730 spin_unlock_irq(&slow_work_queue_lock);
731
732 sprintf(current->comm, "kslowd%03u", id);
733
378 for (;;) { 734 for (;;) {
379 vsmax = vslow_work_proportion; 735 vsmax = vslow_work_proportion;
380 vsmax *= atomic_read(&slow_work_thread_count); 736 vsmax *= atomic_read(&slow_work_thread_count);
@@ -395,7 +751,7 @@ static int slow_work_thread(void *_data)
395 vsmax *= atomic_read(&slow_work_thread_count); 751 vsmax *= atomic_read(&slow_work_thread_count);
396 vsmax /= 100; 752 vsmax /= 100;
397 753
398 if (slow_work_available(vsmax) && slow_work_execute()) { 754 if (slow_work_available(vsmax) && slow_work_execute(id)) {
399 cond_resched(); 755 cond_resched();
400 if (list_empty(&slow_work_queue) && 756 if (list_empty(&slow_work_queue) &&
401 list_empty(&vslow_work_queue) && 757 list_empty(&vslow_work_queue) &&
@@ -412,6 +768,11 @@ static int slow_work_thread(void *_data)
412 break; 768 break;
413 } 769 }
414 770
771 spin_lock_irq(&slow_work_queue_lock);
772 slow_work_set_thread_pid(id, 0);
773 __clear_bit(id, slow_work_ids);
774 spin_unlock_irq(&slow_work_queue_lock);
775
415 if (atomic_dec_and_test(&slow_work_thread_count)) 776 if (atomic_dec_and_test(&slow_work_thread_count))
416 complete_and_exit(&slow_work_last_thread_exited, 0); 777 complete_and_exit(&slow_work_last_thread_exited, 0);
417 return 0; 778 return 0;
@@ -427,21 +788,6 @@ static void slow_work_cull_timeout(unsigned long data)
427} 788}
428 789
429/* 790/*
430 * Get a reference on slow work thread starter
431 */
432static int slow_work_new_thread_get_ref(struct slow_work *work)
433{
434 return 0;
435}
436
437/*
438 * Drop a reference on slow work thread starter
439 */
440static void slow_work_new_thread_put_ref(struct slow_work *work)
441{
442}
443
444/*
445 * Start a new slow work thread 791 * Start a new slow work thread
446 */ 792 */
447static void slow_work_new_thread_execute(struct slow_work *work) 793static void slow_work_new_thread_execute(struct slow_work *work)
@@ -475,9 +821,11 @@ static void slow_work_new_thread_execute(struct slow_work *work)
475} 821}
476 822
477static const struct slow_work_ops slow_work_new_thread_ops = { 823static const struct slow_work_ops slow_work_new_thread_ops = {
478 .get_ref = slow_work_new_thread_get_ref, 824 .owner = THIS_MODULE,
479 .put_ref = slow_work_new_thread_put_ref,
480 .execute = slow_work_new_thread_execute, 825 .execute = slow_work_new_thread_execute,
826#ifdef CONFIG_SLOW_WORK_DEBUG
827 .desc = slow_work_new_thread_desc,
828#endif
481}; 829};
482 830
483/* 831/*
@@ -493,10 +841,10 @@ static void slow_work_oom_timeout(unsigned long data)
493 * Handle adjustment of the minimum number of threads 841 * Handle adjustment of the minimum number of threads
494 */ 842 */
495static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, 843static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
496 struct file *filp, void __user *buffer, 844 void __user *buffer,
497 size_t *lenp, loff_t *ppos) 845 size_t *lenp, loff_t *ppos)
498{ 846{
499 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 847 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
500 int n; 848 int n;
501 849
502 if (ret == 0) { 850 if (ret == 0) {
@@ -521,10 +869,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
521 * Handle adjustment of the maximum number of threads 869 * Handle adjustment of the maximum number of threads
522 */ 870 */
523static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, 871static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
524 struct file *filp, void __user *buffer, 872 void __user *buffer,
525 size_t *lenp, loff_t *ppos) 873 size_t *lenp, loff_t *ppos)
526{ 874{
527 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 875 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
528 int n; 876 int n;
529 877
530 if (ret == 0) { 878 if (ret == 0) {
@@ -546,12 +894,13 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
546 894
547/** 895/**
548 * slow_work_register_user - Register a user of the facility 896 * slow_work_register_user - Register a user of the facility
897 * @module: The module about to make use of the facility
549 * 898 *
550 * Register a user of the facility, starting up the initial threads if there 899 * Register a user of the facility, starting up the initial threads if there
551 * aren't any other users at this point. This will return 0 if successful, or 900 * aren't any other users at this point. This will return 0 if successful, or
552 * an error if not. 901 * an error if not.
553 */ 902 */
554int slow_work_register_user(void) 903int slow_work_register_user(struct module *module)
555{ 904{
556 struct task_struct *p; 905 struct task_struct *p;
557 int loop; 906 int loop;
@@ -598,14 +947,81 @@ error:
598} 947}
599EXPORT_SYMBOL(slow_work_register_user); 948EXPORT_SYMBOL(slow_work_register_user);
600 949
950/*
951 * wait for all outstanding items from the calling module to complete
952 * - note that more items may be queued whilst we're waiting
953 */
954static void slow_work_wait_for_items(struct module *module)
955{
956#ifdef CONFIG_MODULES
957 DECLARE_WAITQUEUE(myself, current);
958 struct slow_work *work;
959 int loop;
960
961 mutex_lock(&slow_work_unreg_sync_lock);
962 add_wait_queue(&slow_work_unreg_wq, &myself);
963
964 for (;;) {
965 spin_lock_irq(&slow_work_queue_lock);
966
967 /* first of all, we wait for the last queued item in each list
968 * to be processed */
969 list_for_each_entry_reverse(work, &vslow_work_queue, link) {
970 if (work->owner == module) {
971 set_current_state(TASK_UNINTERRUPTIBLE);
972 slow_work_unreg_work_item = work;
973 goto do_wait;
974 }
975 }
976 list_for_each_entry_reverse(work, &slow_work_queue, link) {
977 if (work->owner == module) {
978 set_current_state(TASK_UNINTERRUPTIBLE);
979 slow_work_unreg_work_item = work;
980 goto do_wait;
981 }
982 }
983
984 /* then we wait for the items being processed to finish */
985 slow_work_unreg_module = module;
986 smp_mb();
987 for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
988 if (slow_work_thread_processing[loop] == module)
989 goto do_wait;
990 }
991 spin_unlock_irq(&slow_work_queue_lock);
992 break; /* okay, we're done */
993
994 do_wait:
995 spin_unlock_irq(&slow_work_queue_lock);
996 schedule();
997 slow_work_unreg_work_item = NULL;
998 slow_work_unreg_module = NULL;
999 }
1000
1001 remove_wait_queue(&slow_work_unreg_wq, &myself);
1002 mutex_unlock(&slow_work_unreg_sync_lock);
1003#endif /* CONFIG_MODULES */
1004}
1005
601/** 1006/**
602 * slow_work_unregister_user - Unregister a user of the facility 1007 * slow_work_unregister_user - Unregister a user of the facility
1008 * @module: The module whose items should be cleared
603 * 1009 *
604 * Unregister a user of the facility, killing all the threads if this was the 1010 * Unregister a user of the facility, killing all the threads if this was the
605 * last one. 1011 * last one.
1012 *
1013 * This waits for all the work items belonging to the nominated module to go
1014 * away before proceeding.
606 */ 1015 */
607void slow_work_unregister_user(void) 1016void slow_work_unregister_user(struct module *module)
608{ 1017{
1018 /* first of all, wait for all outstanding items from the calling module
1019 * to complete */
1020 if (module)
1021 slow_work_wait_for_items(module);
1022
1023 /* then we can actually go about shutting down the facility if need
1024 * be */
609 mutex_lock(&slow_work_user_lock); 1025 mutex_lock(&slow_work_user_lock);
610 1026
611 BUG_ON(slow_work_user_count <= 0); 1027 BUG_ON(slow_work_user_count <= 0);
@@ -639,6 +1055,16 @@ static int __init init_slow_work(void)
639 if (slow_work_max_max_threads < nr_cpus * 2) 1055 if (slow_work_max_max_threads < nr_cpus * 2)
640 slow_work_max_max_threads = nr_cpus * 2; 1056 slow_work_max_max_threads = nr_cpus * 2;
641#endif 1057#endif
1058#ifdef CONFIG_SLOW_WORK_DEBUG
1059 {
1060 struct dentry *dbdir;
1061
1062 dbdir = debugfs_create_dir("slow_work", NULL);
1063 if (dbdir && !IS_ERR(dbdir))
1064 debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
1065 NULL, &slow_work_runqueue_fops);
1066 }
1067#endif
642 return 0; 1068 return 0;
643} 1069}
644 1070
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
new file mode 100644
index 000000000000..321f3c59d732
--- /dev/null
+++ b/kernel/slow-work.h
@@ -0,0 +1,72 @@
1/* Slow work private definitions
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
13 * things to do */
14#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
15 * OOM */
16
17#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */
18
19/*
20 * slow-work.c
21 */
22#ifdef CONFIG_SLOW_WORK_DEBUG
23extern struct slow_work *slow_work_execs[];
24extern pid_t slow_work_pids[];
25extern rwlock_t slow_work_execs_lock;
26#endif
27
28extern struct list_head slow_work_queue;
29extern struct list_head vslow_work_queue;
30extern spinlock_t slow_work_queue_lock;
31
32/*
33 * slow-work-debugfs.c
34 */
35#ifdef CONFIG_SLOW_WORK_DEBUG
36extern const struct file_operations slow_work_runqueue_fops;
37
38extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
39#endif
40
41/*
42 * Helper functions
43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{
46#ifdef CONFIG_SLOW_WORK_PROC
47 slow_work_pids[id] = pid;
48#endif
49}
50
51static inline void slow_work_mark_time(struct slow_work *work)
52{
53#ifdef CONFIG_SLOW_WORK_PROC
54 work->mark = CURRENT_TIME;
55#endif
56}
57
58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{
60#ifdef CONFIG_SLOW_WORK_PROC
61 slow_work_execs[id] = work;
62#endif
63}
64
65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{
67#ifdef CONFIG_SLOW_WORK_PROC
68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock);
71#endif
72}
diff --git a/kernel/smp.c b/kernel/smp.c
index fd47a256a24e..a8c76069cf50 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -265,9 +265,7 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
265 * @info: An arbitrary pointer to pass to the function. 265 * @info: An arbitrary pointer to pass to the function.
266 * @wait: If true, wait until function has completed on other CPUs. 266 * @wait: If true, wait until function has completed on other CPUs.
267 * 267 *
268 * Returns 0 on success, else a negative status code. Note that @wait 268 * Returns 0 on success, else a negative status code.
269 * will be implicitly turned on in case of allocation failures, since
270 * we fall back to on-stack allocation.
271 */ 269 */
272int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 270int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
273 int wait) 271 int wait)
@@ -321,6 +319,51 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
321} 319}
322EXPORT_SYMBOL(smp_call_function_single); 320EXPORT_SYMBOL(smp_call_function_single);
323 321
322/*
323 * smp_call_function_any - Run a function on any of the given cpus
324 * @mask: The mask of cpus it can run on.
325 * @func: The function to run. This must be fast and non-blocking.
326 * @info: An arbitrary pointer to pass to the function.
327 * @wait: If true, wait until function has completed.
328 *
329 * Returns 0 on success, else a negative status code (if no cpus were online).
330 * Note that @wait will be implicitly turned on in case of allocation failures,
331 * since we fall back to on-stack allocation.
332 *
333 * Selection preference:
334 * 1) current cpu if in @mask
335 * 2) any cpu of current node if in @mask
336 * 3) any other online cpu in @mask
337 */
338int smp_call_function_any(const struct cpumask *mask,
339 void (*func)(void *info), void *info, int wait)
340{
341 unsigned int cpu;
342 const struct cpumask *nodemask;
343 int ret;
344
345 /* Try for same CPU (cheapest) */
346 cpu = get_cpu();
347 if (cpumask_test_cpu(cpu, mask))
348 goto call;
349
350 /* Try for same node. */
351 nodemask = cpumask_of_node(cpu);
352 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
353 cpu = cpumask_next_and(cpu, nodemask, mask)) {
354 if (cpu_online(cpu))
355 goto call;
356 }
357
358 /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
359 cpu = cpumask_any_and(mask, cpu_online_mask);
360call:
361 ret = smp_call_function_single(cpu, func, info, wait);
362 put_cpu();
363 return ret;
364}
365EXPORT_SYMBOL_GPL(smp_call_function_any);
366
324/** 367/**
325 * __smp_call_function_single(): Run a function on another CPU 368 * __smp_call_function_single(): Run a function on another CPU
326 * @cpu: The CPU to run on. 369 * @cpu: The CPU to run on.
@@ -347,13 +390,6 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
347 generic_exec_single(cpu, data, wait); 390 generic_exec_single(cpu, data, wait);
348} 391}
349 392
350/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
351
352#ifndef arch_send_call_function_ipi_mask
353# define arch_send_call_function_ipi_mask(maskp) \
354 arch_send_call_function_ipi(*(maskp))
355#endif
356
357/** 393/**
358 * smp_call_function_many(): Run a function on a set of other CPUs. 394 * smp_call_function_many(): Run a function on a set of other CPUs.
359 * @mask: The set of cpus to run on (only runs on online subset). 395 * @mask: The set of cpus to run on (only runs on online subset).
@@ -362,9 +398,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
362 * @wait: If true, wait (atomically) until function has completed 398 * @wait: If true, wait (atomically) until function has completed
363 * on other CPUs. 399 * on other CPUs.
364 * 400 *
365 * If @wait is true, then returns once @func has returned. Note that @wait 401 * If @wait is true, then returns once @func has returned.
366 * will be implicitly turned on in case of allocation failures, since
367 * we fall back to on-stack allocation.
368 * 402 *
369 * You must not call this function with disabled interrupts or from a 403 * You must not call this function with disabled interrupts or from a
370 * hardware interrupt handler or from a bottom half handler. Preemption 404 * hardware interrupt handler or from a bottom half handler. Preemption
@@ -450,8 +484,7 @@ EXPORT_SYMBOL(smp_call_function_many);
450 * Returns 0. 484 * Returns 0.
451 * 485 *
452 * If @wait is true, then returns once @func has returned; otherwise 486 * If @wait is true, then returns once @func has returned; otherwise
453 * it returns just before the target cpu calls @func. In case of allocation 487 * it returns just before the target cpu calls @func.
454 * failure, @wait will be implicitly turned on.
455 * 488 *
456 * You must not call this function with disabled interrupts or from a 489 * You must not call this function with disabled interrupts or from a
457 * hardware interrupt handler or from a bottom half handler. 490 * hardware interrupt handler or from a bottom half handler.
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f8749e5216e0..21939d9e830e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -302,9 +302,9 @@ void irq_exit(void)
302 if (!in_interrupt() && local_softirq_pending()) 302 if (!in_interrupt() && local_softirq_pending())
303 invoke_softirq(); 303 invoke_softirq();
304 304
305 rcu_irq_exit();
305#ifdef CONFIG_NO_HZ 306#ifdef CONFIG_NO_HZ
306 /* Make sure that timer wheel updates are propagated */ 307 /* Make sure that timer wheel updates are propagated */
307 rcu_irq_exit();
308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
309 tick_nohz_stop_sched_tick(0); 309 tick_nohz_stop_sched_tick(0);
310#endif 310#endif
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 88796c330838..81324d12eb35 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void)
90EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 90EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
91 91
92int proc_dosoftlockup_thresh(struct ctl_table *table, int write, 92int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
93 struct file *filp, void __user *buffer, 93 void __user *buffer,
94 size_t *lenp, loff_t *ppos) 94 size_t *lenp, loff_t *ppos)
95{ 95{
96 touch_all_softlockup_watchdogs(); 96 touch_all_softlockup_watchdogs();
97 return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 97 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
98} 98}
99 99
100/* 100/*
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5ddab730cb2f..41e042219ff6 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,145 +21,28 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/module.h>
23 23
24#ifndef _spin_trylock
25int __lockfunc _spin_trylock(spinlock_t *lock)
26{
27 return __spin_trylock(lock);
28}
29EXPORT_SYMBOL(_spin_trylock);
30#endif
31
32#ifndef _read_trylock
33int __lockfunc _read_trylock(rwlock_t *lock)
34{
35 return __read_trylock(lock);
36}
37EXPORT_SYMBOL(_read_trylock);
38#endif
39
40#ifndef _write_trylock
41int __lockfunc _write_trylock(rwlock_t *lock)
42{
43 return __write_trylock(lock);
44}
45EXPORT_SYMBOL(_write_trylock);
46#endif
47
48/* 24/*
49 * If lockdep is enabled then we use the non-preemption spin-ops 25 * If lockdep is enabled then we use the non-preemption spin-ops
50 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are 26 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
51 * not re-enabled during lock-acquire (which the preempt-spin-ops do): 27 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
52 */ 28 */
53#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) 29#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
54
55#ifndef _read_lock
56void __lockfunc _read_lock(rwlock_t *lock)
57{
58 __read_lock(lock);
59}
60EXPORT_SYMBOL(_read_lock);
61#endif
62
63#ifndef _spin_lock_irqsave
64unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
65{
66 return __spin_lock_irqsave(lock);
67}
68EXPORT_SYMBOL(_spin_lock_irqsave);
69#endif
70
71#ifndef _spin_lock_irq
72void __lockfunc _spin_lock_irq(spinlock_t *lock)
73{
74 __spin_lock_irq(lock);
75}
76EXPORT_SYMBOL(_spin_lock_irq);
77#endif
78
79#ifndef _spin_lock_bh
80void __lockfunc _spin_lock_bh(spinlock_t *lock)
81{
82 __spin_lock_bh(lock);
83}
84EXPORT_SYMBOL(_spin_lock_bh);
85#endif
86
87#ifndef _read_lock_irqsave
88unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
89{
90 return __read_lock_irqsave(lock);
91}
92EXPORT_SYMBOL(_read_lock_irqsave);
93#endif
94
95#ifndef _read_lock_irq
96void __lockfunc _read_lock_irq(rwlock_t *lock)
97{
98 __read_lock_irq(lock);
99}
100EXPORT_SYMBOL(_read_lock_irq);
101#endif
102
103#ifndef _read_lock_bh
104void __lockfunc _read_lock_bh(rwlock_t *lock)
105{
106 __read_lock_bh(lock);
107}
108EXPORT_SYMBOL(_read_lock_bh);
109#endif
110
111#ifndef _write_lock_irqsave
112unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
113{
114 return __write_lock_irqsave(lock);
115}
116EXPORT_SYMBOL(_write_lock_irqsave);
117#endif
118
119#ifndef _write_lock_irq
120void __lockfunc _write_lock_irq(rwlock_t *lock)
121{
122 __write_lock_irq(lock);
123}
124EXPORT_SYMBOL(_write_lock_irq);
125#endif
126
127#ifndef _write_lock_bh
128void __lockfunc _write_lock_bh(rwlock_t *lock)
129{
130 __write_lock_bh(lock);
131}
132EXPORT_SYMBOL(_write_lock_bh);
133#endif
134
135#ifndef _spin_lock
136void __lockfunc _spin_lock(spinlock_t *lock)
137{
138 __spin_lock(lock);
139}
140EXPORT_SYMBOL(_spin_lock);
141#endif
142
143#ifndef _write_lock
144void __lockfunc _write_lock(rwlock_t *lock)
145{
146 __write_lock(lock);
147}
148EXPORT_SYMBOL(_write_lock);
149#endif
150
151#else /* CONFIG_PREEMPT: */
152
153/* 30/*
31 * The __lock_function inlines are taken from
32 * include/linux/spinlock_api_smp.h
33 */
34#else
35/*
36 * We build the __lock_function inlines here. They are too large for
37 * inlining all over the place, but here is only one user per function
38 * which embedds them into the calling _lock_function below.
39 *
154 * This could be a long-held lock. We both prepare to spin for a long 40 * This could be a long-held lock. We both prepare to spin for a long
155 * time (making _this_ CPU preemptable if possible), and we also signal 41 * time (making _this_ CPU preemptable if possible), and we also signal
156 * towards that other CPU that it should break the lock ASAP. 42 * towards that other CPU that it should break the lock ASAP.
157 *
158 * (We do this in a function because inlining it would be excessive.)
159 */ 43 */
160
161#define BUILD_LOCK_OPS(op, locktype) \ 44#define BUILD_LOCK_OPS(op, locktype) \
162void __lockfunc _##op##_lock(locktype##_t *lock) \ 45void __lockfunc __##op##_lock(locktype##_t *lock) \
163{ \ 46{ \
164 for (;;) { \ 47 for (;;) { \
165 preempt_disable(); \ 48 preempt_disable(); \
@@ -175,9 +58,7 @@ void __lockfunc _##op##_lock(locktype##_t *lock) \
175 (lock)->break_lock = 0; \ 58 (lock)->break_lock = 0; \
176} \ 59} \
177 \ 60 \
178EXPORT_SYMBOL(_##op##_lock); \ 61unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \
179 \
180unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
181{ \ 62{ \
182 unsigned long flags; \ 63 unsigned long flags; \
183 \ 64 \
@@ -198,16 +79,12 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
198 return flags; \ 79 return flags; \
199} \ 80} \
200 \ 81 \
201EXPORT_SYMBOL(_##op##_lock_irqsave); \ 82void __lockfunc __##op##_lock_irq(locktype##_t *lock) \
202 \
203void __lockfunc _##op##_lock_irq(locktype##_t *lock) \
204{ \ 83{ \
205 _##op##_lock_irqsave(lock); \ 84 _##op##_lock_irqsave(lock); \
206} \ 85} \
207 \ 86 \
208EXPORT_SYMBOL(_##op##_lock_irq); \ 87void __lockfunc __##op##_lock_bh(locktype##_t *lock) \
209 \
210void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
211{ \ 88{ \
212 unsigned long flags; \ 89 unsigned long flags; \
213 \ 90 \
@@ -220,23 +97,21 @@ void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
220 local_bh_disable(); \ 97 local_bh_disable(); \
221 local_irq_restore(flags); \ 98 local_irq_restore(flags); \
222} \ 99} \
223 \
224EXPORT_SYMBOL(_##op##_lock_bh)
225 100
226/* 101/*
227 * Build preemption-friendly versions of the following 102 * Build preemption-friendly versions of the following
228 * lock-spinning functions: 103 * lock-spinning functions:
229 * 104 *
230 * _[spin|read|write]_lock() 105 * __[spin|read|write]_lock()
231 * _[spin|read|write]_lock_irq() 106 * __[spin|read|write]_lock_irq()
232 * _[spin|read|write]_lock_irqsave() 107 * __[spin|read|write]_lock_irqsave()
233 * _[spin|read|write]_lock_bh() 108 * __[spin|read|write]_lock_bh()
234 */ 109 */
235BUILD_LOCK_OPS(spin, spinlock); 110BUILD_LOCK_OPS(spin, spinlock);
236BUILD_LOCK_OPS(read, rwlock); 111BUILD_LOCK_OPS(read, rwlock);
237BUILD_LOCK_OPS(write, rwlock); 112BUILD_LOCK_OPS(write, rwlock);
238 113
239#endif /* CONFIG_PREEMPT */ 114#endif
240 115
241#ifdef CONFIG_DEBUG_LOCK_ALLOC 116#ifdef CONFIG_DEBUG_LOCK_ALLOC
242 117
@@ -248,7 +123,8 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
248} 123}
249EXPORT_SYMBOL(_spin_lock_nested); 124EXPORT_SYMBOL(_spin_lock_nested);
250 125
251unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) 126unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock,
127 int subclass)
252{ 128{
253 unsigned long flags; 129 unsigned long flags;
254 130
@@ -272,7 +148,127 @@ EXPORT_SYMBOL(_spin_lock_nest_lock);
272 148
273#endif 149#endif
274 150
275#ifndef _spin_unlock 151#ifndef CONFIG_INLINE_SPIN_TRYLOCK
152int __lockfunc _spin_trylock(spinlock_t *lock)
153{
154 return __spin_trylock(lock);
155}
156EXPORT_SYMBOL(_spin_trylock);
157#endif
158
159#ifndef CONFIG_INLINE_READ_TRYLOCK
160int __lockfunc _read_trylock(rwlock_t *lock)
161{
162 return __read_trylock(lock);
163}
164EXPORT_SYMBOL(_read_trylock);
165#endif
166
167#ifndef CONFIG_INLINE_WRITE_TRYLOCK
168int __lockfunc _write_trylock(rwlock_t *lock)
169{
170 return __write_trylock(lock);
171}
172EXPORT_SYMBOL(_write_trylock);
173#endif
174
175#ifndef CONFIG_INLINE_READ_LOCK
176void __lockfunc _read_lock(rwlock_t *lock)
177{
178 __read_lock(lock);
179}
180EXPORT_SYMBOL(_read_lock);
181#endif
182
183#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
184unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
185{
186 return __spin_lock_irqsave(lock);
187}
188EXPORT_SYMBOL(_spin_lock_irqsave);
189#endif
190
191#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
192void __lockfunc _spin_lock_irq(spinlock_t *lock)
193{
194 __spin_lock_irq(lock);
195}
196EXPORT_SYMBOL(_spin_lock_irq);
197#endif
198
199#ifndef CONFIG_INLINE_SPIN_LOCK_BH
200void __lockfunc _spin_lock_bh(spinlock_t *lock)
201{
202 __spin_lock_bh(lock);
203}
204EXPORT_SYMBOL(_spin_lock_bh);
205#endif
206
207#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
208unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
209{
210 return __read_lock_irqsave(lock);
211}
212EXPORT_SYMBOL(_read_lock_irqsave);
213#endif
214
215#ifndef CONFIG_INLINE_READ_LOCK_IRQ
216void __lockfunc _read_lock_irq(rwlock_t *lock)
217{
218 __read_lock_irq(lock);
219}
220EXPORT_SYMBOL(_read_lock_irq);
221#endif
222
223#ifndef CONFIG_INLINE_READ_LOCK_BH
224void __lockfunc _read_lock_bh(rwlock_t *lock)
225{
226 __read_lock_bh(lock);
227}
228EXPORT_SYMBOL(_read_lock_bh);
229#endif
230
231#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
232unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
233{
234 return __write_lock_irqsave(lock);
235}
236EXPORT_SYMBOL(_write_lock_irqsave);
237#endif
238
239#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
240void __lockfunc _write_lock_irq(rwlock_t *lock)
241{
242 __write_lock_irq(lock);
243}
244EXPORT_SYMBOL(_write_lock_irq);
245#endif
246
247#ifndef CONFIG_INLINE_WRITE_LOCK_BH
248void __lockfunc _write_lock_bh(rwlock_t *lock)
249{
250 __write_lock_bh(lock);
251}
252EXPORT_SYMBOL(_write_lock_bh);
253#endif
254
255#ifndef CONFIG_INLINE_SPIN_LOCK
256void __lockfunc _spin_lock(spinlock_t *lock)
257{
258 __spin_lock(lock);
259}
260EXPORT_SYMBOL(_spin_lock);
261#endif
262
263#ifndef CONFIG_INLINE_WRITE_LOCK
264void __lockfunc _write_lock(rwlock_t *lock)
265{
266 __write_lock(lock);
267}
268EXPORT_SYMBOL(_write_lock);
269#endif
270
271#ifndef CONFIG_INLINE_SPIN_UNLOCK
276void __lockfunc _spin_unlock(spinlock_t *lock) 272void __lockfunc _spin_unlock(spinlock_t *lock)
277{ 273{
278 __spin_unlock(lock); 274 __spin_unlock(lock);
@@ -280,7 +276,7 @@ void __lockfunc _spin_unlock(spinlock_t *lock)
280EXPORT_SYMBOL(_spin_unlock); 276EXPORT_SYMBOL(_spin_unlock);
281#endif 277#endif
282 278
283#ifndef _write_unlock 279#ifndef CONFIG_INLINE_WRITE_UNLOCK
284void __lockfunc _write_unlock(rwlock_t *lock) 280void __lockfunc _write_unlock(rwlock_t *lock)
285{ 281{
286 __write_unlock(lock); 282 __write_unlock(lock);
@@ -288,7 +284,7 @@ void __lockfunc _write_unlock(rwlock_t *lock)
288EXPORT_SYMBOL(_write_unlock); 284EXPORT_SYMBOL(_write_unlock);
289#endif 285#endif
290 286
291#ifndef _read_unlock 287#ifndef CONFIG_INLINE_READ_UNLOCK
292void __lockfunc _read_unlock(rwlock_t *lock) 288void __lockfunc _read_unlock(rwlock_t *lock)
293{ 289{
294 __read_unlock(lock); 290 __read_unlock(lock);
@@ -296,7 +292,7 @@ void __lockfunc _read_unlock(rwlock_t *lock)
296EXPORT_SYMBOL(_read_unlock); 292EXPORT_SYMBOL(_read_unlock);
297#endif 293#endif
298 294
299#ifndef _spin_unlock_irqrestore 295#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
300void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 296void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
301{ 297{
302 __spin_unlock_irqrestore(lock, flags); 298 __spin_unlock_irqrestore(lock, flags);
@@ -304,7 +300,7 @@ void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
304EXPORT_SYMBOL(_spin_unlock_irqrestore); 300EXPORT_SYMBOL(_spin_unlock_irqrestore);
305#endif 301#endif
306 302
307#ifndef _spin_unlock_irq 303#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
308void __lockfunc _spin_unlock_irq(spinlock_t *lock) 304void __lockfunc _spin_unlock_irq(spinlock_t *lock)
309{ 305{
310 __spin_unlock_irq(lock); 306 __spin_unlock_irq(lock);
@@ -312,7 +308,7 @@ void __lockfunc _spin_unlock_irq(spinlock_t *lock)
312EXPORT_SYMBOL(_spin_unlock_irq); 308EXPORT_SYMBOL(_spin_unlock_irq);
313#endif 309#endif
314 310
315#ifndef _spin_unlock_bh 311#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
316void __lockfunc _spin_unlock_bh(spinlock_t *lock) 312void __lockfunc _spin_unlock_bh(spinlock_t *lock)
317{ 313{
318 __spin_unlock_bh(lock); 314 __spin_unlock_bh(lock);
@@ -320,7 +316,7 @@ void __lockfunc _spin_unlock_bh(spinlock_t *lock)
320EXPORT_SYMBOL(_spin_unlock_bh); 316EXPORT_SYMBOL(_spin_unlock_bh);
321#endif 317#endif
322 318
323#ifndef _read_unlock_irqrestore 319#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
324void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 320void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
325{ 321{
326 __read_unlock_irqrestore(lock, flags); 322 __read_unlock_irqrestore(lock, flags);
@@ -328,7 +324,7 @@ void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
328EXPORT_SYMBOL(_read_unlock_irqrestore); 324EXPORT_SYMBOL(_read_unlock_irqrestore);
329#endif 325#endif
330 326
331#ifndef _read_unlock_irq 327#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
332void __lockfunc _read_unlock_irq(rwlock_t *lock) 328void __lockfunc _read_unlock_irq(rwlock_t *lock)
333{ 329{
334 __read_unlock_irq(lock); 330 __read_unlock_irq(lock);
@@ -336,7 +332,7 @@ void __lockfunc _read_unlock_irq(rwlock_t *lock)
336EXPORT_SYMBOL(_read_unlock_irq); 332EXPORT_SYMBOL(_read_unlock_irq);
337#endif 333#endif
338 334
339#ifndef _read_unlock_bh 335#ifndef CONFIG_INLINE_READ_UNLOCK_BH
340void __lockfunc _read_unlock_bh(rwlock_t *lock) 336void __lockfunc _read_unlock_bh(rwlock_t *lock)
341{ 337{
342 __read_unlock_bh(lock); 338 __read_unlock_bh(lock);
@@ -344,7 +340,7 @@ void __lockfunc _read_unlock_bh(rwlock_t *lock)
344EXPORT_SYMBOL(_read_unlock_bh); 340EXPORT_SYMBOL(_read_unlock_bh);
345#endif 341#endif
346 342
347#ifndef _write_unlock_irqrestore 343#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
348void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 344void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
349{ 345{
350 __write_unlock_irqrestore(lock, flags); 346 __write_unlock_irqrestore(lock, flags);
@@ -352,7 +348,7 @@ void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
352EXPORT_SYMBOL(_write_unlock_irqrestore); 348EXPORT_SYMBOL(_write_unlock_irqrestore);
353#endif 349#endif
354 350
355#ifndef _write_unlock_irq 351#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
356void __lockfunc _write_unlock_irq(rwlock_t *lock) 352void __lockfunc _write_unlock_irq(rwlock_t *lock)
357{ 353{
358 __write_unlock_irq(lock); 354 __write_unlock_irq(lock);
@@ -360,7 +356,7 @@ void __lockfunc _write_unlock_irq(rwlock_t *lock)
360EXPORT_SYMBOL(_write_unlock_irq); 356EXPORT_SYMBOL(_write_unlock_irq);
361#endif 357#endif
362 358
363#ifndef _write_unlock_bh 359#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
364void __lockfunc _write_unlock_bh(rwlock_t *lock) 360void __lockfunc _write_unlock_bh(rwlock_t *lock)
365{ 361{
366 __write_unlock_bh(lock); 362 __write_unlock_bh(lock);
@@ -368,7 +364,7 @@ void __lockfunc _write_unlock_bh(rwlock_t *lock)
368EXPORT_SYMBOL(_write_unlock_bh); 364EXPORT_SYMBOL(_write_unlock_bh);
369#endif 365#endif
370 366
371#ifndef _spin_trylock_bh 367#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
372int __lockfunc _spin_trylock_bh(spinlock_t *lock) 368int __lockfunc _spin_trylock_bh(spinlock_t *lock)
373{ 369{
374 return __spin_trylock_bh(lock); 370 return __spin_trylock_bh(lock);
diff --git a/kernel/srcu.c b/kernel/srcu.c
index b0aeeaf22ce4..818d7d9aa03c 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -49,6 +49,7 @@ int init_srcu_struct(struct srcu_struct *sp)
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); 49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM); 50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 51}
52EXPORT_SYMBOL_GPL(init_srcu_struct);
52 53
53/* 54/*
54 * srcu_readers_active_idx -- returns approximate number of readers 55 * srcu_readers_active_idx -- returns approximate number of readers
@@ -97,6 +98,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
97 free_percpu(sp->per_cpu_ref); 98 free_percpu(sp->per_cpu_ref);
98 sp->per_cpu_ref = NULL; 99 sp->per_cpu_ref = NULL;
99} 100}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
100 102
101/** 103/**
102 * srcu_read_lock - register a new reader for an SRCU-protected structure. 104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
@@ -118,6 +120,7 @@ int srcu_read_lock(struct srcu_struct *sp)
118 preempt_enable(); 120 preempt_enable();
119 return idx; 121 return idx;
120} 122}
123EXPORT_SYMBOL_GPL(srcu_read_lock);
121 124
122/** 125/**
123 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. 126 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
@@ -136,22 +139,12 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx)
136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
137 preempt_enable(); 140 preempt_enable();
138} 141}
142EXPORT_SYMBOL_GPL(srcu_read_unlock);
139 143
140/** 144/*
141 * synchronize_srcu - wait for prior SRCU read-side critical-section completion 145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
142 * @sp: srcu_struct with which to synchronize.
143 *
144 * Flip the completed counter, and wait for the old count to drain to zero.
145 * As with classic RCU, the updater must use some separate means of
146 * synchronizing concurrent updates. Can block; must be called from
147 * process context.
148 *
149 * Note that it is illegal to call synchornize_srcu() from the corresponding
150 * SRCU read-side critical section; doing so will result in deadlock.
151 * However, it is perfectly legal to call synchronize_srcu() on one
152 * srcu_struct from some other srcu_struct's read-side critical section.
153 */ 146 */
154void synchronize_srcu(struct srcu_struct *sp) 147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
155{ 148{
156 int idx; 149 int idx;
157 150
@@ -173,7 +166,7 @@ void synchronize_srcu(struct srcu_struct *sp)
173 return; 166 return;
174 } 167 }
175 168
176 synchronize_sched(); /* Force memory barrier on all CPUs. */ 169 sync_func(); /* Force memory barrier on all CPUs. */
177 170
178 /* 171 /*
179 * The preceding synchronize_sched() ensures that any CPU that 172 * The preceding synchronize_sched() ensures that any CPU that
@@ -190,7 +183,7 @@ void synchronize_srcu(struct srcu_struct *sp)
190 idx = sp->completed & 0x1; 183 idx = sp->completed & 0x1;
191 sp->completed++; 184 sp->completed++;
192 185
193 synchronize_sched(); /* Force memory barrier on all CPUs. */ 186 sync_func(); /* Force memory barrier on all CPUs. */
194 187
195 /* 188 /*
196 * At this point, because of the preceding synchronize_sched(), 189 * At this point, because of the preceding synchronize_sched(),
@@ -203,7 +196,7 @@ void synchronize_srcu(struct srcu_struct *sp)
203 while (srcu_readers_active_idx(sp, idx)) 196 while (srcu_readers_active_idx(sp, idx))
204 schedule_timeout_interruptible(1); 197 schedule_timeout_interruptible(1);
205 198
206 synchronize_sched(); /* Force memory barrier on all CPUs. */ 199 sync_func(); /* Force memory barrier on all CPUs. */
207 200
208 /* 201 /*
209 * The preceding synchronize_sched() forces all srcu_read_unlock() 202 * The preceding synchronize_sched() forces all srcu_read_unlock()
@@ -237,6 +230,47 @@ void synchronize_srcu(struct srcu_struct *sp)
237} 230}
238 231
239/** 232/**
233 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
234 * @sp: srcu_struct with which to synchronize.
235 *
236 * Flip the completed counter, and wait for the old count to drain to zero.
237 * As with classic RCU, the updater must use some separate means of
238 * synchronizing concurrent updates. Can block; must be called from
239 * process context.
240 *
241 * Note that it is illegal to call synchronize_srcu() from the corresponding
242 * SRCU read-side critical section; doing so will result in deadlock.
243 * However, it is perfectly legal to call synchronize_srcu() on one
244 * srcu_struct from some other srcu_struct's read-side critical section.
245 */
246void synchronize_srcu(struct srcu_struct *sp)
247{
248 __synchronize_srcu(sp, synchronize_sched);
249}
250EXPORT_SYMBOL_GPL(synchronize_srcu);
251
252/**
253 * synchronize_srcu_expedited - like synchronize_srcu, but less patient
254 * @sp: srcu_struct with which to synchronize.
255 *
256 * Flip the completed counter, and wait for the old count to drain to zero.
257 * As with classic RCU, the updater must use some separate means of
258 * synchronizing concurrent updates. Can block; must be called from
259 * process context.
260 *
261 * Note that it is illegal to call synchronize_srcu_expedited()
262 * from the corresponding SRCU read-side critical section; doing so
263 * will result in deadlock. However, it is perfectly legal to call
264 * synchronize_srcu_expedited() on one srcu_struct from some other
265 * srcu_struct's read-side critical section.
266 */
267void synchronize_srcu_expedited(struct srcu_struct *sp)
268{
269 __synchronize_srcu(sp, synchronize_sched_expedited);
270}
271EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
272
273/**
240 * srcu_batches_completed - return batches completed. 274 * srcu_batches_completed - return batches completed.
241 * @sp: srcu_struct on which to report batch completion. 275 * @sp: srcu_struct on which to report batch completion.
242 * 276 *
@@ -248,10 +282,4 @@ long srcu_batches_completed(struct srcu_struct *sp)
248{ 282{
249 return sp->completed; 283 return sp->completed;
250} 284}
251
252EXPORT_SYMBOL_GPL(init_srcu_struct);
253EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
254EXPORT_SYMBOL_GPL(srcu_read_lock);
255EXPORT_SYMBOL_GPL(srcu_read_unlock);
256EXPORT_SYMBOL_GPL(synchronize_srcu);
257EXPORT_SYMBOL_GPL(srcu_batches_completed); 285EXPORT_SYMBOL_GPL(srcu_batches_completed);
diff --git a/kernel/sys.c b/kernel/sys.c
index ebcb15611728..9968c5fb55b9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -911,16 +911,15 @@ change_okay:
911 911
912void do_sys_times(struct tms *tms) 912void do_sys_times(struct tms *tms)
913{ 913{
914 struct task_cputime cputime; 914 cputime_t tgutime, tgstime, cutime, cstime;
915 cputime_t cutime, cstime;
916 915
917 thread_group_cputime(current, &cputime);
918 spin_lock_irq(&current->sighand->siglock); 916 spin_lock_irq(&current->sighand->siglock);
917 thread_group_times(current, &tgutime, &tgstime);
919 cutime = current->signal->cutime; 918 cutime = current->signal->cutime;
920 cstime = current->signal->cstime; 919 cstime = current->signal->cstime;
921 spin_unlock_irq(&current->sighand->siglock); 920 spin_unlock_irq(&current->sighand->siglock);
922 tms->tms_utime = cputime_to_clock_t(cputime.utime); 921 tms->tms_utime = cputime_to_clock_t(tgutime);
923 tms->tms_stime = cputime_to_clock_t(cputime.stime); 922 tms->tms_stime = cputime_to_clock_t(tgstime);
924 tms->tms_cutime = cputime_to_clock_t(cutime); 923 tms->tms_cutime = cputime_to_clock_t(cutime);
925 tms->tms_cstime = cputime_to_clock_t(cstime); 924 tms->tms_cstime = cputime_to_clock_t(cstime);
926} 925}
@@ -1110,6 +1109,8 @@ SYSCALL_DEFINE0(setsid)
1110 err = session; 1109 err = session;
1111out: 1110out:
1112 write_unlock_irq(&tasklist_lock); 1111 write_unlock_irq(&tasklist_lock);
1112 if (err > 0)
1113 proc_sid_connector(group_leader);
1113 return err; 1114 return err;
1114} 1115}
1115 1116
@@ -1336,16 +1337,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1336{ 1337{
1337 struct task_struct *t; 1338 struct task_struct *t;
1338 unsigned long flags; 1339 unsigned long flags;
1339 cputime_t utime, stime; 1340 cputime_t tgutime, tgstime, utime, stime;
1340 struct task_cputime cputime;
1341 unsigned long maxrss = 0; 1341 unsigned long maxrss = 0;
1342 1342
1343 memset((char *) r, 0, sizeof *r); 1343 memset((char *) r, 0, sizeof *r);
1344 utime = stime = cputime_zero; 1344 utime = stime = cputime_zero;
1345 1345
1346 if (who == RUSAGE_THREAD) { 1346 if (who == RUSAGE_THREAD) {
1347 utime = task_utime(current); 1347 task_times(current, &utime, &stime);
1348 stime = task_stime(current);
1349 accumulate_thread_rusage(p, r); 1348 accumulate_thread_rusage(p, r);
1350 maxrss = p->signal->maxrss; 1349 maxrss = p->signal->maxrss;
1351 goto out; 1350 goto out;
@@ -1371,9 +1370,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1371 break; 1370 break;
1372 1371
1373 case RUSAGE_SELF: 1372 case RUSAGE_SELF:
1374 thread_group_cputime(p, &cputime); 1373 thread_group_times(p, &tgutime, &tgstime);
1375 utime = cputime_add(utime, cputime.utime); 1374 utime = cputime_add(utime, tgutime);
1376 stime = cputime_add(stime, cputime.stime); 1375 stime = cputime_add(stime, tgstime);
1377 r->ru_nvcsw += p->signal->nvcsw; 1376 r->ru_nvcsw += p->signal->nvcsw;
1378 r->ru_nivcsw += p->signal->nivcsw; 1377 r->ru_nivcsw += p->signal->nivcsw;
1379 r->ru_minflt += p->signal->min_flt; 1378 r->ru_minflt += p->signal->min_flt;
@@ -1542,6 +1541,41 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1542 current->timer_slack_ns = arg2; 1541 current->timer_slack_ns = arg2;
1543 error = 0; 1542 error = 0;
1544 break; 1543 break;
1544 case PR_MCE_KILL:
1545 if (arg4 | arg5)
1546 return -EINVAL;
1547 switch (arg2) {
1548 case PR_MCE_KILL_CLEAR:
1549 if (arg3 != 0)
1550 return -EINVAL;
1551 current->flags &= ~PF_MCE_PROCESS;
1552 break;
1553 case PR_MCE_KILL_SET:
1554 current->flags |= PF_MCE_PROCESS;
1555 if (arg3 == PR_MCE_KILL_EARLY)
1556 current->flags |= PF_MCE_EARLY;
1557 else if (arg3 == PR_MCE_KILL_LATE)
1558 current->flags &= ~PF_MCE_EARLY;
1559 else if (arg3 == PR_MCE_KILL_DEFAULT)
1560 current->flags &=
1561 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
1562 else
1563 return -EINVAL;
1564 break;
1565 default:
1566 return -EINVAL;
1567 }
1568 error = 0;
1569 break;
1570 case PR_MCE_KILL_GET:
1571 if (arg2 | arg3 | arg4 | arg5)
1572 return -EINVAL;
1573 if (current->flags & PF_MCE_PROCESS)
1574 error = (current->flags & PF_MCE_EARLY) ?
1575 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
1576 else
1577 error = PR_MCE_KILL_DEFAULT;
1578 break;
1545 default: 1579 default:
1546 error = -EINVAL; 1580 error = -EINVAL;
1547 break; 1581 break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 515bc230ac2a..e06d0b8d1951 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -49,6 +49,7 @@ cond_syscall(sys_sendmsg);
49cond_syscall(compat_sys_sendmsg); 49cond_syscall(compat_sys_sendmsg);
50cond_syscall(sys_recvmsg); 50cond_syscall(sys_recvmsg);
51cond_syscall(compat_sys_recvmsg); 51cond_syscall(compat_sys_recvmsg);
52cond_syscall(compat_sys_recvfrom);
52cond_syscall(sys_socketcall); 53cond_syscall(sys_socketcall);
53cond_syscall(sys_futex); 54cond_syscall(sys_futex);
54cond_syscall(compat_sys_futex); 55cond_syscall(compat_sys_futex);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0dfaa47d7cb6..4dbf93a52ee9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -26,7 +26,6 @@
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h>
30#include <linux/kmemcheck.h> 29#include <linux/kmemcheck.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/fs.h> 31#include <linux/fs.h>
@@ -37,6 +36,7 @@
37#include <linux/sysrq.h> 36#include <linux/sysrq.h>
38#include <linux/highuid.h> 37#include <linux/highuid.h>
39#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/ratelimit.h>
40#include <linux/hugetlb.h> 40#include <linux/hugetlb.h>
41#include <linux/initrd.h> 41#include <linux/initrd.h>
42#include <linux/key.h> 42#include <linux/key.h>
@@ -77,6 +77,7 @@ extern int max_threads;
77extern int core_uses_pid; 77extern int core_uses_pid;
78extern int suid_dumpable; 78extern int suid_dumpable;
79extern char core_pattern[]; 79extern char core_pattern[];
80extern unsigned int core_pipe_limit;
80extern int pid_max; 81extern int pid_max;
81extern int min_free_kbytes; 82extern int min_free_kbytes;
82extern int pid_max_min, pid_max_max; 83extern int pid_max_min, pid_max_max;
@@ -158,14 +159,16 @@ extern int no_unaligned_warning;
158extern int unaligned_dump_stack; 159extern int unaligned_dump_stack;
159#endif 160#endif
160 161
162extern struct ratelimit_state printk_ratelimit_state;
163
161#ifdef CONFIG_RT_MUTEXES 164#ifdef CONFIG_RT_MUTEXES
162extern int max_lock_depth; 165extern int max_lock_depth;
163#endif 166#endif
164 167
165#ifdef CONFIG_PROC_SYSCTL 168#ifdef CONFIG_PROC_SYSCTL
166static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 169static int proc_do_cad_pid(struct ctl_table *table, int write,
167 void __user *buffer, size_t *lenp, loff_t *ppos); 170 void __user *buffer, size_t *lenp, loff_t *ppos);
168static int proc_taint(struct ctl_table *table, int write, struct file *filp, 171static int proc_taint(struct ctl_table *table, int write,
169 void __user *buffer, size_t *lenp, loff_t *ppos); 172 void __user *buffer, size_t *lenp, loff_t *ppos);
170#endif 173#endif
171 174
@@ -424,6 +427,14 @@ static struct ctl_table kern_table[] = {
424 .proc_handler = &proc_dostring, 427 .proc_handler = &proc_dostring,
425 .strategy = &sysctl_string, 428 .strategy = &sysctl_string,
426 }, 429 },
430 {
431 .ctl_name = CTL_UNNUMBERED,
432 .procname = "core_pipe_limit",
433 .data = &core_pipe_limit,
434 .maxlen = sizeof(unsigned int),
435 .mode = 0644,
436 .proc_handler = &proc_dointvec,
437 },
427#ifdef CONFIG_PROC_SYSCTL 438#ifdef CONFIG_PROC_SYSCTL
428 { 439 {
429 .procname = "tainted", 440 .procname = "tainted",
@@ -1390,6 +1401,31 @@ static struct ctl_table vm_table[] = {
1390 .mode = 0644, 1401 .mode = 0644,
1391 .proc_handler = &scan_unevictable_handler, 1402 .proc_handler = &scan_unevictable_handler,
1392 }, 1403 },
1404#ifdef CONFIG_MEMORY_FAILURE
1405 {
1406 .ctl_name = CTL_UNNUMBERED,
1407 .procname = "memory_failure_early_kill",
1408 .data = &sysctl_memory_failure_early_kill,
1409 .maxlen = sizeof(sysctl_memory_failure_early_kill),
1410 .mode = 0644,
1411 .proc_handler = &proc_dointvec_minmax,
1412 .strategy = &sysctl_intvec,
1413 .extra1 = &zero,
1414 .extra2 = &one,
1415 },
1416 {
1417 .ctl_name = CTL_UNNUMBERED,
1418 .procname = "memory_failure_recovery",
1419 .data = &sysctl_memory_failure_recovery,
1420 .maxlen = sizeof(sysctl_memory_failure_recovery),
1421 .mode = 0644,
1422 .proc_handler = &proc_dointvec_minmax,
1423 .strategy = &sysctl_intvec,
1424 .extra1 = &zero,
1425 .extra2 = &one,
1426 },
1427#endif
1428
1393/* 1429/*
1394 * NOTE: do not add new entries to this table unless you have read 1430 * NOTE: do not add new entries to this table unless you have read
1395 * Documentation/sysctl/ctl_unnumbered.txt 1431 * Documentation/sysctl/ctl_unnumbered.txt
@@ -2218,7 +2254,7 @@ void sysctl_head_put(struct ctl_table_header *head)
2218#ifdef CONFIG_PROC_SYSCTL 2254#ifdef CONFIG_PROC_SYSCTL
2219 2255
2220static int _proc_do_string(void* data, int maxlen, int write, 2256static int _proc_do_string(void* data, int maxlen, int write,
2221 struct file *filp, void __user *buffer, 2257 void __user *buffer,
2222 size_t *lenp, loff_t *ppos) 2258 size_t *lenp, loff_t *ppos)
2223{ 2259{
2224 size_t len; 2260 size_t len;
@@ -2279,7 +2315,6 @@ static int _proc_do_string(void* data, int maxlen, int write,
2279 * proc_dostring - read a string sysctl 2315 * proc_dostring - read a string sysctl
2280 * @table: the sysctl table 2316 * @table: the sysctl table
2281 * @write: %TRUE if this is a write to the sysctl file 2317 * @write: %TRUE if this is a write to the sysctl file
2282 * @filp: the file structure
2283 * @buffer: the user buffer 2318 * @buffer: the user buffer
2284 * @lenp: the size of the user buffer 2319 * @lenp: the size of the user buffer
2285 * @ppos: file position 2320 * @ppos: file position
@@ -2293,10 +2328,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
2293 * 2328 *
2294 * Returns 0 on success. 2329 * Returns 0 on success.
2295 */ 2330 */
2296int proc_dostring(struct ctl_table *table, int write, struct file *filp, 2331int proc_dostring(struct ctl_table *table, int write,
2297 void __user *buffer, size_t *lenp, loff_t *ppos) 2332 void __user *buffer, size_t *lenp, loff_t *ppos)
2298{ 2333{
2299 return _proc_do_string(table->data, table->maxlen, write, filp, 2334 return _proc_do_string(table->data, table->maxlen, write,
2300 buffer, lenp, ppos); 2335 buffer, lenp, ppos);
2301} 2336}
2302 2337
@@ -2321,7 +2356,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
2321} 2356}
2322 2357
2323static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, 2358static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2324 int write, struct file *filp, void __user *buffer, 2359 int write, void __user *buffer,
2325 size_t *lenp, loff_t *ppos, 2360 size_t *lenp, loff_t *ppos,
2326 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2361 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
2327 int write, void *data), 2362 int write, void *data),
@@ -2428,13 +2463,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2428#undef TMPBUFLEN 2463#undef TMPBUFLEN
2429} 2464}
2430 2465
2431static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2466static int do_proc_dointvec(struct ctl_table *table, int write,
2432 void __user *buffer, size_t *lenp, loff_t *ppos, 2467 void __user *buffer, size_t *lenp, loff_t *ppos,
2433 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2468 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
2434 int write, void *data), 2469 int write, void *data),
2435 void *data) 2470 void *data)
2436{ 2471{
2437 return __do_proc_dointvec(table->data, table, write, filp, 2472 return __do_proc_dointvec(table->data, table, write,
2438 buffer, lenp, ppos, conv, data); 2473 buffer, lenp, ppos, conv, data);
2439} 2474}
2440 2475
@@ -2442,7 +2477,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
2442 * proc_dointvec - read a vector of integers 2477 * proc_dointvec - read a vector of integers
2443 * @table: the sysctl table 2478 * @table: the sysctl table
2444 * @write: %TRUE if this is a write to the sysctl file 2479 * @write: %TRUE if this is a write to the sysctl file
2445 * @filp: the file structure
2446 * @buffer: the user buffer 2480 * @buffer: the user buffer
2447 * @lenp: the size of the user buffer 2481 * @lenp: the size of the user buffer
2448 * @ppos: file position 2482 * @ppos: file position
@@ -2452,10 +2486,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
2452 * 2486 *
2453 * Returns 0 on success. 2487 * Returns 0 on success.
2454 */ 2488 */
2455int proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2489int proc_dointvec(struct ctl_table *table, int write,
2456 void __user *buffer, size_t *lenp, loff_t *ppos) 2490 void __user *buffer, size_t *lenp, loff_t *ppos)
2457{ 2491{
2458 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2492 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2459 NULL,NULL); 2493 NULL,NULL);
2460} 2494}
2461 2495
@@ -2463,7 +2497,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2463 * Taint values can only be increased 2497 * Taint values can only be increased
2464 * This means we can safely use a temporary. 2498 * This means we can safely use a temporary.
2465 */ 2499 */
2466static int proc_taint(struct ctl_table *table, int write, struct file *filp, 2500static int proc_taint(struct ctl_table *table, int write,
2467 void __user *buffer, size_t *lenp, loff_t *ppos) 2501 void __user *buffer, size_t *lenp, loff_t *ppos)
2468{ 2502{
2469 struct ctl_table t; 2503 struct ctl_table t;
@@ -2475,7 +2509,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp,
2475 2509
2476 t = *table; 2510 t = *table;
2477 t.data = &tmptaint; 2511 t.data = &tmptaint;
2478 err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); 2512 err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
2479 if (err < 0) 2513 if (err < 0)
2480 return err; 2514 return err;
2481 2515
@@ -2527,7 +2561,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2527 * proc_dointvec_minmax - read a vector of integers with min/max values 2561 * proc_dointvec_minmax - read a vector of integers with min/max values
2528 * @table: the sysctl table 2562 * @table: the sysctl table
2529 * @write: %TRUE if this is a write to the sysctl file 2563 * @write: %TRUE if this is a write to the sysctl file
2530 * @filp: the file structure
2531 * @buffer: the user buffer 2564 * @buffer: the user buffer
2532 * @lenp: the size of the user buffer 2565 * @lenp: the size of the user buffer
2533 * @ppos: file position 2566 * @ppos: file position
@@ -2540,19 +2573,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2540 * 2573 *
2541 * Returns 0 on success. 2574 * Returns 0 on success.
2542 */ 2575 */
2543int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2576int proc_dointvec_minmax(struct ctl_table *table, int write,
2544 void __user *buffer, size_t *lenp, loff_t *ppos) 2577 void __user *buffer, size_t *lenp, loff_t *ppos)
2545{ 2578{
2546 struct do_proc_dointvec_minmax_conv_param param = { 2579 struct do_proc_dointvec_minmax_conv_param param = {
2547 .min = (int *) table->extra1, 2580 .min = (int *) table->extra1,
2548 .max = (int *) table->extra2, 2581 .max = (int *) table->extra2,
2549 }; 2582 };
2550 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2583 return do_proc_dointvec(table, write, buffer, lenp, ppos,
2551 do_proc_dointvec_minmax_conv, &param); 2584 do_proc_dointvec_minmax_conv, &param);
2552} 2585}
2553 2586
2554static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2587static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2555 struct file *filp,
2556 void __user *buffer, 2588 void __user *buffer,
2557 size_t *lenp, loff_t *ppos, 2589 size_t *lenp, loff_t *ppos,
2558 unsigned long convmul, 2590 unsigned long convmul,
@@ -2657,21 +2689,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2657} 2689}
2658 2690
2659static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, 2691static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2660 struct file *filp,
2661 void __user *buffer, 2692 void __user *buffer,
2662 size_t *lenp, loff_t *ppos, 2693 size_t *lenp, loff_t *ppos,
2663 unsigned long convmul, 2694 unsigned long convmul,
2664 unsigned long convdiv) 2695 unsigned long convdiv)
2665{ 2696{
2666 return __do_proc_doulongvec_minmax(table->data, table, write, 2697 return __do_proc_doulongvec_minmax(table->data, table, write,
2667 filp, buffer, lenp, ppos, convmul, convdiv); 2698 buffer, lenp, ppos, convmul, convdiv);
2668} 2699}
2669 2700
2670/** 2701/**
2671 * proc_doulongvec_minmax - read a vector of long integers with min/max values 2702 * proc_doulongvec_minmax - read a vector of long integers with min/max values
2672 * @table: the sysctl table 2703 * @table: the sysctl table
2673 * @write: %TRUE if this is a write to the sysctl file 2704 * @write: %TRUE if this is a write to the sysctl file
2674 * @filp: the file structure
2675 * @buffer: the user buffer 2705 * @buffer: the user buffer
2676 * @lenp: the size of the user buffer 2706 * @lenp: the size of the user buffer
2677 * @ppos: file position 2707 * @ppos: file position
@@ -2684,17 +2714,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2684 * 2714 *
2685 * Returns 0 on success. 2715 * Returns 0 on success.
2686 */ 2716 */
2687int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, 2717int proc_doulongvec_minmax(struct ctl_table *table, int write,
2688 void __user *buffer, size_t *lenp, loff_t *ppos) 2718 void __user *buffer, size_t *lenp, loff_t *ppos)
2689{ 2719{
2690 return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); 2720 return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
2691} 2721}
2692 2722
2693/** 2723/**
2694 * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values 2724 * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
2695 * @table: the sysctl table 2725 * @table: the sysctl table
2696 * @write: %TRUE if this is a write to the sysctl file 2726 * @write: %TRUE if this is a write to the sysctl file
2697 * @filp: the file structure
2698 * @buffer: the user buffer 2727 * @buffer: the user buffer
2699 * @lenp: the size of the user buffer 2728 * @lenp: the size of the user buffer
2700 * @ppos: file position 2729 * @ppos: file position
@@ -2709,11 +2738,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp
2709 * Returns 0 on success. 2738 * Returns 0 on success.
2710 */ 2739 */
2711int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, 2740int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2712 struct file *filp,
2713 void __user *buffer, 2741 void __user *buffer,
2714 size_t *lenp, loff_t *ppos) 2742 size_t *lenp, loff_t *ppos)
2715{ 2743{
2716 return do_proc_doulongvec_minmax(table, write, filp, buffer, 2744 return do_proc_doulongvec_minmax(table, write, buffer,
2717 lenp, ppos, HZ, 1000l); 2745 lenp, ppos, HZ, 1000l);
2718} 2746}
2719 2747
@@ -2789,7 +2817,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2789 * proc_dointvec_jiffies - read a vector of integers as seconds 2817 * proc_dointvec_jiffies - read a vector of integers as seconds
2790 * @table: the sysctl table 2818 * @table: the sysctl table
2791 * @write: %TRUE if this is a write to the sysctl file 2819 * @write: %TRUE if this is a write to the sysctl file
2792 * @filp: the file structure
2793 * @buffer: the user buffer 2820 * @buffer: the user buffer
2794 * @lenp: the size of the user buffer 2821 * @lenp: the size of the user buffer
2795 * @ppos: file position 2822 * @ppos: file position
@@ -2801,10 +2828,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2801 * 2828 *
2802 * Returns 0 on success. 2829 * Returns 0 on success.
2803 */ 2830 */
2804int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, 2831int proc_dointvec_jiffies(struct ctl_table *table, int write,
2805 void __user *buffer, size_t *lenp, loff_t *ppos) 2832 void __user *buffer, size_t *lenp, loff_t *ppos)
2806{ 2833{
2807 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2834 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2808 do_proc_dointvec_jiffies_conv,NULL); 2835 do_proc_dointvec_jiffies_conv,NULL);
2809} 2836}
2810 2837
@@ -2812,7 +2839,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2812 * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds 2839 * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
2813 * @table: the sysctl table 2840 * @table: the sysctl table
2814 * @write: %TRUE if this is a write to the sysctl file 2841 * @write: %TRUE if this is a write to the sysctl file
2815 * @filp: the file structure
2816 * @buffer: the user buffer 2842 * @buffer: the user buffer
2817 * @lenp: the size of the user buffer 2843 * @lenp: the size of the user buffer
2818 * @ppos: pointer to the file position 2844 * @ppos: pointer to the file position
@@ -2824,10 +2850,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2824 * 2850 *
2825 * Returns 0 on success. 2851 * Returns 0 on success.
2826 */ 2852 */
2827int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, 2853int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
2828 void __user *buffer, size_t *lenp, loff_t *ppos) 2854 void __user *buffer, size_t *lenp, loff_t *ppos)
2829{ 2855{
2830 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2856 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2831 do_proc_dointvec_userhz_jiffies_conv,NULL); 2857 do_proc_dointvec_userhz_jiffies_conv,NULL);
2832} 2858}
2833 2859
@@ -2835,7 +2861,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
2835 * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds 2861 * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
2836 * @table: the sysctl table 2862 * @table: the sysctl table
2837 * @write: %TRUE if this is a write to the sysctl file 2863 * @write: %TRUE if this is a write to the sysctl file
2838 * @filp: the file structure
2839 * @buffer: the user buffer 2864 * @buffer: the user buffer
2840 * @lenp: the size of the user buffer 2865 * @lenp: the size of the user buffer
2841 * @ppos: file position 2866 * @ppos: file position
@@ -2848,14 +2873,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
2848 * 2873 *
2849 * Returns 0 on success. 2874 * Returns 0 on success.
2850 */ 2875 */
2851int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, 2876int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
2852 void __user *buffer, size_t *lenp, loff_t *ppos) 2877 void __user *buffer, size_t *lenp, loff_t *ppos)
2853{ 2878{
2854 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2879 return do_proc_dointvec(table, write, buffer, lenp, ppos,
2855 do_proc_dointvec_ms_jiffies_conv, NULL); 2880 do_proc_dointvec_ms_jiffies_conv, NULL);
2856} 2881}
2857 2882
2858static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 2883static int proc_do_cad_pid(struct ctl_table *table, int write,
2859 void __user *buffer, size_t *lenp, loff_t *ppos) 2884 void __user *buffer, size_t *lenp, loff_t *ppos)
2860{ 2885{
2861 struct pid *new_pid; 2886 struct pid *new_pid;
@@ -2864,7 +2889,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2864 2889
2865 tmp = pid_vnr(cad_pid); 2890 tmp = pid_vnr(cad_pid);
2866 2891
2867 r = __do_proc_dointvec(&tmp, table, write, filp, buffer, 2892 r = __do_proc_dointvec(&tmp, table, write, buffer,
2868 lenp, ppos, NULL, NULL); 2893 lenp, ppos, NULL, NULL);
2869 if (r || !write) 2894 if (r || !write)
2870 return r; 2895 return r;
@@ -2879,50 +2904,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2879 2904
2880#else /* CONFIG_PROC_FS */ 2905#else /* CONFIG_PROC_FS */
2881 2906
2882int proc_dostring(struct ctl_table *table, int write, struct file *filp, 2907int proc_dostring(struct ctl_table *table, int write,
2883 void __user *buffer, size_t *lenp, loff_t *ppos) 2908 void __user *buffer, size_t *lenp, loff_t *ppos)
2884{ 2909{
2885 return -ENOSYS; 2910 return -ENOSYS;
2886} 2911}
2887 2912
2888int proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2913int proc_dointvec(struct ctl_table *table, int write,
2889 void __user *buffer, size_t *lenp, loff_t *ppos) 2914 void __user *buffer, size_t *lenp, loff_t *ppos)
2890{ 2915{
2891 return -ENOSYS; 2916 return -ENOSYS;
2892} 2917}
2893 2918
2894int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2919int proc_dointvec_minmax(struct ctl_table *table, int write,
2895 void __user *buffer, size_t *lenp, loff_t *ppos) 2920 void __user *buffer, size_t *lenp, loff_t *ppos)
2896{ 2921{
2897 return -ENOSYS; 2922 return -ENOSYS;
2898} 2923}
2899 2924
2900int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, 2925int proc_dointvec_jiffies(struct ctl_table *table, int write,
2901 void __user *buffer, size_t *lenp, loff_t *ppos) 2926 void __user *buffer, size_t *lenp, loff_t *ppos)
2902{ 2927{
2903 return -ENOSYS; 2928 return -ENOSYS;
2904} 2929}
2905 2930
2906int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, 2931int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
2907 void __user *buffer, size_t *lenp, loff_t *ppos) 2932 void __user *buffer, size_t *lenp, loff_t *ppos)
2908{ 2933{
2909 return -ENOSYS; 2934 return -ENOSYS;
2910} 2935}
2911 2936
2912int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, 2937int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
2913 void __user *buffer, size_t *lenp, loff_t *ppos) 2938 void __user *buffer, size_t *lenp, loff_t *ppos)
2914{ 2939{
2915 return -ENOSYS; 2940 return -ENOSYS;
2916} 2941}
2917 2942
2918int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, 2943int proc_doulongvec_minmax(struct ctl_table *table, int write,
2919 void __user *buffer, size_t *lenp, loff_t *ppos) 2944 void __user *buffer, size_t *lenp, loff_t *ppos)
2920{ 2945{
2921 return -ENOSYS; 2946 return -ENOSYS;
2922} 2947}
2923 2948
2924int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, 2949int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2925 struct file *filp,
2926 void __user *buffer, 2950 void __user *buffer,
2927 size_t *lenp, loff_t *ppos) 2951 size_t *lenp, loff_t *ppos)
2928{ 2952{
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b38423ca711a..b6e7aaea4604 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1521,7 +1521,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1521 if (!table->ctl_name && table->strategy) 1521 if (!table->ctl_name && table->strategy)
1522 set_fail(&fail, table, "Strategy without ctl_name"); 1522 set_fail(&fail, table, "Strategy without ctl_name");
1523#endif 1523#endif
1524#ifdef CONFIG_PROC_FS 1524#ifdef CONFIG_PROC_SYSCTL
1525 if (table->procname && !table->proc_handler) 1525 if (table->procname && !table->proc_handler)
1526 set_fail(&fail, table, "No proc_handler"); 1526 set_fail(&fail, table, "No proc_handler");
1527#endif 1527#endif
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469a7fec..804798005d19 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -662,6 +662,36 @@ u64 nsec_to_clock_t(u64 x)
662#endif 662#endif
663} 663}
664 664
665/**
666 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
667 *
668 * @n: nsecs in u64
669 *
670 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
671 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
672 * for scheduler, not for use in device drivers to calculate timeout value.
673 *
674 * note:
675 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
676 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
677 */
678unsigned long nsecs_to_jiffies(u64 n)
679{
680#if (NSEC_PER_SEC % HZ) == 0
681 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
682 return div_u64(n, NSEC_PER_SEC / HZ);
683#elif (HZ % 512) == 0
684 /* overflow after 292 years if HZ = 1024 */
685 return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
686#else
687 /*
688 * Generic case - optimized for cases where HZ is a multiple of 3.
689 * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
690 */
691 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
692#endif
693}
694
665#if (BITS_PER_LONG < 64) 695#if (BITS_PER_LONG < 64)
666u64 get_jiffies_64(void) 696u64 get_jiffies_64(void)
667{ 697{
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 0b0a6366c9d4..ee266620b06c 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 09113347d328..5e18c6ab2c6a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -394,15 +394,11 @@ void clocksource_resume(void)
394{ 394{
395 struct clocksource *cs; 395 struct clocksource *cs;
396 396
397 mutex_lock(&clocksource_mutex);
398
399 list_for_each_entry(cs, &clocksource_list, list) 397 list_for_each_entry(cs, &clocksource_list, list)
400 if (cs->resume) 398 if (cs->resume)
401 cs->resume(); 399 cs->resume();
402 400
403 clocksource_resume_watchdog(); 401 clocksource_resume_watchdog();
404
405 mutex_unlock(&clocksource_mutex);
406} 402}
407 403
408/** 404/**
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e0f59a21c061..89aed5933ed4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -231,6 +231,13 @@ void tick_nohz_stop_sched_tick(int inidle)
231 if (!inidle && !ts->inidle) 231 if (!inidle && !ts->inidle)
232 goto end; 232 goto end;
233 233
234 /*
235 * Set ts->inidle unconditionally. Even if the system did not
236 * switch to NOHZ mode the cpu frequency governers rely on the
237 * update of the idle time accounting in tick_nohz_start_idle().
238 */
239 ts->inidle = 1;
240
234 now = tick_nohz_start_idle(ts); 241 now = tick_nohz_start_idle(ts);
235 242
236 /* 243 /*
@@ -248,8 +255,6 @@ void tick_nohz_stop_sched_tick(int inidle)
248 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 255 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
249 goto end; 256 goto end;
250 257
251 ts->inidle = 1;
252
253 if (need_resched()) 258 if (need_resched())
254 goto end; 259 goto end;
255 260
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
new file mode 100644
index 000000000000..86628e755f38
--- /dev/null
+++ b/kernel/time/timeconv.c
@@ -0,0 +1,127 @@
1/*
2 * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
3 * This file is part of the GNU C Library.
4 * Contributed by Paul Eggert (eggert@twinsun.com).
5 *
6 * The GNU C Library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
10 *
11 * The GNU C Library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public
17 * License along with the GNU C Library; see the file COPYING.LIB. If not,
18 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22/*
23 * Converts the calendar time to broken-down time representation
24 * Based on code from glibc-2.6
25 *
26 * 2009-7-14:
27 * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
28 */
29
30#include <linux/time.h>
31#include <linux/module.h>
32
33/*
34 * Nonzero if YEAR is a leap year (every 4 years,
35 * except every 100th isn't, and every 400th is).
36 */
37static int __isleap(long year)
38{
39 return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0);
40}
41
42/* do a mathdiv for long type */
43static long math_div(long a, long b)
44{
45 return a / b - (a % b < 0);
46}
47
48/* How many leap years between y1 and y2, y1 must less or equal to y2 */
49static long leaps_between(long y1, long y2)
50{
51 long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100)
52 + math_div(y1 - 1, 400);
53 long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100)
54 + math_div(y2 - 1, 400);
55 return leaps2 - leaps1;
56}
57
58/* How many days come before each month (0-12). */
59static const unsigned short __mon_yday[2][13] = {
60 /* Normal years. */
61 {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
62 /* Leap years. */
63 {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
64};
65
66#define SECS_PER_HOUR (60 * 60)
67#define SECS_PER_DAY (SECS_PER_HOUR * 24)
68
69/**
70 * time_to_tm - converts the calendar time to local broken-down time
71 *
72 * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970,
73 * Coordinated Universal Time (UTC).
74 * @offset offset seconds adding to totalsecs.
75 * @result pointer to struct tm variable to receive broken-down time
76 */
77void time_to_tm(time_t totalsecs, int offset, struct tm *result)
78{
79 long days, rem, y;
80 const unsigned short *ip;
81
82 days = totalsecs / SECS_PER_DAY;
83 rem = totalsecs % SECS_PER_DAY;
84 rem += offset;
85 while (rem < 0) {
86 rem += SECS_PER_DAY;
87 --days;
88 }
89 while (rem >= SECS_PER_DAY) {
90 rem -= SECS_PER_DAY;
91 ++days;
92 }
93
94 result->tm_hour = rem / SECS_PER_HOUR;
95 rem %= SECS_PER_HOUR;
96 result->tm_min = rem / 60;
97 result->tm_sec = rem % 60;
98
99 /* January 1, 1970 was a Thursday. */
100 result->tm_wday = (4 + days) % 7;
101 if (result->tm_wday < 0)
102 result->tm_wday += 7;
103
104 y = 1970;
105
106 while (days < 0 || days >= (__isleap(y) ? 366 : 365)) {
107 /* Guess a corrected year, assuming 365 days per year. */
108 long yg = y + math_div(days, 365);
109
110 /* Adjust DAYS and Y to match the guessed year. */
111 days -= (yg - y) * 365 + leaps_between(y, yg);
112 y = yg;
113 }
114
115 result->tm_year = y - 1900;
116
117 result->tm_yday = days;
118
119 ip = __mon_yday[__isleap(y)];
120 for (y = 11; days < ip[y]; y--)
121 continue;
122 days -= ip[y];
123
124 result->tm_mon = y;
125 result->tm_mday = days + 1;
126}
127EXPORT_SYMBOL(time_to_tm);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fb0f46fa1ecd..c3a4e2907eaa 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -13,6 +13,7 @@
13#include <linux/percpu.h> 13#include <linux/percpu.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/sched.h>
16#include <linux/sysdev.h> 17#include <linux/sysdev.h>
17#include <linux/clocksource.h> 18#include <linux/clocksource.h>
18#include <linux/jiffies.h> 19#include <linux/jiffies.h>
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index fddd69d16e03..1b5b7aa2fdfd 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -275,7 +275,7 @@ static int timer_list_open(struct inode *inode, struct file *filp)
275 return single_open(filp, timer_list_show, NULL); 275 return single_open(filp, timer_list_show, NULL);
276} 276}
277 277
278static struct file_operations timer_list_fops = { 278static const struct file_operations timer_list_fops = {
279 .open = timer_list_open, 279 .open = timer_list_open,
280 .read = seq_read, 280 .read = seq_read,
281 .llseek = seq_lseek, 281 .llseek = seq_lseek,
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 4cde8b9c716f..ee5681f8d7ec 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -395,7 +395,7 @@ static int tstats_open(struct inode *inode, struct file *filp)
395 return single_open(filp, tstats_show, NULL); 395 return single_open(filp, tstats_show, NULL);
396} 396}
397 397
398static struct file_operations tstats_fops = { 398static const struct file_operations tstats_fops = {
399 .open = tstats_open, 399 .open = tstats_open,
400 .read = seq_read, 400 .read = seq_read,
401 .write = tstats_write, 401 .write = tstats_write,
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b416512ad17f..d006554888dc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -339,6 +339,27 @@ config POWER_TRACER
339 power management decisions, specifically the C-state and P-state 339 power management decisions, specifically the C-state and P-state
340 behavior. 340 behavior.
341 341
342config KSYM_TRACER
343 bool "Trace read and write access on kernel memory locations"
344 depends on HAVE_HW_BREAKPOINT
345 select TRACING
346 help
347 This tracer helps find read and write operations on any given kernel
348 symbol i.e. /proc/kallsyms.
349
350config PROFILE_KSYM_TRACER
351 bool "Profile all kernel memory accesses on 'watched' variables"
352 depends on KSYM_TRACER
353 help
354 This tracer profiles kernel accesses on variables watched through the
355 ksym tracer ftrace plugin. Depending upon the hardware, all read
356 and write operations on kernel variables can be monitored for
357 accesses.
358
359 The results will be displayed in:
360 /debugfs/tracing/profile_ksym
361
362 Say N if unsure.
342 363
343config STACK_TRACER 364config STACK_TRACER
344 bool "Trace max stack" 365 bool "Trace max stack"
@@ -428,6 +449,23 @@ config BLK_DEV_IO_TRACE
428 449
429 If unsure, say N. 450 If unsure, say N.
430 451
452config KPROBE_EVENT
453 depends on KPROBES
454 depends on X86
455 bool "Enable kprobes-based dynamic events"
456 select TRACING
457 default y
458 help
459 This allows the user to add tracing events (similar to tracepoints) on the fly
460 via the ftrace interface. See Documentation/trace/kprobetrace.txt
461 for more details.
462
463 Those events can be inserted wherever kprobes can probe, and record
464 various register and memory values.
465
466 This option is also required by perf-probe subcommand of perf tools. If
467 you want to use perf tools, this option is strongly recommended.
468
431config DYNAMIC_FTRACE 469config DYNAMIC_FTRACE
432 bool "enable/disable ftrace tracepoints dynamically" 470 bool "enable/disable ftrace tracepoints dynamically"
433 depends on FUNCTION_TRACER 471 depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 26f03ac07c2b..cd9ecd89ec77 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,8 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
56obj-$(CONFIG_EVENT_TRACING) += power-traces.o 58obj-$(CONFIG_EVENT_TRACING) += power-traces.o
57 59
58libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3eb159c277c8..d9d6206e0b14 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -856,6 +856,37 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
856} 856}
857 857
858/** 858/**
859 * blk_add_trace_rq_remap - Add a trace for a request-remap operation
860 * @q: queue the io is for
861 * @rq: the source request
862 * @dev: target device
863 * @from: source sector
864 *
865 * Description:
866 * Device mapper remaps request to other devices.
867 * Add a trace for that action.
868 *
869 **/
870static void blk_add_trace_rq_remap(struct request_queue *q,
871 struct request *rq, dev_t dev,
872 sector_t from)
873{
874 struct blk_trace *bt = q->blk_trace;
875 struct blk_io_trace_remap r;
876
877 if (likely(!bt))
878 return;
879
880 r.device_from = cpu_to_be32(dev);
881 r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
882 r.sector_from = cpu_to_be64(from);
883
884 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
885 rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
886 sizeof(r), &r);
887}
888
889/**
859 * blk_add_driver_data - Add binary message with driver-specific data 890 * blk_add_driver_data - Add binary message with driver-specific data
860 * @q: queue the io is for 891 * @q: queue the io is for
861 * @rq: io request 892 * @rq: io request
@@ -922,10 +953,13 @@ static void blk_register_tracepoints(void)
922 WARN_ON(ret); 953 WARN_ON(ret);
923 ret = register_trace_block_remap(blk_add_trace_remap); 954 ret = register_trace_block_remap(blk_add_trace_remap);
924 WARN_ON(ret); 955 WARN_ON(ret);
956 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap);
957 WARN_ON(ret);
925} 958}
926 959
927static void blk_unregister_tracepoints(void) 960static void blk_unregister_tracepoints(void)
928{ 961{
962 unregister_trace_block_rq_remap(blk_add_trace_rq_remap);
929 unregister_trace_block_remap(blk_add_trace_remap); 963 unregister_trace_block_remap(blk_add_trace_remap);
930 unregister_trace_block_split(blk_add_trace_split); 964 unregister_trace_block_split(blk_add_trace_split);
931 unregister_trace_block_unplug_io(blk_add_trace_unplug_io); 965 unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
@@ -1657,6 +1691,11 @@ int blk_trace_init_sysfs(struct device *dev)
1657 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); 1691 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
1658} 1692}
1659 1693
1694void blk_trace_remove_sysfs(struct device *dev)
1695{
1696 sysfs_remove_group(&dev->kobj, &blk_trace_attr_group);
1697}
1698
1660#endif /* CONFIG_BLK_DEV_IO_TRACE */ 1699#endif /* CONFIG_BLK_DEV_IO_TRACE */
1661 1700
1662#ifdef CONFIG_EVENT_TRACING 1701#ifdef CONFIG_EVENT_TRACING
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 23df7771c937..e51a1bcb7bed 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -60,6 +60,13 @@ static int last_ftrace_enabled;
60/* Quick disabling of function tracer. */ 60/* Quick disabling of function tracer. */
61int function_trace_stop; 61int function_trace_stop;
62 62
63/* List for set_ftrace_pid's pids. */
64LIST_HEAD(ftrace_pids);
65struct ftrace_pid {
66 struct list_head list;
67 struct pid *pid;
68};
69
63/* 70/*
64 * ftrace_disabled is set when an anomaly is discovered. 71 * ftrace_disabled is set when an anomaly is discovered.
65 * ftrace_disabled is much stronger than ftrace_enabled. 72 * ftrace_disabled is much stronger than ftrace_enabled.
@@ -78,6 +85,10 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
78ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 85ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
79ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 86ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
80 87
88#ifdef CONFIG_FUNCTION_GRAPH_TRACER
89static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
90#endif
91
81static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 92static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
82{ 93{
83 struct ftrace_ops *op = ftrace_list; 94 struct ftrace_ops *op = ftrace_list;
@@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
155 else 166 else
156 func = ftrace_list_func; 167 func = ftrace_list_func;
157 168
158 if (ftrace_pid_trace) { 169 if (!list_empty(&ftrace_pids)) {
159 set_ftrace_pid_function(func); 170 set_ftrace_pid_function(func);
160 func = ftrace_pid_func; 171 func = ftrace_pid_func;
161 } 172 }
@@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
203 if (ftrace_list->next == &ftrace_list_end) { 214 if (ftrace_list->next == &ftrace_list_end) {
204 ftrace_func_t func = ftrace_list->func; 215 ftrace_func_t func = ftrace_list->func;
205 216
206 if (ftrace_pid_trace) { 217 if (!list_empty(&ftrace_pids)) {
207 set_ftrace_pid_function(func); 218 set_ftrace_pid_function(func);
208 func = ftrace_pid_func; 219 func = ftrace_pid_func;
209 } 220 }
@@ -225,9 +236,13 @@ static void ftrace_update_pid_func(void)
225 if (ftrace_trace_function == ftrace_stub) 236 if (ftrace_trace_function == ftrace_stub)
226 return; 237 return;
227 238
239#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
228 func = ftrace_trace_function; 240 func = ftrace_trace_function;
241#else
242 func = __ftrace_trace_function;
243#endif
229 244
230 if (ftrace_pid_trace) { 245 if (!list_empty(&ftrace_pids)) {
231 set_ftrace_pid_function(func); 246 set_ftrace_pid_function(func);
232 func = ftrace_pid_func; 247 func = ftrace_pid_func;
233 } else { 248 } else {
@@ -736,7 +751,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
736 out: 751 out:
737 mutex_unlock(&ftrace_profile_lock); 752 mutex_unlock(&ftrace_profile_lock);
738 753
739 filp->f_pos += cnt; 754 *ppos += cnt;
740 755
741 return cnt; 756 return cnt;
742} 757}
@@ -817,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
817} 832}
818#endif /* CONFIG_FUNCTION_PROFILER */ 833#endif /* CONFIG_FUNCTION_PROFILER */
819 834
820/* set when tracing only a pid */
821struct pid *ftrace_pid_trace;
822static struct pid * const ftrace_swapper_pid = &init_struct_pid; 835static struct pid * const ftrace_swapper_pid = &init_struct_pid;
823 836
824#ifdef CONFIG_DYNAMIC_FTRACE 837#ifdef CONFIG_DYNAMIC_FTRACE
@@ -1074,14 +1087,9 @@ static void ftrace_replace_code(int enable)
1074 failed = __ftrace_replace_code(rec, enable); 1087 failed = __ftrace_replace_code(rec, enable);
1075 if (failed) { 1088 if (failed) {
1076 rec->flags |= FTRACE_FL_FAILED; 1089 rec->flags |= FTRACE_FL_FAILED;
1077 if ((system_state == SYSTEM_BOOTING) || 1090 ftrace_bug(failed, rec->ip);
1078 !core_kernel_text(rec->ip)) { 1091 /* Stop processing */
1079 ftrace_free_rec(rec); 1092 return;
1080 } else {
1081 ftrace_bug(failed, rec->ip);
1082 /* Stop processing */
1083 return;
1084 }
1085 } 1093 }
1086 } while_for_each_ftrace_rec(); 1094 } while_for_each_ftrace_rec();
1087} 1095}
@@ -1262,12 +1270,34 @@ static int ftrace_update_code(struct module *mod)
1262 ftrace_new_addrs = p->newlist; 1270 ftrace_new_addrs = p->newlist;
1263 p->flags = 0L; 1271 p->flags = 0L;
1264 1272
1265 /* convert record (i.e, patch mcount-call with NOP) */ 1273 /*
1266 if (ftrace_code_disable(mod, p)) { 1274 * Do the initial record convertion from mcount jump
1267 p->flags |= FTRACE_FL_CONVERTED; 1275 * to the NOP instructions.
1268 ftrace_update_cnt++; 1276 */
1269 } else 1277 if (!ftrace_code_disable(mod, p)) {
1270 ftrace_free_rec(p); 1278 ftrace_free_rec(p);
1279 continue;
1280 }
1281
1282 p->flags |= FTRACE_FL_CONVERTED;
1283 ftrace_update_cnt++;
1284
1285 /*
1286 * If the tracing is enabled, go ahead and enable the record.
1287 *
1288 * The reason not to enable the record immediatelly is the
1289 * inherent check of ftrace_make_nop/ftrace_make_call for
1290 * correct previous instructions. Making first the NOP
1291 * conversion puts the module to the correct state, thus
1292 * passing the ftrace_make_call check.
1293 */
1294 if (ftrace_start_up) {
1295 int failed = __ftrace_replace_code(p, 1);
1296 if (failed) {
1297 ftrace_bug(failed, p->ip);
1298 ftrace_free_rec(p);
1299 }
1300 }
1271 } 1301 }
1272 1302
1273 stop = ftrace_now(raw_smp_processor_id()); 1303 stop = ftrace_now(raw_smp_processor_id());
@@ -1621,8 +1651,10 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1621 if (!ret) { 1651 if (!ret) {
1622 struct seq_file *m = file->private_data; 1652 struct seq_file *m = file->private_data;
1623 m->private = iter; 1653 m->private = iter;
1624 } else 1654 } else {
1655 trace_parser_put(&iter->parser);
1625 kfree(iter); 1656 kfree(iter);
1657 }
1626 } else 1658 } else
1627 file->private_data = iter; 1659 file->private_data = iter;
1628 mutex_unlock(&ftrace_regex_lock); 1660 mutex_unlock(&ftrace_regex_lock);
@@ -1655,60 +1687,6 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1655 return ret; 1687 return ret;
1656} 1688}
1657 1689
1658enum {
1659 MATCH_FULL,
1660 MATCH_FRONT_ONLY,
1661 MATCH_MIDDLE_ONLY,
1662 MATCH_END_ONLY,
1663};
1664
1665/*
1666 * (static function - no need for kernel doc)
1667 *
1668 * Pass in a buffer containing a glob and this function will
1669 * set search to point to the search part of the buffer and
1670 * return the type of search it is (see enum above).
1671 * This does modify buff.
1672 *
1673 * Returns enum type.
1674 * search returns the pointer to use for comparison.
1675 * not returns 1 if buff started with a '!'
1676 * 0 otherwise.
1677 */
1678static int
1679ftrace_setup_glob(char *buff, int len, char **search, int *not)
1680{
1681 int type = MATCH_FULL;
1682 int i;
1683
1684 if (buff[0] == '!') {
1685 *not = 1;
1686 buff++;
1687 len--;
1688 } else
1689 *not = 0;
1690
1691 *search = buff;
1692
1693 for (i = 0; i < len; i++) {
1694 if (buff[i] == '*') {
1695 if (!i) {
1696 *search = buff + 1;
1697 type = MATCH_END_ONLY;
1698 } else {
1699 if (type == MATCH_END_ONLY)
1700 type = MATCH_MIDDLE_ONLY;
1701 else
1702 type = MATCH_FRONT_ONLY;
1703 buff[i] = 0;
1704 break;
1705 }
1706 }
1707 }
1708
1709 return type;
1710}
1711
1712static int ftrace_match(char *str, char *regex, int len, int type) 1690static int ftrace_match(char *str, char *regex, int len, int type)
1713{ 1691{
1714 int matched = 0; 1692 int matched = 0;
@@ -1757,7 +1735,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
1757 int not; 1735 int not;
1758 1736
1759 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1737 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1760 type = ftrace_setup_glob(buff, len, &search, &not); 1738 type = filter_parse_regex(buff, len, &search, &not);
1761 1739
1762 search_len = strlen(search); 1740 search_len = strlen(search);
1763 1741
@@ -1825,7 +1803,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1825 } 1803 }
1826 1804
1827 if (strlen(buff)) { 1805 if (strlen(buff)) {
1828 type = ftrace_setup_glob(buff, strlen(buff), &search, &not); 1806 type = filter_parse_regex(buff, strlen(buff), &search, &not);
1829 search_len = strlen(search); 1807 search_len = strlen(search);
1830 } 1808 }
1831 1809
@@ -1990,7 +1968,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1990 int count = 0; 1968 int count = 0;
1991 char *search; 1969 char *search;
1992 1970
1993 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 1971 type = filter_parse_regex(glob, strlen(glob), &search, &not);
1994 len = strlen(search); 1972 len = strlen(search);
1995 1973
1996 /* we do not support '!' for function probes */ 1974 /* we do not support '!' for function probes */
@@ -2067,7 +2045,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2067 else if (glob) { 2045 else if (glob) {
2068 int not; 2046 int not;
2069 2047
2070 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 2048 type = filter_parse_regex(glob, strlen(glob), &search, &not);
2071 len = strlen(search); 2049 len = strlen(search);
2072 2050
2073 /* we do not support '!' for function probes */ 2051 /* we do not support '!' for function probes */
@@ -2202,7 +2180,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2202 struct trace_parser *parser; 2180 struct trace_parser *parser;
2203 ssize_t ret, read; 2181 ssize_t ret, read;
2204 2182
2205 if (!cnt || cnt < 0) 2183 if (!cnt)
2206 return 0; 2184 return 0;
2207 2185
2208 mutex_lock(&ftrace_regex_lock); 2186 mutex_lock(&ftrace_regex_lock);
@@ -2216,20 +2194,20 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2216 parser = &iter->parser; 2194 parser = &iter->parser;
2217 read = trace_get_user(parser, ubuf, cnt, ppos); 2195 read = trace_get_user(parser, ubuf, cnt, ppos);
2218 2196
2219 if (trace_parser_loaded(parser) && 2197 if (read >= 0 && trace_parser_loaded(parser) &&
2220 !trace_parser_cont(parser)) { 2198 !trace_parser_cont(parser)) {
2221 ret = ftrace_process_regex(parser->buffer, 2199 ret = ftrace_process_regex(parser->buffer,
2222 parser->idx, enable); 2200 parser->idx, enable);
2223 if (ret) 2201 if (ret)
2224 goto out; 2202 goto out_unlock;
2225 2203
2226 trace_parser_clear(parser); 2204 trace_parser_clear(parser);
2227 } 2205 }
2228 2206
2229 ret = read; 2207 ret = read;
2230 2208out_unlock:
2231 mutex_unlock(&ftrace_regex_lock); 2209 mutex_unlock(&ftrace_regex_lock);
2232out: 2210
2233 return ret; 2211 return ret;
2234} 2212}
2235 2213
@@ -2311,6 +2289,32 @@ static int __init set_ftrace_filter(char *str)
2311} 2289}
2312__setup("ftrace_filter=", set_ftrace_filter); 2290__setup("ftrace_filter=", set_ftrace_filter);
2313 2291
2292#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2293static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2294static int __init set_graph_function(char *str)
2295{
2296 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
2297 return 1;
2298}
2299__setup("ftrace_graph_filter=", set_graph_function);
2300
2301static void __init set_ftrace_early_graph(char *buf)
2302{
2303 int ret;
2304 char *func;
2305
2306 while (buf) {
2307 func = strsep(&buf, ",");
2308 /* we allow only one expression at a time */
2309 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2310 func);
2311 if (ret)
2312 printk(KERN_DEBUG "ftrace: function %s not "
2313 "traceable\n", func);
2314 }
2315}
2316#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2317
2314static void __init set_ftrace_early_filter(char *buf, int enable) 2318static void __init set_ftrace_early_filter(char *buf, int enable)
2315{ 2319{
2316 char *func; 2320 char *func;
@@ -2327,6 +2331,10 @@ static void __init set_ftrace_early_filters(void)
2327 set_ftrace_early_filter(ftrace_filter_buf, 1); 2331 set_ftrace_early_filter(ftrace_filter_buf, 1);
2328 if (ftrace_notrace_buf[0]) 2332 if (ftrace_notrace_buf[0])
2329 set_ftrace_early_filter(ftrace_notrace_buf, 0); 2333 set_ftrace_early_filter(ftrace_notrace_buf, 0);
2334#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2335 if (ftrace_graph_buf[0])
2336 set_ftrace_early_graph(ftrace_graph_buf);
2337#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2330} 2338}
2331 2339
2332static int 2340static int
@@ -2512,7 +2520,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2512 return -ENODEV; 2520 return -ENODEV;
2513 2521
2514 /* decode regex */ 2522 /* decode regex */
2515 type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not); 2523 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2516 if (not) 2524 if (not)
2517 return -EINVAL; 2525 return -EINVAL;
2518 2526
@@ -2552,8 +2560,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2552 size_t cnt, loff_t *ppos) 2560 size_t cnt, loff_t *ppos)
2553{ 2561{
2554 struct trace_parser parser; 2562 struct trace_parser parser;
2555 size_t read = 0; 2563 ssize_t read, ret;
2556 ssize_t ret;
2557 2564
2558 if (!cnt || cnt < 0) 2565 if (!cnt || cnt < 0)
2559 return 0; 2566 return 0;
@@ -2562,29 +2569,31 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2562 2569
2563 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { 2570 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2564 ret = -EBUSY; 2571 ret = -EBUSY;
2565 goto out; 2572 goto out_unlock;
2566 } 2573 }
2567 2574
2568 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 2575 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2569 ret = -ENOMEM; 2576 ret = -ENOMEM;
2570 goto out; 2577 goto out_unlock;
2571 } 2578 }
2572 2579
2573 read = trace_get_user(&parser, ubuf, cnt, ppos); 2580 read = trace_get_user(&parser, ubuf, cnt, ppos);
2574 2581
2575 if (trace_parser_loaded((&parser))) { 2582 if (read >= 0 && trace_parser_loaded((&parser))) {
2576 parser.buffer[parser.idx] = 0; 2583 parser.buffer[parser.idx] = 0;
2577 2584
2578 /* we allow only one expression at a time */ 2585 /* we allow only one expression at a time */
2579 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, 2586 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2580 parser.buffer); 2587 parser.buffer);
2581 if (ret) 2588 if (ret)
2582 goto out; 2589 goto out_free;
2583 } 2590 }
2584 2591
2585 ret = read; 2592 ret = read;
2586 out: 2593
2594out_free:
2587 trace_parser_put(&parser); 2595 trace_parser_put(&parser);
2596out_unlock:
2588 mutex_unlock(&graph_lock); 2597 mutex_unlock(&graph_lock);
2589 2598
2590 return ret; 2599 return ret;
@@ -2622,7 +2631,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2622 return 0; 2631 return 0;
2623} 2632}
2624 2633
2625static int ftrace_convert_nops(struct module *mod, 2634static int ftrace_process_locs(struct module *mod,
2626 unsigned long *start, 2635 unsigned long *start,
2627 unsigned long *end) 2636 unsigned long *end)
2628{ 2637{
@@ -2655,19 +2664,17 @@ static int ftrace_convert_nops(struct module *mod,
2655} 2664}
2656 2665
2657#ifdef CONFIG_MODULES 2666#ifdef CONFIG_MODULES
2658void ftrace_release(void *start, void *end) 2667void ftrace_release_mod(struct module *mod)
2659{ 2668{
2660 struct dyn_ftrace *rec; 2669 struct dyn_ftrace *rec;
2661 struct ftrace_page *pg; 2670 struct ftrace_page *pg;
2662 unsigned long s = (unsigned long)start;
2663 unsigned long e = (unsigned long)end;
2664 2671
2665 if (ftrace_disabled || !start || start == end) 2672 if (ftrace_disabled)
2666 return; 2673 return;
2667 2674
2668 mutex_lock(&ftrace_lock); 2675 mutex_lock(&ftrace_lock);
2669 do_for_each_ftrace_rec(pg, rec) { 2676 do_for_each_ftrace_rec(pg, rec) {
2670 if ((rec->ip >= s) && (rec->ip < e)) { 2677 if (within_module_core(rec->ip, mod)) {
2671 /* 2678 /*
2672 * rec->ip is changed in ftrace_free_rec() 2679 * rec->ip is changed in ftrace_free_rec()
2673 * It should not between s and e if record was freed. 2680 * It should not between s and e if record was freed.
@@ -2684,7 +2691,7 @@ static void ftrace_init_module(struct module *mod,
2684{ 2691{
2685 if (ftrace_disabled || start == end) 2692 if (ftrace_disabled || start == end)
2686 return; 2693 return;
2687 ftrace_convert_nops(mod, start, end); 2694 ftrace_process_locs(mod, start, end);
2688} 2695}
2689 2696
2690static int ftrace_module_notify(struct notifier_block *self, 2697static int ftrace_module_notify(struct notifier_block *self,
@@ -2699,9 +2706,7 @@ static int ftrace_module_notify(struct notifier_block *self,
2699 mod->num_ftrace_callsites); 2706 mod->num_ftrace_callsites);
2700 break; 2707 break;
2701 case MODULE_STATE_GOING: 2708 case MODULE_STATE_GOING:
2702 ftrace_release(mod->ftrace_callsites, 2709 ftrace_release_mod(mod);
2703 mod->ftrace_callsites +
2704 mod->num_ftrace_callsites);
2705 break; 2710 break;
2706 } 2711 }
2707 2712
@@ -2747,7 +2752,7 @@ void __init ftrace_init(void)
2747 2752
2748 last_ftrace_enabled = ftrace_enabled = 1; 2753 last_ftrace_enabled = ftrace_enabled = 1;
2749 2754
2750 ret = ftrace_convert_nops(NULL, 2755 ret = ftrace_process_locs(NULL,
2751 __start_mcount_loc, 2756 __start_mcount_loc,
2752 __stop_mcount_loc); 2757 __stop_mcount_loc);
2753 2758
@@ -2780,23 +2785,6 @@ static inline void ftrace_startup_enable(int command) { }
2780# define ftrace_shutdown_sysctl() do { } while (0) 2785# define ftrace_shutdown_sysctl() do { } while (0)
2781#endif /* CONFIG_DYNAMIC_FTRACE */ 2786#endif /* CONFIG_DYNAMIC_FTRACE */
2782 2787
2783static ssize_t
2784ftrace_pid_read(struct file *file, char __user *ubuf,
2785 size_t cnt, loff_t *ppos)
2786{
2787 char buf[64];
2788 int r;
2789
2790 if (ftrace_pid_trace == ftrace_swapper_pid)
2791 r = sprintf(buf, "swapper tasks\n");
2792 else if (ftrace_pid_trace)
2793 r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
2794 else
2795 r = sprintf(buf, "no pid\n");
2796
2797 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2798}
2799
2800static void clear_ftrace_swapper(void) 2788static void clear_ftrace_swapper(void)
2801{ 2789{
2802 struct task_struct *p; 2790 struct task_struct *p;
@@ -2847,14 +2835,12 @@ static void set_ftrace_pid(struct pid *pid)
2847 rcu_read_unlock(); 2835 rcu_read_unlock();
2848} 2836}
2849 2837
2850static void clear_ftrace_pid_task(struct pid **pid) 2838static void clear_ftrace_pid_task(struct pid *pid)
2851{ 2839{
2852 if (*pid == ftrace_swapper_pid) 2840 if (pid == ftrace_swapper_pid)
2853 clear_ftrace_swapper(); 2841 clear_ftrace_swapper();
2854 else 2842 else
2855 clear_ftrace_pid(*pid); 2843 clear_ftrace_pid(pid);
2856
2857 *pid = NULL;
2858} 2844}
2859 2845
2860static void set_ftrace_pid_task(struct pid *pid) 2846static void set_ftrace_pid_task(struct pid *pid)
@@ -2865,74 +2851,184 @@ static void set_ftrace_pid_task(struct pid *pid)
2865 set_ftrace_pid(pid); 2851 set_ftrace_pid(pid);
2866} 2852}
2867 2853
2868static ssize_t 2854static int ftrace_pid_add(int p)
2869ftrace_pid_write(struct file *filp, const char __user *ubuf,
2870 size_t cnt, loff_t *ppos)
2871{ 2855{
2872 struct pid *pid; 2856 struct pid *pid;
2873 char buf[64]; 2857 struct ftrace_pid *fpid;
2874 long val; 2858 int ret = -EINVAL;
2875 int ret;
2876 2859
2877 if (cnt >= sizeof(buf)) 2860 mutex_lock(&ftrace_lock);
2878 return -EINVAL;
2879 2861
2880 if (copy_from_user(&buf, ubuf, cnt)) 2862 if (!p)
2881 return -EFAULT; 2863 pid = ftrace_swapper_pid;
2864 else
2865 pid = find_get_pid(p);
2882 2866
2883 buf[cnt] = 0; 2867 if (!pid)
2868 goto out;
2884 2869
2885 ret = strict_strtol(buf, 10, &val); 2870 ret = 0;
2886 if (ret < 0)
2887 return ret;
2888 2871
2889 mutex_lock(&ftrace_lock); 2872 list_for_each_entry(fpid, &ftrace_pids, list)
2890 if (val < 0) { 2873 if (fpid->pid == pid)
2891 /* disable pid tracing */ 2874 goto out_put;
2892 if (!ftrace_pid_trace)
2893 goto out;
2894 2875
2895 clear_ftrace_pid_task(&ftrace_pid_trace); 2876 ret = -ENOMEM;
2896 2877
2897 } else { 2878 fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
2898 /* swapper task is special */ 2879 if (!fpid)
2899 if (!val) { 2880 goto out_put;
2900 pid = ftrace_swapper_pid;
2901 if (pid == ftrace_pid_trace)
2902 goto out;
2903 } else {
2904 pid = find_get_pid(val);
2905 2881
2906 if (pid == ftrace_pid_trace) { 2882 list_add(&fpid->list, &ftrace_pids);
2907 put_pid(pid); 2883 fpid->pid = pid;
2908 goto out;
2909 }
2910 }
2911 2884
2912 if (ftrace_pid_trace) 2885 set_ftrace_pid_task(pid);
2913 clear_ftrace_pid_task(&ftrace_pid_trace);
2914 2886
2915 if (!pid) 2887 ftrace_update_pid_func();
2916 goto out; 2888 ftrace_startup_enable(0);
2889
2890 mutex_unlock(&ftrace_lock);
2891 return 0;
2892
2893out_put:
2894 if (pid != ftrace_swapper_pid)
2895 put_pid(pid);
2896
2897out:
2898 mutex_unlock(&ftrace_lock);
2899 return ret;
2900}
2901
2902static void ftrace_pid_reset(void)
2903{
2904 struct ftrace_pid *fpid, *safe;
2905
2906 mutex_lock(&ftrace_lock);
2907 list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
2908 struct pid *pid = fpid->pid;
2917 2909
2918 ftrace_pid_trace = pid; 2910 clear_ftrace_pid_task(pid);
2919 2911
2920 set_ftrace_pid_task(ftrace_pid_trace); 2912 list_del(&fpid->list);
2913 kfree(fpid);
2921 } 2914 }
2922 2915
2923 /* update the function call */
2924 ftrace_update_pid_func(); 2916 ftrace_update_pid_func();
2925 ftrace_startup_enable(0); 2917 ftrace_startup_enable(0);
2926 2918
2927 out:
2928 mutex_unlock(&ftrace_lock); 2919 mutex_unlock(&ftrace_lock);
2920}
2929 2921
2930 return cnt; 2922static void *fpid_start(struct seq_file *m, loff_t *pos)
2923{
2924 mutex_lock(&ftrace_lock);
2925
2926 if (list_empty(&ftrace_pids) && (!*pos))
2927 return (void *) 1;
2928
2929 return seq_list_start(&ftrace_pids, *pos);
2930}
2931
2932static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
2933{
2934 if (v == (void *)1)
2935 return NULL;
2936
2937 return seq_list_next(v, &ftrace_pids, pos);
2938}
2939
2940static void fpid_stop(struct seq_file *m, void *p)
2941{
2942 mutex_unlock(&ftrace_lock);
2943}
2944
2945static int fpid_show(struct seq_file *m, void *v)
2946{
2947 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
2948
2949 if (v == (void *)1) {
2950 seq_printf(m, "no pid\n");
2951 return 0;
2952 }
2953
2954 if (fpid->pid == ftrace_swapper_pid)
2955 seq_printf(m, "swapper tasks\n");
2956 else
2957 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
2958
2959 return 0;
2960}
2961
2962static const struct seq_operations ftrace_pid_sops = {
2963 .start = fpid_start,
2964 .next = fpid_next,
2965 .stop = fpid_stop,
2966 .show = fpid_show,
2967};
2968
2969static int
2970ftrace_pid_open(struct inode *inode, struct file *file)
2971{
2972 int ret = 0;
2973
2974 if ((file->f_mode & FMODE_WRITE) &&
2975 (file->f_flags & O_TRUNC))
2976 ftrace_pid_reset();
2977
2978 if (file->f_mode & FMODE_READ)
2979 ret = seq_open(file, &ftrace_pid_sops);
2980
2981 return ret;
2982}
2983
2984static ssize_t
2985ftrace_pid_write(struct file *filp, const char __user *ubuf,
2986 size_t cnt, loff_t *ppos)
2987{
2988 char buf[64], *tmp;
2989 long val;
2990 int ret;
2991
2992 if (cnt >= sizeof(buf))
2993 return -EINVAL;
2994
2995 if (copy_from_user(&buf, ubuf, cnt))
2996 return -EFAULT;
2997
2998 buf[cnt] = 0;
2999
3000 /*
3001 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
3002 * to clean the filter quietly.
3003 */
3004 tmp = strstrip(buf);
3005 if (strlen(tmp) == 0)
3006 return 1;
3007
3008 ret = strict_strtol(tmp, 10, &val);
3009 if (ret < 0)
3010 return ret;
3011
3012 ret = ftrace_pid_add(val);
3013
3014 return ret ? ret : cnt;
3015}
3016
3017static int
3018ftrace_pid_release(struct inode *inode, struct file *file)
3019{
3020 if (file->f_mode & FMODE_READ)
3021 seq_release(inode, file);
3022
3023 return 0;
2931} 3024}
2932 3025
2933static const struct file_operations ftrace_pid_fops = { 3026static const struct file_operations ftrace_pid_fops = {
2934 .read = ftrace_pid_read, 3027 .open = ftrace_pid_open,
2935 .write = ftrace_pid_write, 3028 .write = ftrace_pid_write,
3029 .read = seq_read,
3030 .llseek = seq_lseek,
3031 .release = ftrace_pid_release,
2936}; 3032};
2937 3033
2938static __init int ftrace_init_debugfs(void) 3034static __init int ftrace_init_debugfs(void)
@@ -3015,7 +3111,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
3015 3111
3016int 3112int
3017ftrace_enable_sysctl(struct ctl_table *table, int write, 3113ftrace_enable_sysctl(struct ctl_table *table, int write,
3018 struct file *file, void __user *buffer, size_t *lenp, 3114 void __user *buffer, size_t *lenp,
3019 loff_t *ppos) 3115 loff_t *ppos)
3020{ 3116{
3021 int ret; 3117 int ret;
@@ -3025,7 +3121,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3025 3121
3026 mutex_lock(&ftrace_lock); 3122 mutex_lock(&ftrace_lock);
3027 3123
3028 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3124 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3029 3125
3030 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 3126 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3031 goto out; 3127 goto out;
@@ -3295,4 +3391,3 @@ void ftrace_graph_stop(void)
3295 ftrace_stop(); 3391 ftrace_stop();
3296} 3392}
3297#endif 3393#endif
3298
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 81b1645c8549..a91da69f153a 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -501,7 +501,7 @@ static int __init init_kmem_tracer(void)
501 return 1; 501 return 1;
502 } 502 }
503 503
504 if (!register_tracer(&kmem_tracer)) { 504 if (register_tracer(&kmem_tracer) != 0) {
505 pr_warning("Warning: could not register the kmem tracer\n"); 505 pr_warning("Warning: could not register the kmem tracer\n");
506 return 1; 506 return 1;
507 } 507 }
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d4ff01970547..a1ca4956ab5e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -397,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s)
397 int ret; 397 int ret;
398 398
399 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" 399 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
400 "offset:0;\tsize:%u;\n", 400 "offset:0;\tsize:%u;\tsigned:%u;\n",
401 (unsigned int)sizeof(field.time_stamp)); 401 (unsigned int)sizeof(field.time_stamp),
402 (unsigned int)is_signed_type(u64));
402 403
403 ret = trace_seq_printf(s, "\tfield: local_t commit;\t" 404 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
404 "offset:%u;\tsize:%u;\n", 405 "offset:%u;\tsize:%u;\tsigned:%u;\n",
405 (unsigned int)offsetof(typeof(field), commit), 406 (unsigned int)offsetof(typeof(field), commit),
406 (unsigned int)sizeof(field.commit)); 407 (unsigned int)sizeof(field.commit),
408 (unsigned int)is_signed_type(long));
407 409
408 ret = trace_seq_printf(s, "\tfield: char data;\t" 410 ret = trace_seq_printf(s, "\tfield: char data;\t"
409 "offset:%u;\tsize:%u;\n", 411 "offset:%u;\tsize:%u;\tsigned:%u;\n",
410 (unsigned int)offsetof(typeof(field), data), 412 (unsigned int)offsetof(typeof(field), data),
411 (unsigned int)BUF_PAGE_SIZE); 413 (unsigned int)BUF_PAGE_SIZE,
414 (unsigned int)is_signed_type(char));
412 415
413 return ret; 416 return ret;
414} 417}
@@ -483,7 +486,7 @@ struct ring_buffer_iter {
483/* Up this if you want to test the TIME_EXTENTS and normalization */ 486/* Up this if you want to test the TIME_EXTENTS and normalization */
484#define DEBUG_SHIFT 0 487#define DEBUG_SHIFT 0
485 488
486static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) 489static inline u64 rb_time_stamp(struct ring_buffer *buffer)
487{ 490{
488 /* shift to debug/test normalization and TIME_EXTENTS */ 491 /* shift to debug/test normalization and TIME_EXTENTS */
489 return buffer->clock() << DEBUG_SHIFT; 492 return buffer->clock() << DEBUG_SHIFT;
@@ -494,7 +497,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
494 u64 time; 497 u64 time;
495 498
496 preempt_disable_notrace(); 499 preempt_disable_notrace();
497 time = rb_time_stamp(buffer, cpu); 500 time = rb_time_stamp(buffer);
498 preempt_enable_no_resched_notrace(); 501 preempt_enable_no_resched_notrace();
499 502
500 return time; 503 return time;
@@ -599,7 +602,7 @@ static struct list_head *rb_list_head(struct list_head *list)
599} 602}
600 603
601/* 604/*
602 * rb_is_head_page - test if the give page is the head page 605 * rb_is_head_page - test if the given page is the head page
603 * 606 *
604 * Because the reader may move the head_page pointer, we can 607 * Because the reader may move the head_page pointer, we can
605 * not trust what the head page is (it may be pointing to 608 * not trust what the head page is (it may be pointing to
@@ -1193,6 +1196,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1193 atomic_inc(&cpu_buffer->record_disabled); 1196 atomic_inc(&cpu_buffer->record_disabled);
1194 synchronize_sched(); 1197 synchronize_sched();
1195 1198
1199 spin_lock_irq(&cpu_buffer->reader_lock);
1196 rb_head_page_deactivate(cpu_buffer); 1200 rb_head_page_deactivate(cpu_buffer);
1197 1201
1198 for (i = 0; i < nr_pages; i++) { 1202 for (i = 0; i < nr_pages; i++) {
@@ -1207,6 +1211,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1207 return; 1211 return;
1208 1212
1209 rb_reset_cpu(cpu_buffer); 1213 rb_reset_cpu(cpu_buffer);
1214 spin_unlock_irq(&cpu_buffer->reader_lock);
1210 1215
1211 rb_check_pages(cpu_buffer); 1216 rb_check_pages(cpu_buffer);
1212 1217
@@ -1785,9 +1790,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1785static struct ring_buffer_event * 1790static struct ring_buffer_event *
1786rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1791rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1787 unsigned long length, unsigned long tail, 1792 unsigned long length, unsigned long tail,
1788 struct buffer_page *commit_page,
1789 struct buffer_page *tail_page, u64 *ts) 1793 struct buffer_page *tail_page, u64 *ts)
1790{ 1794{
1795 struct buffer_page *commit_page = cpu_buffer->commit_page;
1791 struct ring_buffer *buffer = cpu_buffer->buffer; 1796 struct ring_buffer *buffer = cpu_buffer->buffer;
1792 struct buffer_page *next_page; 1797 struct buffer_page *next_page;
1793 int ret; 1798 int ret;
@@ -1868,7 +1873,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1868 * Nested commits always have zero deltas, so 1873 * Nested commits always have zero deltas, so
1869 * just reread the time stamp 1874 * just reread the time stamp
1870 */ 1875 */
1871 *ts = rb_time_stamp(buffer, cpu_buffer->cpu); 1876 *ts = rb_time_stamp(buffer);
1872 next_page->page->time_stamp = *ts; 1877 next_page->page->time_stamp = *ts;
1873 } 1878 }
1874 1879
@@ -1890,13 +1895,10 @@ static struct ring_buffer_event *
1890__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1895__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1891 unsigned type, unsigned long length, u64 *ts) 1896 unsigned type, unsigned long length, u64 *ts)
1892{ 1897{
1893 struct buffer_page *tail_page, *commit_page; 1898 struct buffer_page *tail_page;
1894 struct ring_buffer_event *event; 1899 struct ring_buffer_event *event;
1895 unsigned long tail, write; 1900 unsigned long tail, write;
1896 1901
1897 commit_page = cpu_buffer->commit_page;
1898 /* we just need to protect against interrupts */
1899 barrier();
1900 tail_page = cpu_buffer->tail_page; 1902 tail_page = cpu_buffer->tail_page;
1901 write = local_add_return(length, &tail_page->write); 1903 write = local_add_return(length, &tail_page->write);
1902 1904
@@ -1907,7 +1909,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1907 /* See if we shot pass the end of this buffer page */ 1909 /* See if we shot pass the end of this buffer page */
1908 if (write > BUF_PAGE_SIZE) 1910 if (write > BUF_PAGE_SIZE)
1909 return rb_move_tail(cpu_buffer, length, tail, 1911 return rb_move_tail(cpu_buffer, length, tail,
1910 commit_page, tail_page, ts); 1912 tail_page, ts);
1911 1913
1912 /* We reserved something on the buffer */ 1914 /* We reserved something on the buffer */
1913 1915
@@ -2111,7 +2113,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2111 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 2113 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
2112 goto out_fail; 2114 goto out_fail;
2113 2115
2114 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); 2116 ts = rb_time_stamp(cpu_buffer->buffer);
2115 2117
2116 /* 2118 /*
2117 * Only the first commit can update the timestamp. 2119 * Only the first commit can update the timestamp.
@@ -2681,7 +2683,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2681EXPORT_SYMBOL_GPL(ring_buffer_entries); 2683EXPORT_SYMBOL_GPL(ring_buffer_entries);
2682 2684
2683/** 2685/**
2684 * ring_buffer_overrun_cpu - get the number of overruns in buffer 2686 * ring_buffer_overruns - get the number of overruns in buffer
2685 * @buffer: The ring buffer 2687 * @buffer: The ring buffer
2686 * 2688 *
2687 * Returns the total number of overruns in the ring buffer 2689 * Returns the total number of overruns in the ring buffer
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 573d3cc762c3..b2477caf09c2 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -35,6 +35,28 @@ static int disable_reader;
35module_param(disable_reader, uint, 0644); 35module_param(disable_reader, uint, 0644);
36MODULE_PARM_DESC(disable_reader, "only run producer"); 36MODULE_PARM_DESC(disable_reader, "only run producer");
37 37
38static int write_iteration = 50;
39module_param(write_iteration, uint, 0644);
40MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
41
42static int producer_nice = 19;
43static int consumer_nice = 19;
44
45static int producer_fifo = -1;
46static int consumer_fifo = -1;
47
48module_param(producer_nice, uint, 0644);
49MODULE_PARM_DESC(producer_nice, "nice prio for producer");
50
51module_param(consumer_nice, uint, 0644);
52MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
53
54module_param(producer_fifo, uint, 0644);
55MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
56
57module_param(consumer_fifo, uint, 0644);
58MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
59
38static int read_events; 60static int read_events;
39 61
40static int kill_test; 62static int kill_test;
@@ -208,15 +230,18 @@ static void ring_buffer_producer(void)
208 do { 230 do {
209 struct ring_buffer_event *event; 231 struct ring_buffer_event *event;
210 int *entry; 232 int *entry;
211 233 int i;
212 event = ring_buffer_lock_reserve(buffer, 10); 234
213 if (!event) { 235 for (i = 0; i < write_iteration; i++) {
214 missed++; 236 event = ring_buffer_lock_reserve(buffer, 10);
215 } else { 237 if (!event) {
216 hit++; 238 missed++;
217 entry = ring_buffer_event_data(event); 239 } else {
218 *entry = smp_processor_id(); 240 hit++;
219 ring_buffer_unlock_commit(buffer, event); 241 entry = ring_buffer_event_data(event);
242 *entry = smp_processor_id();
243 ring_buffer_unlock_commit(buffer, event);
244 }
220 } 245 }
221 do_gettimeofday(&end_tv); 246 do_gettimeofday(&end_tv);
222 247
@@ -263,6 +288,27 @@ static void ring_buffer_producer(void)
263 288
264 if (kill_test) 289 if (kill_test)
265 trace_printk("ERROR!\n"); 290 trace_printk("ERROR!\n");
291
292 if (!disable_reader) {
293 if (consumer_fifo < 0)
294 trace_printk("Running Consumer at nice: %d\n",
295 consumer_nice);
296 else
297 trace_printk("Running Consumer at SCHED_FIFO %d\n",
298 consumer_fifo);
299 }
300 if (producer_fifo < 0)
301 trace_printk("Running Producer at nice: %d\n",
302 producer_nice);
303 else
304 trace_printk("Running Producer at SCHED_FIFO %d\n",
305 producer_fifo);
306
307 /* Let the user know that the test is running at low priority */
308 if (producer_fifo < 0 && consumer_fifo < 0 &&
309 producer_nice == 19 && consumer_nice == 19)
310 trace_printk("WARNING!!! This test is running at lowest priority.\n");
311
266 trace_printk("Time: %lld (usecs)\n", time); 312 trace_printk("Time: %lld (usecs)\n", time);
267 trace_printk("Overruns: %lld\n", overruns); 313 trace_printk("Overruns: %lld\n", overruns);
268 if (disable_reader) 314 if (disable_reader)
@@ -392,6 +438,27 @@ static int __init ring_buffer_benchmark_init(void)
392 if (IS_ERR(producer)) 438 if (IS_ERR(producer))
393 goto out_kill; 439 goto out_kill;
394 440
441 /*
442 * Run them as low-prio background tasks by default:
443 */
444 if (!disable_reader) {
445 if (consumer_fifo >= 0) {
446 struct sched_param param = {
447 .sched_priority = consumer_fifo
448 };
449 sched_setscheduler(consumer, SCHED_FIFO, &param);
450 } else
451 set_user_nice(consumer, consumer_nice);
452 }
453
454 if (producer_fifo >= 0) {
455 struct sched_param param = {
456 .sched_priority = consumer_fifo
457 };
458 sched_setscheduler(producer, SCHED_FIFO, &param);
459 } else
460 set_user_nice(producer, producer_nice);
461
395 return 0; 462 return 0;
396 463
397 out_kill: 464 out_kill:
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6c0f6a8a22eb..874f2893cff0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -129,7 +129,7 @@ static int tracing_set_tracer(const char *buf);
129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; 129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
130static char *default_bootup_tracer; 130static char *default_bootup_tracer;
131 131
132static int __init set_ftrace(char *str) 132static int __init set_cmdline_ftrace(char *str)
133{ 133{
134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); 134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
135 default_bootup_tracer = bootup_tracer_buf; 135 default_bootup_tracer = bootup_tracer_buf;
@@ -137,7 +137,7 @@ static int __init set_ftrace(char *str)
137 ring_buffer_expanded = 1; 137 ring_buffer_expanded = 1;
138 return 1; 138 return 1;
139} 139}
140__setup("ftrace=", set_ftrace); 140__setup("ftrace=", set_cmdline_ftrace);
141 141
142static int __init set_ftrace_dump_on_oops(char *str) 142static int __init set_ftrace_dump_on_oops(char *str)
143{ 143{
@@ -415,7 +415,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
415 415
416 /* read the non-space input */ 416 /* read the non-space input */
417 while (cnt && !isspace(ch)) { 417 while (cnt && !isspace(ch)) {
418 if (parser->idx < parser->size) 418 if (parser->idx < parser->size - 1)
419 parser->buffer[parser->idx++] = ch; 419 parser->buffer[parser->idx++] = ch;
420 else { 420 else {
421 ret = -EINVAL; 421 ret = -EINVAL;
@@ -1361,10 +1361,11 @@ int trace_array_vprintk(struct trace_array *tr,
1361 pause_graph_tracing(); 1361 pause_graph_tracing();
1362 raw_local_irq_save(irq_flags); 1362 raw_local_irq_save(irq_flags);
1363 __raw_spin_lock(&trace_buf_lock); 1363 __raw_spin_lock(&trace_buf_lock);
1364 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); 1364 if (args == NULL) {
1365 1365 strncpy(trace_buf, fmt, TRACE_BUF_SIZE);
1366 len = min(len, TRACE_BUF_SIZE-1); 1366 len = strlen(trace_buf);
1367 trace_buf[len] = 0; 1367 } else
1368 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1368 1369
1369 size = sizeof(*entry) + len + 1; 1370 size = sizeof(*entry) + len + 1;
1370 buffer = tr->buffer; 1371 buffer = tr->buffer;
@@ -1373,10 +1374,10 @@ int trace_array_vprintk(struct trace_array *tr,
1373 if (!event) 1374 if (!event)
1374 goto out_unlock; 1375 goto out_unlock;
1375 entry = ring_buffer_event_data(event); 1376 entry = ring_buffer_event_data(event);
1376 entry->ip = ip; 1377 entry->ip = ip;
1377 1378
1378 memcpy(&entry->buf, trace_buf, len); 1379 memcpy(&entry->buf, trace_buf, len);
1379 entry->buf[len] = 0; 1380 entry->buf[len] = '\0';
1380 if (!filter_check_discard(call, entry, buffer, event)) 1381 if (!filter_check_discard(call, entry, buffer, event))
1381 ring_buffer_unlock_commit(buffer, event); 1382 ring_buffer_unlock_commit(buffer, event);
1382 1383
@@ -1393,7 +1394,7 @@ int trace_array_vprintk(struct trace_array *tr,
1393 1394
1394int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 1395int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1395{ 1396{
1396 return trace_array_printk(&global_trace, ip, fmt, args); 1397 return trace_array_vprintk(&global_trace, ip, fmt, args);
1397} 1398}
1398EXPORT_SYMBOL_GPL(trace_vprintk); 1399EXPORT_SYMBOL_GPL(trace_vprintk);
1399 1400
@@ -1984,11 +1985,9 @@ __tracing_open(struct inode *inode, struct file *file)
1984 if (current_trace) 1985 if (current_trace)
1985 *iter->trace = *current_trace; 1986 *iter->trace = *current_trace;
1986 1987
1987 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) 1988 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
1988 goto fail; 1989 goto fail;
1989 1990
1990 cpumask_clear(iter->started);
1991
1992 if (current_trace && current_trace->print_max) 1991 if (current_trace && current_trace->print_max)
1993 iter->tr = &max_tr; 1992 iter->tr = &max_tr;
1994 else 1993 else
@@ -2442,7 +2441,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2442 return ret; 2441 return ret;
2443 } 2442 }
2444 2443
2445 filp->f_pos += cnt; 2444 *ppos += cnt;
2446 2445
2447 return cnt; 2446 return cnt;
2448} 2447}
@@ -2584,7 +2583,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2584 } 2583 }
2585 mutex_unlock(&trace_types_lock); 2584 mutex_unlock(&trace_types_lock);
2586 2585
2587 filp->f_pos += cnt; 2586 *ppos += cnt;
2588 2587
2589 return cnt; 2588 return cnt;
2590} 2589}
@@ -2766,7 +2765,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2766 if (err) 2765 if (err)
2767 return err; 2766 return err;
2768 2767
2769 filp->f_pos += ret; 2768 *ppos += ret;
2770 2769
2771 return ret; 2770 return ret;
2772} 2771}
@@ -3301,7 +3300,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3301 } 3300 }
3302 } 3301 }
3303 3302
3304 filp->f_pos += cnt; 3303 *ppos += cnt;
3305 3304
3306 /* If check pages failed, return ENOMEM */ 3305 /* If check pages failed, return ENOMEM */
3307 if (tracing_disabled) 3306 if (tracing_disabled)
@@ -3321,22 +3320,11 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3321 return cnt; 3320 return cnt;
3322} 3321}
3323 3322
3324static int mark_printk(const char *fmt, ...)
3325{
3326 int ret;
3327 va_list args;
3328 va_start(args, fmt);
3329 ret = trace_vprintk(0, fmt, args);
3330 va_end(args);
3331 return ret;
3332}
3333
3334static ssize_t 3323static ssize_t
3335tracing_mark_write(struct file *filp, const char __user *ubuf, 3324tracing_mark_write(struct file *filp, const char __user *ubuf,
3336 size_t cnt, loff_t *fpos) 3325 size_t cnt, loff_t *fpos)
3337{ 3326{
3338 char *buf; 3327 char *buf;
3339 char *end;
3340 3328
3341 if (tracing_disabled) 3329 if (tracing_disabled)
3342 return -EINVAL; 3330 return -EINVAL;
@@ -3344,7 +3332,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3344 if (cnt > TRACE_BUF_SIZE) 3332 if (cnt > TRACE_BUF_SIZE)
3345 cnt = TRACE_BUF_SIZE; 3333 cnt = TRACE_BUF_SIZE;
3346 3334
3347 buf = kmalloc(cnt + 1, GFP_KERNEL); 3335 buf = kmalloc(cnt + 2, GFP_KERNEL);
3348 if (buf == NULL) 3336 if (buf == NULL)
3349 return -ENOMEM; 3337 return -ENOMEM;
3350 3338
@@ -3352,14 +3340,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3352 kfree(buf); 3340 kfree(buf);
3353 return -EFAULT; 3341 return -EFAULT;
3354 } 3342 }
3343 if (buf[cnt-1] != '\n') {
3344 buf[cnt] = '\n';
3345 buf[cnt+1] = '\0';
3346 } else
3347 buf[cnt] = '\0';
3355 3348
3356 /* Cut from the first nil or newline. */ 3349 cnt = trace_vprintk(0, buf, NULL);
3357 buf[cnt] = '\0';
3358 end = strchr(buf, '\n');
3359 if (end)
3360 *end = '\0';
3361
3362 cnt = mark_printk("%s\n", buf);
3363 kfree(buf); 3350 kfree(buf);
3364 *fpos += cnt; 3351 *fpos += cnt;
3365 3352
@@ -3732,7 +3719,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3732 3719
3733 s = kmalloc(sizeof(*s), GFP_KERNEL); 3720 s = kmalloc(sizeof(*s), GFP_KERNEL);
3734 if (!s) 3721 if (!s)
3735 return ENOMEM; 3722 return -ENOMEM;
3736 3723
3737 trace_seq_init(s); 3724 trace_seq_init(s);
3738 3725
@@ -4389,7 +4376,7 @@ __init static int tracer_alloc_buffers(void)
4389 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4376 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4390 goto out_free_buffer_mask; 4377 goto out_free_buffer_mask;
4391 4378
4392 if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) 4379 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4393 goto out_free_tracing_cpumask; 4380 goto out_free_tracing_cpumask;
4394 4381
4395 /* To save memory, keep the ring buffer size to its minimum */ 4382 /* To save memory, keep the ring buffer size to its minimum */
@@ -4400,7 +4387,6 @@ __init static int tracer_alloc_buffers(void)
4400 4387
4401 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4388 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4402 cpumask_copy(tracing_cpumask, cpu_all_mask); 4389 cpumask_copy(tracing_cpumask, cpu_all_mask);
4403 cpumask_clear(tracing_reader_cpumask);
4404 4390
4405 /* TODO: make the number of buffers hot pluggable with CPUS */ 4391 /* TODO: make the number of buffers hot pluggable with CPUS */
4406 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4392 global_trace.buffer = ring_buffer_alloc(ring_buf_size,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 405cb850b75d..1d7f4830a80d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,6 +11,7 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h> 12#include <trace/boot.h>
13#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h>
14 15
15#include <linux/trace_seq.h> 16#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h> 17#include <linux/ftrace_event.h>
@@ -37,6 +38,7 @@ enum trace_type {
37 TRACE_KMEM_ALLOC, 38 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE, 39 TRACE_KMEM_FREE,
39 TRACE_BLK, 40 TRACE_BLK,
41 TRACE_KSYM,
40 42
41 __TRACE_LAST_TYPE, 43 __TRACE_LAST_TYPE,
42}; 44};
@@ -98,9 +100,32 @@ struct syscall_trace_enter {
98struct syscall_trace_exit { 100struct syscall_trace_exit {
99 struct trace_entry ent; 101 struct trace_entry ent;
100 int nr; 102 int nr;
101 unsigned long ret; 103 long ret;
102}; 104};
103 105
106struct kprobe_trace_entry {
107 struct trace_entry ent;
108 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111};
112
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent;
119 unsigned long func;
120 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123};
124
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
104/* 129/*
105 * trace_flag_type is an enumeration that holds different 130 * trace_flag_type is an enumeration that holds different
106 * states when a trace occurs. These are: 131 * states when a trace occurs. These are:
@@ -209,6 +234,7 @@ extern void __ftrace_bad_type(void);
209 TRACE_KMEM_ALLOC); \ 234 TRACE_KMEM_ALLOC); \
210 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
211 TRACE_KMEM_FREE); \ 236 TRACE_KMEM_FREE); \
237 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
212 __ftrace_bad_type(); \ 238 __ftrace_bad_type(); \
213 } while (0) 239 } while (0)
214 240
@@ -364,6 +390,8 @@ int register_tracer(struct tracer *type);
364void unregister_tracer(struct tracer *type); 390void unregister_tracer(struct tracer *type);
365int is_tracing_stopped(void); 391int is_tracing_stopped(void);
366 392
393extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
394
367extern unsigned long nsecs_to_usecs(unsigned long nsecs); 395extern unsigned long nsecs_to_usecs(unsigned long nsecs);
368 396
369#ifdef CONFIG_TRACER_MAX_TRACE 397#ifdef CONFIG_TRACER_MAX_TRACE
@@ -438,6 +466,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
438 struct trace_array *tr); 466 struct trace_array *tr);
439extern int trace_selftest_startup_hw_branches(struct tracer *trace, 467extern int trace_selftest_startup_hw_branches(struct tracer *trace,
440 struct trace_array *tr); 468 struct trace_array *tr);
469extern int trace_selftest_startup_ksym(struct tracer *trace,
470 struct trace_array *tr);
441#endif /* CONFIG_FTRACE_STARTUP_TEST */ 471#endif /* CONFIG_FTRACE_STARTUP_TEST */
442 472
443extern void *head_page(struct trace_array_cpu *data); 473extern void *head_page(struct trace_array_cpu *data);
@@ -483,10 +513,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
483 return 0; 513 return 0;
484} 514}
485#else 515#else
486static inline int ftrace_trace_addr(unsigned long addr)
487{
488 return 1;
489}
490static inline int ftrace_graph_addr(unsigned long addr) 516static inline int ftrace_graph_addr(unsigned long addr)
491{ 517{
492 return 1; 518 return 1;
@@ -500,12 +526,12 @@ print_graph_function(struct trace_iterator *iter)
500} 526}
501#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 527#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
502 528
503extern struct pid *ftrace_pid_trace; 529extern struct list_head ftrace_pids;
504 530
505#ifdef CONFIG_FUNCTION_TRACER 531#ifdef CONFIG_FUNCTION_TRACER
506static inline int ftrace_trace_task(struct task_struct *task) 532static inline int ftrace_trace_task(struct task_struct *task)
507{ 533{
508 if (!ftrace_pid_trace) 534 if (list_empty(&ftrace_pids))
509 return 1; 535 return 1;
510 536
511 return test_tsk_trace_trace(task); 537 return test_tsk_trace_trace(task);
@@ -687,7 +713,6 @@ struct event_filter {
687 int n_preds; 713 int n_preds;
688 struct filter_pred **preds; 714 struct filter_pred **preds;
689 char *filter_string; 715 char *filter_string;
690 bool no_reset;
691}; 716};
692 717
693struct event_subsystem { 718struct event_subsystem {
@@ -699,22 +724,40 @@ struct event_subsystem {
699}; 724};
700 725
701struct filter_pred; 726struct filter_pred;
727struct regex;
702 728
703typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 729typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
704 int val1, int val2); 730 int val1, int val2);
705 731
732typedef int (*regex_match_func)(char *str, struct regex *r, int len);
733
734enum regex_type {
735 MATCH_FULL = 0,
736 MATCH_FRONT_ONLY,
737 MATCH_MIDDLE_ONLY,
738 MATCH_END_ONLY,
739};
740
741struct regex {
742 char pattern[MAX_FILTER_STR_VAL];
743 int len;
744 int field_len;
745 regex_match_func match;
746};
747
706struct filter_pred { 748struct filter_pred {
707 filter_pred_fn_t fn; 749 filter_pred_fn_t fn;
708 u64 val; 750 u64 val;
709 char str_val[MAX_FILTER_STR_VAL]; 751 struct regex regex;
710 int str_len; 752 char *field_name;
711 char *field_name; 753 int offset;
712 int offset; 754 int not;
713 int not; 755 int op;
714 int op; 756 int pop_n;
715 int pop_n;
716}; 757};
717 758
759extern enum regex_type
760filter_parse_regex(char *buff, int len, char **search, int *not);
718extern void print_event_filter(struct ftrace_event_call *call, 761extern void print_event_filter(struct ftrace_event_call *call,
719 struct trace_seq *s); 762 struct trace_seq *s);
720extern int apply_event_filter(struct ftrace_event_call *call, 763extern int apply_event_filter(struct ftrace_event_call *call,
@@ -730,7 +773,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
730 struct ring_buffer *buffer, 773 struct ring_buffer *buffer,
731 struct ring_buffer_event *event) 774 struct ring_buffer_event *event)
732{ 775{
733 if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { 776 if (unlikely(call->filter_active) &&
777 !filter_match_preds(call->filter, rec)) {
734 ring_buffer_discard_commit(buffer, event); 778 ring_buffer_discard_commit(buffer, event);
735 return 1; 779 return 1;
736 } 780 }
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 7a7a9fd249a9..4a194f08f88c 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -34,6 +34,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
34 struct trace_array *tr = branch_tracer; 34 struct trace_array *tr = branch_tracer;
35 struct ring_buffer_event *event; 35 struct ring_buffer_event *event;
36 struct trace_branch *entry; 36 struct trace_branch *entry;
37 struct ring_buffer *buffer;
37 unsigned long flags; 38 unsigned long flags;
38 int cpu, pc; 39 int cpu, pc;
39 const char *p; 40 const char *p;
@@ -54,7 +55,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
54 goto out; 55 goto out;
55 56
56 pc = preempt_count(); 57 pc = preempt_count();
57 event = trace_buffer_lock_reserve(tr, TRACE_BRANCH, 58 buffer = tr->buffer;
59 event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
58 sizeof(*entry), flags, pc); 60 sizeof(*entry), flags, pc);
59 if (!event) 61 if (!event)
60 goto out; 62 goto out;
@@ -74,8 +76,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
74 entry->line = f->line; 76 entry->line = f->line;
75 entry->correct = val == expect; 77 entry->correct = val == expect;
76 78
77 if (!filter_check_discard(call, entry, tr->buffer, event)) 79 if (!filter_check_discard(call, entry, buffer, event))
78 ring_buffer_unlock_commit(tr->buffer, event); 80 ring_buffer_unlock_commit(buffer, event);
79 81
80 out: 82 out:
81 atomic_dec(&tr->data[cpu]->disabled); 83 atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 20c5f92e28a8..878c03f386ba 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -20,6 +20,8 @@
20#include <linux/ktime.h> 20#include <linux/ktime.h>
21#include <linux/trace_clock.h> 21#include <linux/trace_clock.h>
22 22
23#include "trace.h"
24
23/* 25/*
24 * trace_clock_local(): the simplest and least coherent tracing clock. 26 * trace_clock_local(): the simplest and least coherent tracing clock.
25 * 27 *
@@ -28,17 +30,17 @@
28 */ 30 */
29u64 notrace trace_clock_local(void) 31u64 notrace trace_clock_local(void)
30{ 32{
31 unsigned long flags;
32 u64 clock; 33 u64 clock;
34 int resched;
33 35
34 /* 36 /*
35 * sched_clock() is an architecture implemented, fast, scalable, 37 * sched_clock() is an architecture implemented, fast, scalable,
36 * lockless clock. It is not guaranteed to be coherent across 38 * lockless clock. It is not guaranteed to be coherent across
37 * CPUs, nor across CPU idle events. 39 * CPUs, nor across CPU idle events.
38 */ 40 */
39 raw_local_irq_save(flags); 41 resched = ftrace_preempt_disable();
40 clock = sched_clock(); 42 clock = sched_clock();
41 raw_local_irq_restore(flags); 43 ftrace_preempt_enable(resched);
42 44
43 return clock; 45 return clock;
44} 46}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ead3d724599d..c16a08f399df 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
364 F_printk("type:%u call_site:%lx ptr:%p", 364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr) 365 __entry->type_id, __entry->call_site, __entry->ptr)
366); 366);
367
368FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
369
370 TRACE_KSYM,
371
372 F_STRUCT(
373 __field( unsigned long, ip )
374 __field( unsigned char, type )
375 __array( char , cmd, TASK_COMM_LEN )
376 __field( unsigned long, addr )
377 ),
378
379 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
380 (void *)__entry->ip, (unsigned int)__entry->type,
381 (void *)__entry->addr, __entry->cmd)
382);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index dd44b8768867..d9c60f80aa0d 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,17 +8,14 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include "trace.h" 9#include "trace.h"
10 10
11/*
12 * We can't use a size but a type in alloc_percpu()
13 * So let's create a dummy type that matches the desired size
14 */
15typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
16 11
17char *trace_profile_buf; 12char *perf_trace_buf;
18EXPORT_SYMBOL_GPL(trace_profile_buf); 13EXPORT_SYMBOL_GPL(perf_trace_buf);
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
19 17
20char *trace_profile_buf_nmi; 18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
21EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
22 19
23/* Count the events in use (per event id, not per instance) */ 20/* Count the events in use (per event id, not per instance) */
24static int total_profile_count; 21static int total_profile_count;
@@ -31,29 +28,34 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
31 if (atomic_inc_return(&event->profile_count)) 28 if (atomic_inc_return(&event->profile_count))
32 return 0; 29 return 0;
33 30
34 if (!total_profile_count++) { 31 if (!total_profile_count) {
35 buf = (char *)alloc_percpu(profile_buf_t); 32 buf = (char *)alloc_percpu(perf_trace_t);
36 if (!buf) 33 if (!buf)
37 goto fail_buf; 34 goto fail_buf;
38 35
39 rcu_assign_pointer(trace_profile_buf, buf); 36 rcu_assign_pointer(perf_trace_buf, buf);
40 37
41 buf = (char *)alloc_percpu(profile_buf_t); 38 buf = (char *)alloc_percpu(perf_trace_t);
42 if (!buf) 39 if (!buf)
43 goto fail_buf_nmi; 40 goto fail_buf_nmi;
44 41
45 rcu_assign_pointer(trace_profile_buf_nmi, buf); 42 rcu_assign_pointer(perf_trace_buf_nmi, buf);
46 } 43 }
47 44
48 ret = event->profile_enable(); 45 ret = event->profile_enable(event);
49 if (!ret) 46 if (!ret) {
47 total_profile_count++;
50 return 0; 48 return 0;
49 }
51 50
52 kfree(trace_profile_buf_nmi);
53fail_buf_nmi: 51fail_buf_nmi:
54 kfree(trace_profile_buf); 52 if (!total_profile_count) {
53 free_percpu(perf_trace_buf_nmi);
54 free_percpu(perf_trace_buf);
55 perf_trace_buf_nmi = NULL;
56 perf_trace_buf = NULL;
57 }
55fail_buf: 58fail_buf:
56 total_profile_count--;
57 atomic_dec(&event->profile_count); 59 atomic_dec(&event->profile_count);
58 60
59 return ret; 61 return ret;
@@ -84,14 +86,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
84 if (!atomic_add_negative(-1, &event->profile_count)) 86 if (!atomic_add_negative(-1, &event->profile_count))
85 return; 87 return;
86 88
87 event->profile_disable(); 89 event->profile_disable(event);
88 90
89 if (!--total_profile_count) { 91 if (!--total_profile_count) {
90 buf = trace_profile_buf; 92 buf = perf_trace_buf;
91 rcu_assign_pointer(trace_profile_buf, NULL); 93 rcu_assign_pointer(perf_trace_buf, NULL);
92 94
93 nmi_buf = trace_profile_buf_nmi; 95 nmi_buf = perf_trace_buf_nmi;
94 rcu_assign_pointer(trace_profile_buf_nmi, NULL); 96 rcu_assign_pointer(perf_trace_buf_nmi, NULL);
95 97
96 /* 98 /*
97 * Ensure every events in profiling have finished before 99 * Ensure every events in profiling have finished before
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6f03c8a1105e..1d18315dc836 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -93,9 +93,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
93} 93}
94EXPORT_SYMBOL_GPL(trace_define_common_fields); 94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95 95
96#ifdef CONFIG_MODULES 96void trace_destroy_fields(struct ftrace_event_call *call)
97
98static void trace_destroy_fields(struct ftrace_event_call *call)
99{ 97{
100 struct ftrace_event_field *field, *next; 98 struct ftrace_event_field *field, *next;
101 99
@@ -107,8 +105,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
107 } 105 }
108} 106}
109 107
110#endif /* CONFIG_MODULES */
111
112static void ftrace_event_enable_disable(struct ftrace_event_call *call, 108static void ftrace_event_enable_disable(struct ftrace_event_call *call,
113 int enable) 109 int enable)
114{ 110{
@@ -117,14 +113,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
117 if (call->enabled) { 113 if (call->enabled) {
118 call->enabled = 0; 114 call->enabled = 0;
119 tracing_stop_cmdline_record(); 115 tracing_stop_cmdline_record();
120 call->unregfunc(call->data); 116 call->unregfunc(call);
121 } 117 }
122 break; 118 break;
123 case 1: 119 case 1:
124 if (!call->enabled) { 120 if (!call->enabled) {
125 call->enabled = 1; 121 call->enabled = 1;
126 tracing_start_cmdline_record(); 122 tracing_start_cmdline_record();
127 call->regfunc(call->data); 123 call->regfunc(call);
128 } 124 }
129 break; 125 break;
130 } 126 }
@@ -232,10 +228,9 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
232 size_t cnt, loff_t *ppos) 228 size_t cnt, loff_t *ppos)
233{ 229{
234 struct trace_parser parser; 230 struct trace_parser parser;
235 size_t read = 0; 231 ssize_t read, ret;
236 ssize_t ret;
237 232
238 if (!cnt || cnt < 0) 233 if (!cnt)
239 return 0; 234 return 0;
240 235
241 ret = tracing_update_buffers(); 236 ret = tracing_update_buffers();
@@ -247,7 +242,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
247 242
248 read = trace_get_user(&parser, ubuf, cnt, ppos); 243 read = trace_get_user(&parser, ubuf, cnt, ppos);
249 244
250 if (trace_parser_loaded((&parser))) { 245 if (read >= 0 && trace_parser_loaded((&parser))) {
251 int set = 1; 246 int set = 1;
252 247
253 if (*parser.buffer == '!') 248 if (*parser.buffer == '!')
@@ -508,7 +503,7 @@ extern char *__bad_type_size(void);
508#define FIELD(type, name) \ 503#define FIELD(type, name) \
509 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ 504 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
510 #type, "common_" #name, offsetof(typeof(field), name), \ 505 #type, "common_" #name, offsetof(typeof(field), name), \
511 sizeof(field.name) 506 sizeof(field.name), is_signed_type(type)
512 507
513static int trace_write_header(struct trace_seq *s) 508static int trace_write_header(struct trace_seq *s)
514{ 509{
@@ -516,17 +511,17 @@ static int trace_write_header(struct trace_seq *s)
516 511
517 /* struct trace_entry */ 512 /* struct trace_entry */
518 return trace_seq_printf(s, 513 return trace_seq_printf(s,
519 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 514 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
520 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 515 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
521 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 516 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
522 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 517 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
523 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 518 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
524 "\n", 519 "\n",
525 FIELD(unsigned short, type), 520 FIELD(unsigned short, type),
526 FIELD(unsigned char, flags), 521 FIELD(unsigned char, flags),
527 FIELD(unsigned char, preempt_count), 522 FIELD(unsigned char, preempt_count),
528 FIELD(int, pid), 523 FIELD(int, pid),
529 FIELD(int, lock_depth)); 524 FIELD(int, lock_depth));
530} 525}
531 526
532static ssize_t 527static ssize_t
@@ -879,9 +874,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
879 "'%s/filter' entry\n", name); 874 "'%s/filter' entry\n", name);
880 } 875 }
881 876
882 entry = trace_create_file("enable", 0644, system->entry, 877 trace_create_file("enable", 0644, system->entry,
883 (void *)system->name, 878 (void *)system->name,
884 &ftrace_system_enable_fops); 879 &ftrace_system_enable_fops);
885 880
886 return system->entry; 881 return system->entry;
887} 882}
@@ -893,7 +888,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
893 const struct file_operations *filter, 888 const struct file_operations *filter,
894 const struct file_operations *format) 889 const struct file_operations *format)
895{ 890{
896 struct dentry *entry;
897 int ret; 891 int ret;
898 892
899 /* 893 /*
@@ -911,12 +905,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
911 } 905 }
912 906
913 if (call->regfunc) 907 if (call->regfunc)
914 entry = trace_create_file("enable", 0644, call->dir, call, 908 trace_create_file("enable", 0644, call->dir, call,
915 enable); 909 enable);
916 910
917 if (call->id && call->profile_enable) 911 if (call->id && call->profile_enable)
918 entry = trace_create_file("id", 0444, call->dir, call, 912 trace_create_file("id", 0444, call->dir, call,
919 id); 913 id);
920 914
921 if (call->define_fields) { 915 if (call->define_fields) {
922 ret = call->define_fields(call); 916 ret = call->define_fields(call);
@@ -925,41 +919,60 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
925 " events/%s\n", call->name); 919 " events/%s\n", call->name);
926 return ret; 920 return ret;
927 } 921 }
928 entry = trace_create_file("filter", 0644, call->dir, call, 922 trace_create_file("filter", 0644, call->dir, call,
929 filter); 923 filter);
930 } 924 }
931 925
932 /* A trace may not want to export its format */ 926 /* A trace may not want to export its format */
933 if (!call->show_format) 927 if (!call->show_format)
934 return 0; 928 return 0;
935 929
936 entry = trace_create_file("format", 0444, call->dir, call, 930 trace_create_file("format", 0444, call->dir, call,
937 format); 931 format);
938 932
939 return 0; 933 return 0;
940} 934}
941 935
942#define for_each_event(event, start, end) \ 936static int __trace_add_event_call(struct ftrace_event_call *call)
943 for (event = start; \ 937{
944 (unsigned long)event < (unsigned long)end; \ 938 struct dentry *d_events;
945 event++) 939 int ret;
946 940
947#ifdef CONFIG_MODULES 941 if (!call->name)
942 return -EINVAL;
948 943
949static LIST_HEAD(ftrace_module_file_list); 944 if (call->raw_init) {
945 ret = call->raw_init(call);
946 if (ret < 0) {
947 if (ret != -ENOSYS)
948 pr_warning("Could not initialize trace "
949 "events/%s\n", call->name);
950 return ret;
951 }
952 }
950 953
951/* 954 d_events = event_trace_events_dir();
952 * Modules must own their file_operations to keep up with 955 if (!d_events)
953 * reference counting. 956 return -ENOENT;
954 */ 957
955struct ftrace_module_file_ops { 958 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
956 struct list_head list; 959 &ftrace_enable_fops, &ftrace_event_filter_fops,
957 struct module *mod; 960 &ftrace_event_format_fops);
958 struct file_operations id; 961 if (!ret)
959 struct file_operations enable; 962 list_add(&call->list, &ftrace_events);
960 struct file_operations format; 963
961 struct file_operations filter; 964 return ret;
962}; 965}
966
967/* Add an additional event_call dynamically */
968int trace_add_event_call(struct ftrace_event_call *call)
969{
970 int ret;
971 mutex_lock(&event_mutex);
972 ret = __trace_add_event_call(call);
973 mutex_unlock(&event_mutex);
974 return ret;
975}
963 976
964static void remove_subsystem_dir(const char *name) 977static void remove_subsystem_dir(const char *name)
965{ 978{
@@ -987,6 +1000,53 @@ static void remove_subsystem_dir(const char *name)
987 } 1000 }
988} 1001}
989 1002
1003/*
1004 * Must be called under locking both of event_mutex and trace_event_mutex.
1005 */
1006static void __trace_remove_event_call(struct ftrace_event_call *call)
1007{
1008 ftrace_event_enable_disable(call, 0);
1009 if (call->event)
1010 __unregister_ftrace_event(call->event);
1011 debugfs_remove_recursive(call->dir);
1012 list_del(&call->list);
1013 trace_destroy_fields(call);
1014 destroy_preds(call);
1015 remove_subsystem_dir(call->system);
1016}
1017
1018/* Remove an event_call */
1019void trace_remove_event_call(struct ftrace_event_call *call)
1020{
1021 mutex_lock(&event_mutex);
1022 down_write(&trace_event_mutex);
1023 __trace_remove_event_call(call);
1024 up_write(&trace_event_mutex);
1025 mutex_unlock(&event_mutex);
1026}
1027
1028#define for_each_event(event, start, end) \
1029 for (event = start; \
1030 (unsigned long)event < (unsigned long)end; \
1031 event++)
1032
1033#ifdef CONFIG_MODULES
1034
1035static LIST_HEAD(ftrace_module_file_list);
1036
1037/*
1038 * Modules must own their file_operations to keep up with
1039 * reference counting.
1040 */
1041struct ftrace_module_file_ops {
1042 struct list_head list;
1043 struct module *mod;
1044 struct file_operations id;
1045 struct file_operations enable;
1046 struct file_operations format;
1047 struct file_operations filter;
1048};
1049
990static struct ftrace_module_file_ops * 1050static struct ftrace_module_file_ops *
991trace_create_file_ops(struct module *mod) 1051trace_create_file_ops(struct module *mod)
992{ 1052{
@@ -1044,7 +1104,7 @@ static void trace_module_add_events(struct module *mod)
1044 if (!call->name) 1104 if (!call->name)
1045 continue; 1105 continue;
1046 if (call->raw_init) { 1106 if (call->raw_init) {
1047 ret = call->raw_init(); 1107 ret = call->raw_init(call);
1048 if (ret < 0) { 1108 if (ret < 0) {
1049 if (ret != -ENOSYS) 1109 if (ret != -ENOSYS)
1050 pr_warning("Could not initialize trace " 1110 pr_warning("Could not initialize trace "
@@ -1062,10 +1122,11 @@ static void trace_module_add_events(struct module *mod)
1062 return; 1122 return;
1063 } 1123 }
1064 call->mod = mod; 1124 call->mod = mod;
1065 list_add(&call->list, &ftrace_events); 1125 ret = event_create_dir(call, d_events,
1066 event_create_dir(call, d_events, 1126 &file_ops->id, &file_ops->enable,
1067 &file_ops->id, &file_ops->enable, 1127 &file_ops->filter, &file_ops->format);
1068 &file_ops->filter, &file_ops->format); 1128 if (!ret)
1129 list_add(&call->list, &ftrace_events);
1069 } 1130 }
1070} 1131}
1071 1132
@@ -1079,14 +1140,7 @@ static void trace_module_remove_events(struct module *mod)
1079 list_for_each_entry_safe(call, p, &ftrace_events, list) { 1140 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1080 if (call->mod == mod) { 1141 if (call->mod == mod) {
1081 found = true; 1142 found = true;
1082 ftrace_event_enable_disable(call, 0); 1143 __trace_remove_event_call(call);
1083 if (call->event)
1084 __unregister_ftrace_event(call->event);
1085 debugfs_remove_recursive(call->dir);
1086 list_del(&call->list);
1087 trace_destroy_fields(call);
1088 destroy_preds(call);
1089 remove_subsystem_dir(call->system);
1090 } 1144 }
1091 } 1145 }
1092 1146
@@ -1204,7 +1258,7 @@ static __init int event_trace_init(void)
1204 if (!call->name) 1258 if (!call->name)
1205 continue; 1259 continue;
1206 if (call->raw_init) { 1260 if (call->raw_init) {
1207 ret = call->raw_init(); 1261 ret = call->raw_init(call);
1208 if (ret < 0) { 1262 if (ret < 0) {
1209 if (ret != -ENOSYS) 1263 if (ret != -ENOSYS)
1210 pr_warning("Could not initialize trace " 1264 pr_warning("Could not initialize trace "
@@ -1212,10 +1266,12 @@ static __init int event_trace_init(void)
1212 continue; 1266 continue;
1213 } 1267 }
1214 } 1268 }
1215 list_add(&call->list, &ftrace_events); 1269 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1216 event_create_dir(call, d_events, &ftrace_event_id_fops, 1270 &ftrace_enable_fops,
1217 &ftrace_enable_fops, &ftrace_event_filter_fops, 1271 &ftrace_event_filter_fops,
1218 &ftrace_event_format_fops); 1272 &ftrace_event_format_fops);
1273 if (!ret)
1274 list_add(&call->list, &ftrace_events);
1219 } 1275 }
1220 1276
1221 while (true) { 1277 while (true) {
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 23245785927f..50504cb228de 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -18,11 +18,10 @@
18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> 18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
19 */ 19 */
20 20
21#include <linux/debugfs.h>
22#include <linux/uaccess.h>
23#include <linux/module.h> 21#include <linux/module.h>
24#include <linux/ctype.h> 22#include <linux/ctype.h>
25#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/perf_event.h>
26 25
27#include "trace.h" 26#include "trace.h"
28#include "trace_output.h" 27#include "trace_output.h"
@@ -31,6 +30,7 @@ enum filter_op_ids
31{ 30{
32 OP_OR, 31 OP_OR,
33 OP_AND, 32 OP_AND,
33 OP_GLOB,
34 OP_NE, 34 OP_NE,
35 OP_EQ, 35 OP_EQ,
36 OP_LT, 36 OP_LT,
@@ -48,16 +48,17 @@ struct filter_op {
48}; 48};
49 49
50static struct filter_op filter_ops[] = { 50static struct filter_op filter_ops[] = {
51 { OP_OR, "||", 1 }, 51 { OP_OR, "||", 1 },
52 { OP_AND, "&&", 2 }, 52 { OP_AND, "&&", 2 },
53 { OP_NE, "!=", 4 }, 53 { OP_GLOB, "~", 4 },
54 { OP_EQ, "==", 4 }, 54 { OP_NE, "!=", 4 },
55 { OP_LT, "<", 5 }, 55 { OP_EQ, "==", 4 },
56 { OP_LE, "<=", 5 }, 56 { OP_LT, "<", 5 },
57 { OP_GT, ">", 5 }, 57 { OP_LE, "<=", 5 },
58 { OP_GE, ">=", 5 }, 58 { OP_GT, ">", 5 },
59 { OP_NONE, "OP_NONE", 0 }, 59 { OP_GE, ">=", 5 },
60 { OP_OPEN_PAREN, "(", 0 }, 60 { OP_NONE, "OP_NONE", 0 },
61 { OP_OPEN_PAREN, "(", 0 },
61}; 62};
62 63
63enum { 64enum {
@@ -197,9 +198,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
197 char *addr = (char *)(event + pred->offset); 198 char *addr = (char *)(event + pred->offset);
198 int cmp, match; 199 int cmp, match;
199 200
200 cmp = strncmp(addr, pred->str_val, pred->str_len); 201 cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len);
201 202
202 match = (!cmp) ^ pred->not; 203 match = cmp ^ pred->not;
203 204
204 return match; 205 return match;
205} 206}
@@ -211,9 +212,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
211 char **addr = (char **)(event + pred->offset); 212 char **addr = (char **)(event + pred->offset);
212 int cmp, match; 213 int cmp, match;
213 214
214 cmp = strncmp(*addr, pred->str_val, pred->str_len); 215 cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len);
215 216
216 match = (!cmp) ^ pred->not; 217 match = cmp ^ pred->not;
217 218
218 return match; 219 return match;
219} 220}
@@ -237,9 +238,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
237 char *addr = (char *)(event + str_loc); 238 char *addr = (char *)(event + str_loc);
238 int cmp, match; 239 int cmp, match;
239 240
240 cmp = strncmp(addr, pred->str_val, str_len); 241 cmp = pred->regex.match(addr, &pred->regex, str_len);
241 242
242 match = (!cmp) ^ pred->not; 243 match = cmp ^ pred->not;
243 244
244 return match; 245 return match;
245} 246}
@@ -250,10 +251,121 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
250 return 0; 251 return 0;
251} 252}
252 253
254/* Basic regex callbacks */
255static int regex_match_full(char *str, struct regex *r, int len)
256{
257 if (strncmp(str, r->pattern, len) == 0)
258 return 1;
259 return 0;
260}
261
262static int regex_match_front(char *str, struct regex *r, int len)
263{
264 if (strncmp(str, r->pattern, len) == 0)
265 return 1;
266 return 0;
267}
268
269static int regex_match_middle(char *str, struct regex *r, int len)
270{
271 if (strstr(str, r->pattern))
272 return 1;
273 return 0;
274}
275
276static int regex_match_end(char *str, struct regex *r, int len)
277{
278 char *ptr = strstr(str, r->pattern);
279
280 if (ptr && (ptr[r->len] == 0))
281 return 1;
282 return 0;
283}
284
285/**
286 * filter_parse_regex - parse a basic regex
287 * @buff: the raw regex
288 * @len: length of the regex
289 * @search: will point to the beginning of the string to compare
290 * @not: tell whether the match will have to be inverted
291 *
292 * This passes in a buffer containing a regex and this function will
293 * set search to point to the search part of the buffer and
294 * return the type of search it is (see enum above).
295 * This does modify buff.
296 *
297 * Returns enum type.
298 * search returns the pointer to use for comparison.
299 * not returns 1 if buff started with a '!'
300 * 0 otherwise.
301 */
302enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
303{
304 int type = MATCH_FULL;
305 int i;
306
307 if (buff[0] == '!') {
308 *not = 1;
309 buff++;
310 len--;
311 } else
312 *not = 0;
313
314 *search = buff;
315
316 for (i = 0; i < len; i++) {
317 if (buff[i] == '*') {
318 if (!i) {
319 *search = buff + 1;
320 type = MATCH_END_ONLY;
321 } else {
322 if (type == MATCH_END_ONLY)
323 type = MATCH_MIDDLE_ONLY;
324 else
325 type = MATCH_FRONT_ONLY;
326 buff[i] = 0;
327 break;
328 }
329 }
330 }
331
332 return type;
333}
334
335static void filter_build_regex(struct filter_pred *pred)
336{
337 struct regex *r = &pred->regex;
338 char *search;
339 enum regex_type type = MATCH_FULL;
340 int not = 0;
341
342 if (pred->op == OP_GLOB) {
343 type = filter_parse_regex(r->pattern, r->len, &search, &not);
344 r->len = strlen(search);
345 memmove(r->pattern, search, r->len+1);
346 }
347
348 switch (type) {
349 case MATCH_FULL:
350 r->match = regex_match_full;
351 break;
352 case MATCH_FRONT_ONLY:
353 r->match = regex_match_front;
354 break;
355 case MATCH_MIDDLE_ONLY:
356 r->match = regex_match_middle;
357 break;
358 case MATCH_END_ONLY:
359 r->match = regex_match_end;
360 break;
361 }
362
363 pred->not ^= not;
364}
365
253/* return 1 if event matches, 0 otherwise (discard) */ 366/* return 1 if event matches, 0 otherwise (discard) */
254int filter_match_preds(struct ftrace_event_call *call, void *rec) 367int filter_match_preds(struct event_filter *filter, void *rec)
255{ 368{
256 struct event_filter *filter = call->filter;
257 int match, top = 0, val1 = 0, val2 = 0; 369 int match, top = 0, val1 = 0, val2 = 0;
258 int stack[MAX_FILTER_PRED]; 370 int stack[MAX_FILTER_PRED];
259 struct filter_pred *pred; 371 struct filter_pred *pred;
@@ -396,7 +508,7 @@ static void filter_clear_pred(struct filter_pred *pred)
396{ 508{
397 kfree(pred->field_name); 509 kfree(pred->field_name);
398 pred->field_name = NULL; 510 pred->field_name = NULL;
399 pred->str_len = 0; 511 pred->regex.len = 0;
400} 512}
401 513
402static int filter_set_pred(struct filter_pred *dest, 514static int filter_set_pred(struct filter_pred *dest,
@@ -426,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call)
426 filter->preds[i]->fn = filter_pred_none; 538 filter->preds[i]->fn = filter_pred_none;
427} 539}
428 540
429void destroy_preds(struct ftrace_event_call *call) 541static void __free_preds(struct event_filter *filter)
430{ 542{
431 struct event_filter *filter = call->filter;
432 int i; 543 int i;
433 544
434 if (!filter) 545 if (!filter)
@@ -441,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call)
441 kfree(filter->preds); 552 kfree(filter->preds);
442 kfree(filter->filter_string); 553 kfree(filter->filter_string);
443 kfree(filter); 554 kfree(filter);
555}
556
557void destroy_preds(struct ftrace_event_call *call)
558{
559 __free_preds(call->filter);
444 call->filter = NULL; 560 call->filter = NULL;
561 call->filter_active = 0;
445} 562}
446 563
447static int init_preds(struct ftrace_event_call *call) 564static struct event_filter *__alloc_preds(void)
448{ 565{
449 struct event_filter *filter; 566 struct event_filter *filter;
450 struct filter_pred *pred; 567 struct filter_pred *pred;
451 int i; 568 int i;
452 569
453 if (call->filter) 570 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
454 return 0; 571 if (!filter)
455 572 return ERR_PTR(-ENOMEM);
456 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
457 if (!call->filter)
458 return -ENOMEM;
459 573
460 filter->n_preds = 0; 574 filter->n_preds = 0;
461 575
@@ -471,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call)
471 filter->preds[i] = pred; 585 filter->preds[i] = pred;
472 } 586 }
473 587
474 return 0; 588 return filter;
475 589
476oom: 590oom:
477 destroy_preds(call); 591 __free_preds(filter);
592 return ERR_PTR(-ENOMEM);
593}
594
595static int init_preds(struct ftrace_event_call *call)
596{
597 if (call->filter)
598 return 0;
599
600 call->filter_active = 0;
601 call->filter = __alloc_preds();
602 if (IS_ERR(call->filter))
603 return PTR_ERR(call->filter);
478 604
479 return -ENOMEM; 605 return 0;
480} 606}
481 607
482static int init_subsystem_preds(struct event_subsystem *system) 608static int init_subsystem_preds(struct event_subsystem *system)
@@ -499,14 +625,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
499 return 0; 625 return 0;
500} 626}
501 627
502enum { 628static void filter_free_subsystem_preds(struct event_subsystem *system)
503 FILTER_DISABLE_ALL,
504 FILTER_INIT_NO_RESET,
505 FILTER_SKIP_NO_RESET,
506};
507
508static void filter_free_subsystem_preds(struct event_subsystem *system,
509 int flag)
510{ 629{
511 struct ftrace_event_call *call; 630 struct ftrace_event_call *call;
512 631
@@ -517,14 +636,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
517 if (strcmp(call->system, system->name) != 0) 636 if (strcmp(call->system, system->name) != 0)
518 continue; 637 continue;
519 638
520 if (flag == FILTER_INIT_NO_RESET) {
521 call->filter->no_reset = false;
522 continue;
523 }
524
525 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
526 continue;
527
528 filter_disable_preds(call); 639 filter_disable_preds(call);
529 remove_filter_string(call->filter); 640 remove_filter_string(call->filter);
530 } 641 }
@@ -532,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
532 643
533static int filter_add_pred_fn(struct filter_parse_state *ps, 644static int filter_add_pred_fn(struct filter_parse_state *ps,
534 struct ftrace_event_call *call, 645 struct ftrace_event_call *call,
646 struct event_filter *filter,
535 struct filter_pred *pred, 647 struct filter_pred *pred,
536 filter_pred_fn_t fn) 648 filter_pred_fn_t fn)
537{ 649{
538 struct event_filter *filter = call->filter;
539 int idx, err; 650 int idx, err;
540 651
541 if (filter->n_preds == MAX_FILTER_PRED) { 652 if (filter->n_preds == MAX_FILTER_PRED) {
@@ -550,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
550 return err; 661 return err;
551 662
552 filter->n_preds++; 663 filter->n_preds++;
553 call->filter_active = 1;
554 664
555 return 0; 665 return 0;
556} 666}
@@ -575,7 +685,10 @@ static bool is_string_field(struct ftrace_event_field *field)
575 685
576static int is_legal_op(struct ftrace_event_field *field, int op) 686static int is_legal_op(struct ftrace_event_field *field, int op)
577{ 687{
578 if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) 688 if (is_string_field(field) &&
689 (op != OP_EQ && op != OP_NE && op != OP_GLOB))
690 return 0;
691 if (!is_string_field(field) && op == OP_GLOB)
579 return 0; 692 return 0;
580 693
581 return 1; 694 return 1;
@@ -626,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
626 739
627static int filter_add_pred(struct filter_parse_state *ps, 740static int filter_add_pred(struct filter_parse_state *ps,
628 struct ftrace_event_call *call, 741 struct ftrace_event_call *call,
742 struct event_filter *filter,
629 struct filter_pred *pred, 743 struct filter_pred *pred,
630 bool dry_run) 744 bool dry_run)
631{ 745{
@@ -660,21 +774,22 @@ static int filter_add_pred(struct filter_parse_state *ps,
660 } 774 }
661 775
662 if (is_string_field(field)) { 776 if (is_string_field(field)) {
663 pred->str_len = field->size; 777 filter_build_regex(pred);
664 778
665 if (field->filter_type == FILTER_STATIC_STRING) 779 if (field->filter_type == FILTER_STATIC_STRING) {
666 fn = filter_pred_string; 780 fn = filter_pred_string;
667 else if (field->filter_type == FILTER_DYN_STRING) 781 pred->regex.field_len = field->size;
782 } else if (field->filter_type == FILTER_DYN_STRING)
668 fn = filter_pred_strloc; 783 fn = filter_pred_strloc;
669 else { 784 else {
670 fn = filter_pred_pchar; 785 fn = filter_pred_pchar;
671 pred->str_len = strlen(pred->str_val); 786 pred->regex.field_len = strlen(pred->regex.pattern);
672 } 787 }
673 } else { 788 } else {
674 if (field->is_signed) 789 if (field->is_signed)
675 ret = strict_strtoll(pred->str_val, 0, &val); 790 ret = strict_strtoll(pred->regex.pattern, 0, &val);
676 else 791 else
677 ret = strict_strtoull(pred->str_val, 0, &val); 792 ret = strict_strtoull(pred->regex.pattern, 0, &val);
678 if (ret) { 793 if (ret) {
679 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 794 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
680 return -EINVAL; 795 return -EINVAL;
@@ -694,45 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
694 809
695add_pred_fn: 810add_pred_fn:
696 if (!dry_run) 811 if (!dry_run)
697 return filter_add_pred_fn(ps, call, pred, fn); 812 return filter_add_pred_fn(ps, call, filter, pred, fn);
698 return 0;
699}
700
701static int filter_add_subsystem_pred(struct filter_parse_state *ps,
702 struct event_subsystem *system,
703 struct filter_pred *pred,
704 char *filter_string,
705 bool dry_run)
706{
707 struct ftrace_event_call *call;
708 int err = 0;
709 bool fail = true;
710
711 list_for_each_entry(call, &ftrace_events, list) {
712
713 if (!call->define_fields)
714 continue;
715
716 if (strcmp(call->system, system->name))
717 continue;
718
719 if (call->filter->no_reset)
720 continue;
721
722 err = filter_add_pred(ps, call, pred, dry_run);
723 if (err)
724 call->filter->no_reset = true;
725 else
726 fail = false;
727
728 if (!dry_run)
729 replace_filter_string(call->filter, filter_string);
730 }
731
732 if (fail) {
733 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
734 return err;
735 }
736 return 0; 813 return 0;
737} 814}
738 815
@@ -933,8 +1010,9 @@ static void postfix_clear(struct filter_parse_state *ps)
933 1010
934 while (!list_empty(&ps->postfix)) { 1011 while (!list_empty(&ps->postfix)) {
935 elt = list_first_entry(&ps->postfix, struct postfix_elt, list); 1012 elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
936 kfree(elt->operand);
937 list_del(&elt->list); 1013 list_del(&elt->list);
1014 kfree(elt->operand);
1015 kfree(elt);
938 } 1016 }
939} 1017}
940 1018
@@ -1044,8 +1122,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
1044 return NULL; 1122 return NULL;
1045 } 1123 }
1046 1124
1047 strcpy(pred->str_val, operand2); 1125 strcpy(pred->regex.pattern, operand2);
1048 pred->str_len = strlen(operand2); 1126 pred->regex.len = strlen(pred->regex.pattern);
1049 1127
1050 pred->op = op; 1128 pred->op = op;
1051 1129
@@ -1089,8 +1167,8 @@ static int check_preds(struct filter_parse_state *ps)
1089 return 0; 1167 return 0;
1090} 1168}
1091 1169
1092static int replace_preds(struct event_subsystem *system, 1170static int replace_preds(struct ftrace_event_call *call,
1093 struct ftrace_event_call *call, 1171 struct event_filter *filter,
1094 struct filter_parse_state *ps, 1172 struct filter_parse_state *ps,
1095 char *filter_string, 1173 char *filter_string,
1096 bool dry_run) 1174 bool dry_run)
@@ -1137,11 +1215,7 @@ static int replace_preds(struct event_subsystem *system,
1137add_pred: 1215add_pred:
1138 if (!pred) 1216 if (!pred)
1139 return -ENOMEM; 1217 return -ENOMEM;
1140 if (call) 1218 err = filter_add_pred(ps, call, filter, pred, dry_run);
1141 err = filter_add_pred(ps, call, pred, false);
1142 else
1143 err = filter_add_subsystem_pred(ps, system, pred,
1144 filter_string, dry_run);
1145 filter_free_pred(pred); 1219 filter_free_pred(pred);
1146 if (err) 1220 if (err)
1147 return err; 1221 return err;
@@ -1152,10 +1226,50 @@ add_pred:
1152 return 0; 1226 return 0;
1153} 1227}
1154 1228
1155int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1229static int replace_system_preds(struct event_subsystem *system,
1230 struct filter_parse_state *ps,
1231 char *filter_string)
1156{ 1232{
1233 struct ftrace_event_call *call;
1234 bool fail = true;
1157 int err; 1235 int err;
1158 1236
1237 list_for_each_entry(call, &ftrace_events, list) {
1238 struct event_filter *filter = call->filter;
1239
1240 if (!call->define_fields)
1241 continue;
1242
1243 if (strcmp(call->system, system->name) != 0)
1244 continue;
1245
1246 /* try to see if the filter can be applied */
1247 err = replace_preds(call, filter, ps, filter_string, true);
1248 if (err)
1249 continue;
1250
1251 /* really apply the filter */
1252 filter_disable_preds(call);
1253 err = replace_preds(call, filter, ps, filter_string, false);
1254 if (err)
1255 filter_disable_preds(call);
1256 else {
1257 call->filter_active = 1;
1258 replace_filter_string(filter, filter_string);
1259 }
1260 fail = false;
1261 }
1262
1263 if (fail) {
1264 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1265 return -EINVAL;
1266 }
1267 return 0;
1268}
1269
1270int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1271{
1272 int err;
1159 struct filter_parse_state *ps; 1273 struct filter_parse_state *ps;
1160 1274
1161 mutex_lock(&event_mutex); 1275 mutex_lock(&event_mutex);
@@ -1167,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1167 if (!strcmp(strstrip(filter_string), "0")) { 1281 if (!strcmp(strstrip(filter_string), "0")) {
1168 filter_disable_preds(call); 1282 filter_disable_preds(call);
1169 remove_filter_string(call->filter); 1283 remove_filter_string(call->filter);
1170 mutex_unlock(&event_mutex); 1284 goto out_unlock;
1171 return 0;
1172 } 1285 }
1173 1286
1174 err = -ENOMEM; 1287 err = -ENOMEM;
@@ -1186,10 +1299,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1186 goto out; 1299 goto out;
1187 } 1300 }
1188 1301
1189 err = replace_preds(NULL, call, ps, filter_string, false); 1302 err = replace_preds(call, call->filter, ps, filter_string, false);
1190 if (err) 1303 if (err)
1191 append_filter_err(ps, call->filter); 1304 append_filter_err(ps, call->filter);
1192 1305 else
1306 call->filter_active = 1;
1193out: 1307out:
1194 filter_opstack_clear(ps); 1308 filter_opstack_clear(ps);
1195 postfix_clear(ps); 1309 postfix_clear(ps);
@@ -1204,7 +1318,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1204 char *filter_string) 1318 char *filter_string)
1205{ 1319{
1206 int err; 1320 int err;
1207
1208 struct filter_parse_state *ps; 1321 struct filter_parse_state *ps;
1209 1322
1210 mutex_lock(&event_mutex); 1323 mutex_lock(&event_mutex);
@@ -1214,10 +1327,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1214 goto out_unlock; 1327 goto out_unlock;
1215 1328
1216 if (!strcmp(strstrip(filter_string), "0")) { 1329 if (!strcmp(strstrip(filter_string), "0")) {
1217 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); 1330 filter_free_subsystem_preds(system);
1218 remove_filter_string(system->filter); 1331 remove_filter_string(system->filter);
1219 mutex_unlock(&event_mutex); 1332 goto out_unlock;
1220 return 0;
1221 } 1333 }
1222 1334
1223 err = -ENOMEM; 1335 err = -ENOMEM;
@@ -1234,31 +1346,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1234 goto out; 1346 goto out;
1235 } 1347 }
1236 1348
1237 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); 1349 err = replace_system_preds(system, ps, filter_string);
1238 1350 if (err)
1239 /* try to see the filter can be applied to which events */
1240 err = replace_preds(system, NULL, ps, filter_string, true);
1241 if (err) {
1242 append_filter_err(ps, system->filter); 1351 append_filter_err(ps, system->filter);
1243 goto out; 1352
1353out:
1354 filter_opstack_clear(ps);
1355 postfix_clear(ps);
1356 kfree(ps);
1357out_unlock:
1358 mutex_unlock(&event_mutex);
1359
1360 return err;
1361}
1362
1363#ifdef CONFIG_EVENT_PROFILE
1364
1365void ftrace_profile_free_filter(struct perf_event *event)
1366{
1367 struct event_filter *filter = event->filter;
1368
1369 event->filter = NULL;
1370 __free_preds(filter);
1371}
1372
1373int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1374 char *filter_str)
1375{
1376 int err;
1377 struct event_filter *filter;
1378 struct filter_parse_state *ps;
1379 struct ftrace_event_call *call = NULL;
1380
1381 mutex_lock(&event_mutex);
1382
1383 list_for_each_entry(call, &ftrace_events, list) {
1384 if (call->id == event_id)
1385 break;
1244 } 1386 }
1245 1387
1246 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); 1388 err = -EINVAL;
1389 if (!call)
1390 goto out_unlock;
1247 1391
1248 /* really apply the filter to the events */ 1392 err = -EEXIST;
1249 err = replace_preds(system, NULL, ps, filter_string, false); 1393 if (event->filter)
1250 if (err) { 1394 goto out_unlock;
1251 append_filter_err(ps, system->filter); 1395
1252 filter_free_subsystem_preds(system, 2); 1396 filter = __alloc_preds();
1397 if (IS_ERR(filter)) {
1398 err = PTR_ERR(filter);
1399 goto out_unlock;
1253 } 1400 }
1254 1401
1255out: 1402 err = -ENOMEM;
1403 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1404 if (!ps)
1405 goto free_preds;
1406
1407 parse_init(ps, filter_ops, filter_str);
1408 err = filter_parse(ps);
1409 if (err)
1410 goto free_ps;
1411
1412 err = replace_preds(call, filter, ps, filter_str, false);
1413 if (!err)
1414 event->filter = filter;
1415
1416free_ps:
1256 filter_opstack_clear(ps); 1417 filter_opstack_clear(ps);
1257 postfix_clear(ps); 1418 postfix_clear(ps);
1258 kfree(ps); 1419 kfree(ps);
1420
1421free_preds:
1422 if (err)
1423 __free_preds(filter);
1424
1259out_unlock: 1425out_unlock:
1260 mutex_unlock(&event_mutex); 1426 mutex_unlock(&event_mutex);
1261 1427
1262 return err; 1428 return err;
1263} 1429}
1264 1430
1431#endif /* CONFIG_EVENT_PROFILE */
1432
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 9753fcc61bc5..dff8c84ddf17 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -48,11 +48,11 @@
48struct ____ftrace_##name { \ 48struct ____ftrace_##name { \
49 tstruct \ 49 tstruct \
50}; \ 50}; \
51static void __used ____ftrace_check_##name(void) \ 51static void __always_unused ____ftrace_check_##name(void) \
52{ \ 52{ \
53 struct ____ftrace_##name *__entry = NULL; \ 53 struct ____ftrace_##name *__entry = NULL; \
54 \ 54 \
55 /* force cmpile-time check on F_printk() */ \ 55 /* force compile-time check on F_printk() */ \
56 printk(print); \ 56 printk(print); \
57} 57}
58 58
@@ -66,44 +66,47 @@ static void __used ____ftrace_check_##name(void) \
66#undef __field 66#undef __field
67#define __field(type, item) \ 67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\n", \ 69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \ 70 offsetof(typeof(field), item), \
71 sizeof(field.item)); \ 71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \ 72 if (!ret) \
73 return 0; 73 return 0;
74 74
75#undef __field_desc 75#undef __field_desc
76#define __field_desc(type, container, item) \ 76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\n", \ 78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \ 79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item)); \ 80 sizeof(field.container.item), \
81 is_signed_type(type)); \
81 if (!ret) \ 82 if (!ret) \
82 return 0; 83 return 0;
83 84
84#undef __array 85#undef __array
85#define __array(type, item, len) \ 86#define __array(type, item, len) \
86 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ 87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
87 "offset:%zu;\tsize:%zu;\n", \ 88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
88 offsetof(typeof(field), item), \ 89 offsetof(typeof(field), item), \
89 sizeof(field.item)); \ 90 sizeof(field.item), is_signed_type(type)); \
90 if (!ret) \ 91 if (!ret) \
91 return 0; 92 return 0;
92 93
93#undef __array_desc 94#undef __array_desc
94#define __array_desc(type, container, item, len) \ 95#define __array_desc(type, container, item, len) \
95 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ 96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
96 "offset:%zu;\tsize:%zu;\n", \ 97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
97 offsetof(typeof(field), container.item), \ 98 offsetof(typeof(field), container.item), \
98 sizeof(field.container.item)); \ 99 sizeof(field.container.item), \
100 is_signed_type(type)); \
99 if (!ret) \ 101 if (!ret) \
100 return 0; 102 return 0;
101 103
102#undef __dynamic_array 104#undef __dynamic_array
103#define __dynamic_array(type, item) \ 105#define __dynamic_array(type, item) \
104 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
105 "offset:%zu;\tsize:0;\n", \ 107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
106 offsetof(typeof(field), item)); \ 108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
107 if (!ret) \ 110 if (!ret) \
108 return 0; 111 return 0;
109 112
@@ -131,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
131 134
132#include "trace_entries.h" 135#include "trace_entries.h"
133 136
134
135#undef __field 137#undef __field
136#define __field(type, item) \ 138#define __field(type, item) \
137 ret = trace_define_field(event_call, #type, #item, \ 139 ret = trace_define_field(event_call, #type, #item, \
@@ -193,6 +195,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
193 195
194#include "trace_entries.h" 196#include "trace_entries.h"
195 197
198static int ftrace_raw_init_event(struct ftrace_event_call *call)
199{
200 INIT_LIST_HEAD(&call->fields);
201 return 0;
202}
196 203
197#undef __field 204#undef __field
198#define __field(type, item) 205#define __field(type, item)
@@ -211,7 +218,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
211 218
212#undef FTRACE_ENTRY 219#undef FTRACE_ENTRY
213#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 220#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
214static int ftrace_raw_init_event_##call(void); \
215 \ 221 \
216struct ftrace_event_call __used \ 222struct ftrace_event_call __used \
217__attribute__((__aligned__(4))) \ 223__attribute__((__aligned__(4))) \
@@ -219,14 +225,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
219 .name = #call, \ 225 .name = #call, \
220 .id = type, \ 226 .id = type, \
221 .system = __stringify(TRACE_SYSTEM), \ 227 .system = __stringify(TRACE_SYSTEM), \
222 .raw_init = ftrace_raw_init_event_##call, \ 228 .raw_init = ftrace_raw_init_event, \
223 .show_format = ftrace_format_##call, \ 229 .show_format = ftrace_format_##call, \
224 .define_fields = ftrace_define_fields_##call, \ 230 .define_fields = ftrace_define_fields_##call, \
225}; \ 231}; \
226static int ftrace_raw_init_event_##call(void) \
227{ \
228 INIT_LIST_HEAD(&event_##call.fields); \
229 return 0; \
230} \
231 232
232#include "trace_entries.h" 233#include "trace_entries.h"
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 23b63859130e..69543a905cd5 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -165,6 +165,7 @@ void trace_hw_branch(u64 from, u64 to)
165 struct ftrace_event_call *call = &event_hw_branch; 165 struct ftrace_event_call *call = &event_hw_branch;
166 struct trace_array *tr = hw_branch_trace; 166 struct trace_array *tr = hw_branch_trace;
167 struct ring_buffer_event *event; 167 struct ring_buffer_event *event;
168 struct ring_buffer *buf;
168 struct hw_branch_entry *entry; 169 struct hw_branch_entry *entry;
169 unsigned long irq1; 170 unsigned long irq1;
170 int cpu; 171 int cpu;
@@ -180,7 +181,8 @@ void trace_hw_branch(u64 from, u64 to)
180 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) 181 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
181 goto out; 182 goto out;
182 183
183 event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES, 184 buf = tr->buffer;
185 event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
184 sizeof(*entry), 0, 0); 186 sizeof(*entry), 0, 0);
185 if (!event) 187 if (!event)
186 goto out; 188 goto out;
@@ -189,8 +191,8 @@ void trace_hw_branch(u64 from, u64 to)
189 entry->ent.type = TRACE_HW_BRANCHES; 191 entry->ent.type = TRACE_HW_BRANCHES;
190 entry->from = from; 192 entry->from = from;
191 entry->to = to; 193 entry->to = to;
192 if (!filter_check_discard(call, entry, tr->buffer, event)) 194 if (!filter_check_discard(call, entry, buf, event))
193 trace_buffer_unlock_commit(tr, event, 0, 0); 195 trace_buffer_unlock_commit(buf, event, 0, 0);
194 196
195 out: 197 out:
196 atomic_dec(&tr->data[cpu]->disabled); 198 atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644
index 000000000000..aff5f80b59b8
--- /dev/null
+++ b/kernel/trace/trace_kprobe.c
@@ -0,0 +1,1523 @@
1/*
2 * Kprobes-based tracing events
3 *
4 * Created by Masami Hiramatsu <mhiramat@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/module.h>
21#include <linux/uaccess.h>
22#include <linux/kprobes.h>
23#include <linux/seq_file.h>
24#include <linux/slab.h>
25#include <linux/smp.h>
26#include <linux/debugfs.h>
27#include <linux/types.h>
28#include <linux/string.h>
29#include <linux/ctype.h>
30#include <linux/ptrace.h>
31#include <linux/perf_event.h>
32
33#include "trace.h"
34#include "trace_output.h"
35
36#define MAX_TRACE_ARGS 128
37#define MAX_ARGSTR_LEN 63
38#define MAX_EVENT_NAME_LEN 64
39#define KPROBE_EVENT_SYSTEM "kprobes"
40
41/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func"
46
47const char *reserved_field_names[] = {
48 "common_type",
49 "common_flags",
50 "common_preempt_count",
51 "common_pid",
52 "common_tgid",
53 "common_lock_depth",
54 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC,
58};
59
60struct fetch_func {
61 unsigned long (*func)(struct pt_regs *, void *);
62 void *data;
63};
64
65static __kprobes unsigned long call_fetch(struct fetch_func *f,
66 struct pt_regs *regs)
67{
68 return f->func(regs, f->data);
69}
70
71/* fetch handlers */
72static __kprobes unsigned long fetch_register(struct pt_regs *regs,
73 void *offset)
74{
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset));
76}
77
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
79 void *num)
80{
81 return regs_get_kernel_stack_nth(regs,
82 (unsigned int)((unsigned long)num));
83}
84
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
86{
87 unsigned long retval;
88
89 if (probe_kernel_address(addr, retval))
90 return 0;
91 return retval;
92}
93
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
95{
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
97}
98
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
100 void *dummy)
101{
102 return regs_return_value(regs);
103}
104
105static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
106 void *dummy)
107{
108 return kernel_stack_pointer(regs);
109}
110
111/* Memory fetching by symbol */
112struct symbol_cache {
113 char *symbol;
114 long offset;
115 unsigned long addr;
116};
117
118static unsigned long update_symbol_cache(struct symbol_cache *sc)
119{
120 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
121 if (sc->addr)
122 sc->addr += sc->offset;
123 return sc->addr;
124}
125
126static void free_symbol_cache(struct symbol_cache *sc)
127{
128 kfree(sc->symbol);
129 kfree(sc);
130}
131
132static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
133{
134 struct symbol_cache *sc;
135
136 if (!sym || strlen(sym) == 0)
137 return NULL;
138 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
139 if (!sc)
140 return NULL;
141
142 sc->symbol = kstrdup(sym, GFP_KERNEL);
143 if (!sc->symbol) {
144 kfree(sc);
145 return NULL;
146 }
147 sc->offset = offset;
148
149 update_symbol_cache(sc);
150 return sc;
151}
152
153static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
154{
155 struct symbol_cache *sc = data;
156
157 if (sc->addr)
158 return fetch_memory(regs, (void *)sc->addr);
159 else
160 return 0;
161}
162
163/* Special indirect memory access interface */
164struct indirect_fetch_data {
165 struct fetch_func orig;
166 long offset;
167};
168
169static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
170{
171 struct indirect_fetch_data *ind = data;
172 unsigned long addr;
173
174 addr = call_fetch(&ind->orig, regs);
175 if (addr) {
176 addr += ind->offset;
177 return fetch_memory(regs, (void *)addr);
178 } else
179 return 0;
180}
181
182static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
183{
184 if (data->orig.func == fetch_indirect)
185 free_indirect_fetch_data(data->orig.data);
186 else if (data->orig.func == fetch_symbol)
187 free_symbol_cache(data->orig.data);
188 kfree(data);
189}
190
191/**
192 * Kprobe event core functions
193 */
194
195struct probe_arg {
196 struct fetch_func fetch;
197 const char *name;
198};
199
200/* Flags for trace_probe */
201#define TP_FLAG_TRACE 1
202#define TP_FLAG_PROFILE 2
203
204struct trace_probe {
205 struct list_head list;
206 struct kretprobe rp; /* Use rp.kp for kprobe use */
207 unsigned long nhit;
208 unsigned int flags; /* For TP_FLAG_* */
209 const char *symbol; /* symbol name */
210 struct ftrace_event_call call;
211 struct trace_event event;
212 unsigned int nr_args;
213 struct probe_arg args[];
214};
215
216#define SIZEOF_TRACE_PROBE(n) \
217 (offsetof(struct trace_probe, args) + \
218 (sizeof(struct probe_arg) * (n)))
219
220static __kprobes int probe_is_return(struct trace_probe *tp)
221{
222 return tp->rp.handler != NULL;
223}
224
225static __kprobes const char *probe_symbol(struct trace_probe *tp)
226{
227 return tp->symbol ? tp->symbol : "unknown";
228}
229
230static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{
232 int ret = -EINVAL;
233
234 if (ff->func == fetch_argument)
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name);
240 } else if (ff->func == fetch_stack)
241 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
242 else if (ff->func == fetch_memory)
243 ret = snprintf(buf, n, "@0x%p", ff->data);
244 else if (ff->func == fetch_symbol) {
245 struct symbol_cache *sc = ff->data;
246 if (sc->offset)
247 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
248 sc->offset);
249 else
250 ret = snprintf(buf, n, "@%s", sc->symbol);
251 } else if (ff->func == fetch_retvalue)
252 ret = snprintf(buf, n, "$retval");
253 else if (ff->func == fetch_stack_address)
254 ret = snprintf(buf, n, "$stack");
255 else if (ff->func == fetch_indirect) {
256 struct indirect_fetch_data *id = ff->data;
257 size_t l = 0;
258 ret = snprintf(buf, n, "%+ld(", id->offset);
259 if (ret >= n)
260 goto end;
261 l += ret;
262 ret = probe_arg_string(buf + l, n - l, &id->orig);
263 if (ret < 0)
264 goto end;
265 l += ret;
266 ret = snprintf(buf + l, n - l, ")");
267 ret += l;
268 }
269end:
270 if (ret >= n)
271 return -ENOSPC;
272 return ret;
273}
274
275static int register_probe_event(struct trace_probe *tp);
276static void unregister_probe_event(struct trace_probe *tp);
277
278static DEFINE_MUTEX(probe_lock);
279static LIST_HEAD(probe_list);
280
281static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
282static int kretprobe_dispatcher(struct kretprobe_instance *ri,
283 struct pt_regs *regs);
284
285/*
286 * Allocate new trace_probe and initialize it (including kprobes).
287 */
288static struct trace_probe *alloc_trace_probe(const char *group,
289 const char *event,
290 void *addr,
291 const char *symbol,
292 unsigned long offs,
293 int nargs, int is_return)
294{
295 struct trace_probe *tp;
296
297 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
298 if (!tp)
299 return ERR_PTR(-ENOMEM);
300
301 if (symbol) {
302 tp->symbol = kstrdup(symbol, GFP_KERNEL);
303 if (!tp->symbol)
304 goto error;
305 tp->rp.kp.symbol_name = tp->symbol;
306 tp->rp.kp.offset = offs;
307 } else
308 tp->rp.kp.addr = addr;
309
310 if (is_return)
311 tp->rp.handler = kretprobe_dispatcher;
312 else
313 tp->rp.kp.pre_handler = kprobe_dispatcher;
314
315 if (!event)
316 goto error;
317 tp->call.name = kstrdup(event, GFP_KERNEL);
318 if (!tp->call.name)
319 goto error;
320
321 if (!group)
322 goto error;
323 tp->call.system = kstrdup(group, GFP_KERNEL);
324 if (!tp->call.system)
325 goto error;
326
327 INIT_LIST_HEAD(&tp->list);
328 return tp;
329error:
330 kfree(tp->call.name);
331 kfree(tp->symbol);
332 kfree(tp);
333 return ERR_PTR(-ENOMEM);
334}
335
336static void free_probe_arg(struct probe_arg *arg)
337{
338 if (arg->fetch.func == fetch_symbol)
339 free_symbol_cache(arg->fetch.data);
340 else if (arg->fetch.func == fetch_indirect)
341 free_indirect_fetch_data(arg->fetch.data);
342 kfree(arg->name);
343}
344
345static void free_trace_probe(struct trace_probe *tp)
346{
347 int i;
348
349 for (i = 0; i < tp->nr_args; i++)
350 free_probe_arg(&tp->args[i]);
351
352 kfree(tp->call.system);
353 kfree(tp->call.name);
354 kfree(tp->symbol);
355 kfree(tp);
356}
357
358static struct trace_probe *find_probe_event(const char *event,
359 const char *group)
360{
361 struct trace_probe *tp;
362
363 list_for_each_entry(tp, &probe_list, list)
364 if (strcmp(tp->call.name, event) == 0 &&
365 strcmp(tp->call.system, group) == 0)
366 return tp;
367 return NULL;
368}
369
370/* Unregister a trace_probe and probe_event: call with locking probe_lock */
371static void unregister_trace_probe(struct trace_probe *tp)
372{
373 if (probe_is_return(tp))
374 unregister_kretprobe(&tp->rp);
375 else
376 unregister_kprobe(&tp->rp.kp);
377 list_del(&tp->list);
378 unregister_probe_event(tp);
379}
380
381/* Register a trace_probe and probe_event */
382static int register_trace_probe(struct trace_probe *tp)
383{
384 struct trace_probe *old_tp;
385 int ret;
386
387 mutex_lock(&probe_lock);
388
389 /* register as an event */
390 old_tp = find_probe_event(tp->call.name, tp->call.system);
391 if (old_tp) {
392 /* delete old event */
393 unregister_trace_probe(old_tp);
394 free_trace_probe(old_tp);
395 }
396 ret = register_probe_event(tp);
397 if (ret) {
398 pr_warning("Faild to register probe event(%d)\n", ret);
399 goto end;
400 }
401
402 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
403 if (probe_is_return(tp))
404 ret = register_kretprobe(&tp->rp);
405 else
406 ret = register_kprobe(&tp->rp.kp);
407
408 if (ret) {
409 pr_warning("Could not insert probe(%d)\n", ret);
410 if (ret == -EILSEQ) {
411 pr_warning("Probing address(0x%p) is not an "
412 "instruction boundary.\n",
413 tp->rp.kp.addr);
414 ret = -EINVAL;
415 }
416 unregister_probe_event(tp);
417 } else
418 list_add_tail(&tp->list, &probe_list);
419end:
420 mutex_unlock(&probe_lock);
421 return ret;
422}
423
424/* Split symbol and offset. */
425static int split_symbol_offset(char *symbol, unsigned long *offset)
426{
427 char *tmp;
428 int ret;
429
430 if (!offset)
431 return -EINVAL;
432
433 tmp = strchr(symbol, '+');
434 if (tmp) {
435 /* skip sign because strict_strtol doesn't accept '+' */
436 ret = strict_strtoul(tmp + 1, 0, offset);
437 if (ret)
438 return ret;
439 *tmp = '\0';
440 } else
441 *offset = 0;
442 return 0;
443}
444
445#define PARAM_MAX_ARGS 16
446#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
447
448static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
449{
450 int ret = 0;
451 unsigned long param;
452
453 if (strcmp(arg, "retval") == 0) {
454 if (is_return) {
455 ff->func = fetch_retvalue;
456 ff->data = NULL;
457 } else
458 ret = -EINVAL;
459 } else if (strncmp(arg, "stack", 5) == 0) {
460 if (arg[5] == '\0') {
461 ff->func = fetch_stack_address;
462 ff->data = NULL;
463 } else if (isdigit(arg[5])) {
464 ret = strict_strtoul(arg + 5, 10, &param);
465 if (ret || param > PARAM_MAX_STACK)
466 ret = -EINVAL;
467 else {
468 ff->func = fetch_stack;
469 ff->data = (void *)param;
470 }
471 } else
472 ret = -EINVAL;
473 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
474 ret = strict_strtoul(arg + 3, 10, &param);
475 if (ret || param > PARAM_MAX_ARGS)
476 ret = -EINVAL;
477 else {
478 ff->func = fetch_argument;
479 ff->data = (void *)param;
480 }
481 } else
482 ret = -EINVAL;
483 return ret;
484}
485
486/* Recursive argument parser */
487static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
488{
489 int ret = 0;
490 unsigned long param;
491 long offset;
492 char *tmp;
493
494 switch (arg[0]) {
495 case '$':
496 ret = parse_probe_vars(arg + 1, ff, is_return);
497 break;
498 case '%': /* named register */
499 ret = regs_query_register_offset(arg + 1);
500 if (ret >= 0) {
501 ff->func = fetch_register;
502 ff->data = (void *)(unsigned long)ret;
503 ret = 0;
504 }
505 break;
506 case '@': /* memory or symbol */
507 if (isdigit(arg[1])) {
508 ret = strict_strtoul(arg + 1, 0, &param);
509 if (ret)
510 break;
511 ff->func = fetch_memory;
512 ff->data = (void *)param;
513 } else {
514 ret = split_symbol_offset(arg + 1, &offset);
515 if (ret)
516 break;
517 ff->data = alloc_symbol_cache(arg + 1, offset);
518 if (ff->data)
519 ff->func = fetch_symbol;
520 else
521 ret = -EINVAL;
522 }
523 break;
524 case '+': /* indirect memory */
525 case '-':
526 tmp = strchr(arg, '(');
527 if (!tmp) {
528 ret = -EINVAL;
529 break;
530 }
531 *tmp = '\0';
532 ret = strict_strtol(arg + 1, 0, &offset);
533 if (ret)
534 break;
535 if (arg[0] == '-')
536 offset = -offset;
537 arg = tmp + 1;
538 tmp = strrchr(arg, ')');
539 if (tmp) {
540 struct indirect_fetch_data *id;
541 *tmp = '\0';
542 id = kzalloc(sizeof(struct indirect_fetch_data),
543 GFP_KERNEL);
544 if (!id)
545 return -ENOMEM;
546 id->offset = offset;
547 ret = __parse_probe_arg(arg, &id->orig, is_return);
548 if (ret)
549 kfree(id);
550 else {
551 ff->func = fetch_indirect;
552 ff->data = (void *)id;
553 }
554 } else
555 ret = -EINVAL;
556 break;
557 default:
558 /* TODO: support custom handler */
559 ret = -EINVAL;
560 }
561 return ret;
562}
563
564/* String length checking wrapper */
565static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
566{
567 if (strlen(arg) > MAX_ARGSTR_LEN) {
568 pr_info("Argument is too long.: %s\n", arg);
569 return -ENOSPC;
570 }
571 return __parse_probe_arg(arg, ff, is_return);
572}
573
574/* Return 1 if name is reserved or already used by another argument */
575static int conflict_field_name(const char *name,
576 struct probe_arg *args, int narg)
577{
578 int i;
579 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
580 if (strcmp(reserved_field_names[i], name) == 0)
581 return 1;
582 for (i = 0; i < narg; i++)
583 if (strcmp(args[i].name, name) == 0)
584 return 1;
585 return 0;
586}
587
588static int create_trace_probe(int argc, char **argv)
589{
590 /*
591 * Argument syntax:
592 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
593 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
594 * Fetch args:
595 * $argN : fetch Nth of function argument. (N:0-)
596 * $retval : fetch return value
597 * $stack : fetch stack address
598 * $stackN : fetch Nth of stack (N:0-)
599 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
600 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
601 * %REG : fetch register REG
602 * Indirect memory fetch:
603 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
604 * Alias name of args:
605 * NAME=FETCHARG : set NAME as alias of FETCHARG.
606 */
607 struct trace_probe *tp;
608 int i, ret = 0;
609 int is_return = 0;
610 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
611 unsigned long offset = 0;
612 void *addr = NULL;
613 char buf[MAX_EVENT_NAME_LEN];
614
615 if (argc < 2) {
616 pr_info("Probe point is not specified.\n");
617 return -EINVAL;
618 }
619
620 if (argv[0][0] == 'p')
621 is_return = 0;
622 else if (argv[0][0] == 'r')
623 is_return = 1;
624 else {
625 pr_info("Probe definition must be started with 'p' or 'r'.\n");
626 return -EINVAL;
627 }
628
629 if (argv[0][1] == ':') {
630 event = &argv[0][2];
631 if (strchr(event, '/')) {
632 group = event;
633 event = strchr(group, '/') + 1;
634 event[-1] = '\0';
635 if (strlen(group) == 0) {
636 pr_info("Group name is not specifiled\n");
637 return -EINVAL;
638 }
639 }
640 if (strlen(event) == 0) {
641 pr_info("Event name is not specifiled\n");
642 return -EINVAL;
643 }
644 }
645
646 if (isdigit(argv[1][0])) {
647 if (is_return) {
648 pr_info("Return probe point must be a symbol.\n");
649 return -EINVAL;
650 }
651 /* an address specified */
652 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr);
653 if (ret) {
654 pr_info("Failed to parse address.\n");
655 return ret;
656 }
657 } else {
658 /* a symbol specified */
659 symbol = argv[1];
660 /* TODO: support .init module functions */
661 ret = split_symbol_offset(symbol, &offset);
662 if (ret) {
663 pr_info("Failed to parse symbol.\n");
664 return ret;
665 }
666 if (offset && is_return) {
667 pr_info("Return probe must be used without offset.\n");
668 return -EINVAL;
669 }
670 }
671 argc -= 2; argv += 2;
672
673 /* setup a probe */
674 if (!group)
675 group = KPROBE_EVENT_SYSTEM;
676 if (!event) {
677 /* Make a new event name */
678 if (symbol)
679 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld",
680 is_return ? 'r' : 'p', symbol, offset);
681 else
682 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p",
683 is_return ? 'r' : 'p', addr);
684 event = buf;
685 }
686 tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
687 is_return);
688 if (IS_ERR(tp)) {
689 pr_info("Failed to allocate trace_probe.(%d)\n",
690 (int)PTR_ERR(tp));
691 return PTR_ERR(tp);
692 }
693
694 /* parse arguments */
695 ret = 0;
696 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
697 /* Parse argument name */
698 arg = strchr(argv[i], '=');
699 if (arg)
700 *arg++ = '\0';
701 else
702 arg = argv[i];
703
704 if (conflict_field_name(argv[i], tp->args, i)) {
705 pr_info("Argument%d name '%s' conflicts with "
706 "another field.\n", i, argv[i]);
707 ret = -EINVAL;
708 goto error;
709 }
710
711 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
712 if (!tp->args[i].name) {
713 pr_info("Failed to allocate argument%d name '%s'.\n",
714 i, argv[i]);
715 ret = -ENOMEM;
716 goto error;
717 }
718
719 /* Parse fetch argument */
720 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
721 if (ret) {
722 pr_info("Parse error at argument%d. (%d)\n", i, ret);
723 kfree(tp->args[i].name);
724 goto error;
725 }
726
727 tp->nr_args++;
728 }
729
730 ret = register_trace_probe(tp);
731 if (ret)
732 goto error;
733 return 0;
734
735error:
736 free_trace_probe(tp);
737 return ret;
738}
739
740static void cleanup_all_probes(void)
741{
742 struct trace_probe *tp;
743
744 mutex_lock(&probe_lock);
745 /* TODO: Use batch unregistration */
746 while (!list_empty(&probe_list)) {
747 tp = list_entry(probe_list.next, struct trace_probe, list);
748 unregister_trace_probe(tp);
749 free_trace_probe(tp);
750 }
751 mutex_unlock(&probe_lock);
752}
753
754
755/* Probes listing interfaces */
756static void *probes_seq_start(struct seq_file *m, loff_t *pos)
757{
758 mutex_lock(&probe_lock);
759 return seq_list_start(&probe_list, *pos);
760}
761
762static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
763{
764 return seq_list_next(v, &probe_list, pos);
765}
766
767static void probes_seq_stop(struct seq_file *m, void *v)
768{
769 mutex_unlock(&probe_lock);
770}
771
772static int probes_seq_show(struct seq_file *m, void *v)
773{
774 struct trace_probe *tp = v;
775 int i, ret;
776 char buf[MAX_ARGSTR_LEN + 1];
777
778 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
779 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
780
781 if (!tp->symbol)
782 seq_printf(m, " 0x%p", tp->rp.kp.addr);
783 else if (tp->rp.kp.offset)
784 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
785 else
786 seq_printf(m, " %s", probe_symbol(tp));
787
788 for (i = 0; i < tp->nr_args; i++) {
789 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
790 if (ret < 0) {
791 pr_warning("Argument%d decoding error(%d).\n", i, ret);
792 return ret;
793 }
794 seq_printf(m, " %s=%s", tp->args[i].name, buf);
795 }
796 seq_printf(m, "\n");
797 return 0;
798}
799
800static const struct seq_operations probes_seq_op = {
801 .start = probes_seq_start,
802 .next = probes_seq_next,
803 .stop = probes_seq_stop,
804 .show = probes_seq_show
805};
806
807static int probes_open(struct inode *inode, struct file *file)
808{
809 if ((file->f_mode & FMODE_WRITE) &&
810 (file->f_flags & O_TRUNC))
811 cleanup_all_probes();
812
813 return seq_open(file, &probes_seq_op);
814}
815
816static int command_trace_probe(const char *buf)
817{
818 char **argv;
819 int argc = 0, ret = 0;
820
821 argv = argv_split(GFP_KERNEL, buf, &argc);
822 if (!argv)
823 return -ENOMEM;
824
825 if (argc)
826 ret = create_trace_probe(argc, argv);
827
828 argv_free(argv);
829 return ret;
830}
831
832#define WRITE_BUFSIZE 128
833
834static ssize_t probes_write(struct file *file, const char __user *buffer,
835 size_t count, loff_t *ppos)
836{
837 char *kbuf, *tmp;
838 int ret;
839 size_t done;
840 size_t size;
841
842 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
843 if (!kbuf)
844 return -ENOMEM;
845
846 ret = done = 0;
847 while (done < count) {
848 size = count - done;
849 if (size >= WRITE_BUFSIZE)
850 size = WRITE_BUFSIZE - 1;
851 if (copy_from_user(kbuf, buffer + done, size)) {
852 ret = -EFAULT;
853 goto out;
854 }
855 kbuf[size] = '\0';
856 tmp = strchr(kbuf, '\n');
857 if (tmp) {
858 *tmp = '\0';
859 size = tmp - kbuf + 1;
860 } else if (done + size < count) {
861 pr_warning("Line length is too long: "
862 "Should be less than %d.", WRITE_BUFSIZE);
863 ret = -EINVAL;
864 goto out;
865 }
866 done += size;
867 /* Remove comments */
868 tmp = strchr(kbuf, '#');
869 if (tmp)
870 *tmp = '\0';
871
872 ret = command_trace_probe(kbuf);
873 if (ret)
874 goto out;
875 }
876 ret = done;
877out:
878 kfree(kbuf);
879 return ret;
880}
881
882static const struct file_operations kprobe_events_ops = {
883 .owner = THIS_MODULE,
884 .open = probes_open,
885 .read = seq_read,
886 .llseek = seq_lseek,
887 .release = seq_release,
888 .write = probes_write,
889};
890
891/* Probes profiling interfaces */
892static int probes_profile_seq_show(struct seq_file *m, void *v)
893{
894 struct trace_probe *tp = v;
895
896 seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
897 tp->rp.kp.nmissed);
898
899 return 0;
900}
901
902static const struct seq_operations profile_seq_op = {
903 .start = probes_seq_start,
904 .next = probes_seq_next,
905 .stop = probes_seq_stop,
906 .show = probes_profile_seq_show
907};
908
909static int profile_open(struct inode *inode, struct file *file)
910{
911 return seq_open(file, &profile_seq_op);
912}
913
914static const struct file_operations kprobe_profile_ops = {
915 .owner = THIS_MODULE,
916 .open = profile_open,
917 .read = seq_read,
918 .llseek = seq_lseek,
919 .release = seq_release,
920};
921
922/* Kprobe handler */
923static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
924{
925 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
926 struct kprobe_trace_entry *entry;
927 struct ring_buffer_event *event;
928 struct ring_buffer *buffer;
929 int size, i, pc;
930 unsigned long irq_flags;
931 struct ftrace_event_call *call = &tp->call;
932
933 tp->nhit++;
934
935 local_save_flags(irq_flags);
936 pc = preempt_count();
937
938 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
939
940 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
941 irq_flags, pc);
942 if (!event)
943 return 0;
944
945 entry = ring_buffer_event_data(event);
946 entry->nargs = tp->nr_args;
947 entry->ip = (unsigned long)kp->addr;
948 for (i = 0; i < tp->nr_args; i++)
949 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
950
951 if (!filter_current_check_discard(buffer, call, entry, event))
952 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
953 return 0;
954}
955
956/* Kretprobe handler */
957static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
958 struct pt_regs *regs)
959{
960 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
961 struct kretprobe_trace_entry *entry;
962 struct ring_buffer_event *event;
963 struct ring_buffer *buffer;
964 int size, i, pc;
965 unsigned long irq_flags;
966 struct ftrace_event_call *call = &tp->call;
967
968 local_save_flags(irq_flags);
969 pc = preempt_count();
970
971 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
972
973 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
974 irq_flags, pc);
975 if (!event)
976 return 0;
977
978 entry = ring_buffer_event_data(event);
979 entry->nargs = tp->nr_args;
980 entry->func = (unsigned long)tp->rp.kp.addr;
981 entry->ret_ip = (unsigned long)ri->ret_addr;
982 for (i = 0; i < tp->nr_args; i++)
983 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
984
985 if (!filter_current_check_discard(buffer, call, entry, event))
986 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
987
988 return 0;
989}
990
991/* Event entry printers */
992enum print_line_t
993print_kprobe_event(struct trace_iterator *iter, int flags)
994{
995 struct kprobe_trace_entry *field;
996 struct trace_seq *s = &iter->seq;
997 struct trace_event *event;
998 struct trace_probe *tp;
999 int i;
1000
1001 field = (struct kprobe_trace_entry *)iter->ent;
1002 event = ftrace_find_event(field->ent.type);
1003 tp = container_of(event, struct trace_probe, event);
1004
1005 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1006 goto partial;
1007
1008 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
1009 goto partial;
1010
1011 if (!trace_seq_puts(s, ")"))
1012 goto partial;
1013
1014 for (i = 0; i < field->nargs; i++)
1015 if (!trace_seq_printf(s, " %s=%lx",
1016 tp->args[i].name, field->args[i]))
1017 goto partial;
1018
1019 if (!trace_seq_puts(s, "\n"))
1020 goto partial;
1021
1022 return TRACE_TYPE_HANDLED;
1023partial:
1024 return TRACE_TYPE_PARTIAL_LINE;
1025}
1026
1027enum print_line_t
1028print_kretprobe_event(struct trace_iterator *iter, int flags)
1029{
1030 struct kretprobe_trace_entry *field;
1031 struct trace_seq *s = &iter->seq;
1032 struct trace_event *event;
1033 struct trace_probe *tp;
1034 int i;
1035
1036 field = (struct kretprobe_trace_entry *)iter->ent;
1037 event = ftrace_find_event(field->ent.type);
1038 tp = container_of(event, struct trace_probe, event);
1039
1040 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1041 goto partial;
1042
1043 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
1044 goto partial;
1045
1046 if (!trace_seq_puts(s, " <- "))
1047 goto partial;
1048
1049 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
1050 goto partial;
1051
1052 if (!trace_seq_puts(s, ")"))
1053 goto partial;
1054
1055 for (i = 0; i < field->nargs; i++)
1056 if (!trace_seq_printf(s, " %s=%lx",
1057 tp->args[i].name, field->args[i]))
1058 goto partial;
1059
1060 if (!trace_seq_puts(s, "\n"))
1061 goto partial;
1062
1063 return TRACE_TYPE_HANDLED;
1064partial:
1065 return TRACE_TYPE_PARTIAL_LINE;
1066}
1067
1068static int probe_event_enable(struct ftrace_event_call *call)
1069{
1070 struct trace_probe *tp = (struct trace_probe *)call->data;
1071
1072 tp->flags |= TP_FLAG_TRACE;
1073 if (probe_is_return(tp))
1074 return enable_kretprobe(&tp->rp);
1075 else
1076 return enable_kprobe(&tp->rp.kp);
1077}
1078
1079static void probe_event_disable(struct ftrace_event_call *call)
1080{
1081 struct trace_probe *tp = (struct trace_probe *)call->data;
1082
1083 tp->flags &= ~TP_FLAG_TRACE;
1084 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1085 if (probe_is_return(tp))
1086 disable_kretprobe(&tp->rp);
1087 else
1088 disable_kprobe(&tp->rp.kp);
1089 }
1090}
1091
1092static int probe_event_raw_init(struct ftrace_event_call *event_call)
1093{
1094 INIT_LIST_HEAD(&event_call->fields);
1095
1096 return 0;
1097}
1098
1099#undef DEFINE_FIELD
1100#define DEFINE_FIELD(type, item, name, is_signed) \
1101 do { \
1102 ret = trace_define_field(event_call, #type, name, \
1103 offsetof(typeof(field), item), \
1104 sizeof(field.item), is_signed, \
1105 FILTER_OTHER); \
1106 if (ret) \
1107 return ret; \
1108 } while (0)
1109
1110static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1111{
1112 int ret, i;
1113 struct kprobe_trace_entry field;
1114 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1115
1116 ret = trace_define_common_fields(event_call);
1117 if (!ret)
1118 return ret;
1119
1120 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1121 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1122 /* Set argument names as fields */
1123 for (i = 0; i < tp->nr_args; i++)
1124 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1125 return 0;
1126}
1127
1128static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1129{
1130 int ret, i;
1131 struct kretprobe_trace_entry field;
1132 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1133
1134 ret = trace_define_common_fields(event_call);
1135 if (!ret)
1136 return ret;
1137
1138 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1139 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1140 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1141 /* Set argument names as fields */
1142 for (i = 0; i < tp->nr_args; i++)
1143 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1144 return 0;
1145}
1146
1147static int __probe_event_show_format(struct trace_seq *s,
1148 struct trace_probe *tp, const char *fmt,
1149 const char *arg)
1150{
1151 int i;
1152
1153 /* Show format */
1154 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1155 return 0;
1156
1157 for (i = 0; i < tp->nr_args; i++)
1158 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name))
1159 return 0;
1160
1161 if (!trace_seq_printf(s, "\", %s", arg))
1162 return 0;
1163
1164 for (i = 0; i < tp->nr_args; i++)
1165 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1166 return 0;
1167
1168 return trace_seq_puts(s, "\n");
1169}
1170
1171#undef SHOW_FIELD
1172#define SHOW_FIELD(type, item, name) \
1173 do { \
1174 ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \
1175 "offset:%u;\tsize:%u;\n", name, \
1176 (unsigned int)offsetof(typeof(field), item),\
1177 (unsigned int)sizeof(type)); \
1178 if (!ret) \
1179 return 0; \
1180 } while (0)
1181
1182static int kprobe_event_show_format(struct ftrace_event_call *call,
1183 struct trace_seq *s)
1184{
1185 struct kprobe_trace_entry field __attribute__((unused));
1186 int ret, i;
1187 struct trace_probe *tp = (struct trace_probe *)call->data;
1188
1189 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
1190 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1191
1192 /* Show fields */
1193 for (i = 0; i < tp->nr_args; i++)
1194 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1195 trace_seq_puts(s, "\n");
1196
1197 return __probe_event_show_format(s, tp, "(%lx)",
1198 "REC->" FIELD_STRING_IP);
1199}
1200
1201static int kretprobe_event_show_format(struct ftrace_event_call *call,
1202 struct trace_seq *s)
1203{
1204 struct kretprobe_trace_entry field __attribute__((unused));
1205 int ret, i;
1206 struct trace_probe *tp = (struct trace_probe *)call->data;
1207
1208 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC);
1209 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP);
1210 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1211
1212 /* Show fields */
1213 for (i = 0; i < tp->nr_args; i++)
1214 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1215 trace_seq_puts(s, "\n");
1216
1217 return __probe_event_show_format(s, tp, "(%lx <- %lx)",
1218 "REC->" FIELD_STRING_FUNC
1219 ", REC->" FIELD_STRING_RETIP);
1220}
1221
1222#ifdef CONFIG_EVENT_PROFILE
1223
1224/* Kprobe profile handler */
1225static __kprobes int kprobe_profile_func(struct kprobe *kp,
1226 struct pt_regs *regs)
1227{
1228 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1229 struct ftrace_event_call *call = &tp->call;
1230 struct kprobe_trace_entry *entry;
1231 struct trace_entry *ent;
1232 int size, __size, i, pc, __cpu;
1233 unsigned long irq_flags;
1234 char *trace_buf;
1235 char *raw_data;
1236 int rctx;
1237
1238 pc = preempt_count();
1239 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1240 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1241 size -= sizeof(u32);
1242 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1243 "profile buffer not large enough"))
1244 return 0;
1245
1246 /*
1247 * Protect the non nmi buffer
1248 * This also protects the rcu read side
1249 */
1250 local_irq_save(irq_flags);
1251
1252 rctx = perf_swevent_get_recursion_context();
1253 if (rctx < 0)
1254 goto end_recursion;
1255
1256 __cpu = smp_processor_id();
1257
1258 if (in_nmi())
1259 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1260 else
1261 trace_buf = rcu_dereference(perf_trace_buf);
1262
1263 if (!trace_buf)
1264 goto end;
1265
1266 raw_data = per_cpu_ptr(trace_buf, __cpu);
1267
1268 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1269 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1270 entry = (struct kprobe_trace_entry *)raw_data;
1271 ent = &entry->ent;
1272
1273 tracing_generic_entry_update(ent, irq_flags, pc);
1274 ent->type = call->id;
1275 entry->nargs = tp->nr_args;
1276 entry->ip = (unsigned long)kp->addr;
1277 for (i = 0; i < tp->nr_args; i++)
1278 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1279 perf_tp_event(call->id, entry->ip, 1, entry, size);
1280
1281end:
1282 perf_swevent_put_recursion_context(rctx);
1283end_recursion:
1284 local_irq_restore(irq_flags);
1285
1286 return 0;
1287}
1288
1289/* Kretprobe profile handler */
1290static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1291 struct pt_regs *regs)
1292{
1293 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1294 struct ftrace_event_call *call = &tp->call;
1295 struct kretprobe_trace_entry *entry;
1296 struct trace_entry *ent;
1297 int size, __size, i, pc, __cpu;
1298 unsigned long irq_flags;
1299 char *trace_buf;
1300 char *raw_data;
1301 int rctx;
1302
1303 pc = preempt_count();
1304 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1305 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1306 size -= sizeof(u32);
1307 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1308 "profile buffer not large enough"))
1309 return 0;
1310
1311 /*
1312 * Protect the non nmi buffer
1313 * This also protects the rcu read side
1314 */
1315 local_irq_save(irq_flags);
1316
1317 rctx = perf_swevent_get_recursion_context();
1318 if (rctx < 0)
1319 goto end_recursion;
1320
1321 __cpu = smp_processor_id();
1322
1323 if (in_nmi())
1324 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1325 else
1326 trace_buf = rcu_dereference(perf_trace_buf);
1327
1328 if (!trace_buf)
1329 goto end;
1330
1331 raw_data = per_cpu_ptr(trace_buf, __cpu);
1332
1333 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1334 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1335 entry = (struct kretprobe_trace_entry *)raw_data;
1336 ent = &entry->ent;
1337
1338 tracing_generic_entry_update(ent, irq_flags, pc);
1339 ent->type = call->id;
1340 entry->nargs = tp->nr_args;
1341 entry->func = (unsigned long)tp->rp.kp.addr;
1342 entry->ret_ip = (unsigned long)ri->ret_addr;
1343 for (i = 0; i < tp->nr_args; i++)
1344 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1345 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1346
1347end:
1348 perf_swevent_put_recursion_context(rctx);
1349end_recursion:
1350 local_irq_restore(irq_flags);
1351
1352 return 0;
1353}
1354
1355static int probe_profile_enable(struct ftrace_event_call *call)
1356{
1357 struct trace_probe *tp = (struct trace_probe *)call->data;
1358
1359 tp->flags |= TP_FLAG_PROFILE;
1360
1361 if (probe_is_return(tp))
1362 return enable_kretprobe(&tp->rp);
1363 else
1364 return enable_kprobe(&tp->rp.kp);
1365}
1366
1367static void probe_profile_disable(struct ftrace_event_call *call)
1368{
1369 struct trace_probe *tp = (struct trace_probe *)call->data;
1370
1371 tp->flags &= ~TP_FLAG_PROFILE;
1372
1373 if (!(tp->flags & TP_FLAG_TRACE)) {
1374 if (probe_is_return(tp))
1375 disable_kretprobe(&tp->rp);
1376 else
1377 disable_kprobe(&tp->rp.kp);
1378 }
1379}
1380#endif /* CONFIG_EVENT_PROFILE */
1381
1382
1383static __kprobes
1384int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1385{
1386 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1387
1388 if (tp->flags & TP_FLAG_TRACE)
1389 kprobe_trace_func(kp, regs);
1390#ifdef CONFIG_EVENT_PROFILE
1391 if (tp->flags & TP_FLAG_PROFILE)
1392 kprobe_profile_func(kp, regs);
1393#endif /* CONFIG_EVENT_PROFILE */
1394 return 0; /* We don't tweek kernel, so just return 0 */
1395}
1396
1397static __kprobes
1398int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1399{
1400 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1401
1402 if (tp->flags & TP_FLAG_TRACE)
1403 kretprobe_trace_func(ri, regs);
1404#ifdef CONFIG_EVENT_PROFILE
1405 if (tp->flags & TP_FLAG_PROFILE)
1406 kretprobe_profile_func(ri, regs);
1407#endif /* CONFIG_EVENT_PROFILE */
1408 return 0; /* We don't tweek kernel, so just return 0 */
1409}
1410
1411static int register_probe_event(struct trace_probe *tp)
1412{
1413 struct ftrace_event_call *call = &tp->call;
1414 int ret;
1415
1416 /* Initialize ftrace_event_call */
1417 if (probe_is_return(tp)) {
1418 tp->event.trace = print_kretprobe_event;
1419 call->raw_init = probe_event_raw_init;
1420 call->show_format = kretprobe_event_show_format;
1421 call->define_fields = kretprobe_event_define_fields;
1422 } else {
1423 tp->event.trace = print_kprobe_event;
1424 call->raw_init = probe_event_raw_init;
1425 call->show_format = kprobe_event_show_format;
1426 call->define_fields = kprobe_event_define_fields;
1427 }
1428 call->event = &tp->event;
1429 call->id = register_ftrace_event(&tp->event);
1430 if (!call->id)
1431 return -ENODEV;
1432 call->enabled = 0;
1433 call->regfunc = probe_event_enable;
1434 call->unregfunc = probe_event_disable;
1435
1436#ifdef CONFIG_EVENT_PROFILE
1437 atomic_set(&call->profile_count, -1);
1438 call->profile_enable = probe_profile_enable;
1439 call->profile_disable = probe_profile_disable;
1440#endif
1441 call->data = tp;
1442 ret = trace_add_event_call(call);
1443 if (ret) {
1444 pr_info("Failed to register kprobe event: %s\n", call->name);
1445 unregister_ftrace_event(&tp->event);
1446 }
1447 return ret;
1448}
1449
1450static void unregister_probe_event(struct trace_probe *tp)
1451{
1452 /* tp->event is unregistered in trace_remove_event_call() */
1453 trace_remove_event_call(&tp->call);
1454}
1455
1456/* Make a debugfs interface for controling probe points */
1457static __init int init_kprobe_trace(void)
1458{
1459 struct dentry *d_tracer;
1460 struct dentry *entry;
1461
1462 d_tracer = tracing_init_dentry();
1463 if (!d_tracer)
1464 return 0;
1465
1466 entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
1467 NULL, &kprobe_events_ops);
1468
1469 /* Event list interface */
1470 if (!entry)
1471 pr_warning("Could not create debugfs "
1472 "'kprobe_events' entry\n");
1473
1474 /* Profile interface */
1475 entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
1476 NULL, &kprobe_profile_ops);
1477
1478 if (!entry)
1479 pr_warning("Could not create debugfs "
1480 "'kprobe_profile' entry\n");
1481 return 0;
1482}
1483fs_initcall(init_kprobe_trace);
1484
1485
1486#ifdef CONFIG_FTRACE_STARTUP_TEST
1487
1488static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1489 int a4, int a5, int a6)
1490{
1491 return a1 + a2 + a3 + a4 + a5 + a6;
1492}
1493
1494static __init int kprobe_trace_self_tests_init(void)
1495{
1496 int ret;
1497 int (*target)(int, int, int, int, int, int);
1498
1499 target = kprobe_trace_selftest_target;
1500
1501 pr_info("Testing kprobe tracing: ");
1502
1503 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1504 "$arg1 $arg2 $arg3 $arg4 $stack $stack0");
1505 if (WARN_ON_ONCE(ret))
1506 pr_warning("error enabling function entry\n");
1507
1508 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1509 "$retval");
1510 if (WARN_ON_ONCE(ret))
1511 pr_warning("error enabling function return\n");
1512
1513 ret = target(1, 2, 3, 4, 5, 6);
1514
1515 cleanup_all_probes();
1516
1517 pr_cont("OK\n");
1518 return 0;
1519}
1520
1521late_initcall(kprobe_trace_self_tests_init);
1522
1523#endif
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
new file mode 100644
index 000000000000..ddfa0fd43bc0
--- /dev/null
+++ b/kernel/trace/trace_ksym.c
@@ -0,0 +1,550 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/fs.h>
27
28#include "trace_output.h"
29#include "trace_stat.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35/*
36 * For now, let us restrict the no. of symbols traced simultaneously to number
37 * of available hardware breakpoint registers.
38 */
39#define KSYM_TRACER_MAX HBP_NUM
40
41#define KSYM_TRACER_OP_LEN 3 /* rw- */
42
43struct trace_ksym {
44 struct perf_event **ksym_hbp;
45 struct perf_event_attr attr;
46#ifdef CONFIG_PROFILE_KSYM_TRACER
47 unsigned long counter;
48#endif
49 struct hlist_node ksym_hlist;
50};
51
52static struct trace_array *ksym_trace_array;
53
54static unsigned int ksym_filter_entry_count;
55static unsigned int ksym_tracing_enabled;
56
57static HLIST_HEAD(ksym_filter_head);
58
59static DEFINE_MUTEX(ksym_tracer_mutex);
60
61#ifdef CONFIG_PROFILE_KSYM_TRACER
62
63#define MAX_UL_INT 0xffffffff
64
65void ksym_collect_stats(unsigned long hbp_hit_addr)
66{
67 struct hlist_node *node;
68 struct trace_ksym *entry;
69
70 rcu_read_lock();
71 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
72 if ((entry->attr.bp_addr == hbp_hit_addr) &&
73 (entry->counter <= MAX_UL_INT)) {
74 entry->counter++;
75 break;
76 }
77 }
78 rcu_read_unlock();
79}
80#endif /* CONFIG_PROFILE_KSYM_TRACER */
81
82void ksym_hbp_handler(struct perf_event *hbp, void *data)
83{
84 struct ring_buffer_event *event;
85 struct ksym_trace_entry *entry;
86 struct pt_regs *regs = data;
87 struct ring_buffer *buffer;
88 int pc;
89
90 if (!ksym_tracing_enabled)
91 return;
92
93 buffer = ksym_trace_array->buffer;
94
95 pc = preempt_count();
96
97 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
98 sizeof(*entry), 0, pc);
99 if (!event)
100 return;
101
102 entry = ring_buffer_event_data(event);
103 entry->ip = instruction_pointer(regs);
104 entry->type = hw_breakpoint_type(hbp);
105 entry->addr = hw_breakpoint_addr(hbp);
106 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
107
108#ifdef CONFIG_PROFILE_KSYM_TRACER
109 ksym_collect_stats(hw_breakpoint_addr(hbp));
110#endif /* CONFIG_PROFILE_KSYM_TRACER */
111
112 trace_buffer_unlock_commit(buffer, event, 0, pc);
113}
114
115/* Valid access types are represented as
116 *
117 * rw- : Set Read/Write Access Breakpoint
118 * -w- : Set Write Access Breakpoint
119 * --- : Clear Breakpoints
120 * --x : Set Execution Break points (Not available yet)
121 *
122 */
123static int ksym_trace_get_access_type(char *str)
124{
125 int access = 0;
126
127 if (str[0] == 'r')
128 access |= HW_BREAKPOINT_R;
129
130 if (str[1] == 'w')
131 access |= HW_BREAKPOINT_W;
132
133 if (str[2] == 'x')
134 access |= HW_BREAKPOINT_X;
135
136 switch (access) {
137 case HW_BREAKPOINT_R:
138 case HW_BREAKPOINT_W:
139 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
140 return access;
141 default:
142 return -EINVAL;
143 }
144}
145
146/*
147 * There can be several possible malformed requests and we attempt to capture
148 * all of them. We enumerate some of the rules
149 * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
150 * i.e. multiple ':' symbols disallowed. Possible uses are of the form
151 * <module>:<ksym_name>:<op>.
152 * 2. No delimiter symbol ':' in the input string
153 * 3. Spurious operator symbols or symbols not in their respective positions
154 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
155 * 5. Kernel symbol not a part of /proc/kallsyms
156 * 6. Duplicate requests
157 */
158static int parse_ksym_trace_str(char *input_string, char **ksymname,
159 unsigned long *addr)
160{
161 int ret;
162
163 *ksymname = strsep(&input_string, ":");
164 *addr = kallsyms_lookup_name(*ksymname);
165
166 /* Check for malformed request: (2), (1) and (5) */
167 if ((!input_string) ||
168 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
169 (*addr == 0))
170 return -EINVAL;;
171
172 ret = ksym_trace_get_access_type(input_string);
173
174 return ret;
175}
176
177int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
178{
179 struct trace_ksym *entry;
180 int ret = -ENOMEM;
181
182 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
183 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
184 " new requests for tracing can be accepted now.\n",
185 KSYM_TRACER_MAX);
186 return -ENOSPC;
187 }
188
189 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
190 if (!entry)
191 return -ENOMEM;
192
193 hw_breakpoint_init(&entry->attr);
194
195 entry->attr.bp_type = op;
196 entry->attr.bp_addr = addr;
197 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
198
199 ret = -EAGAIN;
200 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
201 ksym_hbp_handler);
202
203 if (IS_ERR(entry->ksym_hbp)) {
204 ret = PTR_ERR(entry->ksym_hbp);
205 printk(KERN_INFO "ksym_tracer request failed. Try again"
206 " later!!\n");
207 goto err;
208 }
209
210 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
211 ksym_filter_entry_count++;
212
213 return 0;
214
215err:
216 kfree(entry);
217
218 return ret;
219}
220
221static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
222 size_t count, loff_t *ppos)
223{
224 struct trace_ksym *entry;
225 struct hlist_node *node;
226 struct trace_seq *s;
227 ssize_t cnt = 0;
228 int ret;
229
230 s = kmalloc(sizeof(*s), GFP_KERNEL);
231 if (!s)
232 return -ENOMEM;
233 trace_seq_init(s);
234
235 mutex_lock(&ksym_tracer_mutex);
236
237 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
238 ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr);
239 if (entry->attr.bp_type == HW_BREAKPOINT_R)
240 ret = trace_seq_puts(s, "r--\n");
241 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
242 ret = trace_seq_puts(s, "-w-\n");
243 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
244 ret = trace_seq_puts(s, "rw-\n");
245 WARN_ON_ONCE(!ret);
246 }
247
248 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
249
250 mutex_unlock(&ksym_tracer_mutex);
251
252 kfree(s);
253
254 return cnt;
255}
256
257static void __ksym_trace_reset(void)
258{
259 struct trace_ksym *entry;
260 struct hlist_node *node, *node1;
261
262 mutex_lock(&ksym_tracer_mutex);
263 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
264 ksym_hlist) {
265 unregister_wide_hw_breakpoint(entry->ksym_hbp);
266 ksym_filter_entry_count--;
267 hlist_del_rcu(&(entry->ksym_hlist));
268 synchronize_rcu();
269 kfree(entry);
270 }
271 mutex_unlock(&ksym_tracer_mutex);
272}
273
274static ssize_t ksym_trace_filter_write(struct file *file,
275 const char __user *buffer,
276 size_t count, loff_t *ppos)
277{
278 struct trace_ksym *entry;
279 struct hlist_node *node;
280 char *input_string, *ksymname = NULL;
281 unsigned long ksym_addr = 0;
282 int ret, op, changed = 0;
283
284 input_string = kzalloc(count + 1, GFP_KERNEL);
285 if (!input_string)
286 return -ENOMEM;
287
288 if (copy_from_user(input_string, buffer, count)) {
289 kfree(input_string);
290 return -EFAULT;
291 }
292 input_string[count] = '\0';
293
294 strstrip(input_string);
295
296 /*
297 * Clear all breakpoints if:
298 * 1: echo > ksym_trace_filter
299 * 2: echo 0 > ksym_trace_filter
300 * 3: echo "*:---" > ksym_trace_filter
301 */
302 if (!input_string[0] || !strcmp(input_string, "0") ||
303 !strcmp(input_string, "*:---")) {
304 __ksym_trace_reset();
305 kfree(input_string);
306 return count;
307 }
308
309 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
310 if (ret < 0) {
311 kfree(input_string);
312 return ret;
313 }
314
315 mutex_lock(&ksym_tracer_mutex);
316
317 ret = -EINVAL;
318 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
319 if (entry->attr.bp_addr == ksym_addr) {
320 /* Check for malformed request: (6) */
321 if (entry->attr.bp_type != op)
322 changed = 1;
323 else
324 goto out;
325 break;
326 }
327 }
328 if (changed) {
329 unregister_wide_hw_breakpoint(entry->ksym_hbp);
330 entry->attr.bp_type = op;
331 ret = 0;
332 if (op > 0) {
333 entry->ksym_hbp =
334 register_wide_hw_breakpoint(&entry->attr,
335 ksym_hbp_handler);
336 if (IS_ERR(entry->ksym_hbp))
337 ret = PTR_ERR(entry->ksym_hbp);
338 else
339 goto out;
340 }
341 /* Error or "symbol:---" case: drop it */
342 ksym_filter_entry_count--;
343 hlist_del_rcu(&(entry->ksym_hlist));
344 synchronize_rcu();
345 kfree(entry);
346 goto out;
347 } else {
348 /* Check for malformed request: (4) */
349 if (op == 0)
350 goto out;
351 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
352 }
353out:
354 mutex_unlock(&ksym_tracer_mutex);
355
356 kfree(input_string);
357
358 if (!ret)
359 ret = count;
360 return ret;
361}
362
363static const struct file_operations ksym_tracing_fops = {
364 .open = tracing_open_generic,
365 .read = ksym_trace_filter_read,
366 .write = ksym_trace_filter_write,
367};
368
369static void ksym_trace_reset(struct trace_array *tr)
370{
371 ksym_tracing_enabled = 0;
372 __ksym_trace_reset();
373}
374
375static int ksym_trace_init(struct trace_array *tr)
376{
377 int cpu, ret = 0;
378
379 for_each_online_cpu(cpu)
380 tracing_reset(tr, cpu);
381 ksym_tracing_enabled = 1;
382 ksym_trace_array = tr;
383
384 return ret;
385}
386
387static void ksym_trace_print_header(struct seq_file *m)
388{
389 seq_puts(m,
390 "# TASK-PID CPU# Symbol "
391 "Type Function\n");
392 seq_puts(m,
393 "# | | | "
394 " | |\n");
395}
396
397static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
398{
399 struct trace_entry *entry = iter->ent;
400 struct trace_seq *s = &iter->seq;
401 struct ksym_trace_entry *field;
402 char str[KSYM_SYMBOL_LEN];
403 int ret;
404
405 if (entry->type != TRACE_KSYM)
406 return TRACE_TYPE_UNHANDLED;
407
408 trace_assign_type(field, entry);
409
410 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
411 entry->pid, iter->cpu, (char *)field->addr);
412 if (!ret)
413 return TRACE_TYPE_PARTIAL_LINE;
414
415 switch (field->type) {
416 case HW_BREAKPOINT_R:
417 ret = trace_seq_printf(s, " R ");
418 break;
419 case HW_BREAKPOINT_W:
420 ret = trace_seq_printf(s, " W ");
421 break;
422 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
423 ret = trace_seq_printf(s, " RW ");
424 break;
425 default:
426 return TRACE_TYPE_PARTIAL_LINE;
427 }
428
429 if (!ret)
430 return TRACE_TYPE_PARTIAL_LINE;
431
432 sprint_symbol(str, field->ip);
433 ret = trace_seq_printf(s, "%s\n", str);
434 if (!ret)
435 return TRACE_TYPE_PARTIAL_LINE;
436
437 return TRACE_TYPE_HANDLED;
438}
439
440struct tracer ksym_tracer __read_mostly =
441{
442 .name = "ksym_tracer",
443 .init = ksym_trace_init,
444 .reset = ksym_trace_reset,
445#ifdef CONFIG_FTRACE_SELFTEST
446 .selftest = trace_selftest_startup_ksym,
447#endif
448 .print_header = ksym_trace_print_header,
449 .print_line = ksym_trace_output
450};
451
452__init static int init_ksym_trace(void)
453{
454 struct dentry *d_tracer;
455 struct dentry *entry;
456
457 d_tracer = tracing_init_dentry();
458 ksym_filter_entry_count = 0;
459
460 entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
461 NULL, &ksym_tracing_fops);
462 if (!entry)
463 pr_warning("Could not create debugfs "
464 "'ksym_trace_filter' file\n");
465
466 return register_tracer(&ksym_tracer);
467}
468device_initcall(init_ksym_trace);
469
470
471#ifdef CONFIG_PROFILE_KSYM_TRACER
472static int ksym_tracer_stat_headers(struct seq_file *m)
473{
474 seq_puts(m, " Access Type ");
475 seq_puts(m, " Symbol Counter\n");
476 seq_puts(m, " ----------- ");
477 seq_puts(m, " ------ -------\n");
478 return 0;
479}
480
481static int ksym_tracer_stat_show(struct seq_file *m, void *v)
482{
483 struct hlist_node *stat = v;
484 struct trace_ksym *entry;
485 int access_type = 0;
486 char fn_name[KSYM_NAME_LEN];
487
488 entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
489
490 access_type = entry->attr.bp_type;
491
492 switch (access_type) {
493 case HW_BREAKPOINT_R:
494 seq_puts(m, " R ");
495 break;
496 case HW_BREAKPOINT_W:
497 seq_puts(m, " W ");
498 break;
499 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
500 seq_puts(m, " RW ");
501 break;
502 default:
503 seq_puts(m, " NA ");
504 }
505
506 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
507 seq_printf(m, " %-36s", fn_name);
508 else
509 seq_printf(m, " %-36s", "<NA>");
510 seq_printf(m, " %15lu\n", entry->counter);
511
512 return 0;
513}
514
515static void *ksym_tracer_stat_start(struct tracer_stat *trace)
516{
517 return ksym_filter_head.first;
518}
519
520static void *
521ksym_tracer_stat_next(void *v, int idx)
522{
523 struct hlist_node *stat = v;
524
525 return stat->next;
526}
527
528static struct tracer_stat ksym_tracer_stats = {
529 .name = "ksym_tracer",
530 .stat_start = ksym_tracer_stat_start,
531 .stat_next = ksym_tracer_stat_next,
532 .stat_headers = ksym_tracer_stat_headers,
533 .stat_show = ksym_tracer_stat_show
534};
535
536__init static int ksym_tracer_stat_init(void)
537{
538 int ret;
539
540 ret = register_stat_tracer(&ksym_tracer_stats);
541 if (ret) {
542 printk(KERN_WARNING "Warning: could not register "
543 "ksym tracer stats\n");
544 return 1;
545 }
546
547 return 0;
548}
549fs_initcall(ksym_tracer_stat_init);
550#endif /* CONFIG_PROFILE_KSYM_TRACER */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index f572f44c6e1e..b6c12c6a1bcd 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -69,6 +69,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
69 * @s: trace sequence descriptor 69 * @s: trace sequence descriptor
70 * @fmt: printf format string 70 * @fmt: printf format string
71 * 71 *
72 * It returns 0 if the trace oversizes the buffer's free
73 * space, 1 otherwise.
74 *
72 * The tracer may use either sequence operations or its own 75 * The tracer may use either sequence operations or its own
73 * copy to user routines. To simplify formating of a trace 76 * copy to user routines. To simplify formating of a trace
74 * trace_seq_printf is used to store strings into a special 77 * trace_seq_printf is used to store strings into a special
@@ -95,7 +98,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
95 98
96 s->len += ret; 99 s->len += ret;
97 100
98 return len; 101 return 1;
99} 102}
100EXPORT_SYMBOL_GPL(trace_seq_printf); 103EXPORT_SYMBOL_GPL(trace_seq_printf);
101 104
@@ -486,16 +489,18 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
486 hardirq ? 'h' : softirq ? 's' : '.')) 489 hardirq ? 'h' : softirq ? 's' : '.'))
487 return 0; 490 return 0;
488 491
489 if (entry->lock_depth < 0) 492 if (entry->preempt_count)
490 ret = trace_seq_putc(s, '.'); 493 ret = trace_seq_printf(s, "%x", entry->preempt_count);
491 else 494 else
492 ret = trace_seq_printf(s, "%d", entry->lock_depth); 495 ret = trace_seq_putc(s, '.');
496
493 if (!ret) 497 if (!ret)
494 return 0; 498 return 0;
495 499
496 if (entry->preempt_count) 500 if (entry->lock_depth < 0)
497 return trace_seq_printf(s, "%x", entry->preempt_count); 501 return trace_seq_putc(s, '.');
498 return trace_seq_putc(s, '.'); 502
503 return trace_seq_printf(s, "%d", entry->lock_depth);
499} 504}
500 505
501static int 506static int
@@ -883,7 +888,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
883 trace_assign_type(field, iter->ent); 888 trace_assign_type(field, iter->ent);
884 889
885 if (!S) 890 if (!S)
886 task_state_char(field->prev_state); 891 S = task_state_char(field->prev_state);
887 T = task_state_char(field->next_state); 892 T = task_state_char(field->next_state);
888 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", 893 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
889 field->prev_pid, 894 field->prev_pid,
@@ -918,7 +923,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
918 trace_assign_type(field, iter->ent); 923 trace_assign_type(field, iter->ent);
919 924
920 if (!S) 925 if (!S)
921 task_state_char(field->prev_state); 926 S = task_state_char(field->prev_state);
922 T = task_state_char(field->next_state); 927 T = task_state_char(field->next_state);
923 928
924 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); 929 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d2cdbabb4ead..dc98309e839a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
17 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES: 19 case TRACE_HW_BRANCHES:
20 case TRACE_KSYM:
20 return 1; 21 return 1;
21 } 22 }
22 return 0; 23 return 0;
@@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
808 return ret; 809 return ret;
809} 810}
810#endif /* CONFIG_HW_BRANCH_TRACER */ 811#endif /* CONFIG_HW_BRANCH_TRACER */
812
813#ifdef CONFIG_KSYM_TRACER
814static int ksym_selftest_dummy;
815
816int
817trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
818{
819 unsigned long count;
820 int ret;
821
822 /* start the tracing */
823 ret = tracer_init(trace, tr);
824 if (ret) {
825 warn_failed_init_tracer(trace, ret);
826 return ret;
827 }
828
829 ksym_selftest_dummy = 0;
830 /* Register the read-write tracing request */
831
832 ret = process_new_ksym_entry("ksym_selftest_dummy",
833 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
834 (unsigned long)(&ksym_selftest_dummy));
835
836 if (ret < 0) {
837 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
838 goto ret_path;
839 }
840 /* Perform a read and a write operation over the dummy variable to
841 * trigger the tracer
842 */
843 if (ksym_selftest_dummy == 0)
844 ksym_selftest_dummy++;
845
846 /* stop the tracing. */
847 tracing_stop();
848 /* check the trace buffer */
849 ret = trace_test_buffer(tr, &count);
850 trace->reset(tr);
851 tracing_start();
852
853 /* read & write operations - one each is performed on the dummy variable
854 * triggering two entries in the trace buffer
855 */
856 if (!ret && count != 2) {
857 printk(KERN_CONT "Ksym tracer startup test failed");
858 ret = -1;
859 }
860
861ret_path:
862 return ret;
863}
864#endif /* CONFIG_KSYM_TRACER */
865
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0f6facb050a1..8504ac71e4e8 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = {
296 296
297int 297int
298stack_trace_sysctl(struct ctl_table *table, int write, 298stack_trace_sysctl(struct ctl_table *table, int write,
299 struct file *file, void __user *buffer, size_t *lenp, 299 void __user *buffer, size_t *lenp,
300 loff_t *ppos) 300 loff_t *ppos)
301{ 301{
302 int ret; 302 int ret;
303 303
304 mutex_lock(&stack_sysctl_mutex); 304 mutex_lock(&stack_sysctl_mutex);
305 305
306 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 306 ret = proc_dointvec(table, write, buffer, lenp, ppos);
307 307
308 if (ret || !write || 308 if (ret || !write ||
309 (last_stack_tracer_enabled == !!stack_tracer_enabled)) 309 (last_stack_tracer_enabled == !!stack_tracer_enabled))
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 9fbce6c9d2e1..57501d90096a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -14,6 +14,43 @@ static int sys_refcount_exit;
14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
16 16
17extern unsigned long __start_syscalls_metadata[];
18extern unsigned long __stop_syscalls_metadata[];
19
20static struct syscall_metadata **syscalls_metadata;
21
22static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
23{
24 struct syscall_metadata *start;
25 struct syscall_metadata *stop;
26 char str[KSYM_SYMBOL_LEN];
27
28
29 start = (struct syscall_metadata *)__start_syscalls_metadata;
30 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
31 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
32
33 for ( ; start < stop; start++) {
34 /*
35 * Only compare after the "sys" prefix. Archs that use
36 * syscall wrappers may have syscalls symbols aliases prefixed
37 * with "SyS" instead of "sys", leading to an unwanted
38 * mismatch.
39 */
40 if (start->name && !strcmp(start->name + 3, str + 3))
41 return start;
42 }
43 return NULL;
44}
45
46static struct syscall_metadata *syscall_nr_to_meta(int nr)
47{
48 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
49 return NULL;
50
51 return syscalls_metadata[nr];
52}
53
17enum print_line_t 54enum print_line_t
18print_syscall_enter(struct trace_iterator *iter, int flags) 55print_syscall_enter(struct trace_iterator *iter, int flags)
19{ 56{
@@ -30,7 +67,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
30 if (!entry) 67 if (!entry)
31 goto end; 68 goto end;
32 69
33 if (entry->enter_id != ent->type) { 70 if (entry->enter_event->id != ent->type) {
34 WARN_ON_ONCE(1); 71 WARN_ON_ONCE(1);
35 goto end; 72 goto end;
36 } 73 }
@@ -85,7 +122,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
85 return TRACE_TYPE_HANDLED; 122 return TRACE_TYPE_HANDLED;
86 } 123 }
87 124
88 if (entry->exit_id != ent->type) { 125 if (entry->exit_event->id != ent->type) {
89 WARN_ON_ONCE(1); 126 WARN_ON_ONCE(1);
90 return TRACE_TYPE_UNHANDLED; 127 return TRACE_TYPE_UNHANDLED;
91 } 128 }
@@ -103,24 +140,19 @@ extern char *__bad_type_size(void);
103#define SYSCALL_FIELD(type, name) \ 140#define SYSCALL_FIELD(type, name) \
104 sizeof(type) != sizeof(trace.name) ? \ 141 sizeof(type) != sizeof(trace.name) ? \
105 __bad_type_size() : \ 142 __bad_type_size() : \
106 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) 143 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type)
107 145
108int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
109{ 147{
110 int i; 148 int i;
111 int nr;
112 int ret; 149 int ret;
113 struct syscall_metadata *entry; 150 struct syscall_metadata *entry = call->data;
114 struct syscall_trace_enter trace; 151 struct syscall_trace_enter trace;
115 int offset = offsetof(struct syscall_trace_enter, args); 152 int offset = offsetof(struct syscall_trace_enter, args);
116 153
117 nr = syscall_name_to_nr(call->data); 154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
118 entry = syscall_nr_to_meta(nr); 155 "\tsigned:%u;\n",
119
120 if (!entry)
121 return 0;
122
123 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
124 SYSCALL_FIELD(int, nr)); 156 SYSCALL_FIELD(int, nr));
125 if (!ret) 157 if (!ret)
126 return 0; 158 return 0;
@@ -130,8 +162,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
130 entry->args[i]); 162 entry->args[i]);
131 if (!ret) 163 if (!ret)
132 return 0; 164 return 0;
133 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, 165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
134 sizeof(unsigned long)); 166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
135 if (!ret) 169 if (!ret)
136 return 0; 170 return 0;
137 offset += sizeof(unsigned long); 171 offset += sizeof(unsigned long);
@@ -163,10 +197,12 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
163 struct syscall_trace_exit trace; 197 struct syscall_trace_exit trace;
164 198
165 ret = trace_seq_printf(s, 199 ret = trace_seq_printf(s,
166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", 201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
168 SYSCALL_FIELD(int, nr), 204 SYSCALL_FIELD(int, nr),
169 SYSCALL_FIELD(unsigned long, ret)); 205 SYSCALL_FIELD(long, ret));
170 if (!ret) 206 if (!ret)
171 return 0; 207 return 0;
172 208
@@ -176,22 +212,19 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
176int syscall_enter_define_fields(struct ftrace_event_call *call) 212int syscall_enter_define_fields(struct ftrace_event_call *call)
177{ 213{
178 struct syscall_trace_enter trace; 214 struct syscall_trace_enter trace;
179 struct syscall_metadata *meta; 215 struct syscall_metadata *meta = call->data;
180 int ret; 216 int ret;
181 int nr;
182 int i; 217 int i;
183 int offset = offsetof(typeof(trace), args); 218 int offset = offsetof(typeof(trace), args);
184 219
185 nr = syscall_name_to_nr(call->data);
186 meta = syscall_nr_to_meta(nr);
187
188 if (!meta)
189 return 0;
190
191 ret = trace_define_common_fields(call); 220 ret = trace_define_common_fields(call);
192 if (ret) 221 if (ret)
193 return ret; 222 return ret;
194 223
224 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
225 if (ret)
226 return ret;
227
195 for (i = 0; i < meta->nb_args; i++) { 228 for (i = 0; i < meta->nb_args; i++) {
196 ret = trace_define_field(call, meta->types[i], 229 ret = trace_define_field(call, meta->types[i],
197 meta->args[i], offset, 230 meta->args[i], offset,
@@ -212,7 +245,11 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
212 if (ret) 245 if (ret)
213 return ret; 246 return ret;
214 247
215 ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0, 248 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
249 if (ret)
250 return ret;
251
252 ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
216 FILTER_OTHER); 253 FILTER_OTHER);
217 254
218 return ret; 255 return ret;
@@ -239,8 +276,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
239 276
240 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 277 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
241 278
242 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id, 279 event = trace_current_buffer_lock_reserve(&buffer,
243 size, 0, 0); 280 sys_data->enter_event->id, size, 0, 0);
244 if (!event) 281 if (!event)
245 return; 282 return;
246 283
@@ -271,8 +308,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
271 if (!sys_data) 308 if (!sys_data)
272 return; 309 return;
273 310
274 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id, 311 event = trace_current_buffer_lock_reserve(&buffer,
275 sizeof(*entry), 0, 0); 312 sys_data->exit_event->id, sizeof(*entry), 0, 0);
276 if (!event) 313 if (!event)
277 return; 314 return;
278 315
@@ -285,14 +322,12 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
285 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 322 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
286} 323}
287 324
288int reg_event_syscall_enter(void *ptr) 325int reg_event_syscall_enter(struct ftrace_event_call *call)
289{ 326{
290 int ret = 0; 327 int ret = 0;
291 int num; 328 int num;
292 char *name;
293 329
294 name = (char *)ptr; 330 num = ((struct syscall_metadata *)call->data)->syscall_nr;
295 num = syscall_name_to_nr(name);
296 if (num < 0 || num >= NR_syscalls) 331 if (num < 0 || num >= NR_syscalls)
297 return -ENOSYS; 332 return -ENOSYS;
298 mutex_lock(&syscall_trace_lock); 333 mutex_lock(&syscall_trace_lock);
@@ -309,13 +344,11 @@ int reg_event_syscall_enter(void *ptr)
309 return ret; 344 return ret;
310} 345}
311 346
312void unreg_event_syscall_enter(void *ptr) 347void unreg_event_syscall_enter(struct ftrace_event_call *call)
313{ 348{
314 int num; 349 int num;
315 char *name;
316 350
317 name = (char *)ptr; 351 num = ((struct syscall_metadata *)call->data)->syscall_nr;
318 num = syscall_name_to_nr(name);
319 if (num < 0 || num >= NR_syscalls) 352 if (num < 0 || num >= NR_syscalls)
320 return; 353 return;
321 mutex_lock(&syscall_trace_lock); 354 mutex_lock(&syscall_trace_lock);
@@ -326,14 +359,12 @@ void unreg_event_syscall_enter(void *ptr)
326 mutex_unlock(&syscall_trace_lock); 359 mutex_unlock(&syscall_trace_lock);
327} 360}
328 361
329int reg_event_syscall_exit(void *ptr) 362int reg_event_syscall_exit(struct ftrace_event_call *call)
330{ 363{
331 int ret = 0; 364 int ret = 0;
332 int num; 365 int num;
333 char *name;
334 366
335 name = (char *)ptr; 367 num = ((struct syscall_metadata *)call->data)->syscall_nr;
336 num = syscall_name_to_nr(name);
337 if (num < 0 || num >= NR_syscalls) 368 if (num < 0 || num >= NR_syscalls)
338 return -ENOSYS; 369 return -ENOSYS;
339 mutex_lock(&syscall_trace_lock); 370 mutex_lock(&syscall_trace_lock);
@@ -350,13 +381,11 @@ int reg_event_syscall_exit(void *ptr)
350 return ret; 381 return ret;
351} 382}
352 383
353void unreg_event_syscall_exit(void *ptr) 384void unreg_event_syscall_exit(struct ftrace_event_call *call)
354{ 385{
355 int num; 386 int num;
356 char *name;
357 387
358 name = (char *)ptr; 388 num = ((struct syscall_metadata *)call->data)->syscall_nr;
359 num = syscall_name_to_nr(name);
360 if (num < 0 || num >= NR_syscalls) 389 if (num < 0 || num >= NR_syscalls)
361 return; 390 return;
362 mutex_lock(&syscall_trace_lock); 391 mutex_lock(&syscall_trace_lock);
@@ -367,13 +396,44 @@ void unreg_event_syscall_exit(void *ptr)
367 mutex_unlock(&syscall_trace_lock); 396 mutex_unlock(&syscall_trace_lock);
368} 397}
369 398
370struct trace_event event_syscall_enter = { 399int init_syscall_trace(struct ftrace_event_call *call)
371 .trace = print_syscall_enter, 400{
372}; 401 int id;
402
403 id = register_ftrace_event(call->event);
404 if (!id)
405 return -ENODEV;
406 call->id = id;
407 INIT_LIST_HEAD(&call->fields);
408 return 0;
409}
410
411int __init init_ftrace_syscalls(void)
412{
413 struct syscall_metadata *meta;
414 unsigned long addr;
415 int i;
416
417 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
418 NR_syscalls, GFP_KERNEL);
419 if (!syscalls_metadata) {
420 WARN_ON(1);
421 return -ENOMEM;
422 }
423
424 for (i = 0; i < NR_syscalls; i++) {
425 addr = arch_syscall_addr(i);
426 meta = find_syscall_meta(addr);
427 if (!meta)
428 continue;
429
430 meta->syscall_nr = i;
431 syscalls_metadata[i] = meta;
432 }
373 433
374struct trace_event event_syscall_exit = { 434 return 0;
375 .trace = print_syscall_exit, 435}
376}; 436core_initcall(init_ftrace_syscalls);
377 437
378#ifdef CONFIG_EVENT_PROFILE 438#ifdef CONFIG_EVENT_PROFILE
379 439
@@ -387,8 +447,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
387 struct syscall_metadata *sys_data; 447 struct syscall_metadata *sys_data;
388 struct syscall_trace_enter *rec; 448 struct syscall_trace_enter *rec;
389 unsigned long flags; 449 unsigned long flags;
450 char *trace_buf;
390 char *raw_data; 451 char *raw_data;
391 int syscall_nr; 452 int syscall_nr;
453 int rctx;
392 int size; 454 int size;
393 int cpu; 455 int cpu;
394 456
@@ -412,41 +474,42 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
412 /* Protect the per cpu buffer, begin the rcu read side */ 474 /* Protect the per cpu buffer, begin the rcu read side */
413 local_irq_save(flags); 475 local_irq_save(flags);
414 476
477 rctx = perf_swevent_get_recursion_context();
478 if (rctx < 0)
479 goto end_recursion;
480
415 cpu = smp_processor_id(); 481 cpu = smp_processor_id();
416 482
417 if (in_nmi()) 483 trace_buf = rcu_dereference(perf_trace_buf);
418 raw_data = rcu_dereference(trace_profile_buf_nmi);
419 else
420 raw_data = rcu_dereference(trace_profile_buf);
421 484
422 if (!raw_data) 485 if (!trace_buf)
423 goto end; 486 goto end;
424 487
425 raw_data = per_cpu_ptr(raw_data, cpu); 488 raw_data = per_cpu_ptr(trace_buf, cpu);
426 489
427 /* zero the dead bytes from align to not leak stack to user */ 490 /* zero the dead bytes from align to not leak stack to user */
428 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 491 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
429 492
430 rec = (struct syscall_trace_enter *) raw_data; 493 rec = (struct syscall_trace_enter *) raw_data;
431 tracing_generic_entry_update(&rec->ent, 0, 0); 494 tracing_generic_entry_update(&rec->ent, 0, 0);
432 rec->ent.type = sys_data->enter_id; 495 rec->ent.type = sys_data->enter_event->id;
433 rec->nr = syscall_nr; 496 rec->nr = syscall_nr;
434 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 497 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
435 (unsigned long *)&rec->args); 498 (unsigned long *)&rec->args);
436 perf_tp_event(sys_data->enter_id, 0, 1, rec, size); 499 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
437 500
438end: 501end:
502 perf_swevent_put_recursion_context(rctx);
503end_recursion:
439 local_irq_restore(flags); 504 local_irq_restore(flags);
440} 505}
441 506
442int reg_prof_syscall_enter(char *name) 507int prof_sysenter_enable(struct ftrace_event_call *call)
443{ 508{
444 int ret = 0; 509 int ret = 0;
445 int num; 510 int num;
446 511
447 num = syscall_name_to_nr(name); 512 num = ((struct syscall_metadata *)call->data)->syscall_nr;
448 if (num < 0 || num >= NR_syscalls)
449 return -ENOSYS;
450 513
451 mutex_lock(&syscall_trace_lock); 514 mutex_lock(&syscall_trace_lock);
452 if (!sys_prof_refcount_enter) 515 if (!sys_prof_refcount_enter)
@@ -462,13 +525,11 @@ int reg_prof_syscall_enter(char *name)
462 return ret; 525 return ret;
463} 526}
464 527
465void unreg_prof_syscall_enter(char *name) 528void prof_sysenter_disable(struct ftrace_event_call *call)
466{ 529{
467 int num; 530 int num;
468 531
469 num = syscall_name_to_nr(name); 532 num = ((struct syscall_metadata *)call->data)->syscall_nr;
470 if (num < 0 || num >= NR_syscalls)
471 return;
472 533
473 mutex_lock(&syscall_trace_lock); 534 mutex_lock(&syscall_trace_lock);
474 sys_prof_refcount_enter--; 535 sys_prof_refcount_enter--;
@@ -484,7 +545,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
484 struct syscall_trace_exit *rec; 545 struct syscall_trace_exit *rec;
485 unsigned long flags; 546 unsigned long flags;
486 int syscall_nr; 547 int syscall_nr;
548 char *trace_buf;
487 char *raw_data; 549 char *raw_data;
550 int rctx;
488 int size; 551 int size;
489 int cpu; 552 int cpu;
490 553
@@ -510,17 +573,19 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
510 573
511 /* Protect the per cpu buffer, begin the rcu read side */ 574 /* Protect the per cpu buffer, begin the rcu read side */
512 local_irq_save(flags); 575 local_irq_save(flags);
576
577 rctx = perf_swevent_get_recursion_context();
578 if (rctx < 0)
579 goto end_recursion;
580
513 cpu = smp_processor_id(); 581 cpu = smp_processor_id();
514 582
515 if (in_nmi()) 583 trace_buf = rcu_dereference(perf_trace_buf);
516 raw_data = rcu_dereference(trace_profile_buf_nmi);
517 else
518 raw_data = rcu_dereference(trace_profile_buf);
519 584
520 if (!raw_data) 585 if (!trace_buf)
521 goto end; 586 goto end;
522 587
523 raw_data = per_cpu_ptr(raw_data, cpu); 588 raw_data = per_cpu_ptr(trace_buf, cpu);
524 589
525 /* zero the dead bytes from align to not leak stack to user */ 590 /* zero the dead bytes from align to not leak stack to user */
526 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 591 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -528,24 +593,24 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
528 rec = (struct syscall_trace_exit *)raw_data; 593 rec = (struct syscall_trace_exit *)raw_data;
529 594
530 tracing_generic_entry_update(&rec->ent, 0, 0); 595 tracing_generic_entry_update(&rec->ent, 0, 0);
531 rec->ent.type = sys_data->exit_id; 596 rec->ent.type = sys_data->exit_event->id;
532 rec->nr = syscall_nr; 597 rec->nr = syscall_nr;
533 rec->ret = syscall_get_return_value(current, regs); 598 rec->ret = syscall_get_return_value(current, regs);
534 599
535 perf_tp_event(sys_data->exit_id, 0, 1, rec, size); 600 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
536 601
537end: 602end:
603 perf_swevent_put_recursion_context(rctx);
604end_recursion:
538 local_irq_restore(flags); 605 local_irq_restore(flags);
539} 606}
540 607
541int reg_prof_syscall_exit(char *name) 608int prof_sysexit_enable(struct ftrace_event_call *call)
542{ 609{
543 int ret = 0; 610 int ret = 0;
544 int num; 611 int num;
545 612
546 num = syscall_name_to_nr(name); 613 num = ((struct syscall_metadata *)call->data)->syscall_nr;
547 if (num < 0 || num >= NR_syscalls)
548 return -ENOSYS;
549 614
550 mutex_lock(&syscall_trace_lock); 615 mutex_lock(&syscall_trace_lock);
551 if (!sys_prof_refcount_exit) 616 if (!sys_prof_refcount_exit)
@@ -561,13 +626,11 @@ int reg_prof_syscall_exit(char *name)
561 return ret; 626 return ret;
562} 627}
563 628
564void unreg_prof_syscall_exit(char *name) 629void prof_sysexit_disable(struct ftrace_event_call *call)
565{ 630{
566 int num; 631 int num;
567 632
568 num = syscall_name_to_nr(name); 633 num = ((struct syscall_metadata *)call->data)->syscall_nr;
569 if (num < 0 || num >= NR_syscalls)
570 return;
571 634
572 mutex_lock(&syscall_trace_lock); 635 mutex_lock(&syscall_trace_lock);
573 sys_prof_refcount_exit--; 636 sys_prof_refcount_exit--;
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 0314501688b9..419209893d87 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -4,7 +4,6 @@
4 */ 4 */
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/utsname.h>
8#include <linux/mman.h> 7#include <linux/mman.h>
9#include <linux/notifier.h> 8#include <linux/notifier.h>
10#include <linux/reboot.h> 9#include <linux/reboot.h>
diff --git a/kernel/user.c b/kernel/user.c
index 2c000e7132ac..46d0165ca70c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -330,9 +330,9 @@ done:
330 */ 330 */
331static void free_user(struct user_struct *up, unsigned long flags) 331static void free_user(struct user_struct *up, unsigned long flags)
332{ 332{
333 spin_unlock_irqrestore(&uidhash_lock, flags);
334 INIT_DELAYED_WORK(&up->work, cleanup_user_struct); 333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
335 schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); 334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336} 336}
337 337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ 338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 92359cc747a7..69eae358a726 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which)
42 * Special case of dostring for the UTS structure. This has locks 42 * Special case of dostring for the UTS structure. This has locks
43 * to observe. Should this be in kernel/sys.c ???? 43 * to observe. Should this be in kernel/sys.c ????
44 */ 44 */
45static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 45static int proc_do_uts_string(ctl_table *table, int write,
46 void __user *buffer, size_t *lenp, loff_t *ppos) 46 void __user *buffer, size_t *lenp, loff_t *ppos)
47{ 47{
48 struct ctl_table uts_table; 48 struct ctl_table uts_table;
49 int r; 49 int r;
50 memcpy(&uts_table, table, sizeof(uts_table)); 50 memcpy(&uts_table, table, sizeof(uts_table));
51 uts_table.data = get_uts(table, write); 51 uts_table.data = get_uts(table, write);
52 r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); 52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
53 put_uts(table, write, uts_table.data); 53 put_uts(table, write, uts_table.data);
54 return r; 54 return r;
55} 55}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index addfe2df93b1..67e526b6ae81 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -640,6 +640,24 @@ int schedule_delayed_work(struct delayed_work *dwork,
640EXPORT_SYMBOL(schedule_delayed_work); 640EXPORT_SYMBOL(schedule_delayed_work);
641 641
642/** 642/**
643 * flush_delayed_work - block until a dwork_struct's callback has terminated
644 * @dwork: the delayed work which is to be flushed
645 *
646 * Any timeout is cancelled, and any pending work is run immediately.
647 */
648void flush_delayed_work(struct delayed_work *dwork)
649{
650 if (del_timer_sync(&dwork->timer)) {
651 struct cpu_workqueue_struct *cwq;
652 cwq = wq_per_cpu(keventd_wq, get_cpu());
653 __queue_work(cwq, &dwork->work);
654 put_cpu();
655 }
656 flush_work(&dwork->work);
657}
658EXPORT_SYMBOL(flush_delayed_work);
659
660/**
643 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 661 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
644 * @cpu: cpu to use 662 * @cpu: cpu to use
645 * @dwork: job to be done 663 * @dwork: job to be done
@@ -667,6 +685,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
667int schedule_on_each_cpu(work_func_t func) 685int schedule_on_each_cpu(work_func_t func)
668{ 686{
669 int cpu; 687 int cpu;
688 int orig = -1;
670 struct work_struct *works; 689 struct work_struct *works;
671 690
672 works = alloc_percpu(struct work_struct); 691 works = alloc_percpu(struct work_struct);
@@ -674,14 +693,28 @@ int schedule_on_each_cpu(work_func_t func)
674 return -ENOMEM; 693 return -ENOMEM;
675 694
676 get_online_cpus(); 695 get_online_cpus();
696
697 /*
698 * When running in keventd don't schedule a work item on
699 * itself. Can just call directly because the work queue is
700 * already bound. This also is faster.
701 */
702 if (current_is_keventd())
703 orig = raw_smp_processor_id();
704
677 for_each_online_cpu(cpu) { 705 for_each_online_cpu(cpu) {
678 struct work_struct *work = per_cpu_ptr(works, cpu); 706 struct work_struct *work = per_cpu_ptr(works, cpu);
679 707
680 INIT_WORK(work, func); 708 INIT_WORK(work, func);
681 schedule_work_on(cpu, work); 709 if (cpu != orig)
710 schedule_work_on(cpu, work);
682 } 711 }
712 if (orig >= 0)
713 func(per_cpu_ptr(works, orig));
714
683 for_each_online_cpu(cpu) 715 for_each_online_cpu(cpu)
684 flush_work(per_cpu_ptr(works, cpu)); 716 flush_work(per_cpu_ptr(works, cpu));
717
685 put_online_cpus(); 718 put_online_cpus();
686 free_percpu(works); 719 free_percpu(works);
687 return 0; 720 return 0;