path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 202
-rw-r--r--  kernel/Makefile | 11
-rw-r--r--  kernel/audit.c | 18
-rw-r--r--  kernel/audit_watch.c | 2
-rw-r--r--  kernel/auditsc.c | 6
-rw-r--r--  kernel/capability.c | 15
-rw-r--r--  kernel/cgroup.c | 1136
-rw-r--r--  kernel/cgroup_debug.c | 105
-rw-r--r--  kernel/cgroup_freezer.c | 15
-rw-r--r--  kernel/cpu.c | 15
-rw-r--r--  kernel/cpuset.c | 93
-rw-r--r--  kernel/cred.c | 22
-rw-r--r--  kernel/delayacct.c | 1
-rw-r--r--  kernel/dma-coherent.c | 176
-rw-r--r--  kernel/exit.c | 189
-rw-r--r--  kernel/fork.c | 92
-rw-r--r--  kernel/futex.c | 160
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/hrtimer.c | 152
-rw-r--r--  kernel/hung_task.c | 6
-rw-r--r--  kernel/hw_breakpoint.c | 423
-rw-r--r--  kernel/irq/chip.c | 6
-rw-r--r--  kernel/irq/handle.c | 1
-rw-r--r--  kernel/irq/proc.c | 40
-rw-r--r--  kernel/irq/spurious.c | 16
-rw-r--r--  kernel/itimer.c | 169
-rw-r--r--  kernel/kallsyms.c | 4
-rw-r--r--  kernel/kfifo.c | 2
-rw-r--r--  kernel/kgdb.c | 2
-rw-r--r--  kernel/kmod.c | 8
-rw-r--r--  kernel/kprobes.c | 78
-rw-r--r--  kernel/kthread.c | 23
-rw-r--r--  kernel/lockdep.c | 25
-rw-r--r--  kernel/lockdep_proc.c | 2
-rw-r--r--  kernel/marker.c | 930
-rw-r--r--  kernel/module.c | 195
-rw-r--r--  kernel/mutex-debug.c | 1
-rw-r--r--  kernel/mutex.c | 4
-rw-r--r--  kernel/notifier.c | 2
-rw-r--r--  kernel/ns_cgroup.c | 16
-rw-r--r--  kernel/panic.c | 5
-rw-r--r--  kernel/params.c | 24
-rw-r--r--  kernel/perf_counter.c | 4962
-rw-r--r--  kernel/perf_event.c | 5359
-rw-r--r--  kernel/pid.c | 15
-rw-r--r--  kernel/pid_namespace.c | 2
-rw-r--r--  kernel/posix-cpu-timers.c | 155
-rw-r--r--  kernel/posix-timers.c | 35
-rw-r--r--  kernel/power/Kconfig | 14
-rw-r--r--  kernel/power/console.c | 63
-rw-r--r--  kernel/power/hibernate.c | 32
-rw-r--r--  kernel/power/main.c | 17
-rw-r--r--  kernel/power/power.h | 2
-rw-r--r--  kernel/power/process.c | 1
-rw-r--r--  kernel/power/snapshot.c | 414
-rw-r--r--  kernel/power/suspend_test.c | 5
-rw-r--r--  kernel/power/swap.c | 44
-rw-r--r--  kernel/printk.c | 40
-rw-r--r--  kernel/profile.c | 45
-rw-r--r--  kernel/ptrace.c | 11
-rw-r--r--  kernel/rcupdate.c | 222
-rw-r--r--  kernel/rcutiny.c | 282
-rw-r--r--  kernel/rcutorture.c | 112
-rw-r--r--  kernel/rcutree.c | 870
-rw-r--r--  kernel/rcutree.h | 162
-rw-r--r--  kernel/rcutree_plugin.h | 506
-rw-r--r--  kernel/rcutree_trace.c | 28
-rw-r--r--  kernel/relay.c | 2
-rw-r--r--  kernel/res_counter.c | 3
-rw-r--r--  kernel/resource.c | 23
-rw-r--r--  kernel/sched.c | 907
-rw-r--r--  kernel/sched_clock.c | 122
-rw-r--r--  kernel/sched_debug.c | 5
-rw-r--r--  kernel/sched_fair.c | 579
-rw-r--r--  kernel/sched_features.h | 122
-rw-r--r--  kernel/sched_idletask.c | 11
-rw-r--r--  kernel/sched_rt.c | 81
-rw-r--r--  kernel/signal.c | 241
-rw-r--r--  kernel/slow-work-debugfs.c | 227
-rw-r--r--  kernel/slow-work.c | 524
-rw-r--r--  kernel/slow-work.h | 72
-rw-r--r--  kernel/smp.c | 132
-rw-r--r--  kernel/softirq.c | 4
-rw-r--r--  kernel/softlockup.c | 4
-rw-r--r--  kernel/spinlock.c | 310
-rw-r--r--  kernel/srcu.c | 74
-rw-r--r--  kernel/sys.c | 82
-rw-r--r--  kernel/sys_ni.c | 3
-rw-r--r--  kernel/sysctl.c | 166
-rw-r--r--  kernel/sysctl_check.c | 2
-rw-r--r--  kernel/taskstats.c | 10
-rw-r--r--  kernel/time.c | 39
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/clocksource.c | 529
-rw-r--r--  kernel/time/jiffies.c | 6
-rw-r--r--  kernel/time/ntp.c | 7
-rw-r--r--  kernel/time/tick-sched.c | 9
-rw-r--r--  kernel/time/timeconv.c | 127
-rw-r--r--  kernel/time/timekeeping.c | 536
-rw-r--r--  kernel/time/timer_list.c | 2
-rw-r--r--  kernel/time/timer_stats.c | 2
-rw-r--r--  kernel/timer.c | 64
-rw-r--r--  kernel/trace/Kconfig | 68
-rw-r--r--  kernel/trace/Makefile | 4
-rw-r--r--  kernel/trace/blktrace.c | 39
-rw-r--r--  kernel/trace/ftrace.c | 610
-rw-r--r--  kernel/trace/kmemtrace.c | 2
-rw-r--r--  kernel/trace/power-traces.c | 20
-rw-r--r--  kernel/trace/ring_buffer.c | 57
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 85
-rw-r--r--  kernel/trace/trace.c | 250
-rw-r--r--  kernel/trace/trace.h | 353
-rw-r--r--  kernel/trace/trace_boot.c | 8
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_clock.c | 32
-rw-r--r--  kernel/trace/trace_entries.h | 382
-rw-r--r--  kernel/trace/trace_event_profile.c | 89
-rw-r--r--  kernel/trace/trace_event_types.h | 178
-rw-r--r--  kernel/trace/trace_events.c | 328
-rw-r--r--  kernel/trace/trace_events_filter.c | 467
-rw-r--r--  kernel/trace/trace_export.c | 287
-rw-r--r--  kernel/trace/trace_functions.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 66
-rw-r--r--  kernel/trace/trace_hw_branches.c | 10
-rw-r--r--  kernel/trace/trace_irqsoff.c | 16
-rw-r--r--  kernel/trace/trace_kprobe.c | 1523
-rw-r--r--  kernel/trace/trace_ksym.c | 550
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 10
-rw-r--r--  kernel/trace/trace_output.c | 55
-rw-r--r--  kernel/trace/trace_output.h | 2
-rw-r--r--  kernel/trace/trace_power.c | 218
-rw-r--r--  kernel/trace/trace_printk.c | 1
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 52
-rw-r--r--  kernel/trace/trace_selftest.c | 55
-rw-r--r--  kernel/trace/trace_stack.c | 4
-rw-r--r--  kernel/trace/trace_syscalls.c | 298
-rw-r--r--  kernel/tracepoint.c | 2
-rw-r--r--  kernel/uid16.c | 1
-rw-r--r--  kernel/user.c | 2
-rw-r--r--  kernel/utsname_sysctl.c | 4
-rw-r--r--  kernel/workqueue.c | 35
141 files changed, 17716 insertions, 11278 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
new file mode 100644
index 000000000000..88c92fb44618
--- /dev/null
+++ b/kernel/Kconfig.locks
@@ -0,0 +1,202 @@
+#
+# The ARCH_INLINE foo is necessary because select ignores "depends on"
+#
+config ARCH_INLINE_SPIN_TRYLOCK
+	bool
+
+config ARCH_INLINE_SPIN_TRYLOCK_BH
+	bool
+
+config ARCH_INLINE_SPIN_LOCK
+	bool
+
+config ARCH_INLINE_SPIN_LOCK_BH
+	bool
+
+config ARCH_INLINE_SPIN_LOCK_IRQ
+	bool
+
+config ARCH_INLINE_SPIN_LOCK_IRQSAVE
+	bool
+
+config ARCH_INLINE_SPIN_UNLOCK
+	bool
+
+config ARCH_INLINE_SPIN_UNLOCK_BH
+	bool
+
+config ARCH_INLINE_SPIN_UNLOCK_IRQ
+	bool
+
+config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+	bool
+
+
+config ARCH_INLINE_READ_TRYLOCK
+	bool
+
+config ARCH_INLINE_READ_LOCK
+	bool
+
+config ARCH_INLINE_READ_LOCK_BH
+	bool
+
+config ARCH_INLINE_READ_LOCK_IRQ
+	bool
+
+config ARCH_INLINE_READ_LOCK_IRQSAVE
+	bool
+
+config ARCH_INLINE_READ_UNLOCK
+	bool
+
+config ARCH_INLINE_READ_UNLOCK_BH
+	bool
+
+config ARCH_INLINE_READ_UNLOCK_IRQ
+	bool
+
+config ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+	bool
+
+
+config ARCH_INLINE_WRITE_TRYLOCK
+	bool
+
+config ARCH_INLINE_WRITE_LOCK
+	bool
+
+config ARCH_INLINE_WRITE_LOCK_BH
+	bool
+
+config ARCH_INLINE_WRITE_LOCK_IRQ
+	bool
+
+config ARCH_INLINE_WRITE_LOCK_IRQSAVE
+	bool
+
+config ARCH_INLINE_WRITE_UNLOCK
+	bool
+
+config ARCH_INLINE_WRITE_UNLOCK_BH
+	bool
+
+config ARCH_INLINE_WRITE_UNLOCK_IRQ
+	bool
+
+config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+	bool
+
+#
+# lock_* functions are inlined when:
+#  - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
+#
+# trylock_* functions are inlined when:
+#  - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#
+# unlock and unlock_irq functions are inlined when:
+#  - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#  or
+#  - DEBUG_SPINLOCK=n and PREEMPT=n
+#
+# unlock_bh and unlock_irqrestore functions are inlined when:
+#  - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#
+
+config INLINE_SPIN_TRYLOCK
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
+
+config INLINE_SPIN_TRYLOCK_BH
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
+
+config INLINE_SPIN_LOCK
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
+
+config INLINE_SPIN_LOCK_BH
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_SPIN_LOCK_BH
+
+config INLINE_SPIN_LOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_SPIN_LOCK_IRQ
+
+config INLINE_SPIN_LOCK_IRQSAVE
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_SPIN_LOCK_IRQSAVE
+
+config INLINE_SPIN_UNLOCK
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
+
+config INLINE_SPIN_UNLOCK_BH
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
+
+config INLINE_SPIN_UNLOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH)
+
+config INLINE_SPIN_UNLOCK_IRQRESTORE
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+
+
+config INLINE_READ_TRYLOCK
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
+
+config INLINE_READ_LOCK
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
+
+config INLINE_READ_LOCK_BH
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_READ_LOCK_BH
+
+config INLINE_READ_LOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_READ_LOCK_IRQ
+
+config INLINE_READ_LOCK_IRQSAVE
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_READ_LOCK_IRQSAVE
+
+config INLINE_READ_UNLOCK
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
+
+config INLINE_READ_UNLOCK_BH
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
+
+config INLINE_READ_UNLOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH)
+
+config INLINE_READ_UNLOCK_IRQRESTORE
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+
+
+config INLINE_WRITE_TRYLOCK
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
+
+config INLINE_WRITE_LOCK
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
+
+config INLINE_WRITE_LOCK_BH
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_WRITE_LOCK_BH
+
+config INLINE_WRITE_LOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_WRITE_LOCK_IRQ
+
+config INLINE_WRITE_LOCK_IRQSAVE
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_WRITE_LOCK_IRQSAVE
+
+config INLINE_WRITE_UNLOCK
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
+
+config INLINE_WRITE_UNLOCK_BH
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
+
+config INLINE_WRITE_UNLOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH)
+
+config INLINE_WRITE_UNLOCK_IRQRESTORE
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+
+config MUTEX_SPIN_ON_OWNER
+	def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
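
The ARCH_INLINE_* symbols above are selected by an architecture's own Kconfig, and the derived INLINE_* options then act as ordinary CONFIG_* preprocessor switches in the generic locking code: when one is set, the corresponding lock operation is expanded inline at each call site instead of going through a single shared out-of-line function. The fragment below is a minimal illustrative sketch of that pattern only, using hypothetical names (my_lock, my_lock_acquire, arch_lock_slowpath) rather than the kernel's real spinlock API:

struct my_lock { volatile int v; };

void arch_lock_slowpath(struct my_lock *l);	/* single out-of-line copy */

#ifdef CONFIG_INLINE_SPIN_LOCK
/* The architecture selected ARCH_INLINE_SPIN_LOCK and debugging/lockbreak
 * are off, so expand the fast path at every call site. */
static inline void my_lock_acquire(struct my_lock *l)
{
	while (__sync_lock_test_and_set(&l->v, 1))
		;	/* spin until the previous holder releases */
}
#else
/* Default: smaller code at the cost of one function call per lock op. */
#define my_lock_acquire(l) arch_lock_slowpath(l)
#endif

The options therefore encode a code-size versus call-overhead trade-off. MUTEX_SPIN_ON_OWNER at the end is unrelated to inlining; it simply gates the adaptive spin-on-owner mutex path on SMP builds without mutex debugging.
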
diff --git a/kernel/Makefile b/kernel/Makefile
index b833bd5cc127..982c50e2ce53 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -58,7 +59,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
-obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
@@ -83,21 +83,22 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
 obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
+obj-$(CONFIG_TINY_RCU) += rcutiny.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
-obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
-obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
-obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
+obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
+obj-$(CONFIG_PERF_EVENTS) += perf_event.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -117,7 +118,7 @@ $(obj)/config_data.gz: .config FORCE
 	$(call if_changed,gzip)
 
 quiet_cmd_ikconfiggz = IKCFG $@
-      cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
+      cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
 targets += config_data.h
 $(obj)/config_data.h: $(obj)/config_data.gz FORCE
 	$(call if_changed,ikconfiggz)
diff --git a/kernel/audit.c b/kernel/audit.c
index defc2e6f1e3b..5feed232be9d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		break;
 	}
 	case AUDIT_SIGNAL_INFO:
-		err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
-		if (err)
-			return err;
+		len = 0;
+		if (audit_sig_sid) {
+			err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
+			if (err)
+				return err;
+		}
 		sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
 		if (!sig_data) {
-			security_release_secctx(ctx, len);
+			if (audit_sig_sid)
+				security_release_secctx(ctx, len);
 			return -ENOMEM;
 		}
 		sig_data->uid = audit_sig_uid;
 		sig_data->pid = audit_sig_pid;
-		memcpy(sig_data->ctx, ctx, len);
-		security_release_secctx(ctx, len);
+		if (audit_sig_sid) {
+			memcpy(sig_data->ctx, ctx, len);
+			security_release_secctx(ctx, len);
+		}
 		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
 				0, 0, sig_data, sizeof(*sig_data) + len);
 		kfree(sig_data);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 0e96dbc60ea9..cc7e87936cbc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -45,8 +45,8 @@
 
 struct audit_watch {
 	atomic_t		count;	/* reference count */
-	char			*path;	/* insertion path */
 	dev_t			dev;	/* associated superblock device */
+	char			*path;	/* insertion path */
 	unsigned long		ino;	/* associated inode number */
 	struct audit_parent	*parent; /* associated parent */
 	struct list_head	wlist;	/* entry in parent->watches list */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 68d3c6a0ecd6..267e484f0198 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -168,12 +168,12 @@ struct audit_context {
 	int		    in_syscall;	/* 1 if task is in a syscall */
 	enum audit_state    state, current_state;
 	unsigned int	    serial;     /* serial number for record */
-	struct timespec	    ctime;      /* time of syscall entry */
 	int		    major;      /* syscall number */
+	struct timespec	    ctime;      /* time of syscall entry */
 	unsigned long	    argv[4];    /* syscall arguments */
-	int		    return_valid; /* return code is valid */
 	long		    return_code;/* syscall return code */
 	u64		    prio;
+	int		    return_valid; /* return code is valid */
 	int		    name_count;
 	struct audit_names  names[AUDIT_NAMES];
 	char *		    filterkey;	/* key for rule that triggered record */
@@ -198,8 +198,8 @@ struct audit_context {
 	char		    target_comm[TASK_COMM_LEN];
 
 	struct audit_tree_refs *trees, *first_trees;
-	int tree_count;
 	struct list_head killed_trees;
+	int tree_count;
 
 	int type;
 	union {
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e17041963f5..7f876e60521f 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,7 +29,6 @@ EXPORT_SYMBOL(__cap_empty_set);
 EXPORT_SYMBOL(__cap_full_set);
 EXPORT_SYMBOL(__cap_init_eff_set);
 
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 int file_caps_enabled = 1;
 
 static int __init file_caps_disable(char *str)
@@ -38,7 +37,6 @@ static int __init file_caps_disable(char *str)
 	return 1;
 }
 __setup("no_file_caps", file_caps_disable);
-#endif
 
 /*
  * More recent versions of libcap are available from:
@@ -169,8 +167,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
 	kernel_cap_t pE, pI, pP;
 
 	ret = cap_validate_magic(header, &tocopy);
-	if (ret != 0)
-		return ret;
+	if ((dataptr == NULL) || (ret != 0))
+		return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret;
 
 	if (get_user(pid, &header->pid))
 		return -EFAULT;
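
As I read the capget() change above, passing a NULL data pointer is no longer an outright error: cap_validate_magic() still writes the kernel's preferred capability version back into the header, and the syscall now returns 0 in that case, so userspace (libcap) can probe which version to use. A minimal hedged userspace sketch of that probe, with error handling omitted:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/capability.h>

int main(void)
{
	struct __user_cap_header_struct hdr = { .version = 0, .pid = 0 };

	/* With dataptr == NULL the kernel only validates the header and
	 * reports its preferred _LINUX_CAPABILITY_VERSION_* in hdr.version. */
	syscall(SYS_capget, &hdr, NULL);
	printf("preferred capability version: 0x%x\n", hdr.version);
	return 0;
}
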
@@ -238,7 +236,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
 SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 {
 	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
-	unsigned i, tocopy;
+	unsigned i, tocopy, copybytes;
 	kernel_cap_t inheritable, permitted, effective;
 	struct cred *new;
 	int ret;
@@ -255,8 +253,11 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 	if (pid != 0 && pid != task_pid_vnr(current))
 		return -EPERM;
 
-	if (copy_from_user(&kdata, data,
-			   tocopy * sizeof(struct __user_cap_data_struct)))
+	copybytes = tocopy * sizeof(struct __user_cap_data_struct);
+	if (copybytes > sizeof(kdata))
+		return -EFAULT;
+
+	if (copy_from_user(&kdata, data, copybytes))
 		return -EFAULT;
 
 	for (i = 0; i < tocopy; i++) {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c7ece8f027f2..0249f4be9b5c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/cgroup.h> 25#include <linux/cgroup.h>
26#include <linux/ctype.h>
26#include <linux/errno.h> 27#include <linux/errno.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
@@ -48,6 +49,8 @@
48#include <linux/namei.h> 49#include <linux/namei.h>
49#include <linux/smp_lock.h> 50#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h> 51#include <linux/pid_namespace.h>
52#include <linux/idr.h>
53#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
51 54
52#include <asm/atomic.h> 55#include <asm/atomic.h>
53 56
@@ -60,6 +63,8 @@ static struct cgroup_subsys *subsys[] = {
60#include <linux/cgroup_subsys.h> 63#include <linux/cgroup_subsys.h>
61}; 64};
62 65
66#define MAX_CGROUP_ROOT_NAMELEN 64
67
63/* 68/*
64 * A cgroupfs_root represents the root of a cgroup hierarchy, 69 * A cgroupfs_root represents the root of a cgroup hierarchy,
65 * and may be associated with a superblock to form an active 70 * and may be associated with a superblock to form an active
@@ -74,6 +79,9 @@ struct cgroupfs_root {
74 */ 79 */
75 unsigned long subsys_bits; 80 unsigned long subsys_bits;
76 81
82 /* Unique id for this hierarchy. */
83 int hierarchy_id;
84
77 /* The bitmask of subsystems currently attached to this hierarchy */ 85 /* The bitmask of subsystems currently attached to this hierarchy */
78 unsigned long actual_subsys_bits; 86 unsigned long actual_subsys_bits;
79 87
@@ -94,6 +102,9 @@ struct cgroupfs_root {
94 102
95 /* The path to use for release notifications. */ 103 /* The path to use for release notifications. */
96 char release_agent_path[PATH_MAX]; 104 char release_agent_path[PATH_MAX];
105
106 /* The name for this hierarchy - may be empty */
107 char name[MAX_CGROUP_ROOT_NAMELEN];
97}; 108};
98 109
99/* 110/*
@@ -141,6 +152,10 @@ struct css_id {
141static LIST_HEAD(roots); 152static LIST_HEAD(roots);
142static int root_count; 153static int root_count;
143 154
155static DEFINE_IDA(hierarchy_ida);
156static int next_hierarchy_id;
157static DEFINE_SPINLOCK(hierarchy_id_lock);
158
144/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 159/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
145#define dummytop (&rootnode.top_cgroup) 160#define dummytop (&rootnode.top_cgroup)
146 161
@@ -201,6 +216,7 @@ struct cg_cgroup_link {
201 * cgroup, anchored on cgroup->css_sets 216 * cgroup, anchored on cgroup->css_sets
202 */ 217 */
203 struct list_head cgrp_link_list; 218 struct list_head cgrp_link_list;
219 struct cgroup *cgrp;
204 /* 220 /*
205 * List running through cg_cgroup_links pointing at a 221 * List running through cg_cgroup_links pointing at a
206 * single css_set object, anchored on css_set->cg_links 222 * single css_set object, anchored on css_set->cg_links
@@ -227,8 +243,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
227static DEFINE_RWLOCK(css_set_lock); 243static DEFINE_RWLOCK(css_set_lock);
228static int css_set_count; 244static int css_set_count;
229 245
230/* hash table for cgroup groups. This improves the performance to 246/*
231 * find an existing css_set */ 247 * hash table for cgroup groups. This improves the performance to find
248 * an existing css_set. This hash doesn't (currently) take into
249 * account cgroups in empty hierarchies.
250 */
232#define CSS_SET_HASH_BITS 7 251#define CSS_SET_HASH_BITS 7
233#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) 252#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
234static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; 253static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
@@ -248,48 +267,22 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
248 return &css_set_table[index]; 267 return &css_set_table[index];
249} 268}
250 269
270static void free_css_set_rcu(struct rcu_head *obj)
271{
272 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
273 kfree(cg);
274}
275
251/* We don't maintain the lists running through each css_set to its 276/* We don't maintain the lists running through each css_set to its
252 * task until after the first call to cgroup_iter_start(). This 277 * task until after the first call to cgroup_iter_start(). This
253 * reduces the fork()/exit() overhead for people who have cgroups 278 * reduces the fork()/exit() overhead for people who have cgroups
254 * compiled into their kernel but not actually in use */ 279 * compiled into their kernel but not actually in use */
255static int use_task_css_set_links __read_mostly; 280static int use_task_css_set_links __read_mostly;
256 281
257/* When we create or destroy a css_set, the operation simply 282static void __put_css_set(struct css_set *cg, int taskexit)
258 * takes/releases a reference count on all the cgroups referenced
259 * by subsystems in this css_set. This can end up multiple-counting
260 * some cgroups, but that's OK - the ref-count is just a
261 * busy/not-busy indicator; ensuring that we only count each cgroup
262 * once would require taking a global lock to ensure that no
263 * subsystems moved between hierarchies while we were doing so.
264 *
265 * Possible TODO: decide at boot time based on the number of
266 * registered subsystems and the number of CPUs or NUMA nodes whether
267 * it's better for performance to ref-count every subsystem, or to
268 * take a global lock and only add one ref count to each hierarchy.
269 */
270
271/*
272 * unlink a css_set from the list and free it
273 */
274static void unlink_css_set(struct css_set *cg)
275{ 283{
276 struct cg_cgroup_link *link; 284 struct cg_cgroup_link *link;
277 struct cg_cgroup_link *saved_link; 285 struct cg_cgroup_link *saved_link;
278
279 hlist_del(&cg->hlist);
280 css_set_count--;
281
282 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
283 cg_link_list) {
284 list_del(&link->cg_link_list);
285 list_del(&link->cgrp_link_list);
286 kfree(link);
287 }
288}
289
290static void __put_css_set(struct css_set *cg, int taskexit)
291{
292 int i;
293 /* 286 /*
294 * Ensure that the refcount doesn't hit zero while any readers 287 * Ensure that the refcount doesn't hit zero while any readers
295 * can see it. Similar to atomic_dec_and_lock(), but for an 288 * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -302,21 +295,28 @@ static void __put_css_set(struct css_set *cg, int taskexit)
302 write_unlock(&css_set_lock); 295 write_unlock(&css_set_lock);
303 return; 296 return;
304 } 297 }
305 unlink_css_set(cg);
306 write_unlock(&css_set_lock);
307 298
308 rcu_read_lock(); 299 /* This css_set is dead. unlink it and release cgroup refcounts */
309 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 300 hlist_del(&cg->hlist);
310 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); 301 css_set_count--;
302
303 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
304 cg_link_list) {
305 struct cgroup *cgrp = link->cgrp;
306 list_del(&link->cg_link_list);
307 list_del(&link->cgrp_link_list);
311 if (atomic_dec_and_test(&cgrp->count) && 308 if (atomic_dec_and_test(&cgrp->count) &&
312 notify_on_release(cgrp)) { 309 notify_on_release(cgrp)) {
313 if (taskexit) 310 if (taskexit)
314 set_bit(CGRP_RELEASABLE, &cgrp->flags); 311 set_bit(CGRP_RELEASABLE, &cgrp->flags);
315 check_for_release(cgrp); 312 check_for_release(cgrp);
316 } 313 }
314
315 kfree(link);
317 } 316 }
318 rcu_read_unlock(); 317
319 kfree(cg); 318 write_unlock(&css_set_lock);
319 call_rcu(&cg->rcu_head, free_css_set_rcu);
320} 320}
321 321
322/* 322/*
@@ -338,6 +338,78 @@ static inline void put_css_set_taskexit(struct css_set *cg)
338} 338}
339 339
340/* 340/*
341 * compare_css_sets - helper function for find_existing_css_set().
342 * @cg: candidate css_set being tested
343 * @old_cg: existing css_set for a task
344 * @new_cgrp: cgroup that's being entered by the task
345 * @template: desired set of css pointers in css_set (pre-calculated)
346 *
347 * Returns true if "cg" matches "old_cg" except for the hierarchy
348 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
349 */
350static bool compare_css_sets(struct css_set *cg,
351 struct css_set *old_cg,
352 struct cgroup *new_cgrp,
353 struct cgroup_subsys_state *template[])
354{
355 struct list_head *l1, *l2;
356
357 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
358 /* Not all subsystems matched */
359 return false;
360 }
361
362 /*
363 * Compare cgroup pointers in order to distinguish between
364 * different cgroups in heirarchies with no subsystems. We
365 * could get by with just this check alone (and skip the
366 * memcmp above) but on most setups the memcmp check will
367 * avoid the need for this more expensive check on almost all
368 * candidates.
369 */
370
371 l1 = &cg->cg_links;
372 l2 = &old_cg->cg_links;
373 while (1) {
374 struct cg_cgroup_link *cgl1, *cgl2;
375 struct cgroup *cg1, *cg2;
376
377 l1 = l1->next;
378 l2 = l2->next;
379 /* See if we reached the end - both lists are equal length. */
380 if (l1 == &cg->cg_links) {
381 BUG_ON(l2 != &old_cg->cg_links);
382 break;
383 } else {
384 BUG_ON(l2 == &old_cg->cg_links);
385 }
386 /* Locate the cgroups associated with these links. */
387 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
388 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
389 cg1 = cgl1->cgrp;
390 cg2 = cgl2->cgrp;
391 /* Hierarchies should be linked in the same order. */
392 BUG_ON(cg1->root != cg2->root);
393
394 /*
395 * If this hierarchy is the hierarchy of the cgroup
396 * that's changing, then we need to check that this
397 * css_set points to the new cgroup; if it's any other
398 * hierarchy, then this css_set should point to the
399 * same cgroup as the old css_set.
400 */
401 if (cg1->root == new_cgrp->root) {
402 if (cg1 != new_cgrp)
403 return false;
404 } else {
405 if (cg1 != cg2)
406 return false;
407 }
408 }
409 return true;
410}
411
412/*
341 * find_existing_css_set() is a helper for 413 * find_existing_css_set() is a helper for
342 * find_css_set(), and checks to see whether an existing 414 * find_css_set(), and checks to see whether an existing
343 * css_set is suitable. 415 * css_set is suitable.
@@ -378,10 +450,11 @@ static struct css_set *find_existing_css_set(
378 450
379 hhead = css_set_hash(template); 451 hhead = css_set_hash(template);
380 hlist_for_each_entry(cg, node, hhead, hlist) { 452 hlist_for_each_entry(cg, node, hhead, hlist) {
381 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { 453 if (!compare_css_sets(cg, oldcg, cgrp, template))
382 /* All subsystems matched */ 454 continue;
383 return cg; 455
384 } 456 /* This css_set matches what we need */
457 return cg;
385 } 458 }
386 459
387 /* No existing cgroup group matched */ 460 /* No existing cgroup group matched */
@@ -435,8 +508,14 @@ static void link_css_set(struct list_head *tmp_cg_links,
435 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, 508 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
436 cgrp_link_list); 509 cgrp_link_list);
437 link->cg = cg; 510 link->cg = cg;
511 link->cgrp = cgrp;
512 atomic_inc(&cgrp->count);
438 list_move(&link->cgrp_link_list, &cgrp->css_sets); 513 list_move(&link->cgrp_link_list, &cgrp->css_sets);
439 list_add(&link->cg_link_list, &cg->cg_links); 514 /*
515 * Always add links to the tail of the list so that the list
516 * is sorted by order of hierarchy creation
517 */
518 list_add_tail(&link->cg_link_list, &cg->cg_links);
440} 519}
441 520
442/* 521/*
@@ -451,11 +530,11 @@ static struct css_set *find_css_set(
451{ 530{
452 struct css_set *res; 531 struct css_set *res;
453 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 532 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
454 int i;
455 533
456 struct list_head tmp_cg_links; 534 struct list_head tmp_cg_links;
457 535
458 struct hlist_head *hhead; 536 struct hlist_head *hhead;
537 struct cg_cgroup_link *link;
459 538
460 /* First see if we already have a cgroup group that matches 539 /* First see if we already have a cgroup group that matches
461 * the desired set */ 540 * the desired set */
@@ -489,20 +568,12 @@ static struct css_set *find_css_set(
489 568
490 write_lock(&css_set_lock); 569 write_lock(&css_set_lock);
491 /* Add reference counts and links from the new css_set. */ 570 /* Add reference counts and links from the new css_set. */
492 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 571 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
493 struct cgroup *cgrp = res->subsys[i]->cgroup; 572 struct cgroup *c = link->cgrp;
494 struct cgroup_subsys *ss = subsys[i]; 573 if (c->root == cgrp->root)
495 atomic_inc(&cgrp->count); 574 c = cgrp;
496 /* 575 link_css_set(&tmp_cg_links, res, c);
497 * We want to add a link once per cgroup, so we
498 * only do it for the first subsystem in each
499 * hierarchy
500 */
501 if (ss->root->subsys_list.next == &ss->sibling)
502 link_css_set(&tmp_cg_links, res, cgrp);
503 } 576 }
504 if (list_empty(&rootnode.subsys_list))
505 link_css_set(&tmp_cg_links, res, dummytop);
506 577
507 BUG_ON(!list_empty(&tmp_cg_links)); 578 BUG_ON(!list_empty(&tmp_cg_links));
508 579
@@ -518,6 +589,41 @@ static struct css_set *find_css_set(
518} 589}
519 590
520/* 591/*
592 * Return the cgroup for "task" from the given hierarchy. Must be
593 * called with cgroup_mutex held.
594 */
595static struct cgroup *task_cgroup_from_root(struct task_struct *task,
596 struct cgroupfs_root *root)
597{
598 struct css_set *css;
599 struct cgroup *res = NULL;
600
601 BUG_ON(!mutex_is_locked(&cgroup_mutex));
602 read_lock(&css_set_lock);
603 /*
604 * No need to lock the task - since we hold cgroup_mutex the
605 * task can't change groups, so the only thing that can happen
606 * is that it exits and its css is set back to init_css_set.
607 */
608 css = task->cgroups;
609 if (css == &init_css_set) {
610 res = &root->top_cgroup;
611 } else {
612 struct cg_cgroup_link *link;
613 list_for_each_entry(link, &css->cg_links, cg_link_list) {
614 struct cgroup *c = link->cgrp;
615 if (c->root == root) {
616 res = c;
617 break;
618 }
619 }
620 }
621 read_unlock(&css_set_lock);
622 BUG_ON(!res);
623 return res;
624}
625
626/*
521 * There is one global cgroup mutex. We also require taking 627 * There is one global cgroup mutex. We also require taking
522 * task_lock() when dereferencing a task's cgroup subsys pointers. 628 * task_lock() when dereferencing a task's cgroup subsys pointers.
523 * See "The task_lock() exception", at the end of this comment. 629 * See "The task_lock() exception", at the end of this comment.
@@ -596,8 +702,8 @@ void cgroup_unlock(void)
596static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 702static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
597static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 703static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
598static int cgroup_populate_dir(struct cgroup *cgrp); 704static int cgroup_populate_dir(struct cgroup *cgrp);
599static struct inode_operations cgroup_dir_inode_operations; 705static const struct inode_operations cgroup_dir_inode_operations;
600static struct file_operations proc_cgroupstats_operations; 706static const struct file_operations proc_cgroupstats_operations;
601 707
602static struct backing_dev_info cgroup_backing_dev_info = { 708static struct backing_dev_info cgroup_backing_dev_info = {
603 .name = "cgroup", 709 .name = "cgroup",
@@ -677,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
677 */ 783 */
678 deactivate_super(cgrp->root->sb); 784 deactivate_super(cgrp->root->sb);
679 785
786 /*
787 * if we're getting rid of the cgroup, refcount should ensure
788 * that there are no pidlists left.
789 */
790 BUG_ON(!list_empty(&cgrp->pidlists));
791
680 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 792 call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
681 } 793 }
682 iput(inode); 794 iput(inode);
@@ -841,6 +953,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
841 seq_puts(seq, ",noprefix"); 953 seq_puts(seq, ",noprefix");
842 if (strlen(root->release_agent_path)) 954 if (strlen(root->release_agent_path))
843 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 955 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
956 if (strlen(root->name))
957 seq_printf(seq, ",name=%s", root->name);
844 mutex_unlock(&cgroup_mutex); 958 mutex_unlock(&cgroup_mutex);
845 return 0; 959 return 0;
846} 960}
@@ -849,6 +963,12 @@ struct cgroup_sb_opts {
849 unsigned long subsys_bits; 963 unsigned long subsys_bits;
850 unsigned long flags; 964 unsigned long flags;
851 char *release_agent; 965 char *release_agent;
966 char *name;
967 /* User explicitly requested empty subsystem */
968 bool none;
969
970 struct cgroupfs_root *new_root;
971
852}; 972};
853 973
854/* Convert a hierarchy specifier into a bitmask of subsystems and 974/* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -863,9 +983,7 @@ static int parse_cgroupfs_options(char *data,
863 mask = ~(1UL << cpuset_subsys_id); 983 mask = ~(1UL << cpuset_subsys_id);
864#endif 984#endif
865 985
866 opts->subsys_bits = 0; 986 memset(opts, 0, sizeof(*opts));
867 opts->flags = 0;
868 opts->release_agent = NULL;
869 987
870 while ((token = strsep(&o, ",")) != NULL) { 988 while ((token = strsep(&o, ",")) != NULL) {
871 if (!*token) 989 if (!*token)
@@ -879,17 +997,42 @@ static int parse_cgroupfs_options(char *data,
879 if (!ss->disabled) 997 if (!ss->disabled)
880 opts->subsys_bits |= 1ul << i; 998 opts->subsys_bits |= 1ul << i;
881 } 999 }
1000 } else if (!strcmp(token, "none")) {
1001 /* Explicitly have no subsystems */
1002 opts->none = true;
882 } else if (!strcmp(token, "noprefix")) { 1003 } else if (!strcmp(token, "noprefix")) {
883 set_bit(ROOT_NOPREFIX, &opts->flags); 1004 set_bit(ROOT_NOPREFIX, &opts->flags);
884 } else if (!strncmp(token, "release_agent=", 14)) { 1005 } else if (!strncmp(token, "release_agent=", 14)) {
885 /* Specifying two release agents is forbidden */ 1006 /* Specifying two release agents is forbidden */
886 if (opts->release_agent) 1007 if (opts->release_agent)
887 return -EINVAL; 1008 return -EINVAL;
888 opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); 1009 opts->release_agent =
1010 kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
889 if (!opts->release_agent) 1011 if (!opts->release_agent)
890 return -ENOMEM; 1012 return -ENOMEM;
891 strncpy(opts->release_agent, token + 14, PATH_MAX - 1); 1013 } else if (!strncmp(token, "name=", 5)) {
892 opts->release_agent[PATH_MAX - 1] = 0; 1014 int i;
1015 const char *name = token + 5;
1016 /* Can't specify an empty name */
1017 if (!strlen(name))
1018 return -EINVAL;
1019 /* Must match [\w.-]+ */
1020 for (i = 0; i < strlen(name); i++) {
1021 char c = name[i];
1022 if (isalnum(c))
1023 continue;
1024 if ((c == '.') || (c == '-') || (c == '_'))
1025 continue;
1026 return -EINVAL;
1027 }
1028 /* Specifying two names is forbidden */
1029 if (opts->name)
1030 return -EINVAL;
1031 opts->name = kstrndup(name,
1032 MAX_CGROUP_ROOT_NAMELEN,
1033 GFP_KERNEL);
1034 if (!opts->name)
1035 return -ENOMEM;
893 } else { 1036 } else {
894 struct cgroup_subsys *ss; 1037 struct cgroup_subsys *ss;
895 int i; 1038 int i;
@@ -906,6 +1049,8 @@ static int parse_cgroupfs_options(char *data,
906 } 1049 }
907 } 1050 }
908 1051
1052 /* Consistency checks */
1053
909 /* 1054 /*
910 * Option noprefix was introduced just for backward compatibility 1055 * Option noprefix was introduced just for backward compatibility
911 * with the old cpuset, so we allow noprefix only if mounting just 1056 * with the old cpuset, so we allow noprefix only if mounting just
@@ -915,8 +1060,16 @@ static int parse_cgroupfs_options(char *data,
915 (opts->subsys_bits & mask)) 1060 (opts->subsys_bits & mask))
916 return -EINVAL; 1061 return -EINVAL;
917 1062
918 /* We can't have an empty hierarchy */ 1063
919 if (!opts->subsys_bits) 1064 /* Can't specify "none" and some subsystems */
1065 if (opts->subsys_bits && opts->none)
1066 return -EINVAL;
1067
1068 /*
1069 * We either have to specify by name or by subsystems. (So all
1070 * empty hierarchies must have a name).
1071 */
1072 if (!opts->subsys_bits && !opts->name)
920 return -EINVAL; 1073 return -EINVAL;
921 1074
922 return 0; 1075 return 0;
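
Taken together, the new "none" and "name=" mount options plus the consistency checks above mean a hierarchy can now be mounted with no subsystems bound to it at all, provided it is given a name (and conversely, "none" cannot be combined with a subsystem list). As a usage aside that is not part of the patch, such a named, controller-free hierarchy would be mounted roughly as below; the mount point and name are arbitrary examples:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* "none"      - bind no controllers to this hierarchy
	 * "name=jobs" - required, since an empty hierarchy must be named */
	if (mount("cgroup", "/mnt/cgroup-jobs", "cgroup", 0,
		  "none,name=jobs") != 0)
		perror("mount cgroup");
	return 0;
}
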
@@ -944,6 +1097,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
944 goto out_unlock; 1097 goto out_unlock;
945 } 1098 }
946 1099
1100 /* Don't allow name to change at remount */
1101 if (opts.name && strcmp(opts.name, root->name)) {
1102 ret = -EINVAL;
1103 goto out_unlock;
1104 }
1105
947 ret = rebind_subsystems(root, opts.subsys_bits); 1106 ret = rebind_subsystems(root, opts.subsys_bits);
948 if (ret) 1107 if (ret)
949 goto out_unlock; 1108 goto out_unlock;
@@ -955,13 +1114,14 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
955 strcpy(root->release_agent_path, opts.release_agent); 1114 strcpy(root->release_agent_path, opts.release_agent);
956 out_unlock: 1115 out_unlock:
957 kfree(opts.release_agent); 1116 kfree(opts.release_agent);
1117 kfree(opts.name);
958 mutex_unlock(&cgroup_mutex); 1118 mutex_unlock(&cgroup_mutex);
959 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1119 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
960 unlock_kernel(); 1120 unlock_kernel();
961 return ret; 1121 return ret;
962} 1122}
963 1123
964static struct super_operations cgroup_ops = { 1124static const struct super_operations cgroup_ops = {
965 .statfs = simple_statfs, 1125 .statfs = simple_statfs,
966 .drop_inode = generic_delete_inode, 1126 .drop_inode = generic_delete_inode,
967 .show_options = cgroup_show_options, 1127 .show_options = cgroup_show_options,
@@ -974,9 +1134,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
974 INIT_LIST_HEAD(&cgrp->children); 1134 INIT_LIST_HEAD(&cgrp->children);
975 INIT_LIST_HEAD(&cgrp->css_sets); 1135 INIT_LIST_HEAD(&cgrp->css_sets);
976 INIT_LIST_HEAD(&cgrp->release_list); 1136 INIT_LIST_HEAD(&cgrp->release_list);
977 INIT_LIST_HEAD(&cgrp->pids_list); 1137 INIT_LIST_HEAD(&cgrp->pidlists);
978 init_rwsem(&cgrp->pids_mutex); 1138 mutex_init(&cgrp->pidlist_mutex);
979} 1139}
1140
980static void init_cgroup_root(struct cgroupfs_root *root) 1141static void init_cgroup_root(struct cgroupfs_root *root)
981{ 1142{
982 struct cgroup *cgrp = &root->top_cgroup; 1143 struct cgroup *cgrp = &root->top_cgroup;
@@ -988,33 +1149,106 @@ static void init_cgroup_root(struct cgroupfs_root *root)
988 init_cgroup_housekeeping(cgrp); 1149 init_cgroup_housekeeping(cgrp);
989} 1150}
990 1151
1152static bool init_root_id(struct cgroupfs_root *root)
1153{
1154 int ret = 0;
1155
1156 do {
1157 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
1158 return false;
1159 spin_lock(&hierarchy_id_lock);
1160 /* Try to allocate the next unused ID */
1161 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
1162 &root->hierarchy_id);
1163 if (ret == -ENOSPC)
1164 /* Try again starting from 0 */
1165 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
1166 if (!ret) {
1167 next_hierarchy_id = root->hierarchy_id + 1;
1168 } else if (ret != -EAGAIN) {
1169 /* Can only get here if the 31-bit IDR is full ... */
1170 BUG_ON(ret);
1171 }
1172 spin_unlock(&hierarchy_id_lock);
1173 } while (ret);
1174 return true;
1175}
1176
991static int cgroup_test_super(struct super_block *sb, void *data) 1177static int cgroup_test_super(struct super_block *sb, void *data)
992{ 1178{
993 struct cgroupfs_root *new = data; 1179 struct cgroup_sb_opts *opts = data;
994 struct cgroupfs_root *root = sb->s_fs_info; 1180 struct cgroupfs_root *root = sb->s_fs_info;
995 1181
996 /* First check subsystems */ 1182 /* If we asked for a name then it must match */
997 if (new->subsys_bits != root->subsys_bits) 1183 if (opts->name && strcmp(opts->name, root->name))
998 return 0; 1184 return 0;
999 1185
1000 /* Next check flags */ 1186 /*
1001 if (new->flags != root->flags) 1187 * If we asked for subsystems (or explicitly for no
1188 * subsystems) then they must match
1189 */
1190 if ((opts->subsys_bits || opts->none)
1191 && (opts->subsys_bits != root->subsys_bits))
1002 return 0; 1192 return 0;
1003 1193
1004 return 1; 1194 return 1;
1005} 1195}
1006 1196
1197static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1198{
1199 struct cgroupfs_root *root;
1200
1201 if (!opts->subsys_bits && !opts->none)
1202 return NULL;
1203
1204 root = kzalloc(sizeof(*root), GFP_KERNEL);
1205 if (!root)
1206 return ERR_PTR(-ENOMEM);
1207
1208 if (!init_root_id(root)) {
1209 kfree(root);
1210 return ERR_PTR(-ENOMEM);
1211 }
1212 init_cgroup_root(root);
1213
1214 root->subsys_bits = opts->subsys_bits;
1215 root->flags = opts->flags;
1216 if (opts->release_agent)
1217 strcpy(root->release_agent_path, opts->release_agent);
1218 if (opts->name)
1219 strcpy(root->name, opts->name);
1220 return root;
1221}
1222
1223static void cgroup_drop_root(struct cgroupfs_root *root)
1224{
1225 if (!root)
1226 return;
1227
1228 BUG_ON(!root->hierarchy_id);
1229 spin_lock(&hierarchy_id_lock);
1230 ida_remove(&hierarchy_ida, root->hierarchy_id);
1231 spin_unlock(&hierarchy_id_lock);
1232 kfree(root);
1233}
1234
1007static int cgroup_set_super(struct super_block *sb, void *data) 1235static int cgroup_set_super(struct super_block *sb, void *data)
1008{ 1236{
1009 int ret; 1237 int ret;
1010 struct cgroupfs_root *root = data; 1238 struct cgroup_sb_opts *opts = data;
1239
1240 /* If we don't have a new root, we can't set up a new sb */
1241 if (!opts->new_root)
1242 return -EINVAL;
1243
1244 BUG_ON(!opts->subsys_bits && !opts->none);
1011 1245
1012 ret = set_anon_super(sb, NULL); 1246 ret = set_anon_super(sb, NULL);
1013 if (ret) 1247 if (ret)
1014 return ret; 1248 return ret;
1015 1249
1016 sb->s_fs_info = root; 1250 sb->s_fs_info = opts->new_root;
1017 root->sb = sb; 1251 opts->new_root->sb = sb;
1018 1252
1019 sb->s_blocksize = PAGE_CACHE_SIZE; 1253 sb->s_blocksize = PAGE_CACHE_SIZE;
1020 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1254 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
@@ -1051,48 +1285,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1051 void *data, struct vfsmount *mnt) 1285 void *data, struct vfsmount *mnt)
1052{ 1286{
1053 struct cgroup_sb_opts opts; 1287 struct cgroup_sb_opts opts;
1288 struct cgroupfs_root *root;
1054 int ret = 0; 1289 int ret = 0;
1055 struct super_block *sb; 1290 struct super_block *sb;
1056 struct cgroupfs_root *root; 1291 struct cgroupfs_root *new_root;
1057 struct list_head tmp_cg_links;
1058 1292
1059 /* First find the desired set of subsystems */ 1293 /* First find the desired set of subsystems */
1060 ret = parse_cgroupfs_options(data, &opts); 1294 ret = parse_cgroupfs_options(data, &opts);
1061 if (ret) { 1295 if (ret)
1062 kfree(opts.release_agent); 1296 goto out_err;
1063 return ret;
1064 }
1065
1066 root = kzalloc(sizeof(*root), GFP_KERNEL);
1067 if (!root) {
1068 kfree(opts.release_agent);
1069 return -ENOMEM;
1070 }
1071 1297
1072 init_cgroup_root(root); 1298 /*
1073 root->subsys_bits = opts.subsys_bits; 1299 * Allocate a new cgroup root. We may not need it if we're
1074 root->flags = opts.flags; 1300 * reusing an existing hierarchy.
1075 if (opts.release_agent) { 1301 */
1076 strcpy(root->release_agent_path, opts.release_agent); 1302 new_root = cgroup_root_from_opts(&opts);
1077 kfree(opts.release_agent); 1303 if (IS_ERR(new_root)) {
1304 ret = PTR_ERR(new_root);
1305 goto out_err;
1078 } 1306 }
1307 opts.new_root = new_root;
1079 1308
1080 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); 1309 /* Locate an existing or new sb for this hierarchy */
1081 1310 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
1082 if (IS_ERR(sb)) { 1311 if (IS_ERR(sb)) {
1083 kfree(root); 1312 ret = PTR_ERR(sb);
1084 return PTR_ERR(sb); 1313 cgroup_drop_root(opts.new_root);
1314 goto out_err;
1085 } 1315 }
1086 1316
1087 if (sb->s_fs_info != root) { 1317 root = sb->s_fs_info;
1088 /* Reusing an existing superblock */ 1318 BUG_ON(!root);
1089 BUG_ON(sb->s_root == NULL); 1319 if (root == opts.new_root) {
1090 kfree(root); 1320 /* We used the new root structure, so this is a new hierarchy */
1091 root = NULL; 1321 struct list_head tmp_cg_links;
1092 } else {
1093 /* New superblock */
1094 struct cgroup *root_cgrp = &root->top_cgroup; 1322 struct cgroup *root_cgrp = &root->top_cgroup;
1095 struct inode *inode; 1323 struct inode *inode;
1324 struct cgroupfs_root *existing_root;
1096 int i; 1325 int i;
1097 1326
1098 BUG_ON(sb->s_root != NULL); 1327 BUG_ON(sb->s_root != NULL);
@@ -1105,6 +1334,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1105 mutex_lock(&inode->i_mutex); 1334 mutex_lock(&inode->i_mutex);
1106 mutex_lock(&cgroup_mutex); 1335 mutex_lock(&cgroup_mutex);
1107 1336
1337 if (strlen(root->name)) {
1338 /* Check for name clashes with existing mounts */
1339 for_each_active_root(existing_root) {
1340 if (!strcmp(existing_root->name, root->name)) {
1341 ret = -EBUSY;
1342 mutex_unlock(&cgroup_mutex);
1343 mutex_unlock(&inode->i_mutex);
1344 goto drop_new_super;
1345 }
1346 }
1347 }
1348
1108 /* 1349 /*
1109 * We're accessing css_set_count without locking 1350 * We're accessing css_set_count without locking
1110 * css_set_lock here, but that's OK - it can only be 1351 * css_set_lock here, but that's OK - it can only be
@@ -1123,7 +1364,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1123 if (ret == -EBUSY) { 1364 if (ret == -EBUSY) {
1124 mutex_unlock(&cgroup_mutex); 1365 mutex_unlock(&cgroup_mutex);
1125 mutex_unlock(&inode->i_mutex); 1366 mutex_unlock(&inode->i_mutex);
1126 goto free_cg_links; 1367 free_cg_links(&tmp_cg_links);
1368 goto drop_new_super;
1127 } 1369 }
1128 1370
1129 /* EBUSY should be the only error here */ 1371 /* EBUSY should be the only error here */
@@ -1155,17 +1397,27 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1155 BUG_ON(root->number_of_cgroups != 1); 1397 BUG_ON(root->number_of_cgroups != 1);
1156 1398
1157 cgroup_populate_dir(root_cgrp); 1399 cgroup_populate_dir(root_cgrp);
1158 mutex_unlock(&inode->i_mutex);
1159 mutex_unlock(&cgroup_mutex); 1400 mutex_unlock(&cgroup_mutex);
1401 mutex_unlock(&inode->i_mutex);
1402 } else {
1403 /*
1404 * We re-used an existing hierarchy - the new root (if
1405 * any) is not needed
1406 */
1407 cgroup_drop_root(opts.new_root);
1160 } 1408 }
1161 1409
1162 simple_set_mnt(mnt, sb); 1410 simple_set_mnt(mnt, sb);
1411 kfree(opts.release_agent);
1412 kfree(opts.name);
1163 return 0; 1413 return 0;
1164 1414
1165 free_cg_links:
1166 free_cg_links(&tmp_cg_links);
1167 drop_new_super: 1415 drop_new_super:
1168 deactivate_locked_super(sb); 1416 deactivate_locked_super(sb);
1417 out_err:
1418 kfree(opts.release_agent);
1419 kfree(opts.name);
1420
1169 return ret; 1421 return ret;
1170} 1422}
1171 1423
@@ -1211,7 +1463,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1211 mutex_unlock(&cgroup_mutex); 1463 mutex_unlock(&cgroup_mutex);
1212 1464
1213 kill_litter_super(sb); 1465 kill_litter_super(sb);
1214 kfree(root); 1466 cgroup_drop_root(root);
1215} 1467}
1216 1468
1217static struct file_system_type cgroup_fs_type = { 1469static struct file_system_type cgroup_fs_type = {
@@ -1276,27 +1528,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1276 return 0; 1528 return 0;
1277} 1529}
1278 1530
1279/*
1280 * Return the first subsystem attached to a cgroup's hierarchy, and
1281 * its subsystem id.
1282 */
1283
1284static void get_first_subsys(const struct cgroup *cgrp,
1285 struct cgroup_subsys_state **css, int *subsys_id)
1286{
1287 const struct cgroupfs_root *root = cgrp->root;
1288 const struct cgroup_subsys *test_ss;
1289 BUG_ON(list_empty(&root->subsys_list));
1290 test_ss = list_entry(root->subsys_list.next,
1291 struct cgroup_subsys, sibling);
1292 if (css) {
1293 *css = cgrp->subsys[test_ss->subsys_id];
1294 BUG_ON(!*css);
1295 }
1296 if (subsys_id)
1297 *subsys_id = test_ss->subsys_id;
1298}
1299
1300/** 1531/**
1301 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1532 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1302 * @cgrp: the cgroup the task is attaching to 1533 * @cgrp: the cgroup the task is attaching to
@@ -1313,18 +1544,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1313 struct css_set *cg; 1544 struct css_set *cg;
1314 struct css_set *newcg; 1545 struct css_set *newcg;
1315 struct cgroupfs_root *root = cgrp->root; 1546 struct cgroupfs_root *root = cgrp->root;
1316 int subsys_id;
1317
1318 get_first_subsys(cgrp, NULL, &subsys_id);
1319 1547
1320 /* Nothing to do if the task is already in that cgroup */ 1548 /* Nothing to do if the task is already in that cgroup */
1321 oldcgrp = task_cgroup(tsk, subsys_id); 1549 oldcgrp = task_cgroup_from_root(tsk, root);
1322 if (cgrp == oldcgrp) 1550 if (cgrp == oldcgrp)
1323 return 0; 1551 return 0;
1324 1552
1325 for_each_subsys(root, ss) { 1553 for_each_subsys(root, ss) {
1326 if (ss->can_attach) { 1554 if (ss->can_attach) {
1327 retval = ss->can_attach(ss, cgrp, tsk); 1555 retval = ss->can_attach(ss, cgrp, tsk, false);
1328 if (retval) 1556 if (retval)
1329 return retval; 1557 return retval;
1330 } 1558 }
@@ -1362,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1362 1590
1363 for_each_subsys(root, ss) { 1591 for_each_subsys(root, ss) {
1364 if (ss->attach) 1592 if (ss->attach)
1365 ss->attach(ss, cgrp, oldcgrp, tsk); 1593 ss->attach(ss, cgrp, oldcgrp, tsk, false);
1366 } 1594 }
1367 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1595 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1368 synchronize_rcu(); 1596 synchronize_rcu();
@@ -1423,15 +1651,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1423 return ret; 1651 return ret;
1424} 1652}
1425 1653
1426/* The various types of files and directories in a cgroup file system */
1427enum cgroup_filetype {
1428 FILE_ROOT,
1429 FILE_DIR,
1430 FILE_TASKLIST,
1431 FILE_NOTIFY_ON_RELEASE,
1432 FILE_RELEASE_AGENT,
1433};
1434
1435/** 1654/**
1436 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 1655 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
1437 * @cgrp: the cgroup to be checked for liveness 1656 * @cgrp: the cgroup to be checked for liveness
@@ -1491,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1491 return -EFAULT; 1710 return -EFAULT;
1492 1711
1493 buffer[nbytes] = 0; /* nul-terminate */ 1712 buffer[nbytes] = 0; /* nul-terminate */
1494 strstrip(buffer);
1495 if (cft->write_u64) { 1713 if (cft->write_u64) {
1496 u64 val = simple_strtoull(buffer, &end, 0); 1714 u64 val = simple_strtoull(strstrip(buffer), &end, 0);
1497 if (*end) 1715 if (*end)
1498 return -EINVAL; 1716 return -EINVAL;
1499 retval = cft->write_u64(cgrp, cft, val); 1717 retval = cft->write_u64(cgrp, cft, val);
1500 } else { 1718 } else {
1501 s64 val = simple_strtoll(buffer, &end, 0); 1719 s64 val = simple_strtoll(strstrip(buffer), &end, 0);
1502 if (*end) 1720 if (*end)
1503 return -EINVAL; 1721 return -EINVAL;
1504 retval = cft->write_s64(cgrp, cft, val); 1722 retval = cft->write_s64(cgrp, cft, val);
@@ -1534,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
1534 } 1752 }
1535 1753
1536 buffer[nbytes] = 0; /* nul-terminate */ 1754 buffer[nbytes] = 0; /* nul-terminate */
1537 strstrip(buffer); 1755 retval = cft->write_string(cgrp, cft, strstrip(buffer));
1538 retval = cft->write_string(cgrp, cft, buffer);
1539 if (!retval) 1756 if (!retval)
1540 retval = nbytes; 1757 retval = nbytes;
1541out: 1758out:
@@ -1644,7 +1861,7 @@ static int cgroup_seqfile_release(struct inode *inode, struct file *file)
1644 return single_release(inode, file); 1861 return single_release(inode, file);
1645} 1862}
1646 1863
1647static struct file_operations cgroup_seqfile_operations = { 1864static const struct file_operations cgroup_seqfile_operations = {
1648 .read = seq_read, 1865 .read = seq_read,
1649 .write = cgroup_file_write, 1866 .write = cgroup_file_write,
1650 .llseek = seq_lseek, 1867 .llseek = seq_lseek,
@@ -1703,7 +1920,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
1703 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 1920 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1704} 1921}
1705 1922
1706static struct file_operations cgroup_file_operations = { 1923static const struct file_operations cgroup_file_operations = {
1707 .read = cgroup_file_read, 1924 .read = cgroup_file_read,
1708 .write = cgroup_file_write, 1925 .write = cgroup_file_write,
1709 .llseek = generic_file_llseek, 1926 .llseek = generic_file_llseek,
@@ -1711,7 +1928,7 @@ static struct file_operations cgroup_file_operations = {
1711 .release = cgroup_file_release, 1928 .release = cgroup_file_release,
1712}; 1929};
1713 1930
1714static struct inode_operations cgroup_dir_inode_operations = { 1931static const struct inode_operations cgroup_dir_inode_operations = {
1715 .lookup = simple_lookup, 1932 .lookup = simple_lookup,
1716 .mkdir = cgroup_mkdir, 1933 .mkdir = cgroup_mkdir,
1717 .rmdir = cgroup_rmdir, 1934 .rmdir = cgroup_rmdir,
@@ -1876,7 +2093,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
1876 * the start of a css_set 2093 * the start of a css_set
1877 */ 2094 */
1878static void cgroup_advance_iter(struct cgroup *cgrp, 2095static void cgroup_advance_iter(struct cgroup *cgrp,
1879 struct cgroup_iter *it) 2096 struct cgroup_iter *it)
1880{ 2097{
1881 struct list_head *l = it->cg_link; 2098 struct list_head *l = it->cg_link;
1882 struct cg_cgroup_link *link; 2099 struct cg_cgroup_link *link;
@@ -2129,7 +2346,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2129} 2346}
2130 2347
2131/* 2348/*
2132 * Stuff for reading the 'tasks' file. 2349 * Stuff for reading the 'tasks'/'procs' files.
2133 * 2350 *
2134 * Reading this file can return large amounts of data if a cgroup has 2351 * Reading this file can return large amounts of data if a cgroup has
2135 * *lots* of attached tasks. So it may need several calls to read(), 2352 * *lots* of attached tasks. So it may need several calls to read(),
@@ -2139,27 +2356,196 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2139 */ 2356 */
2140 2357
2141/* 2358/*
2142 * Load into 'pidarray' up to 'npids' of the tasks using cgroup 2359 * The following two functions "fix" the issue where there are more pids
2143 * 'cgrp'. Return actual number of pids loaded. No need to 2360 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
2144 * task_lock(p) when reading out p->cgroup, since we're in an RCU 2361 * TODO: replace with a kernel-wide solution to this problem
2145 * read section, so the css_set can't go away, and is 2362 */
2146 * immutable after creation. 2363#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
2364static void *pidlist_allocate(int count)
2365{
2366 if (PIDLIST_TOO_LARGE(count))
2367 return vmalloc(count * sizeof(pid_t));
2368 else
2369 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
2370}
2371static void pidlist_free(void *p)
2372{
2373 if (is_vmalloc_addr(p))
2374 vfree(p);
2375 else
2376 kfree(p);
2377}
2378static void *pidlist_resize(void *p, int newcount)
2379{
2380 void *newlist;
2381 /* note: if new alloc fails, old p will still be valid either way */
2382 if (is_vmalloc_addr(p)) {
2383 newlist = vmalloc(newcount * sizeof(pid_t));
2384 if (!newlist)
2385 return NULL;
2386 memcpy(newlist, p, newcount * sizeof(pid_t));
2387 vfree(p);
2388 } else {
2389 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
2390 }
2391 return newlist;
2392}
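A rough userspace illustration of the size cutoff used by the helpers above: with an assumed 4096-byte page and a 4-byte pid_t, anything beyond two pages' worth of pids (2048 entries) would go to vmalloc(). Only the threshold macro is taken from the patch; DEMO_PAGE_SIZE and main() are invented.

#include <stdio.h>
#include <sys/types.h>

#define DEMO_PAGE_SIZE 4096UL   /* assumption: PAGE_SIZE on most configurations */
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (DEMO_PAGE_SIZE * 2))

int main(void)
{
        int counts[] = { 16, 2048, 2049, 100000 };
        size_t i;

        for (i = 0; i < sizeof(counts) / sizeof(counts[0]); i++)
                printf("%6d pids -> %s\n", counts[i],
                       PIDLIST_TOO_LARGE(counts[i]) ? "vmalloc" : "kmalloc");
        return 0;
}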
2393
2394/*
2395 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
2396 * If the new stripped list is sufficiently smaller and there's enough memory
2397 * to allocate a new buffer, will let go of the unneeded memory. Returns the
2398 * number of unique elements.
2399 */
2400/* is the size difference enough that we should re-allocate the array? */
2401#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
2402static int pidlist_uniq(pid_t **p, int length)
2403{
2404 int src, dest = 1;
2405 pid_t *list = *p;
2406 pid_t *newlist;
2407
2408 /*
 2409 * we presume the 0th element is unique, so src starts at 1. trivial
2410 * edge cases first; no work needs to be done for either
2411 */
2412 if (length == 0 || length == 1)
2413 return length;
2414 /* src and dest walk down the list; dest counts unique elements */
2415 for (src = 1; src < length; src++) {
2416 /* find next unique element */
2417 while (list[src] == list[src-1]) {
2418 src++;
2419 if (src == length)
2420 goto after;
2421 }
2422 /* dest always points to where the next unique element goes */
2423 list[dest] = list[src];
2424 dest++;
2425 }
2426after:
2427 /*
2428 * if the length difference is large enough, we want to allocate a
2429 * smaller buffer to save memory. if this fails due to out of memory,
2430 * we'll just stay with what we've got.
2431 */
2432 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
2433 newlist = pidlist_resize(list, dest);
2434 if (newlist)
2435 *p = newlist;
2436 }
2437 return dest;
2438}
2439
2440static int cmppid(const void *a, const void *b)
2441{
2442 return *(pid_t *)a - *(pid_t *)b;
2443}
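Taken together, cmppid() and pidlist_uniq() implement the usual sort-then-deduplicate pass over a pid array. A self-contained userspace version of the same idea, with qsort() standing in for the kernel's sort() and a compact equivalent of the two-index walk (the sample pids are made up):

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

static int cmppid(const void *a, const void *b)
{
        return *(const pid_t *)a - *(const pid_t *)b;
}

/* In-place dedup of an already-sorted array; returns the unique count. */
static int uniq(pid_t *list, int length)
{
        int src, dest = 1;

        if (length < 2)
                return length;
        for (src = 1; src < length; src++)
                if (list[src] != list[dest - 1])
                        list[dest++] = list[src];
        return dest;
}

int main(void)
{
        pid_t pids[] = { 42, 7, 42, 42, 7, 1 };
        int i, n = sizeof(pids) / sizeof(pids[0]);

        qsort(pids, n, sizeof(pid_t), cmppid);
        n = uniq(pids, n);
        for (i = 0; i < n; i++)
                printf("%d\n", (int)pids[i]);   /* prints 1, 7, 42 */
        return 0;
}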
2444
2445/*
2446 * find the appropriate pidlist for our purpose (given procs vs tasks)
2447 * returns with the lock on that pidlist already held, and takes care
2448 * of the use count, or returns NULL with no locks held if we're out of
2449 * memory.
2147 */ 2450 */
2148static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2451static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2452 enum cgroup_filetype type)
2149{ 2453{
2150 int n = 0, pid; 2454 struct cgroup_pidlist *l;
2455 /* don't need task_nsproxy() if we're looking at ourself */
2456 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
2457 /*
2458 * We can't drop the pidlist_mutex before taking the l->mutex in case
2459 * the last ref-holder is trying to remove l from the list at the same
2460 * time. Holding the pidlist_mutex precludes somebody taking whichever
2461 * list we find out from under us - compare release_pid_array().
2462 */
2463 mutex_lock(&cgrp->pidlist_mutex);
2464 list_for_each_entry(l, &cgrp->pidlists, links) {
2465 if (l->key.type == type && l->key.ns == ns) {
2466 /* found a matching list - drop the extra refcount */
2467 put_pid_ns(ns);
2468 /* make sure l doesn't vanish out from under us */
2469 down_write(&l->mutex);
2470 mutex_unlock(&cgrp->pidlist_mutex);
2471 l->use_count++;
2472 return l;
2473 }
2474 }
2475 /* entry not found; create a new one */
2476 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2477 if (!l) {
2478 mutex_unlock(&cgrp->pidlist_mutex);
2479 put_pid_ns(ns);
2480 return l;
2481 }
2482 init_rwsem(&l->mutex);
2483 down_write(&l->mutex);
2484 l->key.type = type;
2485 l->key.ns = ns;
2486 l->use_count = 0; /* don't increment here */
2487 l->list = NULL;
2488 l->owner = cgrp;
2489 list_add(&l->links, &cgrp->pidlists);
2490 mutex_unlock(&cgrp->pidlist_mutex);
2491 return l;
2492}
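The ordering rule spelled out in the comment (the cgroup-wide pidlist_mutex is always taken before a pidlist's own lock, on lookup and on release alike) is what keeps a concurrent last put from freeing the entry a lookup has just found. A minimal pthreads analogue of that pattern, with entirely invented names (demo_list, demo_entry, demo_find, demo_put):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_entry {
        pthread_mutex_t lock;           /* plays the role of l->mutex */
        int use_count;
        struct demo_entry *next;
};

struct demo_list {
        pthread_mutex_t list_lock;      /* plays the role of cgrp->pidlist_mutex */
        struct demo_entry *head;
};

/* Lookup: list lock first, then pin and lock the entry, then drop the list lock. */
static struct demo_entry *demo_find(struct demo_list *dl)
{
        struct demo_entry *e;

        pthread_mutex_lock(&dl->list_lock);
        e = dl->head;
        if (e) {
                pthread_mutex_lock(&e->lock);
                e->use_count++;
        }
        pthread_mutex_unlock(&dl->list_lock);
        return e;                       /* returned locked; caller unlocks when done */
}

/* Release: same order, list lock before entry lock, so find and put never deadlock. */
static void demo_put(struct demo_list *dl, struct demo_entry *e)
{
        pthread_mutex_lock(&dl->list_lock);
        pthread_mutex_lock(&e->lock);
        if (--e->use_count == 0) {
                dl->head = e->next;     /* last user: unlink while both locks are held */
                pthread_mutex_unlock(&dl->list_lock);
                pthread_mutex_unlock(&e->lock);
                pthread_mutex_destroy(&e->lock);
                free(e);
                return;
        }
        pthread_mutex_unlock(&e->lock);
        pthread_mutex_unlock(&dl->list_lock);
}

int main(void)
{
        struct demo_list dl = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct demo_entry *e = calloc(1, sizeof(*e));

        pthread_mutex_init(&e->lock, NULL);
        dl.head = e;

        e = demo_find(&dl);             /* use_count goes to 1, e->lock is held */
        pthread_mutex_unlock(&e->lock); /* done touching the entry's contents */
        demo_put(&dl, e);               /* drops the last reference and frees */
        printf("list head is now %p\n", (void *)dl.head);
        return 0;
}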
2493
2494/*
2495 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
2496 */
2497static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
2498 struct cgroup_pidlist **lp)
2499{
2500 pid_t *array;
2501 int length;
2502 int pid, n = 0; /* used for populating the array */
2151 struct cgroup_iter it; 2503 struct cgroup_iter it;
2152 struct task_struct *tsk; 2504 struct task_struct *tsk;
2505 struct cgroup_pidlist *l;
2506
2507 /*
2508 * If cgroup gets more users after we read count, we won't have
2509 * enough space - tough. This race is indistinguishable to the
2510 * caller from the case that the additional cgroup users didn't
2511 * show up until sometime later on.
2512 */
2513 length = cgroup_task_count(cgrp);
2514 array = pidlist_allocate(length);
2515 if (!array)
2516 return -ENOMEM;
2517 /* now, populate the array */
2153 cgroup_iter_start(cgrp, &it); 2518 cgroup_iter_start(cgrp, &it);
2154 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2519 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2155 if (unlikely(n == npids)) 2520 if (unlikely(n == length))
2156 break; 2521 break;
2157 pid = task_pid_vnr(tsk); 2522 /* get tgid or pid for procs or tasks file respectively */
2158 if (pid > 0) 2523 if (type == CGROUP_FILE_PROCS)
2159 pidarray[n++] = pid; 2524 pid = task_tgid_vnr(tsk);
2525 else
2526 pid = task_pid_vnr(tsk);
2527 if (pid > 0) /* make sure to only use valid results */
2528 array[n++] = pid;
2160 } 2529 }
2161 cgroup_iter_end(cgrp, &it); 2530 cgroup_iter_end(cgrp, &it);
2162 return n; 2531 length = n;
2532 /* now sort & (if procs) strip out duplicates */
2533 sort(array, length, sizeof(pid_t), cmppid, NULL);
2534 if (type == CGROUP_FILE_PROCS)
2535 length = pidlist_uniq(&array, length);
2536 l = cgroup_pidlist_find(cgrp, type);
2537 if (!l) {
2538 pidlist_free(array);
2539 return -ENOMEM;
2540 }
2541 /* store array, freeing old if necessary - lock already held */
2542 pidlist_free(l->list);
2543 l->list = array;
2544 l->length = length;
2545 l->use_count++;
2546 up_write(&l->mutex);
2547 *lp = l;
2548 return 0;
2163} 2549}
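The procs/tasks split above is the familiar tgid/pid distinction: every thread in a group shares the value getpid() reports (the tgid), while each thread has its own tid. A quick userspace demonstration using the raw gettid syscall, since older glibc has no wrapper for it; gettid_wrap and worker are made-up names, compile with -pthread.

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static pid_t gettid_wrap(void)
{
        return (pid_t)syscall(SYS_gettid);
}

static void *worker(void *arg)
{
        (void)arg;
        /* Same tgid as main(), different tid: a "procs" entry vs a "tasks" entry. */
        printf("thread: tgid=%d tid=%d\n", (int)getpid(), (int)gettid_wrap());
        return NULL;
}

int main(void)
{
        pthread_t t;

        printf("main:   tgid=%d tid=%d\n", (int)getpid(), (int)gettid_wrap());
        pthread_create(&t, NULL, worker, NULL);
        pthread_join(t, NULL);
        return 0;
}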
2164 2550
2165/** 2551/**
@@ -2216,37 +2602,14 @@ err:
2216 return ret; 2602 return ret;
2217} 2603}
2218 2604
2219/*
2220 * Cache pids for all threads in the same pid namespace that are
2221 * opening the same "tasks" file.
2222 */
2223struct cgroup_pids {
2224 /* The node in cgrp->pids_list */
2225 struct list_head list;
2226 /* The cgroup those pids belong to */
2227 struct cgroup *cgrp;
2228 /* The namepsace those pids belong to */
2229 struct pid_namespace *ns;
2230 /* Array of process ids in the cgroup */
2231 pid_t *tasks_pids;
2232 /* How many files are using the this tasks_pids array */
2233 int use_count;
2234 /* Length of the current tasks_pids array */
2235 int length;
2236};
2237
2238static int cmppid(const void *a, const void *b)
2239{
2240 return *(pid_t *)a - *(pid_t *)b;
2241}
2242 2605
2243/* 2606/*
2244 * seq_file methods for the "tasks" file. The seq_file position is the 2607 * seq_file methods for the tasks/procs files. The seq_file position is the
2245 * next pid to display; the seq_file iterator is a pointer to the pid 2608 * next pid to display; the seq_file iterator is a pointer to the pid
2246 * in the cgroup->tasks_pids array. 2609 * in the cgroup->l->list array.
2247 */ 2610 */
2248 2611
2249static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) 2612static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
2250{ 2613{
2251 /* 2614 /*
2252 * Initially we receive a position value that corresponds to 2615 * Initially we receive a position value that corresponds to
@@ -2254,48 +2617,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2254 * after a seek to the start). Use a binary-search to find the 2617 * after a seek to the start). Use a binary-search to find the
2255 * next pid to display, if any 2618 * next pid to display, if any
2256 */ 2619 */
2257 struct cgroup_pids *cp = s->private; 2620 struct cgroup_pidlist *l = s->private;
2258 struct cgroup *cgrp = cp->cgrp;
2259 int index = 0, pid = *pos; 2621 int index = 0, pid = *pos;
2260 int *iter; 2622 int *iter;
2261 2623
2262 down_read(&cgrp->pids_mutex); 2624 down_read(&l->mutex);
2263 if (pid) { 2625 if (pid) {
2264 int end = cp->length; 2626 int end = l->length;
2265 2627
2266 while (index < end) { 2628 while (index < end) {
2267 int mid = (index + end) / 2; 2629 int mid = (index + end) / 2;
2268 if (cp->tasks_pids[mid] == pid) { 2630 if (l->list[mid] == pid) {
2269 index = mid; 2631 index = mid;
2270 break; 2632 break;
2271 } else if (cp->tasks_pids[mid] <= pid) 2633 } else if (l->list[mid] <= pid)
2272 index = mid + 1; 2634 index = mid + 1;
2273 else 2635 else
2274 end = mid; 2636 end = mid;
2275 } 2637 }
2276 } 2638 }
2277 /* If we're off the end of the array, we're done */ 2639 /* If we're off the end of the array, we're done */
2278 if (index >= cp->length) 2640 if (index >= l->length)
2279 return NULL; 2641 return NULL;
2280 /* Update the abstract position to be the actual pid that we found */ 2642 /* Update the abstract position to be the actual pid that we found */
2281 iter = cp->tasks_pids + index; 2643 iter = l->list + index;
2282 *pos = *iter; 2644 *pos = *iter;
2283 return iter; 2645 return iter;
2284} 2646}
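The loop above is a plain binary search for the first stored pid that is >= the saved seq_file position, so a read that resumes after some pids have exited still lands on the next surviving entry. The same search, pulled out into a standalone program with a made-up array:

#include <stdio.h>
#include <sys/types.h>

/* Index of the first element >= pid, or length if every element is smaller. */
static int resume_index(const pid_t *list, int length, pid_t pid)
{
        int index = 0, end = length;

        while (index < end) {
                int mid = (index + end) / 2;

                if (list[mid] == pid) {
                        index = mid;
                        break;
                } else if (list[mid] <= pid) {
                        index = mid + 1;
                } else {
                        end = mid;
                }
        }
        return index;
}

int main(void)
{
        pid_t pids[] = { 3, 8, 15, 40, 41 };

        printf("%d\n", resume_index(pids, 5, 15));  /* 2: exact match */
        printf("%d\n", resume_index(pids, 5, 16));  /* 3: next pid after a gap */
        printf("%d\n", resume_index(pids, 5, 99));  /* 5: past the end */
        return 0;
}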
2285 2647
2286static void cgroup_tasks_stop(struct seq_file *s, void *v) 2648static void cgroup_pidlist_stop(struct seq_file *s, void *v)
2287{ 2649{
2288 struct cgroup_pids *cp = s->private; 2650 struct cgroup_pidlist *l = s->private;
2289 struct cgroup *cgrp = cp->cgrp; 2651 up_read(&l->mutex);
2290 up_read(&cgrp->pids_mutex);
2291} 2652}
2292 2653
2293static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2654static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
2294{ 2655{
2295 struct cgroup_pids *cp = s->private; 2656 struct cgroup_pidlist *l = s->private;
2296 int *p = v; 2657 pid_t *p = v;
2297 int *end = cp->tasks_pids + cp->length; 2658 pid_t *end = l->list + l->length;
2298
2299 /* 2659 /*
2300 * Advance to the next pid in the array. If this goes off the 2660 * Advance to the next pid in the array. If this goes off the
2301 * end, we're done 2661 * end, we're done
@@ -2309,124 +2669,107 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2309 } 2669 }
2310} 2670}
2311 2671
2312static int cgroup_tasks_show(struct seq_file *s, void *v) 2672static int cgroup_pidlist_show(struct seq_file *s, void *v)
2313{ 2673{
2314 return seq_printf(s, "%d\n", *(int *)v); 2674 return seq_printf(s, "%d\n", *(int *)v);
2315} 2675}
2316 2676
2317static struct seq_operations cgroup_tasks_seq_operations = { 2677/*
2318 .start = cgroup_tasks_start, 2678 * seq_operations functions for iterating on pidlists through seq_file -
2319 .stop = cgroup_tasks_stop, 2679 * independent of whether it's tasks or procs
2320 .next = cgroup_tasks_next, 2680 */
2321 .show = cgroup_tasks_show, 2681static const struct seq_operations cgroup_pidlist_seq_operations = {
2682 .start = cgroup_pidlist_start,
2683 .stop = cgroup_pidlist_stop,
2684 .next = cgroup_pidlist_next,
2685 .show = cgroup_pidlist_show,
2322}; 2686};
2323 2687
2324static void release_cgroup_pid_array(struct cgroup_pids *cp) 2688static void cgroup_release_pid_array(struct cgroup_pidlist *l)
2325{ 2689{
2326 struct cgroup *cgrp = cp->cgrp; 2690 /*
2327 2691 * the case where we're the last user of this particular pidlist will
2328 down_write(&cgrp->pids_mutex); 2692 * have us remove it from the cgroup's list, which entails taking the
2329 BUG_ON(!cp->use_count); 2693 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
2330 if (!--cp->use_count) { 2694 * pidlist_mutex, we have to take pidlist_mutex first.
2331 list_del(&cp->list); 2695 */
2332 put_pid_ns(cp->ns); 2696 mutex_lock(&l->owner->pidlist_mutex);
2333 kfree(cp->tasks_pids); 2697 down_write(&l->mutex);
2334 kfree(cp); 2698 BUG_ON(!l->use_count);
2699 if (!--l->use_count) {
2700 /* we're the last user if refcount is 0; remove and free */
2701 list_del(&l->links);
2702 mutex_unlock(&l->owner->pidlist_mutex);
2703 pidlist_free(l->list);
2704 put_pid_ns(l->key.ns);
2705 up_write(&l->mutex);
2706 kfree(l);
2707 return;
2335 } 2708 }
2336 up_write(&cgrp->pids_mutex); 2709 mutex_unlock(&l->owner->pidlist_mutex);
2710 up_write(&l->mutex);
2337} 2711}
2338 2712
2339static int cgroup_tasks_release(struct inode *inode, struct file *file) 2713static int cgroup_pidlist_release(struct inode *inode, struct file *file)
2340{ 2714{
2341 struct seq_file *seq; 2715 struct cgroup_pidlist *l;
2342 struct cgroup_pids *cp;
2343
2344 if (!(file->f_mode & FMODE_READ)) 2716 if (!(file->f_mode & FMODE_READ))
2345 return 0; 2717 return 0;
2346 2718 /*
2347 seq = file->private_data; 2719 * the seq_file will only be initialized if the file was opened for
2348 cp = seq->private; 2720 * reading; hence we check if it's not null only in that case.
2349 2721 */
2350 release_cgroup_pid_array(cp); 2722 l = ((struct seq_file *)file->private_data)->private;
2723 cgroup_release_pid_array(l);
2351 return seq_release(inode, file); 2724 return seq_release(inode, file);
2352} 2725}
2353 2726
2354static struct file_operations cgroup_tasks_operations = { 2727static const struct file_operations cgroup_pidlist_operations = {
2355 .read = seq_read, 2728 .read = seq_read,
2356 .llseek = seq_lseek, 2729 .llseek = seq_lseek,
2357 .write = cgroup_file_write, 2730 .write = cgroup_file_write,
2358 .release = cgroup_tasks_release, 2731 .release = cgroup_pidlist_release,
2359}; 2732};
2360 2733
2361/* 2734/*
2362 * Handle an open on 'tasks' file. Prepare an array containing the 2735 * The following functions handle opens on a file that displays a pidlist
2363 * process id's of tasks currently attached to the cgroup being opened. 2736 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
2737 * in the cgroup.
2364 */ 2738 */
2365 2739/* helper function for the two below it */
2366static int cgroup_tasks_open(struct inode *unused, struct file *file) 2740static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
2367{ 2741{
2368 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2742 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2369 struct pid_namespace *ns = current->nsproxy->pid_ns; 2743 struct cgroup_pidlist *l;
2370 struct cgroup_pids *cp;
2371 pid_t *pidarray;
2372 int npids;
2373 int retval; 2744 int retval;
2374 2745
2375 /* Nothing to do for write-only files */ 2746 /* Nothing to do for write-only files */
2376 if (!(file->f_mode & FMODE_READ)) 2747 if (!(file->f_mode & FMODE_READ))
2377 return 0; 2748 return 0;
2378 2749
2379 /* 2750 /* have the array populated */
2380 * If cgroup gets more users after we read count, we won't have 2751 retval = pidlist_array_load(cgrp, type, &l);
2381 * enough space - tough. This race is indistinguishable to the 2752 if (retval)
2382 * caller from the case that the additional cgroup users didn't 2753 return retval;
2383 * show up until sometime later on. 2754 /* configure file information */
2384 */ 2755 file->f_op = &cgroup_pidlist_operations;
2385 npids = cgroup_task_count(cgrp);
2386 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
2387 if (!pidarray)
2388 return -ENOMEM;
2389 npids = pid_array_load(pidarray, npids, cgrp);
2390 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2391
2392 /*
2393 * Store the array in the cgroup, freeing the old
2394 * array if necessary
2395 */
2396 down_write(&cgrp->pids_mutex);
2397
2398 list_for_each_entry(cp, &cgrp->pids_list, list) {
2399 if (ns == cp->ns)
2400 goto found;
2401 }
2402
2403 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2404 if (!cp) {
2405 up_write(&cgrp->pids_mutex);
2406 kfree(pidarray);
2407 return -ENOMEM;
2408 }
2409 cp->cgrp = cgrp;
2410 cp->ns = ns;
2411 get_pid_ns(ns);
2412 list_add(&cp->list, &cgrp->pids_list);
2413found:
2414 kfree(cp->tasks_pids);
2415 cp->tasks_pids = pidarray;
2416 cp->length = npids;
2417 cp->use_count++;
2418 up_write(&cgrp->pids_mutex);
2419
2420 file->f_op = &cgroup_tasks_operations;
2421 2756
2422 retval = seq_open(file, &cgroup_tasks_seq_operations); 2757 retval = seq_open(file, &cgroup_pidlist_seq_operations);
2423 if (retval) { 2758 if (retval) {
2424 release_cgroup_pid_array(cp); 2759 cgroup_release_pid_array(l);
2425 return retval; 2760 return retval;
2426 } 2761 }
2427 ((struct seq_file *)file->private_data)->private = cp; 2762 ((struct seq_file *)file->private_data)->private = l;
2428 return 0; 2763 return 0;
2429} 2764}
2765static int cgroup_tasks_open(struct inode *unused, struct file *file)
2766{
2767 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
2768}
2769static int cgroup_procs_open(struct inode *unused, struct file *file)
2770{
2771 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
2772}
2430 2773
2431static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 2774static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2432 struct cftype *cft) 2775 struct cftype *cft)
@@ -2449,21 +2792,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2449/* 2792/*
2450 * for the common functions, 'private' gives the type of file 2793 * for the common functions, 'private' gives the type of file
2451 */ 2794 */
2795/* for hysterical raisins, we can't put this on the older files */
2796#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
2452static struct cftype files[] = { 2797static struct cftype files[] = {
2453 { 2798 {
2454 .name = "tasks", 2799 .name = "tasks",
2455 .open = cgroup_tasks_open, 2800 .open = cgroup_tasks_open,
2456 .write_u64 = cgroup_tasks_write, 2801 .write_u64 = cgroup_tasks_write,
2457 .release = cgroup_tasks_release, 2802 .release = cgroup_pidlist_release,
2458 .private = FILE_TASKLIST,
2459 .mode = S_IRUGO | S_IWUSR, 2803 .mode = S_IRUGO | S_IWUSR,
2460 }, 2804 },
2461 2805 {
2806 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
2807 .open = cgroup_procs_open,
2808 /* .write_u64 = cgroup_procs_write, TODO */
2809 .release = cgroup_pidlist_release,
2810 .mode = S_IRUGO,
2811 },
2462 { 2812 {
2463 .name = "notify_on_release", 2813 .name = "notify_on_release",
2464 .read_u64 = cgroup_read_notify_on_release, 2814 .read_u64 = cgroup_read_notify_on_release,
2465 .write_u64 = cgroup_write_notify_on_release, 2815 .write_u64 = cgroup_write_notify_on_release,
2466 .private = FILE_NOTIFY_ON_RELEASE,
2467 }, 2816 },
2468}; 2817};
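With this table, every cgroup directory gains a read-only "cgroup.procs" alongside the existing "tasks": the former lists one tgid per thread group, the latter one pid per thread (writing to cgroup.procs is still a TODO here). A throwaway reader, assuming some hierarchy is already mounted and that /cgroup/demo is a cgroup directory (both paths are assumptions):

#include <stdio.h>

static void dump(const char *path)
{
        char line[64];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return;
        }
        printf("%s:\n", path);
        while (fgets(line, sizeof(line), f))
                printf("  %s", line);   /* one id per line, already newline-terminated */
        fclose(f);
}

int main(void)
{
        /* Adjust to wherever the hierarchy is actually mounted. */
        dump("/cgroup/demo/tasks");
        dump("/cgroup/demo/cgroup.procs");
        return 0;
}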
2469 2818
@@ -2472,7 +2821,6 @@ static struct cftype cft_release_agent = {
2472 .read_seq_string = cgroup_release_agent_show, 2821 .read_seq_string = cgroup_release_agent_show,
2473 .write_string = cgroup_release_agent_write, 2822 .write_string = cgroup_release_agent_write,
2474 .max_write_len = PATH_MAX, 2823 .max_write_len = PATH_MAX,
2475 .private = FILE_RELEASE_AGENT,
2476}; 2824};
2477 2825
2478static int cgroup_populate_dir(struct cgroup *cgrp) 2826static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -2879,6 +3227,7 @@ int __init cgroup_init_early(void)
2879 init_task.cgroups = &init_css_set; 3227 init_task.cgroups = &init_css_set;
2880 3228
2881 init_css_set_link.cg = &init_css_set; 3229 init_css_set_link.cg = &init_css_set;
3230 init_css_set_link.cgrp = dummytop;
2882 list_add(&init_css_set_link.cgrp_link_list, 3231 list_add(&init_css_set_link.cgrp_link_list,
2883 &rootnode.top_cgroup.css_sets); 3232 &rootnode.top_cgroup.css_sets);
2884 list_add(&init_css_set_link.cg_link_list, 3233 list_add(&init_css_set_link.cg_link_list,
@@ -2933,7 +3282,7 @@ int __init cgroup_init(void)
2933 /* Add init_css_set to the hash table */ 3282 /* Add init_css_set to the hash table */
2934 hhead = css_set_hash(init_css_set.subsys); 3283 hhead = css_set_hash(init_css_set.subsys);
2935 hlist_add_head(&init_css_set.hlist, hhead); 3284 hlist_add_head(&init_css_set.hlist, hhead);
2936 3285 BUG_ON(!init_root_id(&rootnode));
2937 err = register_filesystem(&cgroup_fs_type); 3286 err = register_filesystem(&cgroup_fs_type);
2938 if (err < 0) 3287 if (err < 0)
2939 goto out; 3288 goto out;
@@ -2986,15 +3335,16 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2986 for_each_active_root(root) { 3335 for_each_active_root(root) {
2987 struct cgroup_subsys *ss; 3336 struct cgroup_subsys *ss;
2988 struct cgroup *cgrp; 3337 struct cgroup *cgrp;
2989 int subsys_id;
2990 int count = 0; 3338 int count = 0;
2991 3339
2992 seq_printf(m, "%lu:", root->subsys_bits); 3340 seq_printf(m, "%d:", root->hierarchy_id);
2993 for_each_subsys(root, ss) 3341 for_each_subsys(root, ss)
2994 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 3342 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
3343 if (strlen(root->name))
3344 seq_printf(m, "%sname=%s", count ? "," : "",
3345 root->name);
2995 seq_putc(m, ':'); 3346 seq_putc(m, ':');
2996 get_first_subsys(&root->top_cgroup, NULL, &subsys_id); 3347 cgrp = task_cgroup_from_root(tsk, root);
2997 cgrp = task_cgroup(tsk, subsys_id);
2998 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 3348 retval = cgroup_path(cgrp, buf, PAGE_SIZE);
2999 if (retval < 0) 3349 if (retval < 0)
3000 goto out_unlock; 3350 goto out_unlock;
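After this hunk each line of /proc/<pid>/cgroup reads: hierarchy id, a colon, the comma-separated subsystems bound to that hierarchy (plus name=... for a named hierarchy), another colon, and the cgroup path. A short reader for the current process; splitting on the first two colons keeps an empty middle field intact:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[512];
        FILE *f = fopen("/proc/self/cgroup", "r");

        if (!f) {
                perror("/proc/self/cgroup");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                char *c1 = strchr(line, ':');
                char *c2 = c1 ? strchr(c1 + 1, ':') : NULL;

                if (!c1 || !c2)
                        continue;
                *c1 = '\0';
                *c2 = '\0';
                /* the path still carries the trailing newline from fgets() */
                printf("hierarchy %s  controllers [%s]  path %s",
                       line, c1 + 1, c2 + 1);
        }
        fclose(f);
        return 0;
}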
@@ -3017,7 +3367,7 @@ static int cgroup_open(struct inode *inode, struct file *file)
3017 return single_open(file, proc_cgroup_show, pid); 3367 return single_open(file, proc_cgroup_show, pid);
3018} 3368}
3019 3369
3020struct file_operations proc_cgroup_operations = { 3370const struct file_operations proc_cgroup_operations = {
3021 .open = cgroup_open, 3371 .open = cgroup_open,
3022 .read = seq_read, 3372 .read = seq_read,
3023 .llseek = seq_lseek, 3373 .llseek = seq_lseek,
@@ -3033,8 +3383,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3033 mutex_lock(&cgroup_mutex); 3383 mutex_lock(&cgroup_mutex);
3034 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3384 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3035 struct cgroup_subsys *ss = subsys[i]; 3385 struct cgroup_subsys *ss = subsys[i];
3036 seq_printf(m, "%s\t%lu\t%d\t%d\n", 3386 seq_printf(m, "%s\t%d\t%d\t%d\n",
3037 ss->name, ss->root->subsys_bits, 3387 ss->name, ss->root->hierarchy_id,
3038 ss->root->number_of_cgroups, !ss->disabled); 3388 ss->root->number_of_cgroups, !ss->disabled);
3039 } 3389 }
3040 mutex_unlock(&cgroup_mutex); 3390 mutex_unlock(&cgroup_mutex);
@@ -3046,7 +3396,7 @@ static int cgroupstats_open(struct inode *inode, struct file *file)
3046 return single_open(file, proc_cgroupstats_show, NULL); 3396 return single_open(file, proc_cgroupstats_show, NULL);
3047} 3397}
3048 3398
3049static struct file_operations proc_cgroupstats_operations = { 3399static const struct file_operations proc_cgroupstats_operations = {
3050 .open = cgroupstats_open, 3400 .open = cgroupstats_open,
3051 .read = seq_read, 3401 .read = seq_read,
3052 .llseek = seq_lseek, 3402 .llseek = seq_lseek,
@@ -3320,13 +3670,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
3320{ 3670{
3321 int ret; 3671 int ret;
3322 struct cgroup *target; 3672 struct cgroup *target;
3323 int subsys_id;
3324 3673
3325 if (cgrp == dummytop) 3674 if (cgrp == dummytop)
3326 return 1; 3675 return 1;
3327 3676
3328 get_first_subsys(cgrp, NULL, &subsys_id); 3677 target = task_cgroup_from_root(task, cgrp->root);
3329 target = task_cgroup(task, subsys_id);
 3330 while (cgrp != target && cgrp!= cgrp->top_cgroup) 3678 while (cgrp != target && cgrp!= cgrp->top_cgroup)
 3331 cgrp = cgrp->parent; 3679 cgrp = cgrp->parent;
 3332 ret = (cgrp == target); 3680 ret = (cgrp == target);
@@ -3358,8 +3706,10 @@ static void check_for_release(struct cgroup *cgrp)
3358void __css_put(struct cgroup_subsys_state *css) 3706void __css_put(struct cgroup_subsys_state *css)
3359{ 3707{
3360 struct cgroup *cgrp = css->cgroup; 3708 struct cgroup *cgrp = css->cgroup;
3709 int val;
3361 rcu_read_lock(); 3710 rcu_read_lock();
3362 if (atomic_dec_return(&css->refcnt) == 1) { 3711 val = atomic_dec_return(&css->refcnt);
3712 if (val == 1) {
3363 if (notify_on_release(cgrp)) { 3713 if (notify_on_release(cgrp)) {
3364 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3714 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3365 check_for_release(cgrp); 3715 check_for_release(cgrp);
@@ -3367,6 +3717,7 @@ void __css_put(struct cgroup_subsys_state *css)
3367 cgroup_wakeup_rmdir_waiter(cgrp); 3717 cgroup_wakeup_rmdir_waiter(cgrp);
3368 } 3718 }
3369 rcu_read_unlock(); 3719 rcu_read_unlock();
3720 WARN_ON_ONCE(val < 1);
3370} 3721}
3371 3722
3372/* 3723/*
@@ -3693,3 +4044,154 @@ css_get_next(struct cgroup_subsys *ss, int id,
3693 return ret; 4044 return ret;
3694} 4045}
3695 4046
4047#ifdef CONFIG_CGROUP_DEBUG
4048static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4049 struct cgroup *cont)
4050{
4051 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
4052
4053 if (!css)
4054 return ERR_PTR(-ENOMEM);
4055
4056 return css;
4057}
4058
4059static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
4060{
4061 kfree(cont->subsys[debug_subsys_id]);
4062}
4063
4064static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
4065{
4066 return atomic_read(&cont->count);
4067}
4068
4069static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
4070{
4071 return cgroup_task_count(cont);
4072}
4073
4074static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
4075{
4076 return (u64)(unsigned long)current->cgroups;
4077}
4078
4079static u64 current_css_set_refcount_read(struct cgroup *cont,
4080 struct cftype *cft)
4081{
4082 u64 count;
4083
4084 rcu_read_lock();
4085 count = atomic_read(&current->cgroups->refcount);
4086 rcu_read_unlock();
4087 return count;
4088}
4089
4090static int current_css_set_cg_links_read(struct cgroup *cont,
4091 struct cftype *cft,
4092 struct seq_file *seq)
4093{
4094 struct cg_cgroup_link *link;
4095 struct css_set *cg;
4096
4097 read_lock(&css_set_lock);
4098 rcu_read_lock();
4099 cg = rcu_dereference(current->cgroups);
4100 list_for_each_entry(link, &cg->cg_links, cg_link_list) {
4101 struct cgroup *c = link->cgrp;
4102 const char *name;
4103
4104 if (c->dentry)
4105 name = c->dentry->d_name.name;
4106 else
4107 name = "?";
4108 seq_printf(seq, "Root %d group %s\n",
4109 c->root->hierarchy_id, name);
4110 }
4111 rcu_read_unlock();
4112 read_unlock(&css_set_lock);
4113 return 0;
4114}
4115
4116#define MAX_TASKS_SHOWN_PER_CSS 25
4117static int cgroup_css_links_read(struct cgroup *cont,
4118 struct cftype *cft,
4119 struct seq_file *seq)
4120{
4121 struct cg_cgroup_link *link;
4122
4123 read_lock(&css_set_lock);
4124 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
4125 struct css_set *cg = link->cg;
4126 struct task_struct *task;
4127 int count = 0;
4128 seq_printf(seq, "css_set %p\n", cg);
4129 list_for_each_entry(task, &cg->tasks, cg_list) {
4130 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
4131 seq_puts(seq, " ...\n");
4132 break;
4133 } else {
4134 seq_printf(seq, " task %d\n",
4135 task_pid_vnr(task));
4136 }
4137 }
4138 }
4139 read_unlock(&css_set_lock);
4140 return 0;
4141}
4142
4143static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
4144{
4145 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
4146}
4147
4148static struct cftype debug_files[] = {
4149 {
4150 .name = "cgroup_refcount",
4151 .read_u64 = cgroup_refcount_read,
4152 },
4153 {
4154 .name = "taskcount",
4155 .read_u64 = debug_taskcount_read,
4156 },
4157
4158 {
4159 .name = "current_css_set",
4160 .read_u64 = current_css_set_read,
4161 },
4162
4163 {
4164 .name = "current_css_set_refcount",
4165 .read_u64 = current_css_set_refcount_read,
4166 },
4167
4168 {
4169 .name = "current_css_set_cg_links",
4170 .read_seq_string = current_css_set_cg_links_read,
4171 },
4172
4173 {
4174 .name = "cgroup_css_links",
4175 .read_seq_string = cgroup_css_links_read,
4176 },
4177
4178 {
4179 .name = "releasable",
4180 .read_u64 = releasable_read,
4181 },
4182};
4183
4184static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
4185{
4186 return cgroup_add_files(cont, ss, debug_files,
4187 ARRAY_SIZE(debug_files));
4188}
4189
4190struct cgroup_subsys debug_subsys = {
4191 .name = "debug",
4192 .create = debug_create,
4193 .destroy = debug_destroy,
4194 .populate = debug_populate,
4195 .subsys_id = debug_subsys_id,
4196};
4197#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
deleted file mode 100644
index 0c92d797baa6..000000000000
--- a/kernel/cgroup_debug.c
+++ /dev/null
@@ -1,105 +0,0 @@
1/*
2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info
4 *
5 * Copyright (C) Google Inc, 2007
6 *
7 * Developed by Paul Menage (menage@google.com)
8 *
9 */
10
11#include <linux/cgroup.h>
12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/rcupdate.h>
15
16#include <asm/atomic.h>
17
18static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
19 struct cgroup *cont)
20{
21 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
22
23 if (!css)
24 return ERR_PTR(-ENOMEM);
25
26 return css;
27}
28
29static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
30{
31 kfree(cont->subsys[debug_subsys_id]);
32}
33
34static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
35{
36 return atomic_read(&cont->count);
37}
38
39static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
40{
41 u64 count;
42
43 count = cgroup_task_count(cont);
44 return count;
45}
46
47static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
48{
49 return (u64)(long)current->cgroups;
50}
51
52static u64 current_css_set_refcount_read(struct cgroup *cont,
53 struct cftype *cft)
54{
55 u64 count;
56
57 rcu_read_lock();
58 count = atomic_read(&current->cgroups->refcount);
59 rcu_read_unlock();
60 return count;
61}
62
63static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
64{
65 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
66}
67
68static struct cftype files[] = {
69 {
70 .name = "cgroup_refcount",
71 .read_u64 = cgroup_refcount_read,
72 },
73 {
74 .name = "taskcount",
75 .read_u64 = taskcount_read,
76 },
77
78 {
79 .name = "current_css_set",
80 .read_u64 = current_css_set_read,
81 },
82
83 {
84 .name = "current_css_set_refcount",
85 .read_u64 = current_css_set_refcount_read,
86 },
87
88 {
89 .name = "releasable",
90 .read_u64 = releasable_read,
91 },
92};
93
94static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
95{
96 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
97}
98
99struct cgroup_subsys debug_subsys = {
100 .name = "debug",
101 .create = debug_create,
102 .destroy = debug_destroy,
103 .populate = debug_populate,
104 .subsys_id = debug_subsys_id,
105};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fb249e2bcada..59e9ef6aab40 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
159 */ 159 */
160static int freezer_can_attach(struct cgroup_subsys *ss, 160static int freezer_can_attach(struct cgroup_subsys *ss,
161 struct cgroup *new_cgroup, 161 struct cgroup *new_cgroup,
162 struct task_struct *task) 162 struct task_struct *task, bool threadgroup)
163{ 163{
164 struct freezer *freezer; 164 struct freezer *freezer;
165 165
@@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
177 if (freezer->state == CGROUP_FROZEN) 177 if (freezer->state == CGROUP_FROZEN)
178 return -EBUSY; 178 return -EBUSY;
179 179
180 if (threadgroup) {
181 struct task_struct *c;
182
183 rcu_read_lock();
184 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
185 if (is_task_frozen_enough(c)) {
186 rcu_read_unlock();
187 return -EBUSY;
188 }
189 }
190 rcu_read_unlock();
191 }
192
180 return 0; 193 return 0;
181} 194}
182 195
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ce10043e4ac..6ba0f1ecb212 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -401,6 +401,7 @@ int disable_nonboot_cpus(void)
401 break; 401 break;
402 } 402 }
403 } 403 }
404
404 if (!error) { 405 if (!error) {
405 BUG_ON(num_online_cpus() > 1); 406 BUG_ON(num_online_cpus() > 1);
406 /* Make sure the CPUs won't be enabled by someone else */ 407 /* Make sure the CPUs won't be enabled by someone else */
@@ -413,6 +414,14 @@ int disable_nonboot_cpus(void)
413 return error; 414 return error;
414} 415}
415 416
417void __weak arch_enable_nonboot_cpus_begin(void)
418{
419}
420
421void __weak arch_enable_nonboot_cpus_end(void)
422{
423}
424
416void __ref enable_nonboot_cpus(void) 425void __ref enable_nonboot_cpus(void)
417{ 426{
418 int cpu, error; 427 int cpu, error;
@@ -424,6 +433,9 @@ void __ref enable_nonboot_cpus(void)
424 goto out; 433 goto out;
425 434
426 printk("Enabling non-boot CPUs ...\n"); 435 printk("Enabling non-boot CPUs ...\n");
436
437 arch_enable_nonboot_cpus_begin();
438
427 for_each_cpu(cpu, frozen_cpus) { 439 for_each_cpu(cpu, frozen_cpus) {
428 error = _cpu_up(cpu, 1); 440 error = _cpu_up(cpu, 1);
429 if (!error) { 441 if (!error) {
@@ -432,6 +444,9 @@ void __ref enable_nonboot_cpus(void)
432 } 444 }
433 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 445 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
434 } 446 }
447
448 arch_enable_nonboot_cpus_end();
449
435 cpumask_clear(frozen_cpus); 450 cpumask_clear(frozen_cpus);
436out: 451out:
437 cpu_maps_update_done(); 452 cpu_maps_update_done();
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7e75a41bd508..3cf2183b472d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
537 * element of the partition (one sched domain) to be passed to 537 * element of the partition (one sched domain) to be passed to
538 * partition_sched_domains(). 538 * partition_sched_domains().
539 */ 539 */
540/* FIXME: see the FIXME in partition_sched_domains() */ 540static int generate_sched_domains(cpumask_var_t **domains,
541static int generate_sched_domains(struct cpumask **domains,
542 struct sched_domain_attr **attributes) 541 struct sched_domain_attr **attributes)
543{ 542{
544 LIST_HEAD(q); /* queue of cpusets to be scanned */ 543 LIST_HEAD(q); /* queue of cpusets to be scanned */
@@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains,
546 struct cpuset **csa; /* array of all cpuset ptrs */ 545 struct cpuset **csa; /* array of all cpuset ptrs */
547 int csn; /* how many cpuset ptrs in csa so far */ 546 int csn; /* how many cpuset ptrs in csa so far */
548 int i, j, k; /* indices for partition finding loops */ 547 int i, j, k; /* indices for partition finding loops */
549 struct cpumask *doms; /* resulting partition; i.e. sched domains */ 548 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
550 struct sched_domain_attr *dattr; /* attributes for custom domains */ 549 struct sched_domain_attr *dattr; /* attributes for custom domains */
551 int ndoms = 0; /* number of sched domains in result */ 550 int ndoms = 0; /* number of sched domains in result */
552 int nslot; /* next empty doms[] struct cpumask slot */ 551 int nslot; /* next empty doms[] struct cpumask slot */
@@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains,
557 556
558 /* Special case for the 99% of systems with one, full, sched domain */ 557 /* Special case for the 99% of systems with one, full, sched domain */
559 if (is_sched_load_balance(&top_cpuset)) { 558 if (is_sched_load_balance(&top_cpuset)) {
560 doms = kmalloc(cpumask_size(), GFP_KERNEL); 559 ndoms = 1;
560 doms = alloc_sched_domains(ndoms);
561 if (!doms) 561 if (!doms)
562 goto done; 562 goto done;
563 563
@@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains,
566 *dattr = SD_ATTR_INIT; 566 *dattr = SD_ATTR_INIT;
567 update_domain_attr_tree(dattr, &top_cpuset); 567 update_domain_attr_tree(dattr, &top_cpuset);
568 } 568 }
569 cpumask_copy(doms, top_cpuset.cpus_allowed); 569 cpumask_copy(doms[0], top_cpuset.cpus_allowed);
570 570
571 ndoms = 1;
572 goto done; 571 goto done;
573 } 572 }
574 573
@@ -636,7 +635,7 @@ restart:
636 * Now we know how many domains to create. 635 * Now we know how many domains to create.
637 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 636 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
638 */ 637 */
639 doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); 638 doms = alloc_sched_domains(ndoms);
640 if (!doms) 639 if (!doms)
641 goto done; 640 goto done;
642 641
@@ -656,7 +655,7 @@ restart:
656 continue; 655 continue;
657 } 656 }
658 657
659 dp = doms + nslot; 658 dp = doms[nslot];
660 659
661 if (nslot == ndoms) { 660 if (nslot == ndoms) {
662 static int warnings = 10; 661 static int warnings = 10;
@@ -718,7 +717,7 @@ done:
718static void do_rebuild_sched_domains(struct work_struct *unused) 717static void do_rebuild_sched_domains(struct work_struct *unused)
719{ 718{
720 struct sched_domain_attr *attr; 719 struct sched_domain_attr *attr;
721 struct cpumask *doms; 720 cpumask_var_t *doms;
722 int ndoms; 721 int ndoms;
723 722
724 get_online_cpus(); 723 get_online_cpus();
@@ -1324,9 +1323,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1324static cpumask_var_t cpus_attach; 1323static cpumask_var_t cpus_attach;
1325 1324
1326/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1325/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1327static int cpuset_can_attach(struct cgroup_subsys *ss, 1326static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1328 struct cgroup *cont, struct task_struct *tsk) 1327 struct task_struct *tsk, bool threadgroup)
1329{ 1328{
1329 int ret;
1330 struct cpuset *cs = cgroup_cs(cont); 1330 struct cpuset *cs = cgroup_cs(cont);
1331 1331
1332 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1332 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1343,18 +1343,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1343 if (tsk->flags & PF_THREAD_BOUND) 1343 if (tsk->flags & PF_THREAD_BOUND)
1344 return -EINVAL; 1344 return -EINVAL;
1345 1345
1346 return security_task_setscheduler(tsk, 0, NULL); 1346 ret = security_task_setscheduler(tsk, 0, NULL);
1347 if (ret)
1348 return ret;
1349 if (threadgroup) {
1350 struct task_struct *c;
1351
1352 rcu_read_lock();
1353 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1354 ret = security_task_setscheduler(c, 0, NULL);
1355 if (ret) {
1356 rcu_read_unlock();
1357 return ret;
1358 }
1359 }
1360 rcu_read_unlock();
1361 }
1362 return 0;
1347} 1363}
1348 1364
1349static void cpuset_attach(struct cgroup_subsys *ss, 1365static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1350 struct cgroup *cont, struct cgroup *oldcont, 1366 struct cpuset *cs)
1351 struct task_struct *tsk) 1367{
1368 int err;
1369 /*
1370 * can_attach beforehand should guarantee that this doesn't fail.
1371 * TODO: have a better way to handle failure here
1372 */
1373 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1374 WARN_ON_ONCE(err);
1375
1376 task_lock(tsk);
1377 cpuset_change_task_nodemask(tsk, to);
1378 task_unlock(tsk);
1379 cpuset_update_task_spread_flag(cs, tsk);
1380
1381}
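cpuset_attach_task() pins each task to the cpuset's CPU mask with set_cpus_allowed_ptr() and then updates its memory policy; the closest userspace relative of the CPU half is sched_setaffinity(). A hedged sketch that merely restricts the calling process to CPU 0 (nothing here touches cpusets, and the CPU number is an assumption):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t mask;

        CPU_ZERO(&mask);
        CPU_SET(0, &mask);              /* CPU 0 is assumed to be online */
        if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
                perror("sched_setaffinity");
                return 1;
        }
        printf("now restricted to CPU 0, currently on CPU %d\n", sched_getcpu());
        return 0;
}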
1382
1383static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1384 struct cgroup *oldcont, struct task_struct *tsk,
1385 bool threadgroup)
1352{ 1386{
1353 nodemask_t from, to; 1387 nodemask_t from, to;
1354 struct mm_struct *mm; 1388 struct mm_struct *mm;
1355 struct cpuset *cs = cgroup_cs(cont); 1389 struct cpuset *cs = cgroup_cs(cont);
1356 struct cpuset *oldcs = cgroup_cs(oldcont); 1390 struct cpuset *oldcs = cgroup_cs(oldcont);
1357 int err;
1358 1391
1359 if (cs == &top_cpuset) { 1392 if (cs == &top_cpuset) {
1360 cpumask_copy(cpus_attach, cpu_possible_mask); 1393 cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1363,15 +1396,19 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1363 guarantee_online_cpus(cs, cpus_attach); 1396 guarantee_online_cpus(cs, cpus_attach);
1364 guarantee_online_mems(cs, &to); 1397 guarantee_online_mems(cs, &to);
1365 } 1398 }
1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1367 if (err)
1368 return;
1369 1399
1370 task_lock(tsk); 1400 /* do per-task migration stuff possibly for each in the threadgroup */
1371 cpuset_change_task_nodemask(tsk, &to); 1401 cpuset_attach_task(tsk, &to, cs);
1372 task_unlock(tsk); 1402 if (threadgroup) {
1373 cpuset_update_task_spread_flag(cs, tsk); 1403 struct task_struct *c;
1404 rcu_read_lock();
1405 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1406 cpuset_attach_task(c, &to, cs);
1407 }
1408 rcu_read_unlock();
1409 }
1374 1410
1411 /* change mm; only needs to be done once even if threadgroup */
1375 from = oldcs->mems_allowed; 1412 from = oldcs->mems_allowed;
1376 to = cs->mems_allowed; 1413 to = cs->mems_allowed;
1377 mm = get_task_mm(tsk); 1414 mm = get_task_mm(tsk);
@@ -2014,7 +2051,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2014 unsigned long phase, void *unused_cpu) 2051 unsigned long phase, void *unused_cpu)
2015{ 2052{
2016 struct sched_domain_attr *attr; 2053 struct sched_domain_attr *attr;
2017 struct cpumask *doms; 2054 cpumask_var_t *doms;
2018 int ndoms; 2055 int ndoms;
2019 2056
2020 switch (phase) { 2057 switch (phase) {
@@ -2499,15 +2536,9 @@ const struct file_operations proc_cpuset_operations = {
2499}; 2536};
2500#endif /* CONFIG_PROC_PID_CPUSET */ 2537#endif /* CONFIG_PROC_PID_CPUSET */
2501 2538
2502/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ 2539/* Display task mems_allowed in /proc/<pid>/status file. */
2503void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2540void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2504{ 2541{
2505 seq_printf(m, "Cpus_allowed:\t");
2506 seq_cpumask(m, &task->cpus_allowed);
2507 seq_printf(m, "\n");
2508 seq_printf(m, "Cpus_allowed_list:\t");
2509 seq_cpumask_list(m, &task->cpus_allowed);
2510 seq_printf(m, "\n");
2511 seq_printf(m, "Mems_allowed:\t"); 2542 seq_printf(m, "Mems_allowed:\t");
2512 seq_nodemask(m, &task->mems_allowed); 2543 seq_nodemask(m, &task->mems_allowed);
2513 seq_printf(m, "\n"); 2544 seq_printf(m, "\n");
diff --git a/kernel/cred.c b/kernel/cred.c
index 006fcab009d5..dd76cfe5f5b0 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -147,7 +147,8 @@ static void put_cred_rcu(struct rcu_head *rcu)
147 key_put(cred->thread_keyring); 147 key_put(cred->thread_keyring);
148 key_put(cred->request_key_auth); 148 key_put(cred->request_key_auth);
149 release_tgcred(cred); 149 release_tgcred(cred);
150 put_group_info(cred->group_info); 150 if (cred->group_info)
151 put_group_info(cred->group_info);
151 free_uid(cred->user); 152 free_uid(cred->user);
152 kmem_cache_free(cred_jar, cred); 153 kmem_cache_free(cred_jar, cred);
153} 154}
@@ -781,6 +782,25 @@ EXPORT_SYMBOL(set_create_files_as);
781 782
782#ifdef CONFIG_DEBUG_CREDENTIALS 783#ifdef CONFIG_DEBUG_CREDENTIALS
783 784
785bool creds_are_invalid(const struct cred *cred)
786{
787 if (cred->magic != CRED_MAGIC)
788 return true;
789 if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
790 return true;
791#ifdef CONFIG_SECURITY_SELINUX
792 if (selinux_is_enabled()) {
793 if ((unsigned long) cred->security < PAGE_SIZE)
794 return true;
795 if ((*(u32 *)cred->security & 0xffffff00) ==
796 (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
797 return true;
798 }
799#endif
800 return false;
801}
802EXPORT_SYMBOL(creds_are_invalid);
803
784/* 804/*
785 * dump invalid credentials 805 * dump invalid credentials
786 */ 806 */
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index abb6e17505e2..ead9b610aa71 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/taskstats.h>
18#include <linux/time.h> 19#include <linux/time.h>
19#include <linux/sysctl.h> 20#include <linux/sysctl.h>
20#include <linux/delayacct.h> 21#include <linux/delayacct.h>
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
deleted file mode 100644
index 962a3b574f21..000000000000
--- a/kernel/dma-coherent.c
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * Coherent per-device memory handling.
3 * Borrowed from i386
4 */
5#include <linux/kernel.h>
6#include <linux/dma-mapping.h>
7
8struct dma_coherent_mem {
9 void *virt_base;
10 u32 device_base;
11 int size;
12 int flags;
13 unsigned long *bitmap;
14};
15
16int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
17 dma_addr_t device_addr, size_t size, int flags)
18{
19 void __iomem *mem_base = NULL;
20 int pages = size >> PAGE_SHIFT;
21 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
22
23 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
24 goto out;
25 if (!size)
26 goto out;
27 if (dev->dma_mem)
28 goto out;
29
30 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
31
32 mem_base = ioremap(bus_addr, size);
33 if (!mem_base)
34 goto out;
35
36 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
37 if (!dev->dma_mem)
38 goto out;
39 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
40 if (!dev->dma_mem->bitmap)
41 goto free1_out;
42
43 dev->dma_mem->virt_base = mem_base;
44 dev->dma_mem->device_base = device_addr;
45 dev->dma_mem->size = pages;
46 dev->dma_mem->flags = flags;
47
48 if (flags & DMA_MEMORY_MAP)
49 return DMA_MEMORY_MAP;
50
51 return DMA_MEMORY_IO;
52
53 free1_out:
54 kfree(dev->dma_mem);
55 out:
56 if (mem_base)
57 iounmap(mem_base);
58 return 0;
59}
60EXPORT_SYMBOL(dma_declare_coherent_memory);
61
62void dma_release_declared_memory(struct device *dev)
63{
64 struct dma_coherent_mem *mem = dev->dma_mem;
65
66 if (!mem)
67 return;
68 dev->dma_mem = NULL;
69 iounmap(mem->virt_base);
70 kfree(mem->bitmap);
71 kfree(mem);
72}
73EXPORT_SYMBOL(dma_release_declared_memory);
74
75void *dma_mark_declared_memory_occupied(struct device *dev,
76 dma_addr_t device_addr, size_t size)
77{
78 struct dma_coherent_mem *mem = dev->dma_mem;
79 int pos, err;
80
81 size += device_addr & ~PAGE_MASK;
82
83 if (!mem)
84 return ERR_PTR(-EINVAL);
85
86 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
87 err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
88 if (err != 0)
89 return ERR_PTR(err);
90 return mem->virt_base + (pos << PAGE_SHIFT);
91}
92EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
93
94/**
95 * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
96 *
97 * @dev: device from which we allocate memory
98 * @size: size of requested memory area
99 * @dma_handle: This will be filled with the correct dma handle
100 * @ret: This pointer will be filled with the virtual address
101 * to allocated area.
102 *
103 * This function should be only called from per-arch dma_alloc_coherent()
104 * to support allocation from per-device coherent memory pools.
105 *
106 * Returns 0 if dma_alloc_coherent should continue with allocating from
107 * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
108 */
109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret)
111{
112 struct dma_coherent_mem *mem;
113 int order = get_order(size);
114 int pageno;
115
116 if (!dev)
117 return 0;
118 mem = dev->dma_mem;
119 if (!mem)
120 return 0;
121
122 *ret = NULL;
123
124 if (unlikely(size > (mem->size << PAGE_SHIFT)))
125 goto err;
126
127 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
128 if (unlikely(pageno < 0))
129 goto err;
130
131 /*
132 * Memory was found in the per-device area.
133 */
134 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
135 *ret = mem->virt_base + (pageno << PAGE_SHIFT);
136 memset(*ret, 0, size);
137
138 return 1;
139
140err:
141 /*
142 * In the case where the allocation can not be satisfied from the
143 * per-device area, try to fall back to generic memory if the
144 * constraints allow it.
145 */
146 return mem->flags & DMA_MEMORY_EXCLUSIVE;
147}
148EXPORT_SYMBOL(dma_alloc_from_coherent);
149
150/**
151 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
152 * @dev: device from which the memory was allocated
153 * @order: the order of pages allocated
154 * @vaddr: virtual address of allocated pages
155 *
156 * This checks whether the memory was allocated from the per-device
157 * coherent memory pool and if so, releases that memory.
158 *
159 * Returns 1 if we correctly released the memory, or 0 if
160 * dma_release_coherent() should proceed with releasing memory from
161 * generic pools.
162 */
163int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
164{
165 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
166
167 if (mem && vaddr >= mem->virt_base && vaddr <
168 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
169 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
170
171 bitmap_release_region(mem->bitmap, page, order);
172 return 1;
173 }
174 return 0;
175}
176EXPORT_SYMBOL(dma_release_from_coherent);
diff --git a/kernel/exit.c b/kernel/exit.c
index ae5d8660ddff..80ae941cfd2e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -47,8 +47,9 @@
47#include <linux/tracehook.h> 47#include <linux/tracehook.h>
48#include <linux/fs_struct.h> 48#include <linux/fs_struct.h>
49#include <linux/init_task.h> 49#include <linux/init_task.h>
50#include <linux/perf_counter.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/unistd.h> 55#include <asm/unistd.h>
@@ -110,9 +111,9 @@ static void __exit_signal(struct task_struct *tsk)
110 * We won't ever get here for the group leader, since it 111 * We won't ever get here for the group leader, since it
111 * will have been the last reference on the signal_struct. 112 * will have been the last reference on the signal_struct.
112 */ 113 */
113 sig->utime = cputime_add(sig->utime, task_utime(tsk)); 114 sig->utime = cputime_add(sig->utime, tsk->utime);
114 sig->stime = cputime_add(sig->stime, task_stime(tsk)); 115 sig->stime = cputime_add(sig->stime, tsk->stime);
115 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); 116 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
116 sig->min_flt += tsk->min_flt; 117 sig->min_flt += tsk->min_flt;
117 sig->maj_flt += tsk->maj_flt; 118 sig->maj_flt += tsk->maj_flt;
118 sig->nvcsw += tsk->nvcsw; 119 sig->nvcsw += tsk->nvcsw;
@@ -154,8 +155,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
154{ 155{
155 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 156 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
156 157
157#ifdef CONFIG_PERF_COUNTERS 158#ifdef CONFIG_PERF_EVENTS
158 WARN_ON_ONCE(tsk->perf_counter_ctxp); 159 WARN_ON_ONCE(tsk->perf_event_ctxp);
159#endif 160#endif
160 trace_sched_process_free(tsk); 161 trace_sched_process_free(tsk);
161 put_task_struct(tsk); 162 put_task_struct(tsk);
@@ -945,6 +946,8 @@ NORET_TYPE void do_exit(long code)
945 if (group_dead) { 946 if (group_dead) {
946 hrtimer_cancel(&tsk->signal->real_timer); 947 hrtimer_cancel(&tsk->signal->real_timer);
947 exit_itimers(tsk->signal); 948 exit_itimers(tsk->signal);
949 if (tsk->mm)
950 setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
948 } 951 }
949 acct_collect(code, group_dead); 952 acct_collect(code, group_dead);
950 if (group_dead) 953 if (group_dead)
@@ -972,16 +975,18 @@ NORET_TYPE void do_exit(long code)
972 disassociate_ctty(1); 975 disassociate_ctty(1);
973 976
974 module_put(task_thread_info(tsk)->exec_domain->module); 977 module_put(task_thread_info(tsk)->exec_domain->module);
975 if (tsk->binfmt)
976 module_put(tsk->binfmt->module);
977 978
978 proc_exit_connector(tsk); 979 proc_exit_connector(tsk);
979 980
980 /* 981 /*
982 * FIXME: do that only when needed, using sched_exit tracepoint
983 */
984 flush_ptrace_hw_breakpoint(tsk);
985 /*
981 * Flush inherited counters to the parent - before the parent 986 * Flush inherited counters to the parent - before the parent
982 * gets woken up by child-exit notifications. 987 * gets woken up by child-exit notifications.
983 */ 988 */
984 perf_counter_exit_task(tsk); 989 perf_event_exit_task(tsk);
985 990
986 exit_notify(tsk, group_dead); 991 exit_notify(tsk, group_dead);
987#ifdef CONFIG_NUMA 992#ifdef CONFIG_NUMA
@@ -989,8 +994,6 @@ NORET_TYPE void do_exit(long code)
989 tsk->mempolicy = NULL; 994 tsk->mempolicy = NULL;
990#endif 995#endif
991#ifdef CONFIG_FUTEX 996#ifdef CONFIG_FUTEX
992 if (unlikely(!list_empty(&tsk->pi_state_list)))
993 exit_pi_state_list(tsk);
994 if (unlikely(current->pi_state_cache)) 997 if (unlikely(current->pi_state_cache))
995 kfree(current->pi_state_cache); 998 kfree(current->pi_state_cache);
996#endif 999#endif
@@ -1093,28 +1096,28 @@ struct wait_opts {
1093 int __user *wo_stat; 1096 int __user *wo_stat;
1094 struct rusage __user *wo_rusage; 1097 struct rusage __user *wo_rusage;
1095 1098
1099 wait_queue_t child_wait;
1096 int notask_error; 1100 int notask_error;
1097}; 1101};
1098 1102
1099static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1103static inline
1104struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1100{ 1105{
1101 struct pid *pid = NULL; 1106 if (type != PIDTYPE_PID)
1102 if (type == PIDTYPE_PID) 1107 task = task->group_leader;
1103 pid = task->pids[type].pid; 1108 return task->pids[type].pid;
1104 else if (type < PIDTYPE_MAX)
1105 pid = task->group_leader->pids[type].pid;
1106 return pid;
1107} 1109}
1108 1110
1109static int eligible_child(struct wait_opts *wo, struct task_struct *p) 1111static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1110{ 1112{
1111 int err; 1113 return wo->wo_type == PIDTYPE_MAX ||
1112 1114 task_pid_type(p, wo->wo_type) == wo->wo_pid;
1113 if (wo->wo_type < PIDTYPE_MAX) { 1115}
1114 if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
1115 return 0;
1116 }
1117 1116
1117static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1118{
1119 if (!eligible_pid(wo, p))
1120 return 0;
1118 /* Wait for all children (clone and not) if __WALL is set; 1121 /* Wait for all children (clone and not) if __WALL is set;
1119 * otherwise, wait for clone children *only* if __WCLONE is 1122 * otherwise, wait for clone children *only* if __WCLONE is
1120 * set; otherwise, wait for non-clone children *only*. (Note: 1123 * set; otherwise, wait for non-clone children *only*. (Note:
@@ -1124,10 +1127,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1124 && !(wo->wo_flags & __WALL)) 1127 && !(wo->wo_flags & __WALL))
1125 return 0; 1128 return 0;
1126 1129
1127 err = security_task_wait(p);
1128 if (err)
1129 return err;
1130
1131 return 1; 1130 return 1;
1132} 1131}
1133 1132
@@ -1140,18 +1139,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1140 1139
1141 put_task_struct(p); 1140 put_task_struct(p);
1142 infop = wo->wo_info; 1141 infop = wo->wo_info;
1143 if (!retval) 1142 if (infop) {
1144 retval = put_user(SIGCHLD, &infop->si_signo); 1143 if (!retval)
1145 if (!retval) 1144 retval = put_user(SIGCHLD, &infop->si_signo);
1146 retval = put_user(0, &infop->si_errno); 1145 if (!retval)
1147 if (!retval) 1146 retval = put_user(0, &infop->si_errno);
1148 retval = put_user((short)why, &infop->si_code); 1147 if (!retval)
1149 if (!retval) 1148 retval = put_user((short)why, &infop->si_code);
1150 retval = put_user(pid, &infop->si_pid); 1149 if (!retval)
1151 if (!retval) 1150 retval = put_user(pid, &infop->si_pid);
1152 retval = put_user(uid, &infop->si_uid); 1151 if (!retval)
1153 if (!retval) 1152 retval = put_user(uid, &infop->si_uid);
1154 retval = put_user(status, &infop->si_status); 1153 if (!retval)
1154 retval = put_user(status, &infop->si_status);
1155 }
1155 if (!retval) 1156 if (!retval)
1156 retval = pid; 1157 retval = pid;
1157 return retval; 1158 return retval;
@@ -1208,6 +1209,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1208 if (likely(!traced) && likely(!task_detached(p))) { 1209 if (likely(!traced) && likely(!task_detached(p))) {
1209 struct signal_struct *psig; 1210 struct signal_struct *psig;
1210 struct signal_struct *sig; 1211 struct signal_struct *sig;
1212 unsigned long maxrss;
1213 cputime_t tgutime, tgstime;
1211 1214
1212 /* 1215 /*
1213 * The resource counters for the group leader are in its 1216 * The resource counters for the group leader are in its
@@ -1223,20 +1226,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1223 * need to protect the access to parent->signal fields, 1226 * need to protect the access to parent->signal fields,
1224 * as other threads in the parent group can be right 1227 * as other threads in the parent group can be right
1225 * here reaping other children at the same time. 1228 * here reaping other children at the same time.
1229 *
1230 * We use thread_group_times() to get times for the thread
1231 * group, which consolidates times for all threads in the
1232 * group including the group leader.
1226 */ 1233 */
1234 thread_group_times(p, &tgutime, &tgstime);
1227 spin_lock_irq(&p->real_parent->sighand->siglock); 1235 spin_lock_irq(&p->real_parent->sighand->siglock);
1228 psig = p->real_parent->signal; 1236 psig = p->real_parent->signal;
1229 sig = p->signal; 1237 sig = p->signal;
1230 psig->cutime = 1238 psig->cutime =
1231 cputime_add(psig->cutime, 1239 cputime_add(psig->cutime,
1232 cputime_add(p->utime, 1240 cputime_add(tgutime,
1233 cputime_add(sig->utime, 1241 sig->cutime));
1234 sig->cutime)));
1235 psig->cstime = 1242 psig->cstime =
1236 cputime_add(psig->cstime, 1243 cputime_add(psig->cstime,
1237 cputime_add(p->stime, 1244 cputime_add(tgstime,
1238 cputime_add(sig->stime, 1245 sig->cstime));
1239 sig->cstime)));
1240 psig->cgtime = 1246 psig->cgtime =
1241 cputime_add(psig->cgtime, 1247 cputime_add(psig->cgtime,
1242 cputime_add(p->gtime, 1248 cputime_add(p->gtime,
@@ -1256,6 +1262,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1256 psig->coublock += 1262 psig->coublock +=
1257 task_io_get_oublock(p) + 1263 task_io_get_oublock(p) +
1258 sig->oublock + sig->coublock; 1264 sig->oublock + sig->coublock;
1265 maxrss = max(sig->maxrss, sig->cmaxrss);
1266 if (psig->cmaxrss < maxrss)
1267 psig->cmaxrss = maxrss;
1259 task_io_accounting_add(&psig->ioac, &p->ioac); 1268 task_io_accounting_add(&psig->ioac, &p->ioac);
1260 task_io_accounting_add(&psig->ioac, &sig->ioac); 1269 task_io_accounting_add(&psig->ioac, &sig->ioac);
1261 spin_unlock_irq(&p->real_parent->sighand->siglock); 1270 spin_unlock_irq(&p->real_parent->sighand->siglock);
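
Illustration (not part of the diff): the cmaxrss accumulation above is what a parent later sees as ru_maxrss when it asks getrusage() about its reaped children. A minimal userspace sketch, assuming only the standard fork()/wait()/getrusage() interfaces; the 4 MiB allocation size is an arbitrary choice for the example.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/resource.h>
#include <sys/wait.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		/* child: touch ~4 MiB so its peak RSS is noticeable */
		size_t sz = 4 * 1024 * 1024;
		char *buf = malloc(sz);
		if (buf)
			memset(buf, 0xaa, sz);
		_exit(0);
	}

	if (pid > 0) {
		struct rusage ru;

		wait(NULL);
		/* ru_maxrss (kilobytes) reflects the reaped child's peak RSS */
		if (getrusage(RUSAGE_CHILDREN, &ru) == 0)
			printf("children max RSS: %ld kB\n", ru.ru_maxrss);
	}
	return 0;
}
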
@@ -1477,13 +1486,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1477 * then ->notask_error is 0 if @p is an eligible child, 1486 * then ->notask_error is 0 if @p is an eligible child,
1478 * or another error from security_task_wait(), or still -ECHILD. 1487 * or another error from security_task_wait(), or still -ECHILD.
1479 */ 1488 */
1480static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, 1489static int wait_consider_task(struct wait_opts *wo, int ptrace,
1481 int ptrace, struct task_struct *p) 1490 struct task_struct *p)
1482{ 1491{
1483 int ret = eligible_child(wo, p); 1492 int ret = eligible_child(wo, p);
1484 if (!ret) 1493 if (!ret)
1485 return ret; 1494 return ret;
1486 1495
1496 ret = security_task_wait(p);
1487 if (unlikely(ret < 0)) { 1497 if (unlikely(ret < 0)) {
1488 /* 1498 /*
1489 * If we have not yet seen any eligible child, 1499 * If we have not yet seen any eligible child,
@@ -1545,7 +1555,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1545 * Do not consider detached threads. 1555 * Do not consider detached threads.
1546 */ 1556 */
1547 if (!task_detached(p)) { 1557 if (!task_detached(p)) {
1548 int ret = wait_consider_task(wo, tsk, 0, p); 1558 int ret = wait_consider_task(wo, 0, p);
1549 if (ret) 1559 if (ret)
1550 return ret; 1560 return ret;
1551 } 1561 }
@@ -1559,7 +1569,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1559 struct task_struct *p; 1569 struct task_struct *p;
1560 1570
1561 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1571 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1562 int ret = wait_consider_task(wo, tsk, 1, p); 1572 int ret = wait_consider_task(wo, 1, p);
1563 if (ret) 1573 if (ret)
1564 return ret; 1574 return ret;
1565 } 1575 }
@@ -1567,15 +1577,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1567 return 0; 1577 return 0;
1568} 1578}
1569 1579
1580static int child_wait_callback(wait_queue_t *wait, unsigned mode,
1581 int sync, void *key)
1582{
1583 struct wait_opts *wo = container_of(wait, struct wait_opts,
1584 child_wait);
1585 struct task_struct *p = key;
1586
1587 if (!eligible_pid(wo, p))
1588 return 0;
1589
1590 if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1591 return 0;
1592
1593 return default_wake_function(wait, mode, sync, key);
1594}
1595
1596void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1597{
1598 __wake_up_sync_key(&parent->signal->wait_chldexit,
1599 TASK_INTERRUPTIBLE, 1, p);
1600}
1601
1570static long do_wait(struct wait_opts *wo) 1602static long do_wait(struct wait_opts *wo)
1571{ 1603{
1572 DECLARE_WAITQUEUE(wait, current);
1573 struct task_struct *tsk; 1604 struct task_struct *tsk;
1574 int retval; 1605 int retval;
1575 1606
1576 trace_sched_process_wait(wo->wo_pid); 1607 trace_sched_process_wait(wo->wo_pid);
1577 1608
1578 add_wait_queue(&current->signal->wait_chldexit,&wait); 1609 init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1610 wo->child_wait.private = current;
1611 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1579repeat: 1612repeat:
1580 /* 1613 /*
1581 * If there is nothing that can match our criteria just get out. 1614
@@ -1616,32 +1649,7 @@ notask:
1616 } 1649 }
1617end: 1650end:
1618 __set_current_state(TASK_RUNNING); 1651 __set_current_state(TASK_RUNNING);
1619 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1652 remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1620 if (wo->wo_info) {
1621 struct siginfo __user *infop = wo->wo_info;
1622
1623 if (retval > 0)
1624 retval = 0;
1625 else {
1626 /*
1627 * For a WNOHANG return, clear out all the fields
1628 * we would set so the user can easily tell the
1629 * difference.
1630 */
1631 if (!retval)
1632 retval = put_user(0, &infop->si_signo);
1633 if (!retval)
1634 retval = put_user(0, &infop->si_errno);
1635 if (!retval)
1636 retval = put_user(0, &infop->si_code);
1637 if (!retval)
1638 retval = put_user(0, &infop->si_pid);
1639 if (!retval)
1640 retval = put_user(0, &infop->si_uid);
1641 if (!retval)
1642 retval = put_user(0, &infop->si_status);
1643 }
1644 }
1645 return retval; 1653 return retval;
1646} 1654}
1647 1655
@@ -1686,6 +1694,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1686 wo.wo_stat = NULL; 1694 wo.wo_stat = NULL;
1687 wo.wo_rusage = ru; 1695 wo.wo_rusage = ru;
1688 ret = do_wait(&wo); 1696 ret = do_wait(&wo);
1697
1698 if (ret > 0) {
1699 ret = 0;
1700 } else if (infop) {
1701 /*
1702 * For a WNOHANG return, clear out all the fields
1703 * we would set so the user can easily tell the
1704 * difference.
1705 */
1706 if (!ret)
1707 ret = put_user(0, &infop->si_signo);
1708 if (!ret)
1709 ret = put_user(0, &infop->si_errno);
1710 if (!ret)
1711 ret = put_user(0, &infop->si_code);
1712 if (!ret)
1713 ret = put_user(0, &infop->si_pid);
1714 if (!ret)
1715 ret = put_user(0, &infop->si_uid);
1716 if (!ret)
1717 ret = put_user(0, &infop->si_status);
1718 }
1719
1689 put_pid(pid); 1720 put_pid(pid);
1690 1721
1691 /* avoid REGPARM breakage on x86: */ 1722 /* avoid REGPARM breakage on x86: */
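
Illustration (not part of the diff): moving the WNOHANG clearing into sys_waitid() keeps the documented behaviour that a caller can tell "nothing to report" apart from a real exit by looking at the zeroed siginfo fields. A small userspace check, assuming only the standard waitid() interface; the sleeping child exists purely to keep the example self-contained.

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
	siginfo_t info;

	if (fork() == 0) {
		sleep(5);	/* child: still running when the parent polls */
		_exit(0);
	}

	memset(&info, 0, sizeof(info));
	if (waitid(P_ALL, 0, &info, WEXITED | WNOHANG) == 0) {
		if (info.si_pid == 0)
			printf("no child has changed state yet\n");
		else
			printf("reaped pid %d\n", (int)info.si_pid);
	}
	return 0;
}
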
diff --git a/kernel/fork.c b/kernel/fork.c
index bfee931ee3fb..3d6f121bbe8a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,6 +49,7 @@
49#include <linux/ftrace.h> 49#include <linux/ftrace.h>
50#include <linux/profile.h> 50#include <linux/profile.h>
51#include <linux/rmap.h> 51#include <linux/rmap.h>
52#include <linux/ksm.h>
52#include <linux/acct.h> 53#include <linux/acct.h>
53#include <linux/tsacct_kern.h> 54#include <linux/tsacct_kern.h>
54#include <linux/cn_proc.h> 55#include <linux/cn_proc.h>
@@ -61,7 +62,8 @@
61#include <linux/blkdev.h> 62#include <linux/blkdev.h>
62#include <linux/fs_struct.h> 63#include <linux/fs_struct.h>
63#include <linux/magic.h> 64#include <linux/magic.h>
64#include <linux/perf_counter.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h>
65 67
66#include <asm/pgtable.h> 68#include <asm/pgtable.h>
67#include <asm/pgalloc.h> 69#include <asm/pgalloc.h>
@@ -89,7 +91,7 @@ int nr_processes(void)
89 int cpu; 91 int cpu;
90 int total = 0; 92 int total = 0;
91 93
92 for_each_online_cpu(cpu) 94 for_each_possible_cpu(cpu)
93 total += per_cpu(process_counts, cpu); 95 total += per_cpu(process_counts, cpu);
94 96
95 return total; 97 return total;
@@ -136,9 +138,17 @@ struct kmem_cache *vm_area_cachep;
136/* SLAB cache for mm_struct structures (tsk->mm) */ 138/* SLAB cache for mm_struct structures (tsk->mm) */
137static struct kmem_cache *mm_cachep; 139static struct kmem_cache *mm_cachep;
138 140
141static void account_kernel_stack(struct thread_info *ti, int account)
142{
143 struct zone *zone = page_zone(virt_to_page(ti));
144
145 mod_zone_page_state(zone, NR_KERNEL_STACK, account);
146}
147
139void free_task(struct task_struct *tsk) 148void free_task(struct task_struct *tsk)
140{ 149{
141 prop_local_destroy_single(&tsk->dirties); 150 prop_local_destroy_single(&tsk->dirties);
151 account_kernel_stack(tsk->stack, -1);
142 free_thread_info(tsk->stack); 152 free_thread_info(tsk->stack);
143 rt_mutex_debug_task_free(tsk); 153 rt_mutex_debug_task_free(tsk);
144 ftrace_graph_exit_task(tsk); 154 ftrace_graph_exit_task(tsk);
@@ -253,6 +263,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
253 tsk->btrace_seq = 0; 263 tsk->btrace_seq = 0;
254#endif 264#endif
255 tsk->splice_pipe = NULL; 265 tsk->splice_pipe = NULL;
266
267 account_kernel_stack(ti, 1);
268
256 return tsk; 269 return tsk;
257 270
258out: 271out:
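
Illustration (not part of the diff): NR_KERNEL_STACK is a regular per-zone VM statistic, so once this accounting is in place the total becomes visible to userspace as the KernelStack: line of /proc/meminfo (the field name is assumed from the meminfo interface, not from this hunk). A trivial reader:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* e.g. "KernelStack:        2048 kB" */
		if (strncmp(line, "KernelStack:", 12) == 0)
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}
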
@@ -288,6 +301,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
288 rb_link = &mm->mm_rb.rb_node; 301 rb_link = &mm->mm_rb.rb_node;
289 rb_parent = NULL; 302 rb_parent = NULL;
290 pprev = &mm->mmap; 303 pprev = &mm->mmap;
304 retval = ksm_fork(mm, oldmm);
305 if (retval)
306 goto out;
291 307
292 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 308 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
293 struct file *file; 309 struct file *file;
@@ -418,22 +434,30 @@ __setup("coredump_filter=", coredump_filter_setup);
418 434
419#include <linux/init_task.h> 435#include <linux/init_task.h>
420 436
437static void mm_init_aio(struct mm_struct *mm)
438{
439#ifdef CONFIG_AIO
440 spin_lock_init(&mm->ioctx_lock);
441 INIT_HLIST_HEAD(&mm->ioctx_list);
442#endif
443}
444
421static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 445static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
422{ 446{
423 atomic_set(&mm->mm_users, 1); 447 atomic_set(&mm->mm_users, 1);
424 atomic_set(&mm->mm_count, 1); 448 atomic_set(&mm->mm_count, 1);
425 init_rwsem(&mm->mmap_sem); 449 init_rwsem(&mm->mmap_sem);
426 INIT_LIST_HEAD(&mm->mmlist); 450 INIT_LIST_HEAD(&mm->mmlist);
427 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; 451 mm->flags = (current->mm) ?
452 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
428 mm->core_state = NULL; 453 mm->core_state = NULL;
429 mm->nr_ptes = 0; 454 mm->nr_ptes = 0;
430 set_mm_counter(mm, file_rss, 0); 455 set_mm_counter(mm, file_rss, 0);
431 set_mm_counter(mm, anon_rss, 0); 456 set_mm_counter(mm, anon_rss, 0);
432 spin_lock_init(&mm->page_table_lock); 457 spin_lock_init(&mm->page_table_lock);
433 spin_lock_init(&mm->ioctx_lock);
434 INIT_HLIST_HEAD(&mm->ioctx_list);
435 mm->free_area_cache = TASK_UNMAPPED_BASE; 458 mm->free_area_cache = TASK_UNMAPPED_BASE;
436 mm->cached_hole_size = ~0UL; 459 mm->cached_hole_size = ~0UL;
460 mm_init_aio(mm);
437 mm_init_owner(mm, p); 461 mm_init_owner(mm, p);
438 462
439 if (likely(!mm_alloc_pgd(mm))) { 463 if (likely(!mm_alloc_pgd(mm))) {
@@ -485,6 +509,7 @@ void mmput(struct mm_struct *mm)
485 509
486 if (atomic_dec_and_test(&mm->mm_users)) { 510 if (atomic_dec_and_test(&mm->mm_users)) {
487 exit_aio(mm); 511 exit_aio(mm);
512 ksm_exit(mm);
488 exit_mmap(mm); 513 exit_mmap(mm);
489 set_mm_exe_file(mm, NULL); 514 set_mm_exe_file(mm, NULL);
490 if (!list_empty(&mm->mmlist)) { 515 if (!list_empty(&mm->mmlist)) {
@@ -493,6 +518,8 @@ void mmput(struct mm_struct *mm)
493 spin_unlock(&mmlist_lock); 518 spin_unlock(&mmlist_lock);
494 } 519 }
495 put_swap_token(mm); 520 put_swap_token(mm);
521 if (mm->binfmt)
522 module_put(mm->binfmt->module);
496 mmdrop(mm); 523 mmdrop(mm);
497 } 524 }
498} 525}
@@ -543,12 +570,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
543 570
544 /* Get rid of any futexes when releasing the mm */ 571 /* Get rid of any futexes when releasing the mm */
545#ifdef CONFIG_FUTEX 572#ifdef CONFIG_FUTEX
546 if (unlikely(tsk->robust_list)) 573 if (unlikely(tsk->robust_list)) {
547 exit_robust_list(tsk); 574 exit_robust_list(tsk);
575 tsk->robust_list = NULL;
576 }
548#ifdef CONFIG_COMPAT 577#ifdef CONFIG_COMPAT
549 if (unlikely(tsk->compat_robust_list)) 578 if (unlikely(tsk->compat_robust_list)) {
550 compat_exit_robust_list(tsk); 579 compat_exit_robust_list(tsk);
580 tsk->compat_robust_list = NULL;
581 }
551#endif 582#endif
583 if (unlikely(!list_empty(&tsk->pi_state_list)))
584 exit_pi_state_list(tsk);
552#endif 585#endif
553 586
554 /* Get rid of any cached register state */ 587 /* Get rid of any cached register state */
@@ -618,9 +651,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
618 mm->hiwater_rss = get_mm_rss(mm); 651 mm->hiwater_rss = get_mm_rss(mm);
619 mm->hiwater_vm = mm->total_vm; 652 mm->hiwater_vm = mm->total_vm;
620 653
654 if (mm->binfmt && !try_module_get(mm->binfmt->module))
655 goto free_pt;
656
621 return mm; 657 return mm;
622 658
623free_pt: 659free_pt:
660 /* don't put binfmt in mmput, we haven't got module yet */
661 mm->binfmt = NULL;
624 mmput(mm); 662 mmput(mm);
625 663
626fail_nomem: 664fail_nomem:
@@ -788,10 +826,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
788 thread_group_cputime_init(sig); 826 thread_group_cputime_init(sig);
789 827
790 /* Expiration times and increments. */ 828 /* Expiration times and increments. */
791 sig->it_virt_expires = cputime_zero; 829 sig->it[CPUCLOCK_PROF].expires = cputime_zero;
792 sig->it_virt_incr = cputime_zero; 830 sig->it[CPUCLOCK_PROF].incr = cputime_zero;
793 sig->it_prof_expires = cputime_zero; 831 sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
794 sig->it_prof_incr = cputime_zero; 832 sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
795 833
796 /* Cached expiration times. */ 834 /* Cached expiration times. */
797 sig->cputime_expires.prof_exp = cputime_zero; 835 sig->cputime_expires.prof_exp = cputime_zero;
@@ -846,9 +884,13 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
846 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 884 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
847 sig->gtime = cputime_zero; 885 sig->gtime = cputime_zero;
848 sig->cgtime = cputime_zero; 886 sig->cgtime = cputime_zero;
887#ifndef CONFIG_VIRT_CPU_ACCOUNTING
888 sig->prev_utime = sig->prev_stime = cputime_zero;
889#endif
849 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 890 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
850 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 891 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
851 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 892 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
893 sig->maxrss = sig->cmaxrss = 0;
852 task_io_accounting_init(&sig->ioac); 894 task_io_accounting_init(&sig->ioac);
853 sig->sum_sched_runtime = 0; 895 sig->sum_sched_runtime = 0;
854 taskstats_tgid_init(sig); 896 taskstats_tgid_init(sig);
@@ -863,6 +905,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
863 905
864 tty_audit_fork(sig); 906 tty_audit_fork(sig);
865 907
908 sig->oom_adj = current->signal->oom_adj;
909
866 return 0; 910 return 0;
867} 911}
868 912
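
Illustration (not part of the diff): with oom_adj stored in signal_struct and copied here, a forked child starts out with its parent's setting. A hedged userspace sketch using the legacy /proc/self/oom_adj file (that procfs file is an assumption about the contemporary interface, not something this hunk introduces):

#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

static void print_oom_adj(const char *who)
{
	char buf[32];
	FILE *f = fopen("/proc/self/oom_adj", "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("%s oom_adj: %s", who, buf);
	if (f)
		fclose(f);
}

int main(void)
{
	FILE *f = fopen("/proc/self/oom_adj", "w");

	if (f) {
		fputs("5\n", f);	/* make this process a more attractive OOM target */
		fclose(f);
	}

	if (fork() == 0) {
		print_oom_adj("child");		/* value inherited via copy_signal() */
		_exit(0);
	}
	wait(NULL);
	print_oom_adj("parent");
	return 0;
}
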
@@ -958,6 +1002,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
958 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) 1002 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
959 return ERR_PTR(-EINVAL); 1003 return ERR_PTR(-EINVAL);
960 1004
1005 /*
1006 * Siblings of global init remain as zombies on exit since they are
1007 * not reaped by their parent (swapper). To solve this and to avoid
1008 * multi-rooted process trees, prevent global and container-inits
1009 * from creating siblings.
1010 */
1011 if ((clone_flags & CLONE_PARENT) &&
1012 current->signal->flags & SIGNAL_UNKILLABLE)
1013 return ERR_PTR(-EINVAL);
1014
961 retval = security_task_create(clone_flags); 1015 retval = security_task_create(clone_flags);
962 if (retval) 1016 if (retval)
963 goto fork_out; 1017 goto fork_out;
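
Illustration (not part of the diff): the new check only affects tasks marked SIGNAL_UNKILLABLE, i.e. a global or container init. A userspace probe, assuming the glibc clone() wrapper; run from an ordinary process it should succeed, while an init task on a kernel carrying this check is expected to get EINVAL:

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int child_fn(void *arg)
{
	return 0;
}

int main(void)
{
	char *stack = malloc(64 * 1024);
	int pid;

	if (!stack)
		return 1;
	/* CLONE_PARENT: the new task becomes a sibling of the caller */
	pid = clone(child_fn, stack + 64 * 1024, CLONE_PARENT | SIGCHLD, NULL);
	if (pid < 0)
		printf("clone(CLONE_PARENT) failed: %s\n", strerror(errno));
	else
		printf("created sibling task %d\n", pid);
	return 0;
}
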
@@ -999,9 +1053,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
999 if (!try_module_get(task_thread_info(p)->exec_domain->module)) 1053 if (!try_module_get(task_thread_info(p)->exec_domain->module))
1000 goto bad_fork_cleanup_count; 1054 goto bad_fork_cleanup_count;
1001 1055
1002 if (p->binfmt && !try_module_get(p->binfmt->module))
1003 goto bad_fork_cleanup_put_domain;
1004
1005 p->did_exec = 0; 1056 p->did_exec = 0;
1006 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1057 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1007 copy_flags(clone_flags, p); 1058 copy_flags(clone_flags, p);
@@ -1018,8 +1069,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1018 p->gtime = cputime_zero; 1069 p->gtime = cputime_zero;
1019 p->utimescaled = cputime_zero; 1070 p->utimescaled = cputime_zero;
1020 p->stimescaled = cputime_zero; 1071 p->stimescaled = cputime_zero;
1072#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1021 p->prev_utime = cputime_zero; 1073 p->prev_utime = cputime_zero;
1022 p->prev_stime = cputime_zero; 1074 p->prev_stime = cputime_zero;
1075#endif
1023 1076
1024 p->default_timer_slack_ns = current->timer_slack_ns; 1077 p->default_timer_slack_ns = current->timer_slack_ns;
1025 1078
@@ -1075,10 +1128,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1075 1128
1076 p->bts = NULL; 1129 p->bts = NULL;
1077 1130
1131 p->stack_start = stack_start;
1132
1078 /* Perform scheduler related setup. Assign this task to a CPU. */ 1133 /* Perform scheduler related setup. Assign this task to a CPU. */
1079 sched_fork(p, clone_flags); 1134 sched_fork(p, clone_flags);
1080 1135
1081 retval = perf_counter_init_task(p); 1136 retval = perf_event_init_task(p);
1082 if (retval) 1137 if (retval)
1083 goto bad_fork_cleanup_policy; 1138 goto bad_fork_cleanup_policy;
1084 1139
@@ -1253,7 +1308,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1253 write_unlock_irq(&tasklist_lock); 1308 write_unlock_irq(&tasklist_lock);
1254 proc_fork_connector(p); 1309 proc_fork_connector(p);
1255 cgroup_post_fork(p); 1310 cgroup_post_fork(p);
1256 perf_counter_fork(p); 1311 perf_event_fork(p);
1257 return p; 1312 return p;
1258 1313
1259bad_fork_free_pid: 1314bad_fork_free_pid:
@@ -1280,16 +1335,13 @@ bad_fork_cleanup_semundo:
1280bad_fork_cleanup_audit: 1335bad_fork_cleanup_audit:
1281 audit_free(p); 1336 audit_free(p);
1282bad_fork_cleanup_policy: 1337bad_fork_cleanup_policy:
1283 perf_counter_free_task(p); 1338 perf_event_free_task(p);
1284#ifdef CONFIG_NUMA 1339#ifdef CONFIG_NUMA
1285 mpol_put(p->mempolicy); 1340 mpol_put(p->mempolicy);
1286bad_fork_cleanup_cgroup: 1341bad_fork_cleanup_cgroup:
1287#endif 1342#endif
1288 cgroup_exit(p, cgroup_callbacks_done); 1343 cgroup_exit(p, cgroup_callbacks_done);
1289 delayacct_tsk_free(p); 1344 delayacct_tsk_free(p);
1290 if (p->binfmt)
1291 module_put(p->binfmt->module);
1292bad_fork_cleanup_put_domain:
1293 module_put(task_thread_info(p)->exec_domain->module); 1345 module_put(task_thread_info(p)->exec_domain->module);
1294bad_fork_cleanup_count: 1346bad_fork_cleanup_count:
1295 atomic_dec(&p->cred->user->processes); 1347 atomic_dec(&p->cred->user->processes);
diff --git a/kernel/futex.c b/kernel/futex.c
index 248dd119a86e..fb65e822fc41 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -89,36 +89,36 @@ struct futex_pi_state {
89 union futex_key key; 89 union futex_key key;
90}; 90};
91 91
92/* 92/**
93 * We use this hashed waitqueue instead of a normal wait_queue_t, so 93 * struct futex_q - The hashed futex queue entry, one per waiting task
94 * @task: the task waiting on the futex
95 * @lock_ptr: the hash bucket lock
96 * @key: the key the futex is hashed on
97 * @pi_state: optional priority inheritance state
98 * @rt_waiter: rt_waiter storage for use with requeue_pi
99 * @requeue_pi_key: the requeue_pi target futex key
100 * @bitset: bitset for the optional bitmasked wakeup
101 *
102 * We use this hashed waitqueue, instead of a normal wait_queue_t, so
94 * we can wake only the relevant ones (hashed queues may be shared). 103 * we can wake only the relevant ones (hashed queues may be shared).
95 * 104 *
96 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 105 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
97 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 106 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 98 * The order of wakeup is always to make the first condition true, then 107 * The order of wakeup is always to make the first condition true, then
 99 * wake up q->waiter, then make the second condition true. 108 * the second.
109 *
110 * PI futexes are typically woken before they are removed from the hash list via
111 * the rt_mutex code. See unqueue_me_pi().
100 */ 112 */
101struct futex_q { 113struct futex_q {
102 struct plist_node list; 114 struct plist_node list;
103 /* Waiter reference */
104 struct task_struct *task;
105 115
106 /* Which hash list lock to use: */ 116 struct task_struct *task;
107 spinlock_t *lock_ptr; 117 spinlock_t *lock_ptr;
108
109 /* Key which the futex is hashed on: */
110 union futex_key key; 118 union futex_key key;
111
112 /* Optional priority inheritance state: */
113 struct futex_pi_state *pi_state; 119 struct futex_pi_state *pi_state;
114
115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter; 120 struct rt_mutex_waiter *rt_waiter;
117
118 /* The expected requeue pi target futex key: */
119 union futex_key *requeue_pi_key; 121 union futex_key *requeue_pi_key;
120
121 /* Bitset for the optional bitmasked wakeup */
122 u32 bitset; 122 u32 bitset;
123}; 123};
124 124
@@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
150 */ 150 */
151static inline int match_futex(union futex_key *key1, union futex_key *key2) 151static inline int match_futex(union futex_key *key1, union futex_key *key2)
152{ 152{
153 return (key1->both.word == key2->both.word 153 return (key1 && key2
154 && key1->both.word == key2->both.word
154 && key1->both.ptr == key2->both.ptr 155 && key1->both.ptr == key2->both.ptr
155 && key1->both.offset == key2->both.offset); 156 && key1->both.offset == key2->both.offset);
156} 157}
@@ -198,11 +199,12 @@ static void drop_futex_key_refs(union futex_key *key)
198} 199}
199 200
200/** 201/**
201 * get_futex_key - Get parameters which are the keys for a futex. 202 * get_futex_key() - Get parameters which are the keys for a futex
202 * @uaddr: virtual address of the futex 203 * @uaddr: virtual address of the futex
203 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
204 * @key: address where result is stored. 205 * @key: address where result is stored.
205 * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE) 206 * @rw: mapping needs to be read/write (values: VERIFY_READ,
207 * VERIFY_WRITE)
206 * 208 *
207 * Returns a negative error code or 0 209 * Returns a negative error code or 0
208 * The key words are stored in *key on success. 210 * The key words are stored in *key on success.
@@ -288,8 +290,8 @@ void put_futex_key(int fshared, union futex_key *key)
288 drop_futex_key_refs(key); 290 drop_futex_key_refs(key);
289} 291}
290 292
291/* 293/**
292 * fault_in_user_writeable - fault in user address and verify RW access 294 * fault_in_user_writeable() - Fault in user address and verify RW access
293 * @uaddr: pointer to faulting user space address 295 * @uaddr: pointer to faulting user space address
294 * 296 *
295 * Slow path to fixup the fault we just took in the atomic write 297 * Slow path to fixup the fault we just took in the atomic write
@@ -309,8 +311,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
309 311
310/** 312/**
311 * futex_top_waiter() - Return the highest priority waiter on a futex 313 * futex_top_waiter() - Return the highest priority waiter on a futex
312 * @hb: the hash bucket the futex_q's reside in 314 * @hb: the hash bucket the futex_q's reside in
313 * @key: the futex key (to distinguish it from other futex futex_q's) 315 * @key: the futex key (to distinguish it from other futex futex_q's)
314 * 316 *
315 * Must be called with the hb lock held. 317 * Must be called with the hb lock held.
316 */ 318 */
@@ -588,7 +590,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
588} 590}
589 591
590/** 592/**
591 * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex 593 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
592 * @uaddr: the pi futex user address 594 * @uaddr: the pi futex user address
593 * @hb: the pi futex hash bucket 595 * @hb: the pi futex hash bucket
594 * @key: the futex key associated with uaddr and hb 596 * @key: the futex key associated with uaddr and hb
@@ -915,8 +917,8 @@ retry:
915 hb1 = hash_futex(&key1); 917 hb1 = hash_futex(&key1);
916 hb2 = hash_futex(&key2); 918 hb2 = hash_futex(&key2);
917 919
918 double_lock_hb(hb1, hb2);
919retry_private: 920retry_private:
921 double_lock_hb(hb1, hb2);
920 op_ret = futex_atomic_op_inuser(op, uaddr2); 922 op_ret = futex_atomic_op_inuser(op, uaddr2);
921 if (unlikely(op_ret < 0)) { 923 if (unlikely(op_ret < 0)) {
922 924
@@ -1011,9 +1013,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1011 1013
1012/** 1014/**
1013 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1015 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1014 * q: the futex_q 1016 * @q: the futex_q
1015 * key: the key of the requeue target futex 1017 * @key: the key of the requeue target futex
1016 * hb: the hash_bucket of the requeue target futex 1018 * @hb: the hash_bucket of the requeue target futex
1017 * 1019 *
1018 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1020 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1019 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1021 * target futex if it is uncontended or via a lock steal. Set the futex_q key
@@ -1027,7 +1029,6 @@ static inline
1027void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, 1029void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1028 struct futex_hash_bucket *hb) 1030 struct futex_hash_bucket *hb)
1029{ 1031{
1030 drop_futex_key_refs(&q->key);
1031 get_futex_key_refs(key); 1032 get_futex_key_refs(key);
1032 q->key = *key; 1033 q->key = *key;
1033 1034
@@ -1225,6 +1226,7 @@ retry_private:
1225 */ 1226 */
1226 if (ret == 1) { 1227 if (ret == 1) {
1227 WARN_ON(pi_state); 1228 WARN_ON(pi_state);
1229 drop_count++;
1228 task_count++; 1230 task_count++;
1229 ret = get_futex_value_locked(&curval2, uaddr2); 1231 ret = get_futex_value_locked(&curval2, uaddr2);
1230 if (!ret) 1232 if (!ret)
@@ -1303,6 +1305,7 @@ retry_private:
1303 if (ret == 1) { 1305 if (ret == 1) {
1304 /* We got the lock. */ 1306 /* We got the lock. */
1305 requeue_pi_wake_futex(this, &key2, hb2); 1307 requeue_pi_wake_futex(this, &key2, hb2);
1308 drop_count++;
1306 continue; 1309 continue;
1307 } else if (ret) { 1310 } else if (ret) {
1308 /* -EDEADLK */ 1311 /* -EDEADLK */
@@ -1350,6 +1353,25 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1350 return hb; 1353 return hb;
1351} 1354}
1352 1355
1356static inline void
1357queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1358{
1359 spin_unlock(&hb->lock);
1360 drop_futex_key_refs(&q->key);
1361}
1362
1363/**
1364 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1365 * @q: The futex_q to enqueue
1366 * @hb: The destination hash bucket
1367 *
1368 * The hb->lock must be held by the caller, and is released here. A call to
1369 * queue_me() is typically paired with exactly one call to unqueue_me(). The
1370 * exceptions involve the PI related operations, which may use unqueue_me_pi()
1371 * or nothing if the unqueue is done as part of the wake process and the unqueue
 1372 * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for
1373 * an example).
1374 */
1353static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1375static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1354{ 1376{
1355 int prio; 1377 int prio;
@@ -1373,19 +1395,17 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1373 spin_unlock(&hb->lock); 1395 spin_unlock(&hb->lock);
1374} 1396}
1375 1397
1376static inline void 1398/**
1377queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1399 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1378{ 1400 * @q: The futex_q to unqueue
1379 spin_unlock(&hb->lock); 1401 *
1380 drop_futex_key_refs(&q->key); 1402 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1381} 1403 * be paired with exactly one earlier call to queue_me().
1382 1404 *
1383/* 1405 * Returns:
1384 * queue_me and unqueue_me must be called as a pair, each 1406 * 1 - if the futex_q was still queued (and we unqueued it)
1385 * exactly once. They are called with the hashed spinlock held. 1407 * 0 - if the futex_q was already removed by the waking thread
1386 */ 1408 */
1387
1388/* Return 1 if we were still queued (ie. 0 means we were woken) */
1389static int unqueue_me(struct futex_q *q) 1409static int unqueue_me(struct futex_q *q)
1390{ 1410{
1391 spinlock_t *lock_ptr; 1411 spinlock_t *lock_ptr;
@@ -1638,17 +1658,14 @@ out:
1638static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, 1658static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1639 struct hrtimer_sleeper *timeout) 1659 struct hrtimer_sleeper *timeout)
1640{ 1660{
1641 queue_me(q, hb);
1642
1643 /* 1661 /*
1644 * There might have been scheduling since the queue_me(), as we 1662 * The task state is guaranteed to be set before another task can
1645 * cannot hold a spinlock across the get_user() in case it 1663 * wake it. set_current_state() is implemented using set_mb() and
1646 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1664 * queue_me() calls spin_unlock() upon completion, both serializing
1647 * queueing ourselves into the futex hash. This code thus has to 1665 * access to the hash list and forcing another memory barrier.
1648 * rely on the futex_wake() code removing us from hash when it
1649 * wakes us up.
1650 */ 1666 */
1651 set_current_state(TASK_INTERRUPTIBLE); 1667 set_current_state(TASK_INTERRUPTIBLE);
1668 queue_me(q, hb);
1652 1669
1653 /* Arm the timer */ 1670 /* Arm the timer */
1654 if (timeout) { 1671 if (timeout) {
@@ -1658,8 +1675,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1658 } 1675 }
1659 1676
1660 /* 1677 /*
1661 * !plist_node_empty() is safe here without any lock. 1678 * If we have been removed from the hash list, then another task
1662 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1679 * has tried to wake us, and we can skip the call to schedule().
1663 */ 1680 */
1664 if (likely(!plist_node_empty(&q->list))) { 1681 if (likely(!plist_node_empty(&q->list))) {
1665 /* 1682 /*
@@ -1776,6 +1793,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1776 current->timer_slack_ns); 1793 current->timer_slack_ns);
1777 } 1794 }
1778 1795
1796retry:
1779 /* Prepare to wait on uaddr. */ 1797 /* Prepare to wait on uaddr. */
1780 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1798 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1781 if (ret) 1799 if (ret)
@@ -1793,9 +1811,14 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1793 goto out_put_key; 1811 goto out_put_key;
1794 1812
1795 /* 1813 /*
1796 * We expect signal_pending(current), but another thread may 1814 * We expect signal_pending(current), but we might be the
1797 * have handled it for us already. 1815 * victim of a spurious wakeup as well.
1798 */ 1816 */
1817 if (!signal_pending(current)) {
1818 put_futex_key(fshared, &q.key);
1819 goto retry;
1820 }
1821
1799 ret = -ERESTARTSYS; 1822 ret = -ERESTARTSYS;
1800 if (!abs_time) 1823 if (!abs_time)
1801 goto out_put_key; 1824 goto out_put_key;
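
Illustration (not part of the diff): the kernel now retries the wait itself when it is woken without a pending signal, but userspace callers of FUTEX_WAIT must still treat the syscall as allowed to return early and re-check the futex word. A hedged sketch of such a loop; the helper name futex_wait_value() and the acquire load are choices made for the example, not interfaces from this patch:

#include <errno.h>
#include <stdint.h>
#include <unistd.h>
#include <linux/futex.h>
#include <sys/syscall.h>

/* Block until *uaddr is observed to differ from 'expected'. */
static int futex_wait_value(uint32_t *uaddr, uint32_t expected)
{
	for (;;) {
		if (__atomic_load_n(uaddr, __ATOMIC_ACQUIRE) != expected)
			return 0;
		if (syscall(SYS_futex, uaddr, FUTEX_WAIT, expected,
			    NULL, NULL, 0) == 0)
			continue;	/* woken, possibly spuriously: re-check */
		if (errno == EWOULDBLOCK)
			continue;	/* value changed before we slept */
		if (errno == EINTR)
			continue;	/* interrupted by a signal: retry */
		return -1;		/* EFAULT, EINVAL, ... */
	}
}
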
@@ -2102,11 +2125,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2102 * Unqueue the futex_q and determine which it was. 2125 * Unqueue the futex_q and determine which it was.
2103 */ 2126 */
2104 plist_del(&q->list, &q->list.plist); 2127 plist_del(&q->list, &q->list.plist);
2105 drop_futex_key_refs(&q->key);
2106 2128
2129 /* Handle spurious wakeups gracefully */
2130 ret = -EWOULDBLOCK;
2107 if (timeout && !timeout->task) 2131 if (timeout && !timeout->task)
2108 ret = -ETIMEDOUT; 2132 ret = -ETIMEDOUT;
2109 else 2133 else if (signal_pending(current))
2110 ret = -ERESTARTNOINTR; 2134 ret = -ERESTARTNOINTR;
2111 } 2135 }
2112 return ret; 2136 return ret;
@@ -2114,12 +2138,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2114 2138
2115/** 2139/**
2116 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2140 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2117 * @uaddr: the futex we initialyl wait on (non-pi) 2141 * @uaddr: the futex we initially wait on (non-pi)
2118 * @fshared: whether the futexes are shared (1) or not (0). They must be 2142 * @fshared: whether the futexes are shared (1) or not (0). They must be
2119 * the same type, no requeueing from private to shared, etc. 2143 * the same type, no requeueing from private to shared, etc.
2120 * @val: the expected value of uaddr 2144 * @val: the expected value of uaddr
2121 * @abs_time: absolute timeout 2145 * @abs_time: absolute timeout
2122 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. 2146 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
2123 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) 2147 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2124 * @uaddr2: the pi futex we will take prior to returning to user-space 2148 * @uaddr2: the pi futex we will take prior to returning to user-space
2125 * 2149 *
@@ -2246,7 +2270,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2246 res = fixup_owner(uaddr2, fshared, &q, !ret); 2270 res = fixup_owner(uaddr2, fshared, &q, !ret);
2247 /* 2271 /*
2248 * If fixup_owner() returned an error, propagate that. If it 2272 * If fixup_owner() returned an error, propagate that. If it
2249 * acquired the lock, clear our -ETIMEDOUT or -EINTR. 2273 * acquired the lock, clear -ETIMEDOUT or -EINTR.
2250 */ 2274 */
2251 if (res) 2275 if (res)
2252 ret = (res < 0) ? res : 0; 2276 ret = (res < 0) ? res : 0;
@@ -2302,9 +2326,9 @@ out:
2302 */ 2326 */
2303 2327
2304/** 2328/**
2305 * sys_set_robust_list - set the robust-futex list head of a task 2329 * sys_set_robust_list() - Set the robust-futex list head of a task
2306 * @head: pointer to the list-head 2330 * @head: pointer to the list-head
2307 * @len: length of the list-head, as userspace expects 2331 * @len: length of the list-head, as userspace expects
2308 */ 2332 */
2309SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, 2333SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2310 size_t, len) 2334 size_t, len)
@@ -2323,10 +2347,10 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2323} 2347}
2324 2348
2325/** 2349/**
2326 * sys_get_robust_list - get the robust-futex list head of a task 2350 * sys_get_robust_list() - Get the robust-futex list head of a task
2327 * @pid: pid of the process [zero for current task] 2351 * @pid: pid of the process [zero for current task]
2328 * @head_ptr: pointer to a list-head pointer, the kernel fills it in 2352 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
2329 * @len_ptr: pointer to a length field, the kernel fills in the header size 2353 * @len_ptr: pointer to a length field, the kernel fills in the header size
2330 */ 2354 */
2331SYSCALL_DEFINE3(get_robust_list, int, pid, 2355SYSCALL_DEFINE3(get_robust_list, int, pid,
2332 struct robust_list_head __user * __user *, head_ptr, 2356 struct robust_list_head __user * __user *, head_ptr,
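
Illustration (not part of the diff): these syscalls are normally issued by the threading library, which registers one robust list head per thread at startup; overriding that in a real program would break robust pthread mutexes. A minimal registration sketch using the raw syscall number, with the list head kept in a global purely for brevity:

#include <stdio.h>
#include <unistd.h>
#include <linux/futex.h>
#include <sys/syscall.h>

static struct robust_list_head head = {
	.list		 = { &head.list },	/* empty circular list */
	.futex_offset	 = 0,
	.list_op_pending = NULL,
};

int main(void)
{
	if (syscall(SYS_set_robust_list, &head, sizeof(head)) != 0) {
		perror("set_robust_list");
		return 1;
	}
	printf("robust list head registered at %p\n", (void *)&head);
	return 0;
}
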
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 22e9dcfaa3d3..70a298d6da71 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
34config GCOV_PROFILE_ALL 34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel" 35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL 36 depends on GCOV_KERNEL
37 depends on S390 || X86 37 depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
38 default n 38 default n
39 ---help--- 39 ---help---
40 This option activates profiling for the entire kernel. 40 This option activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 49da79ab8486..3e1c36e7998f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -48,36 +48,7 @@
48 48
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
50 50
51/** 51#include <trace/events/timer.h>
52 * ktime_get - get the monotonic time in ktime_t format
53 *
54 * returns the time in ktime_t format
55 */
56ktime_t ktime_get(void)
57{
58 struct timespec now;
59
60 ktime_get_ts(&now);
61
62 return timespec_to_ktime(now);
63}
64EXPORT_SYMBOL_GPL(ktime_get);
65
66/**
67 * ktime_get_real - get the real (wall-) time in ktime_t format
68 *
69 * returns the time in ktime_t format
70 */
71ktime_t ktime_get_real(void)
72{
73 struct timespec now;
74
75 getnstimeofday(&now);
76
77 return timespec_to_ktime(now);
78}
79
80EXPORT_SYMBOL_GPL(ktime_get_real);
81 52
82/* 53/*
83 * The timer bases: 54 * The timer bases:
@@ -106,31 +77,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
106 } 77 }
107}; 78};
108 79
109/**
110 * ktime_get_ts - get the monotonic clock in timespec format
111 * @ts: pointer to timespec variable
112 *
113 * The function calculates the monotonic clock from the realtime
114 * clock and the wall_to_monotonic offset and stores the result
115 * in normalized timespec format in the variable pointed to by @ts.
116 */
117void ktime_get_ts(struct timespec *ts)
118{
119 struct timespec tomono;
120 unsigned long seq;
121
122 do {
123 seq = read_seqbegin(&xtime_lock);
124 getnstimeofday(ts);
125 tomono = wall_to_monotonic;
126
127 } while (read_seqretry(&xtime_lock, seq));
128
129 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
130 ts->tv_nsec + tomono.tv_nsec);
131}
132EXPORT_SYMBOL_GPL(ktime_get_ts);
133
134/* 80/*
135 * Get the coarse grained time at the softirq based on xtime and 81 * Get the coarse grained time at the softirq based on xtime and
136 * wall_to_monotonic. 82 * wall_to_monotonic.
@@ -485,6 +431,7 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
485 debug_object_init_on_stack(timer, &hrtimer_debug_descr); 431 debug_object_init_on_stack(timer, &hrtimer_debug_descr);
486 __hrtimer_init(timer, clock_id, mode); 432 __hrtimer_init(timer, clock_id, mode);
487} 433}
434EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
488 435
489void destroy_hrtimer_on_stack(struct hrtimer *timer) 436void destroy_hrtimer_on_stack(struct hrtimer *timer)
490{ 437{
@@ -497,6 +444,26 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
497static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } 444static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
498#endif 445#endif
499 446
447static inline void
448debug_init(struct hrtimer *timer, clockid_t clockid,
449 enum hrtimer_mode mode)
450{
451 debug_hrtimer_init(timer);
452 trace_hrtimer_init(timer, clockid, mode);
453}
454
455static inline void debug_activate(struct hrtimer *timer)
456{
457 debug_hrtimer_activate(timer);
458 trace_hrtimer_start(timer);
459}
460
461static inline void debug_deactivate(struct hrtimer *timer)
462{
463 debug_hrtimer_deactivate(timer);
464 trace_hrtimer_cancel(timer);
465}
466
500/* High resolution timer related functions */ 467/* High resolution timer related functions */
501#ifdef CONFIG_HIGH_RES_TIMERS 468#ifdef CONFIG_HIGH_RES_TIMERS
502 469
@@ -542,13 +509,14 @@ static inline int hrtimer_hres_active(void)
542 * next event 509 * next event
543 * Called with interrupts disabled and base->lock held 510 * Called with interrupts disabled and base->lock held
544 */ 511 */
545static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) 512static void
513hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
546{ 514{
547 int i; 515 int i;
548 struct hrtimer_clock_base *base = cpu_base->clock_base; 516 struct hrtimer_clock_base *base = cpu_base->clock_base;
549 ktime_t expires; 517 ktime_t expires, expires_next;
550 518
551 cpu_base->expires_next.tv64 = KTIME_MAX; 519 expires_next.tv64 = KTIME_MAX;
552 520
553 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 521 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
554 struct hrtimer *timer; 522 struct hrtimer *timer;
@@ -564,10 +532,15 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
564 */ 532 */
565 if (expires.tv64 < 0) 533 if (expires.tv64 < 0)
566 expires.tv64 = 0; 534 expires.tv64 = 0;
567 if (expires.tv64 < cpu_base->expires_next.tv64) 535 if (expires.tv64 < expires_next.tv64)
568 cpu_base->expires_next = expires; 536 expires_next = expires;
569 } 537 }
570 538
539 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
540 return;
541
542 cpu_base->expires_next.tv64 = expires_next.tv64;
543
571 if (cpu_base->expires_next.tv64 != KTIME_MAX) 544 if (cpu_base->expires_next.tv64 != KTIME_MAX)
572 tick_program_event(cpu_base->expires_next, 1); 545 tick_program_event(cpu_base->expires_next, 1);
573} 546}
@@ -650,7 +623,7 @@ static void retrigger_next_event(void *arg)
650 base->clock_base[CLOCK_REALTIME].offset = 623 base->clock_base[CLOCK_REALTIME].offset =
651 timespec_to_ktime(realtime_offset); 624 timespec_to_ktime(realtime_offset);
652 625
653 hrtimer_force_reprogram(base); 626 hrtimer_force_reprogram(base, 0);
654 spin_unlock(&base->lock); 627 spin_unlock(&base->lock);
655} 628}
656 629
@@ -753,8 +726,6 @@ static int hrtimer_switch_to_hres(void)
753 /* "Retrigger" the interrupt to get things going */ 726 /* "Retrigger" the interrupt to get things going */
754 retrigger_next_event(NULL); 727 retrigger_next_event(NULL);
755 local_irq_restore(flags); 728 local_irq_restore(flags);
756 printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n",
757 smp_processor_id());
758 return 1; 729 return 1;
759} 730}
760 731
@@ -763,7 +734,8 @@ static int hrtimer_switch_to_hres(void)
763static inline int hrtimer_hres_active(void) { return 0; } 734static inline int hrtimer_hres_active(void) { return 0; }
764static inline int hrtimer_is_hres_enabled(void) { return 0; } 735static inline int hrtimer_is_hres_enabled(void) { return 0; }
765static inline int hrtimer_switch_to_hres(void) { return 0; } 736static inline int hrtimer_switch_to_hres(void) { return 0; }
766static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } 737static inline void
738hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
767static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 739static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
768 struct hrtimer_clock_base *base, 740 struct hrtimer_clock_base *base,
769 int wakeup) 741 int wakeup)
@@ -853,7 +825,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
853 struct hrtimer *entry; 825 struct hrtimer *entry;
854 int leftmost = 1; 826 int leftmost = 1;
855 827
856 debug_hrtimer_activate(timer); 828 debug_activate(timer);
857 829
858 /* 830 /*
859 * Find the right place in the rbtree: 831 * Find the right place in the rbtree:
@@ -906,19 +878,29 @@ static void __remove_hrtimer(struct hrtimer *timer,
906 struct hrtimer_clock_base *base, 878 struct hrtimer_clock_base *base,
907 unsigned long newstate, int reprogram) 879 unsigned long newstate, int reprogram)
908{ 880{
909 if (timer->state & HRTIMER_STATE_ENQUEUED) { 881 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
910 /* 882 goto out;
911 * Remove the timer from the rbtree and replace the 883
912 * first entry pointer if necessary. 884 /*
913 */ 885 * Remove the timer from the rbtree and replace the first
914 if (base->first == &timer->node) { 886 * entry pointer if necessary.
915 base->first = rb_next(&timer->node); 887 */
916 /* Reprogram the clock event device. if enabled */ 888 if (base->first == &timer->node) {
917 if (reprogram && hrtimer_hres_active()) 889 base->first = rb_next(&timer->node);
918 hrtimer_force_reprogram(base->cpu_base); 890#ifdef CONFIG_HIGH_RES_TIMERS
 891 /* Reprogram the clock event device, if enabled */
892 if (reprogram && hrtimer_hres_active()) {
893 ktime_t expires;
894
895 expires = ktime_sub(hrtimer_get_expires(timer),
896 base->offset);
897 if (base->cpu_base->expires_next.tv64 == expires.tv64)
898 hrtimer_force_reprogram(base->cpu_base, 1);
919 } 899 }
920 rb_erase(&timer->node, &base->active); 900#endif
921 } 901 }
902 rb_erase(&timer->node, &base->active);
903out:
922 timer->state = newstate; 904 timer->state = newstate;
923} 905}
924 906
@@ -939,7 +921,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
939 * reprogramming happens in the interrupt handler. This is a 921 * reprogramming happens in the interrupt handler. This is a
940 * rare case and less expensive than a smp call. 922 * rare case and less expensive than a smp call.
941 */ 923 */
942 debug_hrtimer_deactivate(timer); 924 debug_deactivate(timer);
943 timer_stats_hrtimer_clear_start_info(timer); 925 timer_stats_hrtimer_clear_start_info(timer);
944 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 926 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
945 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 927 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
@@ -1154,7 +1136,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1154 clock_id = CLOCK_MONOTONIC; 1136 clock_id = CLOCK_MONOTONIC;
1155 1137
1156 timer->base = &cpu_base->clock_base[clock_id]; 1138 timer->base = &cpu_base->clock_base[clock_id];
1157 INIT_LIST_HEAD(&timer->cb_entry);
1158 hrtimer_init_timer_hres(timer); 1139 hrtimer_init_timer_hres(timer);
1159 1140
1160#ifdef CONFIG_TIMER_STATS 1141#ifdef CONFIG_TIMER_STATS
@@ -1173,7 +1154,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1173void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, 1154void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1174 enum hrtimer_mode mode) 1155 enum hrtimer_mode mode)
1175{ 1156{
1176 debug_hrtimer_init(timer); 1157 debug_init(timer, clock_id, mode);
1177 __hrtimer_init(timer, clock_id, mode); 1158 __hrtimer_init(timer, clock_id, mode);
1178} 1159}
1179EXPORT_SYMBOL_GPL(hrtimer_init); 1160EXPORT_SYMBOL_GPL(hrtimer_init);
@@ -1197,7 +1178,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1197} 1178}
1198EXPORT_SYMBOL_GPL(hrtimer_get_res); 1179EXPORT_SYMBOL_GPL(hrtimer_get_res);
1199 1180
1200static void __run_hrtimer(struct hrtimer *timer) 1181static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1201{ 1182{
1202 struct hrtimer_clock_base *base = timer->base; 1183 struct hrtimer_clock_base *base = timer->base;
1203 struct hrtimer_cpu_base *cpu_base = base->cpu_base; 1184 struct hrtimer_cpu_base *cpu_base = base->cpu_base;
@@ -1206,7 +1187,7 @@ static void __run_hrtimer(struct hrtimer *timer)
1206 1187
1207 WARN_ON(!irqs_disabled()); 1188 WARN_ON(!irqs_disabled());
1208 1189
1209 debug_hrtimer_deactivate(timer); 1190 debug_deactivate(timer);
1210 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); 1191 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1211 timer_stats_account_hrtimer(timer); 1192 timer_stats_account_hrtimer(timer);
1212 fn = timer->function; 1193 fn = timer->function;
@@ -1217,7 +1198,9 @@ static void __run_hrtimer(struct hrtimer *timer)
1217 * the timer base. 1198 * the timer base.
1218 */ 1199 */
1219 spin_unlock(&cpu_base->lock); 1200 spin_unlock(&cpu_base->lock);
1201 trace_hrtimer_expire_entry(timer, now);
1220 restart = fn(timer); 1202 restart = fn(timer);
1203 trace_hrtimer_expire_exit(timer);
1221 spin_lock(&cpu_base->lock); 1204 spin_lock(&cpu_base->lock);
1222 1205
1223 /* 1206 /*
@@ -1328,7 +1311,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1328 break; 1311 break;
1329 } 1312 }
1330 1313
1331 __run_hrtimer(timer); 1314 __run_hrtimer(timer, &basenow);
1332 } 1315 }
1333 base++; 1316 base++;
1334 } 1317 }
@@ -1450,7 +1433,7 @@ void hrtimer_run_queues(void)
1450 hrtimer_get_expires_tv64(timer)) 1433 hrtimer_get_expires_tv64(timer))
1451 break; 1434 break;
1452 1435
1453 __run_hrtimer(timer); 1436 __run_hrtimer(timer, &base->softirq_time);
1454 } 1437 }
1455 spin_unlock(&cpu_base->lock); 1438 spin_unlock(&cpu_base->lock);
1456 } 1439 }
@@ -1477,6 +1460,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1477 sl->timer.function = hrtimer_wakeup; 1460 sl->timer.function = hrtimer_wakeup;
1478 sl->task = task; 1461 sl->task = task;
1479} 1462}
1463EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
1480 1464
1481static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1465static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
1482{ 1466{
@@ -1626,7 +1610,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1626 while ((node = rb_first(&old_base->active))) { 1610 while ((node = rb_first(&old_base->active))) {
1627 timer = rb_entry(node, struct hrtimer, node); 1611 timer = rb_entry(node, struct hrtimer, node);
1628 BUG_ON(hrtimer_callback_running(timer)); 1612 BUG_ON(hrtimer_callback_running(timer));
1629 debug_hrtimer_deactivate(timer); 1613 debug_deactivate(timer);
1630 1614
1631 /* 1615 /*
1632 * Mark it as STATE_MIGRATE not INACTIVE otherwise the 1616 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 022a4927b785..0c642d51aac2 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -144,7 +144,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
144 144
145 rcu_read_lock(); 145 rcu_read_lock();
146 do_each_thread(g, t) { 146 do_each_thread(g, t) {
147 if (!--max_count) 147 if (!max_count--)
148 goto unlock; 148 goto unlock;
149 if (!--batch_count) { 149 if (!--batch_count) {
150 batch_count = HUNG_TASK_BATCHING; 150 batch_count = HUNG_TASK_BATCHING;
@@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout)
171 * Process updating of timeout sysctl 171 * Process updating of timeout sysctl
172 */ 172 */
173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, 173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
174 struct file *filp, void __user *buffer, 174 void __user *buffer,
175 size_t *lenp, loff_t *ppos) 175 size_t *lenp, loff_t *ppos)
176{ 176{
177 int ret; 177 int ret;
178 178
179 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 179 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
180 180
181 if (ret || !write) 181 if (ret || !write)
182 goto out; 182 goto out;
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..cf5ee1628411
--- /dev/null
+++ b/kernel/hw_breakpoint.c
@@ -0,0 +1,423 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) IBM Corporation, 2009
18 * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Thanks to Ingo Molnar for his many suggestions.
21 *
22 * Authors: Alan Stern <stern@rowland.harvard.edu>
23 * K.Prasad <prasad@linux.vnet.ibm.com>
24 * Frederic Weisbecker <fweisbec@gmail.com>
25 */
26
27/*
28 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
29 * using the CPU's debug registers.
30 * This file contains the arch-independent routines.
31 */
32
33#include <linux/irqflags.h>
34#include <linux/kallsyms.h>
35#include <linux/notifier.h>
36#include <linux/kprobes.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/percpu.h>
41#include <linux/sched.h>
42#include <linux/init.h>
43#include <linux/smp.h>
44
45#include <linux/hw_breakpoint.h>
46
47/*
48 * Constraints data
49 */
50
51/* Number of pinned cpu breakpoints in a cpu */
52static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
53
54/* Number of pinned task breakpoints in a cpu */
55static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]);
56
57/* Number of non-pinned cpu/task breakpoints in a cpu */
58static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
59
60/* Gather the number of total pinned and un-pinned bp in a cpuset */
61struct bp_busy_slots {
62 unsigned int pinned;
63 unsigned int flexible;
64};
65
66/* Serialize accesses to the above constraints */
67static DEFINE_MUTEX(nr_bp_mutex);
68
69/*
70 * Report the maximum number of pinned breakpoints a task
 71 * can have on this cpu
72 */
73static unsigned int max_task_bp_pinned(int cpu)
74{
75 int i;
76 unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu);
77
78 for (i = HBP_NUM -1; i >= 0; i--) {
79 if (tsk_pinned[i] > 0)
80 return i + 1;
81 }
82
83 return 0;
84}
85
86/*
87 * Report the number of pinned/un-pinned breakpoints we have in
88 * a given cpu (cpu > -1) or in all of them (cpu = -1).
89 */
90static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
91{
92 if (cpu >= 0) {
93 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
94 slots->pinned += max_task_bp_pinned(cpu);
95 slots->flexible = per_cpu(nr_bp_flexible, cpu);
96
97 return;
98 }
99
100 for_each_online_cpu(cpu) {
101 unsigned int nr;
102
103 nr = per_cpu(nr_cpu_bp_pinned, cpu);
104 nr += max_task_bp_pinned(cpu);
105
106 if (nr > slots->pinned)
107 slots->pinned = nr;
108
109 nr = per_cpu(nr_bp_flexible, cpu);
110
111 if (nr > slots->flexible)
112 slots->flexible = nr;
113 }
114}
115
116/*
117 * Add a pinned breakpoint for the given task in our constraint table
118 */
119static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
120{
121 int count = 0;
122 struct perf_event *bp;
123 struct perf_event_context *ctx = tsk->perf_event_ctxp;
124 unsigned int *tsk_pinned;
125 struct list_head *list;
126 unsigned long flags;
127
128 if (WARN_ONCE(!ctx, "No perf context for this task"))
129 return;
130
131 list = &ctx->event_list;
132
133 spin_lock_irqsave(&ctx->lock, flags);
134
135 /*
136 * The current breakpoint counter is not included in the list
137 * at the open() callback time
138 */
139 list_for_each_entry(bp, list, event_entry) {
140 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
141 count++;
142 }
143
144 spin_unlock_irqrestore(&ctx->lock, flags);
145
146 if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list"))
147 return;
148
149 tsk_pinned = per_cpu(task_bp_pinned, cpu);
150 if (enable) {
151 tsk_pinned[count]++;
152 if (count > 0)
153 tsk_pinned[count-1]--;
154 } else {
155 tsk_pinned[count]--;
156 if (count > 0)
157 tsk_pinned[count-1]++;
158 }
159}
160
161/*
162 * Add/remove the given breakpoint in our constraint table
163 */
164static void toggle_bp_slot(struct perf_event *bp, bool enable)
165{
166 int cpu = bp->cpu;
167 struct task_struct *tsk = bp->ctx->task;
168
169 /* Pinned counter task profiling */
170 if (tsk) {
171 if (cpu >= 0) {
172 toggle_bp_task_slot(tsk, cpu, enable);
173 return;
174 }
175
176 for_each_online_cpu(cpu)
177 toggle_bp_task_slot(tsk, cpu, enable);
178 return;
179 }
180
181 /* Pinned counter cpu profiling */
182 if (enable)
183 per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
184 else
185 per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
186}
187
188/*
189 * Constraints to check before allowing this new breakpoint counter:
190 *
191 * == Non-pinned counter == (Considered as pinned for now)
192 *
193 * - If attached to a single cpu, check:
194 *
195 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
196 * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM
197 *
198 * -> If there are already non-pinned counters in this cpu, it means
199 * there is already a free slot for them.
200 * Otherwise, we check that the maximum number of per task
 201 * breakpoints (for this cpu) plus the number of per-cpu breakpoints
 202 * (for this cpu) doesn't cover every register.
203 *
 204 * - If attached to every cpu, check:
205 *
206 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
207 * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM
208 *
 209 * -> This is roughly the same, except that we check the number of per-cpu
 210 * breakpoints for every cpu and keep the maximum. Same for the per-task
 211 * breakpoints.
212 *
213 *
214 * == Pinned counter ==
215 *
216 * - If attached to a single cpu, check:
217 *
218 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
219 * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM
220 *
 221 * -> Same checks as before, but now nr_bp_flexible, if any, must keep
 222 * at least one register (or the flexible counters will never be fed).
223 *
 224 * - If attached to every cpu, check:
225 *
226 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
227 * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM
228 */
229int reserve_bp_slot(struct perf_event *bp)
230{
231 struct bp_busy_slots slots = {0};
232 int ret = 0;
233
234 mutex_lock(&nr_bp_mutex);
235
236 fetch_bp_busy_slots(&slots, bp->cpu);
237
238 /* Flexible counters need to keep at least one slot */
239 if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
240 ret = -ENOSPC;
241 goto end;
242 }
243
244 toggle_bp_slot(bp, true);
245
246end:
247 mutex_unlock(&nr_bp_mutex);
248
249 return ret;
250}
251
252void release_bp_slot(struct perf_event *bp)
253{
254 mutex_lock(&nr_bp_mutex);
255
256 toggle_bp_slot(bp, false);
257
258 mutex_unlock(&nr_bp_mutex);
259}
260
261
262int __register_perf_hw_breakpoint(struct perf_event *bp)
263{
264 int ret;
265
266 ret = reserve_bp_slot(bp);
267 if (ret)
268 return ret;
269
270 /*
271 * Ptrace breakpoints can be temporary perf events only
 272 * meant to reserve a slot. In that case they are created disabled and
 273 * we don't want to check the params right now (a null addr is used),
 274 * but perf tools create events as disabled and we do want to check
 275 * the params for them.
 276 * This is a quick hack that will be removed soon, once we remove
 277 * the tmp breakpoints from ptrace.
278 */
279 if (!bp->attr.disabled || bp->callback == perf_bp_event)
280 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
281
282 return ret;
283}
284
285int register_perf_hw_breakpoint(struct perf_event *bp)
286{
287 bp->callback = perf_bp_event;
288
289 return __register_perf_hw_breakpoint(bp);
290}
291
292/**
293 * register_user_hw_breakpoint - register a hardware breakpoint for user space
294 * @attr: breakpoint attributes
295 * @triggered: callback to trigger when we hit the breakpoint
296 * @tsk: pointer to 'task_struct' of the process to which the address belongs
297 */
298struct perf_event *
299register_user_hw_breakpoint(struct perf_event_attr *attr,
300 perf_callback_t triggered,
301 struct task_struct *tsk)
302{
303 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
304}
305EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
306
307/**
308 * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
309 * @bp: the breakpoint structure to modify
310 * @attr: new breakpoint attributes
311 * @triggered: callback to trigger when we hit the breakpoint
312 * @tsk: pointer to 'task_struct' of the process to which the address belongs
313 */
314struct perf_event *
315modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr,
316 perf_callback_t triggered,
317 struct task_struct *tsk)
318{
319 /*
320 * FIXME: do it without unregistering
321 * - We don't want to lose our slot
322 * - If the new bp is incorrect, don't lose the older one
323 */
324 unregister_hw_breakpoint(bp);
325
326 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
327}
328EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
329
330/**
331 * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
332 * @bp: the breakpoint structure to unregister
333 */
334void unregister_hw_breakpoint(struct perf_event *bp)
335{
336 if (!bp)
337 return;
338 perf_event_release_kernel(bp);
339}
340EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
341
342/**
343 * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
344 * @attr: breakpoint attributes
345 * @triggered: callback to trigger when we hit the breakpoint
346 *
347 * @return a set of per_cpu pointers to perf events
348 */
349struct perf_event **
350register_wide_hw_breakpoint(struct perf_event_attr *attr,
351 perf_callback_t triggered)
352{
353 struct perf_event **cpu_events, **pevent, *bp;
354 long err;
355 int cpu;
356
357 cpu_events = alloc_percpu(typeof(*cpu_events));
358 if (!cpu_events)
359 return ERR_PTR(-ENOMEM);
360
361 for_each_possible_cpu(cpu) {
362 pevent = per_cpu_ptr(cpu_events, cpu);
363 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
364
365 *pevent = bp;
366
367 if (IS_ERR(bp)) {
368 err = PTR_ERR(bp);
369 goto fail;
370 }
371 }
372
373 return cpu_events;
374
375fail:
376 for_each_possible_cpu(cpu) {
377 pevent = per_cpu_ptr(cpu_events, cpu);
378 if (IS_ERR(*pevent))
379 break;
380 unregister_hw_breakpoint(*pevent);
381 }
382 free_percpu(cpu_events);
383 /* return the error if any */
384 return ERR_PTR(err);
385}
386EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
387
388/**
389 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
390 * @cpu_events: the per cpu set of events to unregister
391 */
392void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
393{
394 int cpu;
395 struct perf_event **pevent;
396
397 for_each_possible_cpu(cpu) {
398 pevent = per_cpu_ptr(cpu_events, cpu);
399 unregister_hw_breakpoint(*pevent);
400 }
401 free_percpu(cpu_events);
402}
403EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
404
405static struct notifier_block hw_breakpoint_exceptions_nb = {
406 .notifier_call = hw_breakpoint_exceptions_notify,
407 /* we need to be notified first */
408 .priority = 0x7fffffff
409};
410
411static int __init init_hw_breakpoint(void)
412{
413 return register_die_notifier(&hw_breakpoint_exceptions_nb);
414}
415core_initcall(init_hw_breakpoint);
416
417
418struct pmu perf_ops_bp = {
419 .enable = arch_install_hw_breakpoint,
420 .disable = arch_uninstall_hw_breakpoint,
421 .read = hw_breakpoint_pmu_read,
422 .unthrottle = hw_breakpoint_pmu_unthrottle
423};
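The comment block at lines 188-228 of the new file describes the slot accounting that reserve_bp_slot() enforces. Below is a stand-alone model of that accounting for a single cpu (plain user-space C, not kernel code; HBP_NUM = 4 mirrors the four x86 debug registers, and all structure and function names are invented for illustration):

#include <stdio.h>
#include <stdbool.h>

#define HBP_NUM 4

struct cpu_slots {
	unsigned int cpu_pinned;		/* per-cpu pinned breakpoints */
	unsigned int task_pinned[HBP_NUM];	/* #tasks owning i+1 pinned bps */
	unsigned int flexible;			/* non-pinned breakpoints */
};

/* Mirrors max_task_bp_pinned(): highest per-task pinned count on this cpu */
static unsigned int max_task_pinned(const struct cpu_slots *s)
{
	int i;

	for (i = HBP_NUM - 1; i >= 0; i--)
		if (s->task_pinned[i] > 0)
			return i + 1;
	return 0;
}

/* Models reserve_bp_slot(): refuse when pinned use, plus one slot kept
 * back for flexible counters if any exist, would fill every register. */
static bool can_reserve(const struct cpu_slots *s)
{
	unsigned int pinned = s->cpu_pinned + max_task_pinned(s);

	return pinned + (s->flexible ? 1 : 0) < HBP_NUM;
}

int main(void)
{
	struct cpu_slots s = { .cpu_pinned = 2, .flexible = 1 };

	s.task_pinned[0] = 3;	/* three tasks each hold one pinned bp */
	/* max_task_pinned() is 1, so pinned = 3; with a flexible counter
	 * around, 3 + 1 reaches HBP_NUM and the reservation is refused */
	printf("reserve allowed: %s\n", can_reserve(&s) ? "yes" : "no");
	return 0;
}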
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c1660194d115..ba566c261adc 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -166,11 +166,11 @@ int set_irq_data(unsigned int irq, void *data)
166EXPORT_SYMBOL(set_irq_data); 166EXPORT_SYMBOL(set_irq_data);
167 167
168/** 168/**
169 * set_irq_data - set irq type data for an irq 169 * set_irq_msi - set MSI descriptor data for an irq
170 * @irq: Interrupt number 170 * @irq: Interrupt number
171 * @entry: Pointer to MSI descriptor data 171 * @entry: Pointer to MSI descriptor data
172 * 172 *
173 * Set the hardware irq controller data for an irq 173 * Set the MSI descriptor entry for an irq
174 */ 174 */
175int set_irq_msi(unsigned int irq, struct msi_desc *entry) 175int set_irq_msi(unsigned int irq, struct msi_desc *entry)
176{ 176{
@@ -590,7 +590,7 @@ out_unlock:
590} 590}
591 591
592/** 592/**
593 * handle_percpu_IRQ - Per CPU local irq handler 593 * handle_percpu_irq - Per CPU local irq handler
594 * @irq: the interrupt number 594 * @irq: the interrupt number
595 * @desc: the interrupt description structure for this irq 595 * @desc: the interrupt description structure for this irq
596 * 596 *
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a81cf80554db..17c71bb565c6 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/sched.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/random.h> 17#include <linux/random.h>
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 692363dd591f..0832145fea97 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -136,7 +136,7 @@ out:
136 136
137static int default_affinity_open(struct inode *inode, struct file *file) 137static int default_affinity_open(struct inode *inode, struct file *file)
138{ 138{
139 return single_open(file, default_affinity_show, NULL); 139 return single_open(file, default_affinity_show, PDE(inode)->data);
140} 140}
141 141
142static const struct file_operations default_affinity_proc_fops = { 142static const struct file_operations default_affinity_proc_fops = {
@@ -148,18 +148,28 @@ static const struct file_operations default_affinity_proc_fops = {
148}; 148};
149#endif 149#endif
150 150
151static int irq_spurious_read(char *page, char **start, off_t off, 151static int irq_spurious_proc_show(struct seq_file *m, void *v)
152 int count, int *eof, void *data)
153{ 152{
154 struct irq_desc *desc = irq_to_desc((long) data); 153 struct irq_desc *desc = irq_to_desc((long) m->private);
155 return sprintf(page, "count %u\n" 154
156 "unhandled %u\n" 155 seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n",
157 "last_unhandled %u ms\n", 156 desc->irq_count, desc->irqs_unhandled,
158 desc->irq_count, 157 jiffies_to_msecs(desc->last_unhandled));
159 desc->irqs_unhandled, 158 return 0;
160 jiffies_to_msecs(desc->last_unhandled)); 159}
160
161static int irq_spurious_proc_open(struct inode *inode, struct file *file)
162{
163 return single_open(file, irq_spurious_proc_show, NULL);
161} 164}
162 165
166static const struct file_operations irq_spurious_proc_fops = {
167 .open = irq_spurious_proc_open,
168 .read = seq_read,
169 .llseek = seq_lseek,
170 .release = single_release,
171};
172
163#define MAX_NAMELEN 128 173#define MAX_NAMELEN 128
164 174
165static int name_unique(unsigned int irq, struct irqaction *new_action) 175static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -204,7 +214,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
204void register_irq_proc(unsigned int irq, struct irq_desc *desc) 214void register_irq_proc(unsigned int irq, struct irq_desc *desc)
205{ 215{
206 char name [MAX_NAMELEN]; 216 char name [MAX_NAMELEN];
207 struct proc_dir_entry *entry;
208 217
209 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) 218 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
210 return; 219 return;
@@ -214,6 +223,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
214 223
215 /* create /proc/irq/1234 */ 224 /* create /proc/irq/1234 */
216 desc->dir = proc_mkdir(name, root_irq_dir); 225 desc->dir = proc_mkdir(name, root_irq_dir);
226 if (!desc->dir)
227 return;
217 228
218#ifdef CONFIG_SMP 229#ifdef CONFIG_SMP
219 /* create /proc/irq/<irq>/smp_affinity */ 230 /* create /proc/irq/<irq>/smp_affinity */
@@ -221,11 +232,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
221 &irq_affinity_proc_fops, (void *)(long)irq); 232 &irq_affinity_proc_fops, (void *)(long)irq);
222#endif 233#endif
223 234
224 entry = create_proc_entry("spurious", 0444, desc->dir); 235 proc_create_data("spurious", 0444, desc->dir,
225 if (entry) { 236 &irq_spurious_proc_fops, (void *)(long)irq);
226 entry->data = (void *)(long)irq;
227 entry->read_proc = irq_spurious_read;
228 }
229} 237}
230 238
231#undef MAX_NAMELEN 239#undef MAX_NAMELEN
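The irq/proc.c hunk above replaces the old read_proc-style irq_spurious_read() with the seq_file single_open() pattern registered through proc_create_data(). A minimal sketch of that pattern follows; the file name, show routine and payload are hypothetical, and the data is threaded through the same way default_affinity_open() in this file passes PDE(inode)->data to its show routine:

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
	/* ->private carries the pointer handed to single_open() */
	long cookie = (long)m->private;

	seq_printf(m, "cookie %ld\n", cookie);
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, PDE(inode)->data);
}

static const struct file_operations demo_proc_fops = {
	.open		= demo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init demo_init(void)
{
	proc_create_data("demo", 0444, NULL, &demo_proc_fops, (void *)42L);
	return 0;
}
module_init(demo_init);
MODULE_LICENSE("GPL");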
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 114e704760fe..22b0a6eedf24 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
104 return ok; 104 return ok;
105} 105}
106 106
107static void poll_all_shared_irqs(void) 107static void poll_spurious_irqs(unsigned long dummy)
108{ 108{
109 struct irq_desc *desc; 109 struct irq_desc *desc;
110 int i; 110 int i;
@@ -121,25 +121,15 @@ static void poll_all_shared_irqs(void)
121 if (!(status & IRQ_SPURIOUS_DISABLED)) 121 if (!(status & IRQ_SPURIOUS_DISABLED))
122 continue; 122 continue;
123 123
124 local_irq_disable();
124 try_one_irq(i, desc); 125 try_one_irq(i, desc);
126 local_irq_enable();
125 } 127 }
126}
127
128static void poll_spurious_irqs(unsigned long dummy)
129{
130 poll_all_shared_irqs();
131 128
132 mod_timer(&poll_spurious_irq_timer, 129 mod_timer(&poll_spurious_irq_timer,
133 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 130 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
134} 131}
135 132
136#ifdef CONFIG_DEBUG_SHIRQ
137void debug_poll_all_shared_irqs(void)
138{
139 poll_all_shared_irqs();
140}
141#endif
142
143/* 133/*
144 * If 99,900 of the previous 100,000 interrupts have not been handled 134 * If 99,900 of the previous 100,000 interrupts have not been handled
145 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 135 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 58762f7077ec..b03451ede528 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -12,6 +12,7 @@
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/posix-timers.h> 13#include <linux/posix-timers.h>
14#include <linux/hrtimer.h> 14#include <linux/hrtimer.h>
15#include <trace/events/timer.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17 18
@@ -41,10 +42,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
41 return ktime_to_timeval(rem); 42 return ktime_to_timeval(rem);
42} 43}
43 44
45static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
46 struct itimerval *const value)
47{
48 cputime_t cval, cinterval;
49 struct cpu_itimer *it = &tsk->signal->it[clock_id];
50
51 spin_lock_irq(&tsk->sighand->siglock);
52
53 cval = it->expires;
54 cinterval = it->incr;
55 if (!cputime_eq(cval, cputime_zero)) {
56 struct task_cputime cputime;
57 cputime_t t;
58
59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime_add(cputime.utime, cputime.stime);
62 else
63 /* CPUCLOCK_VIRT */
64 t = cputime.utime;
65
66 if (cputime_le(cval, t))
67 /* about to fire */
68 cval = cputime_one_jiffy;
69 else
70 cval = cputime_sub(cval, t);
71 }
72
73 spin_unlock_irq(&tsk->sighand->siglock);
74
75 cputime_to_timeval(cval, &value->it_value);
76 cputime_to_timeval(cinterval, &value->it_interval);
77}
78
44int do_getitimer(int which, struct itimerval *value) 79int do_getitimer(int which, struct itimerval *value)
45{ 80{
46 struct task_struct *tsk = current; 81 struct task_struct *tsk = current;
47 cputime_t cinterval, cval;
48 82
49 switch (which) { 83 switch (which) {
50 case ITIMER_REAL: 84 case ITIMER_REAL:
@@ -55,44 +89,10 @@ int do_getitimer(int which, struct itimerval *value)
55 spin_unlock_irq(&tsk->sighand->siglock); 89 spin_unlock_irq(&tsk->sighand->siglock);
56 break; 90 break;
57 case ITIMER_VIRTUAL: 91 case ITIMER_VIRTUAL:
58 spin_lock_irq(&tsk->sighand->siglock); 92 get_cpu_itimer(tsk, CPUCLOCK_VIRT, value);
59 cval = tsk->signal->it_virt_expires;
60 cinterval = tsk->signal->it_virt_incr;
61 if (!cputime_eq(cval, cputime_zero)) {
62 struct task_cputime cputime;
63 cputime_t utime;
64
65 thread_group_cputimer(tsk, &cputime);
66 utime = cputime.utime;
67 if (cputime_le(cval, utime)) { /* about to fire */
68 cval = jiffies_to_cputime(1);
69 } else {
70 cval = cputime_sub(cval, utime);
71 }
72 }
73 spin_unlock_irq(&tsk->sighand->siglock);
74 cputime_to_timeval(cval, &value->it_value);
75 cputime_to_timeval(cinterval, &value->it_interval);
76 break; 93 break;
77 case ITIMER_PROF: 94 case ITIMER_PROF:
78 spin_lock_irq(&tsk->sighand->siglock); 95 get_cpu_itimer(tsk, CPUCLOCK_PROF, value);
79 cval = tsk->signal->it_prof_expires;
80 cinterval = tsk->signal->it_prof_incr;
81 if (!cputime_eq(cval, cputime_zero)) {
82 struct task_cputime times;
83 cputime_t ptime;
84
85 thread_group_cputimer(tsk, &times);
86 ptime = cputime_add(times.utime, times.stime);
87 if (cputime_le(cval, ptime)) { /* about to fire */
88 cval = jiffies_to_cputime(1);
89 } else {
90 cval = cputime_sub(cval, ptime);
91 }
92 }
93 spin_unlock_irq(&tsk->sighand->siglock);
94 cputime_to_timeval(cval, &value->it_value);
95 cputime_to_timeval(cinterval, &value->it_interval);
96 break; 96 break;
97 default: 97 default:
98 return(-EINVAL); 98 return(-EINVAL);
@@ -123,11 +123,62 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
123 struct signal_struct *sig = 123 struct signal_struct *sig =
124 container_of(timer, struct signal_struct, real_timer); 124 container_of(timer, struct signal_struct, real_timer);
125 125
126 trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
126 kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); 127 kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
127 128
128 return HRTIMER_NORESTART; 129 return HRTIMER_NORESTART;
129} 130}
130 131
132static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
133{
134 struct timespec ts;
135 s64 cpu_ns;
136
137 cputime_to_timespec(ct, &ts);
138 cpu_ns = timespec_to_ns(&ts);
139
140 return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns;
141}
142
143static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
144 const struct itimerval *const value,
145 struct itimerval *const ovalue)
146{
147 cputime_t cval, nval, cinterval, ninterval;
148 s64 ns_ninterval, ns_nval;
149 struct cpu_itimer *it = &tsk->signal->it[clock_id];
150
151 nval = timeval_to_cputime(&value->it_value);
152 ns_nval = timeval_to_ns(&value->it_value);
153 ninterval = timeval_to_cputime(&value->it_interval);
154 ns_ninterval = timeval_to_ns(&value->it_interval);
155
156 it->incr_error = cputime_sub_ns(ninterval, ns_ninterval);
157 it->error = cputime_sub_ns(nval, ns_nval);
158
159 spin_lock_irq(&tsk->sighand->siglock);
160
161 cval = it->expires;
162 cinterval = it->incr;
163 if (!cputime_eq(cval, cputime_zero) ||
164 !cputime_eq(nval, cputime_zero)) {
165 if (cputime_gt(nval, cputime_zero))
166 nval = cputime_add(nval, cputime_one_jiffy);
167 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
168 }
169 it->expires = nval;
170 it->incr = ninterval;
171 trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
172 ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
173
174 spin_unlock_irq(&tsk->sighand->siglock);
175
176 if (ovalue) {
177 cputime_to_timeval(cval, &ovalue->it_value);
178 cputime_to_timeval(cinterval, &ovalue->it_interval);
179 }
180}
181
131/* 182/*
132 * Returns true if the timeval is in canonical form 183 * Returns true if the timeval is in canonical form
133 */ 184 */
@@ -139,7 +190,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
139 struct task_struct *tsk = current; 190 struct task_struct *tsk = current;
140 struct hrtimer *timer; 191 struct hrtimer *timer;
141 ktime_t expires; 192 ktime_t expires;
142 cputime_t cval, cinterval, nval, ninterval;
143 193
144 /* 194 /*
145 * Validate the timevals in value. 195 * Validate the timevals in value.
@@ -171,51 +221,14 @@ again:
171 } else 221 } else
172 tsk->signal->it_real_incr.tv64 = 0; 222 tsk->signal->it_real_incr.tv64 = 0;
173 223
224 trace_itimer_state(ITIMER_REAL, value, 0);
174 spin_unlock_irq(&tsk->sighand->siglock); 225 spin_unlock_irq(&tsk->sighand->siglock);
175 break; 226 break;
176 case ITIMER_VIRTUAL: 227 case ITIMER_VIRTUAL:
177 nval = timeval_to_cputime(&value->it_value); 228 set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue);
178 ninterval = timeval_to_cputime(&value->it_interval);
179 spin_lock_irq(&tsk->sighand->siglock);
180 cval = tsk->signal->it_virt_expires;
181 cinterval = tsk->signal->it_virt_incr;
182 if (!cputime_eq(cval, cputime_zero) ||
183 !cputime_eq(nval, cputime_zero)) {
184 if (cputime_gt(nval, cputime_zero))
185 nval = cputime_add(nval,
186 jiffies_to_cputime(1));
187 set_process_cpu_timer(tsk, CPUCLOCK_VIRT,
188 &nval, &cval);
189 }
190 tsk->signal->it_virt_expires = nval;
191 tsk->signal->it_virt_incr = ninterval;
192 spin_unlock_irq(&tsk->sighand->siglock);
193 if (ovalue) {
194 cputime_to_timeval(cval, &ovalue->it_value);
195 cputime_to_timeval(cinterval, &ovalue->it_interval);
196 }
197 break; 229 break;
198 case ITIMER_PROF: 230 case ITIMER_PROF:
199 nval = timeval_to_cputime(&value->it_value); 231 set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue);
200 ninterval = timeval_to_cputime(&value->it_interval);
201 spin_lock_irq(&tsk->sighand->siglock);
202 cval = tsk->signal->it_prof_expires;
203 cinterval = tsk->signal->it_prof_incr;
204 if (!cputime_eq(cval, cputime_zero) ||
205 !cputime_eq(nval, cputime_zero)) {
206 if (cputime_gt(nval, cputime_zero))
207 nval = cputime_add(nval,
208 jiffies_to_cputime(1));
209 set_process_cpu_timer(tsk, CPUCLOCK_PROF,
210 &nval, &cval);
211 }
212 tsk->signal->it_prof_expires = nval;
213 tsk->signal->it_prof_incr = ninterval;
214 spin_unlock_irq(&tsk->sighand->siglock);
215 if (ovalue) {
216 cputime_to_timeval(cval, &ovalue->it_value);
217 cputime_to_timeval(cinterval, &ovalue->it_interval);
218 }
219 break; 232 break;
220 default: 233 default:
221 return -EINVAL; 234 return -EINVAL;
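The new cputime_sub_ns() helper in the itimer.c hunk measures how much a requested interval grows when it is rounded to cputime (jiffy) granularity; set_cpu_itimer() stores that difference in it->error / it->incr_error so the expiry path can compensate for the rounding (the consumers live in posix-cpu-timers.c, also touched by this series). A stand-alone sketch of the arithmetic (user-space C, HZ assumed to be 250 and the interval chosen arbitrarily):

#include <stdio.h>

#define HZ 250				/* assumed for the example */
#define NSEC_PER_SEC 1000000000LL

int main(void)
{
	long long want_ns = 7300000;				/* 7.3 ms requested */
	long long jiffy_ns = NSEC_PER_SEC / HZ;			/* 4 ms per tick */
	long long ticks = (want_ns + jiffy_ns - 1) / jiffy_ns;	/* round up */
	long long granted_ns = ticks * jiffy_ns;

	/* this difference is what it->incr_error remembers for the interval */
	printf("requested %lld ns, granted %lld ns, error %lld ns\n",
	       want_ns, granted_ns, granted_ns - want_ns);
	return 0;
}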
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3a29dbe7898e..8e5288a8a355 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -59,7 +59,8 @@ static inline int is_kernel_inittext(unsigned long addr)
59 59
60static inline int is_kernel_text(unsigned long addr) 60static inline int is_kernel_text(unsigned long addr)
61{ 61{
62 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) 62 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
63 arch_is_kernel_text(addr))
63 return 1; 64 return 1;
64 return in_gate_area_no_task(addr); 65 return in_gate_area_no_task(addr);
65} 66}
@@ -180,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name)
180 } 181 }
181 return module_kallsyms_lookup_name(name); 182 return module_kallsyms_lookup_name(name);
182} 183}
184EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
183 185
184int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, 186int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
185 unsigned long), 187 unsigned long),
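The kallsyms.c hunk above exports kallsyms_lookup_name() to modules. A minimal module-side sketch of a lookup (the symbol queried is just an example):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kallsyms.h>

static int __init lookup_demo_init(void)
{
	unsigned long addr = kallsyms_lookup_name("jiffies");

	/* addr is 0 if the symbol is not found */
	pr_info("jiffies resolved to %p\n", (void *)addr);
	return 0;
}

static void __exit lookup_demo_exit(void)
{
}

module_init(lookup_demo_init);
module_exit(lookup_demo_exit);
MODULE_LICENSE("GPL");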
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 26539e3228e5..3765ff3c1bbe 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(kfifo_free);
117 * writer, you don't need extra locking to use these functions. 117 * writer, you don't need extra locking to use these functions.
118 */ 118 */
119unsigned int __kfifo_put(struct kfifo *fifo, 119unsigned int __kfifo_put(struct kfifo *fifo,
120 unsigned char *buffer, unsigned int len) 120 const unsigned char *buffer, unsigned int len)
121{ 121{
122 unsigned int l; 122 unsigned int l;
123 123
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 9147a3190c9d..7d7014634022 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -870,7 +870,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
870 870
871 /* 871 /*
872 * All threads that don't have debuggerinfo should be 872 * All threads that don't have debuggerinfo should be
873 * in __schedule() sleeping, since all other CPUs 873 * in schedule() sleeping, since all other CPUs
874 * are in kgdb_wait, and thus have debuggerinfo. 874 * are in kgdb_wait, and thus have debuggerinfo.
875 */ 875 */
876 if (local_debuggerinfo) { 876 if (local_debuggerinfo) {
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9fcb53a11f87..25b103190364 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -80,16 +80,16 @@ int __request_module(bool wait, const char *fmt, ...)
80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
81 static int kmod_loop_msg; 81 static int kmod_loop_msg;
82 82
83 ret = security_kernel_module_request();
84 if (ret)
85 return ret;
86
87 va_start(args, fmt); 83 va_start(args, fmt);
88 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 84 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
89 va_end(args); 85 va_end(args);
90 if (ret >= MODULE_NAME_LEN) 86 if (ret >= MODULE_NAME_LEN)
91 return -ENAMETOOLONG; 87 return -ENAMETOOLONG;
92 88
89 ret = security_kernel_module_request(module_name);
90 if (ret)
91 return ret;
92
93 /* If modprobe needs a service that is in a module, we get a recursive 93 /* If modprobe needs a service that is in a module, we get a recursive
94 * loop. Limit the number of running kmod threads to max_threads/2 or 94 * loop. Limit the number of running kmod threads to max_threads/2 or
95 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method 95 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ef177d653b2c..e5342a344c43 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
90 */ 90 */
91static struct kprobe_blackpoint kprobe_blacklist[] = { 91static struct kprobe_blackpoint kprobe_blacklist[] = {
92 {"preempt_schedule",}, 92 {"preempt_schedule",},
93 {"native_get_debugreg",},
94 {"irq_entries_start",},
95 {"common_interrupt",},
93 {NULL} /* Terminator */ 96 {NULL} /* Terminator */
94}; 97};
95 98
@@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
673 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 676 return (kprobe_opcode_t *)(((char *)addr) + p->offset);
674} 677}
675 678
679/* Check passed kprobe is valid and return kprobe in kprobe_table. */
680static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
681{
682 struct kprobe *old_p, *list_p;
683
684 old_p = get_kprobe(p->addr);
685 if (unlikely(!old_p))
686 return NULL;
687
688 if (p != old_p) {
689 list_for_each_entry_rcu(list_p, &old_p->list, list)
690 if (list_p == p)
691 /* kprobe p is a valid probe */
692 goto valid;
693 return NULL;
694 }
695valid:
696 return old_p;
697}
698
699/* Return error if the kprobe is being re-registered */
700static inline int check_kprobe_rereg(struct kprobe *p)
701{
702 int ret = 0;
703 struct kprobe *old_p;
704
705 mutex_lock(&kprobe_mutex);
706 old_p = __get_valid_kprobe(p);
707 if (old_p)
708 ret = -EINVAL;
709 mutex_unlock(&kprobe_mutex);
710 return ret;
711}
712
676int __kprobes register_kprobe(struct kprobe *p) 713int __kprobes register_kprobe(struct kprobe *p)
677{ 714{
678 int ret = 0; 715 int ret = 0;
@@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p)
685 return -EINVAL; 722 return -EINVAL;
686 p->addr = addr; 723 p->addr = addr;
687 724
725 ret = check_kprobe_rereg(p);
726 if (ret)
727 return ret;
728
688 preempt_disable(); 729 preempt_disable();
689 if (!kernel_text_address((unsigned long) p->addr) || 730 if (!kernel_text_address((unsigned long) p->addr) ||
690 in_kprobes_functions((unsigned long) p->addr)) { 731 in_kprobes_functions((unsigned long) p->addr)) {
@@ -754,26 +795,6 @@ out:
754} 795}
755EXPORT_SYMBOL_GPL(register_kprobe); 796EXPORT_SYMBOL_GPL(register_kprobe);
756 797
757/* Check passed kprobe is valid and return kprobe in kprobe_table. */
758static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
759{
760 struct kprobe *old_p, *list_p;
761
762 old_p = get_kprobe(p->addr);
763 if (unlikely(!old_p))
764 return NULL;
765
766 if (p != old_p) {
767 list_for_each_entry_rcu(list_p, &old_p->list, list)
768 if (list_p == p)
769 /* kprobe p is a valid probe */
770 goto valid;
771 return NULL;
772 }
773valid:
774 return old_p;
775}
776
777/* 798/*
778 * Unregister a kprobe without a scheduler synchronization. 799 * Unregister a kprobe without a scheduler synchronization.
779 */ 800 */
@@ -1014,9 +1035,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1014 /* Pre-allocate memory for max kretprobe instances */ 1035 /* Pre-allocate memory for max kretprobe instances */
1015 if (rp->maxactive <= 0) { 1036 if (rp->maxactive <= 0) {
1016#ifdef CONFIG_PREEMPT 1037#ifdef CONFIG_PREEMPT
1017 rp->maxactive = max(10, 2 * NR_CPUS); 1038 rp->maxactive = max(10, 2 * num_possible_cpus());
1018#else 1039#else
1019 rp->maxactive = NR_CPUS; 1040 rp->maxactive = num_possible_cpus();
1020#endif 1041#endif
1021 } 1042 }
1022 spin_lock_init(&rp->lock); 1043 spin_lock_init(&rp->lock);
@@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1141 arch_remove_kprobe(p); 1162 arch_remove_kprobe(p);
1142} 1163}
1143 1164
1165void __kprobes dump_kprobe(struct kprobe *kp)
1166{
1167 printk(KERN_WARNING "Dumping kprobe:\n");
1168 printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
1169 kp->symbol_name, kp->addr, kp->offset);
1170}
1171
1144/* Module notifier call back, checking kprobes on the module */ 1172/* Module notifier call back, checking kprobes on the module */
1145static int __kprobes kprobes_module_callback(struct notifier_block *nb, 1173static int __kprobes kprobes_module_callback(struct notifier_block *nb,
1146 unsigned long val, void *data) 1174 unsigned long val, void *data)
@@ -1321,7 +1349,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1321 return 0; 1349 return 0;
1322} 1350}
1323 1351
1324static struct seq_operations kprobes_seq_ops = { 1352static const struct seq_operations kprobes_seq_ops = {
1325 .start = kprobe_seq_start, 1353 .start = kprobe_seq_start,
1326 .next = kprobe_seq_next, 1354 .next = kprobe_seq_next,
1327 .stop = kprobe_seq_stop, 1355 .stop = kprobe_seq_stop,
@@ -1333,7 +1361,7 @@ static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
1333 return seq_open(filp, &kprobes_seq_ops); 1361 return seq_open(filp, &kprobes_seq_ops);
1334} 1362}
1335 1363
1336static struct file_operations debugfs_kprobes_operations = { 1364static const struct file_operations debugfs_kprobes_operations = {
1337 .open = kprobes_open, 1365 .open = kprobes_open,
1338 .read = seq_read, 1366 .read = seq_read,
1339 .llseek = seq_lseek, 1367 .llseek = seq_lseek,
@@ -1515,7 +1543,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
1515 return count; 1543 return count;
1516} 1544}
1517 1545
1518static struct file_operations fops_kp = { 1546static const struct file_operations fops_kp = {
1519 .read = read_enabled_file_bool, 1547 .read = read_enabled_file_bool,
1520 .write = write_enabled_file_bool, 1548 .write = write_enabled_file_bool,
1521}; 1549};
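With check_kprobe_rereg() added above, registering the same struct kprobe object a second time now fails cleanly with -EINVAL instead of corrupting the probe list. A minimal sketch; the probed symbol and handler are only illustrative:

#include <linux/module.h>
#include <linux/kprobes.h>

static int demo_pre(struct kprobe *p, struct pt_regs *regs)
{
	return 0;	/* just observe */
}

static struct kprobe demo_kp = {
	.symbol_name	= "do_fork",
	.pre_handler	= demo_pre,
};

static int __init demo_init(void)
{
	int ret = register_kprobe(&demo_kp);

	if (ret)
		return ret;
	/* second registration of the same object now reports -EINVAL */
	pr_info("second register_kprobe: %d\n", register_kprobe(&demo_kp));
	return 0;
}

static void __exit demo_exit(void)
{
	unregister_kprobe(&demo_kp);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");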
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5fe709982caa..ab7ae57773e1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -150,29 +150,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
150EXPORT_SYMBOL(kthread_create); 150EXPORT_SYMBOL(kthread_create);
151 151
152/** 152/**
153 * kthread_bind - bind a just-created kthread to a cpu.
154 * @k: thread created by kthread_create().
155 * @cpu: cpu (might not be online, must be possible) for @k to run on.
156 *
157 * Description: This function is equivalent to set_cpus_allowed(),
158 * except that @cpu doesn't need to be online, and the thread must be
159 * stopped (i.e., just returned from kthread_create()).
160 */
161void kthread_bind(struct task_struct *k, unsigned int cpu)
162{
163 /* Must have done schedule() in kthread() before we set_task_cpu */
164 if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) {
165 WARN_ON(1);
166 return;
167 }
168 set_task_cpu(k, cpu);
169 k->cpus_allowed = cpumask_of_cpu(cpu);
170 k->rt.nr_cpus_allowed = 1;
171 k->flags |= PF_THREAD_BOUND;
172}
173EXPORT_SYMBOL(kthread_bind);
174
175/**
176 * kthread_stop - stop a thread created by kthread_create(). 153 * kthread_stop - stop a thread created by kthread_create().
177 * @k: thread created by kthread_create(). 154 * @k: thread created by kthread_create().
178 * 155 *
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f74d2d7aa605..f5dcd36d3151 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -49,7 +49,7 @@
49#include "lockdep_internals.h" 49#include "lockdep_internals.h"
50 50
51#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
52#include <trace/events/lockdep.h> 52#include <trace/events/lock.h>
53 53
54#ifdef CONFIG_PROVE_LOCKING 54#ifdef CONFIG_PROVE_LOCKING
55int prove_locking = 1; 55int prove_locking = 1;
@@ -142,6 +142,11 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
142#ifdef CONFIG_LOCK_STAT 142#ifdef CONFIG_LOCK_STAT
143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
144 144
145static inline u64 lockstat_clock(void)
146{
147 return cpu_clock(smp_processor_id());
148}
149
145static int lock_point(unsigned long points[], unsigned long ip) 150static int lock_point(unsigned long points[], unsigned long ip)
146{ 151{
147 int i; 152 int i;
@@ -158,7 +163,7 @@ static int lock_point(unsigned long points[], unsigned long ip)
158 return i; 163 return i;
159} 164}
160 165
161static void lock_time_inc(struct lock_time *lt, s64 time) 166static void lock_time_inc(struct lock_time *lt, u64 time)
162{ 167{
163 if (time > lt->max) 168 if (time > lt->max)
164 lt->max = time; 169 lt->max = time;
@@ -234,12 +239,12 @@ static void put_lock_stats(struct lock_class_stats *stats)
234static void lock_release_holdtime(struct held_lock *hlock) 239static void lock_release_holdtime(struct held_lock *hlock)
235{ 240{
236 struct lock_class_stats *stats; 241 struct lock_class_stats *stats;
237 s64 holdtime; 242 u64 holdtime;
238 243
239 if (!lock_stat) 244 if (!lock_stat)
240 return; 245 return;
241 246
242 holdtime = sched_clock() - hlock->holdtime_stamp; 247 holdtime = lockstat_clock() - hlock->holdtime_stamp;
243 248
244 stats = get_lock_stats(hlock_class(hlock)); 249 stats = get_lock_stats(hlock_class(hlock));
245 if (hlock->read) 250 if (hlock->read)
@@ -578,6 +583,9 @@ static int static_obj(void *obj)
578 if ((addr >= start) && (addr < end)) 583 if ((addr >= start) && (addr < end))
579 return 1; 584 return 1;
580 585
586 if (arch_is_kernel_data(addr))
587 return 1;
588
581#ifdef CONFIG_SMP 589#ifdef CONFIG_SMP
582 /* 590 /*
583 * percpu var? 591 * percpu var?
@@ -2789,7 +2797,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2789 hlock->references = references; 2797 hlock->references = references;
2790#ifdef CONFIG_LOCK_STAT 2798#ifdef CONFIG_LOCK_STAT
2791 hlock->waittime_stamp = 0; 2799 hlock->waittime_stamp = 0;
2792 hlock->holdtime_stamp = sched_clock(); 2800 hlock->holdtime_stamp = lockstat_clock();
2793#endif 2801#endif
2794 2802
2795 if (check == 2 && !mark_irqflags(curr, hlock)) 2803 if (check == 2 && !mark_irqflags(curr, hlock))
@@ -3319,7 +3327,7 @@ found_it:
3319 if (hlock->instance != lock) 3327 if (hlock->instance != lock)
3320 return; 3328 return;
3321 3329
3322 hlock->waittime_stamp = sched_clock(); 3330 hlock->waittime_stamp = lockstat_clock();
3323 3331
3324 contention_point = lock_point(hlock_class(hlock)->contention_point, ip); 3332 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
3325 contending_point = lock_point(hlock_class(hlock)->contending_point, 3333 contending_point = lock_point(hlock_class(hlock)->contending_point,
@@ -3342,8 +3350,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3342 struct held_lock *hlock, *prev_hlock; 3350 struct held_lock *hlock, *prev_hlock;
3343 struct lock_class_stats *stats; 3351 struct lock_class_stats *stats;
3344 unsigned int depth; 3352 unsigned int depth;
3345 u64 now; 3353 u64 now, waittime = 0;
3346 s64 waittime = 0;
3347 int i, cpu; 3354 int i, cpu;
3348 3355
3349 depth = curr->lockdep_depth; 3356 depth = curr->lockdep_depth;
@@ -3371,7 +3378,7 @@ found_it:
3371 3378
3372 cpu = smp_processor_id(); 3379 cpu = smp_processor_id();
3373 if (hlock->waittime_stamp) { 3380 if (hlock->waittime_stamp) {
3374 now = sched_clock(); 3381 now = lockstat_clock();
3375 waittime = now - hlock->waittime_stamp; 3382 waittime = now - hlock->waittime_stamp;
3376 hlock->holdtime_stamp = now; 3383 hlock->holdtime_stamp = now;
3377 } 3384 }
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d4b3dbc79fdb..d4aba4f3584c 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -594,7 +594,7 @@ static int ls_show(struct seq_file *m, void *v)
594 return 0; 594 return 0;
595} 595}
596 596
597static struct seq_operations lockstat_ops = { 597static const struct seq_operations lockstat_ops = {
598 .start = ls_start, 598 .start = ls_start,
599 .next = ls_next, 599 .next = ls_next,
600 .stop = ls_stop, 600 .stop = ls_stop,
diff --git a/kernel/marker.c b/kernel/marker.c
deleted file mode 100644
index ea54f2647868..000000000000
--- a/kernel/marker.c
+++ /dev/null
@@ -1,930 +0,0 @@
1/*
2 * Copyright (C) 2007 Mathieu Desnoyers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/types.h>
21#include <linux/jhash.h>
22#include <linux/list.h>
23#include <linux/rcupdate.h>
24#include <linux/marker.h>
25#include <linux/err.h>
26#include <linux/slab.h>
27
28extern struct marker __start___markers[];
29extern struct marker __stop___markers[];
30
31/* Set to 1 to enable marker debug output */
32static const int marker_debug;
33
34/*
35 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
36 * and module markers and the hash table.
37 */
38static DEFINE_MUTEX(markers_mutex);
39
40/*
41 * Marker hash table, containing the active markers.
42 * Protected by module_mutex.
43 */
44#define MARKER_HASH_BITS 6
45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
46static struct hlist_head marker_table[MARKER_TABLE_SIZE];
47
48/*
49 * Note about RCU :
50 * It is used to make sure every handler has finished using its private data
51 * between two consecutive operation (add or remove) on a given marker. It is
52 * also used to delay the free of multiple probes array until a quiescent state
53 * is reached.
54 * marker entries modifications are protected by the markers_mutex.
55 */
56struct marker_entry {
57 struct hlist_node hlist;
58 char *format;
59 /* Probe wrapper */
60 void (*call)(const struct marker *mdata, void *call_private, ...);
61 struct marker_probe_closure single;
62 struct marker_probe_closure *multi;
63 int refcount; /* Number of times armed. 0 if disarmed. */
64 struct rcu_head rcu;
65 void *oldptr;
66 int rcu_pending;
67 unsigned char ptype:1;
68 unsigned char format_allocated:1;
69 char name[0]; /* Contains name'\0'format'\0' */
70};
71
72/**
73 * __mark_empty_function - Empty probe callback
74 * @probe_private: probe private data
75 * @call_private: call site private data
76 * @fmt: format string
77 * @...: variable argument list
78 *
79 * Empty callback provided as a probe to the markers. By providing this to a
80 * disabled marker, we make sure the execution flow is always valid even
81 * though the function pointer change and the marker enabling are two distinct
82 * operations that modifies the execution flow of preemptible code.
83 */
84notrace void __mark_empty_function(void *probe_private, void *call_private,
85 const char *fmt, va_list *args)
86{
87}
88EXPORT_SYMBOL_GPL(__mark_empty_function);
89
90/*
91 * marker_probe_cb Callback that prepares the variable argument list for probes.
92 * @mdata: pointer of type struct marker
93 * @call_private: caller site private data
94 * @...: Variable argument list.
95 *
96 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read.
99 */
100notrace void marker_probe_cb(const struct marker *mdata,
101 void *call_private, ...)
102{
103 va_list args;
104 char ptype;
105
106 /*
107 * rcu_read_lock_sched does two things : disabling preemption to make
108 * sure the teardown of the callbacks can be done correctly when they
109 * are in modules and they insure RCU read coherency.
110 */
111 rcu_read_lock_sched_notrace();
112 ptype = mdata->ptype;
113 if (likely(!ptype)) {
114 marker_probe_func *func;
115 /* Must read the ptype before ptr. They are not data dependant,
116 * so we put an explicit smp_rmb() here. */
117 smp_rmb();
118 func = mdata->single.func;
119 /* Must read the ptr before private data. They are not data
120 * dependant, so we put an explicit smp_rmb() here. */
121 smp_rmb();
122 va_start(args, call_private);
123 func(mdata->single.probe_private, call_private, mdata->format,
124 &args);
125 va_end(args);
126 } else {
127 struct marker_probe_closure *multi;
128 int i;
129 /*
130 * Read mdata->ptype before mdata->multi.
131 */
132 smp_rmb();
133 multi = mdata->multi;
134 /*
135 * multi points to an array, therefore accessing the array
136 * depends on reading multi. However, even in this case,
137 * we must insure that the pointer is read _before_ the array
138 * data. Same as rcu_dereference, but we need a full smp_rmb()
139 * in the fast path, so put the explicit barrier here.
140 */
141 smp_read_barrier_depends();
142 for (i = 0; multi[i].func; i++) {
143 va_start(args, call_private);
144 multi[i].func(multi[i].probe_private, call_private,
145 mdata->format, &args);
146 va_end(args);
147 }
148 }
149 rcu_read_unlock_sched_notrace();
150}
151EXPORT_SYMBOL_GPL(marker_probe_cb);
152
153/*
154 * marker_probe_cb Callback that does not prepare the variable argument list.
155 * @mdata: pointer of type struct marker
156 * @call_private: caller site private data
157 * @...: Variable argument list.
158 *
159 * Should be connected to markers "MARK_NOARGS".
160 */
161static notrace void marker_probe_cb_noarg(const struct marker *mdata,
162 void *call_private, ...)
163{
164 va_list args; /* not initialized */
165 char ptype;
166
167 rcu_read_lock_sched_notrace();
168 ptype = mdata->ptype;
169 if (likely(!ptype)) {
170 marker_probe_func *func;
171 /* Must read the ptype before ptr. They are not data dependant,
172 * so we put an explicit smp_rmb() here. */
173 smp_rmb();
174 func = mdata->single.func;
175 /* Must read the ptr before private data. They are not data
176 * dependant, so we put an explicit smp_rmb() here. */
177 smp_rmb();
178 func(mdata->single.probe_private, call_private, mdata->format,
179 &args);
180 } else {
181 struct marker_probe_closure *multi;
182 int i;
183 /*
184 * Read mdata->ptype before mdata->multi.
185 */
186 smp_rmb();
187 multi = mdata->multi;
188 /*
189 * multi points to an array, therefore accessing the array
190 * depends on reading multi. However, even in this case,
191 * we must insure that the pointer is read _before_ the array
192 * data. Same as rcu_dereference, but we need a full smp_rmb()
193 * in the fast path, so put the explicit barrier here.
194 */
195 smp_read_barrier_depends();
196 for (i = 0; multi[i].func; i++)
197 multi[i].func(multi[i].probe_private, call_private,
198 mdata->format, &args);
199 }
200 rcu_read_unlock_sched_notrace();
201}
202
203static void free_old_closure(struct rcu_head *head)
204{
205 struct marker_entry *entry = container_of(head,
206 struct marker_entry, rcu);
207 kfree(entry->oldptr);
208 /* Make sure we free the data before setting the pending flag to 0 */
209 smp_wmb();
210 entry->rcu_pending = 0;
211}
212
213static void debug_print_probes(struct marker_entry *entry)
214{
215 int i;
216
217 if (!marker_debug)
218 return;
219
220 if (!entry->ptype) {
221 printk(KERN_DEBUG "Single probe : %p %p\n",
222 entry->single.func,
223 entry->single.probe_private);
224 } else {
225 for (i = 0; entry->multi[i].func; i++)
226 printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
227 entry->multi[i].func,
228 entry->multi[i].probe_private);
229 }
230}
231
232static struct marker_probe_closure *
233marker_entry_add_probe(struct marker_entry *entry,
234 marker_probe_func *probe, void *probe_private)
235{
236 int nr_probes = 0;
237 struct marker_probe_closure *old, *new;
238
239 WARN_ON(!probe);
240
241 debug_print_probes(entry);
242 old = entry->multi;
243 if (!entry->ptype) {
244 if (entry->single.func == probe &&
245 entry->single.probe_private == probe_private)
246 return ERR_PTR(-EBUSY);
247 if (entry->single.func == __mark_empty_function) {
248 /* 0 -> 1 probes */
249 entry->single.func = probe;
250 entry->single.probe_private = probe_private;
251 entry->refcount = 1;
252 entry->ptype = 0;
253 debug_print_probes(entry);
254 return NULL;
255 } else {
256 /* 1 -> 2 probes */
257 nr_probes = 1;
258 old = NULL;
259 }
260 } else {
261 /* (N -> N+1), (N != 0, 1) probes */
262 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
263 if (old[nr_probes].func == probe
264 && old[nr_probes].probe_private
265 == probe_private)
266 return ERR_PTR(-EBUSY);
267 }
268 /* + 2 : one for new probe, one for NULL func */
269 new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
270 GFP_KERNEL);
271 if (new == NULL)
272 return ERR_PTR(-ENOMEM);
273 if (!old)
274 new[0] = entry->single;
275 else
276 memcpy(new, old,
277 nr_probes * sizeof(struct marker_probe_closure));
278 new[nr_probes].func = probe;
279 new[nr_probes].probe_private = probe_private;
280 entry->refcount = nr_probes + 1;
281 entry->multi = new;
282 entry->ptype = 1;
283 debug_print_probes(entry);
284 return old;
285}
286
287static struct marker_probe_closure *
288marker_entry_remove_probe(struct marker_entry *entry,
289 marker_probe_func *probe, void *probe_private)
290{
291 int nr_probes = 0, nr_del = 0, i;
292 struct marker_probe_closure *old, *new;
293
294 old = entry->multi;
295
296 debug_print_probes(entry);
297 if (!entry->ptype) {
298 /* 0 -> N is an error */
299 WARN_ON(entry->single.func == __mark_empty_function);
300 /* 1 -> 0 probes */
301 WARN_ON(probe && entry->single.func != probe);
302 WARN_ON(entry->single.probe_private != probe_private);
303 entry->single.func = __mark_empty_function;
304 entry->refcount = 0;
305 entry->ptype = 0;
306 debug_print_probes(entry);
307 return NULL;
308 } else {
309 /* (N -> M), (N > 1, M >= 0) probes */
310 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
311 if ((!probe || old[nr_probes].func == probe)
312 && old[nr_probes].probe_private
313 == probe_private)
314 nr_del++;
315 }
316 }
317
318 if (nr_probes - nr_del == 0) {
319 /* N -> 0, (N > 1) */
320 entry->single.func = __mark_empty_function;
321 entry->refcount = 0;
322 entry->ptype = 0;
323 } else if (nr_probes - nr_del == 1) {
324 /* N -> 1, (N > 1) */
325 for (i = 0; old[i].func; i++)
326 if ((probe && old[i].func != probe) ||
327 old[i].probe_private != probe_private)
328 entry->single = old[i];
329 entry->refcount = 1;
330 entry->ptype = 0;
331 } else {
332 int j = 0;
333 /* N -> M, (N > 1, M > 1) */
334 /* + 1 for NULL */
335 new = kzalloc((nr_probes - nr_del + 1)
336 * sizeof(struct marker_probe_closure), GFP_KERNEL);
337 if (new == NULL)
338 return ERR_PTR(-ENOMEM);
339 for (i = 0; old[i].func; i++)
340 if ((probe && old[i].func != probe) ||
341 old[i].probe_private != probe_private)
342 new[j++] = old[i];
343 entry->refcount = nr_probes - nr_del;
344 entry->ptype = 1;
345 entry->multi = new;
346 }
347 debug_print_probes(entry);
348 return old;
349}
350
351/*
352 * Get marker if the marker is present in the marker hash table.
353 * Must be called with markers_mutex held.
354 * Returns NULL if not present.
355 */
356static struct marker_entry *get_marker(const char *name)
357{
358 struct hlist_head *head;
359 struct hlist_node *node;
360 struct marker_entry *e;
361 u32 hash = jhash(name, strlen(name), 0);
362
363 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
364 hlist_for_each_entry(e, node, head, hlist) {
365 if (!strcmp(name, e->name))
366 return e;
367 }
368 return NULL;
369}
370
371/*
372 * Add the marker to the marker hash table. Must be called with markers_mutex
373 * held.
374 */
375static struct marker_entry *add_marker(const char *name, const char *format)
376{
377 struct hlist_head *head;
378 struct hlist_node *node;
379 struct marker_entry *e;
380 size_t name_len = strlen(name) + 1;
381 size_t format_len = 0;
382 u32 hash = jhash(name, name_len-1, 0);
383
384 if (format)
385 format_len = strlen(format) + 1;
386 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
387 hlist_for_each_entry(e, node, head, hlist) {
388 if (!strcmp(name, e->name)) {
389 printk(KERN_NOTICE
390 "Marker %s busy\n", name);
391 return ERR_PTR(-EBUSY); /* Already there */
392 }
393 }
394 /*
395 * Using kmalloc here to allocate a variable length element. Could
396 * cause some memory fragmentation if overused.
397 */
398 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
399 GFP_KERNEL);
400 if (!e)
401 return ERR_PTR(-ENOMEM);
402 memcpy(&e->name[0], name, name_len);
403 if (format) {
404 e->format = &e->name[name_len];
405 memcpy(e->format, format, format_len);
406 if (strcmp(e->format, MARK_NOARGS) == 0)
407 e->call = marker_probe_cb_noarg;
408 else
409 e->call = marker_probe_cb;
410 trace_mark(core_marker_format, "name %s format %s",
411 e->name, e->format);
412 } else {
413 e->format = NULL;
414 e->call = marker_probe_cb;
415 }
416 e->single.func = __mark_empty_function;
417 e->single.probe_private = NULL;
418 e->multi = NULL;
419 e->ptype = 0;
420 e->format_allocated = 0;
421 e->refcount = 0;
422 e->rcu_pending = 0;
423 hlist_add_head(&e->hlist, head);
424 return e;
425}
426
427/*
428 * Remove the marker from the marker hash table. Must be called with mutex_lock
429 * held.
430 */
431static int remove_marker(const char *name)
432{
433 struct hlist_head *head;
434 struct hlist_node *node;
435 struct marker_entry *e;
436 int found = 0;
437 size_t len = strlen(name) + 1;
438 u32 hash = jhash(name, len-1, 0);
439
440 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
441 hlist_for_each_entry(e, node, head, hlist) {
442 if (!strcmp(name, e->name)) {
443 found = 1;
444 break;
445 }
446 }
447 if (!found)
448 return -ENOENT;
449 if (e->single.func != __mark_empty_function)
450 return -EBUSY;
451 hlist_del(&e->hlist);
452 if (e->format_allocated)
453 kfree(e->format);
454 /* Make sure the call_rcu has been executed */
455 if (e->rcu_pending)
456 rcu_barrier_sched();
457 kfree(e);
458 return 0;
459}
460
461/*
462 * Set the mark_entry format to the format found in the element.
463 */
464static int marker_set_format(struct marker_entry *entry, const char *format)
465{
466 entry->format = kstrdup(format, GFP_KERNEL);
467 if (!entry->format)
468 return -ENOMEM;
469 entry->format_allocated = 1;
470
471 trace_mark(core_marker_format, "name %s format %s",
472 entry->name, entry->format);
473 return 0;
474}
475
476/*
477 * Sets the probe callback corresponding to one marker.
478 */
479static int set_marker(struct marker_entry *entry, struct marker *elem,
480 int active)
481{
482 int ret = 0;
483 WARN_ON(strcmp(entry->name, elem->name) != 0);
484
485 if (entry->format) {
486 if (strcmp(entry->format, elem->format) != 0) {
487 printk(KERN_NOTICE
488 "Format mismatch for probe %s "
489 "(%s), marker (%s)\n",
490 entry->name,
491 entry->format,
492 elem->format);
493 return -EPERM;
494 }
495 } else {
496 ret = marker_set_format(entry, elem->format);
497 if (ret)
498 return ret;
499 }
500
501 /*
502 * probe_cb setup (statically known) is done here. It is
503 * asynchronous with the rest of execution, therefore we only
504 * pass from a "safe" callback (with argument) to an "unsafe"
505 * callback (does not set arguments).
506 */
507 elem->call = entry->call;
508 /*
509 * Sanity check :
510 * We only update the single probe private data when the ptr is
511 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
512 */
513 WARN_ON(elem->single.func != __mark_empty_function
514 && elem->single.probe_private != entry->single.probe_private
515 && !elem->ptype);
516 elem->single.probe_private = entry->single.probe_private;
517 /*
518 * Make sure the private data is valid when we update the
519 * single probe ptr.
520 */
521 smp_wmb();
522 elem->single.func = entry->single.func;
523 /*
524 * We also make sure that the new probe callbacks array is consistent
525 * before setting a pointer to it.
526 */
527 rcu_assign_pointer(elem->multi, entry->multi);
528 /*
529 * Update the function or multi probe array pointer before setting the
530 * ptype.
531 */
532 smp_wmb();
533 elem->ptype = entry->ptype;
534
535 if (elem->tp_name && (active ^ elem->state)) {
536 WARN_ON(!elem->tp_cb);
537 /*
538 * It is ok to directly call the probe registration because type
539 * checking has been done in the __trace_mark_tp() macro.
540 */
541
542 if (active) {
543 /*
544 * try_module_get should always succeed because we hold
545 * lock_module() to get the tp_cb address.
546 */
547 ret = try_module_get(__module_text_address(
548 (unsigned long)elem->tp_cb));
549 BUG_ON(!ret);
550 ret = tracepoint_probe_register_noupdate(
551 elem->tp_name,
552 elem->tp_cb);
553 } else {
554 ret = tracepoint_probe_unregister_noupdate(
555 elem->tp_name,
556 elem->tp_cb);
557 /*
558 * tracepoint_probe_update_all() must be called
559 * before the module containing tp_cb is unloaded.
560 */
561 module_put(__module_text_address(
562 (unsigned long)elem->tp_cb));
563 }
564 }
565 elem->state = active;
566
567 return ret;
568}
569
570/*
571 * Disable a marker and its probe callback.
572 * Note: only waiting an RCU grace period after setting elem->call to the empty
573 * function ensures that the original callback is not used anymore. This is
574 * ensured by rcu_read_lock_sched() around the call site.
575 */
576static void disable_marker(struct marker *elem)
577{
578 int ret;
579
580 /* leave "call" as is. It is known statically. */
581 if (elem->tp_name && elem->state) {
582 WARN_ON(!elem->tp_cb);
583 /*
584 * It is ok to directly call the probe registration because type
585 * checking has been done in the __trace_mark_tp() macro.
586 */
587 ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
588 elem->tp_cb);
589 WARN_ON(ret);
590 /*
591 * tracepoint_probe_update_all() must be called
592 * before the module containing tp_cb is unloaded.
593 */
594 module_put(__module_text_address((unsigned long)elem->tp_cb));
595 }
596 elem->state = 0;
597 elem->single.func = __mark_empty_function;
598 /* Update the function before setting the ptype */
599 smp_wmb();
600 elem->ptype = 0; /* single probe */
601 /*
602 * Leave the private data and id there, because removal is racy and
603 * should be done only after an RCU period. These are never used until
604 * the next initialization anyway.
605 */
606}
607
608/**
609 * marker_update_probe_range - Update a probe range
610 * @begin: beginning of the range
611 * @end: end of the range
612 *
613 * Updates the probe callback corresponding to a range of markers.
614 */
615void marker_update_probe_range(struct marker *begin,
616 struct marker *end)
617{
618 struct marker *iter;
619 struct marker_entry *mark_entry;
620
621 mutex_lock(&markers_mutex);
622 for (iter = begin; iter < end; iter++) {
623 mark_entry = get_marker(iter->name);
624 if (mark_entry) {
625 set_marker(mark_entry, iter, !!mark_entry->refcount);
626 /*
627 * ignore error, continue
628 */
629 } else {
630 disable_marker(iter);
631 }
632 }
633 mutex_unlock(&markers_mutex);
634}
635
636/*
637 * Update probes, removing the faulty probes.
638 *
639 * The internal callback is only changed before the first probe is connected to it.
640 * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
641 * transitions. All other transitions will leave the old private data valid.
642 * This makes the non-atomicity of the callback/private data updates valid.
643 *
644 * "special case" updates :
645 * 0 -> 1 callback
646 * 1 -> 0 callback
647 * 1 -> 2 callbacks
648 * 2 -> 1 callbacks
649 * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
650 * Side effect: marker_set_format may delete the marker entry (creating a
651 * replacement).
652 */
653static void marker_update_probes(void)
654{
655 /* Core kernel markers */
656 marker_update_probe_range(__start___markers, __stop___markers);
657 /* Markers in modules. */
658 module_update_markers();
659 tracepoint_probe_update_all();
660}
661
662/**
663 * marker_probe_register - Connect a probe to a marker
664 * @name: marker name
665 * @format: format string
666 * @probe: probe handler
667 * @probe_private: probe private data
668 *
669 * The probe private data must be a valid allocated memory address, or NULL.
670 * Returns 0 on success, or an error value on failure.
671 * The probe address must be aligned on at least the architecture pointer size.
672 */
673int marker_probe_register(const char *name, const char *format,
674 marker_probe_func *probe, void *probe_private)
675{
676 struct marker_entry *entry;
677 int ret = 0;
678 struct marker_probe_closure *old;
679
680 mutex_lock(&markers_mutex);
681 entry = get_marker(name);
682 if (!entry) {
683 entry = add_marker(name, format);
684 if (IS_ERR(entry))
685 ret = PTR_ERR(entry);
686 } else if (format) {
687 if (!entry->format)
688 ret = marker_set_format(entry, format);
689 else if (strcmp(entry->format, format))
690 ret = -EPERM;
691 }
692 if (ret)
693 goto end;
694
695 /*
696 * If we detect that a call_rcu is pending for this marker,
697 * make sure it's executed now.
698 */
699 if (entry->rcu_pending)
700 rcu_barrier_sched();
701 old = marker_entry_add_probe(entry, probe, probe_private);
702 if (IS_ERR(old)) {
703 ret = PTR_ERR(old);
704 goto end;
705 }
706 mutex_unlock(&markers_mutex);
707 marker_update_probes();
708 mutex_lock(&markers_mutex);
709 entry = get_marker(name);
710 if (!entry)
711 goto end;
712 if (entry->rcu_pending)
713 rcu_barrier_sched();
714 entry->oldptr = old;
715 entry->rcu_pending = 1;
716 /* write rcu_pending before calling the RCU callback */
717 smp_wmb();
718 call_rcu_sched(&entry->rcu, free_old_closure);
719end:
720 mutex_unlock(&markers_mutex);
721 return ret;
722}
723EXPORT_SYMBOL_GPL(marker_probe_register);
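/*
 * A minimal usage sketch for the registration API above. The names are
 * hypothetical ("subsys_event", its "value %d" format and the probe
 * functions below are illustrative only); the probe signature is assumed
 * to follow marker_probe_func from include/linux/marker.h of this era.
 */
static void probe_subsys_event(void *probe_private, void *call_private,
			       const char *fmt, va_list *args)
{
	/* Decode the arguments described by fmt, e.g. with vscnprintf(). */
}

static int __init subsys_probe_init(void)
{
	/* The format must match the trace_mark() site, or -EPERM is returned. */
	return marker_probe_register("subsys_event", "value %d",
				     probe_subsys_event, NULL);
}

static void __exit subsys_probe_exit(void)
{
	marker_probe_unregister("subsys_event", probe_subsys_event, NULL);
}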
724
725/**
726 * marker_probe_unregister - Disconnect a probe from a marker
727 * @name: marker name
728 * @probe: probe function pointer
729 * @probe_private: probe private data
730 *
731 * Returns 0 on success, or -ENOENT if the marker is not found.
732 * We do not need to call synchronize_sched() to make sure the probes have
733 * finished running before doing a module unload, because the module unload
734 * itself uses stop_machine(), which ensures that every preempt-disabled
735 * section has finished.
736 */
737int marker_probe_unregister(const char *name,
738 marker_probe_func *probe, void *probe_private)
739{
740 struct marker_entry *entry;
741 struct marker_probe_closure *old;
742 int ret = -ENOENT;
743
744 mutex_lock(&markers_mutex);
745 entry = get_marker(name);
746 if (!entry)
747 goto end;
748 if (entry->rcu_pending)
749 rcu_barrier_sched();
750 old = marker_entry_remove_probe(entry, probe, probe_private);
751 mutex_unlock(&markers_mutex);
752 marker_update_probes();
753 mutex_lock(&markers_mutex);
754 entry = get_marker(name);
755 if (!entry)
756 goto end;
757 if (entry->rcu_pending)
758 rcu_barrier_sched();
759 entry->oldptr = old;
760 entry->rcu_pending = 1;
761 /* write rcu_pending before calling the RCU callback */
762 smp_wmb();
763 call_rcu_sched(&entry->rcu, free_old_closure);
764 remove_marker(name); /* Ignore busy error message */
765 ret = 0;
766end:
767 mutex_unlock(&markers_mutex);
768 return ret;
769}
770EXPORT_SYMBOL_GPL(marker_probe_unregister);
771
772static struct marker_entry *
773get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
774{
775 struct marker_entry *entry;
776 unsigned int i;
777 struct hlist_head *head;
778 struct hlist_node *node;
779
780 for (i = 0; i < MARKER_TABLE_SIZE; i++) {
781 head = &marker_table[i];
782 hlist_for_each_entry(entry, node, head, hlist) {
783 if (!entry->ptype) {
784 if (entry->single.func == probe
785 && entry->single.probe_private
786 == probe_private)
787 return entry;
788 } else {
789 struct marker_probe_closure *closure;
790 closure = entry->multi;
791 for (i = 0; closure[i].func; i++) {
792 if (closure[i].func == probe &&
793 closure[i].probe_private
794 == probe_private)
795 return entry;
796 }
797 }
798 }
799 }
800 return NULL;
801}
802
803/**
804 * marker_probe_unregister_private_data - Disconnect a probe from a marker
805 * @probe: probe function
806 * @probe_private: probe private data
807 *
808 * Unregister a probe by providing the registered private data.
809 * Only removes the first matching marker found in the hash table.
810 * Returns 0 on success or an error value.
811 * We do not need to call synchronize_sched() to make sure the probes have
812 * finished running before doing a module unload, because the module unload
813 * itself uses stop_machine(), which ensures that every preempt-disabled
814 * section has finished.
815 */
816int marker_probe_unregister_private_data(marker_probe_func *probe,
817 void *probe_private)
818{
819 struct marker_entry *entry;
820 int ret = 0;
821 struct marker_probe_closure *old;
822
823 mutex_lock(&markers_mutex);
824 entry = get_marker_from_private_data(probe, probe_private);
825 if (!entry) {
826 ret = -ENOENT;
827 goto end;
828 }
829 if (entry->rcu_pending)
830 rcu_barrier_sched();
831 old = marker_entry_remove_probe(entry, NULL, probe_private);
832 mutex_unlock(&markers_mutex);
833 marker_update_probes();
834 mutex_lock(&markers_mutex);
835 entry = get_marker_from_private_data(probe, probe_private);
836 if (!entry)
837 goto end;
838 if (entry->rcu_pending)
839 rcu_barrier_sched();
840 entry->oldptr = old;
841 entry->rcu_pending = 1;
842 /* write rcu_pending before calling the RCU callback */
843 smp_wmb();
844 call_rcu_sched(&entry->rcu, free_old_closure);
845 remove_marker(entry->name); /* Ignore busy error message */
846end:
847 mutex_unlock(&markers_mutex);
848 return ret;
849}
850EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
851
852/**
853 * marker_get_private_data - Get a marker's probe private data
854 * @name: marker name
855 * @probe: probe to match
856 * @num: get the nth matching probe's private data
857 *
858 * Returns the nth private data pointer (starting from 0) registered for the
859 * given probe on this marker, or ERR_PTR(-ENOENT) if no matching probe is
860 * found.
861 * The private data pointer should _only_ be dereferenced if the caller is the
862 * owner of the data, or its content could vanish. This is mostly used to
863 * confirm that a caller is the owner of a registered probe.
864 */
865void *marker_get_private_data(const char *name, marker_probe_func *probe,
866 int num)
867{
868 struct hlist_head *head;
869 struct hlist_node *node;
870 struct marker_entry *e;
871 size_t name_len = strlen(name) + 1;
872 u32 hash = jhash(name, name_len-1, 0);
873 int i;
874
875 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
876 hlist_for_each_entry(e, node, head, hlist) {
877 if (!strcmp(name, e->name)) {
878 if (!e->ptype) {
879 if (num == 0 && e->single.func == probe)
880 return e->single.probe_private;
881 } else {
882 struct marker_probe_closure *closure;
883 int match = 0;
884 closure = e->multi;
885 for (i = 0; closure[i].func; i++) {
886 if (closure[i].func != probe)
887 continue;
888 if (match++ == num)
889 return closure[i].probe_private;
890 }
891 }
892 break;
893 }
894 }
895 return ERR_PTR(-ENOENT);
896}
897EXPORT_SYMBOL_GPL(marker_get_private_data);
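/*
 * A short sketch of the ownership check described in the comment above,
 * again with hypothetical names: a caller that registered
 * probe_subsys_event() (as in the earlier sketch) with its own cookie can
 * confirm that it still owns the first matching registration.
 */
static bool subsys_owns_probe(void *my_cookie)
{
	void *data;

	data = marker_get_private_data("subsys_event", probe_subsys_event, 0);
	if (IS_ERR(data))
		return false;
	return data == my_cookie;
}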
898
899#ifdef CONFIG_MODULES
900
901int marker_module_notify(struct notifier_block *self,
902 unsigned long val, void *data)
903{
904 struct module *mod = data;
905
906 switch (val) {
907 case MODULE_STATE_COMING:
908 marker_update_probe_range(mod->markers,
909 mod->markers + mod->num_markers);
910 break;
911 case MODULE_STATE_GOING:
912 marker_update_probe_range(mod->markers,
913 mod->markers + mod->num_markers);
914 break;
915 }
916 return 0;
917}
918
919struct notifier_block marker_module_nb = {
920 .notifier_call = marker_module_notify,
921 .priority = 0,
922};
923
924static int init_markers(void)
925{
926 return register_module_notifier(&marker_module_nb);
927}
928__initcall(init_markers);
929
930#endif /* CONFIG_MODULES */
diff --git a/kernel/module.c b/kernel/module.c
index 46580edff0cb..5842a71cf052 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -47,6 +47,7 @@
47#include <linux/rculist.h> 47#include <linux/rculist.h>
48#include <asm/uaccess.h> 48#include <asm/uaccess.h>
49#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
50#include <asm/mmu_context.h>
50#include <linux/license.h> 51#include <linux/license.h>
51#include <asm/sections.h> 52#include <asm/sections.h>
52#include <linux/tracepoint.h> 53#include <linux/tracepoint.h>
@@ -369,7 +370,7 @@ EXPORT_SYMBOL_GPL(find_module);
369 370
370#ifdef CONFIG_SMP 371#ifdef CONFIG_SMP
371 372
372#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 373#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
373 374
374static void *percpu_modalloc(unsigned long size, unsigned long align, 375static void *percpu_modalloc(unsigned long size, unsigned long align,
375 const char *name) 376 const char *name)
@@ -394,7 +395,7 @@ static void percpu_modfree(void *freeme)
394 free_percpu(freeme); 395 free_percpu(freeme);
395} 396}
396 397
397#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 398#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
398 399
399/* Number of blocks used and allocated. */ 400/* Number of blocks used and allocated. */
400static unsigned int pcpu_num_used, pcpu_num_allocated; 401static unsigned int pcpu_num_used, pcpu_num_allocated;
@@ -540,7 +541,7 @@ static int percpu_modinit(void)
540} 541}
541__initcall(percpu_modinit); 542__initcall(percpu_modinit);
542 543
543#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 544#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
544 545
545static unsigned int find_pcpusec(Elf_Ehdr *hdr, 546static unsigned int find_pcpusec(Elf_Ehdr *hdr,
546 Elf_Shdr *sechdrs, 547 Elf_Shdr *sechdrs,
@@ -1186,7 +1187,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1186 1187
1187 /* Count loaded sections and allocate structures */ 1188 /* Count loaded sections and allocate structures */
1188 for (i = 0; i < nsect; i++) 1189 for (i = 0; i < nsect; i++)
1189 if (sechdrs[i].sh_flags & SHF_ALLOC) 1190 if (sechdrs[i].sh_flags & SHF_ALLOC
1191 && sechdrs[i].sh_size)
1190 nloaded++; 1192 nloaded++;
1191 size[0] = ALIGN(sizeof(*sect_attrs) 1193 size[0] = ALIGN(sizeof(*sect_attrs)
1192 + nloaded * sizeof(sect_attrs->attrs[0]), 1194 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1206,6 +1208,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1206 for (i = 0; i < nsect; i++) { 1208 for (i = 0; i < nsect; i++) {
1207 if (! (sechdrs[i].sh_flags & SHF_ALLOC)) 1209 if (! (sechdrs[i].sh_flags & SHF_ALLOC))
1208 continue; 1210 continue;
1211 if (!sechdrs[i].sh_size)
1212 continue;
1209 sattr->address = sechdrs[i].sh_addr; 1213 sattr->address = sechdrs[i].sh_addr;
1210 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1214 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
1211 GFP_KERNEL); 1215 GFP_KERNEL);
@@ -1535,6 +1539,10 @@ static void free_module(struct module *mod)
1535 1539
1536 /* Finally, free the core (containing the module structure) */ 1540 /* Finally, free the core (containing the module structure) */
1537 module_free(mod, mod->module_core); 1541 module_free(mod, mod->module_core);
1542
1543#ifdef CONFIG_MPU
1544 update_protections(current->mm);
1545#endif
1538} 1546}
1539 1547
1540void *__symbol_get(const char *symbol) 1548void *__symbol_get(const char *symbol)
@@ -1792,6 +1800,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1792 } 1800 }
1793} 1801}
1794 1802
1803static void free_modinfo(struct module *mod)
1804{
1805 struct module_attribute *attr;
1806 int i;
1807
1808 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1809 if (attr->free)
1810 attr->free(mod);
1811 }
1812}
1813
1795#ifdef CONFIG_KALLSYMS 1814#ifdef CONFIG_KALLSYMS
1796 1815
1797/* lookup symbol in given range of kernel_symbols */ 1816/* lookup symbol in given range of kernel_symbols */
@@ -1857,13 +1876,93 @@ static char elf_type(const Elf_Sym *sym,
1857 return '?'; 1876 return '?';
1858} 1877}
1859 1878
1879static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
1880 unsigned int shnum)
1881{
1882 const Elf_Shdr *sec;
1883
1884 if (src->st_shndx == SHN_UNDEF
1885 || src->st_shndx >= shnum
1886 || !src->st_name)
1887 return false;
1888
1889 sec = sechdrs + src->st_shndx;
1890 if (!(sec->sh_flags & SHF_ALLOC)
1891#ifndef CONFIG_KALLSYMS_ALL
1892 || !(sec->sh_flags & SHF_EXECINSTR)
1893#endif
1894 || (sec->sh_entsize & INIT_OFFSET_MASK))
1895 return false;
1896
1897 return true;
1898}
1899
1900static unsigned long layout_symtab(struct module *mod,
1901 Elf_Shdr *sechdrs,
1902 unsigned int symindex,
1903 unsigned int strindex,
1904 const Elf_Ehdr *hdr,
1905 const char *secstrings,
1906 unsigned long *pstroffs,
1907 unsigned long *strmap)
1908{
1909 unsigned long symoffs;
1910 Elf_Shdr *symsect = sechdrs + symindex;
1911 Elf_Shdr *strsect = sechdrs + strindex;
1912 const Elf_Sym *src;
1913 const char *strtab;
1914 unsigned int i, nsrc, ndst;
1915
1916 /* Put symbol section at end of init part of module. */
1917 symsect->sh_flags |= SHF_ALLOC;
1918 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
1919 symindex) | INIT_OFFSET_MASK;
1920 DEBUGP("\t%s\n", secstrings + symsect->sh_name);
1921
1922 src = (void *)hdr + symsect->sh_offset;
1923 nsrc = symsect->sh_size / sizeof(*src);
1924 strtab = (void *)hdr + strsect->sh_offset;
1925 for (ndst = i = 1; i < nsrc; ++i, ++src)
1926 if (is_core_symbol(src, sechdrs, hdr->e_shnum)) {
1927 unsigned int j = src->st_name;
1928
1929 while(!__test_and_set_bit(j, strmap) && strtab[j])
1930 ++j;
1931 ++ndst;
1932 }
1933
1934 /* Append room for core symbols at end of core part. */
1935 symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
1936 mod->core_size = symoffs + ndst * sizeof(Elf_Sym);
1937
1938 /* Put string table section at end of init part of module. */
1939 strsect->sh_flags |= SHF_ALLOC;
1940 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
1941 strindex) | INIT_OFFSET_MASK;
1942 DEBUGP("\t%s\n", secstrings + strsect->sh_name);
1943
1944 /* Append room for core symbols' strings at end of core part. */
1945 *pstroffs = mod->core_size;
1946 __set_bit(0, strmap);
1947 mod->core_size += bitmap_weight(strmap, strsect->sh_size);
1948
1949 return symoffs;
1950}
1951
1860static void add_kallsyms(struct module *mod, 1952static void add_kallsyms(struct module *mod,
1861 Elf_Shdr *sechdrs, 1953 Elf_Shdr *sechdrs,
1954 unsigned int shnum,
1862 unsigned int symindex, 1955 unsigned int symindex,
1863 unsigned int strindex, 1956 unsigned int strindex,
1864 const char *secstrings) 1957 unsigned long symoffs,
1958 unsigned long stroffs,
1959 const char *secstrings,
1960 unsigned long *strmap)
1865{ 1961{
1866 unsigned int i; 1962 unsigned int i, ndst;
1963 const Elf_Sym *src;
1964 Elf_Sym *dst;
1965 char *s;
1867 1966
1868 mod->symtab = (void *)sechdrs[symindex].sh_addr; 1967 mod->symtab = (void *)sechdrs[symindex].sh_addr;
1869 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1968 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
@@ -1873,13 +1972,46 @@ static void add_kallsyms(struct module *mod,
1873 for (i = 0; i < mod->num_symtab; i++) 1972 for (i = 0; i < mod->num_symtab; i++)
1874 mod->symtab[i].st_info 1973 mod->symtab[i].st_info
1875 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); 1974 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
1975
1976 mod->core_symtab = dst = mod->module_core + symoffs;
1977 src = mod->symtab;
1978 *dst = *src;
1979 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
1980 if (!is_core_symbol(src, sechdrs, shnum))
1981 continue;
1982 dst[ndst] = *src;
1983 dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name);
1984 ++ndst;
1985 }
1986 mod->core_num_syms = ndst;
1987
1988 mod->core_strtab = s = mod->module_core + stroffs;
1989 for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i)
1990 if (test_bit(i, strmap))
1991 *++s = mod->strtab[i];
1876} 1992}
1877#else 1993#else
1994static inline unsigned long layout_symtab(struct module *mod,
1995 Elf_Shdr *sechdrs,
1996 unsigned int symindex,
1997 unsigned int strindex,
1998 const Elf_Ehdr *hdr,
1999 const char *secstrings,
2000 unsigned long *pstroffs,
2001 unsigned long *strmap)
2002{
2003 return 0;
2004}
2005
1878static inline void add_kallsyms(struct module *mod, 2006static inline void add_kallsyms(struct module *mod,
1879 Elf_Shdr *sechdrs, 2007 Elf_Shdr *sechdrs,
2008 unsigned int shnum,
1880 unsigned int symindex, 2009 unsigned int symindex,
1881 unsigned int strindex, 2010 unsigned int strindex,
1882 const char *secstrings) 2011 unsigned long symoffs,
2012 unsigned long stroffs,
2013 const char *secstrings,
2014 const unsigned long *strmap)
1883{ 2015{
1884} 2016}
1885#endif /* CONFIG_KALLSYMS */ 2017#endif /* CONFIG_KALLSYMS */
@@ -1954,6 +2086,8 @@ static noinline struct module *load_module(void __user *umod,
1954 struct module *mod; 2086 struct module *mod;
1955 long err = 0; 2087 long err = 0;
1956 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 2088 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
2089 unsigned long symoffs, stroffs, *strmap;
2090
1957 mm_segment_t old_fs; 2091 mm_segment_t old_fs;
1958 2092
1959 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 2093 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2035,11 +2169,6 @@ static noinline struct module *load_module(void __user *umod,
2035 /* Don't keep modinfo and version sections. */ 2169 /* Don't keep modinfo and version sections. */
2036 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2170 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2037 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2171 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2038#ifdef CONFIG_KALLSYMS
2039 /* Keep symbol and string tables for decoding later. */
2040 sechdrs[symindex].sh_flags |= SHF_ALLOC;
2041 sechdrs[strindex].sh_flags |= SHF_ALLOC;
2042#endif
2043 2172
2044 /* Check module struct version now, before we try to use module. */ 2173 /* Check module struct version now, before we try to use module. */
2045 if (!check_modstruct_version(sechdrs, versindex, mod)) { 2174 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -2075,6 +2204,13 @@ static noinline struct module *load_module(void __user *umod,
2075 goto free_hdr; 2204 goto free_hdr;
2076 } 2205 }
2077 2206
2207 strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size)
2208 * sizeof(long), GFP_KERNEL);
2209 if (!strmap) {
2210 err = -ENOMEM;
2211 goto free_mod;
2212 }
2213
2078 if (find_module(mod->name)) { 2214 if (find_module(mod->name)) {
2079 err = -EEXIST; 2215 err = -EEXIST;
2080 goto free_mod; 2216 goto free_mod;
@@ -2104,6 +2240,8 @@ static noinline struct module *load_module(void __user *umod,
2104 this is done generically; there doesn't appear to be any 2240 this is done generically; there doesn't appear to be any
2105 special cases for the architectures. */ 2241 special cases for the architectures. */
2106 layout_sections(mod, hdr, sechdrs, secstrings); 2242 layout_sections(mod, hdr, sechdrs, secstrings);
2243 symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr,
2244 secstrings, &stroffs, strmap);
2107 2245
2108 /* Do the allocs. */ 2246 /* Do the allocs. */
2109 ptr = module_alloc_update_bounds(mod->core_size); 2247 ptr = module_alloc_update_bounds(mod->core_size);
@@ -2237,10 +2375,6 @@ static noinline struct module *load_module(void __user *umod,
2237 sizeof(*mod->ctors), &mod->num_ctors); 2375 sizeof(*mod->ctors), &mod->num_ctors);
2238#endif 2376#endif
2239 2377
2240#ifdef CONFIG_MARKERS
2241 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
2242 sizeof(*mod->markers), &mod->num_markers);
2243#endif
2244#ifdef CONFIG_TRACEPOINTS 2378#ifdef CONFIG_TRACEPOINTS
2245 mod->tracepoints = section_objs(hdr, sechdrs, secstrings, 2379 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2246 "__tracepoints", 2380 "__tracepoints",
@@ -2312,7 +2446,10 @@ static noinline struct module *load_module(void __user *umod,
2312 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2446 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
2313 sechdrs[pcpuindex].sh_size); 2447 sechdrs[pcpuindex].sh_size);
2314 2448
2315 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 2449 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
2450 symoffs, stroffs, secstrings, strmap);
2451 kfree(strmap);
2452 strmap = NULL;
2316 2453
2317 if (!mod->taints) { 2454 if (!mod->taints) {
2318 struct _ddebug *debug; 2455 struct _ddebug *debug;
@@ -2384,13 +2521,14 @@ static noinline struct module *load_module(void __user *umod,
2384 synchronize_sched(); 2521 synchronize_sched();
2385 module_arch_cleanup(mod); 2522 module_arch_cleanup(mod);
2386 cleanup: 2523 cleanup:
2524 free_modinfo(mod);
2387 kobject_del(&mod->mkobj.kobj); 2525 kobject_del(&mod->mkobj.kobj);
2388 kobject_put(&mod->mkobj.kobj); 2526 kobject_put(&mod->mkobj.kobj);
2389 free_unload: 2527 free_unload:
2390 module_unload_free(mod); 2528 module_unload_free(mod);
2391#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2529#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2392 free_init:
2393 percpu_modfree(mod->refptr); 2530 percpu_modfree(mod->refptr);
2531 free_init:
2394#endif 2532#endif
2395 module_free(mod, mod->module_init); 2533 module_free(mod, mod->module_init);
2396 free_core: 2534 free_core:
@@ -2401,6 +2539,7 @@ static noinline struct module *load_module(void __user *umod,
2401 percpu_modfree(percpu); 2539 percpu_modfree(percpu);
2402 free_mod: 2540 free_mod:
2403 kfree(args); 2541 kfree(args);
2542 kfree(strmap);
2404 free_hdr: 2543 free_hdr:
2405 vfree(hdr); 2544 vfree(hdr);
2406 return ERR_PTR(err); 2545 return ERR_PTR(err);
@@ -2490,6 +2629,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2490 /* Drop initial reference. */ 2629 /* Drop initial reference. */
2491 module_put(mod); 2630 module_put(mod);
2492 trim_init_extable(mod); 2631 trim_init_extable(mod);
2632#ifdef CONFIG_KALLSYMS
2633 mod->num_symtab = mod->core_num_syms;
2634 mod->symtab = mod->core_symtab;
2635 mod->strtab = mod->core_strtab;
2636#endif
2493 module_free(mod, mod->module_init); 2637 module_free(mod, mod->module_init);
2494 mod->module_init = NULL; 2638 mod->module_init = NULL;
2495 mod->init_size = 0; 2639 mod->init_size = 0;
@@ -2951,27 +3095,12 @@ void module_layout(struct module *mod,
2951 struct modversion_info *ver, 3095 struct modversion_info *ver,
2952 struct kernel_param *kp, 3096 struct kernel_param *kp,
2953 struct kernel_symbol *ks, 3097 struct kernel_symbol *ks,
2954 struct marker *marker,
2955 struct tracepoint *tp) 3098 struct tracepoint *tp)
2956{ 3099{
2957} 3100}
2958EXPORT_SYMBOL(module_layout); 3101EXPORT_SYMBOL(module_layout);
2959#endif 3102#endif
2960 3103
2961#ifdef CONFIG_MARKERS
2962void module_update_markers(void)
2963{
2964 struct module *mod;
2965
2966 mutex_lock(&module_mutex);
2967 list_for_each_entry(mod, &modules, list)
2968 if (!mod->taints)
2969 marker_update_probe_range(mod->markers,
2970 mod->markers + mod->num_markers);
2971 mutex_unlock(&module_mutex);
2972}
2973#endif
2974
2975#ifdef CONFIG_TRACEPOINTS 3104#ifdef CONFIG_TRACEPOINTS
2976void module_update_tracepoints(void) 3105void module_update_tracepoints(void)
2977{ 3106{
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 50d022e5a560..ec815a960b5d 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,6 +16,7 @@
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/poison.h> 18#include <linux/poison.h>
19#include <linux/sched.h>
19#include <linux/spinlock.h> 20#include <linux/spinlock.h>
20#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
21#include <linux/interrupt.h> 22#include <linux/interrupt.h>
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 947b3ad551f8..632f04c57d82 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -148,8 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
148 148
149 preempt_disable(); 149 preempt_disable();
150 mutex_acquire(&lock->dep_map, subclass, 0, ip); 150 mutex_acquire(&lock->dep_map, subclass, 0, ip);
151#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \ 151
152 !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES) 152#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
153 /* 153 /*
154 * Optimistic spinning. 154 * Optimistic spinning.
155 * 155 *
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 61d5aa5eced3..acd24e7643eb 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
558 558
559static ATOMIC_NOTIFIER_HEAD(die_chain); 559static ATOMIC_NOTIFIER_HEAD(die_chain);
560 560
561int notrace notify_die(enum die_val val, const char *str, 561int notrace __kprobes notify_die(enum die_val val, const char *str,
562 struct pt_regs *regs, long err, int trap, int sig) 562 struct pt_regs *regs, long err, int trap, int sig)
563{ 563{
564 struct die_args args = { 564 struct die_args args = {
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 5aa854f9e5ae..2a5dfec8efe0 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
42 * (hence either you are in the same cgroup as task, or in an 42 * (hence either you are in the same cgroup as task, or in an
43 * ancestor cgroup thereof) 43 * ancestor cgroup thereof)
44 */ 44 */
45static int ns_can_attach(struct cgroup_subsys *ss, 45static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
46 struct cgroup *new_cgroup, struct task_struct *task) 46 struct task_struct *task, bool threadgroup)
47{ 47{
48 if (current != task) { 48 if (current != task) {
49 if (!capable(CAP_SYS_ADMIN)) 49 if (!capable(CAP_SYS_ADMIN))
@@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss,
56 if (!cgroup_is_descendant(new_cgroup, task)) 56 if (!cgroup_is_descendant(new_cgroup, task))
57 return -EPERM; 57 return -EPERM;
58 58
59 if (threadgroup) {
60 struct task_struct *c;
61 rcu_read_lock();
62 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
63 if (!cgroup_is_descendant(new_cgroup, c)) {
64 rcu_read_unlock();
65 return -EPERM;
66 }
67 }
68 rcu_read_unlock();
69 }
70
59 return 0; 71 return 0;
60} 72}
61 73
diff --git a/kernel/panic.c b/kernel/panic.c
index 512ab73b0ca3..96b45d0b4ba5 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -90,6 +90,8 @@ NORET_TYPE void panic(const char * fmt, ...)
90 90
91 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 91 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
92 92
93 bust_spinlocks(0);
94
93 if (!panic_blink) 95 if (!panic_blink)
94 panic_blink = no_blink; 96 panic_blink = no_blink;
95 97
@@ -136,7 +138,6 @@ NORET_TYPE void panic(const char * fmt, ...)
136 mdelay(1); 138 mdelay(1);
137 i++; 139 i++;
138 } 140 }
139 bust_spinlocks(0);
140} 141}
141 142
142EXPORT_SYMBOL(panic); 143EXPORT_SYMBOL(panic);
@@ -177,7 +178,7 @@ static const struct tnt tnts[] = {
177 * 'W' - Taint on warning. 178 * 'W' - Taint on warning.
178 * 'C' - modules from drivers/staging are loaded. 179 * 'C' - modules from drivers/staging are loaded.
179 * 180 *
180 * The string is overwritten by the next call to print_taint(). 181 * The string is overwritten by the next call to print_tainted().
181 */ 182 */
182const char *print_tainted(void) 183const char *print_tainted(void)
183{ 184{
diff --git a/kernel/params.c b/kernel/params.c
index 7f6912ced2ba..d656c276508d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -23,6 +23,7 @@
23#include <linux/device.h> 23#include <linux/device.h>
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h>
26 27
27#if 0 28#if 0
28#define DEBUGP printk 29#define DEBUGP printk
@@ -87,7 +88,7 @@ static char *next_arg(char *args, char **param, char **val)
87 } 88 }
88 89
89 for (i = 0; args[i]; i++) { 90 for (i = 0; args[i]; i++) {
90 if (args[i] == ' ' && !in_quote) 91 if (isspace(args[i]) && !in_quote)
91 break; 92 break;
92 if (equals == 0) { 93 if (equals == 0) {
93 if (args[i] == '=') 94 if (args[i] == '=')
@@ -121,7 +122,7 @@ static char *next_arg(char *args, char **param, char **val)
121 next = args + i; 122 next = args + i;
122 123
123 /* Chew up trailing spaces. */ 124 /* Chew up trailing spaces. */
124 while (*next == ' ') 125 while (isspace(*next))
125 next++; 126 next++;
126 return next; 127 return next;
127} 128}
@@ -138,7 +139,7 @@ int parse_args(const char *name,
138 DEBUGP("Parsing ARGS: %s\n", args); 139 DEBUGP("Parsing ARGS: %s\n", args);
139 140
140 /* Chew leading spaces */ 141 /* Chew leading spaces */
141 while (*args == ' ') 142 while (isspace(*args))
142 args++; 143 args++;
143 144
144 while (*args) { 145 while (*args) {
@@ -217,15 +218,11 @@ int param_set_charp(const char *val, struct kernel_param *kp)
217 return -ENOSPC; 218 return -ENOSPC;
218 } 219 }
219 220
220 if (kp->flags & KPARAM_KMALLOCED)
221 kfree(*(char **)kp->arg);
222
223 /* This is a hack. We can't need to strdup in early boot, and we 221 /* This is a hack. We can't need to strdup in early boot, and we
224 * don't need to; this mangled commandline is preserved. */ 222 * don't need to; this mangled commandline is preserved. */
225 if (slab_is_available()) { 223 if (slab_is_available()) {
226 kp->flags |= KPARAM_KMALLOCED;
227 *(char **)kp->arg = kstrdup(val, GFP_KERNEL); 224 *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
228 if (!kp->arg) 225 if (!*(char **)kp->arg)
229 return -ENOMEM; 226 return -ENOMEM;
230 } else 227 } else
231 *(const char **)kp->arg = val; 228 *(const char **)kp->arg = val;
@@ -303,6 +300,7 @@ static int param_array(const char *name,
303 unsigned int min, unsigned int max, 300 unsigned int min, unsigned int max,
304 void *elem, int elemsize, 301 void *elem, int elemsize,
305 int (*set)(const char *, struct kernel_param *kp), 302 int (*set)(const char *, struct kernel_param *kp),
303 u16 flags,
306 unsigned int *num) 304 unsigned int *num)
307{ 305{
308 int ret; 306 int ret;
@@ -312,6 +310,7 @@ static int param_array(const char *name,
312 /* Get the name right for errors. */ 310 /* Get the name right for errors. */
313 kp.name = name; 311 kp.name = name;
314 kp.arg = elem; 312 kp.arg = elem;
313 kp.flags = flags;
315 314
316 /* No equals sign? */ 315 /* No equals sign? */
317 if (!val) { 316 if (!val) {
@@ -357,7 +356,8 @@ int param_array_set(const char *val, struct kernel_param *kp)
357 unsigned int temp_num; 356 unsigned int temp_num;
358 357
359 return param_array(kp->name, val, 1, arr->max, arr->elem, 358 return param_array(kp->name, val, 1, arr->max, arr->elem,
360 arr->elemsize, arr->set, arr->num ?: &temp_num); 359 arr->elemsize, arr->set, kp->flags,
360 arr->num ?: &temp_num);
361} 361}
362 362
363int param_array_get(char *buffer, struct kernel_param *kp) 363int param_array_get(char *buffer, struct kernel_param *kp)
@@ -604,11 +604,7 @@ void module_param_sysfs_remove(struct module *mod)
604 604
605void destroy_params(const struct kernel_param *params, unsigned num) 605void destroy_params(const struct kernel_param *params, unsigned num)
606{ 606{
607 unsigned int i; 607 /* FIXME: This should free kmalloced charp parameters. It doesn't. */
608
609 for (i = 0; i < num; i++)
610 if (params[i].flags & KPARAM_KMALLOCED)
611 kfree(*(char **)params[i].arg);
612} 608}
613 609
614static void __init kernel_add_sysfs_param(const char *name, 610static void __init kernel_add_sysfs_param(const char *name,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
deleted file mode 100644
index e0d91fdf0c3c..000000000000
--- a/kernel/perf_counter.c
+++ /dev/null
@@ -1,4962 +0,0 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/hardirq.h>
24#include <linux/rculist.h>
25#include <linux/uaccess.h>
26#include <linux/syscalls.h>
27#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h>
29#include <linux/perf_counter.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU counters:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_counters __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly;
45static atomic_t nr_task_counters __read_mostly;
46
47/*
48 * perf counter paranoia level:
49 * -1 - not paranoid at all
50 * 0 - disallow raw tracepoint access for unpriv
51 * 1 - disallow cpu counters for unpriv
52 * 2 - disallow kernel profiling for unpriv
53 */
54int sysctl_perf_counter_paranoid __read_mostly = 1;
55
56static inline bool perf_paranoid_tracepoint_raw(void)
57{
58 return sysctl_perf_counter_paranoid > -1;
59}
60
61static inline bool perf_paranoid_cpu(void)
62{
63 return sysctl_perf_counter_paranoid > 0;
64}
65
66static inline bool perf_paranoid_kernel(void)
67{
68 return sysctl_perf_counter_paranoid > 1;
69}
70
71int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
72
73/*
74 * max perf counter sample rate
75 */
76int sysctl_perf_counter_sample_rate __read_mostly = 100000;
77
78static atomic64_t perf_counter_id;
79
80/*
81 * Lock for (sysadmin-configurable) counter reservations:
82 */
83static DEFINE_SPINLOCK(perf_resource_lock);
84
85/*
86 * Architecture provided APIs - weak aliases:
87 */
88extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
89{
90 return NULL;
91}
92
93void __weak hw_perf_disable(void) { barrier(); }
94void __weak hw_perf_enable(void) { barrier(); }
95
96void __weak hw_perf_counter_setup(int cpu) { barrier(); }
97void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
98
99int __weak
100hw_perf_group_sched_in(struct perf_counter *group_leader,
101 struct perf_cpu_context *cpuctx,
102 struct perf_counter_context *ctx, int cpu)
103{
104 return 0;
105}
106
107void __weak perf_counter_print_debug(void) { }
108
109static DEFINE_PER_CPU(int, disable_count);
110
111void __perf_disable(void)
112{
113 __get_cpu_var(disable_count)++;
114}
115
116bool __perf_enable(void)
117{
118 return !--__get_cpu_var(disable_count);
119}
120
121void perf_disable(void)
122{
123 __perf_disable();
124 hw_perf_disable();
125}
126
127void perf_enable(void)
128{
129 if (__perf_enable())
130 hw_perf_enable();
131}
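
/*
 * A hedged illustration of the nesting semantics above: hw_perf_disable() is
 * called on every perf_disable(), while hw_perf_enable() only runs again once
 * the outermost perf_enable() balances the first perf_disable(). The function
 * below is illustrative and not part of this file.
 */
static void example_nested_disable(void)
{
	perf_disable();		/* disable_count 0 -> 1, hardware disabled */
	perf_disable();		/* 1 -> 2, hardware stays disabled */
	perf_enable();		/* 2 -> 1, hardware still disabled */
	perf_enable();		/* 1 -> 0, hardware re-enabled */
}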
132
133static void get_ctx(struct perf_counter_context *ctx)
134{
135 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
136}
137
138static void free_ctx(struct rcu_head *head)
139{
140 struct perf_counter_context *ctx;
141
142 ctx = container_of(head, struct perf_counter_context, rcu_head);
143 kfree(ctx);
144}
145
146static void put_ctx(struct perf_counter_context *ctx)
147{
148 if (atomic_dec_and_test(&ctx->refcount)) {
149 if (ctx->parent_ctx)
150 put_ctx(ctx->parent_ctx);
151 if (ctx->task)
152 put_task_struct(ctx->task);
153 call_rcu(&ctx->rcu_head, free_ctx);
154 }
155}
156
157static void unclone_ctx(struct perf_counter_context *ctx)
158{
159 if (ctx->parent_ctx) {
160 put_ctx(ctx->parent_ctx);
161 ctx->parent_ctx = NULL;
162 }
163}
164
165/*
166 * If we inherit counters we want to return the parent counter id
167 * to userspace.
168 */
169static u64 primary_counter_id(struct perf_counter *counter)
170{
171 u64 id = counter->id;
172
173 if (counter->parent)
174 id = counter->parent->id;
175
176 return id;
177}
178
179/*
180 * Get the perf_counter_context for a task and lock it.
181 * This has to cope with the fact that until it is locked,
182 * the context could get moved to another task.
183 */
184static struct perf_counter_context *
185perf_lock_task_context(struct task_struct *task, unsigned long *flags)
186{
187 struct perf_counter_context *ctx;
188
189 rcu_read_lock();
190 retry:
191 ctx = rcu_dereference(task->perf_counter_ctxp);
192 if (ctx) {
193 /*
194 * If this context is a clone of another, it might
195 * get swapped for another underneath us by
196 * perf_counter_task_sched_out, though the
197 * rcu_read_lock() protects us from any context
198 * getting freed. Lock the context and check if it
199 * got swapped before we could get the lock, and retry
200 * if so. If we locked the right context, then it
201 * can't get swapped on us any more.
202 */
203 spin_lock_irqsave(&ctx->lock, *flags);
204 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
205 spin_unlock_irqrestore(&ctx->lock, *flags);
206 goto retry;
207 }
208
209 if (!atomic_inc_not_zero(&ctx->refcount)) {
210 spin_unlock_irqrestore(&ctx->lock, *flags);
211 ctx = NULL;
212 }
213 }
214 rcu_read_unlock();
215 return ctx;
216}
217
218/*
219 * Get the context for a task and increment its pin_count so it
220 * can't get swapped to another task. This also increments its
221 * reference count so that the context can't get freed.
222 */
223static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
224{
225 struct perf_counter_context *ctx;
226 unsigned long flags;
227
228 ctx = perf_lock_task_context(task, &flags);
229 if (ctx) {
230 ++ctx->pin_count;
231 spin_unlock_irqrestore(&ctx->lock, flags);
232 }
233 return ctx;
234}
235
236static void perf_unpin_context(struct perf_counter_context *ctx)
237{
238 unsigned long flags;
239
240 spin_lock_irqsave(&ctx->lock, flags);
241 --ctx->pin_count;
242 spin_unlock_irqrestore(&ctx->lock, flags);
243 put_ctx(ctx);
244}
245
246/*
247 * Add a counter to the lists for its context.
248 * Must be called with ctx->mutex and ctx->lock held.
249 */
250static void
251list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
252{
253 struct perf_counter *group_leader = counter->group_leader;
254
255 /*
256 * Depending on whether it is a standalone or sibling counter,
257 * add it straight to the context's counter list, or to the group
258 * leader's sibling list:
259 */
260 if (group_leader == counter)
261 list_add_tail(&counter->list_entry, &ctx->counter_list);
262 else {
263 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
264 group_leader->nr_siblings++;
265 }
266
267 list_add_rcu(&counter->event_entry, &ctx->event_list);
268 ctx->nr_counters++;
269 if (counter->attr.inherit_stat)
270 ctx->nr_stat++;
271}
272
273/*
274 * Remove a counter from the lists for its context.
275 * Must be called with ctx->mutex and ctx->lock held.
276 */
277static void
278list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
279{
280 struct perf_counter *sibling, *tmp;
281
282 if (list_empty(&counter->list_entry))
283 return;
284 ctx->nr_counters--;
285 if (counter->attr.inherit_stat)
286 ctx->nr_stat--;
287
288 list_del_init(&counter->list_entry);
289 list_del_rcu(&counter->event_entry);
290
291 if (counter->group_leader != counter)
292 counter->group_leader->nr_siblings--;
293
294 /*
295 * If this was a group counter with sibling counters then
296 * upgrade the siblings to singleton counters by adding them
297 * to the context list directly:
298 */
299 list_for_each_entry_safe(sibling, tmp,
300 &counter->sibling_list, list_entry) {
301
302 list_move_tail(&sibling->list_entry, &ctx->counter_list);
303 sibling->group_leader = sibling;
304 }
305}
306
307static void
308counter_sched_out(struct perf_counter *counter,
309 struct perf_cpu_context *cpuctx,
310 struct perf_counter_context *ctx)
311{
312 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
313 return;
314
315 counter->state = PERF_COUNTER_STATE_INACTIVE;
316 if (counter->pending_disable) {
317 counter->pending_disable = 0;
318 counter->state = PERF_COUNTER_STATE_OFF;
319 }
320 counter->tstamp_stopped = ctx->time;
321 counter->pmu->disable(counter);
322 counter->oncpu = -1;
323
324 if (!is_software_counter(counter))
325 cpuctx->active_oncpu--;
326 ctx->nr_active--;
327 if (counter->attr.exclusive || !cpuctx->active_oncpu)
328 cpuctx->exclusive = 0;
329}
330
331static void
332group_sched_out(struct perf_counter *group_counter,
333 struct perf_cpu_context *cpuctx,
334 struct perf_counter_context *ctx)
335{
336 struct perf_counter *counter;
337
338 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
339 return;
340
341 counter_sched_out(group_counter, cpuctx, ctx);
342
343 /*
344 * Schedule out siblings (if any):
345 */
346 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
347 counter_sched_out(counter, cpuctx, ctx);
348
349 if (group_counter->attr.exclusive)
350 cpuctx->exclusive = 0;
351}
352
353/*
354 * Cross CPU call to remove a performance counter
355 *
356 * We disable the counter on the hardware level first. After that we
357 * remove it from the context list.
358 */
359static void __perf_counter_remove_from_context(void *info)
360{
361 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
362 struct perf_counter *counter = info;
363 struct perf_counter_context *ctx = counter->ctx;
364
365 /*
366 * If this is a task context, we need to check whether it is
367 * the current task context of this cpu. If not it has been
368 * scheduled out before the smp call arrived.
369 */
370 if (ctx->task && cpuctx->task_ctx != ctx)
371 return;
372
373 spin_lock(&ctx->lock);
374 /*
375 * Protect the list operation against NMI by disabling the
376 * counters on a global level.
377 */
378 perf_disable();
379
380 counter_sched_out(counter, cpuctx, ctx);
381
382 list_del_counter(counter, ctx);
383
384 if (!ctx->task) {
385 /*
386 * Allow more per task counters with respect to the
387 * reservation:
388 */
389 cpuctx->max_pertask =
390 min(perf_max_counters - ctx->nr_counters,
391 perf_max_counters - perf_reserved_percpu);
392 }
393
394 perf_enable();
395 spin_unlock(&ctx->lock);
396}
397
398
399/*
400 * Remove the counter from a task's (or a CPU's) list of counters.
401 *
402 * Must be called with ctx->mutex held.
403 *
404 * CPU counters are removed with a smp call. For task counters we only
405 * call when the task is on a CPU.
406 *
407 * If counter->ctx is a cloned context, callers must make sure that
408 * every task struct that counter->ctx->task could possibly point to
409 * remains valid. This is OK when called from perf_release since
410 * that only calls us on the top-level context, which can't be a clone.
411 * When called from perf_counter_exit_task, it's OK because the
412 * context has been detached from its task.
413 */
414static void perf_counter_remove_from_context(struct perf_counter *counter)
415{
416 struct perf_counter_context *ctx = counter->ctx;
417 struct task_struct *task = ctx->task;
418
419 if (!task) {
420 /*
421 * Per cpu counters are removed via an smp call and
422 * the removal is always successful.
423 */
424 smp_call_function_single(counter->cpu,
425 __perf_counter_remove_from_context,
426 counter, 1);
427 return;
428 }
429
430retry:
431 task_oncpu_function_call(task, __perf_counter_remove_from_context,
432 counter);
433
434 spin_lock_irq(&ctx->lock);
435 /*
436 * If the context is active we need to retry the smp call.
437 */
438 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
439 spin_unlock_irq(&ctx->lock);
440 goto retry;
441 }
442
443 /*
444 * The lock prevents that this context is scheduled in so we
445 * can remove the counter safely, if the call above did not
446 * succeed.
447 */
448 if (!list_empty(&counter->list_entry)) {
449 list_del_counter(counter, ctx);
450 }
451 spin_unlock_irq(&ctx->lock);
452}
453
454static inline u64 perf_clock(void)
455{
456 return cpu_clock(smp_processor_id());
457}
458
459/*
460 * Update the record of the current time in a context.
461 */
462static void update_context_time(struct perf_counter_context *ctx)
463{
464 u64 now = perf_clock();
465
466 ctx->time += now - ctx->timestamp;
467 ctx->timestamp = now;
468}
469
470/*
471 * Update the total_time_enabled and total_time_running fields for a counter.
472 */
473static void update_counter_times(struct perf_counter *counter)
474{
475 struct perf_counter_context *ctx = counter->ctx;
476 u64 run_end;
477
478 if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
479 counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
480 return;
481
482 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
483
484 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
485 run_end = counter->tstamp_stopped;
486 else
487 run_end = ctx->time;
488
489 counter->total_time_running = run_end - counter->tstamp_running;
490}
491
492/*
493 * Update total_time_enabled and total_time_running for all counters in a group.
494 */
495static void update_group_times(struct perf_counter *leader)
496{
497 struct perf_counter *counter;
498
499 update_counter_times(leader);
500 list_for_each_entry(counter, &leader->sibling_list, list_entry)
501 update_counter_times(counter);
502}
503
504/*
505 * Cross CPU call to disable a performance counter
506 */
507static void __perf_counter_disable(void *info)
508{
509 struct perf_counter *counter = info;
510 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
511 struct perf_counter_context *ctx = counter->ctx;
512
513 /*
514 * If this is a per-task counter, need to check whether this
515 * counter's task is the current task on this cpu.
516 */
517 if (ctx->task && cpuctx->task_ctx != ctx)
518 return;
519
520 spin_lock(&ctx->lock);
521
522 /*
523 * If the counter is on, turn it off.
524 * If it is in error state, leave it in error state.
525 */
526 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
527 update_context_time(ctx);
528 update_group_times(counter);
529 if (counter == counter->group_leader)
530 group_sched_out(counter, cpuctx, ctx);
531 else
532 counter_sched_out(counter, cpuctx, ctx);
533 counter->state = PERF_COUNTER_STATE_OFF;
534 }
535
536 spin_unlock(&ctx->lock);
537}
538
539/*
540 * Disable a counter.
541 *
542 * If counter->ctx is a cloned context, callers must make sure that
543 * every task struct that counter->ctx->task could possibly point to
544 * remains valid. This condition is satisfied when called through
545 * perf_counter_for_each_child or perf_counter_for_each because they
546 * hold the top-level counter's child_mutex, so any descendant that
547 * goes to exit will block in sync_child_counter.
548 * When called from perf_pending_counter it's OK because counter->ctx
549 * is the current context on this CPU and preemption is disabled,
550 * hence we can't get into perf_counter_task_sched_out for this context.
551 */
552static void perf_counter_disable(struct perf_counter *counter)
553{
554 struct perf_counter_context *ctx = counter->ctx;
555 struct task_struct *task = ctx->task;
556
557 if (!task) {
558 /*
559 * Disable the counter on the cpu that it's on
560 */
561 smp_call_function_single(counter->cpu, __perf_counter_disable,
562 counter, 1);
563 return;
564 }
565
566 retry:
567 task_oncpu_function_call(task, __perf_counter_disable, counter);
568
569 spin_lock_irq(&ctx->lock);
570 /*
571 * If the counter is still active, we need to retry the cross-call.
572 */
573 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
574 spin_unlock_irq(&ctx->lock);
575 goto retry;
576 }
577
578 /*
579 * Since we have the lock this context can't be scheduled
580 * in, so we can change the state safely.
581 */
582 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
583 update_group_times(counter);
584 counter->state = PERF_COUNTER_STATE_OFF;
585 }
586
587 spin_unlock_irq(&ctx->lock);
588}
589
590static int
591counter_sched_in(struct perf_counter *counter,
592 struct perf_cpu_context *cpuctx,
593 struct perf_counter_context *ctx,
594 int cpu)
595{
596 if (counter->state <= PERF_COUNTER_STATE_OFF)
597 return 0;
598
599 counter->state = PERF_COUNTER_STATE_ACTIVE;
600 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
601 /*
602 * The new state must be visible before we turn it on in the hardware:
603 */
604 smp_wmb();
605
606 if (counter->pmu->enable(counter)) {
607 counter->state = PERF_COUNTER_STATE_INACTIVE;
608 counter->oncpu = -1;
609 return -EAGAIN;
610 }
611
612 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
613
614 if (!is_software_counter(counter))
615 cpuctx->active_oncpu++;
616 ctx->nr_active++;
617
618 if (counter->attr.exclusive)
619 cpuctx->exclusive = 1;
620
621 return 0;
622}
623
624static int
625group_sched_in(struct perf_counter *group_counter,
626 struct perf_cpu_context *cpuctx,
627 struct perf_counter_context *ctx,
628 int cpu)
629{
630 struct perf_counter *counter, *partial_group;
631 int ret;
632
633 if (group_counter->state == PERF_COUNTER_STATE_OFF)
634 return 0;
635
636 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
637 if (ret)
638 return ret < 0 ? ret : 0;
639
640 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
641 return -EAGAIN;
642
643 /*
644 * Schedule in siblings as one group (if any):
645 */
646 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
647 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
648 partial_group = counter;
649 goto group_error;
650 }
651 }
652
653 return 0;
654
655group_error:
656 /*
657 * Groups can be scheduled in as one unit only, so undo any
658 * partial group before returning:
659 */
660 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
661 if (counter == partial_group)
662 break;
663 counter_sched_out(counter, cpuctx, ctx);
664 }
665 counter_sched_out(group_counter, cpuctx, ctx);
666
667 return -EAGAIN;
668}
669
670/*
671 * Return 1 for a group consisting entirely of software counters,
672 * 0 if the group contains any hardware counters.
673 */
674static int is_software_only_group(struct perf_counter *leader)
675{
676 struct perf_counter *counter;
677
678 if (!is_software_counter(leader))
679 return 0;
680
681 list_for_each_entry(counter, &leader->sibling_list, list_entry)
682 if (!is_software_counter(counter))
683 return 0;
684
685 return 1;
686}
687
688/*
689 * Work out whether we can put this counter group on the CPU now.
690 */
691static int group_can_go_on(struct perf_counter *counter,
692 struct perf_cpu_context *cpuctx,
693 int can_add_hw)
694{
695 /*
696 * Groups consisting entirely of software counters can always go on.
697 */
698 if (is_software_only_group(counter))
699 return 1;
700 /*
701 * If an exclusive group is already on, no other hardware
702 * counters can go on.
703 */
704 if (cpuctx->exclusive)
705 return 0;
706 /*
707 * If this group is exclusive and there are already
708 * counters on the CPU, it can't go on.
709 */
710 if (counter->attr.exclusive && cpuctx->active_oncpu)
711 return 0;
712 /*
713 * Otherwise, try to add it if all previous groups were able
714 * to go on.
715 */
716 return can_add_hw;
717}
718
719static void add_counter_to_ctx(struct perf_counter *counter,
720 struct perf_counter_context *ctx)
721{
722 list_add_counter(counter, ctx);
723 counter->tstamp_enabled = ctx->time;
724 counter->tstamp_running = ctx->time;
725 counter->tstamp_stopped = ctx->time;
726}
727
728/*
729 * Cross CPU call to install and enable a performance counter
730 *
731 * Must be called with ctx->mutex held
732 */
733static void __perf_install_in_context(void *info)
734{
735 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
736 struct perf_counter *counter = info;
737 struct perf_counter_context *ctx = counter->ctx;
738 struct perf_counter *leader = counter->group_leader;
739 int cpu = smp_processor_id();
740 int err;
741
742 /*
743 * If this is a task context, we need to check whether it is
744 * the current task context of this cpu. If not it has been
745 * scheduled out before the smp call arrived.
746 * Or possibly this is the right context but it isn't
747 * on this cpu because it had no counters.
748 */
749 if (ctx->task && cpuctx->task_ctx != ctx) {
750 if (cpuctx->task_ctx || ctx->task != current)
751 return;
752 cpuctx->task_ctx = ctx;
753 }
754
755 spin_lock(&ctx->lock);
756 ctx->is_active = 1;
757 update_context_time(ctx);
758
759 /*
760 * Protect the list operation against NMI by disabling the
761 * counters on a global level. NOP for non NMI based counters.
762 */
763 perf_disable();
764
765 add_counter_to_ctx(counter, ctx);
766
767 /*
768 * Don't put the counter on if it is disabled or if
769 * it is in a group and the group isn't on.
770 */
771 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
772 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
773 goto unlock;
774
775 /*
776 * An exclusive counter can't go on if there are already active
777 * hardware counters, and no hardware counter can go on if there
778 * is already an exclusive counter on.
779 */
780 if (!group_can_go_on(counter, cpuctx, 1))
781 err = -EEXIST;
782 else
783 err = counter_sched_in(counter, cpuctx, ctx, cpu);
784
785 if (err) {
786 /*
787 * This counter couldn't go on. If it is in a group
788 * then we have to pull the whole group off.
789 * If the counter group is pinned then put it in error state.
790 */
791 if (leader != counter)
792 group_sched_out(leader, cpuctx, ctx);
793 if (leader->attr.pinned) {
794 update_group_times(leader);
795 leader->state = PERF_COUNTER_STATE_ERROR;
796 }
797 }
798
799 if (!err && !ctx->task && cpuctx->max_pertask)
800 cpuctx->max_pertask--;
801
802 unlock:
803 perf_enable();
804
805 spin_unlock(&ctx->lock);
806}
807
808/*
809 * Attach a performance counter to a context
810 *
811 * First we add the counter to the list with the hardware enable bit
812 * in counter->hw_config cleared.
813 *
814 * If the counter is attached to a task which is on a CPU we use a smp
815 * call to enable it in the task context. The task might have been
816 * scheduled away, but we check this in the smp call again.
817 *
818 * Must be called with ctx->mutex held.
819 */
820static void
821perf_install_in_context(struct perf_counter_context *ctx,
822 struct perf_counter *counter,
823 int cpu)
824{
825 struct task_struct *task = ctx->task;
826
827 if (!task) {
828 /*
829 * Per cpu counters are installed via an smp call and
830	 * the install is always successful.
831 */
832 smp_call_function_single(cpu, __perf_install_in_context,
833 counter, 1);
834 return;
835 }
836
837retry:
838 task_oncpu_function_call(task, __perf_install_in_context,
839 counter);
840
841 spin_lock_irq(&ctx->lock);
842 /*
843	 * If the context is active but the counter was not added, we need to retry the smp call.
844 */
845 if (ctx->is_active && list_empty(&counter->list_entry)) {
846 spin_unlock_irq(&ctx->lock);
847 goto retry;
848 }
849
850 /*
851	 * The lock prevents this context from being scheduled in, so we
852	 * can add the counter safely if the call above did not
853	 * succeed.
854 */
855 if (list_empty(&counter->list_entry))
856 add_counter_to_ctx(counter, ctx);
857 spin_unlock_irq(&ctx->lock);
858}
859
860/*
861 * Put a counter into inactive state and update time fields.
862 * Enabling the leader of a group effectively enables all
863 * the group members that aren't explicitly disabled, so we
864 * have to update their ->tstamp_enabled also.
865 * Note: this works for group members as well as group leaders
866 * since the non-leader members' sibling_lists will be empty.
867 */
868static void __perf_counter_mark_enabled(struct perf_counter *counter,
869 struct perf_counter_context *ctx)
870{
871 struct perf_counter *sub;
872
873 counter->state = PERF_COUNTER_STATE_INACTIVE;
874 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
875 list_for_each_entry(sub, &counter->sibling_list, list_entry)
876 if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
877 sub->tstamp_enabled =
878 ctx->time - sub->total_time_enabled;
879}
880
881/*
882 * Cross CPU call to enable a performance counter
883 */
884static void __perf_counter_enable(void *info)
885{
886 struct perf_counter *counter = info;
887 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
888 struct perf_counter_context *ctx = counter->ctx;
889 struct perf_counter *leader = counter->group_leader;
890 int err;
891
892 /*
893	 * If this is a per-task counter, we need to check whether this
894 * counter's task is the current task on this cpu.
895 */
896 if (ctx->task && cpuctx->task_ctx != ctx) {
897 if (cpuctx->task_ctx || ctx->task != current)
898 return;
899 cpuctx->task_ctx = ctx;
900 }
901
902 spin_lock(&ctx->lock);
903 ctx->is_active = 1;
904 update_context_time(ctx);
905
906 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
907 goto unlock;
908 __perf_counter_mark_enabled(counter, ctx);
909
910 /*
911 * If the counter is in a group and isn't the group leader,
912 * then don't put it on unless the group is on.
913 */
914 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
915 goto unlock;
916
917 if (!group_can_go_on(counter, cpuctx, 1)) {
918 err = -EEXIST;
919 } else {
920 perf_disable();
921 if (counter == leader)
922 err = group_sched_in(counter, cpuctx, ctx,
923 smp_processor_id());
924 else
925 err = counter_sched_in(counter, cpuctx, ctx,
926 smp_processor_id());
927 perf_enable();
928 }
929
930 if (err) {
931 /*
932 * If this counter can't go on and it's part of a
933 * group, then the whole group has to come off.
934 */
935 if (leader != counter)
936 group_sched_out(leader, cpuctx, ctx);
937 if (leader->attr.pinned) {
938 update_group_times(leader);
939 leader->state = PERF_COUNTER_STATE_ERROR;
940 }
941 }
942
943 unlock:
944 spin_unlock(&ctx->lock);
945}
946
947/*
948 * Enable a counter.
949 *
950 * If counter->ctx is a cloned context, callers must make sure that
951 * every task struct that counter->ctx->task could possibly point to
952 * remains valid. This condition is satisfied when called through
953 * perf_counter_for_each_child or perf_counter_for_each as described
954 * for perf_counter_disable.
955 */
956static void perf_counter_enable(struct perf_counter *counter)
957{
958 struct perf_counter_context *ctx = counter->ctx;
959 struct task_struct *task = ctx->task;
960
961 if (!task) {
962 /*
963 * Enable the counter on the cpu that it's on
964 */
965 smp_call_function_single(counter->cpu, __perf_counter_enable,
966 counter, 1);
967 return;
968 }
969
970 spin_lock_irq(&ctx->lock);
971 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
972 goto out;
973
974 /*
975 * If the counter is in error state, clear that first.
976 * That way, if we see the counter in error state below, we
977 * know that it has gone back into error state, as distinct
978 * from the task having been scheduled away before the
979 * cross-call arrived.
980 */
981 if (counter->state == PERF_COUNTER_STATE_ERROR)
982 counter->state = PERF_COUNTER_STATE_OFF;
983
984 retry:
985 spin_unlock_irq(&ctx->lock);
986 task_oncpu_function_call(task, __perf_counter_enable, counter);
987
988 spin_lock_irq(&ctx->lock);
989
990 /*
991 * If the context is active and the counter is still off,
992 * we need to retry the cross-call.
993 */
994 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
995 goto retry;
996
997 /*
998 * Since we have the lock this context can't be scheduled
999 * in, so we can change the state safely.
1000 */
1001 if (counter->state == PERF_COUNTER_STATE_OFF)
1002 __perf_counter_mark_enabled(counter, ctx);
1003
1004 out:
1005 spin_unlock_irq(&ctx->lock);
1006}
1007
1008static int perf_counter_refresh(struct perf_counter *counter, int refresh)
1009{
1010 /*
1011 * not supported on inherited counters
1012 */
1013 if (counter->attr.inherit)
1014 return -EINVAL;
1015
1016 atomic_add(refresh, &counter->event_limit);
1017 perf_counter_enable(counter);
1018
1019 return 0;
1020}
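/*
 * A minimal userspace sketch of the refresh interface above, assuming
 * "fd" is a counter file descriptor that was opened elsewhere with a
 * sample_period set and an mmap()ed buffer attached (see perf_mmap()
 * further down): PERF_COUNTER_IOC_REFRESH arms the counter for "arg"
 * more overflow events and enables it, so a profiler can then poll()
 * for the overflow data:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	ioctl(fd, PERF_COUNTER_IOC_REFRESH, 1);
 *	poll(&pfd, 1, -1);
 */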
1021
1022void __perf_counter_sched_out(struct perf_counter_context *ctx,
1023 struct perf_cpu_context *cpuctx)
1024{
1025 struct perf_counter *counter;
1026
1027 spin_lock(&ctx->lock);
1028 ctx->is_active = 0;
1029 if (likely(!ctx->nr_counters))
1030 goto out;
1031 update_context_time(ctx);
1032
1033 perf_disable();
1034 if (ctx->nr_active) {
1035 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1036 if (counter != counter->group_leader)
1037 counter_sched_out(counter, cpuctx, ctx);
1038 else
1039 group_sched_out(counter, cpuctx, ctx);
1040 }
1041 }
1042 perf_enable();
1043 out:
1044 spin_unlock(&ctx->lock);
1045}
1046
1047/*
1048 * Test whether two contexts are equivalent, i.e. whether they
1049 * have both been cloned from the same version of the same context
1050 * and they both have the same number of enabled counters.
1051 * If the number of enabled counters is the same, then the set
1052 * of enabled counters should be the same, because these are both
1053 * inherited contexts, therefore we can't access individual counters
1054 * in them directly with an fd; we can only enable/disable all
1055 * counters via prctl, or enable/disable all counters in a family
1056 * via ioctl, which will have the same effect on both contexts.
1057 */
1058static int context_equiv(struct perf_counter_context *ctx1,
1059 struct perf_counter_context *ctx2)
1060{
1061 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1062 && ctx1->parent_gen == ctx2->parent_gen
1063 && !ctx1->pin_count && !ctx2->pin_count;
1064}
1065
1066static void __perf_counter_read(void *counter);
1067
1068static void __perf_counter_sync_stat(struct perf_counter *counter,
1069 struct perf_counter *next_counter)
1070{
1071 u64 value;
1072
1073 if (!counter->attr.inherit_stat)
1074 return;
1075
1076 /*
1077	 * Update the counter value. We cannot use perf_counter_read()
1078	 * because we're in the middle of a context switch and have IRQs
1079	 * disabled, which upsets smp_call_function_single(). However,
1080	 * we know the counter must be on the current CPU, so we
1081	 * don't need to use it.
1082 */
1083 switch (counter->state) {
1084 case PERF_COUNTER_STATE_ACTIVE:
1085 __perf_counter_read(counter);
1086 break;
1087
1088 case PERF_COUNTER_STATE_INACTIVE:
1089 update_counter_times(counter);
1090 break;
1091
1092 default:
1093 break;
1094 }
1095
1096 /*
1097 * In order to keep per-task stats reliable we need to flip the counter
1098 * values when we flip the contexts.
1099 */
1100 value = atomic64_read(&next_counter->count);
1101 value = atomic64_xchg(&counter->count, value);
1102 atomic64_set(&next_counter->count, value);
1103
1104 swap(counter->total_time_enabled, next_counter->total_time_enabled);
1105 swap(counter->total_time_running, next_counter->total_time_running);
1106
1107 /*
1108 * Since we swizzled the values, update the user visible data too.
1109 */
1110 perf_counter_update_userpage(counter);
1111 perf_counter_update_userpage(next_counter);
1112}
1113
1114#define list_next_entry(pos, member) \
1115 list_entry(pos->member.next, typeof(*pos), member)
1116
1117static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1118 struct perf_counter_context *next_ctx)
1119{
1120 struct perf_counter *counter, *next_counter;
1121
1122 if (!ctx->nr_stat)
1123 return;
1124
1125 counter = list_first_entry(&ctx->event_list,
1126 struct perf_counter, event_entry);
1127
1128 next_counter = list_first_entry(&next_ctx->event_list,
1129 struct perf_counter, event_entry);
1130
1131 while (&counter->event_entry != &ctx->event_list &&
1132 &next_counter->event_entry != &next_ctx->event_list) {
1133
1134 __perf_counter_sync_stat(counter, next_counter);
1135
1136 counter = list_next_entry(counter, event_entry);
1137 next_counter = list_next_entry(next_counter, event_entry);
1138 }
1139}
1140
1141/*
1142 * Called from scheduler to remove the counters of the current task,
1143 * with interrupts disabled.
1144 *
1145 * We stop each counter and update the counter value in counter->count.
1146 *
1147 * This does not protect us against NMI, but disable()
1148 * sets the disabled bit in the control field of counter _before_
1149	 * accessing the counter control register. If an NMI hits, then it will
1150 * not restart the counter.
1151 */
1152void perf_counter_task_sched_out(struct task_struct *task,
1153 struct task_struct *next, int cpu)
1154{
1155 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1156 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1157 struct perf_counter_context *next_ctx;
1158 struct perf_counter_context *parent;
1159 struct pt_regs *regs;
1160 int do_switch = 1;
1161
1162 regs = task_pt_regs(task);
1163 perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1164
1165 if (likely(!ctx || !cpuctx->task_ctx))
1166 return;
1167
1168 update_context_time(ctx);
1169
1170 rcu_read_lock();
1171 parent = rcu_dereference(ctx->parent_ctx);
1172 next_ctx = next->perf_counter_ctxp;
1173 if (parent && next_ctx &&
1174 rcu_dereference(next_ctx->parent_ctx) == parent) {
1175 /*
1176 * Looks like the two contexts are clones, so we might be
1177 * able to optimize the context switch. We lock both
1178 * contexts and check that they are clones under the
1179 * lock (including re-checking that neither has been
1180 * uncloned in the meantime). It doesn't matter which
1181 * order we take the locks because no other cpu could
1182 * be trying to lock both of these tasks.
1183 */
1184 spin_lock(&ctx->lock);
1185 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1186 if (context_equiv(ctx, next_ctx)) {
1187 /*
1188 * XXX do we need a memory barrier of sorts
1189	 * wrt rcu_dereference() of perf_counter_ctxp?
1190 */
1191 task->perf_counter_ctxp = next_ctx;
1192 next->perf_counter_ctxp = ctx;
1193 ctx->task = next;
1194 next_ctx->task = task;
1195 do_switch = 0;
1196
1197 perf_counter_sync_stat(ctx, next_ctx);
1198 }
1199 spin_unlock(&next_ctx->lock);
1200 spin_unlock(&ctx->lock);
1201 }
1202 rcu_read_unlock();
1203
1204 if (do_switch) {
1205 __perf_counter_sched_out(ctx, cpuctx);
1206 cpuctx->task_ctx = NULL;
1207 }
1208}
1209
1210/*
1211 * Called with IRQs disabled
1212 */
1213static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
1214{
1215 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1216
1217 if (!cpuctx->task_ctx)
1218 return;
1219
1220 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1221 return;
1222
1223 __perf_counter_sched_out(ctx, cpuctx);
1224 cpuctx->task_ctx = NULL;
1225}
1226
1227/*
1228 * Called with IRQs disabled
1229 */
1230static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
1231{
1232 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
1233}
1234
1235static void
1236__perf_counter_sched_in(struct perf_counter_context *ctx,
1237 struct perf_cpu_context *cpuctx, int cpu)
1238{
1239 struct perf_counter *counter;
1240 int can_add_hw = 1;
1241
1242 spin_lock(&ctx->lock);
1243 ctx->is_active = 1;
1244 if (likely(!ctx->nr_counters))
1245 goto out;
1246
1247 ctx->timestamp = perf_clock();
1248
1249 perf_disable();
1250
1251 /*
1252 * First go through the list and put on any pinned groups
1253 * in order to give them the best chance of going on.
1254 */
1255 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1256 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1257 !counter->attr.pinned)
1258 continue;
1259 if (counter->cpu != -1 && counter->cpu != cpu)
1260 continue;
1261
1262 if (counter != counter->group_leader)
1263 counter_sched_in(counter, cpuctx, ctx, cpu);
1264 else {
1265 if (group_can_go_on(counter, cpuctx, 1))
1266 group_sched_in(counter, cpuctx, ctx, cpu);
1267 }
1268
1269 /*
1270 * If this pinned group hasn't been scheduled,
1271 * put it in error state.
1272 */
1273 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1274 update_group_times(counter);
1275 counter->state = PERF_COUNTER_STATE_ERROR;
1276 }
1277 }
1278
1279 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1280 /*
1281 * Ignore counters in OFF or ERROR state, and
1282 * ignore pinned counters since we did them already.
1283 */
1284 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1285 counter->attr.pinned)
1286 continue;
1287
1288 /*
1289 * Listen to the 'cpu' scheduling filter constraint
1290 * of counters:
1291 */
1292 if (counter->cpu != -1 && counter->cpu != cpu)
1293 continue;
1294
1295 if (counter != counter->group_leader) {
1296 if (counter_sched_in(counter, cpuctx, ctx, cpu))
1297 can_add_hw = 0;
1298 } else {
1299 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
1300 if (group_sched_in(counter, cpuctx, ctx, cpu))
1301 can_add_hw = 0;
1302 }
1303 }
1304 }
1305 perf_enable();
1306 out:
1307 spin_unlock(&ctx->lock);
1308}
1309
1310/*
1311 * Called from scheduler to add the counters of the current task
1312 * with interrupts disabled.
1313 *
1314 * We restore the counter value and then enable it.
1315 *
1316 * This does not protect us against NMI, but enable()
1317 * sets the enabled bit in the control field of counter _before_
1318	 * accessing the counter control register. If an NMI hits, then it will
1319 * keep the counter running.
1320 */
1321void perf_counter_task_sched_in(struct task_struct *task, int cpu)
1322{
1323 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1324 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1325
1326 if (likely(!ctx))
1327 return;
1328 if (cpuctx->task_ctx == ctx)
1329 return;
1330 __perf_counter_sched_in(ctx, cpuctx, cpu);
1331 cpuctx->task_ctx = ctx;
1332}
1333
1334static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1335{
1336 struct perf_counter_context *ctx = &cpuctx->ctx;
1337
1338 __perf_counter_sched_in(ctx, cpuctx, cpu);
1339}
1340
1341#define MAX_INTERRUPTS (~0ULL)
1342
1343static void perf_log_throttle(struct perf_counter *counter, int enable);
1344
1345static void perf_adjust_period(struct perf_counter *counter, u64 events)
1346{
1347 struct hw_perf_counter *hwc = &counter->hw;
1348 u64 period, sample_period;
1349 s64 delta;
1350
1351 events *= hwc->sample_period;
1352 period = div64_u64(events, counter->attr.sample_freq);
1353
1354 delta = (s64)(period - hwc->sample_period);
1355 delta = (delta + 7) / 8; /* low pass filter */
1356
1357 sample_period = hwc->sample_period + delta;
1358
1359 if (!sample_period)
1360 sample_period = 1;
1361
1362 hwc->sample_period = sample_period;
1363}
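/*
 * Worked example of the adjustment above, assuming HZ=1000 and a
 * counter opened with attr.sample_freq = 1000: if the last tick saw
 * 2 overflow interrupts with hwc->sample_period = 10000, the caller
 * passes events = 1000 * 2, so
 *
 *	events        = 2000 * 10000      = 20,000,000 events/sec (estimate)
 *	period        = 20,000,000 / 1000 = 20000
 *	delta         = (20000 - 10000 + 7) / 8 = 1250
 *	sample_period = 10000 + 1250      = 11250
 *
 * i.e. the period moves roughly 1/8th of the way towards the value
 * that would yield the requested sample frequency.
 */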
1364
1365static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1366{
1367 struct perf_counter *counter;
1368 struct hw_perf_counter *hwc;
1369 u64 interrupts, freq;
1370
1371 spin_lock(&ctx->lock);
1372 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1373 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1374 continue;
1375
1376 hwc = &counter->hw;
1377
1378 interrupts = hwc->interrupts;
1379 hwc->interrupts = 0;
1380
1381 /*
1382 * unthrottle counters on the tick
1383 */
1384 if (interrupts == MAX_INTERRUPTS) {
1385 perf_log_throttle(counter, 1);
1386 counter->pmu->unthrottle(counter);
1387 interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
1388 }
1389
1390 if (!counter->attr.freq || !counter->attr.sample_freq)
1391 continue;
1392
1393 /*
1394 * if the specified freq < HZ then we need to skip ticks
1395 */
1396 if (counter->attr.sample_freq < HZ) {
1397 freq = counter->attr.sample_freq;
1398
1399 hwc->freq_count += freq;
1400 hwc->freq_interrupts += interrupts;
1401
1402 if (hwc->freq_count < HZ)
1403 continue;
1404
1405 interrupts = hwc->freq_interrupts;
1406 hwc->freq_interrupts = 0;
1407 hwc->freq_count -= HZ;
1408 } else
1409 freq = HZ;
1410
1411 perf_adjust_period(counter, freq * interrupts);
1412
1413 /*
1414 * In order to avoid being stalled by an (accidental) huge
1415 * sample period, force reset the sample period if we didn't
1416 * get any events in this freq period.
1417 */
1418 if (!interrupts) {
1419 perf_disable();
1420 counter->pmu->disable(counter);
1421 atomic64_set(&hwc->period_left, 0);
1422 counter->pmu->enable(counter);
1423 perf_enable();
1424 }
1425 }
1426 spin_unlock(&ctx->lock);
1427}
1428
1429/*
1430 * Round-robin a context's counters:
1431 */
1432static void rotate_ctx(struct perf_counter_context *ctx)
1433{
1434 struct perf_counter *counter;
1435
1436 if (!ctx->nr_counters)
1437 return;
1438
1439 spin_lock(&ctx->lock);
1440 /*
1441 * Rotate the first entry last (works just fine for group counters too):
1442 */
1443 perf_disable();
1444 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1445 list_move_tail(&counter->list_entry, &ctx->counter_list);
1446 break;
1447 }
1448 perf_enable();
1449
1450 spin_unlock(&ctx->lock);
1451}
1452
1453void perf_counter_task_tick(struct task_struct *curr, int cpu)
1454{
1455 struct perf_cpu_context *cpuctx;
1456 struct perf_counter_context *ctx;
1457
1458 if (!atomic_read(&nr_counters))
1459 return;
1460
1461 cpuctx = &per_cpu(perf_cpu_context, cpu);
1462 ctx = curr->perf_counter_ctxp;
1463
1464 perf_ctx_adjust_freq(&cpuctx->ctx);
1465 if (ctx)
1466 perf_ctx_adjust_freq(ctx);
1467
1468 perf_counter_cpu_sched_out(cpuctx);
1469 if (ctx)
1470 __perf_counter_task_sched_out(ctx);
1471
1472 rotate_ctx(&cpuctx->ctx);
1473 if (ctx)
1474 rotate_ctx(ctx);
1475
1476 perf_counter_cpu_sched_in(cpuctx, cpu);
1477 if (ctx)
1478 perf_counter_task_sched_in(curr, cpu);
1479}
1480
1481/*
1482 * Enable all of a task's counters that have been marked enable-on-exec.
1483 * This expects task == current.
1484 */
1485static void perf_counter_enable_on_exec(struct task_struct *task)
1486{
1487 struct perf_counter_context *ctx;
1488 struct perf_counter *counter;
1489 unsigned long flags;
1490 int enabled = 0;
1491
1492 local_irq_save(flags);
1493 ctx = task->perf_counter_ctxp;
1494 if (!ctx || !ctx->nr_counters)
1495 goto out;
1496
1497 __perf_counter_task_sched_out(ctx);
1498
1499 spin_lock(&ctx->lock);
1500
1501 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1502 if (!counter->attr.enable_on_exec)
1503 continue;
1504 counter->attr.enable_on_exec = 0;
1505 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1506 continue;
1507 __perf_counter_mark_enabled(counter, ctx);
1508 enabled = 1;
1509 }
1510
1511 /*
1512 * Unclone this context if we enabled any counter.
1513 */
1514 if (enabled)
1515 unclone_ctx(ctx);
1516
1517 spin_unlock(&ctx->lock);
1518
1519 perf_counter_task_sched_in(task, smp_processor_id());
1520 out:
1521 local_irq_restore(flags);
1522}
1523
1524/*
1525 * Cross CPU call to read the hardware counter
1526 */
1527static void __perf_counter_read(void *info)
1528{
1529 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1530 struct perf_counter *counter = info;
1531 struct perf_counter_context *ctx = counter->ctx;
1532 unsigned long flags;
1533
1534 /*
1535 * If this is a task context, we need to check whether it is
1536 * the current task context of this cpu. If not it has been
1537 * scheduled out before the smp call arrived. In that case
1538 * counter->count would have been updated to a recent sample
1539 * when the counter was scheduled out.
1540 */
1541 if (ctx->task && cpuctx->task_ctx != ctx)
1542 return;
1543
1544 local_irq_save(flags);
1545 if (ctx->is_active)
1546 update_context_time(ctx);
1547 counter->pmu->read(counter);
1548 update_counter_times(counter);
1549 local_irq_restore(flags);
1550}
1551
1552static u64 perf_counter_read(struct perf_counter *counter)
1553{
1554 /*
1555 * If counter is enabled and currently active on a CPU, update the
1556 * value in the counter structure:
1557 */
1558 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1559 smp_call_function_single(counter->oncpu,
1560 __perf_counter_read, counter, 1);
1561 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1562 update_counter_times(counter);
1563 }
1564
1565 return atomic64_read(&counter->count);
1566}
1567
1568/*
1569 * Initialize the perf_counter context in a task_struct:
1570 */
1571static void
1572__perf_counter_init_context(struct perf_counter_context *ctx,
1573 struct task_struct *task)
1574{
1575 memset(ctx, 0, sizeof(*ctx));
1576 spin_lock_init(&ctx->lock);
1577 mutex_init(&ctx->mutex);
1578 INIT_LIST_HEAD(&ctx->counter_list);
1579 INIT_LIST_HEAD(&ctx->event_list);
1580 atomic_set(&ctx->refcount, 1);
1581 ctx->task = task;
1582}
1583
1584static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1585{
1586 struct perf_counter_context *ctx;
1587 struct perf_cpu_context *cpuctx;
1588 struct task_struct *task;
1589 unsigned long flags;
1590 int err;
1591
1592 /*
1593 * If cpu is not a wildcard then this is a percpu counter:
1594 */
1595 if (cpu != -1) {
1596 /* Must be root to operate on a CPU counter: */
1597 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1598 return ERR_PTR(-EACCES);
1599
1600 if (cpu < 0 || cpu > num_possible_cpus())
1601 return ERR_PTR(-EINVAL);
1602
1603 /*
1604	 * We could be clever and allow attaching a counter to an
1605 * offline CPU and activate it when the CPU comes up, but
1606 * that's for later.
1607 */
1608 if (!cpu_isset(cpu, cpu_online_map))
1609 return ERR_PTR(-ENODEV);
1610
1611 cpuctx = &per_cpu(perf_cpu_context, cpu);
1612 ctx = &cpuctx->ctx;
1613 get_ctx(ctx);
1614
1615 return ctx;
1616 }
1617
1618 rcu_read_lock();
1619 if (!pid)
1620 task = current;
1621 else
1622 task = find_task_by_vpid(pid);
1623 if (task)
1624 get_task_struct(task);
1625 rcu_read_unlock();
1626
1627 if (!task)
1628 return ERR_PTR(-ESRCH);
1629
1630 /*
1631 * Can't attach counters to a dying task.
1632 */
1633 err = -ESRCH;
1634 if (task->flags & PF_EXITING)
1635 goto errout;
1636
1637 /* Reuse ptrace permission checks for now. */
1638 err = -EACCES;
1639 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1640 goto errout;
1641
1642 retry:
1643 ctx = perf_lock_task_context(task, &flags);
1644 if (ctx) {
1645 unclone_ctx(ctx);
1646 spin_unlock_irqrestore(&ctx->lock, flags);
1647 }
1648
1649 if (!ctx) {
1650 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1651 err = -ENOMEM;
1652 if (!ctx)
1653 goto errout;
1654 __perf_counter_init_context(ctx, task);
1655 get_ctx(ctx);
1656 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1657 /*
1658 * We raced with some other task; use
1659 * the context they set.
1660 */
1661 kfree(ctx);
1662 goto retry;
1663 }
1664 get_task_struct(task);
1665 }
1666
1667 put_task_struct(task);
1668 return ctx;
1669
1670 errout:
1671 put_task_struct(task);
1672 return ERR_PTR(err);
1673}
1674
1675static void free_counter_rcu(struct rcu_head *head)
1676{
1677 struct perf_counter *counter;
1678
1679 counter = container_of(head, struct perf_counter, rcu_head);
1680 if (counter->ns)
1681 put_pid_ns(counter->ns);
1682 kfree(counter);
1683}
1684
1685static void perf_pending_sync(struct perf_counter *counter);
1686
1687static void free_counter(struct perf_counter *counter)
1688{
1689 perf_pending_sync(counter);
1690
1691 if (!counter->parent) {
1692 atomic_dec(&nr_counters);
1693 if (counter->attr.mmap)
1694 atomic_dec(&nr_mmap_counters);
1695 if (counter->attr.comm)
1696 atomic_dec(&nr_comm_counters);
1697 if (counter->attr.task)
1698 atomic_dec(&nr_task_counters);
1699 }
1700
1701 if (counter->output) {
1702 fput(counter->output->filp);
1703 counter->output = NULL;
1704 }
1705
1706 if (counter->destroy)
1707 counter->destroy(counter);
1708
1709 put_ctx(counter->ctx);
1710 call_rcu(&counter->rcu_head, free_counter_rcu);
1711}
1712
1713/*
1714 * Called when the last reference to the file is gone.
1715 */
1716static int perf_release(struct inode *inode, struct file *file)
1717{
1718 struct perf_counter *counter = file->private_data;
1719 struct perf_counter_context *ctx = counter->ctx;
1720
1721 file->private_data = NULL;
1722
1723 WARN_ON_ONCE(ctx->parent_ctx);
1724 mutex_lock(&ctx->mutex);
1725 perf_counter_remove_from_context(counter);
1726 mutex_unlock(&ctx->mutex);
1727
1728 mutex_lock(&counter->owner->perf_counter_mutex);
1729 list_del_init(&counter->owner_entry);
1730 mutex_unlock(&counter->owner->perf_counter_mutex);
1731 put_task_struct(counter->owner);
1732
1733 free_counter(counter);
1734
1735 return 0;
1736}
1737
1738static int perf_counter_read_size(struct perf_counter *counter)
1739{
1740 int entry = sizeof(u64); /* value */
1741 int size = 0;
1742 int nr = 1;
1743
1744 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1745 size += sizeof(u64);
1746
1747 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1748 size += sizeof(u64);
1749
1750 if (counter->attr.read_format & PERF_FORMAT_ID)
1751 entry += sizeof(u64);
1752
1753 if (counter->attr.read_format & PERF_FORMAT_GROUP) {
1754 nr += counter->group_leader->nr_siblings;
1755 size += sizeof(u64);
1756 }
1757
1758 size += entry * nr;
1759
1760 return size;
1761}
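/*
 * Example of the sizing above, assuming a group leader with two
 * siblings opened with read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID:
 * entry = 16 bytes (value + id), nr = 3 plus one leading u64 for the
 * "nr" field, so read() needs a buffer of 8 + 3 * 16 = 56 bytes; each
 * TOTAL_TIME_* bit would add another 8 bytes.
 */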
1762
1763static u64 perf_counter_read_value(struct perf_counter *counter)
1764{
1765 struct perf_counter *child;
1766 u64 total = 0;
1767
1768 total += perf_counter_read(counter);
1769 list_for_each_entry(child, &counter->child_list, child_list)
1770 total += perf_counter_read(child);
1771
1772 return total;
1773}
1774
1775static int perf_counter_read_entry(struct perf_counter *counter,
1776 u64 read_format, char __user *buf)
1777{
1778 int n = 0, count = 0;
1779 u64 values[2];
1780
1781 values[n++] = perf_counter_read_value(counter);
1782 if (read_format & PERF_FORMAT_ID)
1783 values[n++] = primary_counter_id(counter);
1784
1785 count = n * sizeof(u64);
1786
1787 if (copy_to_user(buf, values, count))
1788 return -EFAULT;
1789
1790 return count;
1791}
1792
1793static int perf_counter_read_group(struct perf_counter *counter,
1794 u64 read_format, char __user *buf)
1795{
1796 struct perf_counter *leader = counter->group_leader, *sub;
1797 int n = 0, size = 0, err = -EFAULT;
1798 u64 values[3];
1799
1800 values[n++] = 1 + leader->nr_siblings;
1801 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1802 values[n++] = leader->total_time_enabled +
1803 atomic64_read(&leader->child_total_time_enabled);
1804 }
1805 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1806 values[n++] = leader->total_time_running +
1807 atomic64_read(&leader->child_total_time_running);
1808 }
1809
1810 size = n * sizeof(u64);
1811
1812 if (copy_to_user(buf, values, size))
1813 return -EFAULT;
1814
1815 err = perf_counter_read_entry(leader, read_format, buf + size);
1816 if (err < 0)
1817 return err;
1818
1819 size += err;
1820
1821 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1822 err = perf_counter_read_entry(sub, read_format,
1823 buf + size);
1824 if (err < 0)
1825 return err;
1826
1827 size += err;
1828 }
1829
1830 return size;
1831}
1832
1833static int perf_counter_read_one(struct perf_counter *counter,
1834 u64 read_format, char __user *buf)
1835{
1836 u64 values[4];
1837 int n = 0;
1838
1839 values[n++] = perf_counter_read_value(counter);
1840 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1841 values[n++] = counter->total_time_enabled +
1842 atomic64_read(&counter->child_total_time_enabled);
1843 }
1844 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1845 values[n++] = counter->total_time_running +
1846 atomic64_read(&counter->child_total_time_running);
1847 }
1848 if (read_format & PERF_FORMAT_ID)
1849 values[n++] = primary_counter_id(counter);
1850
1851 if (copy_to_user(buf, values, n * sizeof(u64)))
1852 return -EFAULT;
1853
1854 return n * sizeof(u64);
1855}
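/*
 * Matching userspace sketch for the non-group layout above, assuming
 * the counter was opened with all three optional read_format bits
 * (TOTAL_TIME_ENABLED, TOTAL_TIME_RUNNING and ID) set:
 *
 *	u64 buf[4];
 *
 *	if (read(fd, buf, sizeof(buf)) == sizeof(buf)) {
 *		u64 value   = buf[0];
 *		u64 enabled = buf[1];
 *		u64 running = buf[2];
 *		u64 id      = buf[3];
 *	}
 *
 * enabled/running allow scaling the value when counters were
 * multiplexed: scaled = value * enabled / running.
 */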
1856
1857/*
1858 * Read the performance counter - simple non blocking version for now
1859 */
1860static ssize_t
1861perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1862{
1863 u64 read_format = counter->attr.read_format;
1864 int ret;
1865
1866 /*
1867 * Return end-of-file for a read on a counter that is in
1868 * error state (i.e. because it was pinned but it couldn't be
1869 * scheduled on to the CPU at some point).
1870 */
1871 if (counter->state == PERF_COUNTER_STATE_ERROR)
1872 return 0;
1873
1874 if (count < perf_counter_read_size(counter))
1875 return -ENOSPC;
1876
1877 WARN_ON_ONCE(counter->ctx->parent_ctx);
1878 mutex_lock(&counter->child_mutex);
1879 if (read_format & PERF_FORMAT_GROUP)
1880 ret = perf_counter_read_group(counter, read_format, buf);
1881 else
1882 ret = perf_counter_read_one(counter, read_format, buf);
1883 mutex_unlock(&counter->child_mutex);
1884
1885 return ret;
1886}
1887
1888static ssize_t
1889perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1890{
1891 struct perf_counter *counter = file->private_data;
1892
1893 return perf_read_hw(counter, buf, count);
1894}
1895
1896static unsigned int perf_poll(struct file *file, poll_table *wait)
1897{
1898 struct perf_counter *counter = file->private_data;
1899 struct perf_mmap_data *data;
1900 unsigned int events = POLL_HUP;
1901
1902 rcu_read_lock();
1903 data = rcu_dereference(counter->data);
1904 if (data)
1905 events = atomic_xchg(&data->poll, 0);
1906 rcu_read_unlock();
1907
1908 poll_wait(file, &counter->waitq, wait);
1909
1910 return events;
1911}
1912
1913static void perf_counter_reset(struct perf_counter *counter)
1914{
1915 (void)perf_counter_read(counter);
1916 atomic64_set(&counter->count, 0);
1917 perf_counter_update_userpage(counter);
1918}
1919
1920/*
1921 * Holding the top-level counter's child_mutex means that any
1922 * descendant process that has inherited this counter will block
1923 * in sync_child_counter if it goes to exit, thus satisfying the
1924 * task existence requirements of perf_counter_enable/disable.
1925 */
1926static void perf_counter_for_each_child(struct perf_counter *counter,
1927 void (*func)(struct perf_counter *))
1928{
1929 struct perf_counter *child;
1930
1931 WARN_ON_ONCE(counter->ctx->parent_ctx);
1932 mutex_lock(&counter->child_mutex);
1933 func(counter);
1934 list_for_each_entry(child, &counter->child_list, child_list)
1935 func(child);
1936 mutex_unlock(&counter->child_mutex);
1937}
1938
1939static void perf_counter_for_each(struct perf_counter *counter,
1940 void (*func)(struct perf_counter *))
1941{
1942 struct perf_counter_context *ctx = counter->ctx;
1943 struct perf_counter *sibling;
1944
1945 WARN_ON_ONCE(ctx->parent_ctx);
1946 mutex_lock(&ctx->mutex);
1947 counter = counter->group_leader;
1948
1949 perf_counter_for_each_child(counter, func);
1950 func(counter);
1951 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1952 perf_counter_for_each_child(counter, func);
1953 mutex_unlock(&ctx->mutex);
1954}
1955
1956static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1957{
1958 struct perf_counter_context *ctx = counter->ctx;
1959 unsigned long size;
1960 int ret = 0;
1961 u64 value;
1962
1963 if (!counter->attr.sample_period)
1964 return -EINVAL;
1965
1966 size = copy_from_user(&value, arg, sizeof(value));
1967 if (size != sizeof(value))
1968 return -EFAULT;
1969
1970 if (!value)
1971 return -EINVAL;
1972
1973 spin_lock_irq(&ctx->lock);
1974 if (counter->attr.freq) {
1975 if (value > sysctl_perf_counter_sample_rate) {
1976 ret = -EINVAL;
1977 goto unlock;
1978 }
1979
1980 counter->attr.sample_freq = value;
1981 } else {
1982 counter->attr.sample_period = value;
1983 counter->hw.sample_period = value;
1984 }
1985unlock:
1986 spin_unlock_irq(&ctx->lock);
1987
1988 return ret;
1989}
1990
1991int perf_counter_set_output(struct perf_counter *counter, int output_fd);
1992
1993static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1994{
1995 struct perf_counter *counter = file->private_data;
1996 void (*func)(struct perf_counter *);
1997 u32 flags = arg;
1998
1999 switch (cmd) {
2000 case PERF_COUNTER_IOC_ENABLE:
2001 func = perf_counter_enable;
2002 break;
2003 case PERF_COUNTER_IOC_DISABLE:
2004 func = perf_counter_disable;
2005 break;
2006 case PERF_COUNTER_IOC_RESET:
2007 func = perf_counter_reset;
2008 break;
2009
2010 case PERF_COUNTER_IOC_REFRESH:
2011 return perf_counter_refresh(counter, arg);
2012
2013 case PERF_COUNTER_IOC_PERIOD:
2014 return perf_counter_period(counter, (u64 __user *)arg);
2015
2016 case PERF_COUNTER_IOC_SET_OUTPUT:
2017 return perf_counter_set_output(counter, arg);
2018
2019 default:
2020 return -ENOTTY;
2021 }
2022
2023 if (flags & PERF_IOC_FLAG_GROUP)
2024 perf_counter_for_each(counter, func);
2025 else
2026 perf_counter_for_each_child(counter, func);
2027
2028 return 0;
2029}
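/*
 * Typical userspace use of the ioctls handled above, assuming "fd"
 * refers to an already-opened counter:
 *
 *	ioctl(fd, PERF_COUNTER_IOC_RESET, 0);
 *	ioctl(fd, PERF_COUNTER_IOC_ENABLE, 0);
 *	...	code being measured	...
 *	ioctl(fd, PERF_COUNTER_IOC_DISABLE, 0);
 *
 *	u64 period = 100000;
 *	ioctl(fd, PERF_COUNTER_IOC_PERIOD, &period);
 *
 * Passing PERF_IOC_FLAG_GROUP as the argument to ENABLE/DISABLE/RESET
 * applies the operation to the whole group instead of a single counter.
 */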
2030
2031int perf_counter_task_enable(void)
2032{
2033 struct perf_counter *counter;
2034
2035 mutex_lock(&current->perf_counter_mutex);
2036 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2037 perf_counter_for_each_child(counter, perf_counter_enable);
2038 mutex_unlock(&current->perf_counter_mutex);
2039
2040 return 0;
2041}
2042
2043int perf_counter_task_disable(void)
2044{
2045 struct perf_counter *counter;
2046
2047 mutex_lock(&current->perf_counter_mutex);
2048 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2049 perf_counter_for_each_child(counter, perf_counter_disable);
2050 mutex_unlock(&current->perf_counter_mutex);
2051
2052 return 0;
2053}
2054
2055#ifndef PERF_COUNTER_INDEX_OFFSET
2056# define PERF_COUNTER_INDEX_OFFSET 0
2057#endif
2058
2059static int perf_counter_index(struct perf_counter *counter)
2060{
2061 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2062 return 0;
2063
2064 return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
2065}
2066
2067/*
2068 * Callers need to ensure there can be no nesting of this function, otherwise
2069	 * the seqlock logic goes bad. We cannot serialize this because the arch
2070 * code calls this from NMI context.
2071 */
2072void perf_counter_update_userpage(struct perf_counter *counter)
2073{
2074 struct perf_counter_mmap_page *userpg;
2075 struct perf_mmap_data *data;
2076
2077 rcu_read_lock();
2078 data = rcu_dereference(counter->data);
2079 if (!data)
2080 goto unlock;
2081
2082 userpg = data->user_page;
2083
2084 /*
2085 * Disable preemption so as to not let the corresponding user-space
2086 * spin too long if we get preempted.
2087 */
2088 preempt_disable();
2089 ++userpg->lock;
2090 barrier();
2091 userpg->index = perf_counter_index(counter);
2092 userpg->offset = atomic64_read(&counter->count);
2093 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
2094 userpg->offset -= atomic64_read(&counter->hw.prev_count);
2095
2096 userpg->time_enabled = counter->total_time_enabled +
2097 atomic64_read(&counter->child_total_time_enabled);
2098
2099 userpg->time_running = counter->total_time_running +
2100 atomic64_read(&counter->child_total_time_running);
2101
2102 barrier();
2103 ++userpg->lock;
2104 preempt_enable();
2105unlock:
2106 rcu_read_unlock();
2107}
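/*
 * Userspace counterpart of the update above -- a lockless read of the
 * mmap()ed control page, retried whenever the "lock" sequence count
 * changed underneath us.  "base" is assumed to be the address returned
 * by mmap() on the counter fd (see perf_mmap() below):
 *
 *	struct perf_counter_mmap_page *pc = base;
 *	u32 seq;
 *	u64 count;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		count = pc->offset;
 *		-- optionally add a user-space rdpmc(pc->index - 1) here --
 *		barrier();
 *	} while (pc->lock != seq);
 */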
2108
2109static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2110{
2111 struct perf_counter *counter = vma->vm_file->private_data;
2112 struct perf_mmap_data *data;
2113 int ret = VM_FAULT_SIGBUS;
2114
2115 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2116 if (vmf->pgoff == 0)
2117 ret = 0;
2118 return ret;
2119 }
2120
2121 rcu_read_lock();
2122 data = rcu_dereference(counter->data);
2123 if (!data)
2124 goto unlock;
2125
2126 if (vmf->pgoff == 0) {
2127 vmf->page = virt_to_page(data->user_page);
2128 } else {
2129 int nr = vmf->pgoff - 1;
2130
2131 if ((unsigned)nr > data->nr_pages)
2132 goto unlock;
2133
2134 if (vmf->flags & FAULT_FLAG_WRITE)
2135 goto unlock;
2136
2137 vmf->page = virt_to_page(data->data_pages[nr]);
2138 }
2139
2140 get_page(vmf->page);
2141 vmf->page->mapping = vma->vm_file->f_mapping;
2142 vmf->page->index = vmf->pgoff;
2143
2144 ret = 0;
2145unlock:
2146 rcu_read_unlock();
2147
2148 return ret;
2149}
2150
2151static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
2152{
2153 struct perf_mmap_data *data;
2154 unsigned long size;
2155 int i;
2156
2157 WARN_ON(atomic_read(&counter->mmap_count));
2158
2159 size = sizeof(struct perf_mmap_data);
2160 size += nr_pages * sizeof(void *);
2161
2162 data = kzalloc(size, GFP_KERNEL);
2163 if (!data)
2164 goto fail;
2165
2166 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2167 if (!data->user_page)
2168 goto fail_user_page;
2169
2170 for (i = 0; i < nr_pages; i++) {
2171 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2172 if (!data->data_pages[i])
2173 goto fail_data_pages;
2174 }
2175
2176 data->nr_pages = nr_pages;
2177 atomic_set(&data->lock, -1);
2178
2179 rcu_assign_pointer(counter->data, data);
2180
2181 return 0;
2182
2183fail_data_pages:
2184 for (i--; i >= 0; i--)
2185 free_page((unsigned long)data->data_pages[i]);
2186
2187 free_page((unsigned long)data->user_page);
2188
2189fail_user_page:
2190 kfree(data);
2191
2192fail:
2193 return -ENOMEM;
2194}
2195
2196static void perf_mmap_free_page(unsigned long addr)
2197{
2198 struct page *page = virt_to_page((void *)addr);
2199
2200 page->mapping = NULL;
2201 __free_page(page);
2202}
2203
2204static void __perf_mmap_data_free(struct rcu_head *rcu_head)
2205{
2206 struct perf_mmap_data *data;
2207 int i;
2208
2209 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2210
2211 perf_mmap_free_page((unsigned long)data->user_page);
2212 for (i = 0; i < data->nr_pages; i++)
2213 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2214
2215 kfree(data);
2216}
2217
2218static void perf_mmap_data_free(struct perf_counter *counter)
2219{
2220 struct perf_mmap_data *data = counter->data;
2221
2222 WARN_ON(atomic_read(&counter->mmap_count));
2223
2224 rcu_assign_pointer(counter->data, NULL);
2225 call_rcu(&data->rcu_head, __perf_mmap_data_free);
2226}
2227
2228static void perf_mmap_open(struct vm_area_struct *vma)
2229{
2230 struct perf_counter *counter = vma->vm_file->private_data;
2231
2232 atomic_inc(&counter->mmap_count);
2233}
2234
2235static void perf_mmap_close(struct vm_area_struct *vma)
2236{
2237 struct perf_counter *counter = vma->vm_file->private_data;
2238
2239 WARN_ON_ONCE(counter->ctx->parent_ctx);
2240 if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
2241 struct user_struct *user = current_user();
2242
2243 atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
2244 vma->vm_mm->locked_vm -= counter->data->nr_locked;
2245 perf_mmap_data_free(counter);
2246 mutex_unlock(&counter->mmap_mutex);
2247 }
2248}
2249
2250static struct vm_operations_struct perf_mmap_vmops = {
2251 .open = perf_mmap_open,
2252 .close = perf_mmap_close,
2253 .fault = perf_mmap_fault,
2254 .page_mkwrite = perf_mmap_fault,
2255};
2256
2257static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2258{
2259 struct perf_counter *counter = file->private_data;
2260 unsigned long user_locked, user_lock_limit;
2261 struct user_struct *user = current_user();
2262 unsigned long locked, lock_limit;
2263 unsigned long vma_size;
2264 unsigned long nr_pages;
2265 long user_extra, extra;
2266 int ret = 0;
2267
2268 if (!(vma->vm_flags & VM_SHARED))
2269 return -EINVAL;
2270
2271 vma_size = vma->vm_end - vma->vm_start;
2272 nr_pages = (vma_size / PAGE_SIZE) - 1;
2273
2274 /*
2275 * If we have data pages ensure they're a power-of-two number, so we
2276 * can do bitmasks instead of modulo.
2277 */
2278 if (nr_pages != 0 && !is_power_of_2(nr_pages))
2279 return -EINVAL;
2280
2281 if (vma_size != PAGE_SIZE * (1 + nr_pages))
2282 return -EINVAL;
2283
2284 if (vma->vm_pgoff != 0)
2285 return -EINVAL;
2286
2287 WARN_ON_ONCE(counter->ctx->parent_ctx);
2288 mutex_lock(&counter->mmap_mutex);
2289 if (counter->output) {
2290 ret = -EINVAL;
2291 goto unlock;
2292 }
2293
2294 if (atomic_inc_not_zero(&counter->mmap_count)) {
2295 if (nr_pages != counter->data->nr_pages)
2296 ret = -EINVAL;
2297 goto unlock;
2298 }
2299
2300 user_extra = nr_pages + 1;
2301 user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
2302
2303 /*
2304 * Increase the limit linearly with more CPUs:
2305 */
2306 user_lock_limit *= num_online_cpus();
2307
2308 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2309
2310 extra = 0;
2311 if (user_locked > user_lock_limit)
2312 extra = user_locked - user_lock_limit;
2313
2314 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2315 lock_limit >>= PAGE_SHIFT;
2316 locked = vma->vm_mm->locked_vm + extra;
2317
2318 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
2319 ret = -EPERM;
2320 goto unlock;
2321 }
2322
2323 WARN_ON(counter->data);
2324 ret = perf_mmap_data_alloc(counter, nr_pages);
2325 if (ret)
2326 goto unlock;
2327
2328 atomic_set(&counter->mmap_count, 1);
2329 atomic_long_add(user_extra, &user->locked_vm);
2330 vma->vm_mm->locked_vm += extra;
2331 counter->data->nr_locked = extra;
2332 if (vma->vm_flags & VM_WRITE)
2333 counter->data->writable = 1;
2334
2335unlock:
2336 mutex_unlock(&counter->mmap_mutex);
2337
2338 vma->vm_flags |= VM_RESERVED;
2339 vma->vm_ops = &perf_mmap_vmops;
2340
2341 return ret;
2342}
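/*
 * Userspace side of the mapping rules enforced above, assuming "fd" is
 * an open counter and "page" holds the system page size: one metadata
 * page plus a power-of-two number of data pages, mapped shared from
 * offset 0 (PROT_WRITE makes the buffer writable, so the kernel
 * honours data_tail for flow control):
 *
 *	int n = 8;
 *	void *base = mmap(NULL, (n + 1) * page,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	struct perf_counter_mmap_page *pc = base;
 *
 * Page 0 is the control page, pages 1..n hold the event records;
 * the reader advances pc->data_tail as it consumes them.
 */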
2343
2344static int perf_fasync(int fd, struct file *filp, int on)
2345{
2346 struct inode *inode = filp->f_path.dentry->d_inode;
2347 struct perf_counter *counter = filp->private_data;
2348 int retval;
2349
2350 mutex_lock(&inode->i_mutex);
2351 retval = fasync_helper(fd, filp, on, &counter->fasync);
2352 mutex_unlock(&inode->i_mutex);
2353
2354 if (retval < 0)
2355 return retval;
2356
2357 return 0;
2358}
2359
2360static const struct file_operations perf_fops = {
2361 .release = perf_release,
2362 .read = perf_read,
2363 .poll = perf_poll,
2364 .unlocked_ioctl = perf_ioctl,
2365 .compat_ioctl = perf_ioctl,
2366 .mmap = perf_mmap,
2367 .fasync = perf_fasync,
2368};
2369
2370/*
2371 * Perf counter wakeup
2372 *
2373 * If there's data, ensure we set the poll() state and publish everything
2374 * to user-space before waking everybody up.
2375 */
2376
2377void perf_counter_wakeup(struct perf_counter *counter)
2378{
2379 wake_up_all(&counter->waitq);
2380
2381 if (counter->pending_kill) {
2382 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
2383 counter->pending_kill = 0;
2384 }
2385}
2386
2387/*
2388 * Pending wakeups
2389 *
2390 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2391 *
2392 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2393 * singly linked list and use cmpxchg() to add entries locklessly.
2394 */
2395
2396static void perf_pending_counter(struct perf_pending_entry *entry)
2397{
2398 struct perf_counter *counter = container_of(entry,
2399 struct perf_counter, pending);
2400
2401 if (counter->pending_disable) {
2402 counter->pending_disable = 0;
2403 __perf_counter_disable(counter);
2404 }
2405
2406 if (counter->pending_wakeup) {
2407 counter->pending_wakeup = 0;
2408 perf_counter_wakeup(counter);
2409 }
2410}
2411
2412#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2413
2414static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2415 PENDING_TAIL,
2416};
2417
2418static void perf_pending_queue(struct perf_pending_entry *entry,
2419 void (*func)(struct perf_pending_entry *))
2420{
2421 struct perf_pending_entry **head;
2422
2423 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2424 return;
2425
2426 entry->func = func;
2427
2428 head = &get_cpu_var(perf_pending_head);
2429
2430 do {
2431 entry->next = *head;
2432 } while (cmpxchg(head, entry->next, entry) != entry->next);
2433
2434 set_perf_counter_pending();
2435
2436 put_cpu_var(perf_pending_head);
2437}
2438
2439static int __perf_pending_run(void)
2440{
2441 struct perf_pending_entry *list;
2442 int nr = 0;
2443
2444 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2445 while (list != PENDING_TAIL) {
2446 void (*func)(struct perf_pending_entry *);
2447 struct perf_pending_entry *entry = list;
2448
2449 list = list->next;
2450
2451 func = entry->func;
2452 entry->next = NULL;
2453 /*
2454 * Ensure we observe the unqueue before we issue the wakeup,
2455 * so that we won't be waiting forever.
2456 * -- see perf_not_pending().
2457 */
2458 smp_wmb();
2459
2460 func(entry);
2461 nr++;
2462 }
2463
2464 return nr;
2465}
2466
2467static inline int perf_not_pending(struct perf_counter *counter)
2468{
2469 /*
2470 * If we flush on whatever cpu we run, there is a chance we don't
2471 * need to wait.
2472 */
2473 get_cpu();
2474 __perf_pending_run();
2475 put_cpu();
2476
2477 /*
2478 * Ensure we see the proper queue state before going to sleep
2479 * so that we do not miss the wakeup. -- see perf_pending_handle()
2480 */
2481 smp_rmb();
2482 return counter->pending.next == NULL;
2483}
2484
2485static void perf_pending_sync(struct perf_counter *counter)
2486{
2487 wait_event(counter->waitq, perf_not_pending(counter));
2488}
2489
2490void perf_counter_do_pending(void)
2491{
2492 __perf_pending_run();
2493}
2494
2495/*
2496 * Callchain support -- arch specific
2497 */
2498
2499__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2500{
2501 return NULL;
2502}
2503
2504/*
2505 * Output
2506 */
2507
2508struct perf_output_handle {
2509 struct perf_counter *counter;
2510 struct perf_mmap_data *data;
2511 unsigned long head;
2512 unsigned long offset;
2513 int nmi;
2514 int sample;
2515 int locked;
2516 unsigned long flags;
2517};
2518
2519static bool perf_output_space(struct perf_mmap_data *data,
2520 unsigned int offset, unsigned int head)
2521{
2522 unsigned long tail;
2523 unsigned long mask;
2524
2525 if (!data->writable)
2526 return true;
2527
2528 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2529 /*
2530	 * Userspace could choose to issue an mb() before updating the tail
2531	 * pointer, so that all reads will be completed before the write is
2532	 * issued.
2533 */
2534 tail = ACCESS_ONCE(data->user_page->data_tail);
2535 smp_rmb();
2536
2537 offset = (offset - tail) & mask;
2538 head = (head - tail) & mask;
2539
2540 if ((int)(head - offset) < 0)
2541 return false;
2542
2543 return true;
2544}
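/*
 * Example with 8 data pages (a 32KiB buffer): head, offset and
 * data_tail are free-running byte offsets, and the masking above
 * reduces them modulo the buffer size.  On a writable mapping a new
 * record is only admitted while its proposed head does not pass the
 * reader's data_tail; otherwise perf_output_begin() below drops the
 * record and accounts it in data->lost.
 */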
2545
2546static void perf_output_wakeup(struct perf_output_handle *handle)
2547{
2548 atomic_set(&handle->data->poll, POLL_IN);
2549
2550 if (handle->nmi) {
2551 handle->counter->pending_wakeup = 1;
2552 perf_pending_queue(&handle->counter->pending,
2553 perf_pending_counter);
2554 } else
2555 perf_counter_wakeup(handle->counter);
2556}
2557
2558/*
2559 * Curious locking construct.
2560 *
2561 * We need to ensure a later event doesn't publish a head when a former
2562 * event isn't done writing. However, since we need to deal with NMIs we
2563 * cannot fully serialize things.
2564 *
2565 * What we do is serialize between CPUs so we only have to deal with NMI
2566 * nesting on a single CPU.
2567 *
2568 * We only publish the head (and generate a wakeup) when the outer-most
2569 * event completes.
2570 */
2571static void perf_output_lock(struct perf_output_handle *handle)
2572{
2573 struct perf_mmap_data *data = handle->data;
2574 int cpu;
2575
2576 handle->locked = 0;
2577
2578 local_irq_save(handle->flags);
2579 cpu = smp_processor_id();
2580
2581 if (in_nmi() && atomic_read(&data->lock) == cpu)
2582 return;
2583
2584 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2585 cpu_relax();
2586
2587 handle->locked = 1;
2588}
2589
2590static void perf_output_unlock(struct perf_output_handle *handle)
2591{
2592 struct perf_mmap_data *data = handle->data;
2593 unsigned long head;
2594 int cpu;
2595
2596 data->done_head = data->head;
2597
2598 if (!handle->locked)
2599 goto out;
2600
2601again:
2602 /*
2603 * The xchg implies a full barrier that ensures all writes are done
2604 * before we publish the new head, matched by a rmb() in userspace when
2605 * reading this position.
2606 */
2607 while ((head = atomic_long_xchg(&data->done_head, 0)))
2608 data->user_page->data_head = head;
2609
2610 /*
2611 * NMI can happen here, which means we can miss a done_head update.
2612 */
2613
2614 cpu = atomic_xchg(&data->lock, -1);
2615 WARN_ON_ONCE(cpu != smp_processor_id());
2616
2617 /*
2618 * Therefore we have to validate we did not indeed do so.
2619 */
2620 if (unlikely(atomic_long_read(&data->done_head))) {
2621 /*
2622 * Since we had it locked, we can lock it again.
2623 */
2624 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2625 cpu_relax();
2626
2627 goto again;
2628 }
2629
2630 if (atomic_xchg(&data->wakeup, 0))
2631 perf_output_wakeup(handle);
2632out:
2633 local_irq_restore(handle->flags);
2634}
2635
2636static void perf_output_copy(struct perf_output_handle *handle,
2637 const void *buf, unsigned int len)
2638{
2639 unsigned int pages_mask;
2640 unsigned int offset;
2641 unsigned int size;
2642 void **pages;
2643
2644 offset = handle->offset;
2645 pages_mask = handle->data->nr_pages - 1;
2646 pages = handle->data->data_pages;
2647
2648 do {
2649 unsigned int page_offset;
2650 int nr;
2651
2652 nr = (offset >> PAGE_SHIFT) & pages_mask;
2653 page_offset = offset & (PAGE_SIZE - 1);
2654 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2655
2656 memcpy(pages[nr] + page_offset, buf, size);
2657
2658 len -= size;
2659 buf += size;
2660 offset += size;
2661 } while (len);
2662
2663 handle->offset = offset;
2664
2665 /*
2666 * Check we didn't copy past our reservation window, taking the
2667 * possible unsigned int wrap into account.
2668 */
2669 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2670}
2671
2672#define perf_output_put(handle, x) \
2673 perf_output_copy((handle), &(x), sizeof(x))
2674
2675static int perf_output_begin(struct perf_output_handle *handle,
2676 struct perf_counter *counter, unsigned int size,
2677 int nmi, int sample)
2678{
2679 struct perf_counter *output_counter;
2680 struct perf_mmap_data *data;
2681 unsigned int offset, head;
2682 int have_lost;
2683 struct {
2684 struct perf_event_header header;
2685 u64 id;
2686 u64 lost;
2687 } lost_event;
2688
2689 rcu_read_lock();
2690 /*
2691 * For inherited counters we send all the output towards the parent.
2692 */
2693 if (counter->parent)
2694 counter = counter->parent;
2695
2696 output_counter = rcu_dereference(counter->output);
2697 if (output_counter)
2698 counter = output_counter;
2699
2700 data = rcu_dereference(counter->data);
2701 if (!data)
2702 goto out;
2703
2704 handle->data = data;
2705 handle->counter = counter;
2706 handle->nmi = nmi;
2707 handle->sample = sample;
2708
2709 if (!data->nr_pages)
2710 goto fail;
2711
2712 have_lost = atomic_read(&data->lost);
2713 if (have_lost)
2714 size += sizeof(lost_event);
2715
2716 perf_output_lock(handle);
2717
2718 do {
2719 offset = head = atomic_long_read(&data->head);
2720 head += size;
2721 if (unlikely(!perf_output_space(data, offset, head)))
2722 goto fail;
2723 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2724
2725 handle->offset = offset;
2726 handle->head = head;
2727
2728 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2729 atomic_set(&data->wakeup, 1);
2730
2731 if (have_lost) {
2732 lost_event.header.type = PERF_EVENT_LOST;
2733 lost_event.header.misc = 0;
2734 lost_event.header.size = sizeof(lost_event);
2735 lost_event.id = counter->id;
2736 lost_event.lost = atomic_xchg(&data->lost, 0);
2737
2738 perf_output_put(handle, lost_event);
2739 }
2740
2741 return 0;
2742
2743fail:
2744 atomic_inc(&data->lost);
2745 perf_output_unlock(handle);
2746out:
2747 rcu_read_unlock();
2748
2749 return -ENOSPC;
2750}
2751
2752static void perf_output_end(struct perf_output_handle *handle)
2753{
2754 struct perf_counter *counter = handle->counter;
2755 struct perf_mmap_data *data = handle->data;
2756
2757 int wakeup_events = counter->attr.wakeup_events;
2758
2759 if (handle->sample && wakeup_events) {
2760 int events = atomic_inc_return(&data->events);
2761 if (events >= wakeup_events) {
2762 atomic_sub(wakeup_events, &data->events);
2763 atomic_set(&data->wakeup, 1);
2764 }
2765 }
2766
2767 perf_output_unlock(handle);
2768 rcu_read_unlock();
2769}
2770
2771static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
2772{
2773 /*
2774 * only top level counters have the pid namespace they were created in
2775 */
2776 if (counter->parent)
2777 counter = counter->parent;
2778
2779 return task_tgid_nr_ns(p, counter->ns);
2780}
2781
2782static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2783{
2784 /*
2785 * only top level counters have the pid namespace they were created in
2786 */
2787 if (counter->parent)
2788 counter = counter->parent;
2789
2790 return task_pid_nr_ns(p, counter->ns);
2791}
2792
2793static void perf_output_read_one(struct perf_output_handle *handle,
2794 struct perf_counter *counter)
2795{
2796 u64 read_format = counter->attr.read_format;
2797 u64 values[4];
2798 int n = 0;
2799
2800 values[n++] = atomic64_read(&counter->count);
2801 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2802 values[n++] = counter->total_time_enabled +
2803 atomic64_read(&counter->child_total_time_enabled);
2804 }
2805 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2806 values[n++] = counter->total_time_running +
2807 atomic64_read(&counter->child_total_time_running);
2808 }
2809 if (read_format & PERF_FORMAT_ID)
2810 values[n++] = primary_counter_id(counter);
2811
2812 perf_output_copy(handle, values, n * sizeof(u64));
2813}
2814
2815/*
2816 * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
2817 */
2818static void perf_output_read_group(struct perf_output_handle *handle,
2819 struct perf_counter *counter)
2820{
2821 struct perf_counter *leader = counter->group_leader, *sub;
2822 u64 read_format = counter->attr.read_format;
2823 u64 values[5];
2824 int n = 0;
2825
2826 values[n++] = 1 + leader->nr_siblings;
2827
2828 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2829 values[n++] = leader->total_time_enabled;
2830
2831 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2832 values[n++] = leader->total_time_running;
2833
2834 if (leader != counter)
2835 leader->pmu->read(leader);
2836
2837 values[n++] = atomic64_read(&leader->count);
2838 if (read_format & PERF_FORMAT_ID)
2839 values[n++] = primary_counter_id(leader);
2840
2841 perf_output_copy(handle, values, n * sizeof(u64));
2842
2843 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2844 n = 0;
2845
2846 if (sub != counter)
2847 sub->pmu->read(sub);
2848
2849 values[n++] = atomic64_read(&sub->count);
2850 if (read_format & PERF_FORMAT_ID)
2851 values[n++] = primary_counter_id(sub);
2852
2853 perf_output_copy(handle, values, n * sizeof(u64));
2854 }
2855}
2856
2857static void perf_output_read(struct perf_output_handle *handle,
2858 struct perf_counter *counter)
2859{
2860 if (counter->attr.read_format & PERF_FORMAT_GROUP)
2861 perf_output_read_group(handle, counter);
2862 else
2863 perf_output_read_one(handle, counter);
2864}
2865
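/*
 * Write out a PERF_EVENT_SAMPLE record: size the header according to
 * the counter's sample_type bits, then emit each selected field in the
 * same order.
 */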
2866void perf_counter_output(struct perf_counter *counter, int nmi,
2867 struct perf_sample_data *data)
2868{
2869 int ret;
2870 u64 sample_type = counter->attr.sample_type;
2871 struct perf_output_handle handle;
2872 struct perf_event_header header;
2873 u64 ip;
2874 struct {
2875 u32 pid, tid;
2876 } tid_entry;
2877 struct perf_callchain_entry *callchain = NULL;
2878 int callchain_size = 0;
2879 u64 time;
2880 struct {
2881 u32 cpu, reserved;
2882 } cpu_entry;
2883
2884 header.type = PERF_EVENT_SAMPLE;
2885 header.size = sizeof(header);
2886
2887 header.misc = 0;
2888 header.misc |= perf_misc_flags(data->regs);
2889
2890 if (sample_type & PERF_SAMPLE_IP) {
2891 ip = perf_instruction_pointer(data->regs);
2892 header.size += sizeof(ip);
2893 }
2894
2895 if (sample_type & PERF_SAMPLE_TID) {
2896 /* namespace issues */
2897 tid_entry.pid = perf_counter_pid(counter, current);
2898 tid_entry.tid = perf_counter_tid(counter, current);
2899
2900 header.size += sizeof(tid_entry);
2901 }
2902
2903 if (sample_type & PERF_SAMPLE_TIME) {
2904 /*
2905 * Maybe do better on x86 and provide cpu_clock_nmi()
2906 */
2907 time = sched_clock();
2908
2909 header.size += sizeof(u64);
2910 }
2911
2912 if (sample_type & PERF_SAMPLE_ADDR)
2913 header.size += sizeof(u64);
2914
2915 if (sample_type & PERF_SAMPLE_ID)
2916 header.size += sizeof(u64);
2917
2918 if (sample_type & PERF_SAMPLE_STREAM_ID)
2919 header.size += sizeof(u64);
2920
2921 if (sample_type & PERF_SAMPLE_CPU) {
2922 header.size += sizeof(cpu_entry);
2923
2924 cpu_entry.cpu = raw_smp_processor_id();
2925 cpu_entry.reserved = 0;
2926 }
2927
2928 if (sample_type & PERF_SAMPLE_PERIOD)
2929 header.size += sizeof(u64);
2930
2931 if (sample_type & PERF_SAMPLE_READ)
2932 header.size += perf_counter_read_size(counter);
2933
2934 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2935 callchain = perf_callchain(data->regs);
2936
2937 if (callchain) {
2938 callchain_size = (1 + callchain->nr) * sizeof(u64);
2939 header.size += callchain_size;
2940 } else
2941 header.size += sizeof(u64);
2942 }
2943
2944 if (sample_type & PERF_SAMPLE_RAW) {
2945 int size = sizeof(u32);
2946
2947 if (data->raw)
2948 size += data->raw->size;
2949 else
2950 size += sizeof(u32);
2951
2952 WARN_ON_ONCE(size & (sizeof(u64)-1));
2953 header.size += size;
2954 }
2955
2956 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2957 if (ret)
2958 return;
2959
2960 perf_output_put(&handle, header);
2961
2962 if (sample_type & PERF_SAMPLE_IP)
2963 perf_output_put(&handle, ip);
2964
2965 if (sample_type & PERF_SAMPLE_TID)
2966 perf_output_put(&handle, tid_entry);
2967
2968 if (sample_type & PERF_SAMPLE_TIME)
2969 perf_output_put(&handle, time);
2970
2971 if (sample_type & PERF_SAMPLE_ADDR)
2972 perf_output_put(&handle, data->addr);
2973
2974 if (sample_type & PERF_SAMPLE_ID) {
2975 u64 id = primary_counter_id(counter);
2976
2977 perf_output_put(&handle, id);
2978 }
2979
2980 if (sample_type & PERF_SAMPLE_STREAM_ID)
2981 perf_output_put(&handle, counter->id);
2982
2983 if (sample_type & PERF_SAMPLE_CPU)
2984 perf_output_put(&handle, cpu_entry);
2985
2986 if (sample_type & PERF_SAMPLE_PERIOD)
2987 perf_output_put(&handle, data->period);
2988
2989 if (sample_type & PERF_SAMPLE_READ)
2990 perf_output_read(&handle, counter);
2991
2992 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2993 if (callchain)
2994 perf_output_copy(&handle, callchain, callchain_size);
2995 else {
2996 u64 nr = 0;
2997 perf_output_put(&handle, nr);
2998 }
2999 }
3000
3001 if (sample_type & PERF_SAMPLE_RAW) {
3002 if (data->raw) {
3003 perf_output_put(&handle, data->raw->size);
3004 perf_output_copy(&handle, data->raw->data, data->raw->size);
3005 } else {
3006 struct {
3007 u32 size;
3008 u32 data;
3009 } raw = {
3010 .size = sizeof(u32),
3011 .data = 0,
3012 };
3013 perf_output_put(&handle, raw);
3014 }
3015 }
3016
3017 perf_output_end(&handle);
3018}
3019
3020/*
3021 * read event
3022 */
3023
3024struct perf_read_event {
3025 struct perf_event_header header;
3026
3027 u32 pid;
3028 u32 tid;
3029};
3030
3031static void
3032perf_counter_read_event(struct perf_counter *counter,
3033 struct task_struct *task)
3034{
3035 struct perf_output_handle handle;
3036 struct perf_read_event event = {
3037 .header = {
3038 .type = PERF_EVENT_READ,
3039 .misc = 0,
3040 .size = sizeof(event) + perf_counter_read_size(counter),
3041 },
3042 .pid = perf_counter_pid(counter, task),
3043 .tid = perf_counter_tid(counter, task),
3044 };
3045 int ret;
3046
3047 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
3048 if (ret)
3049 return;
3050
3051 perf_output_put(&handle, event);
3052 perf_output_read(&handle, counter);
3053
3054 perf_output_end(&handle);
3055}
3056
3057/*
3058 * task tracking -- fork/exit
3059 *
3060 * enabled by: attr.comm | attr.mmap | attr.task
3061 */
3062
3063struct perf_task_event {
3064 struct task_struct *task;
3065 struct perf_counter_context *task_ctx;
3066
3067 struct {
3068 struct perf_event_header header;
3069
3070 u32 pid;
3071 u32 ppid;
3072 u32 tid;
3073 u32 ptid;
3074 } event;
3075};
3076
3077static void perf_counter_task_output(struct perf_counter *counter,
3078 struct perf_task_event *task_event)
3079{
3080 struct perf_output_handle handle;
3081 int size = task_event->event.header.size;
3082 struct task_struct *task = task_event->task;
3083 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3084
3085 if (ret)
3086 return;
3087
3088 task_event->event.pid = perf_counter_pid(counter, task);
3089 task_event->event.ppid = perf_counter_pid(counter, current);
3090
3091 task_event->event.tid = perf_counter_tid(counter, task);
3092 task_event->event.ptid = perf_counter_tid(counter, current);
3093
3094 perf_output_put(&handle, task_event->event);
3095 perf_output_end(&handle);
3096}
3097
3098static int perf_counter_task_match(struct perf_counter *counter)
3099{
3100 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
3101 return 1;
3102
3103 return 0;
3104}
3105
3106static void perf_counter_task_ctx(struct perf_counter_context *ctx,
3107 struct perf_task_event *task_event)
3108{
3109 struct perf_counter *counter;
3110
3111 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3112 return;
3113
3114 rcu_read_lock();
3115 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3116 if (perf_counter_task_match(counter))
3117 perf_counter_task_output(counter, task_event);
3118 }
3119 rcu_read_unlock();
3120}
3121
3122static void perf_counter_task_event(struct perf_task_event *task_event)
3123{
3124 struct perf_cpu_context *cpuctx;
3125 struct perf_counter_context *ctx = task_event->task_ctx;
3126
3127 cpuctx = &get_cpu_var(perf_cpu_context);
3128 perf_counter_task_ctx(&cpuctx->ctx, task_event);
3129 put_cpu_var(perf_cpu_context);
3130
3131 rcu_read_lock();
3132 if (!ctx)
3133 ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
3134 if (ctx)
3135 perf_counter_task_ctx(ctx, task_event);
3136 rcu_read_unlock();
3137}
3138
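/*
 * Emit a PERF_EVENT_FORK or PERF_EVENT_EXIT record for the task, but
 * only if anybody is listening (comm, mmap or task counters exist).
 */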
3139static void perf_counter_task(struct task_struct *task,
3140 struct perf_counter_context *task_ctx,
3141 int new)
3142{
3143 struct perf_task_event task_event;
3144
3145 if (!atomic_read(&nr_comm_counters) &&
3146 !atomic_read(&nr_mmap_counters) &&
3147 !atomic_read(&nr_task_counters))
3148 return;
3149
3150 task_event = (struct perf_task_event){
3151 .task = task,
3152 .task_ctx = task_ctx,
3153 .event = {
3154 .header = {
3155 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
3156 .misc = 0,
3157 .size = sizeof(task_event.event),
3158 },
3159 /* .pid */
3160 /* .ppid */
3161 /* .tid */
3162 /* .ptid */
3163 },
3164 };
3165
3166 perf_counter_task_event(&task_event);
3167}
3168
3169void perf_counter_fork(struct task_struct *task)
3170{
3171 perf_counter_task(task, NULL, 1);
3172}
3173
3174/*
3175 * comm tracking
3176 */
3177
3178struct perf_comm_event {
3179 struct task_struct *task;
3180 char *comm;
3181 int comm_size;
3182
3183 struct {
3184 struct perf_event_header header;
3185
3186 u32 pid;
3187 u32 tid;
3188 } event;
3189};
3190
3191static void perf_counter_comm_output(struct perf_counter *counter,
3192 struct perf_comm_event *comm_event)
3193{
3194 struct perf_output_handle handle;
3195 int size = comm_event->event.header.size;
3196 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3197
3198 if (ret)
3199 return;
3200
3201 comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
3202 comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
3203
3204 perf_output_put(&handle, comm_event->event);
3205 perf_output_copy(&handle, comm_event->comm,
3206 comm_event->comm_size);
3207 perf_output_end(&handle);
3208}
3209
3210static int perf_counter_comm_match(struct perf_counter *counter)
3211{
3212 if (counter->attr.comm)
3213 return 1;
3214
3215 return 0;
3216}
3217
3218static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
3219 struct perf_comm_event *comm_event)
3220{
3221 struct perf_counter *counter;
3222
3223 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3224 return;
3225
3226 rcu_read_lock();
3227 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3228 if (perf_counter_comm_match(counter))
3229 perf_counter_comm_output(counter, comm_event);
3230 }
3231 rcu_read_unlock();
3232}
3233
3234static void perf_counter_comm_event(struct perf_comm_event *comm_event)
3235{
3236 struct perf_cpu_context *cpuctx;
3237 struct perf_counter_context *ctx;
3238 unsigned int size;
3239 char comm[TASK_COMM_LEN];
3240
3241 memset(comm, 0, sizeof(comm));
3242 strncpy(comm, comm_event->task->comm, sizeof(comm));
3243 size = ALIGN(strlen(comm)+1, sizeof(u64));
3244
3245 comm_event->comm = comm;
3246 comm_event->comm_size = size;
3247
3248 comm_event->event.header.size = sizeof(comm_event->event) + size;
3249
3250 cpuctx = &get_cpu_var(perf_cpu_context);
3251 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
3252 put_cpu_var(perf_cpu_context);
3253
3254 rcu_read_lock();
3255 /*
3256 * it doesn't really matter which of the child contexts the
3257 * event ends up in.
3258 */
3259 ctx = rcu_dereference(current->perf_counter_ctxp);
3260 if (ctx)
3261 perf_counter_comm_ctx(ctx, comm_event);
3262 rcu_read_unlock();
3263}
3264
3265void perf_counter_comm(struct task_struct *task)
3266{
3267 struct perf_comm_event comm_event;
3268
3269 if (task->perf_counter_ctxp)
3270 perf_counter_enable_on_exec(task);
3271
3272 if (!atomic_read(&nr_comm_counters))
3273 return;
3274
3275 comm_event = (struct perf_comm_event){
3276 .task = task,
3277 /* .comm */
3278 /* .comm_size */
3279 .event = {
3280 .header = {
3281 .type = PERF_EVENT_COMM,
3282 .misc = 0,
3283 /* .size */
3284 },
3285 /* .pid */
3286 /* .tid */
3287 },
3288 };
3289
3290 perf_counter_comm_event(&comm_event);
3291}
3292
3293/*
3294 * mmap tracking
3295 */
3296
3297struct perf_mmap_event {
3298 struct vm_area_struct *vma;
3299
3300 const char *file_name;
3301 int file_size;
3302
3303 struct {
3304 struct perf_event_header header;
3305
3306 u32 pid;
3307 u32 tid;
3308 u64 start;
3309 u64 len;
3310 u64 pgoff;
3311 } event;
3312};
3313
3314static void perf_counter_mmap_output(struct perf_counter *counter,
3315 struct perf_mmap_event *mmap_event)
3316{
3317 struct perf_output_handle handle;
3318 int size = mmap_event->event.header.size;
3319 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3320
3321 if (ret)
3322 return;
3323
3324 mmap_event->event.pid = perf_counter_pid(counter, current);
3325 mmap_event->event.tid = perf_counter_tid(counter, current);
3326
3327 perf_output_put(&handle, mmap_event->event);
3328 perf_output_copy(&handle, mmap_event->file_name,
3329 mmap_event->file_size);
3330 perf_output_end(&handle);
3331}
3332
3333static int perf_counter_mmap_match(struct perf_counter *counter,
3334 struct perf_mmap_event *mmap_event)
3335{
3336 if (counter->attr.mmap)
3337 return 1;
3338
3339 return 0;
3340}
3341
3342static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
3343 struct perf_mmap_event *mmap_event)
3344{
3345 struct perf_counter *counter;
3346
3347 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3348 return;
3349
3350 rcu_read_lock();
3351 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3352 if (perf_counter_mmap_match(counter, mmap_event))
3353 perf_counter_mmap_output(counter, mmap_event);
3354 }
3355 rcu_read_unlock();
3356}
3357
3358static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
3359{
3360 struct perf_cpu_context *cpuctx;
3361 struct perf_counter_context *ctx;
3362 struct vm_area_struct *vma = mmap_event->vma;
3363 struct file *file = vma->vm_file;
3364 unsigned int size;
3365 char tmp[16];
3366 char *buf = NULL;
3367 const char *name;
3368
3369 memset(tmp, 0, sizeof(tmp));
3370
3371 if (file) {
3372 /*
3373 * d_path works from the end of the buffer backwards, so we
3374 * need to add enough zero bytes after the string to handle
3375 * the 64-bit alignment we do later.
3376 */
3377 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3378 if (!buf) {
3379 name = strncpy(tmp, "//enomem", sizeof(tmp));
3380 goto got_name;
3381 }
3382 name = d_path(&file->f_path, buf, PATH_MAX);
3383 if (IS_ERR(name)) {
3384 name = strncpy(tmp, "//toolong", sizeof(tmp));
3385 goto got_name;
3386 }
3387 } else {
3388 if (arch_vma_name(mmap_event->vma)) {
3389 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3390 sizeof(tmp));
3391 goto got_name;
3392 }
3393
3394 if (!vma->vm_mm) {
3395 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3396 goto got_name;
3397 }
3398
3399 name = strncpy(tmp, "//anon", sizeof(tmp));
3400 goto got_name;
3401 }
3402
3403got_name:
3404 size = ALIGN(strlen(name)+1, sizeof(u64));
3405
3406 mmap_event->file_name = name;
3407 mmap_event->file_size = size;
3408
3409 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
3410
3411 cpuctx = &get_cpu_var(perf_cpu_context);
3412 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
3413 put_cpu_var(perf_cpu_context);
3414
3415 rcu_read_lock();
3416 /*
3417 * it doesn't really matter which of the child contexts the
3418 * event ends up in.
3419 */
3420 ctx = rcu_dereference(current->perf_counter_ctxp);
3421 if (ctx)
3422 perf_counter_mmap_ctx(ctx, mmap_event);
3423 rcu_read_unlock();
3424
3425 kfree(buf);
3426}
3427
3428void __perf_counter_mmap(struct vm_area_struct *vma)
3429{
3430 struct perf_mmap_event mmap_event;
3431
3432 if (!atomic_read(&nr_mmap_counters))
3433 return;
3434
3435 mmap_event = (struct perf_mmap_event){
3436 .vma = vma,
3437 /* .file_name */
3438 /* .file_size */
3439 .event = {
3440 .header = {
3441 .type = PERF_EVENT_MMAP,
3442 .misc = 0,
3443 /* .size */
3444 },
3445 /* .pid */
3446 /* .tid */
3447 .start = vma->vm_start,
3448 .len = vma->vm_end - vma->vm_start,
3449 .pgoff = vma->vm_pgoff,
3450 },
3451 };
3452
3453 perf_counter_mmap_event(&mmap_event);
3454}
3455
3456/*
3457 * IRQ throttle logging
3458 */
3459
3460static void perf_log_throttle(struct perf_counter *counter, int enable)
3461{
3462 struct perf_output_handle handle;
3463 int ret;
3464
3465 struct {
3466 struct perf_event_header header;
3467 u64 time;
3468 u64 id;
3469 u64 stream_id;
3470 } throttle_event = {
3471 .header = {
3472 .type = PERF_EVENT_THROTTLE,
3473 .misc = 0,
3474 .size = sizeof(throttle_event),
3475 },
3476 .time = sched_clock(),
3477 .id = primary_counter_id(counter),
3478 .stream_id = counter->id,
3479 };
3480
3481 if (enable)
3482 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3483
3484 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
3485 if (ret)
3486 return;
3487
3488 perf_output_put(&handle, throttle_event);
3489 perf_output_end(&handle);
3490}
3491
3492/*
3493 * Generic counter overflow handling, sampling.
3494 */
3495
3496int perf_counter_overflow(struct perf_counter *counter, int nmi,
3497 struct perf_sample_data *data)
3498{
3499 int events = atomic_read(&counter->event_limit);
3500 int throttle = counter->pmu->unthrottle != NULL;
3501 struct hw_perf_counter *hwc = &counter->hw;
3502 int ret = 0;
3503
3504 if (!throttle) {
3505 hwc->interrupts++;
3506 } else {
3507 if (hwc->interrupts != MAX_INTERRUPTS) {
3508 hwc->interrupts++;
3509 if (HZ * hwc->interrupts >
3510 (u64)sysctl_perf_counter_sample_rate) {
3511 hwc->interrupts = MAX_INTERRUPTS;
3512 perf_log_throttle(counter, 0);
3513 ret = 1;
3514 }
3515 } else {
3516 /*
3517 * Keep re-disabling the counter even though we disabled it
3518 * on the previous pass - just in case we raced with a
3519 * sched-in and the counter got enabled again:
3520 */
3521 ret = 1;
3522 }
3523 }
3524
3525 if (counter->attr.freq) {
3526 u64 now = sched_clock();
3527 s64 delta = now - hwc->freq_stamp;
3528
3529 hwc->freq_stamp = now;
3530
3531 if (delta > 0 && delta < TICK_NSEC)
3532 perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
3533 }
3534
3535 /*
3536 * XXX event_limit might not quite work as expected on inherited
3537 * counters
3538 */
3539
3540 counter->pending_kill = POLL_IN;
3541 if (events && atomic_dec_and_test(&counter->event_limit)) {
3542 ret = 1;
3543 counter->pending_kill = POLL_HUP;
3544 if (nmi) {
3545 counter->pending_disable = 1;
3546 perf_pending_queue(&counter->pending,
3547 perf_pending_counter);
3548 } else
3549 perf_counter_disable(counter);
3550 }
3551
3552 perf_counter_output(counter, nmi, data);
3553 return ret;
3554}
3555
3556/*
3557 * Generic software counter infrastructure
3558 */
3559
3560/*
3561 * We directly increment counter->count and keep a second value in
3562 * counter->hw.period_left to count intervals. This period counter
3563 * is kept in the range [-sample_period, 0] so that we can use the
3564 * sign as a trigger.
3565 */
3566
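/*
 * Move period_left back below zero and return the number of whole
 * sample periods that have elapsed (0 while still inside the current
 * period).
 */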
3567static u64 perf_swcounter_set_period(struct perf_counter *counter)
3568{
3569 struct hw_perf_counter *hwc = &counter->hw;
3570 u64 period = hwc->last_period;
3571 u64 nr, offset;
3572 s64 old, val;
3573
3574 hwc->last_period = hwc->sample_period;
3575
3576again:
3577 old = val = atomic64_read(&hwc->period_left);
3578 if (val < 0)
3579 return 0;
3580
3581 nr = div64_u64(period + val, period);
3582 offset = nr * period;
3583 val -= offset;
3584 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3585 goto again;
3586
3587 return nr;
3588}
3589
3590static void perf_swcounter_overflow(struct perf_counter *counter,
3591 int nmi, struct perf_sample_data *data)
3592{
3593 struct hw_perf_counter *hwc = &counter->hw;
3594 u64 overflow;
3595
3596 data->period = counter->hw.last_period;
3597 overflow = perf_swcounter_set_period(counter);
3598
3599 if (hwc->interrupts == MAX_INTERRUPTS)
3600 return;
3601
3602 for (; overflow; overflow--) {
3603 if (perf_counter_overflow(counter, nmi, data)) {
3604 /*
3605 * We inhibit the overflow from happening when
3606 * hwc->interrupts == MAX_INTERRUPTS.
3607 */
3608 break;
3609 }
3610 }
3611}
3612
3613static void perf_swcounter_unthrottle(struct perf_counter *counter)
3614{
3615 /*
3616 * Nothing to do, we already reset hwc->interrupts.
3617 */
3618}
3619
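/*
 * Add 'nr' to the software counter and, for sampling counters that have
 * register state available, check whether the sample period overflowed.
 */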
3620static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3621 int nmi, struct perf_sample_data *data)
3622{
3623 struct hw_perf_counter *hwc = &counter->hw;
3624
3625 atomic64_add(nr, &counter->count);
3626
3627 if (!hwc->sample_period)
3628 return;
3629
3630 if (!data->regs)
3631 return;
3632
3633 if (!atomic64_add_negative(nr, &hwc->period_left))
3634 perf_swcounter_overflow(counter, nmi, data);
3635}
3636
3637static int perf_swcounter_is_counting(struct perf_counter *counter)
3638{
3639 /*
3640 * The counter is active, we're good!
3641 */
3642 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3643 return 1;
3644
3645 /*
3646 * The counter is off/error, not counting.
3647 */
3648 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3649 return 0;
3650
3651 /*
3652 * The counter is inactive; if the context is active,
3653 * we're part of a group that didn't make it onto the 'pmu',
3654 * so we're not counting.
3655 */
3656 if (counter->ctx->is_active)
3657 return 0;
3658
3659 /*
3660 * We're inactive and the context is too; this means the
3661 * task is scheduled out, and we're counting events that happen
3662 * to us, like migration events.
3663 */
3664 return 1;
3665}
3666
3667static int perf_swcounter_match(struct perf_counter *counter,
3668 enum perf_type_id type,
3669 u32 event, struct pt_regs *regs)
3670{
3671 if (!perf_swcounter_is_counting(counter))
3672 return 0;
3673
3674 if (counter->attr.type != type)
3675 return 0;
3676 if (counter->attr.config != event)
3677 return 0;
3678
3679 if (regs) {
3680 if (counter->attr.exclude_user && user_mode(regs))
3681 return 0;
3682
3683 if (counter->attr.exclude_kernel && !user_mode(regs))
3684 return 0;
3685 }
3686
3687 return 1;
3688}
3689
3690static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3691 enum perf_type_id type,
3692 u32 event, u64 nr, int nmi,
3693 struct perf_sample_data *data)
3694{
3695 struct perf_counter *counter;
3696
3697 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3698 return;
3699
3700 rcu_read_lock();
3701 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3702 if (perf_swcounter_match(counter, type, event, data->regs))
3703 perf_swcounter_add(counter, nr, nmi, data);
3704 }
3705 rcu_read_unlock();
3706}
3707
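/*
 * Pick the recursion counter that matches the current execution context
 * (NMI, hardirq, softirq or task) so that nested software counter
 * processing can be detected and suppressed.
 */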
3708static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3709{
3710 if (in_nmi())
3711 return &cpuctx->recursion[3];
3712
3713 if (in_irq())
3714 return &cpuctx->recursion[2];
3715
3716 if (in_softirq())
3717 return &cpuctx->recursion[1];
3718
3719 return &cpuctx->recursion[0];
3720}
3721
3722static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3723 u64 nr, int nmi,
3724 struct perf_sample_data *data)
3725{
3726 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3727 int *recursion = perf_swcounter_recursion_context(cpuctx);
3728 struct perf_counter_context *ctx;
3729
3730 if (*recursion)
3731 goto out;
3732
3733 (*recursion)++;
3734 barrier();
3735
3736 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3737 nr, nmi, data);
3738 rcu_read_lock();
3739 /*
3740 * doesn't really matter which of the child contexts the
3741 * events ends up in.
3742 */
3743 ctx = rcu_dereference(current->perf_counter_ctxp);
3744 if (ctx)
3745 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3746 rcu_read_unlock();
3747
3748 barrier();
3749 (*recursion)--;
3750
3751out:
3752 put_cpu_var(perf_cpu_context);
3753}
3754
3755void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3756 struct pt_regs *regs, u64 addr)
3757{
3758 struct perf_sample_data data = {
3759 .regs = regs,
3760 .addr = addr,
3761 };
3762
3763 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3764}
3765
3766static void perf_swcounter_read(struct perf_counter *counter)
3767{
3768}
3769
3770static int perf_swcounter_enable(struct perf_counter *counter)
3771{
3772 struct hw_perf_counter *hwc = &counter->hw;
3773
3774 if (hwc->sample_period) {
3775 hwc->last_period = hwc->sample_period;
3776 perf_swcounter_set_period(counter);
3777 }
3778 return 0;
3779}
3780
3781static void perf_swcounter_disable(struct perf_counter *counter)
3782{
3783}
3784
3785static const struct pmu perf_ops_generic = {
3786 .enable = perf_swcounter_enable,
3787 .disable = perf_swcounter_disable,
3788 .read = perf_swcounter_read,
3789 .unthrottle = perf_swcounter_unthrottle,
3790};
3791
3792/*
3793 * hrtimer based swcounter callback
3794 */
3795
3796static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3797{
3798 enum hrtimer_restart ret = HRTIMER_RESTART;
3799 struct perf_sample_data data;
3800 struct perf_counter *counter;
3801 u64 period;
3802
3803 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3804 counter->pmu->read(counter);
3805
3806 data.addr = 0;
3807 data.regs = get_irq_regs();
3808 /*
3809 * In case we exclude kernel IPs or are somehow not in interrupt
3810 * context, provide the next best thing, the user IP.
3811 */
3812 if ((counter->attr.exclude_kernel || !data.regs) &&
3813 !counter->attr.exclude_user)
3814 data.regs = task_pt_regs(current);
3815
3816 if (data.regs) {
3817 if (perf_counter_overflow(counter, 0, &data))
3818 ret = HRTIMER_NORESTART;
3819 }
3820
3821 period = max_t(u64, 10000, counter->hw.sample_period);
3822 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3823
3824 return ret;
3825}
3826
3827/*
3828 * Software counter: cpu wall time clock
3829 */
3830
3831static void cpu_clock_perf_counter_update(struct perf_counter *counter)
3832{
3833 int cpu = raw_smp_processor_id();
3834 s64 prev;
3835 u64 now;
3836
3837 now = cpu_clock(cpu);
3838 prev = atomic64_read(&counter->hw.prev_count);
3839 atomic64_set(&counter->hw.prev_count, now);
3840 atomic64_add(now - prev, &counter->count);
3841}
3842
3843static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
3844{
3845 struct hw_perf_counter *hwc = &counter->hw;
3846 int cpu = raw_smp_processor_id();
3847
3848 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3849 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3850 hwc->hrtimer.function = perf_swcounter_hrtimer;
3851 if (hwc->sample_period) {
3852 u64 period = max_t(u64, 10000, hwc->sample_period);
3853 __hrtimer_start_range_ns(&hwc->hrtimer,
3854 ns_to_ktime(period), 0,
3855 HRTIMER_MODE_REL, 0);
3856 }
3857
3858 return 0;
3859}
3860
3861static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
3862{
3863 if (counter->hw.sample_period)
3864 hrtimer_cancel(&counter->hw.hrtimer);
3865 cpu_clock_perf_counter_update(counter);
3866}
3867
3868static void cpu_clock_perf_counter_read(struct perf_counter *counter)
3869{
3870 cpu_clock_perf_counter_update(counter);
3871}
3872
3873static const struct pmu perf_ops_cpu_clock = {
3874 .enable = cpu_clock_perf_counter_enable,
3875 .disable = cpu_clock_perf_counter_disable,
3876 .read = cpu_clock_perf_counter_read,
3877};
3878
3879/*
3880 * Software counter: task time clock
3881 */
3882
3883static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
3884{
3885 u64 prev;
3886 s64 delta;
3887
3888 prev = atomic64_xchg(&counter->hw.prev_count, now);
3889 delta = now - prev;
3890 atomic64_add(delta, &counter->count);
3891}
3892
3893static int task_clock_perf_counter_enable(struct perf_counter *counter)
3894{
3895 struct hw_perf_counter *hwc = &counter->hw;
3896 u64 now;
3897
3898 now = counter->ctx->time;
3899
3900 atomic64_set(&hwc->prev_count, now);
3901 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3902 hwc->hrtimer.function = perf_swcounter_hrtimer;
3903 if (hwc->sample_period) {
3904 u64 period = max_t(u64, 10000, hwc->sample_period);
3905 __hrtimer_start_range_ns(&hwc->hrtimer,
3906 ns_to_ktime(period), 0,
3907 HRTIMER_MODE_REL, 0);
3908 }
3909
3910 return 0;
3911}
3912
3913static void task_clock_perf_counter_disable(struct perf_counter *counter)
3914{
3915 if (counter->hw.sample_period)
3916 hrtimer_cancel(&counter->hw.hrtimer);
3917 task_clock_perf_counter_update(counter, counter->ctx->time);
3918
3919}
3920
3921static void task_clock_perf_counter_read(struct perf_counter *counter)
3922{
3923 u64 time;
3924
3925 if (!in_nmi()) {
3926 update_context_time(counter->ctx);
3927 time = counter->ctx->time;
3928 } else {
3929 u64 now = perf_clock();
3930 u64 delta = now - counter->ctx->timestamp;
3931 time = counter->ctx->time + delta;
3932 }
3933
3934 task_clock_perf_counter_update(counter, time);
3935}
3936
3937static const struct pmu perf_ops_task_clock = {
3938 .enable = task_clock_perf_counter_enable,
3939 .disable = task_clock_perf_counter_disable,
3940 .read = task_clock_perf_counter_read,
3941};
3942
3943#ifdef CONFIG_EVENT_PROFILE
3944void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3945 int entry_size)
3946{
3947 struct perf_raw_record raw = {
3948 .size = entry_size,
3949 .data = record,
3950 };
3951
3952 struct perf_sample_data data = {
3953 .regs = get_irq_regs(),
3954 .addr = addr,
3955 .raw = &raw,
3956 };
3957
3958 if (!data.regs)
3959 data.regs = task_pt_regs(current);
3960
3961 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
3962}
3963EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3964
3965extern int ftrace_profile_enable(int);
3966extern void ftrace_profile_disable(int);
3967
3968static void tp_perf_counter_destroy(struct perf_counter *counter)
3969{
3970 ftrace_profile_disable(counter->attr.config);
3971}
3972
3973static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3974{
3975 /*
3976 * Raw tracepoint data is a severe data leak; only allow root to
3977 * have these.
3978 */
3979 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3980 perf_paranoid_tracepoint_raw() &&
3981 !capable(CAP_SYS_ADMIN))
3982 return ERR_PTR(-EPERM);
3983
3984 if (ftrace_profile_enable(counter->attr.config))
3985 return NULL;
3986
3987 counter->destroy = tp_perf_counter_destroy;
3988
3989 return &perf_ops_generic;
3990}
3991#else
3992static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3993{
3994 return NULL;
3995}
3996#endif
3997
3998atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3999
4000static void sw_perf_counter_destroy(struct perf_counter *counter)
4001{
4002 u64 event = counter->attr.config;
4003
4004 WARN_ON(counter->parent);
4005
4006 atomic_dec(&perf_swcounter_enabled[event]);
4007}
4008
4009static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
4010{
4011 const struct pmu *pmu = NULL;
4012 u64 event = counter->attr.config;
4013
4014 /*
4015 * Software counters (currently) can't in general distinguish
4016 * between user, kernel and hypervisor events.
4017 * However, context switches and cpu migrations are considered
4018 * to be kernel events, and page faults are never hypervisor
4019 * events.
4020 */
4021 switch (event) {
4022 case PERF_COUNT_SW_CPU_CLOCK:
4023 pmu = &perf_ops_cpu_clock;
4024
4025 break;
4026 case PERF_COUNT_SW_TASK_CLOCK:
4027 /*
4028 * If the user instantiates this as a per-cpu counter,
4029 * use the cpu_clock counter instead.
4030 */
4031 if (counter->ctx->task)
4032 pmu = &perf_ops_task_clock;
4033 else
4034 pmu = &perf_ops_cpu_clock;
4035
4036 break;
4037 case PERF_COUNT_SW_PAGE_FAULTS:
4038 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4039 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4040 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4041 case PERF_COUNT_SW_CPU_MIGRATIONS:
4042 if (!counter->parent) {
4043 atomic_inc(&perf_swcounter_enabled[event]);
4044 counter->destroy = sw_perf_counter_destroy;
4045 }
4046 pmu = &perf_ops_generic;
4047 break;
4048 }
4049
4050 return pmu;
4051}
4052
4053/*
4054 * Allocate and initialize a counter structure
4055 */
4056static struct perf_counter *
4057perf_counter_alloc(struct perf_counter_attr *attr,
4058 int cpu,
4059 struct perf_counter_context *ctx,
4060 struct perf_counter *group_leader,
4061 struct perf_counter *parent_counter,
4062 gfp_t gfpflags)
4063{
4064 const struct pmu *pmu;
4065 struct perf_counter *counter;
4066 struct hw_perf_counter *hwc;
4067 long err;
4068
4069 counter = kzalloc(sizeof(*counter), gfpflags);
4070 if (!counter)
4071 return ERR_PTR(-ENOMEM);
4072
4073 /*
4074 * Single counters are their own group leaders, with an
4075 * empty sibling list:
4076 */
4077 if (!group_leader)
4078 group_leader = counter;
4079
4080 mutex_init(&counter->child_mutex);
4081 INIT_LIST_HEAD(&counter->child_list);
4082
4083 INIT_LIST_HEAD(&counter->list_entry);
4084 INIT_LIST_HEAD(&counter->event_entry);
4085 INIT_LIST_HEAD(&counter->sibling_list);
4086 init_waitqueue_head(&counter->waitq);
4087
4088 mutex_init(&counter->mmap_mutex);
4089
4090 counter->cpu = cpu;
4091 counter->attr = *attr;
4092 counter->group_leader = group_leader;
4093 counter->pmu = NULL;
4094 counter->ctx = ctx;
4095 counter->oncpu = -1;
4096
4097 counter->parent = parent_counter;
4098
4099 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
4100 counter->id = atomic64_inc_return(&perf_counter_id);
4101
4102 counter->state = PERF_COUNTER_STATE_INACTIVE;
4103
4104 if (attr->disabled)
4105 counter->state = PERF_COUNTER_STATE_OFF;
4106
4107 pmu = NULL;
4108
4109 hwc = &counter->hw;
4110 hwc->sample_period = attr->sample_period;
4111 if (attr->freq && attr->sample_freq)
4112 hwc->sample_period = 1;
4113 hwc->last_period = hwc->sample_period;
4114
4115 atomic64_set(&hwc->period_left, hwc->sample_period);
4116
4117 /*
4118 * we currently do not support PERF_FORMAT_GROUP on inherited counters
4119 */
4120 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4121 goto done;
4122
4123 switch (attr->type) {
4124 case PERF_TYPE_RAW:
4125 case PERF_TYPE_HARDWARE:
4126 case PERF_TYPE_HW_CACHE:
4127 pmu = hw_perf_counter_init(counter);
4128 break;
4129
4130 case PERF_TYPE_SOFTWARE:
4131 pmu = sw_perf_counter_init(counter);
4132 break;
4133
4134 case PERF_TYPE_TRACEPOINT:
4135 pmu = tp_perf_counter_init(counter);
4136 break;
4137
4138 default:
4139 break;
4140 }
4141done:
4142 err = 0;
4143 if (!pmu)
4144 err = -EINVAL;
4145 else if (IS_ERR(pmu))
4146 err = PTR_ERR(pmu);
4147
4148 if (err) {
4149 if (counter->ns)
4150 put_pid_ns(counter->ns);
4151 kfree(counter);
4152 return ERR_PTR(err);
4153 }
4154
4155 counter->pmu = pmu;
4156
4157 if (!counter->parent) {
4158 atomic_inc(&nr_counters);
4159 if (counter->attr.mmap)
4160 atomic_inc(&nr_mmap_counters);
4161 if (counter->attr.comm)
4162 atomic_inc(&nr_comm_counters);
4163 if (counter->attr.task)
4164 atomic_inc(&nr_task_counters);
4165 }
4166
4167 return counter;
4168}
4169
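/*
 * Copy the attr structure from user space, using attr->size to cope
 * with older and newer ABI versions: short structures are zero-extended,
 * larger ones must have all unknown trailing bits clear.
 */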
4170static int perf_copy_attr(struct perf_counter_attr __user *uattr,
4171 struct perf_counter_attr *attr)
4172{
4173 int ret;
4174 u32 size;
4175
4176 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4177 return -EFAULT;
4178
4179 /*
4180 * zero the full structure, so that a short copy leaves the rest zeroed.
4181 */
4182 memset(attr, 0, sizeof(*attr));
4183
4184 ret = get_user(size, &uattr->size);
4185 if (ret)
4186 return ret;
4187
4188 if (size > PAGE_SIZE) /* silly large */
4189 goto err_size;
4190
4191 if (!size) /* abi compat */
4192 size = PERF_ATTR_SIZE_VER0;
4193
4194 if (size < PERF_ATTR_SIZE_VER0)
4195 goto err_size;
4196
4197 /*
4198 * If we're handed a bigger struct than we know of,
4199 * ensure all the unknown bits are 0.
4200 */
4201 if (size > sizeof(*attr)) {
4202 unsigned long val;
4203 unsigned long __user *addr;
4204 unsigned long __user *end;
4205
4206 addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
4207 sizeof(unsigned long));
4208 end = PTR_ALIGN((void __user *)uattr + size,
4209 sizeof(unsigned long));
4210
4211 for (; addr < end; addr += sizeof(unsigned long)) {
4212 ret = get_user(val, addr);
4213 if (ret)
4214 return ret;
4215 if (val)
4216 goto err_size;
4217 }
4218 }
4219
4220 ret = copy_from_user(attr, uattr, size);
4221 if (ret)
4222 return -EFAULT;
4223
4224 /*
4225 * If the type exists, the corresponding creation will verify
4226 * the attr->config.
4227 */
4228 if (attr->type >= PERF_TYPE_MAX)
4229 return -EINVAL;
4230
4231 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4232 return -EINVAL;
4233
4234 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4235 return -EINVAL;
4236
4237 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4238 return -EINVAL;
4239
4240out:
4241 return ret;
4242
4243err_size:
4244 put_user(sizeof(*attr), &uattr->size);
4245 ret = -E2BIG;
4246 goto out;
4247}
4248
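/*
 * Redirect this counter's output to the mmap()ed buffer of another
 * perf counter fd, or detach again when output_fd is 0.
 */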
4249int perf_counter_set_output(struct perf_counter *counter, int output_fd)
4250{
4251 struct perf_counter *output_counter = NULL;
4252 struct file *output_file = NULL;
4253 struct perf_counter *old_output;
4254 int fput_needed = 0;
4255 int ret = -EINVAL;
4256
4257 if (!output_fd)
4258 goto set;
4259
4260 output_file = fget_light(output_fd, &fput_needed);
4261 if (!output_file)
4262 return -EBADF;
4263
4264 if (output_file->f_op != &perf_fops)
4265 goto out;
4266
4267 output_counter = output_file->private_data;
4268
4269 /* Don't chain output fds */
4270 if (output_counter->output)
4271 goto out;
4272
4273 /* Don't set an output fd when we already have an output channel */
4274 if (counter->data)
4275 goto out;
4276
4277 atomic_long_inc(&output_file->f_count);
4278
4279set:
4280 mutex_lock(&counter->mmap_mutex);
4281 old_output = counter->output;
4282 rcu_assign_pointer(counter->output, output_counter);
4283 mutex_unlock(&counter->mmap_mutex);
4284
4285 if (old_output) {
4286 /*
4287 * we need to make sure no existing perf_output_*()
4288 * is still referencing this counter.
4289 */
4290 synchronize_rcu();
4291 fput(old_output->filp);
4292 }
4293
4294 ret = 0;
4295out:
4296 fput_light(output_file, fput_needed);
4297 return ret;
4298}
4299
4300/**
4301 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
4302 *
4303 * @attr_uptr: event type attributes for monitoring/sampling
4304 * @pid: target pid
4305 * @cpu: target cpu
4306 * @group_fd: group leader counter fd
4307 */
4308SYSCALL_DEFINE5(perf_counter_open,
4309 struct perf_counter_attr __user *, attr_uptr,
4310 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4311{
4312 struct perf_counter *counter, *group_leader;
4313 struct perf_counter_attr attr;
4314 struct perf_counter_context *ctx;
4315 struct file *counter_file = NULL;
4316 struct file *group_file = NULL;
4317 int fput_needed = 0;
4318 int fput_needed2 = 0;
4319 int err;
4320
4321 /* for future expandability... */
4322 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4323 return -EINVAL;
4324
4325 err = perf_copy_attr(attr_uptr, &attr);
4326 if (err)
4327 return err;
4328
4329 if (!attr.exclude_kernel) {
4330 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4331 return -EACCES;
4332 }
4333
4334 if (attr.freq) {
4335 if (attr.sample_freq > sysctl_perf_counter_sample_rate)
4336 return -EINVAL;
4337 }
4338
4339 /*
4340 * Get the target context (task or percpu):
4341 */
4342 ctx = find_get_context(pid, cpu);
4343 if (IS_ERR(ctx))
4344 return PTR_ERR(ctx);
4345
4346 /*
4347 * Look up the group leader (we will attach this counter to it):
4348 */
4349 group_leader = NULL;
4350 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4351 err = -EINVAL;
4352 group_file = fget_light(group_fd, &fput_needed);
4353 if (!group_file)
4354 goto err_put_context;
4355 if (group_file->f_op != &perf_fops)
4356 goto err_put_context;
4357
4358 group_leader = group_file->private_data;
4359 /*
4360 * Do not allow a recursive hierarchy (this new sibling
4361 * becoming part of another group-sibling):
4362 */
4363 if (group_leader->group_leader != group_leader)
4364 goto err_put_context;
4365 /*
4366 * Do not allow to attach to a group in a different
4367 * task or CPU context:
4368 */
4369 if (group_leader->ctx != ctx)
4370 goto err_put_context;
4371 /*
4372 * Only a group leader can be exclusive or pinned
4373 */
4374 if (attr.exclusive || attr.pinned)
4375 goto err_put_context;
4376 }
4377
4378 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
4379 NULL, GFP_KERNEL);
4380 err = PTR_ERR(counter);
4381 if (IS_ERR(counter))
4382 goto err_put_context;
4383
4384 err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
4385 if (err < 0)
4386 goto err_free_put_context;
4387
4388 counter_file = fget_light(err, &fput_needed2);
4389 if (!counter_file)
4390 goto err_free_put_context;
4391
4392 if (flags & PERF_FLAG_FD_OUTPUT) {
4393 err = perf_counter_set_output(counter, group_fd);
4394 if (err)
4395 goto err_fput_free_put_context;
4396 }
4397
4398 counter->filp = counter_file;
4399 WARN_ON_ONCE(ctx->parent_ctx);
4400 mutex_lock(&ctx->mutex);
4401 perf_install_in_context(ctx, counter, cpu);
4402 ++ctx->generation;
4403 mutex_unlock(&ctx->mutex);
4404
4405 counter->owner = current;
4406 get_task_struct(current);
4407 mutex_lock(&current->perf_counter_mutex);
4408 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
4409 mutex_unlock(&current->perf_counter_mutex);
4410
4411err_fput_free_put_context:
4412 fput_light(counter_file, fput_needed2);
4413
4414err_free_put_context:
4415 if (err < 0)
4416 kfree(counter);
4417
4418err_put_context:
4419 if (err < 0)
4420 put_ctx(ctx);
4421
4422 fput_light(group_file, fput_needed);
4423
4424 return err;
4425}
4426
4427/*
4428 * inherit a counter from parent task to child task:
4429 */
4430static struct perf_counter *
4431inherit_counter(struct perf_counter *parent_counter,
4432 struct task_struct *parent,
4433 struct perf_counter_context *parent_ctx,
4434 struct task_struct *child,
4435 struct perf_counter *group_leader,
4436 struct perf_counter_context *child_ctx)
4437{
4438 struct perf_counter *child_counter;
4439
4440 /*
4441 * Instead of creating recursive hierarchies of counters,
4442 * we link inherited counters back to the original parent,
4443 * which is guaranteed to have a filp that we use as the
4444 * reference count:
4445 */
4446 if (parent_counter->parent)
4447 parent_counter = parent_counter->parent;
4448
4449 child_counter = perf_counter_alloc(&parent_counter->attr,
4450 parent_counter->cpu, child_ctx,
4451 group_leader, parent_counter,
4452 GFP_KERNEL);
4453 if (IS_ERR(child_counter))
4454 return child_counter;
4455 get_ctx(child_ctx);
4456
4457 /*
4458 * Make the child state follow the state of the parent counter,
4459 * not its attr.disabled bit. We hold the parent's mutex,
4460 * so we won't race with perf_counter_{en, dis}able_family.
4461 */
4462 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
4463 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
4464 else
4465 child_counter->state = PERF_COUNTER_STATE_OFF;
4466
4467 if (parent_counter->attr.freq)
4468 child_counter->hw.sample_period = parent_counter->hw.sample_period;
4469
4470 /*
4471 * Link it up in the child's context:
4472 */
4473 add_counter_to_ctx(child_counter, child_ctx);
4474
4475 /*
4476 * Get a reference to the parent filp - we will fput it
4477 * when the child counter exits. This is safe to do because
4478 * we are in the parent and we know that the filp still
4479 * exists and has a nonzero count:
4480 */
4481 atomic_long_inc(&parent_counter->filp->f_count);
4482
4483 /*
4484 * Link this into the parent counter's child list
4485 */
4486 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4487 mutex_lock(&parent_counter->child_mutex);
4488 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
4489 mutex_unlock(&parent_counter->child_mutex);
4490
4491 return child_counter;
4492}
4493
4494static int inherit_group(struct perf_counter *parent_counter,
4495 struct task_struct *parent,
4496 struct perf_counter_context *parent_ctx,
4497 struct task_struct *child,
4498 struct perf_counter_context *child_ctx)
4499{
4500 struct perf_counter *leader;
4501 struct perf_counter *sub;
4502 struct perf_counter *child_ctr;
4503
4504 leader = inherit_counter(parent_counter, parent, parent_ctx,
4505 child, NULL, child_ctx);
4506 if (IS_ERR(leader))
4507 return PTR_ERR(leader);
4508 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
4509 child_ctr = inherit_counter(sub, parent, parent_ctx,
4510 child, leader, child_ctx);
4511 if (IS_ERR(child_ctr))
4512 return PTR_ERR(child_ctr);
4513 }
4514 return 0;
4515}
4516
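/*
 * Fold an exiting child counter's count and times back into its parent
 * and drop the reference the child held on the parent's filp.
 */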
4517static void sync_child_counter(struct perf_counter *child_counter,
4518 struct task_struct *child)
4519{
4520 struct perf_counter *parent_counter = child_counter->parent;
4521 u64 child_val;
4522
4523 if (child_counter->attr.inherit_stat)
4524 perf_counter_read_event(child_counter, child);
4525
4526 child_val = atomic64_read(&child_counter->count);
4527
4528 /*
4529 * Add back the child's count to the parent's count:
4530 */
4531 atomic64_add(child_val, &parent_counter->count);
4532 atomic64_add(child_counter->total_time_enabled,
4533 &parent_counter->child_total_time_enabled);
4534 atomic64_add(child_counter->total_time_running,
4535 &parent_counter->child_total_time_running);
4536
4537 /*
4538 * Remove this counter from the parent's list
4539 */
4540 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4541 mutex_lock(&parent_counter->child_mutex);
4542 list_del_init(&child_counter->child_list);
4543 mutex_unlock(&parent_counter->child_mutex);
4544
4545 /*
4546 * Release the parent counter, if this was the last
4547 * reference to it.
4548 */
4549 fput(parent_counter->filp);
4550}
4551
4552static void
4553__perf_counter_exit_task(struct perf_counter *child_counter,
4554 struct perf_counter_context *child_ctx,
4555 struct task_struct *child)
4556{
4557 struct perf_counter *parent_counter;
4558
4559 update_counter_times(child_counter);
4560 perf_counter_remove_from_context(child_counter);
4561
4562 parent_counter = child_counter->parent;
4563 /*
4564 * It can happen that the parent exits first, and has counters
4565 * that are still around due to the child reference. These
4566 * counters need to be zapped - but they would otherwise linger.
4567 */
4568 if (parent_counter) {
4569 sync_child_counter(child_counter, child);
4570 free_counter(child_counter);
4571 }
4572}
4573
4574/*
4575 * When a child task exits, feed back counter values to parent counters.
4576 */
4577void perf_counter_exit_task(struct task_struct *child)
4578{
4579 struct perf_counter *child_counter, *tmp;
4580 struct perf_counter_context *child_ctx;
4581 unsigned long flags;
4582
4583 if (likely(!child->perf_counter_ctxp)) {
4584 perf_counter_task(child, NULL, 0);
4585 return;
4586 }
4587
4588 local_irq_save(flags);
4589 /*
4590 * We can't reschedule here because interrupts are disabled,
4591 * and either the child is current or it is a task that can't be
4592 * scheduled, so we are now safe from a reschedule changing
4593 * our context.
4594 */
4595 child_ctx = child->perf_counter_ctxp;
4596 __perf_counter_task_sched_out(child_ctx);
4597
4598 /*
4599 * Take the context lock here so that if find_get_context is
4600 * reading child->perf_counter_ctxp, we wait until it has
4601 * incremented the context's refcount before we do put_ctx below.
4602 */
4603 spin_lock(&child_ctx->lock);
4604 child->perf_counter_ctxp = NULL;
4605 /*
4606 * If this context is a clone; unclone it so it can't get
4607 * swapped to another process while we're removing all
4608 * the counters from it.
4609 */
4610 unclone_ctx(child_ctx);
4611 spin_unlock_irqrestore(&child_ctx->lock, flags);
4612
4613 /*
4614 * Report the task dead after unscheduling the counters so that we
4615 * won't get any samples after PERF_EVENT_EXIT. We can however still
4616 * get a few PERF_EVENT_READ events.
4617 */
4618 perf_counter_task(child, child_ctx, 0);
4619
4620 /*
4621 * We can recurse on the same lock type through:
4622 *
4623 * __perf_counter_exit_task()
4624 * sync_child_counter()
4625 * fput(parent_counter->filp)
4626 * perf_release()
4627 * mutex_lock(&ctx->mutex)
4628 *
4629 * But since it's the parent context, it won't be the same instance.
4630 */
4631 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4632
4633again:
4634 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4635 list_entry)
4636 __perf_counter_exit_task(child_counter, child_ctx, child);
4637
4638 /*
4639 * If the last counter was a group counter, it will have appended all
4640 * its siblings to the list, but we obtained 'tmp' before that which
4641 * will still point to the list head terminating the iteration.
4642 */
4643 if (!list_empty(&child_ctx->counter_list))
4644 goto again;
4645
4646 mutex_unlock(&child_ctx->mutex);
4647
4648 put_ctx(child_ctx);
4649}
4650
4651/*
4652 * free an unexposed, unused context, as created by inheritance in
4653 * perf_counter_init_task() below; used by fork() in case of failure.
4654 */
4655void perf_counter_free_task(struct task_struct *task)
4656{
4657 struct perf_counter_context *ctx = task->perf_counter_ctxp;
4658 struct perf_counter *counter, *tmp;
4659
4660 if (!ctx)
4661 return;
4662
4663 mutex_lock(&ctx->mutex);
4664again:
4665 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
4666 struct perf_counter *parent = counter->parent;
4667
4668 if (WARN_ON_ONCE(!parent))
4669 continue;
4670
4671 mutex_lock(&parent->child_mutex);
4672 list_del_init(&counter->child_list);
4673 mutex_unlock(&parent->child_mutex);
4674
4675 fput(parent->filp);
4676
4677 list_del_counter(counter, ctx);
4678 free_counter(counter);
4679 }
4680
4681 if (!list_empty(&ctx->counter_list))
4682 goto again;
4683
4684 mutex_unlock(&ctx->mutex);
4685
4686 put_ctx(ctx);
4687}
4688
4689/*
4690 * Initialize the perf_counter context in task_struct
4691 */
4692int perf_counter_init_task(struct task_struct *child)
4693{
4694 struct perf_counter_context *child_ctx, *parent_ctx;
4695 struct perf_counter_context *cloned_ctx;
4696 struct perf_counter *counter;
4697 struct task_struct *parent = current;
4698 int inherited_all = 1;
4699 int ret = 0;
4700
4701 child->perf_counter_ctxp = NULL;
4702
4703 mutex_init(&child->perf_counter_mutex);
4704 INIT_LIST_HEAD(&child->perf_counter_list);
4705
4706 if (likely(!parent->perf_counter_ctxp))
4707 return 0;
4708
4709 /*
4710 * This is executed from the parent task context, so inherit
4711 * counters that have been marked for cloning.
4712 * First allocate and initialize a context for the child.
4713 */
4714
4715 child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
4716 if (!child_ctx)
4717 return -ENOMEM;
4718
4719 __perf_counter_init_context(child_ctx, child);
4720 child->perf_counter_ctxp = child_ctx;
4721 get_task_struct(child);
4722
4723 /*
4724 * If the parent's context is a clone, pin it so it won't get
4725 * swapped under us.
4726 */
4727 parent_ctx = perf_pin_task_context(parent);
4728
4729 /*
4730 * No need to check if parent_ctx != NULL here; since we saw
4731 * it non-NULL earlier, the only reason for it to become NULL
4732 * is if we exit, and since we're currently in the middle of
4733 * a fork we can't be exiting at the same time.
4734 */
4735
4736 /*
4737 * Lock the parent list. No need to lock the child - not PID
4738 * hashed yet and not running, so nobody can access it.
4739 */
4740 mutex_lock(&parent_ctx->mutex);
4741
4742 /*
4743 * We don't have to disable NMIs - we are only looking at
4744 * the list, not manipulating it:
4745 */
4746 list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
4747 if (counter != counter->group_leader)
4748 continue;
4749
4750 if (!counter->attr.inherit) {
4751 inherited_all = 0;
4752 continue;
4753 }
4754
4755 ret = inherit_group(counter, parent, parent_ctx,
4756 child, child_ctx);
4757 if (ret) {
4758 inherited_all = 0;
4759 break;
4760 }
4761 }
4762
4763 if (inherited_all) {
4764 /*
4765 * Mark the child context as a clone of the parent
4766 * context, or of whatever the parent is a clone of.
4767 * Note that if the parent is a clone, it could get
4768 * uncloned at any point, but that doesn't matter
4769 * because the list of counters and the generation
4770 * count can't have changed since we took the mutex.
4771 */
4772 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4773 if (cloned_ctx) {
4774 child_ctx->parent_ctx = cloned_ctx;
4775 child_ctx->parent_gen = parent_ctx->parent_gen;
4776 } else {
4777 child_ctx->parent_ctx = parent_ctx;
4778 child_ctx->parent_gen = parent_ctx->generation;
4779 }
4780 get_ctx(child_ctx->parent_ctx);
4781 }
4782
4783 mutex_unlock(&parent_ctx->mutex);
4784
4785 perf_unpin_context(parent_ctx);
4786
4787 return ret;
4788}
4789
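/*
 * Set up the per-cpu counter context when a CPU is brought up.
 */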
4790static void __cpuinit perf_counter_init_cpu(int cpu)
4791{
4792 struct perf_cpu_context *cpuctx;
4793
4794 cpuctx = &per_cpu(perf_cpu_context, cpu);
4795 __perf_counter_init_context(&cpuctx->ctx, NULL);
4796
4797 spin_lock(&perf_resource_lock);
4798 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
4799 spin_unlock(&perf_resource_lock);
4800
4801 hw_perf_counter_setup(cpu);
4802}
4803
4804#ifdef CONFIG_HOTPLUG_CPU
4805static void __perf_counter_exit_cpu(void *info)
4806{
4807 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4808 struct perf_counter_context *ctx = &cpuctx->ctx;
4809 struct perf_counter *counter, *tmp;
4810
4811 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
4812 __perf_counter_remove_from_context(counter);
4813}
4814static void perf_counter_exit_cpu(int cpu)
4815{
4816 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4817 struct perf_counter_context *ctx = &cpuctx->ctx;
4818
4819 mutex_lock(&ctx->mutex);
4820 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
4821 mutex_unlock(&ctx->mutex);
4822}
4823#else
4824static inline void perf_counter_exit_cpu(int cpu) { }
4825#endif
4826
4827static int __cpuinit
4828perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4829{
4830 unsigned int cpu = (long)hcpu;
4831
4832 switch (action) {
4833
4834 case CPU_UP_PREPARE:
4835 case CPU_UP_PREPARE_FROZEN:
4836 perf_counter_init_cpu(cpu);
4837 break;
4838
4839 case CPU_ONLINE:
4840 case CPU_ONLINE_FROZEN:
4841 hw_perf_counter_setup_online(cpu);
4842 break;
4843
4844 case CPU_DOWN_PREPARE:
4845 case CPU_DOWN_PREPARE_FROZEN:
4846 perf_counter_exit_cpu(cpu);
4847 break;
4848
4849 default:
4850 break;
4851 }
4852
4853 return NOTIFY_OK;
4854}
4855
4856/*
4857 * This has to have a higher priority than migration_notifier in sched.c.
4858 */
4859static struct notifier_block __cpuinitdata perf_cpu_nb = {
4860 .notifier_call = perf_cpu_notify,
4861 .priority = 20,
4862};
4863
4864void __init perf_counter_init(void)
4865{
4866 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4867 (void *)(long)smp_processor_id());
4868 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4869 (void *)(long)smp_processor_id());
4870 register_cpu_notifier(&perf_cpu_nb);
4871}
4872
4873static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4874{
4875 return sprintf(buf, "%d\n", perf_reserved_percpu);
4876}
4877
4878static ssize_t
4879perf_set_reserve_percpu(struct sysdev_class *class,
4880 const char *buf,
4881 size_t count)
4882{
4883 struct perf_cpu_context *cpuctx;
4884 unsigned long val;
4885 int err, cpu, mpt;
4886
4887 err = strict_strtoul(buf, 10, &val);
4888 if (err)
4889 return err;
4890 if (val > perf_max_counters)
4891 return -EINVAL;
4892
4893 spin_lock(&perf_resource_lock);
4894 perf_reserved_percpu = val;
4895 for_each_online_cpu(cpu) {
4896 cpuctx = &per_cpu(perf_cpu_context, cpu);
4897 spin_lock_irq(&cpuctx->ctx.lock);
4898 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
4899 perf_max_counters - perf_reserved_percpu);
4900 cpuctx->max_pertask = mpt;
4901 spin_unlock_irq(&cpuctx->ctx.lock);
4902 }
4903 spin_unlock(&perf_resource_lock);
4904
4905 return count;
4906}
4907
4908static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4909{
4910 return sprintf(buf, "%d\n", perf_overcommit);
4911}
4912
4913static ssize_t
4914perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4915{
4916 unsigned long val;
4917 int err;
4918
4919 err = strict_strtoul(buf, 10, &val);
4920 if (err)
4921 return err;
4922 if (val > 1)
4923 return -EINVAL;
4924
4925 spin_lock(&perf_resource_lock);
4926 perf_overcommit = val;
4927 spin_unlock(&perf_resource_lock);
4928
4929 return count;
4930}
4931
4932static SYSDEV_CLASS_ATTR(
4933 reserve_percpu,
4934 0644,
4935 perf_show_reserve_percpu,
4936 perf_set_reserve_percpu
4937 );
4938
4939static SYSDEV_CLASS_ATTR(
4940 overcommit,
4941 0644,
4942 perf_show_overcommit,
4943 perf_set_overcommit
4944 );
4945
4946static struct attribute *perfclass_attrs[] = {
4947 &attr_reserve_percpu.attr,
4948 &attr_overcommit.attr,
4949 NULL
4950};
4951
4952static struct attribute_group perfclass_attr_group = {
4953 .attrs = perfclass_attrs,
4954 .name = "perf_counters",
4955};
4956
4957static int __init perf_counter_sysfs_init(void)
4958{
4959 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4960 &perfclass_attr_group);
4961}
4962device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
new file mode 100644
index 000000000000..6b7ddba1dd64
--- /dev/null
+++ b/kernel/perf_event.c
@@ -0,0 +1,5359 @@
1/*
2 * Performance events core code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/vmalloc.h>
24#include <linux/hardirq.h>
25#include <linux/rculist.h>
26#include <linux/uaccess.h>
27#include <linux/syscalls.h>
28#include <linux/anon_inodes.h>
29#include <linux/kernel_stat.h>
30#include <linux/perf_event.h>
31#include <linux/ftrace_event.h>
32#include <linux/hw_breakpoint.h>
33
34#include <asm/irq_regs.h>
35
36/*
37 * Each CPU has a list of per CPU events:
38 */
39DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
40
41int perf_max_events __read_mostly = 1;
42static int perf_reserved_percpu __read_mostly;
43static int perf_overcommit __read_mostly = 1;
44
45static atomic_t nr_events __read_mostly;
46static atomic_t nr_mmap_events __read_mostly;
47static atomic_t nr_comm_events __read_mostly;
48static atomic_t nr_task_events __read_mostly;
49
50/*
51 * perf event paranoia level:
52 * -1 - not paranoid at all
53 * 0 - disallow raw tracepoint access for unpriv
54 * 1 - disallow cpu events for unpriv
55 * 2 - disallow kernel profiling for unpriv
56 */
57int sysctl_perf_event_paranoid __read_mostly = 1;
58
59static inline bool perf_paranoid_tracepoint_raw(void)
60{
61 return sysctl_perf_event_paranoid > -1;
62}
63
64static inline bool perf_paranoid_cpu(void)
65{
66 return sysctl_perf_event_paranoid > 0;
67}
68
69static inline bool perf_paranoid_kernel(void)
70{
71 return sysctl_perf_event_paranoid > 1;
72}
73
74int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
75
76/*
77 * max perf event sample rate
78 */
79int sysctl_perf_event_sample_rate __read_mostly = 100000;
80
81static atomic64_t perf_event_id;
82
83/*
84 * Lock for (sysadmin-configurable) event reservations:
85 */
86static DEFINE_SPINLOCK(perf_resource_lock);
87
88/*
89 * Architecture provided APIs - weak aliases:
90 */
91extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
92{
93 return NULL;
94}
95
96void __weak hw_perf_disable(void) { barrier(); }
97void __weak hw_perf_enable(void) { barrier(); }
98
99void __weak hw_perf_event_setup(int cpu) { barrier(); }
100void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
101
102int __weak
103hw_perf_group_sched_in(struct perf_event *group_leader,
104 struct perf_cpu_context *cpuctx,
105 struct perf_event_context *ctx, int cpu)
106{
107 return 0;
108}
109
110void __weak perf_event_print_debug(void) { }
111
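/*
 * Per-CPU nesting count for perf_disable()/perf_enable(): the PMU is
 * only re-enabled once this count drops back to zero.
 */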
112static DEFINE_PER_CPU(int, perf_disable_count);
113
114void __perf_disable(void)
115{
116 __get_cpu_var(perf_disable_count)++;
117}
118
119bool __perf_enable(void)
120{
121 return !--__get_cpu_var(perf_disable_count);
122}
123
124void perf_disable(void)
125{
126 __perf_disable();
127 hw_perf_disable();
128}
129
130void perf_enable(void)
131{
132 if (__perf_enable())
133 hw_perf_enable();
134}
135
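/*
 * Context life-time: a perf_event_context is reference counted and is
 * freed via RCU once the last reference is dropped.
 */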
136static void get_ctx(struct perf_event_context *ctx)
137{
138 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
139}
140
141static void free_ctx(struct rcu_head *head)
142{
143 struct perf_event_context *ctx;
144
145 ctx = container_of(head, struct perf_event_context, rcu_head);
146 kfree(ctx);
147}
148
149static void put_ctx(struct perf_event_context *ctx)
150{
151 if (atomic_dec_and_test(&ctx->refcount)) {
152 if (ctx->parent_ctx)
153 put_ctx(ctx->parent_ctx);
154 if (ctx->task)
155 put_task_struct(ctx->task);
156 call_rcu(&ctx->rcu_head, free_ctx);
157 }
158}
159
160static void unclone_ctx(struct perf_event_context *ctx)
161{
162 if (ctx->parent_ctx) {
163 put_ctx(ctx->parent_ctx);
164 ctx->parent_ctx = NULL;
165 }
166}
167
168/*
169 * If we inherit events we want to return the parent event id
170 * to userspace.
171 */
172static u64 primary_event_id(struct perf_event *event)
173{
174 u64 id = event->id;
175
176 if (event->parent)
177 id = event->parent->id;
178
179 return id;
180}
181
182/*
183 * Get the perf_event_context for a task and lock it.
184 * This has to cope with the fact that until it is locked,
185 * the context could get moved to another task.
186 */
187static struct perf_event_context *
188perf_lock_task_context(struct task_struct *task, unsigned long *flags)
189{
190 struct perf_event_context *ctx;
191
192 rcu_read_lock();
193 retry:
194 ctx = rcu_dereference(task->perf_event_ctxp);
195 if (ctx) {
196 /*
197 * If this context is a clone of another, it might
198 * get swapped for another underneath us by
199 * perf_event_task_sched_out, though the
200 * rcu_read_lock() protects us from any context
201 * getting freed. Lock the context and check if it
202 * got swapped before we could get the lock, and retry
203 * if so. If we locked the right context, then it
204 * can't get swapped on us any more.
205 */
206 spin_lock_irqsave(&ctx->lock, *flags);
207 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
208 spin_unlock_irqrestore(&ctx->lock, *flags);
209 goto retry;
210 }
211
212 if (!atomic_inc_not_zero(&ctx->refcount)) {
213 spin_unlock_irqrestore(&ctx->lock, *flags);
214 ctx = NULL;
215 }
216 }
217 rcu_read_unlock();
218 return ctx;
219}
220
221/*
222 * Get the context for a task and increment its pin_count so it
223 * can't get swapped to another task. This also increments its
224 * reference count so that the context can't get freed.
225 */
226static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
227{
228 struct perf_event_context *ctx;
229 unsigned long flags;
230
231 ctx = perf_lock_task_context(task, &flags);
232 if (ctx) {
233 ++ctx->pin_count;
234 spin_unlock_irqrestore(&ctx->lock, flags);
235 }
236 return ctx;
237}
238
239static void perf_unpin_context(struct perf_event_context *ctx)
240{
241 unsigned long flags;
242
243 spin_lock_irqsave(&ctx->lock, flags);
244 --ctx->pin_count;
245 spin_unlock_irqrestore(&ctx->lock, flags);
246 put_ctx(ctx);
247}
248
249static inline u64 perf_clock(void)
250{
251 return cpu_clock(smp_processor_id());
252}
253
254/*
255 * Update the record of the current time in a context.
256 */
257static void update_context_time(struct perf_event_context *ctx)
258{
259 u64 now = perf_clock();
260
261 ctx->time += now - ctx->timestamp;
262 ctx->timestamp = now;
263}
264
265/*
266 * Update the total_time_enabled and total_time_running fields for an event.
267 */
268static void update_event_times(struct perf_event *event)
269{
270 struct perf_event_context *ctx = event->ctx;
271 u64 run_end;
272
273 if (event->state < PERF_EVENT_STATE_INACTIVE ||
274 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
275 return;
276
277 if (ctx->is_active)
278 run_end = ctx->time;
279 else
280 run_end = event->tstamp_stopped;
281
282 event->total_time_enabled = run_end - event->tstamp_enabled;
283
284 if (event->state == PERF_EVENT_STATE_INACTIVE)
285 run_end = event->tstamp_stopped;
286 else
287 run_end = ctx->time;
288
289 event->total_time_running = run_end - event->tstamp_running;
290}
291
292/*
293 * Add an event to the lists for its context.
294 * Must be called with ctx->mutex and ctx->lock held.
295 */
296static void
297list_add_event(struct perf_event *event, struct perf_event_context *ctx)
298{
299 struct perf_event *group_leader = event->group_leader;
300
301 /*
302 * Depending on whether it is a standalone or sibling event,
303 * add it straight to the context's event list, or to the group
304 * leader's sibling list:
305 */
306 if (group_leader == event)
307 list_add_tail(&event->group_entry, &ctx->group_list);
308 else {
309 list_add_tail(&event->group_entry, &group_leader->sibling_list);
310 group_leader->nr_siblings++;
311 }
312
313 list_add_rcu(&event->event_entry, &ctx->event_list);
314 ctx->nr_events++;
315 if (event->attr.inherit_stat)
316 ctx->nr_stat++;
317}
318
319/*
320 * Remove an event from the lists for its context.
321 * Must be called with ctx->mutex and ctx->lock held.
322 */
323static void
324list_del_event(struct perf_event *event, struct perf_event_context *ctx)
325{
326 struct perf_event *sibling, *tmp;
327
328 if (list_empty(&event->group_entry))
329 return;
330 ctx->nr_events--;
331 if (event->attr.inherit_stat)
332 ctx->nr_stat--;
333
334 list_del_init(&event->group_entry);
335 list_del_rcu(&event->event_entry);
336
337 if (event->group_leader != event)
338 event->group_leader->nr_siblings--;
339
340 update_event_times(event);
341
342 /*
343 * If the event was in error state, then keep it
344 * that way, otherwise bogus counts will be
345 * returned on read(). The only way to get out
346 * of error state is by explicit re-enabling
347 * of the event
348 */
349 if (event->state > PERF_EVENT_STATE_OFF)
350 event->state = PERF_EVENT_STATE_OFF;
351
352 /*
353 * If this was a group event with sibling events then
354 * upgrade the siblings to singleton events by adding them
355 * to the context list directly:
356 */
357 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
358
359 list_move_tail(&sibling->group_entry, &ctx->group_list);
360 sibling->group_leader = sibling;
361 }
362}
363
364static void
365event_sched_out(struct perf_event *event,
366 struct perf_cpu_context *cpuctx,
367 struct perf_event_context *ctx)
368{
369 if (event->state != PERF_EVENT_STATE_ACTIVE)
370 return;
371
372 event->state = PERF_EVENT_STATE_INACTIVE;
373 if (event->pending_disable) {
374 event->pending_disable = 0;
375 event->state = PERF_EVENT_STATE_OFF;
376 }
377 event->tstamp_stopped = ctx->time;
378 event->pmu->disable(event);
379 event->oncpu = -1;
380
381 if (!is_software_event(event))
382 cpuctx->active_oncpu--;
383 ctx->nr_active--;
384 if (event->attr.exclusive || !cpuctx->active_oncpu)
385 cpuctx->exclusive = 0;
386}
387
388static void
389group_sched_out(struct perf_event *group_event,
390 struct perf_cpu_context *cpuctx,
391 struct perf_event_context *ctx)
392{
393 struct perf_event *event;
394
395 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
396 return;
397
398 event_sched_out(group_event, cpuctx, ctx);
399
400 /*
401 * Schedule out siblings (if any):
402 */
403 list_for_each_entry(event, &group_event->sibling_list, group_entry)
404 event_sched_out(event, cpuctx, ctx);
405
406 if (group_event->attr.exclusive)
407 cpuctx->exclusive = 0;
408}
409
410/*
411 * Cross CPU call to remove a performance event
412 *
413 * We disable the event on the hardware level first. After that we
414 * remove it from the context list.
415 */
416static void __perf_event_remove_from_context(void *info)
417{
418 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
419 struct perf_event *event = info;
420 struct perf_event_context *ctx = event->ctx;
421
422 /*
423 * If this is a task context, we need to check whether it is
424 * the current task context of this cpu. If not it has been
425 * scheduled out before the smp call arrived.
426 */
427 if (ctx->task && cpuctx->task_ctx != ctx)
428 return;
429
430 spin_lock(&ctx->lock);
431 /*
432 * Protect the list operation against NMI by disabling the
433 * events on a global level.
434 */
435 perf_disable();
436
437 event_sched_out(event, cpuctx, ctx);
438
439 list_del_event(event, ctx);
440
441 if (!ctx->task) {
442 /*
443 * Allow more per task events with respect to the
444 * reservation:
445 */
446 cpuctx->max_pertask =
447 min(perf_max_events - ctx->nr_events,
448 perf_max_events - perf_reserved_percpu);
449 }
450
451 perf_enable();
452 spin_unlock(&ctx->lock);
453}
454
455
456/*
457 * Remove the event from a task's (or a CPU's) list of events.
458 *
459 * Must be called with ctx->mutex held.
460 *
461 * CPU events are removed with a smp call. For task events we only
462 * call when the task is on a CPU.
463 *
464 * If event->ctx is a cloned context, callers must make sure that
465 * every task struct that event->ctx->task could possibly point to
466 * remains valid. This is OK when called from perf_release since
467 * that only calls us on the top-level context, which can't be a clone.
468 * When called from perf_event_exit_task, it's OK because the
469 * context has been detached from its task.
470 */
471static void perf_event_remove_from_context(struct perf_event *event)
472{
473 struct perf_event_context *ctx = event->ctx;
474 struct task_struct *task = ctx->task;
475
476 if (!task) {
477 /*
478 * Per cpu events are removed via an smp call and
479 * the removal is always successful.
480 */
481 smp_call_function_single(event->cpu,
482 __perf_event_remove_from_context,
483 event, 1);
484 return;
485 }
486
487retry:
488 task_oncpu_function_call(task, __perf_event_remove_from_context,
489 event);
490
491 spin_lock_irq(&ctx->lock);
492 /*
493 * If the context is active we need to retry the smp call.
494 */
495 if (ctx->nr_active && !list_empty(&event->group_entry)) {
496 spin_unlock_irq(&ctx->lock);
497 goto retry;
498 }
499
500 /*
501 * The lock prevents this context from being scheduled in, so we
502 * can remove the event safely if the call above did not
503 * succeed.
504 */
505 if (!list_empty(&event->group_entry))
506 list_del_event(event, ctx);
507 spin_unlock_irq(&ctx->lock);
508}
509
510/*
511 * Update total_time_enabled and total_time_running for all events in a group.
512 */
513static void update_group_times(struct perf_event *leader)
514{
515 struct perf_event *event;
516
517 update_event_times(leader);
518 list_for_each_entry(event, &leader->sibling_list, group_entry)
519 update_event_times(event);
520}
521
522/*
523 * Cross CPU call to disable a performance event
524 */
525static void __perf_event_disable(void *info)
526{
527 struct perf_event *event = info;
528 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
529 struct perf_event_context *ctx = event->ctx;
530
531 /*
532 * If this is a per-task event, need to check whether this
533 * event's task is the current task on this cpu.
534 */
535 if (ctx->task && cpuctx->task_ctx != ctx)
536 return;
537
538 spin_lock(&ctx->lock);
539
540 /*
541 * If the event is on, turn it off.
542 * If it is in error state, leave it in error state.
543 */
544 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
545 update_context_time(ctx);
546 update_group_times(event);
547 if (event == event->group_leader)
548 group_sched_out(event, cpuctx, ctx);
549 else
550 event_sched_out(event, cpuctx, ctx);
551 event->state = PERF_EVENT_STATE_OFF;
552 }
553
554 spin_unlock(&ctx->lock);
555}
556
557/*
558 * Disable an event.
559 *
560 * If event->ctx is a cloned context, callers must make sure that
561 * every task struct that event->ctx->task could possibly point to
562 * remains valid. This condition is satisfied when called through
563 * perf_event_for_each_child or perf_event_for_each because they
564 * hold the top-level event's child_mutex, so any descendant that
565 * goes to exit will block in sync_child_event.
566 * When called from perf_pending_event it's OK because event->ctx
567 * is the current context on this CPU and preemption is disabled,
568 * hence we can't get into perf_event_task_sched_out for this context.
569 */
570static void perf_event_disable(struct perf_event *event)
571{
572 struct perf_event_context *ctx = event->ctx;
573 struct task_struct *task = ctx->task;
574
575 if (!task) {
576 /*
577 * Disable the event on the cpu that it's on
578 */
579 smp_call_function_single(event->cpu, __perf_event_disable,
580 event, 1);
581 return;
582 }
583
584 retry:
585 task_oncpu_function_call(task, __perf_event_disable, event);
586
587 spin_lock_irq(&ctx->lock);
588 /*
589 * If the event is still active, we need to retry the cross-call.
590 */
591 if (event->state == PERF_EVENT_STATE_ACTIVE) {
592 spin_unlock_irq(&ctx->lock);
593 goto retry;
594 }
595
596 /*
597 * Since we have the lock this context can't be scheduled
598 * in, so we can change the state safely.
599 */
600 if (event->state == PERF_EVENT_STATE_INACTIVE) {
601 update_group_times(event);
602 event->state = PERF_EVENT_STATE_OFF;
603 }
604
605 spin_unlock_irq(&ctx->lock);
606}
607
608static int
609event_sched_in(struct perf_event *event,
610 struct perf_cpu_context *cpuctx,
611 struct perf_event_context *ctx,
612 int cpu)
613{
614 if (event->state <= PERF_EVENT_STATE_OFF)
615 return 0;
616
617 event->state = PERF_EVENT_STATE_ACTIVE;
618 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
619 /*
620 * The new state must be visible before we turn it on in the hardware:
621 */
622 smp_wmb();
623
624 if (event->pmu->enable(event)) {
625 event->state = PERF_EVENT_STATE_INACTIVE;
626 event->oncpu = -1;
627 return -EAGAIN;
628 }
629
630 event->tstamp_running += ctx->time - event->tstamp_stopped;
631
632 if (!is_software_event(event))
633 cpuctx->active_oncpu++;
634 ctx->nr_active++;
635
636 if (event->attr.exclusive)
637 cpuctx->exclusive = 1;
638
639 return 0;
640}
641
642static int
643group_sched_in(struct perf_event *group_event,
644 struct perf_cpu_context *cpuctx,
645 struct perf_event_context *ctx,
646 int cpu)
647{
648 struct perf_event *event, *partial_group;
649 int ret;
650
651 if (group_event->state == PERF_EVENT_STATE_OFF)
652 return 0;
653
654 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
655 if (ret)
656 return ret < 0 ? ret : 0;
657
658 if (event_sched_in(group_event, cpuctx, ctx, cpu))
659 return -EAGAIN;
660
661 /*
662 * Schedule in siblings as one group (if any):
663 */
664 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
665 if (event_sched_in(event, cpuctx, ctx, cpu)) {
666 partial_group = event;
667 goto group_error;
668 }
669 }
670
671 return 0;
672
673group_error:
674 /*
675 * Groups can be scheduled in as one unit only, so undo any
676 * partial group before returning:
677 */
678 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
679 if (event == partial_group)
680 break;
681 event_sched_out(event, cpuctx, ctx);
682 }
683 event_sched_out(group_event, cpuctx, ctx);
684
685 return -EAGAIN;
686}
687
688/*
689 * Return 1 for a group consisting entirely of software events,
690 * 0 if the group contains any hardware events.
691 */
692static int is_software_only_group(struct perf_event *leader)
693{
694 struct perf_event *event;
695
696 if (!is_software_event(leader))
697 return 0;
698
699 list_for_each_entry(event, &leader->sibling_list, group_entry)
700 if (!is_software_event(event))
701 return 0;
702
703 return 1;
704}
705
706/*
707 * Work out whether we can put this event group on the CPU now.
708 */
709static int group_can_go_on(struct perf_event *event,
710 struct perf_cpu_context *cpuctx,
711 int can_add_hw)
712{
713 /*
714 * Groups consisting entirely of software events can always go on.
715 */
716 if (is_software_only_group(event))
717 return 1;
718 /*
719 * If an exclusive group is already on, no other hardware
720 * events can go on.
721 */
722 if (cpuctx->exclusive)
723 return 0;
724 /*
725 * If this group is exclusive and there are already
726 * events on the CPU, it can't go on.
727 */
728 if (event->attr.exclusive && cpuctx->active_oncpu)
729 return 0;
730 /*
731 * Otherwise, try to add it if all previous groups were able
732 * to go on.
733 */
734 return can_add_hw;
735}
736
737static void add_event_to_ctx(struct perf_event *event,
738 struct perf_event_context *ctx)
739{
740 list_add_event(event, ctx);
741 event->tstamp_enabled = ctx->time;
742 event->tstamp_running = ctx->time;
743 event->tstamp_stopped = ctx->time;
744}
745
746/*
747 * Cross CPU call to install and enable a performance event
748 *
749 * Must be called with ctx->mutex held
750 */
751static void __perf_install_in_context(void *info)
752{
753 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
754 struct perf_event *event = info;
755 struct perf_event_context *ctx = event->ctx;
756 struct perf_event *leader = event->group_leader;
757 int cpu = smp_processor_id();
758 int err;
759
760 /*
761 * If this is a task context, we need to check whether it is
762 * the current task context of this cpu. If not it has been
763 * scheduled out before the smp call arrived.
764 * Or possibly this is the right context but it isn't
765 * on this cpu because it had no events.
766 */
767 if (ctx->task && cpuctx->task_ctx != ctx) {
768 if (cpuctx->task_ctx || ctx->task != current)
769 return;
770 cpuctx->task_ctx = ctx;
771 }
772
773 spin_lock(&ctx->lock);
774 ctx->is_active = 1;
775 update_context_time(ctx);
776
777 /*
778 * Protect the list operation against NMI by disabling the
779 * events on a global level. NOP for non NMI based events.
780 */
781 perf_disable();
782
783 add_event_to_ctx(event, ctx);
784
785 /*
786 * Don't put the event on if it is disabled or if
787 * it is in a group and the group isn't on.
788 */
789 if (event->state != PERF_EVENT_STATE_INACTIVE ||
790 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
791 goto unlock;
792
793 /*
794 * An exclusive event can't go on if there are already active
795 * hardware events, and no hardware event can go on if there
796 * is already an exclusive event on.
797 */
798 if (!group_can_go_on(event, cpuctx, 1))
799 err = -EEXIST;
800 else
801 err = event_sched_in(event, cpuctx, ctx, cpu);
802
803 if (err) {
804 /*
805 * This event couldn't go on. If it is in a group
806 * then we have to pull the whole group off.
807 * If the event group is pinned then put it in error state.
808 */
809 if (leader != event)
810 group_sched_out(leader, cpuctx, ctx);
811 if (leader->attr.pinned) {
812 update_group_times(leader);
813 leader->state = PERF_EVENT_STATE_ERROR;
814 }
815 }
816
817 if (!err && !ctx->task && cpuctx->max_pertask)
818 cpuctx->max_pertask--;
819
820 unlock:
821 perf_enable();
822
823 spin_unlock(&ctx->lock);
824}
825
826/*
827 * Attach a performance event to a context
828 *
829 * First we add the event to the list with the hardware enable bit
830 * in event->hw_config cleared.
831 *
832 * If the event is attached to a task which is on a CPU we use a smp
833 * call to enable it in the task context. The task might have been
834 * scheduled away, but we check this in the smp call again.
835 *
836 * Must be called with ctx->mutex held.
837 */
838static void
839perf_install_in_context(struct perf_event_context *ctx,
840 struct perf_event *event,
841 int cpu)
842{
843 struct task_struct *task = ctx->task;
844
845 if (!task) {
846 /*
847 * Per cpu events are installed via an smp call and
849 * the install is always successful.
849 */
850 smp_call_function_single(cpu, __perf_install_in_context,
851 event, 1);
852 return;
853 }
854
855retry:
856 task_oncpu_function_call(task, __perf_install_in_context,
857 event);
858
859 spin_lock_irq(&ctx->lock);
860 /*
861 * If the context is active we need to retry the smp call.
862 */
863 if (ctx->is_active && list_empty(&event->group_entry)) {
864 spin_unlock_irq(&ctx->lock);
865 goto retry;
866 }
867
868 /*
869 * The lock prevents this context from being scheduled in, so we
870 * can add the event safely if the call above did not
871 * succeed.
872 */
873 if (list_empty(&event->group_entry))
874 add_event_to_ctx(event, ctx);
875 spin_unlock_irq(&ctx->lock);
876}
877
878/*
879 * Put an event into inactive state and update time fields.
880 * Enabling the leader of a group effectively enables all
881 * the group members that aren't explicitly disabled, so we
882 * have to update their ->tstamp_enabled also.
883 * Note: this works for group members as well as group leaders
884 * since the non-leader members' sibling_lists will be empty.
885 */
886static void __perf_event_mark_enabled(struct perf_event *event,
887 struct perf_event_context *ctx)
888{
889 struct perf_event *sub;
890
891 event->state = PERF_EVENT_STATE_INACTIVE;
892 event->tstamp_enabled = ctx->time - event->total_time_enabled;
893 list_for_each_entry(sub, &event->sibling_list, group_entry)
894 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
895 sub->tstamp_enabled =
896 ctx->time - sub->total_time_enabled;
897}
898
899/*
900 * Cross CPU call to enable a performance event
901 */
902static void __perf_event_enable(void *info)
903{
904 struct perf_event *event = info;
905 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
906 struct perf_event_context *ctx = event->ctx;
907 struct perf_event *leader = event->group_leader;
908 int err;
909
910 /*
911 * If this is a per-task event, need to check whether this
912 * event's task is the current task on this cpu.
913 */
914 if (ctx->task && cpuctx->task_ctx != ctx) {
915 if (cpuctx->task_ctx || ctx->task != current)
916 return;
917 cpuctx->task_ctx = ctx;
918 }
919
920 spin_lock(&ctx->lock);
921 ctx->is_active = 1;
922 update_context_time(ctx);
923
924 if (event->state >= PERF_EVENT_STATE_INACTIVE)
925 goto unlock;
926 __perf_event_mark_enabled(event, ctx);
927
928 /*
929 * If the event is in a group and isn't the group leader,
930 * then don't put it on unless the group is on.
931 */
932 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
933 goto unlock;
934
935 if (!group_can_go_on(event, cpuctx, 1)) {
936 err = -EEXIST;
937 } else {
938 perf_disable();
939 if (event == leader)
940 err = group_sched_in(event, cpuctx, ctx,
941 smp_processor_id());
942 else
943 err = event_sched_in(event, cpuctx, ctx,
944 smp_processor_id());
945 perf_enable();
946 }
947
948 if (err) {
949 /*
950 * If this event can't go on and it's part of a
951 * group, then the whole group has to come off.
952 */
953 if (leader != event)
954 group_sched_out(leader, cpuctx, ctx);
955 if (leader->attr.pinned) {
956 update_group_times(leader);
957 leader->state = PERF_EVENT_STATE_ERROR;
958 }
959 }
960
961 unlock:
962 spin_unlock(&ctx->lock);
963}
964
965/*
966 * Enable an event.
967 *
968 * If event->ctx is a cloned context, callers must make sure that
969 * every task struct that event->ctx->task could possibly point to
970 * remains valid. This condition is satisfied when called through
971 * perf_event_for_each_child or perf_event_for_each as described
972 * for perf_event_disable.
973 */
974static void perf_event_enable(struct perf_event *event)
975{
976 struct perf_event_context *ctx = event->ctx;
977 struct task_struct *task = ctx->task;
978
979 if (!task) {
980 /*
981 * Enable the event on the cpu that it's on
982 */
983 smp_call_function_single(event->cpu, __perf_event_enable,
984 event, 1);
985 return;
986 }
987
988 spin_lock_irq(&ctx->lock);
989 if (event->state >= PERF_EVENT_STATE_INACTIVE)
990 goto out;
991
992 /*
993 * If the event is in error state, clear that first.
994 * That way, if we see the event in error state below, we
995 * know that it has gone back into error state, as distinct
996 * from the task having been scheduled away before the
997 * cross-call arrived.
998 */
999 if (event->state == PERF_EVENT_STATE_ERROR)
1000 event->state = PERF_EVENT_STATE_OFF;
1001
1002 retry:
1003 spin_unlock_irq(&ctx->lock);
1004 task_oncpu_function_call(task, __perf_event_enable, event);
1005
1006 spin_lock_irq(&ctx->lock);
1007
1008 /*
1009 * If the context is active and the event is still off,
1010 * we need to retry the cross-call.
1011 */
1012 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
1013 goto retry;
1014
1015 /*
1016 * Since we have the lock this context can't be scheduled
1017 * in, so we can change the state safely.
1018 */
1019 if (event->state == PERF_EVENT_STATE_OFF)
1020 __perf_event_mark_enabled(event, ctx);
1021
1022 out:
1023 spin_unlock_irq(&ctx->lock);
1024}
1025
1026static int perf_event_refresh(struct perf_event *event, int refresh)
1027{
1028 /*
1029 * not supported on inherited events
1030 */
1031 if (event->attr.inherit)
1032 return -EINVAL;
1033
1034 atomic_add(refresh, &event->event_limit);
1035 perf_event_enable(event);
1036
1037 return 0;
1038}
1039
1040void __perf_event_sched_out(struct perf_event_context *ctx,
1041 struct perf_cpu_context *cpuctx)
1042{
1043 struct perf_event *event;
1044
1045 spin_lock(&ctx->lock);
1046 ctx->is_active = 0;
1047 if (likely(!ctx->nr_events))
1048 goto out;
1049 update_context_time(ctx);
1050
1051 perf_disable();
1052 if (ctx->nr_active) {
1053 list_for_each_entry(event, &ctx->group_list, group_entry)
1054 group_sched_out(event, cpuctx, ctx);
1055 }
1056 perf_enable();
1057 out:
1058 spin_unlock(&ctx->lock);
1059}
1060
1061/*
1062 * Test whether two contexts are equivalent, i.e. whether they
1063 * have both been cloned from the same version of the same context
1064 * and they both have the same number of enabled events.
1065 * If the number of enabled events is the same, then the set
1066 * of enabled events should be the same, because these are both
1067 * inherited contexts, therefore we can't access individual events
1068 * in them directly with an fd; we can only enable/disable all
1069 * events via prctl, or enable/disable all events in a family
1070 * via ioctl, which will have the same effect on both contexts.
1071 */
1072static int context_equiv(struct perf_event_context *ctx1,
1073 struct perf_event_context *ctx2)
1074{
1075 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1076 && ctx1->parent_gen == ctx2->parent_gen
1077 && !ctx1->pin_count && !ctx2->pin_count;
1078}
1079
1080static void __perf_event_sync_stat(struct perf_event *event,
1081 struct perf_event *next_event)
1082{
1083 u64 value;
1084
1085 if (!event->attr.inherit_stat)
1086 return;
1087
1088 /*
1089 * Update the event value, we cannot use perf_event_read()
1090 * because we're in the middle of a context switch and have IRQs
1091 * disabled, which upsets smp_call_function_single(), however
1092 * we know the event must be on the current CPU, therefore we
1093 * don't need to use it.
1094 */
1095 switch (event->state) {
1096 case PERF_EVENT_STATE_ACTIVE:
1097 event->pmu->read(event);
1098 /* fall-through */
1099
1100 case PERF_EVENT_STATE_INACTIVE:
1101 update_event_times(event);
1102 break;
1103
1104 default:
1105 break;
1106 }
1107
1108 /*
1109 * In order to keep per-task stats reliable we need to flip the event
1110 * values when we flip the contexts.
1111 */
1112 value = atomic64_read(&next_event->count);
1113 value = atomic64_xchg(&event->count, value);
1114 atomic64_set(&next_event->count, value);
1115
1116 swap(event->total_time_enabled, next_event->total_time_enabled);
1117 swap(event->total_time_running, next_event->total_time_running);
1118
1119 /*
1120 * Since we swizzled the values, update the user visible data too.
1121 */
1122 perf_event_update_userpage(event);
1123 perf_event_update_userpage(next_event);
1124}
1125
1126#define list_next_entry(pos, member) \
1127 list_entry(pos->member.next, typeof(*pos), member)
1128
1129static void perf_event_sync_stat(struct perf_event_context *ctx,
1130 struct perf_event_context *next_ctx)
1131{
1132 struct perf_event *event, *next_event;
1133
1134 if (!ctx->nr_stat)
1135 return;
1136
1137 update_context_time(ctx);
1138
1139 event = list_first_entry(&ctx->event_list,
1140 struct perf_event, event_entry);
1141
1142 next_event = list_first_entry(&next_ctx->event_list,
1143 struct perf_event, event_entry);
1144
1145 while (&event->event_entry != &ctx->event_list &&
1146 &next_event->event_entry != &next_ctx->event_list) {
1147
1148 __perf_event_sync_stat(event, next_event);
1149
1150 event = list_next_entry(event, event_entry);
1151 next_event = list_next_entry(next_event, event_entry);
1152 }
1153}
1154
1155/*
1156 * Called from scheduler to remove the events of the current task,
1157 * with interrupts disabled.
1158 *
1159 * We stop each event and update the event value in event->count.
1160 *
1161 * This does not protect us against NMI, but disable()
1162 * sets the disabled bit in the control field of event _before_
1163 * accessing the event control register. If a NMI hits, then it will
1164 * not restart the event.
1165 */
1166void perf_event_task_sched_out(struct task_struct *task,
1167 struct task_struct *next, int cpu)
1168{
1169 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1170 struct perf_event_context *ctx = task->perf_event_ctxp;
1171 struct perf_event_context *next_ctx;
1172 struct perf_event_context *parent;
1173 struct pt_regs *regs;
1174 int do_switch = 1;
1175
1176 regs = task_pt_regs(task);
1177 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1178
1179 if (likely(!ctx || !cpuctx->task_ctx))
1180 return;
1181
1182 rcu_read_lock();
1183 parent = rcu_dereference(ctx->parent_ctx);
1184 next_ctx = next->perf_event_ctxp;
1185 if (parent && next_ctx &&
1186 rcu_dereference(next_ctx->parent_ctx) == parent) {
1187 /*
1188 * Looks like the two contexts are clones, so we might be
1189 * able to optimize the context switch. We lock both
1190 * contexts and check that they are clones under the
1191 * lock (including re-checking that neither has been
1192 * uncloned in the meantime). It doesn't matter which
1193 * order we take the locks because no other cpu could
1194 * be trying to lock both of these tasks.
1195 */
1196 spin_lock(&ctx->lock);
1197 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1198 if (context_equiv(ctx, next_ctx)) {
1199 /*
1200 * XXX do we need a memory barrier of sorts
1201 * wrt rcu_dereference() of perf_event_ctxp
1202 */
1203 task->perf_event_ctxp = next_ctx;
1204 next->perf_event_ctxp = ctx;
1205 ctx->task = next;
1206 next_ctx->task = task;
1207 do_switch = 0;
1208
1209 perf_event_sync_stat(ctx, next_ctx);
1210 }
1211 spin_unlock(&next_ctx->lock);
1212 spin_unlock(&ctx->lock);
1213 }
1214 rcu_read_unlock();
1215
1216 if (do_switch) {
1217 __perf_event_sched_out(ctx, cpuctx);
1218 cpuctx->task_ctx = NULL;
1219 }
1220}
1221
1222/*
1223 * Called with IRQs disabled
1224 */
1225static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1226{
1227 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1228
1229 if (!cpuctx->task_ctx)
1230 return;
1231
1232 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1233 return;
1234
1235 __perf_event_sched_out(ctx, cpuctx);
1236 cpuctx->task_ctx = NULL;
1237}
1238
1239/*
1240 * Called with IRQs disabled
1241 */
1242static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
1243{
1244 __perf_event_sched_out(&cpuctx->ctx, cpuctx);
1245}
1246
1247static void
1248__perf_event_sched_in(struct perf_event_context *ctx,
1249 struct perf_cpu_context *cpuctx, int cpu)
1250{
1251 struct perf_event *event;
1252 int can_add_hw = 1;
1253
1254 spin_lock(&ctx->lock);
1255 ctx->is_active = 1;
1256 if (likely(!ctx->nr_events))
1257 goto out;
1258
1259 ctx->timestamp = perf_clock();
1260
1261 perf_disable();
1262
1263 /*
1264 * First go through the list and put on any pinned groups
1265 * in order to give them the best chance of going on.
1266 */
1267 list_for_each_entry(event, &ctx->group_list, group_entry) {
1268 if (event->state <= PERF_EVENT_STATE_OFF ||
1269 !event->attr.pinned)
1270 continue;
1271 if (event->cpu != -1 && event->cpu != cpu)
1272 continue;
1273
1274 if (group_can_go_on(event, cpuctx, 1))
1275 group_sched_in(event, cpuctx, ctx, cpu);
1276
1277 /*
1278 * If this pinned group hasn't been scheduled,
1279 * put it in error state.
1280 */
1281 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1282 update_group_times(event);
1283 event->state = PERF_EVENT_STATE_ERROR;
1284 }
1285 }
1286
1287 list_for_each_entry(event, &ctx->group_list, group_entry) {
1288 /*
1289 * Ignore events in OFF or ERROR state, and
1290 * ignore pinned events since we did them already.
1291 */
1292 if (event->state <= PERF_EVENT_STATE_OFF ||
1293 event->attr.pinned)
1294 continue;
1295
1296 /*
1297 * Listen to the 'cpu' scheduling filter constraint
1298 * of events:
1299 */
1300 if (event->cpu != -1 && event->cpu != cpu)
1301 continue;
1302
1303 if (group_can_go_on(event, cpuctx, can_add_hw))
1304 if (group_sched_in(event, cpuctx, ctx, cpu))
1305 can_add_hw = 0;
1306 }
1307 perf_enable();
1308 out:
1309 spin_unlock(&ctx->lock);
1310}
1311
1312/*
1313 * Called from scheduler to add the events of the current task
1314 * with interrupts disabled.
1315 *
1316 * We restore the event value and then enable it.
1317 *
1318 * This does not protect us against NMI, but enable()
1319 * sets the enabled bit in the control field of event _before_
1320 * accessing the event control register. If a NMI hits, then it will
1321 * keep the event running.
1322 */
1323void perf_event_task_sched_in(struct task_struct *task, int cpu)
1324{
1325 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1326 struct perf_event_context *ctx = task->perf_event_ctxp;
1327
1328 if (likely(!ctx))
1329 return;
1330 if (cpuctx->task_ctx == ctx)
1331 return;
1332 __perf_event_sched_in(ctx, cpuctx, cpu);
1333 cpuctx->task_ctx = ctx;
1334}
1335
1336static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1337{
1338 struct perf_event_context *ctx = &cpuctx->ctx;
1339
1340 __perf_event_sched_in(ctx, cpuctx, cpu);
1341}
1342
1343#define MAX_INTERRUPTS (~0ULL)
1344
1345static void perf_log_throttle(struct perf_event *event, int enable);
1346
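/*
 * Re-compute the sample period so that the event generates roughly
 * attr.sample_freq samples per second; the adjustment is low-pass
 * filtered to avoid wild swings.
 */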
1347static void perf_adjust_period(struct perf_event *event, u64 events)
1348{
1349 struct hw_perf_event *hwc = &event->hw;
1350 u64 period, sample_period;
1351 s64 delta;
1352
1353 events *= hwc->sample_period;
1354 period = div64_u64(events, event->attr.sample_freq);
1355
1356 delta = (s64)(period - hwc->sample_period);
1357 delta = (delta + 7) / 8; /* low pass filter */
1358
1359 sample_period = hwc->sample_period + delta;
1360
1361 if (!sample_period)
1362 sample_period = 1;
1363
1364 hwc->sample_period = sample_period;
1365}
1366
1367static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1368{
1369 struct perf_event *event;
1370 struct hw_perf_event *hwc;
1371 u64 interrupts, freq;
1372
1373 spin_lock(&ctx->lock);
1374 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1375 if (event->state != PERF_EVENT_STATE_ACTIVE)
1376 continue;
1377
1378 hwc = &event->hw;
1379
1380 interrupts = hwc->interrupts;
1381 hwc->interrupts = 0;
1382
1383 /*
1384 * unthrottle events on the tick
1385 */
1386 if (interrupts == MAX_INTERRUPTS) {
1387 perf_log_throttle(event, 1);
1388 event->pmu->unthrottle(event);
1389 interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1390 }
1391
1392 if (!event->attr.freq || !event->attr.sample_freq)
1393 continue;
1394
1395 /*
1396 * if the specified freq < HZ then we need to skip ticks
1397 */
1398 if (event->attr.sample_freq < HZ) {
1399 freq = event->attr.sample_freq;
1400
1401 hwc->freq_count += freq;
1402 hwc->freq_interrupts += interrupts;
1403
1404 if (hwc->freq_count < HZ)
1405 continue;
1406
1407 interrupts = hwc->freq_interrupts;
1408 hwc->freq_interrupts = 0;
1409 hwc->freq_count -= HZ;
1410 } else
1411 freq = HZ;
1412
1413 perf_adjust_period(event, freq * interrupts);
1414
1415 /*
1416 * In order to avoid being stalled by an (accidental) huge
1417 * sample period, force reset the sample period if we didn't
1418 * get any events in this freq period.
1419 */
1420 if (!interrupts) {
1421 perf_disable();
1422 event->pmu->disable(event);
1423 atomic64_set(&hwc->period_left, 0);
1424 event->pmu->enable(event);
1425 perf_enable();
1426 }
1427 }
1428 spin_unlock(&ctx->lock);
1429}
1430
1431/*
1432 * Round-robin a context's events:
1433 */
1434static void rotate_ctx(struct perf_event_context *ctx)
1435{
1436 struct perf_event *event;
1437
1438 if (!ctx->nr_events)
1439 return;
1440
1441 spin_lock(&ctx->lock);
1442 /*
1443 * Rotate the first entry last (works just fine for group events too):
1444 */
1445 perf_disable();
1446 list_for_each_entry(event, &ctx->group_list, group_entry) {
1447 list_move_tail(&event->group_entry, &ctx->group_list);
1448 break;
1449 }
1450 perf_enable();
1451
1452 spin_unlock(&ctx->lock);
1453}
1454
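/*
 * Timer-tick housekeeping: re-adjust sampling frequencies and round-robin
 * the CPU and task contexts so that all events get a chance to run.
 */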
1455void perf_event_task_tick(struct task_struct *curr, int cpu)
1456{
1457 struct perf_cpu_context *cpuctx;
1458 struct perf_event_context *ctx;
1459
1460 if (!atomic_read(&nr_events))
1461 return;
1462
1463 cpuctx = &per_cpu(perf_cpu_context, cpu);
1464 ctx = curr->perf_event_ctxp;
1465
1466 perf_ctx_adjust_freq(&cpuctx->ctx);
1467 if (ctx)
1468 perf_ctx_adjust_freq(ctx);
1469
1470 perf_event_cpu_sched_out(cpuctx);
1471 if (ctx)
1472 __perf_event_task_sched_out(ctx);
1473
1474 rotate_ctx(&cpuctx->ctx);
1475 if (ctx)
1476 rotate_ctx(ctx);
1477
1478 perf_event_cpu_sched_in(cpuctx, cpu);
1479 if (ctx)
1480 perf_event_task_sched_in(curr, cpu);
1481}
1482
1483/*
1484 * Enable all of a task's events that have been marked enable-on-exec.
1485 * This expects task == current.
1486 */
1487static void perf_event_enable_on_exec(struct task_struct *task)
1488{
1489 struct perf_event_context *ctx;
1490 struct perf_event *event;
1491 unsigned long flags;
1492 int enabled = 0;
1493
1494 local_irq_save(flags);
1495 ctx = task->perf_event_ctxp;
1496 if (!ctx || !ctx->nr_events)
1497 goto out;
1498
1499 __perf_event_task_sched_out(ctx);
1500
1501 spin_lock(&ctx->lock);
1502
1503 list_for_each_entry(event, &ctx->group_list, group_entry) {
1504 if (!event->attr.enable_on_exec)
1505 continue;
1506 event->attr.enable_on_exec = 0;
1507 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1508 continue;
1509 __perf_event_mark_enabled(event, ctx);
1510 enabled = 1;
1511 }
1512
1513 /*
1514 * Unclone this context if we enabled any event.
1515 */
1516 if (enabled)
1517 unclone_ctx(ctx);
1518
1519 spin_unlock(&ctx->lock);
1520
1521 perf_event_task_sched_in(task, smp_processor_id());
1522 out:
1523 local_irq_restore(flags);
1524}
1525
1526/*
1527 * Cross CPU call to read the hardware event
1528 */
1529static void __perf_event_read(void *info)
1530{
1531 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1532 struct perf_event *event = info;
1533 struct perf_event_context *ctx = event->ctx;
1534
1535 /*
1536 * If this is a task context, we need to check whether it is
1537 * the current task context of this cpu. If not it has been
1538 * scheduled out before the smp call arrived. In that case
1539 * event->count would have been updated to a recent sample
1540 * when the event was scheduled out.
1541 */
1542 if (ctx->task && cpuctx->task_ctx != ctx)
1543 return;
1544
1545 spin_lock(&ctx->lock);
1546 update_context_time(ctx);
1547 update_event_times(event);
1548 spin_unlock(&ctx->lock);
1549
1550 event->pmu->read(event);
1551}
1552
1553static u64 perf_event_read(struct perf_event *event)
1554{
1555 /*
1556 * If event is enabled and currently active on a CPU, update the
1557 * value in the event structure:
1558 */
1559 if (event->state == PERF_EVENT_STATE_ACTIVE) {
1560 smp_call_function_single(event->oncpu,
1561 __perf_event_read, event, 1);
1562 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1563 struct perf_event_context *ctx = event->ctx;
1564 unsigned long flags;
1565
1566 spin_lock_irqsave(&ctx->lock, flags);
1567 update_context_time(ctx);
1568 update_event_times(event);
1569 spin_unlock_irqrestore(&ctx->lock, flags);
1570 }
1571
1572 return atomic64_read(&event->count);
1573}
1574
1575/*
1576 * Initialize the perf_event context in a task_struct:
1577 */
1578static void
1579__perf_event_init_context(struct perf_event_context *ctx,
1580 struct task_struct *task)
1581{
1582 memset(ctx, 0, sizeof(*ctx));
1583 spin_lock_init(&ctx->lock);
1584 mutex_init(&ctx->mutex);
1585 INIT_LIST_HEAD(&ctx->group_list);
1586 INIT_LIST_HEAD(&ctx->event_list);
1587 atomic_set(&ctx->refcount, 1);
1588 ctx->task = task;
1589}
1590
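/*
 * Find the event context for the given pid/cpu pair, allocating a task
 * context on demand, and return it with a reference held.
 */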
1591static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1592{
1593 struct perf_event_context *ctx;
1594 struct perf_cpu_context *cpuctx;
1595 struct task_struct *task;
1596 unsigned long flags;
1597 int err;
1598
1599 /*
1600 * If cpu is not a wildcard then this is a percpu event:
1601 */
1602 if (cpu != -1) {
1603 /* Must be root to operate on a CPU event: */
1604 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1605 return ERR_PTR(-EACCES);
1606
1607 if (cpu < 0 || cpu > num_possible_cpus())
1608 return ERR_PTR(-EINVAL);
1609
1610 /*
1611 * We could be clever and allow attaching an event to an
1612 * offline CPU and activate it when the CPU comes up, but
1613 * that's for later.
1614 */
1615 if (!cpu_isset(cpu, cpu_online_map))
1616 return ERR_PTR(-ENODEV);
1617
1618 cpuctx = &per_cpu(perf_cpu_context, cpu);
1619 ctx = &cpuctx->ctx;
1620 get_ctx(ctx);
1621
1622 return ctx;
1623 }
1624
1625 rcu_read_lock();
1626 if (!pid)
1627 task = current;
1628 else
1629 task = find_task_by_vpid(pid);
1630 if (task)
1631 get_task_struct(task);
1632 rcu_read_unlock();
1633
1634 if (!task)
1635 return ERR_PTR(-ESRCH);
1636
1637 /*
1638 * Can't attach events to a dying task.
1639 */
1640 err = -ESRCH;
1641 if (task->flags & PF_EXITING)
1642 goto errout;
1643
1644 /* Reuse ptrace permission checks for now. */
1645 err = -EACCES;
1646 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1647 goto errout;
1648
1649 retry:
1650 ctx = perf_lock_task_context(task, &flags);
1651 if (ctx) {
1652 unclone_ctx(ctx);
1653 spin_unlock_irqrestore(&ctx->lock, flags);
1654 }
1655
1656 if (!ctx) {
1657 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1658 err = -ENOMEM;
1659 if (!ctx)
1660 goto errout;
1661 __perf_event_init_context(ctx, task);
1662 get_ctx(ctx);
1663 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1664 /*
1665 * We raced with some other task; use
1666 * the context they set.
1667 */
1668 kfree(ctx);
1669 goto retry;
1670 }
1671 get_task_struct(task);
1672 }
1673
1674 put_task_struct(task);
1675 return ctx;
1676
1677 errout:
1678 put_task_struct(task);
1679 return ERR_PTR(err);
1680}
1681
1682static void perf_event_free_filter(struct perf_event *event);
1683
1684static void free_event_rcu(struct rcu_head *head)
1685{
1686 struct perf_event *event;
1687
1688 event = container_of(head, struct perf_event, rcu_head);
1689 if (event->ns)
1690 put_pid_ns(event->ns);
1691 perf_event_free_filter(event);
1692 kfree(event);
1693}
1694
1695static void perf_pending_sync(struct perf_event *event);
1696
1697static void free_event(struct perf_event *event)
1698{
1699 perf_pending_sync(event);
1700
1701 if (!event->parent) {
1702 atomic_dec(&nr_events);
1703 if (event->attr.mmap)
1704 atomic_dec(&nr_mmap_events);
1705 if (event->attr.comm)
1706 atomic_dec(&nr_comm_events);
1707 if (event->attr.task)
1708 atomic_dec(&nr_task_events);
1709 }
1710
1711 if (event->output) {
1712 fput(event->output->filp);
1713 event->output = NULL;
1714 }
1715
1716 if (event->destroy)
1717 event->destroy(event);
1718
1719 put_ctx(event->ctx);
1720 call_rcu(&event->rcu_head, free_event_rcu);
1721}
1722
1723int perf_event_release_kernel(struct perf_event *event)
1724{
1725 struct perf_event_context *ctx = event->ctx;
1726
1727 WARN_ON_ONCE(ctx->parent_ctx);
1728 mutex_lock(&ctx->mutex);
1729 perf_event_remove_from_context(event);
1730 mutex_unlock(&ctx->mutex);
1731
1732 mutex_lock(&event->owner->perf_event_mutex);
1733 list_del_init(&event->owner_entry);
1734 mutex_unlock(&event->owner->perf_event_mutex);
1735 put_task_struct(event->owner);
1736
1737 free_event(event);
1738
1739 return 0;
1740}
1741EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1742
1743/*
1744 * Called when the last reference to the file is gone.
1745 */
1746static int perf_release(struct inode *inode, struct file *file)
1747{
1748 struct perf_event *event = file->private_data;
1749
1750 file->private_data = NULL;
1751
1752 return perf_event_release_kernel(event);
1753}
1754
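/*
 * Work out how many bytes a read() on this event will return, based on
 * the read_format bits (time fields, IDs and group siblings).
 */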
1755static int perf_event_read_size(struct perf_event *event)
1756{
1757 int entry = sizeof(u64); /* value */
1758 int size = 0;
1759 int nr = 1;
1760
1761 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1762 size += sizeof(u64);
1763
1764 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1765 size += sizeof(u64);
1766
1767 if (event->attr.read_format & PERF_FORMAT_ID)
1768 entry += sizeof(u64);
1769
1770 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1771 nr += event->group_leader->nr_siblings;
1772 size += sizeof(u64);
1773 }
1774
1775 size += entry * nr;
1776
1777 return size;
1778}
1779
1780u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1781{
1782 struct perf_event *child;
1783 u64 total = 0;
1784
1785 *enabled = 0;
1786 *running = 0;
1787
1788 mutex_lock(&event->child_mutex);
1789 total += perf_event_read(event);
1790 *enabled += event->total_time_enabled +
1791 atomic64_read(&event->child_total_time_enabled);
1792 *running += event->total_time_running +
1793 atomic64_read(&event->child_total_time_running);
1794
1795 list_for_each_entry(child, &event->child_list, child_list) {
1796 total += perf_event_read(child);
1797 *enabled += child->total_time_enabled;
1798 *running += child->total_time_running;
1799 }
1800 mutex_unlock(&event->child_mutex);
1801
1802 return total;
1803}
1804EXPORT_SYMBOL_GPL(perf_event_read_value);
1805
1806static int perf_event_read_group(struct perf_event *event,
1807 u64 read_format, char __user *buf)
1808{
1809 struct perf_event *leader = event->group_leader, *sub;
1810 int n = 0, size = 0, ret = -EFAULT;
1811 struct perf_event_context *ctx = leader->ctx;
1812 u64 values[5];
1813 u64 count, enabled, running;
1814
1815 mutex_lock(&ctx->mutex);
1816 count = perf_event_read_value(leader, &enabled, &running);
1817
1818 values[n++] = 1 + leader->nr_siblings;
1819 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1820 values[n++] = enabled;
1821 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1822 values[n++] = running;
1823 values[n++] = count;
1824 if (read_format & PERF_FORMAT_ID)
1825 values[n++] = primary_event_id(leader);
1826
1827 size = n * sizeof(u64);
1828
1829 if (copy_to_user(buf, values, size))
1830 goto unlock;
1831
1832 ret = size;
1833
1834 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1835 n = 0;
1836
1837 values[n++] = perf_event_read_value(sub, &enabled, &running);
1838 if (read_format & PERF_FORMAT_ID)
1839 values[n++] = primary_event_id(sub);
1840
1841 size = n * sizeof(u64);
1842
1843 if (copy_to_user(buf + ret, values, size)) {
1844 ret = -EFAULT;
1845 goto unlock;
1846 }
1847
1848 ret += size;
1849 }
1850unlock:
1851 mutex_unlock(&ctx->mutex);
1852
1853 return ret;
1854}
1855
1856static int perf_event_read_one(struct perf_event *event,
1857 u64 read_format, char __user *buf)
1858{
1859 u64 enabled, running;
1860 u64 values[4];
1861 int n = 0;
1862
1863 values[n++] = perf_event_read_value(event, &enabled, &running);
1864 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1865 values[n++] = enabled;
1866 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1867 values[n++] = running;
1868 if (read_format & PERF_FORMAT_ID)
1869 values[n++] = primary_event_id(event);
1870
1871 if (copy_to_user(buf, values, n * sizeof(u64)))
1872 return -EFAULT;
1873
1874 return n * sizeof(u64);
1875}
1876
1877/*
1878 * Read the performance event - simple non blocking version for now
1879 */
1880static ssize_t
1881perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1882{
1883 u64 read_format = event->attr.read_format;
1884 int ret;
1885
1886 /*
1887 * Return end-of-file for a read on an event that is in
1888 * error state (i.e. because it was pinned but it couldn't be
1889 * scheduled on to the CPU at some point).
1890 */
1891 if (event->state == PERF_EVENT_STATE_ERROR)
1892 return 0;
1893
1894 if (count < perf_event_read_size(event))
1895 return -ENOSPC;
1896
1897 WARN_ON_ONCE(event->ctx->parent_ctx);
1898 if (read_format & PERF_FORMAT_GROUP)
1899 ret = perf_event_read_group(event, read_format, buf);
1900 else
1901 ret = perf_event_read_one(event, read_format, buf);
1902
1903 return ret;
1904}
1905
1906static ssize_t
1907perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1908{
1909 struct perf_event *event = file->private_data;
1910
1911 return perf_read_hw(event, buf, count);
1912}
1913
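/*
 * poll() support: report any pending wakeup flags and queue the caller
 * on the event's waitqueue.
 */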
1914static unsigned int perf_poll(struct file *file, poll_table *wait)
1915{
1916 struct perf_event *event = file->private_data;
1917 struct perf_mmap_data *data;
1918 unsigned int events = POLL_HUP;
1919
1920 rcu_read_lock();
1921 data = rcu_dereference(event->data);
1922 if (data)
1923 events = atomic_xchg(&data->poll, 0);
1924 rcu_read_unlock();
1925
1926 poll_wait(file, &event->waitq, wait);
1927
1928 return events;
1929}
1930
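/*
 * Sync the count from the hardware, then zero it and update the
 * user-visible mmap() page.
 */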
1931static void perf_event_reset(struct perf_event *event)
1932{
1933 (void)perf_event_read(event);
1934 atomic64_set(&event->count, 0);
1935 perf_event_update_userpage(event);
1936}
1937
1938/*
1939 * Holding the top-level event's child_mutex means that any
1940 * descendant process that has inherited this event will block
1941 * in sync_child_event if it goes to exit, thus satisfying the
1942 * task existence requirements of perf_event_enable/disable.
1943 */
1944static void perf_event_for_each_child(struct perf_event *event,
1945 void (*func)(struct perf_event *))
1946{
1947 struct perf_event *child;
1948
1949 WARN_ON_ONCE(event->ctx->parent_ctx);
1950 mutex_lock(&event->child_mutex);
1951 func(event);
1952 list_for_each_entry(child, &event->child_list, child_list)
1953 func(child);
1954 mutex_unlock(&event->child_mutex);
1955}
1956
1957static void perf_event_for_each(struct perf_event *event,
1958 void (*func)(struct perf_event *))
1959{
1960 struct perf_event_context *ctx = event->ctx;
1961 struct perf_event *sibling;
1962
1963 WARN_ON_ONCE(ctx->parent_ctx);
1964 mutex_lock(&ctx->mutex);
1965 event = event->group_leader;
1966
1967 perf_event_for_each_child(event, func);
1968 func(event);
1969 list_for_each_entry(sibling, &event->sibling_list, group_entry)
1970 perf_event_for_each_child(event, func);
1971 mutex_unlock(&ctx->mutex);
1972}
1973
1974static int perf_event_period(struct perf_event *event, u64 __user *arg)
1975{
1976 struct perf_event_context *ctx = event->ctx;
1977 unsigned long size;
1978 int ret = 0;
1979 u64 value;
1980
1981 if (!event->attr.sample_period)
1982 return -EINVAL;
1983
1984 size = copy_from_user(&value, arg, sizeof(value));
1985 if (size != sizeof(value))
1986 return -EFAULT;
1987
1988 if (!value)
1989 return -EINVAL;
1990
1991 spin_lock_irq(&ctx->lock);
1992 if (event->attr.freq) {
1993 if (value > sysctl_perf_event_sample_rate) {
1994 ret = -EINVAL;
1995 goto unlock;
1996 }
1997
1998 event->attr.sample_freq = value;
1999 } else {
2000 event->attr.sample_period = value;
2001 event->hw.sample_period = value;
2002 }
2003unlock:
2004 spin_unlock_irq(&ctx->lock);
2005
2006 return ret;
2007}
2008
2009static int perf_event_set_output(struct perf_event *event, int output_fd);
2010static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2011
2012static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2013{
2014 struct perf_event *event = file->private_data;
2015 void (*func)(struct perf_event *);
2016 u32 flags = arg;
2017
2018 switch (cmd) {
2019 case PERF_EVENT_IOC_ENABLE:
2020 func = perf_event_enable;
2021 break;
2022 case PERF_EVENT_IOC_DISABLE:
2023 func = perf_event_disable;
2024 break;
2025 case PERF_EVENT_IOC_RESET:
2026 func = perf_event_reset;
2027 break;
2028
2029 case PERF_EVENT_IOC_REFRESH:
2030 return perf_event_refresh(event, arg);
2031
2032 case PERF_EVENT_IOC_PERIOD:
2033 return perf_event_period(event, (u64 __user *)arg);
2034
2035 case PERF_EVENT_IOC_SET_OUTPUT:
2036 return perf_event_set_output(event, arg);
2037
2038 case PERF_EVENT_IOC_SET_FILTER:
2039 return perf_event_set_filter(event, (void __user *)arg);
2040
2041 default:
2042 return -ENOTTY;
2043 }
2044
2045 if (flags & PERF_IOC_FLAG_GROUP)
2046 perf_event_for_each(event, func);
2047 else
2048 perf_event_for_each_child(event, func);
2049
2050 return 0;
2051}
2052
2053int perf_event_task_enable(void)
2054{
2055 struct perf_event *event;
2056
2057 mutex_lock(&current->perf_event_mutex);
2058 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2059 perf_event_for_each_child(event, perf_event_enable);
2060 mutex_unlock(&current->perf_event_mutex);
2061
2062 return 0;
2063}
2064
2065int perf_event_task_disable(void)
2066{
2067 struct perf_event *event;
2068
2069 mutex_lock(&current->perf_event_mutex);
2070 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2071 perf_event_for_each_child(event, perf_event_disable);
2072 mutex_unlock(&current->perf_event_mutex);
2073
2074 return 0;
2075}
2076
2077#ifndef PERF_EVENT_INDEX_OFFSET
2078# define PERF_EVENT_INDEX_OFFSET 0
2079#endif
2080
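/*
 * Hardware counter index as exported to user space via the mmap()
 * control page; 0 means the event is not currently active.
 */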
2081static int perf_event_index(struct perf_event *event)
2082{
2083 if (event->state != PERF_EVENT_STATE_ACTIVE)
2084 return 0;
2085
2086 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2087}
2088
2089/*
2090 * Callers need to ensure there can be no nesting of this function, otherwise
2091 * the seqlock logic goes bad. We cannot serialize this because the arch
2092 * code calls this from NMI context.
2093 */
2094void perf_event_update_userpage(struct perf_event *event)
2095{
2096 struct perf_event_mmap_page *userpg;
2097 struct perf_mmap_data *data;
2098
2099 rcu_read_lock();
2100 data = rcu_dereference(event->data);
2101 if (!data)
2102 goto unlock;
2103
2104 userpg = data->user_page;
2105
2106 /*
2107 * Disable preemption so as to not let the corresponding user-space
2108 * spin too long if we get preempted.
2109 */
2110 preempt_disable();
2111 ++userpg->lock;
2112 barrier();
2113 userpg->index = perf_event_index(event);
2114 userpg->offset = atomic64_read(&event->count);
2115 if (event->state == PERF_EVENT_STATE_ACTIVE)
2116 userpg->offset -= atomic64_read(&event->hw.prev_count);
2117
2118 userpg->time_enabled = event->total_time_enabled +
2119 atomic64_read(&event->child_total_time_enabled);
2120
2121 userpg->time_running = event->total_time_running +
2122 atomic64_read(&event->child_total_time_running);
2123
2124 barrier();
2125 ++userpg->lock;
2126 preempt_enable();
2127unlock:
2128 rcu_read_unlock();
2129}
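
/*
 * Illustrative sketch (not part of this file): the user-space reader that
 * pairs with the ->lock/barrier() sequence above.  It retries whenever the
 * lock word changes under it.  barrier() stands for a plain compiler barrier
 * (e.g. asm volatile("" ::: "memory")) and read_pmc() is a hypothetical
 * helper that reads the hardware counter selected by index - 1 (RDPMC on
 * x86); both are assumptions, not part of the kernel API.
 */
#if 0	/* example only, never compiled as part of this file */
static __u64 read_self_count(volatile struct perf_event_mmap_page *pc)
{
	__u32 seq;
	__u64 count;

	do {
		seq = pc->lock;
		barrier();

		count = pc->offset;
		if (pc->index)
			count += read_pmc(pc->index - 1);

		barrier();
	} while (pc->lock != seq);

	return count;
}
#endif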
2130
2131static unsigned long perf_data_size(struct perf_mmap_data *data)
2132{
2133 return data->nr_pages << (PAGE_SHIFT + data->data_order);
2134}
2135
2136#ifndef CONFIG_PERF_USE_VMALLOC
2137
2138/*
 2139 * Back perf_mmap() with regular order-0 GFP_KERNEL pages.
2140 */
2141
2142static struct page *
2143perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2144{
2145 if (pgoff > data->nr_pages)
2146 return NULL;
2147
2148 if (pgoff == 0)
2149 return virt_to_page(data->user_page);
2150
2151 return virt_to_page(data->data_pages[pgoff - 1]);
2152}
2153
2154static struct perf_mmap_data *
2155perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2156{
2157 struct perf_mmap_data *data;
2158 unsigned long size;
2159 int i;
2160
2161 WARN_ON(atomic_read(&event->mmap_count));
2162
2163 size = sizeof(struct perf_mmap_data);
2164 size += nr_pages * sizeof(void *);
2165
2166 data = kzalloc(size, GFP_KERNEL);
2167 if (!data)
2168 goto fail;
2169
2170 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2171 if (!data->user_page)
2172 goto fail_user_page;
2173
2174 for (i = 0; i < nr_pages; i++) {
2175 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2176 if (!data->data_pages[i])
2177 goto fail_data_pages;
2178 }
2179
2180 data->data_order = 0;
2181 data->nr_pages = nr_pages;
2182
2183 return data;
2184
2185fail_data_pages:
2186 for (i--; i >= 0; i--)
2187 free_page((unsigned long)data->data_pages[i]);
2188
2189 free_page((unsigned long)data->user_page);
2190
2191fail_user_page:
2192 kfree(data);
2193
2194fail:
2195 return NULL;
2196}
2197
2198static void perf_mmap_free_page(unsigned long addr)
2199{
2200 struct page *page = virt_to_page((void *)addr);
2201
2202 page->mapping = NULL;
2203 __free_page(page);
2204}
2205
2206static void perf_mmap_data_free(struct perf_mmap_data *data)
2207{
2208 int i;
2209
2210 perf_mmap_free_page((unsigned long)data->user_page);
2211 for (i = 0; i < data->nr_pages; i++)
2212 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2213 kfree(data);
2214}
2215
2216#else
2217
2218/*
2219 * Back perf_mmap() with vmalloc memory.
2220 *
2221 * Required for architectures that have d-cache aliasing issues.
2222 */
2223
2224static struct page *
2225perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2226{
2227 if (pgoff > (1UL << data->data_order))
2228 return NULL;
2229
2230 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2231}
2232
2233static void perf_mmap_unmark_page(void *addr)
2234{
2235 struct page *page = vmalloc_to_page(addr);
2236
2237 page->mapping = NULL;
2238}
2239
2240static void perf_mmap_data_free_work(struct work_struct *work)
2241{
2242 struct perf_mmap_data *data;
2243 void *base;
2244 int i, nr;
2245
2246 data = container_of(work, struct perf_mmap_data, work);
2247 nr = 1 << data->data_order;
2248
2249 base = data->user_page;
2250 for (i = 0; i < nr + 1; i++)
2251 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2252
2253 vfree(base);
2254 kfree(data);
2255}
2256
2257static void perf_mmap_data_free(struct perf_mmap_data *data)
2258{
2259 schedule_work(&data->work);
2260}
2261
2262static struct perf_mmap_data *
2263perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2264{
2265 struct perf_mmap_data *data;
2266 unsigned long size;
2267 void *all_buf;
2268
2269 WARN_ON(atomic_read(&event->mmap_count));
2270
2271 size = sizeof(struct perf_mmap_data);
2272 size += sizeof(void *);
2273
2274 data = kzalloc(size, GFP_KERNEL);
2275 if (!data)
2276 goto fail;
2277
2278 INIT_WORK(&data->work, perf_mmap_data_free_work);
2279
2280 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2281 if (!all_buf)
2282 goto fail_all_buf;
2283
2284 data->user_page = all_buf;
2285 data->data_pages[0] = all_buf + PAGE_SIZE;
2286 data->data_order = ilog2(nr_pages);
2287 data->nr_pages = 1;
2288
2289 return data;
2290
2291fail_all_buf:
2292 kfree(data);
2293
2294fail:
2295 return NULL;
2296}
2297
2298#endif
2299
2300static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2301{
2302 struct perf_event *event = vma->vm_file->private_data;
2303 struct perf_mmap_data *data;
2304 int ret = VM_FAULT_SIGBUS;
2305
2306 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2307 if (vmf->pgoff == 0)
2308 ret = 0;
2309 return ret;
2310 }
2311
2312 rcu_read_lock();
2313 data = rcu_dereference(event->data);
2314 if (!data)
2315 goto unlock;
2316
2317 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2318 goto unlock;
2319
2320 vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2321 if (!vmf->page)
2322 goto unlock;
2323
2324 get_page(vmf->page);
2325 vmf->page->mapping = vma->vm_file->f_mapping;
2326 vmf->page->index = vmf->pgoff;
2327
2328 ret = 0;
2329unlock:
2330 rcu_read_unlock();
2331
2332 return ret;
2333}
2334
2335static void
2336perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2337{
2338 long max_size = perf_data_size(data);
2339
2340 atomic_set(&data->lock, -1);
2341
2342 if (event->attr.watermark) {
2343 data->watermark = min_t(long, max_size,
2344 event->attr.wakeup_watermark);
2345 }
2346
2347 if (!data->watermark)
2348 data->watermark = max_size / 2;
2349
2350
2351 rcu_assign_pointer(event->data, data);
2352}
2353
2354static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2355{
2356 struct perf_mmap_data *data;
2357
2358 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2359 perf_mmap_data_free(data);
2360}
2361
2362static void perf_mmap_data_release(struct perf_event *event)
2363{
2364 struct perf_mmap_data *data = event->data;
2365
2366 WARN_ON(atomic_read(&event->mmap_count));
2367
2368 rcu_assign_pointer(event->data, NULL);
2369 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2370}
2371
2372static void perf_mmap_open(struct vm_area_struct *vma)
2373{
2374 struct perf_event *event = vma->vm_file->private_data;
2375
2376 atomic_inc(&event->mmap_count);
2377}
2378
2379static void perf_mmap_close(struct vm_area_struct *vma)
2380{
2381 struct perf_event *event = vma->vm_file->private_data;
2382
2383 WARN_ON_ONCE(event->ctx->parent_ctx);
2384 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2385 unsigned long size = perf_data_size(event->data);
2386 struct user_struct *user = current_user();
2387
2388 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2389 vma->vm_mm->locked_vm -= event->data->nr_locked;
2390 perf_mmap_data_release(event);
2391 mutex_unlock(&event->mmap_mutex);
2392 }
2393}
2394
2395static const struct vm_operations_struct perf_mmap_vmops = {
2396 .open = perf_mmap_open,
2397 .close = perf_mmap_close,
2398 .fault = perf_mmap_fault,
2399 .page_mkwrite = perf_mmap_fault,
2400};
2401
2402static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2403{
2404 struct perf_event *event = file->private_data;
2405 unsigned long user_locked, user_lock_limit;
2406 struct user_struct *user = current_user();
2407 unsigned long locked, lock_limit;
2408 struct perf_mmap_data *data;
2409 unsigned long vma_size;
2410 unsigned long nr_pages;
2411 long user_extra, extra;
2412 int ret = 0;
2413
2414 if (!(vma->vm_flags & VM_SHARED))
2415 return -EINVAL;
2416
2417 vma_size = vma->vm_end - vma->vm_start;
2418 nr_pages = (vma_size / PAGE_SIZE) - 1;
2419
2420 /*
2421 * If we have data pages ensure they're a power-of-two number, so we
2422 * can do bitmasks instead of modulo.
2423 */
2424 if (nr_pages != 0 && !is_power_of_2(nr_pages))
2425 return -EINVAL;
2426
2427 if (vma_size != PAGE_SIZE * (1 + nr_pages))
2428 return -EINVAL;
2429
2430 if (vma->vm_pgoff != 0)
2431 return -EINVAL;
2432
2433 WARN_ON_ONCE(event->ctx->parent_ctx);
2434 mutex_lock(&event->mmap_mutex);
2435 if (event->output) {
2436 ret = -EINVAL;
2437 goto unlock;
2438 }
2439
2440 if (atomic_inc_not_zero(&event->mmap_count)) {
2441 if (nr_pages != event->data->nr_pages)
2442 ret = -EINVAL;
2443 goto unlock;
2444 }
2445
2446 user_extra = nr_pages + 1;
2447 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2448
2449 /*
2450 * Increase the limit linearly with more CPUs:
2451 */
2452 user_lock_limit *= num_online_cpus();
2453
2454 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2455
2456 extra = 0;
2457 if (user_locked > user_lock_limit)
2458 extra = user_locked - user_lock_limit;
2459
2460 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2461 lock_limit >>= PAGE_SHIFT;
2462 locked = vma->vm_mm->locked_vm + extra;
2463
2464 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2465 !capable(CAP_IPC_LOCK)) {
2466 ret = -EPERM;
2467 goto unlock;
2468 }
2469
2470 WARN_ON(event->data);
2471
2472 data = perf_mmap_data_alloc(event, nr_pages);
2473 ret = -ENOMEM;
2474 if (!data)
2475 goto unlock;
2476
2477 ret = 0;
2478 perf_mmap_data_init(event, data);
2479
2480 atomic_set(&event->mmap_count, 1);
2481 atomic_long_add(user_extra, &user->locked_vm);
2482 vma->vm_mm->locked_vm += extra;
2483 event->data->nr_locked = extra;
2484 if (vma->vm_flags & VM_WRITE)
2485 event->data->writable = 1;
2486
2487unlock:
2488 mutex_unlock(&event->mmap_mutex);
2489
2490 vma->vm_flags |= VM_RESERVED;
2491 vma->vm_ops = &perf_mmap_vmops;
2492
2493 return ret;
2494}
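
/*
 * Illustrative sketch (not part of this file): the mapping layout expected
 * by perf_mmap() above -- one control page followed by a power-of-two number
 * of data pages, mapped shared at offset 0.  'fd' is assumed to come from
 * the perf_event_open() syscall.
 */
#if 0	/* example only, never compiled as part of this file */
#include <sys/mman.h>
#include <unistd.h>

static void *map_ring_buffer(int fd, size_t data_pages)	/* power of two */
{
	size_t len = (data_pages + 1) * (size_t)sysconf(_SC_PAGESIZE);

	return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}
#endif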
2495
2496static int perf_fasync(int fd, struct file *filp, int on)
2497{
2498 struct inode *inode = filp->f_path.dentry->d_inode;
2499 struct perf_event *event = filp->private_data;
2500 int retval;
2501
2502 mutex_lock(&inode->i_mutex);
2503 retval = fasync_helper(fd, filp, on, &event->fasync);
2504 mutex_unlock(&inode->i_mutex);
2505
2506 if (retval < 0)
2507 return retval;
2508
2509 return 0;
2510}
2511
2512static const struct file_operations perf_fops = {
2513 .release = perf_release,
2514 .read = perf_read,
2515 .poll = perf_poll,
2516 .unlocked_ioctl = perf_ioctl,
2517 .compat_ioctl = perf_ioctl,
2518 .mmap = perf_mmap,
2519 .fasync = perf_fasync,
2520};
2521
2522/*
2523 * Perf event wakeup
2524 *
2525 * If there's data, ensure we set the poll() state and publish everything
2526 * to user-space before waking everybody up.
2527 */
2528
2529void perf_event_wakeup(struct perf_event *event)
2530{
2531 wake_up_all(&event->waitq);
2532
2533 if (event->pending_kill) {
2534 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2535 event->pending_kill = 0;
2536 }
2537}
2538
2539/*
2540 * Pending wakeups
2541 *
 2542 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2543 *
2544 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2545 * single linked list and use cmpxchg() to add entries lockless.
2546 */
2547
2548static void perf_pending_event(struct perf_pending_entry *entry)
2549{
2550 struct perf_event *event = container_of(entry,
2551 struct perf_event, pending);
2552
2553 if (event->pending_disable) {
2554 event->pending_disable = 0;
2555 __perf_event_disable(event);
2556 }
2557
2558 if (event->pending_wakeup) {
2559 event->pending_wakeup = 0;
2560 perf_event_wakeup(event);
2561 }
2562}
2563
2564#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2565
2566static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2567 PENDING_TAIL,
2568};
2569
2570static void perf_pending_queue(struct perf_pending_entry *entry,
2571 void (*func)(struct perf_pending_entry *))
2572{
2573 struct perf_pending_entry **head;
2574
2575 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2576 return;
2577
2578 entry->func = func;
2579
2580 head = &get_cpu_var(perf_pending_head);
2581
2582 do {
2583 entry->next = *head;
2584 } while (cmpxchg(head, entry->next, entry) != entry->next);
2585
2586 set_perf_event_pending();
2587
2588 put_cpu_var(perf_pending_head);
2589}
2590
2591static int __perf_pending_run(void)
2592{
2593 struct perf_pending_entry *list;
2594 int nr = 0;
2595
2596 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2597 while (list != PENDING_TAIL) {
2598 void (*func)(struct perf_pending_entry *);
2599 struct perf_pending_entry *entry = list;
2600
2601 list = list->next;
2602
2603 func = entry->func;
2604 entry->next = NULL;
2605 /*
2606 * Ensure we observe the unqueue before we issue the wakeup,
2607 * so that we won't be waiting forever.
2608 * -- see perf_not_pending().
2609 */
2610 smp_wmb();
2611
2612 func(entry);
2613 nr++;
2614 }
2615
2616 return nr;
2617}
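
/*
 * Illustrative sketch (not part of this file): the same single-linked,
 * cmpxchg()-based queue idiom used by perf_pending_queue() and
 * __perf_pending_run() above, rewritten with GCC's __sync builtins so the
 * pattern can be read outside kernel context.  TAIL plays the role of
 * PENDING_TAIL: a non-NULL terminator that doubles as the "not yet queued"
 * marker.
 */
#if 0	/* example only, never compiled as part of this file */
struct node { struct node *next; };

#define TAIL ((struct node *)-1UL)
static struct node *queue_head = TAIL;

static void push(struct node *n)
{
	if (__sync_val_compare_and_swap(&n->next, NULL, TAIL) != NULL)
		return;			/* already queued */

	do {
		n->next = queue_head;
	} while (__sync_val_compare_and_swap(&queue_head, n->next, n) != n->next);
}

static void drain(void (*func)(struct node *))
{
	/* atomic exchange, analogous to the xchg() above */
	struct node *list = __sync_lock_test_and_set(&queue_head, TAIL);

	while (list != TAIL) {
		struct node *n = list;

		list = list->next;
		n->next = NULL;		/* mark "not queued" again */
		func(n);
	}
}
#endif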
2618
2619static inline int perf_not_pending(struct perf_event *event)
2620{
2621 /*
2622 * If we flush on whatever cpu we run, there is a chance we don't
2623 * need to wait.
2624 */
2625 get_cpu();
2626 __perf_pending_run();
2627 put_cpu();
2628
2629 /*
2630 * Ensure we see the proper queue state before going to sleep
 2631	 * so that we do not miss the wakeup. -- see __perf_pending_run()
2632 */
2633 smp_rmb();
2634 return event->pending.next == NULL;
2635}
2636
2637static void perf_pending_sync(struct perf_event *event)
2638{
2639 wait_event(event->waitq, perf_not_pending(event));
2640}
2641
2642void perf_event_do_pending(void)
2643{
2644 __perf_pending_run();
2645}
2646
2647/*
2648 * Callchain support -- arch specific
2649 */
2650
2651__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2652{
2653 return NULL;
2654}
2655
2656/*
2657 * Output
2658 */
2659static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2660 unsigned long offset, unsigned long head)
2661{
2662 unsigned long mask;
2663
2664 if (!data->writable)
2665 return true;
2666
2667 mask = perf_data_size(data) - 1;
2668
2669 offset = (offset - tail) & mask;
2670 head = (head - tail) & mask;
2671
2672 if ((int)(head - offset) < 0)
2673 return false;
2674
2675 return true;
2676}
2677
2678static void perf_output_wakeup(struct perf_output_handle *handle)
2679{
2680 atomic_set(&handle->data->poll, POLL_IN);
2681
2682 if (handle->nmi) {
2683 handle->event->pending_wakeup = 1;
2684 perf_pending_queue(&handle->event->pending,
2685 perf_pending_event);
2686 } else
2687 perf_event_wakeup(handle->event);
2688}
2689
2690/*
2691 * Curious locking construct.
2692 *
 2693 * We need to ensure a later event doesn't publish a head when a former
 2694 * event isn't done writing. However since we need to deal with NMIs we
 2695 * cannot fully serialize things.
 2696 *
 2697 * What we do is serialize between CPUs so we only have to deal with NMI
 2698 * nesting on a single CPU.
 2699 *
 2700 * We only publish the head (and generate a wakeup) when the outer-most
 2701 * event completes.
2702 */
2703static void perf_output_lock(struct perf_output_handle *handle)
2704{
2705 struct perf_mmap_data *data = handle->data;
2706 int cur, cpu = get_cpu();
2707
2708 handle->locked = 0;
2709
2710 for (;;) {
2711 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2712 if (cur == -1) {
2713 handle->locked = 1;
2714 break;
2715 }
2716 if (cur == cpu)
2717 break;
2718
2719 cpu_relax();
2720 }
2721}
2722
2723static void perf_output_unlock(struct perf_output_handle *handle)
2724{
2725 struct perf_mmap_data *data = handle->data;
2726 unsigned long head;
2727 int cpu;
2728
2729 data->done_head = data->head;
2730
2731 if (!handle->locked)
2732 goto out;
2733
2734again:
2735 /*
2736 * The xchg implies a full barrier that ensures all writes are done
2737 * before we publish the new head, matched by a rmb() in userspace when
2738 * reading this position.
2739 */
2740 while ((head = atomic_long_xchg(&data->done_head, 0)))
2741 data->user_page->data_head = head;
2742
2743 /*
2744 * NMI can happen here, which means we can miss a done_head update.
2745 */
2746
2747 cpu = atomic_xchg(&data->lock, -1);
2748 WARN_ON_ONCE(cpu != smp_processor_id());
2749
2750 /*
2751 * Therefore we have to validate we did not indeed do so.
2752 */
2753 if (unlikely(atomic_long_read(&data->done_head))) {
2754 /*
2755 * Since we had it locked, we can lock it again.
2756 */
2757 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2758 cpu_relax();
2759
2760 goto again;
2761 }
2762
2763 if (atomic_xchg(&data->wakeup, 0))
2764 perf_output_wakeup(handle);
2765out:
2766 put_cpu();
2767}
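
/*
 * Illustrative sketch (not part of this file): the user-space consumer that
 * the rmb()/mb() comments above and in perf_output_begin() refer to.  The
 * reader samples data_head, orders its record reads after that load, and
 * only then publishes a new data_tail so the kernel may reuse the space.
 * process_records() is a hypothetical callback; rmb()/mb() stand for the
 * architecture's read and full memory barriers.
 */
#if 0	/* example only, never compiled as part of this file */
static void drain_ring(struct perf_event_mmap_page *pc, void *data, __u64 mask,
		       void (*process_records)(void *data, __u64 tail,
						__u64 head, __u64 mask))
{
	__u64 head = pc->data_head;

	rmb();				/* pairs with the publishing xchg above */
	process_records(data, pc->data_tail, head, mask);
	mb();				/* finish reading before freeing the space */

	pc->data_tail = head;
}
#endif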
2768
2769void perf_output_copy(struct perf_output_handle *handle,
2770 const void *buf, unsigned int len)
2771{
2772 unsigned int pages_mask;
2773 unsigned long offset;
2774 unsigned int size;
2775 void **pages;
2776
2777 offset = handle->offset;
2778 pages_mask = handle->data->nr_pages - 1;
2779 pages = handle->data->data_pages;
2780
2781 do {
2782 unsigned long page_offset;
2783 unsigned long page_size;
2784 int nr;
2785
2786 nr = (offset >> PAGE_SHIFT) & pages_mask;
2787 page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2788 page_offset = offset & (page_size - 1);
2789 size = min_t(unsigned int, page_size - page_offset, len);
2790
2791 memcpy(pages[nr] + page_offset, buf, size);
2792
2793 len -= size;
2794 buf += size;
2795 offset += size;
2796 } while (len);
2797
2798 handle->offset = offset;
2799
2800 /*
2801 * Check we didn't copy past our reservation window, taking the
2802 * possible unsigned int wrap into account.
2803 */
2804 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2805}
2806
2807int perf_output_begin(struct perf_output_handle *handle,
2808 struct perf_event *event, unsigned int size,
2809 int nmi, int sample)
2810{
2811 struct perf_event *output_event;
2812 struct perf_mmap_data *data;
2813 unsigned long tail, offset, head;
2814 int have_lost;
2815 struct {
2816 struct perf_event_header header;
2817 u64 id;
2818 u64 lost;
2819 } lost_event;
2820
2821 rcu_read_lock();
2822 /*
2823 * For inherited events we send all the output towards the parent.
2824 */
2825 if (event->parent)
2826 event = event->parent;
2827
2828 output_event = rcu_dereference(event->output);
2829 if (output_event)
2830 event = output_event;
2831
2832 data = rcu_dereference(event->data);
2833 if (!data)
2834 goto out;
2835
2836 handle->data = data;
2837 handle->event = event;
2838 handle->nmi = nmi;
2839 handle->sample = sample;
2840
2841 if (!data->nr_pages)
2842 goto fail;
2843
2844 have_lost = atomic_read(&data->lost);
2845 if (have_lost)
2846 size += sizeof(lost_event);
2847
2848 perf_output_lock(handle);
2849
2850 do {
2851 /*
2852 * Userspace could choose to issue a mb() before updating the
 2853		 * tail pointer, so that all reads will be completed before the
2854 * write is issued.
2855 */
2856 tail = ACCESS_ONCE(data->user_page->data_tail);
2857 smp_rmb();
2858 offset = head = atomic_long_read(&data->head);
2859 head += size;
2860 if (unlikely(!perf_output_space(data, tail, offset, head)))
2861 goto fail;
2862 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2863
2864 handle->offset = offset;
2865 handle->head = head;
2866
2867 if (head - tail > data->watermark)
2868 atomic_set(&data->wakeup, 1);
2869
2870 if (have_lost) {
2871 lost_event.header.type = PERF_RECORD_LOST;
2872 lost_event.header.misc = 0;
2873 lost_event.header.size = sizeof(lost_event);
2874 lost_event.id = event->id;
2875 lost_event.lost = atomic_xchg(&data->lost, 0);
2876
2877 perf_output_put(handle, lost_event);
2878 }
2879
2880 return 0;
2881
2882fail:
2883 atomic_inc(&data->lost);
2884 perf_output_unlock(handle);
2885out:
2886 rcu_read_unlock();
2887
2888 return -ENOSPC;
2889}
2890
2891void perf_output_end(struct perf_output_handle *handle)
2892{
2893 struct perf_event *event = handle->event;
2894 struct perf_mmap_data *data = handle->data;
2895
2896 int wakeup_events = event->attr.wakeup_events;
2897
2898 if (handle->sample && wakeup_events) {
2899 int events = atomic_inc_return(&data->events);
2900 if (events >= wakeup_events) {
2901 atomic_sub(wakeup_events, &data->events);
2902 atomic_set(&data->wakeup, 1);
2903 }
2904 }
2905
2906 perf_output_unlock(handle);
2907 rcu_read_unlock();
2908}
2909
2910static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
2911{
2912 /*
2913 * only top level events have the pid namespace they were created in
2914 */
2915 if (event->parent)
2916 event = event->parent;
2917
2918 return task_tgid_nr_ns(p, event->ns);
2919}
2920
2921static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
2922{
2923 /*
2924 * only top level events have the pid namespace they were created in
2925 */
2926 if (event->parent)
2927 event = event->parent;
2928
2929 return task_pid_nr_ns(p, event->ns);
2930}
2931
2932static void perf_output_read_one(struct perf_output_handle *handle,
2933 struct perf_event *event)
2934{
2935 u64 read_format = event->attr.read_format;
2936 u64 values[4];
2937 int n = 0;
2938
2939 values[n++] = atomic64_read(&event->count);
2940 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2941 values[n++] = event->total_time_enabled +
2942 atomic64_read(&event->child_total_time_enabled);
2943 }
2944 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2945 values[n++] = event->total_time_running +
2946 atomic64_read(&event->child_total_time_running);
2947 }
2948 if (read_format & PERF_FORMAT_ID)
2949 values[n++] = primary_event_id(event);
2950
2951 perf_output_copy(handle, values, n * sizeof(u64));
2952}
2953
2954/*
2955 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
2956 */
2957static void perf_output_read_group(struct perf_output_handle *handle,
2958 struct perf_event *event)
2959{
2960 struct perf_event *leader = event->group_leader, *sub;
2961 u64 read_format = event->attr.read_format;
2962 u64 values[5];
2963 int n = 0;
2964
2965 values[n++] = 1 + leader->nr_siblings;
2966
2967 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2968 values[n++] = leader->total_time_enabled;
2969
2970 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2971 values[n++] = leader->total_time_running;
2972
2973 if (leader != event)
2974 leader->pmu->read(leader);
2975
2976 values[n++] = atomic64_read(&leader->count);
2977 if (read_format & PERF_FORMAT_ID)
2978 values[n++] = primary_event_id(leader);
2979
2980 perf_output_copy(handle, values, n * sizeof(u64));
2981
2982 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2983 n = 0;
2984
2985 if (sub != event)
2986 sub->pmu->read(sub);
2987
2988 values[n++] = atomic64_read(&sub->count);
2989 if (read_format & PERF_FORMAT_ID)
2990 values[n++] = primary_event_id(sub);
2991
2992 perf_output_copy(handle, values, n * sizeof(u64));
2993 }
2994}
2995
2996static void perf_output_read(struct perf_output_handle *handle,
2997 struct perf_event *event)
2998{
2999 if (event->attr.read_format & PERF_FORMAT_GROUP)
3000 perf_output_read_group(handle, event);
3001 else
3002 perf_output_read_one(handle, event);
3003}
3004
3005void perf_output_sample(struct perf_output_handle *handle,
3006 struct perf_event_header *header,
3007 struct perf_sample_data *data,
3008 struct perf_event *event)
3009{
3010 u64 sample_type = data->type;
3011
3012 perf_output_put(handle, *header);
3013
3014 if (sample_type & PERF_SAMPLE_IP)
3015 perf_output_put(handle, data->ip);
3016
3017 if (sample_type & PERF_SAMPLE_TID)
3018 perf_output_put(handle, data->tid_entry);
3019
3020 if (sample_type & PERF_SAMPLE_TIME)
3021 perf_output_put(handle, data->time);
3022
3023 if (sample_type & PERF_SAMPLE_ADDR)
3024 perf_output_put(handle, data->addr);
3025
3026 if (sample_type & PERF_SAMPLE_ID)
3027 perf_output_put(handle, data->id);
3028
3029 if (sample_type & PERF_SAMPLE_STREAM_ID)
3030 perf_output_put(handle, data->stream_id);
3031
3032 if (sample_type & PERF_SAMPLE_CPU)
3033 perf_output_put(handle, data->cpu_entry);
3034
3035 if (sample_type & PERF_SAMPLE_PERIOD)
3036 perf_output_put(handle, data->period);
3037
3038 if (sample_type & PERF_SAMPLE_READ)
3039 perf_output_read(handle, event);
3040
3041 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3042 if (data->callchain) {
3043 int size = 1;
3044
3045 if (data->callchain)
3046 size += data->callchain->nr;
3047
3048 size *= sizeof(u64);
3049
3050 perf_output_copy(handle, data->callchain, size);
3051 } else {
3052 u64 nr = 0;
3053 perf_output_put(handle, nr);
3054 }
3055 }
3056
3057 if (sample_type & PERF_SAMPLE_RAW) {
3058 if (data->raw) {
3059 perf_output_put(handle, data->raw->size);
3060 perf_output_copy(handle, data->raw->data,
3061 data->raw->size);
3062 } else {
3063 struct {
3064 u32 size;
3065 u32 data;
3066 } raw = {
3067 .size = sizeof(u32),
3068 .data = 0,
3069 };
3070 perf_output_put(handle, raw);
3071 }
3072 }
3073}
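
/*
 * Illustrative sketch (not part of this file): a user-space parser must walk
 * a PERF_RECORD_SAMPLE body in exactly the order perf_output_sample() emits
 * it above, keyed off attr.sample_type.  Only the fixed-size prefix is
 * shown; PERF_SAMPLE_READ, CALLCHAIN and RAW need the variable-length
 * handling visible in the function above.  'struct parsed_sample' is a
 * hypothetical container, and the u32 pairs (pid/tid, cpu/reserved) assume
 * the little-endian packing of the u64 they were written as.
 */
#if 0	/* example only, never compiled as part of this file */
static const __u64 *parse_sample_prefix(const __u64 *p, __u64 sample_type,
					struct parsed_sample *out)
{
	if (sample_type & PERF_SAMPLE_IP)
		out->ip = *p++;
	if (sample_type & PERF_SAMPLE_TID) {
		out->pid = (__u32)*p;
		out->tid = (__u32)(*p >> 32);
		p++;
	}
	if (sample_type & PERF_SAMPLE_TIME)
		out->time = *p++;
	if (sample_type & PERF_SAMPLE_ADDR)
		out->addr = *p++;
	if (sample_type & PERF_SAMPLE_ID)
		out->id = *p++;
	if (sample_type & PERF_SAMPLE_STREAM_ID)
		out->stream_id = *p++;
	if (sample_type & PERF_SAMPLE_CPU) {
		out->cpu = (__u32)*p;
		p++;
	}
	if (sample_type & PERF_SAMPLE_PERIOD)
		out->period = *p++;

	return p;
}
#endif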
3074
3075void perf_prepare_sample(struct perf_event_header *header,
3076 struct perf_sample_data *data,
3077 struct perf_event *event,
3078 struct pt_regs *regs)
3079{
3080 u64 sample_type = event->attr.sample_type;
3081
3082 data->type = sample_type;
3083
3084 header->type = PERF_RECORD_SAMPLE;
3085 header->size = sizeof(*header);
3086
3087 header->misc = 0;
3088 header->misc |= perf_misc_flags(regs);
3089
3090 if (sample_type & PERF_SAMPLE_IP) {
3091 data->ip = perf_instruction_pointer(regs);
3092
3093 header->size += sizeof(data->ip);
3094 }
3095
3096 if (sample_type & PERF_SAMPLE_TID) {
3097 /* namespace issues */
3098 data->tid_entry.pid = perf_event_pid(event, current);
3099 data->tid_entry.tid = perf_event_tid(event, current);
3100
3101 header->size += sizeof(data->tid_entry);
3102 }
3103
3104 if (sample_type & PERF_SAMPLE_TIME) {
3105 data->time = perf_clock();
3106
3107 header->size += sizeof(data->time);
3108 }
3109
3110 if (sample_type & PERF_SAMPLE_ADDR)
3111 header->size += sizeof(data->addr);
3112
3113 if (sample_type & PERF_SAMPLE_ID) {
3114 data->id = primary_event_id(event);
3115
3116 header->size += sizeof(data->id);
3117 }
3118
3119 if (sample_type & PERF_SAMPLE_STREAM_ID) {
3120 data->stream_id = event->id;
3121
3122 header->size += sizeof(data->stream_id);
3123 }
3124
3125 if (sample_type & PERF_SAMPLE_CPU) {
3126 data->cpu_entry.cpu = raw_smp_processor_id();
3127 data->cpu_entry.reserved = 0;
3128
3129 header->size += sizeof(data->cpu_entry);
3130 }
3131
3132 if (sample_type & PERF_SAMPLE_PERIOD)
3133 header->size += sizeof(data->period);
3134
3135 if (sample_type & PERF_SAMPLE_READ)
3136 header->size += perf_event_read_size(event);
3137
3138 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3139 int size = 1;
3140
3141 data->callchain = perf_callchain(regs);
3142
3143 if (data->callchain)
3144 size += data->callchain->nr;
3145
3146 header->size += size * sizeof(u64);
3147 }
3148
3149 if (sample_type & PERF_SAMPLE_RAW) {
3150 int size = sizeof(u32);
3151
3152 if (data->raw)
3153 size += data->raw->size;
3154 else
3155 size += sizeof(u32);
3156
3157 WARN_ON_ONCE(size & (sizeof(u64)-1));
3158 header->size += size;
3159 }
3160}
3161
3162static void perf_event_output(struct perf_event *event, int nmi,
3163 struct perf_sample_data *data,
3164 struct pt_regs *regs)
3165{
3166 struct perf_output_handle handle;
3167 struct perf_event_header header;
3168
3169 perf_prepare_sample(&header, data, event, regs);
3170
3171 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3172 return;
3173
3174 perf_output_sample(&handle, &header, data, event);
3175
3176 perf_output_end(&handle);
3177}
3178
3179/*
 3180 * read event
3181 */
3182
3183struct perf_read_event {
3184 struct perf_event_header header;
3185
3186 u32 pid;
3187 u32 tid;
3188};
3189
3190static void
3191perf_event_read_event(struct perf_event *event,
3192 struct task_struct *task)
3193{
3194 struct perf_output_handle handle;
3195 struct perf_read_event read_event = {
3196 .header = {
3197 .type = PERF_RECORD_READ,
3198 .misc = 0,
3199 .size = sizeof(read_event) + perf_event_read_size(event),
3200 },
3201 .pid = perf_event_pid(event, task),
3202 .tid = perf_event_tid(event, task),
3203 };
3204 int ret;
3205
3206 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3207 if (ret)
3208 return;
3209
3210 perf_output_put(&handle, read_event);
3211 perf_output_read(&handle, event);
3212
3213 perf_output_end(&handle);
3214}
3215
3216/*
3217 * task tracking -- fork/exit
3218 *
3219 * enabled by: attr.comm | attr.mmap | attr.task
3220 */
3221
3222struct perf_task_event {
3223 struct task_struct *task;
3224 struct perf_event_context *task_ctx;
3225
3226 struct {
3227 struct perf_event_header header;
3228
3229 u32 pid;
3230 u32 ppid;
3231 u32 tid;
3232 u32 ptid;
3233 u64 time;
3234 } event_id;
3235};
3236
3237static void perf_event_task_output(struct perf_event *event,
3238 struct perf_task_event *task_event)
3239{
3240 struct perf_output_handle handle;
3241 int size;
3242 struct task_struct *task = task_event->task;
3243 int ret;
3244
3245 size = task_event->event_id.header.size;
3246 ret = perf_output_begin(&handle, event, size, 0, 0);
3247
3248 if (ret)
3249 return;
3250
3251 task_event->event_id.pid = perf_event_pid(event, task);
3252 task_event->event_id.ppid = perf_event_pid(event, current);
3253
3254 task_event->event_id.tid = perf_event_tid(event, task);
3255 task_event->event_id.ptid = perf_event_tid(event, current);
3256
3257 task_event->event_id.time = perf_clock();
3258
3259 perf_output_put(&handle, task_event->event_id);
3260
3261 perf_output_end(&handle);
3262}
3263
3264static int perf_event_task_match(struct perf_event *event)
3265{
3266 if (event->attr.comm || event->attr.mmap || event->attr.task)
3267 return 1;
3268
3269 return 0;
3270}
3271
3272static void perf_event_task_ctx(struct perf_event_context *ctx,
3273 struct perf_task_event *task_event)
3274{
3275 struct perf_event *event;
3276
3277 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3278 if (perf_event_task_match(event))
3279 perf_event_task_output(event, task_event);
3280 }
3281}
3282
3283static void perf_event_task_event(struct perf_task_event *task_event)
3284{
3285 struct perf_cpu_context *cpuctx;
3286 struct perf_event_context *ctx = task_event->task_ctx;
3287
3288 rcu_read_lock();
3289 cpuctx = &get_cpu_var(perf_cpu_context);
3290 perf_event_task_ctx(&cpuctx->ctx, task_event);
3291 put_cpu_var(perf_cpu_context);
3292
3293 if (!ctx)
3294 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3295 if (ctx)
3296 perf_event_task_ctx(ctx, task_event);
3297 rcu_read_unlock();
3298}
3299
3300static void perf_event_task(struct task_struct *task,
3301 struct perf_event_context *task_ctx,
3302 int new)
3303{
3304 struct perf_task_event task_event;
3305
3306 if (!atomic_read(&nr_comm_events) &&
3307 !atomic_read(&nr_mmap_events) &&
3308 !atomic_read(&nr_task_events))
3309 return;
3310
3311 task_event = (struct perf_task_event){
3312 .task = task,
3313 .task_ctx = task_ctx,
3314 .event_id = {
3315 .header = {
3316 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3317 .misc = 0,
3318 .size = sizeof(task_event.event_id),
3319 },
3320 /* .pid */
3321 /* .ppid */
3322 /* .tid */
3323 /* .ptid */
3324 },
3325 };
3326
3327 perf_event_task_event(&task_event);
3328}
3329
3330void perf_event_fork(struct task_struct *task)
3331{
3332 perf_event_task(task, NULL, 1);
3333}
3334
3335/*
3336 * comm tracking
3337 */
3338
3339struct perf_comm_event {
3340 struct task_struct *task;
3341 char *comm;
3342 int comm_size;
3343
3344 struct {
3345 struct perf_event_header header;
3346
3347 u32 pid;
3348 u32 tid;
3349 } event_id;
3350};
3351
3352static void perf_event_comm_output(struct perf_event *event,
3353 struct perf_comm_event *comm_event)
3354{
3355 struct perf_output_handle handle;
3356 int size = comm_event->event_id.header.size;
3357 int ret = perf_output_begin(&handle, event, size, 0, 0);
3358
3359 if (ret)
3360 return;
3361
3362 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3363 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3364
3365 perf_output_put(&handle, comm_event->event_id);
3366 perf_output_copy(&handle, comm_event->comm,
3367 comm_event->comm_size);
3368 perf_output_end(&handle);
3369}
3370
3371static int perf_event_comm_match(struct perf_event *event)
3372{
3373 if (event->attr.comm)
3374 return 1;
3375
3376 return 0;
3377}
3378
3379static void perf_event_comm_ctx(struct perf_event_context *ctx,
3380 struct perf_comm_event *comm_event)
3381{
3382 struct perf_event *event;
3383
3384 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3385 if (perf_event_comm_match(event))
3386 perf_event_comm_output(event, comm_event);
3387 }
3388}
3389
3390static void perf_event_comm_event(struct perf_comm_event *comm_event)
3391{
3392 struct perf_cpu_context *cpuctx;
3393 struct perf_event_context *ctx;
3394 unsigned int size;
3395 char comm[TASK_COMM_LEN];
3396
3397 memset(comm, 0, sizeof(comm));
3398 strlcpy(comm, comm_event->task->comm, sizeof(comm));
3399 size = ALIGN(strlen(comm)+1, sizeof(u64));
3400
3401 comm_event->comm = comm;
3402 comm_event->comm_size = size;
3403
3404 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3405
3406 rcu_read_lock();
3407 cpuctx = &get_cpu_var(perf_cpu_context);
3408 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3409 put_cpu_var(perf_cpu_context);
3410
3411 /*
3412 * doesn't really matter which of the child contexts the
 3413	 * event ends up in.
3414 */
3415 ctx = rcu_dereference(current->perf_event_ctxp);
3416 if (ctx)
3417 perf_event_comm_ctx(ctx, comm_event);
3418 rcu_read_unlock();
3419}
3420
3421void perf_event_comm(struct task_struct *task)
3422{
3423 struct perf_comm_event comm_event;
3424
3425 if (task->perf_event_ctxp)
3426 perf_event_enable_on_exec(task);
3427
3428 if (!atomic_read(&nr_comm_events))
3429 return;
3430
3431 comm_event = (struct perf_comm_event){
3432 .task = task,
3433 /* .comm */
3434 /* .comm_size */
3435 .event_id = {
3436 .header = {
3437 .type = PERF_RECORD_COMM,
3438 .misc = 0,
3439 /* .size */
3440 },
3441 /* .pid */
3442 /* .tid */
3443 },
3444 };
3445
3446 perf_event_comm_event(&comm_event);
3447}
3448
3449/*
3450 * mmap tracking
3451 */
3452
3453struct perf_mmap_event {
3454 struct vm_area_struct *vma;
3455
3456 const char *file_name;
3457 int file_size;
3458
3459 struct {
3460 struct perf_event_header header;
3461
3462 u32 pid;
3463 u32 tid;
3464 u64 start;
3465 u64 len;
3466 u64 pgoff;
3467 } event_id;
3468};
3469
3470static void perf_event_mmap_output(struct perf_event *event,
3471 struct perf_mmap_event *mmap_event)
3472{
3473 struct perf_output_handle handle;
3474 int size = mmap_event->event_id.header.size;
3475 int ret = perf_output_begin(&handle, event, size, 0, 0);
3476
3477 if (ret)
3478 return;
3479
3480 mmap_event->event_id.pid = perf_event_pid(event, current);
3481 mmap_event->event_id.tid = perf_event_tid(event, current);
3482
3483 perf_output_put(&handle, mmap_event->event_id);
3484 perf_output_copy(&handle, mmap_event->file_name,
3485 mmap_event->file_size);
3486 perf_output_end(&handle);
3487}
3488
3489static int perf_event_mmap_match(struct perf_event *event,
3490 struct perf_mmap_event *mmap_event)
3491{
3492 if (event->attr.mmap)
3493 return 1;
3494
3495 return 0;
3496}
3497
3498static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3499 struct perf_mmap_event *mmap_event)
3500{
3501 struct perf_event *event;
3502
3503 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3504 if (perf_event_mmap_match(event, mmap_event))
3505 perf_event_mmap_output(event, mmap_event);
3506 }
3507}
3508
3509static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3510{
3511 struct perf_cpu_context *cpuctx;
3512 struct perf_event_context *ctx;
3513 struct vm_area_struct *vma = mmap_event->vma;
3514 struct file *file = vma->vm_file;
3515 unsigned int size;
3516 char tmp[16];
3517 char *buf = NULL;
3518 const char *name;
3519
3520 memset(tmp, 0, sizeof(tmp));
3521
3522 if (file) {
3523 /*
3524 * d_path works from the end of the buffer backwards, so we
3525 * need to add enough zero bytes after the string to handle
3526 * the 64bit alignment we do later.
3527 */
3528 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3529 if (!buf) {
3530 name = strncpy(tmp, "//enomem", sizeof(tmp));
3531 goto got_name;
3532 }
3533 name = d_path(&file->f_path, buf, PATH_MAX);
3534 if (IS_ERR(name)) {
3535 name = strncpy(tmp, "//toolong", sizeof(tmp));
3536 goto got_name;
3537 }
3538 } else {
3539 if (arch_vma_name(mmap_event->vma)) {
3540 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3541 sizeof(tmp));
3542 goto got_name;
3543 }
3544
3545 if (!vma->vm_mm) {
3546 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3547 goto got_name;
3548 }
3549
3550 name = strncpy(tmp, "//anon", sizeof(tmp));
3551 goto got_name;
3552 }
3553
3554got_name:
3555 size = ALIGN(strlen(name)+1, sizeof(u64));
3556
3557 mmap_event->file_name = name;
3558 mmap_event->file_size = size;
3559
3560 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3561
3562 rcu_read_lock();
3563 cpuctx = &get_cpu_var(perf_cpu_context);
3564 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3565 put_cpu_var(perf_cpu_context);
3566
3567 /*
3568 * doesn't really matter which of the child contexts the
 3569	 * event ends up in.
3570 */
3571 ctx = rcu_dereference(current->perf_event_ctxp);
3572 if (ctx)
3573 perf_event_mmap_ctx(ctx, mmap_event);
3574 rcu_read_unlock();
3575
3576 kfree(buf);
3577}
3578
3579void __perf_event_mmap(struct vm_area_struct *vma)
3580{
3581 struct perf_mmap_event mmap_event;
3582
3583 if (!atomic_read(&nr_mmap_events))
3584 return;
3585
3586 mmap_event = (struct perf_mmap_event){
3587 .vma = vma,
3588 /* .file_name */
3589 /* .file_size */
3590 .event_id = {
3591 .header = {
3592 .type = PERF_RECORD_MMAP,
3593 .misc = 0,
3594 /* .size */
3595 },
3596 /* .pid */
3597 /* .tid */
3598 .start = vma->vm_start,
3599 .len = vma->vm_end - vma->vm_start,
3600 .pgoff = vma->vm_pgoff,
3601 },
3602 };
3603
3604 perf_event_mmap_event(&mmap_event);
3605}
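
/*
 * Illustrative sketch (not part of this file): the on-ring layout of the
 * PERF_RECORD_MMAP record built above -- the event_id struct followed by the
 * NUL-padded file name, with header.size covering both.  Field order and the
 * u64 alignment of the name follow perf_event_mmap_event() above.
 */
#if 0	/* example only, never compiled as part of this file */
struct mmap_record {
	struct perf_event_header header;	/* .type == PERF_RECORD_MMAP */
	__u32	pid, tid;
	__u64	start, len, pgoff;
	char	filename[];			/* ALIGN(strlen(name) + 1, 8) bytes */
};
#endif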
3606
3607/*
3608 * IRQ throttle logging
3609 */
3610
3611static void perf_log_throttle(struct perf_event *event, int enable)
3612{
3613 struct perf_output_handle handle;
3614 int ret;
3615
3616 struct {
3617 struct perf_event_header header;
3618 u64 time;
3619 u64 id;
3620 u64 stream_id;
3621 } throttle_event = {
3622 .header = {
3623 .type = PERF_RECORD_THROTTLE,
3624 .misc = 0,
3625 .size = sizeof(throttle_event),
3626 },
3627 .time = perf_clock(),
3628 .id = primary_event_id(event),
3629 .stream_id = event->id,
3630 };
3631
3632 if (enable)
3633 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3634
3635 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3636 if (ret)
3637 return;
3638
3639 perf_output_put(&handle, throttle_event);
3640 perf_output_end(&handle);
3641}
3642
3643/*
3644 * Generic event overflow handling, sampling.
3645 */
3646
3647static int __perf_event_overflow(struct perf_event *event, int nmi,
3648 int throttle, struct perf_sample_data *data,
3649 struct pt_regs *regs)
3650{
3651 int events = atomic_read(&event->event_limit);
3652 struct hw_perf_event *hwc = &event->hw;
3653 int ret = 0;
3654
3655 throttle = (throttle && event->pmu->unthrottle != NULL);
3656
3657 if (!throttle) {
3658 hwc->interrupts++;
3659 } else {
3660 if (hwc->interrupts != MAX_INTERRUPTS) {
3661 hwc->interrupts++;
3662 if (HZ * hwc->interrupts >
3663 (u64)sysctl_perf_event_sample_rate) {
3664 hwc->interrupts = MAX_INTERRUPTS;
3665 perf_log_throttle(event, 0);
3666 ret = 1;
3667 }
3668 } else {
3669 /*
 3670			 * Keep re-disabling the event even though we disabled
 3671			 * it on the previous pass - just in case we raced with a
3672 * sched-in and the event got enabled again:
3673 */
3674 ret = 1;
3675 }
3676 }
3677
3678 if (event->attr.freq) {
3679 u64 now = perf_clock();
3680 s64 delta = now - hwc->freq_stamp;
3681
3682 hwc->freq_stamp = now;
3683
3684 if (delta > 0 && delta < TICK_NSEC)
3685 perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3686 }
3687
3688 /*
3689 * XXX event_limit might not quite work as expected on inherited
3690 * events
3691 */
3692
3693 event->pending_kill = POLL_IN;
3694 if (events && atomic_dec_and_test(&event->event_limit)) {
3695 ret = 1;
3696 event->pending_kill = POLL_HUP;
3697 if (nmi) {
3698 event->pending_disable = 1;
3699 perf_pending_queue(&event->pending,
3700 perf_pending_event);
3701 } else
3702 perf_event_disable(event);
3703 }
3704
3705 if (event->overflow_handler)
3706 event->overflow_handler(event, nmi, data, regs);
3707 else
3708 perf_event_output(event, nmi, data, regs);
3709
3710 return ret;
3711}
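
/*
 * Worked example for the throttling check above (illustrative numbers only):
 * with HZ == 1000 and sysctl_perf_event_sample_rate == 100000, the
 * "HZ * hwc->interrupts > sample_rate" test trips once an event has taken
 * more than 100 interrupts, at which point the event is logged via
 * perf_log_throttle() and parked at MAX_INTERRUPTS until it is unthrottled.
 */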
3712
3713int perf_event_overflow(struct perf_event *event, int nmi,
3714 struct perf_sample_data *data,
3715 struct pt_regs *regs)
3716{
3717 return __perf_event_overflow(event, nmi, 1, data, regs);
3718}
3719
3720/*
3721 * Generic software event infrastructure
3722 */
3723
3724/*
3725 * We directly increment event->count and keep a second value in
3726 * event->hw.period_left to count intervals. This period event
3727 * is kept in the range [-sample_period, 0] so that we can use the
3728 * sign as trigger.
3729 */
3730
3731static u64 perf_swevent_set_period(struct perf_event *event)
3732{
3733 struct hw_perf_event *hwc = &event->hw;
3734 u64 period = hwc->last_period;
3735 u64 nr, offset;
3736 s64 old, val;
3737
3738 hwc->last_period = hwc->sample_period;
3739
3740again:
3741 old = val = atomic64_read(&hwc->period_left);
3742 if (val < 0)
3743 return 0;
3744
3745 nr = div64_u64(period + val, period);
3746 offset = nr * period;
3747 val -= offset;
3748 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3749 goto again;
3750
3751 return nr;
3752}
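
/*
 * Worked example for the function above (illustrative numbers only): with
 * sample_period == 100 and period_left == 30, one call computes
 * nr = (100 + 30) / 100 = 1 overflow and leaves period_left at
 * 30 - 1 * 100 = -70, i.e. 70 more increments until the next overflow --
 * keeping period_left inside the documented [-sample_period, 0] window.
 */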
3753
3754static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3755 int nmi, struct perf_sample_data *data,
3756 struct pt_regs *regs)
3757{
3758 struct hw_perf_event *hwc = &event->hw;
3759 int throttle = 0;
3760
3761 data->period = event->hw.last_period;
3762 if (!overflow)
3763 overflow = perf_swevent_set_period(event);
3764
3765 if (hwc->interrupts == MAX_INTERRUPTS)
3766 return;
3767
3768 for (; overflow; overflow--) {
3769 if (__perf_event_overflow(event, nmi, throttle,
3770 data, regs)) {
3771 /*
3772 * We inhibit the overflow from happening when
3773 * hwc->interrupts == MAX_INTERRUPTS.
3774 */
3775 break;
3776 }
3777 throttle = 1;
3778 }
3779}
3780
3781static void perf_swevent_unthrottle(struct perf_event *event)
3782{
3783 /*
3784 * Nothing to do, we already reset hwc->interrupts.
3785 */
3786}
3787
3788static void perf_swevent_add(struct perf_event *event, u64 nr,
3789 int nmi, struct perf_sample_data *data,
3790 struct pt_regs *regs)
3791{
3792 struct hw_perf_event *hwc = &event->hw;
3793
3794 atomic64_add(nr, &event->count);
3795
3796 if (!regs)
3797 return;
3798
3799 if (!hwc->sample_period)
3800 return;
3801
3802 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3803 return perf_swevent_overflow(event, 1, nmi, data, regs);
3804
3805 if (atomic64_add_negative(nr, &hwc->period_left))
3806 return;
3807
3808 perf_swevent_overflow(event, 0, nmi, data, regs);
3809}
3810
3811static int perf_swevent_is_counting(struct perf_event *event)
3812{
3813 /*
3814 * The event is active, we're good!
3815 */
3816 if (event->state == PERF_EVENT_STATE_ACTIVE)
3817 return 1;
3818
3819 /*
3820 * The event is off/error, not counting.
3821 */
3822 if (event->state != PERF_EVENT_STATE_INACTIVE)
3823 return 0;
3824
3825 /*
 3826	 * The event is inactive; if the context is active we're part
 3827	 * of a group that didn't make it onto the 'pmu', so we're
 3828	 * not counting.
3829 */
3830 if (event->ctx->is_active)
3831 return 0;
3832
3833 /*
 3834	 * We're inactive and the context is too: the task is scheduled
 3835	 * out, and we're counting events that happen to us, like
 3836	 * migration events.
3837 */
3838 return 1;
3839}
3840
3841static int perf_tp_event_match(struct perf_event *event,
3842 struct perf_sample_data *data);
3843
3844static int perf_exclude_event(struct perf_event *event,
3845 struct pt_regs *regs)
3846{
3847 if (regs) {
3848 if (event->attr.exclude_user && user_mode(regs))
3849 return 1;
3850
3851 if (event->attr.exclude_kernel && !user_mode(regs))
3852 return 1;
3853 }
3854
3855 return 0;
3856}
3857
3858static int perf_swevent_match(struct perf_event *event,
3859 enum perf_type_id type,
3860 u32 event_id,
3861 struct perf_sample_data *data,
3862 struct pt_regs *regs)
3863{
3864 if (!perf_swevent_is_counting(event))
3865 return 0;
3866
3867 if (event->attr.type != type)
3868 return 0;
3869
3870 if (event->attr.config != event_id)
3871 return 0;
3872
3873 if (perf_exclude_event(event, regs))
3874 return 0;
3875
3876 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3877 !perf_tp_event_match(event, data))
3878 return 0;
3879
3880 return 1;
3881}
3882
3883static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3884 enum perf_type_id type,
3885 u32 event_id, u64 nr, int nmi,
3886 struct perf_sample_data *data,
3887 struct pt_regs *regs)
3888{
3889 struct perf_event *event;
3890
3891 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3892 if (perf_swevent_match(event, type, event_id, data, regs))
3893 perf_swevent_add(event, nr, nmi, data, regs);
3894 }
3895}
3896
3897int perf_swevent_get_recursion_context(void)
3898{
3899 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3900 int rctx;
3901
3902 if (in_nmi())
3903 rctx = 3;
3904 else if (in_irq())
3905 rctx = 2;
3906 else if (in_softirq())
3907 rctx = 1;
3908 else
3909 rctx = 0;
3910
3911 if (cpuctx->recursion[rctx]) {
3912 put_cpu_var(perf_cpu_context);
3913 return -1;
3914 }
3915
3916 cpuctx->recursion[rctx]++;
3917 barrier();
3918
3919 return rctx;
3920}
3921EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
3922
3923void perf_swevent_put_recursion_context(int rctx)
3924{
3925 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3926 barrier();
3927 cpuctx->recursion[rctx]--;
3928 put_cpu_var(perf_cpu_context);
3929}
3930EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
3931
3932static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3933 u64 nr, int nmi,
3934 struct perf_sample_data *data,
3935 struct pt_regs *regs)
3936{
3937 struct perf_cpu_context *cpuctx;
3938 struct perf_event_context *ctx;
3939
3940 cpuctx = &__get_cpu_var(perf_cpu_context);
3941 rcu_read_lock();
3942 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3943 nr, nmi, data, regs);
3944 /*
3945 * doesn't really matter which of the child contexts the
 3946	 * event ends up in.
3947 */
3948 ctx = rcu_dereference(current->perf_event_ctxp);
3949 if (ctx)
3950 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3951 rcu_read_unlock();
3952}
3953
3954void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3955 struct pt_regs *regs, u64 addr)
3956{
3957 struct perf_sample_data data;
3958 int rctx;
3959
3960 rctx = perf_swevent_get_recursion_context();
3961 if (rctx < 0)
3962 return;
3963
3964 data.addr = addr;
3965 data.raw = NULL;
3966
3967 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
3968
3969 perf_swevent_put_recursion_context(rctx);
3970}
3971
3972static void perf_swevent_read(struct perf_event *event)
3973{
3974}
3975
3976static int perf_swevent_enable(struct perf_event *event)
3977{
3978 struct hw_perf_event *hwc = &event->hw;
3979
3980 if (hwc->sample_period) {
3981 hwc->last_period = hwc->sample_period;
3982 perf_swevent_set_period(event);
3983 }
3984 return 0;
3985}
3986
3987static void perf_swevent_disable(struct perf_event *event)
3988{
3989}
3990
3991static const struct pmu perf_ops_generic = {
3992 .enable = perf_swevent_enable,
3993 .disable = perf_swevent_disable,
3994 .read = perf_swevent_read,
3995 .unthrottle = perf_swevent_unthrottle,
3996};
3997
3998/*
3999 * hrtimer based swevent callback
4000 */
4001
4002static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4003{
4004 enum hrtimer_restart ret = HRTIMER_RESTART;
4005 struct perf_sample_data data;
4006 struct pt_regs *regs;
4007 struct perf_event *event;
4008 u64 period;
4009
4010 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4011 event->pmu->read(event);
4012
4013 data.addr = 0;
4014 data.period = event->hw.last_period;
4015 regs = get_irq_regs();
4016 /*
4017 * In case we exclude kernel IPs or are somehow not in interrupt
4018 * context, provide the next best thing, the user IP.
4019 */
4020 if ((event->attr.exclude_kernel || !regs) &&
4021 !event->attr.exclude_user)
4022 regs = task_pt_regs(current);
4023
4024 if (regs) {
4025 if (!(event->attr.exclude_idle && current->pid == 0))
4026 if (perf_event_overflow(event, 0, &data, regs))
4027 ret = HRTIMER_NORESTART;
4028 }
4029
4030 period = max_t(u64, 10000, event->hw.sample_period);
4031 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4032
4033 return ret;
4034}
4035
4036static void perf_swevent_start_hrtimer(struct perf_event *event)
4037{
4038 struct hw_perf_event *hwc = &event->hw;
4039
4040 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4041 hwc->hrtimer.function = perf_swevent_hrtimer;
4042 if (hwc->sample_period) {
4043 u64 period;
4044
4045 if (hwc->remaining) {
4046 if (hwc->remaining < 0)
4047 period = 10000;
4048 else
4049 period = hwc->remaining;
4050 hwc->remaining = 0;
4051 } else {
4052 period = max_t(u64, 10000, hwc->sample_period);
4053 }
4054 __hrtimer_start_range_ns(&hwc->hrtimer,
4055 ns_to_ktime(period), 0,
4056 HRTIMER_MODE_REL, 0);
4057 }
4058}
4059
4060static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4061{
4062 struct hw_perf_event *hwc = &event->hw;
4063
4064 if (hwc->sample_period) {
4065 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4066 hwc->remaining = ktime_to_ns(remaining);
4067
4068 hrtimer_cancel(&hwc->hrtimer);
4069 }
4070}
4071
4072/*
4073 * Software event: cpu wall time clock
4074 */
4075
4076static void cpu_clock_perf_event_update(struct perf_event *event)
4077{
4078 int cpu = raw_smp_processor_id();
4079 s64 prev;
4080 u64 now;
4081
4082 now = cpu_clock(cpu);
4083 prev = atomic64_read(&event->hw.prev_count);
4084 atomic64_set(&event->hw.prev_count, now);
4085 atomic64_add(now - prev, &event->count);
4086}
4087
4088static int cpu_clock_perf_event_enable(struct perf_event *event)
4089{
4090 struct hw_perf_event *hwc = &event->hw;
4091 int cpu = raw_smp_processor_id();
4092
4093 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
4094 perf_swevent_start_hrtimer(event);
4095
4096 return 0;
4097}
4098
4099static void cpu_clock_perf_event_disable(struct perf_event *event)
4100{
4101 perf_swevent_cancel_hrtimer(event);
4102 cpu_clock_perf_event_update(event);
4103}
4104
4105static void cpu_clock_perf_event_read(struct perf_event *event)
4106{
4107 cpu_clock_perf_event_update(event);
4108}
4109
4110static const struct pmu perf_ops_cpu_clock = {
4111 .enable = cpu_clock_perf_event_enable,
4112 .disable = cpu_clock_perf_event_disable,
4113 .read = cpu_clock_perf_event_read,
4114};
4115
4116/*
4117 * Software event: task time clock
4118 */
4119
4120static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4121{
4122 u64 prev;
4123 s64 delta;
4124
4125 prev = atomic64_xchg(&event->hw.prev_count, now);
4126 delta = now - prev;
4127 atomic64_add(delta, &event->count);
4128}
4129
4130static int task_clock_perf_event_enable(struct perf_event *event)
4131{
4132 struct hw_perf_event *hwc = &event->hw;
4133 u64 now;
4134
4135 now = event->ctx->time;
4136
4137 atomic64_set(&hwc->prev_count, now);
4138
4139 perf_swevent_start_hrtimer(event);
4140
4141 return 0;
4142}
4143
4144static void task_clock_perf_event_disable(struct perf_event *event)
4145{
4146 perf_swevent_cancel_hrtimer(event);
4147 task_clock_perf_event_update(event, event->ctx->time);
4148
4149}
4150
4151static void task_clock_perf_event_read(struct perf_event *event)
4152{
4153 u64 time;
4154
4155 if (!in_nmi()) {
4156 update_context_time(event->ctx);
4157 time = event->ctx->time;
4158 } else {
4159 u64 now = perf_clock();
4160 u64 delta = now - event->ctx->timestamp;
4161 time = event->ctx->time + delta;
4162 }
4163
4164 task_clock_perf_event_update(event, time);
4165}
4166
4167static const struct pmu perf_ops_task_clock = {
4168 .enable = task_clock_perf_event_enable,
4169 .disable = task_clock_perf_event_disable,
4170 .read = task_clock_perf_event_read,
4171};
4172
4173#ifdef CONFIG_EVENT_PROFILE
4174
4175void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4176 int entry_size)
4177{
4178 struct perf_raw_record raw = {
4179 .size = entry_size,
4180 .data = record,
4181 };
4182
4183 struct perf_sample_data data = {
4184 .addr = addr,
4185 .raw = &raw,
4186 };
4187
4188 struct pt_regs *regs = get_irq_regs();
4189
4190 if (!regs)
4191 regs = task_pt_regs(current);
4192
4193 /* Trace events already protected against recursion */
4194 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4195 &data, regs);
4196}
4197EXPORT_SYMBOL_GPL(perf_tp_event);
4198
4199static int perf_tp_event_match(struct perf_event *event,
4200 struct perf_sample_data *data)
4201{
4202 void *record = data->raw->data;
4203
4204 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4205 return 1;
4206 return 0;
4207}
4208
4209static void tp_perf_event_destroy(struct perf_event *event)
4210{
4211 ftrace_profile_disable(event->attr.config);
4212}
4213
4214static const struct pmu *tp_perf_event_init(struct perf_event *event)
4215{
4216 /*
4217 * Raw tracepoint data is a severe data leak, only allow root to
4218 * have these.
4219 */
4220 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4221 perf_paranoid_tracepoint_raw() &&
4222 !capable(CAP_SYS_ADMIN))
4223 return ERR_PTR(-EPERM);
4224
4225 if (ftrace_profile_enable(event->attr.config))
4226 return NULL;
4227
4228 event->destroy = tp_perf_event_destroy;
4229
4230 return &perf_ops_generic;
4231}
4232
4233static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4234{
4235 char *filter_str;
4236 int ret;
4237
4238 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4239 return -EINVAL;
4240
4241 filter_str = strndup_user(arg, PAGE_SIZE);
4242 if (IS_ERR(filter_str))
4243 return PTR_ERR(filter_str);
4244
4245 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4246
4247 kfree(filter_str);
4248 return ret;
4249}
4250
4251static void perf_event_free_filter(struct perf_event *event)
4252{
4253 ftrace_profile_free_filter(event);
4254}
4255
4256#else
4257
4258static int perf_tp_event_match(struct perf_event *event,
4259 struct perf_sample_data *data)
4260{
4261 return 1;
4262}
4263
4264static const struct pmu *tp_perf_event_init(struct perf_event *event)
4265{
4266 return NULL;
4267}
4268
4269static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4270{
4271 return -ENOENT;
4272}
4273
4274static void perf_event_free_filter(struct perf_event *event)
4275{
4276}
4277
4278#endif /* CONFIG_EVENT_PROFILE */
4279
4280#ifdef CONFIG_HAVE_HW_BREAKPOINT
4281static void bp_perf_event_destroy(struct perf_event *event)
4282{
4283 release_bp_slot(event);
4284}
4285
4286static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4287{
4288 int err;
4289 /*
 4290	 * The breakpoint is already filled in if we haven't created the event
 4291	 * through the perf syscall.
 4292	 * FIXME: manage to get triggered to NULL if it comes from syscalls
4293 */
4294 if (!bp->callback)
4295 err = register_perf_hw_breakpoint(bp);
4296 else
4297 err = __register_perf_hw_breakpoint(bp);
4298 if (err)
4299 return ERR_PTR(err);
4300
4301 bp->destroy = bp_perf_event_destroy;
4302
4303 return &perf_ops_bp;
4304}
4305
4306void perf_bp_event(struct perf_event *bp, void *data)
4307{
4308 struct perf_sample_data sample;
4309 struct pt_regs *regs = data;
4310
4311 sample.addr = bp->attr.bp_addr;
4312
4313 if (!perf_exclude_event(bp, regs))
4314 perf_swevent_add(bp, 1, 1, &sample, regs);
4315}
4316#else
4317static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4318{
4319 return NULL;
4320}
4321
4322void perf_bp_event(struct perf_event *bp, void *regs)
4323{
4324}
4325#endif
4326
4327atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4328
4329static void sw_perf_event_destroy(struct perf_event *event)
4330{
4331 u64 event_id = event->attr.config;
4332
4333 WARN_ON(event->parent);
4334
4335 atomic_dec(&perf_swevent_enabled[event_id]);
4336}
4337
4338static const struct pmu *sw_perf_event_init(struct perf_event *event)
4339{
4340 const struct pmu *pmu = NULL;
4341 u64 event_id = event->attr.config;
4342
4343 /*
4344 * Software events (currently) can't in general distinguish
4345 * between user, kernel and hypervisor events.
4346 * However, context switches and cpu migrations are considered
4347 * to be kernel events, and page faults are never hypervisor
4348 * events.
4349 */
4350 switch (event_id) {
4351 case PERF_COUNT_SW_CPU_CLOCK:
4352 pmu = &perf_ops_cpu_clock;
4353
4354 break;
4355 case PERF_COUNT_SW_TASK_CLOCK:
4356 /*
4357 * If the user instantiates this as a per-cpu event,
4358 * use the cpu_clock event instead.
4359 */
4360 if (event->ctx->task)
4361 pmu = &perf_ops_task_clock;
4362 else
4363 pmu = &perf_ops_cpu_clock;
4364
4365 break;
4366 case PERF_COUNT_SW_PAGE_FAULTS:
4367 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4368 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4369 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4370 case PERF_COUNT_SW_CPU_MIGRATIONS:
4371 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4372 case PERF_COUNT_SW_EMULATION_FAULTS:
4373 if (!event->parent) {
4374 atomic_inc(&perf_swevent_enabled[event_id]);
4375 event->destroy = sw_perf_event_destroy;
4376 }
4377 pmu = &perf_ops_generic;
4378 break;
4379 }
4380
4381 return pmu;
4382}
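/*
 * Illustrative sketch, not part of this file: opening one of the software
 * counters dispatched by sw_perf_event_init() above from user space.
 * attr.config selects the PERF_COUNT_SW_* id; note that a per-cpu
 * PERF_COUNT_SW_TASK_CLOCK event silently uses the cpu_clock pmu, as the
 * switch statement above shows.
 */
#if 0
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static int open_context_switch_counter(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;

	/* pid >= 0: per-task, cpu -1: on every cpu the task runs on */
	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}
#endif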
4383
4384/*
4385 * Allocate and initialize an event structure
4386 */
4387static struct perf_event *
4388perf_event_alloc(struct perf_event_attr *attr,
4389 int cpu,
4390 struct perf_event_context *ctx,
4391 struct perf_event *group_leader,
4392 struct perf_event *parent_event,
4393 perf_callback_t callback,
4394 gfp_t gfpflags)
4395{
4396 const struct pmu *pmu;
4397 struct perf_event *event;
4398 struct hw_perf_event *hwc;
4399 long err;
4400
4401 event = kzalloc(sizeof(*event), gfpflags);
4402 if (!event)
4403 return ERR_PTR(-ENOMEM);
4404
4405 /*
4406 * Single events are their own group leaders, with an
4407 * empty sibling list:
4408 */
4409 if (!group_leader)
4410 group_leader = event;
4411
4412 mutex_init(&event->child_mutex);
4413 INIT_LIST_HEAD(&event->child_list);
4414
4415 INIT_LIST_HEAD(&event->group_entry);
4416 INIT_LIST_HEAD(&event->event_entry);
4417 INIT_LIST_HEAD(&event->sibling_list);
4418 init_waitqueue_head(&event->waitq);
4419
4420 mutex_init(&event->mmap_mutex);
4421
4422 event->cpu = cpu;
4423 event->attr = *attr;
4424 event->group_leader = group_leader;
4425 event->pmu = NULL;
4426 event->ctx = ctx;
4427 event->oncpu = -1;
4428
4429 event->parent = parent_event;
4430
4431 event->ns = get_pid_ns(current->nsproxy->pid_ns);
4432 event->id = atomic64_inc_return(&perf_event_id);
4433
4434 event->state = PERF_EVENT_STATE_INACTIVE;
4435
4436 if (!callback && parent_event)
4437 callback = parent_event->callback;
4438
4439 event->callback = callback;
4440
4441 if (attr->disabled)
4442 event->state = PERF_EVENT_STATE_OFF;
4443
4444 pmu = NULL;
4445
4446 hwc = &event->hw;
4447 hwc->sample_period = attr->sample_period;
4448 if (attr->freq && attr->sample_freq)
4449 hwc->sample_period = 1;
4450 hwc->last_period = hwc->sample_period;
4451
4452 atomic64_set(&hwc->period_left, hwc->sample_period);
4453
4454 /*
4455 * we currently do not support PERF_FORMAT_GROUP on inherited events
4456 */
4457 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4458 goto done;
4459
4460 switch (attr->type) {
4461 case PERF_TYPE_RAW:
4462 case PERF_TYPE_HARDWARE:
4463 case PERF_TYPE_HW_CACHE:
4464 pmu = hw_perf_event_init(event);
4465 break;
4466
4467 case PERF_TYPE_SOFTWARE:
4468 pmu = sw_perf_event_init(event);
4469 break;
4470
4471 case PERF_TYPE_TRACEPOINT:
4472 pmu = tp_perf_event_init(event);
4473 break;
4474
4475 case PERF_TYPE_BREAKPOINT:
4476 pmu = bp_perf_event_init(event);
4477 break;
4478
4479
4480 default:
4481 break;
4482 }
4483done:
4484 err = 0;
4485 if (!pmu)
4486 err = -EINVAL;
4487 else if (IS_ERR(pmu))
4488 err = PTR_ERR(pmu);
4489
4490 if (err) {
4491 if (event->ns)
4492 put_pid_ns(event->ns);
4493 kfree(event);
4494 return ERR_PTR(err);
4495 }
4496
4497 event->pmu = pmu;
4498
4499 if (!event->parent) {
4500 atomic_inc(&nr_events);
4501 if (event->attr.mmap)
4502 atomic_inc(&nr_mmap_events);
4503 if (event->attr.comm)
4504 atomic_inc(&nr_comm_events);
4505 if (event->attr.task)
4506 atomic_inc(&nr_task_events);
4507 }
4508
4509 return event;
4510}
4511
4512static int perf_copy_attr(struct perf_event_attr __user *uattr,
4513 struct perf_event_attr *attr)
4514{
4515 u32 size;
4516 int ret;
4517
4518 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4519 return -EFAULT;
4520
4521 /*
4522	 * Zero the full structure, so that a short copy leaves the rest zero-filled.
4523 */
4524 memset(attr, 0, sizeof(*attr));
4525
4526 ret = get_user(size, &uattr->size);
4527 if (ret)
4528 return ret;
4529
4530 if (size > PAGE_SIZE) /* silly large */
4531 goto err_size;
4532
4533 if (!size) /* abi compat */
4534 size = PERF_ATTR_SIZE_VER0;
4535
4536 if (size < PERF_ATTR_SIZE_VER0)
4537 goto err_size;
4538
4539 /*
4540 * If we're handed a bigger struct than we know of,
4541 * ensure all the unknown bits are 0 - i.e. new
4542 * user-space does not rely on any kernel feature
4543	 * extensions we don't know about yet.
4544 */
4545 if (size > sizeof(*attr)) {
4546 unsigned char __user *addr;
4547 unsigned char __user *end;
4548 unsigned char val;
4549
4550 addr = (void __user *)uattr + sizeof(*attr);
4551 end = (void __user *)uattr + size;
4552
4553 for (; addr < end; addr++) {
4554 ret = get_user(val, addr);
4555 if (ret)
4556 return ret;
4557 if (val)
4558 goto err_size;
4559 }
4560 size = sizeof(*attr);
4561 }
4562
4563 ret = copy_from_user(attr, uattr, size);
4564 if (ret)
4565 return -EFAULT;
4566
4567 /*
4568 * If the type exists, the corresponding creation will verify
4569 * the attr->config.
4570 */
4571 if (attr->type >= PERF_TYPE_MAX)
4572 return -EINVAL;
4573
4574 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4575 return -EINVAL;
4576
4577 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4578 return -EINVAL;
4579
4580 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4581 return -EINVAL;
4582
4583out:
4584 return ret;
4585
4586err_size:
4587 put_user(sizeof(*attr), &uattr->size);
4588 ret = -E2BIG;
4589 goto out;
4590}
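/*
 * Illustrative sketch, not part of this file: the attr.size handshake that
 * perf_copy_attr() implements.  User space zero-fills the structure and
 * reports the size it was built against; a kernel that knows less accepts a
 * larger structure as long as the unknown tail is all zeroes, and on -E2BIG
 * it writes its own size back into uattr->size for the caller to retry with.
 */
#if 0
#include <linux/perf_event.h>
#include <string.h>

static void perf_attr_init(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));	/* keeps any unknown tail zeroed */
	attr->size = sizeof(*attr);	/* >= PERF_ATTR_SIZE_VER0 */
}
#endif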
4591
4592static int perf_event_set_output(struct perf_event *event, int output_fd)
4593{
4594 struct perf_event *output_event = NULL;
4595 struct file *output_file = NULL;
4596 struct perf_event *old_output;
4597 int fput_needed = 0;
4598 int ret = -EINVAL;
4599
4600 if (!output_fd)
4601 goto set;
4602
4603 output_file = fget_light(output_fd, &fput_needed);
4604 if (!output_file)
4605 return -EBADF;
4606
4607 if (output_file->f_op != &perf_fops)
4608 goto out;
4609
4610 output_event = output_file->private_data;
4611
4612 /* Don't chain output fds */
4613 if (output_event->output)
4614 goto out;
4615
4616 /* Don't set an output fd when we already have an output channel */
4617 if (event->data)
4618 goto out;
4619
4620 atomic_long_inc(&output_file->f_count);
4621
4622set:
4623 mutex_lock(&event->mmap_mutex);
4624 old_output = event->output;
4625 rcu_assign_pointer(event->output, output_event);
4626 mutex_unlock(&event->mmap_mutex);
4627
4628 if (old_output) {
4629 /*
4630 * we need to make sure no existing perf_output_*()
4631 * is still referencing this event.
4632 */
4633 synchronize_rcu();
4634 fput(old_output->filp);
4635 }
4636
4637 ret = 0;
4638out:
4639 fput_light(output_file, fput_needed);
4640 return ret;
4641}
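/*
 * Illustrative sketch, not part of this file: gathering several events into
 * one mmap buffer.  Either pass PERF_FLAG_FD_OUTPUT together with group_fd
 * to the syscall below, or redirect an existing event with the
 * PERF_EVENT_IOC_SET_OUTPUT ioctl; both paths end up in
 * perf_event_set_output() above.
 */
#if 0
#include <linux/perf_event.h>
#include <sys/ioctl.h>

static int redirect_perf_output(int event_fd, int target_fd)
{
	/* target_fd must be another perf event fd that owns an mmap buffer */
	return ioctl(event_fd, PERF_EVENT_IOC_SET_OUTPUT, target_fd);
}
#endif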
4642
4643/**
4644 * sys_perf_event_open - open a performance event, associate it to a task/cpu
4645 *
4646 * @attr_uptr: event_id type attributes for monitoring/sampling
4647 * @pid: target pid
4648 * @cpu: target cpu
4649 * @group_fd: group leader event fd
4650 */
4651SYSCALL_DEFINE5(perf_event_open,
4652 struct perf_event_attr __user *, attr_uptr,
4653 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4654{
4655 struct perf_event *event, *group_leader;
4656 struct perf_event_attr attr;
4657 struct perf_event_context *ctx;
4658 struct file *event_file = NULL;
4659 struct file *group_file = NULL;
4660 int fput_needed = 0;
4661 int fput_needed2 = 0;
4662 int err;
4663
4664 /* for future expandability... */
4665 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4666 return -EINVAL;
4667
4668 err = perf_copy_attr(attr_uptr, &attr);
4669 if (err)
4670 return err;
4671
4672 if (!attr.exclude_kernel) {
4673 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4674 return -EACCES;
4675 }
4676
4677 if (attr.freq) {
4678 if (attr.sample_freq > sysctl_perf_event_sample_rate)
4679 return -EINVAL;
4680 }
4681
4682 /*
4683 * Get the target context (task or percpu):
4684 */
4685 ctx = find_get_context(pid, cpu);
4686 if (IS_ERR(ctx))
4687 return PTR_ERR(ctx);
4688
4689 /*
4690 * Look up the group leader (we will attach this event to it):
4691 */
4692 group_leader = NULL;
4693 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4694 err = -EINVAL;
4695 group_file = fget_light(group_fd, &fput_needed);
4696 if (!group_file)
4697 goto err_put_context;
4698 if (group_file->f_op != &perf_fops)
4699 goto err_put_context;
4700
4701 group_leader = group_file->private_data;
4702 /*
4703 * Do not allow a recursive hierarchy (this new sibling
4704 * becoming part of another group-sibling):
4705 */
4706 if (group_leader->group_leader != group_leader)
4707 goto err_put_context;
4708 /*
4709		 * Do not allow attaching to a group in a different
4710 * task or CPU context:
4711 */
4712 if (group_leader->ctx != ctx)
4713 goto err_put_context;
4714 /*
4715 * Only a group leader can be exclusive or pinned
4716 */
4717 if (attr.exclusive || attr.pinned)
4718 goto err_put_context;
4719 }
4720
4721 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4722 NULL, NULL, GFP_KERNEL);
4723 err = PTR_ERR(event);
4724 if (IS_ERR(event))
4725 goto err_put_context;
4726
4727 err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
4728 if (err < 0)
4729 goto err_free_put_context;
4730
4731 event_file = fget_light(err, &fput_needed2);
4732 if (!event_file)
4733 goto err_free_put_context;
4734
4735 if (flags & PERF_FLAG_FD_OUTPUT) {
4736 err = perf_event_set_output(event, group_fd);
4737 if (err)
4738 goto err_fput_free_put_context;
4739 }
4740
4741 event->filp = event_file;
4742 WARN_ON_ONCE(ctx->parent_ctx);
4743 mutex_lock(&ctx->mutex);
4744 perf_install_in_context(ctx, event, cpu);
4745 ++ctx->generation;
4746 mutex_unlock(&ctx->mutex);
4747
4748 event->owner = current;
4749 get_task_struct(current);
4750 mutex_lock(&current->perf_event_mutex);
4751 list_add_tail(&event->owner_entry, &current->perf_event_list);
4752 mutex_unlock(&current->perf_event_mutex);
4753
4754err_fput_free_put_context:
4755 fput_light(event_file, fput_needed2);
4756
4757err_free_put_context:
4758 if (err < 0)
4759 kfree(event);
4760
4761err_put_context:
4762 if (err < 0)
4763 put_ctx(ctx);
4764
4765 fput_light(group_file, fput_needed);
4766
4767 return err;
4768}
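/*
 * Illustrative sketch, not part of this file: the classic use of the syscall
 * defined above - count user-space instructions of the calling thread around
 * a workload and read the result back with read(2).  Setting exclude_kernel
 * avoids the perf_paranoid_kernel() check for unprivileged callers.
 */
#if 0
#include <linux/perf_event.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static uint64_t count_instructions(void (*workload)(void))
{
	struct perf_event_attr attr;
	uint64_t count = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.exclude_kernel = 1;

	/* pid 0 = calling task, cpu -1 = any cpu, no group leader, no flags */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 0;

	workload();
	read(fd, &count, sizeof(count));
	close(fd);
	return count;
}
#endif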
4769
4770/**
4771 * perf_event_create_kernel_counter
4772 *
4773 * @attr: attributes of the counter to create
4774 * @cpu: cpu on which the counter is bound
4775 * @pid: task to profile
4776 */
4777struct perf_event *
4778perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4779 pid_t pid, perf_callback_t callback)
4780{
4781 struct perf_event *event;
4782 struct perf_event_context *ctx;
4783 int err;
4784
4785 /*
4786 * Get the target context (task or percpu):
4787 */
4788
4789 ctx = find_get_context(pid, cpu);
4790 if (IS_ERR(ctx)) {
4791 err = PTR_ERR(ctx);
4792 goto err_exit;
4793 }
4794
4795 event = perf_event_alloc(attr, cpu, ctx, NULL,
4796 NULL, callback, GFP_KERNEL);
4797 if (IS_ERR(event)) {
4798 err = PTR_ERR(event);
4799 goto err_put_context;
4800 }
4801
4802 event->filp = NULL;
4803 WARN_ON_ONCE(ctx->parent_ctx);
4804 mutex_lock(&ctx->mutex);
4805 perf_install_in_context(ctx, event, cpu);
4806 ++ctx->generation;
4807 mutex_unlock(&ctx->mutex);
4808
4809 event->owner = current;
4810 get_task_struct(current);
4811 mutex_lock(&current->perf_event_mutex);
4812 list_add_tail(&event->owner_entry, &current->perf_event_list);
4813 mutex_unlock(&current->perf_event_mutex);
4814
4815 return event;
4816
4817 err_put_context:
4818 put_ctx(ctx);
4819 err_exit:
4820 return ERR_PTR(err);
4821}
4822EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
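/*
 * Illustrative sketch, not part of this file: in-kernel use of the exported
 * helper above, in the spirit of what the hw_breakpoint code does.  The
 * attribute values are assumptions; a perf_callback_t callback may be passed
 * instead of NULL to be notified when the event fires.
 */
#if 0
static struct perf_event *count_cycles_on_cpu(int cpu)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(attr),
	};

	/* pid == -1 binds the counter to the cpu rather than to a task */
	return perf_event_create_kernel_counter(&attr, cpu, -1, NULL);
}
#endif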
4823
4824/*
4825 * inherit an event from the parent task to the child task:
4826 */
4827static struct perf_event *
4828inherit_event(struct perf_event *parent_event,
4829 struct task_struct *parent,
4830 struct perf_event_context *parent_ctx,
4831 struct task_struct *child,
4832 struct perf_event *group_leader,
4833 struct perf_event_context *child_ctx)
4834{
4835 struct perf_event *child_event;
4836
4837 /*
4838 * Instead of creating recursive hierarchies of events,
4839 * we link inherited events back to the original parent,
4840	 * which is guaranteed to have a filp that we use as the
4841	 * reference count:
4842 */
4843 if (parent_event->parent)
4844 parent_event = parent_event->parent;
4845
4846 child_event = perf_event_alloc(&parent_event->attr,
4847 parent_event->cpu, child_ctx,
4848 group_leader, parent_event,
4849 NULL, GFP_KERNEL);
4850 if (IS_ERR(child_event))
4851 return child_event;
4852 get_ctx(child_ctx);
4853
4854 /*
4855 * Make the child state follow the state of the parent event,
4856 * not its attr.disabled bit. We hold the parent's mutex,
4857 * so we won't race with perf_event_{en, dis}able_family.
4858 */
4859 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4860 child_event->state = PERF_EVENT_STATE_INACTIVE;
4861 else
4862 child_event->state = PERF_EVENT_STATE_OFF;
4863
4864 if (parent_event->attr.freq)
4865 child_event->hw.sample_period = parent_event->hw.sample_period;
4866
4867 child_event->overflow_handler = parent_event->overflow_handler;
4868
4869 /*
4870 * Link it up in the child's context:
4871 */
4872 add_event_to_ctx(child_event, child_ctx);
4873
4874 /*
4875 * Get a reference to the parent filp - we will fput it
4876 * when the child event exits. This is safe to do because
4877 * we are in the parent and we know that the filp still
4878 * exists and has a nonzero count:
4879 */
4880 atomic_long_inc(&parent_event->filp->f_count);
4881
4882 /*
4883 * Link this into the parent event's child list
4884 */
4885 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4886 mutex_lock(&parent_event->child_mutex);
4887 list_add_tail(&child_event->child_list, &parent_event->child_list);
4888 mutex_unlock(&parent_event->child_mutex);
4889
4890 return child_event;
4891}
4892
4893static int inherit_group(struct perf_event *parent_event,
4894 struct task_struct *parent,
4895 struct perf_event_context *parent_ctx,
4896 struct task_struct *child,
4897 struct perf_event_context *child_ctx)
4898{
4899 struct perf_event *leader;
4900 struct perf_event *sub;
4901 struct perf_event *child_ctr;
4902
4903 leader = inherit_event(parent_event, parent, parent_ctx,
4904 child, NULL, child_ctx);
4905 if (IS_ERR(leader))
4906 return PTR_ERR(leader);
4907 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
4908 child_ctr = inherit_event(sub, parent, parent_ctx,
4909 child, leader, child_ctx);
4910 if (IS_ERR(child_ctr))
4911 return PTR_ERR(child_ctr);
4912 }
4913 return 0;
4914}
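/*
 * Illustrative sketch, not part of this file: the inheritance path above is
 * what makes "count this task and every child it forks" work.  User space
 * only needs to set attr.inherit before opening the event; note that
 * perf_event_alloc() above rejects inherit combined with PERF_FORMAT_GROUP.
 */
#if 0
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static int open_inherited_counter(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.inherit = 1;	/* cloned into children by inherit_event() */

	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}
#endif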
4915
4916static void sync_child_event(struct perf_event *child_event,
4917 struct task_struct *child)
4918{
4919 struct perf_event *parent_event = child_event->parent;
4920 u64 child_val;
4921
4922 if (child_event->attr.inherit_stat)
4923 perf_event_read_event(child_event, child);
4924
4925 child_val = atomic64_read(&child_event->count);
4926
4927 /*
4928 * Add back the child's count to the parent's count:
4929 */
4930 atomic64_add(child_val, &parent_event->count);
4931 atomic64_add(child_event->total_time_enabled,
4932 &parent_event->child_total_time_enabled);
4933 atomic64_add(child_event->total_time_running,
4934 &parent_event->child_total_time_running);
4935
4936 /*
4937 * Remove this event from the parent's list
4938 */
4939 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4940 mutex_lock(&parent_event->child_mutex);
4941 list_del_init(&child_event->child_list);
4942 mutex_unlock(&parent_event->child_mutex);
4943
4944 /*
4945 * Release the parent event, if this was the last
4946 * reference to it.
4947 */
4948 fput(parent_event->filp);
4949}
4950
4951static void
4952__perf_event_exit_task(struct perf_event *child_event,
4953 struct perf_event_context *child_ctx,
4954 struct task_struct *child)
4955{
4956 struct perf_event *parent_event;
4957
4958 perf_event_remove_from_context(child_event);
4959
4960 parent_event = child_event->parent;
4961 /*
4962	 * It can happen that the parent exits first, and has events
4963	 * that are still around due to the child reference. These
4964	 * events need to be zapped; otherwise they linger.
4965 */
4966 if (parent_event) {
4967 sync_child_event(child_event, child);
4968 free_event(child_event);
4969 }
4970}
4971
4972/*
4973 * When a child task exits, feed back event values to parent events.
4974 */
4975void perf_event_exit_task(struct task_struct *child)
4976{
4977 struct perf_event *child_event, *tmp;
4978 struct perf_event_context *child_ctx;
4979 unsigned long flags;
4980
4981 if (likely(!child->perf_event_ctxp)) {
4982 perf_event_task(child, NULL, 0);
4983 return;
4984 }
4985
4986 local_irq_save(flags);
4987 /*
4988 * We can't reschedule here because interrupts are disabled,
4989 * and either child is current or it is a task that can't be
4990 * scheduled, so we are now safe from rescheduling changing
4991 * our context.
4992 */
4993 child_ctx = child->perf_event_ctxp;
4994 __perf_event_task_sched_out(child_ctx);
4995
4996 /*
4997 * Take the context lock here so that if find_get_context is
4998 * reading child->perf_event_ctxp, we wait until it has
4999 * incremented the context's refcount before we do put_ctx below.
5000 */
5001 spin_lock(&child_ctx->lock);
5002 child->perf_event_ctxp = NULL;
5003 /*
5004	 * If this context is a clone, unclone it so it can't get
5005 * swapped to another process while we're removing all
5006 * the events from it.
5007 */
5008 unclone_ctx(child_ctx);
5009 update_context_time(child_ctx);
5010 spin_unlock_irqrestore(&child_ctx->lock, flags);
5011
5012 /*
5013 * Report the task dead after unscheduling the events so that we
5014 * won't get any samples after PERF_RECORD_EXIT. We can however still
5015 * get a few PERF_RECORD_READ events.
5016 */
5017 perf_event_task(child, child_ctx, 0);
5018
5019 /*
5020 * We can recurse on the same lock type through:
5021 *
5022 * __perf_event_exit_task()
5023 * sync_child_event()
5024 * fput(parent_event->filp)
5025 * perf_release()
5026 * mutex_lock(&ctx->mutex)
5027 *
5028	 * But since it's the parent context, it won't be the same instance.
5029 */
5030 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
5031
5032again:
5033 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
5034 group_entry)
5035 __perf_event_exit_task(child_event, child_ctx, child);
5036
5037 /*
5038 * If the last event was a group event, it will have appended all
5039	 * its siblings to the list, but we obtained 'tmp' before that, and
5040	 * it will still point to the list head terminating the iteration.
5041 */
5042 if (!list_empty(&child_ctx->group_list))
5043 goto again;
5044
5045 mutex_unlock(&child_ctx->mutex);
5046
5047 put_ctx(child_ctx);
5048}
5049
5050/*
5051 * free an unexposed, unused context as created by inheritance by
5052 * perf_event_init_task() below, used by fork() in case of failure.
5053 */
5054void perf_event_free_task(struct task_struct *task)
5055{
5056 struct perf_event_context *ctx = task->perf_event_ctxp;
5057 struct perf_event *event, *tmp;
5058
5059 if (!ctx)
5060 return;
5061
5062 mutex_lock(&ctx->mutex);
5063again:
5064 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
5065 struct perf_event *parent = event->parent;
5066
5067 if (WARN_ON_ONCE(!parent))
5068 continue;
5069
5070 mutex_lock(&parent->child_mutex);
5071 list_del_init(&event->child_list);
5072 mutex_unlock(&parent->child_mutex);
5073
5074 fput(parent->filp);
5075
5076 list_del_event(event, ctx);
5077 free_event(event);
5078 }
5079
5080 if (!list_empty(&ctx->group_list))
5081 goto again;
5082
5083 mutex_unlock(&ctx->mutex);
5084
5085 put_ctx(ctx);
5086}
5087
5088/*
5089 * Initialize the perf_event context in task_struct
5090 */
5091int perf_event_init_task(struct task_struct *child)
5092{
5093 struct perf_event_context *child_ctx, *parent_ctx;
5094 struct perf_event_context *cloned_ctx;
5095 struct perf_event *event;
5096 struct task_struct *parent = current;
5097 int inherited_all = 1;
5098 int ret = 0;
5099
5100 child->perf_event_ctxp = NULL;
5101
5102 mutex_init(&child->perf_event_mutex);
5103 INIT_LIST_HEAD(&child->perf_event_list);
5104
5105 if (likely(!parent->perf_event_ctxp))
5106 return 0;
5107
5108 /*
5109 * This is executed from the parent task context, so inherit
5110 * events that have been marked for cloning.
5111 * First allocate and initialize a context for the child.
5112 */
5113
5114 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
5115 if (!child_ctx)
5116 return -ENOMEM;
5117
5118 __perf_event_init_context(child_ctx, child);
5119 child->perf_event_ctxp = child_ctx;
5120 get_task_struct(child);
5121
5122 /*
5123 * If the parent's context is a clone, pin it so it won't get
5124 * swapped under us.
5125 */
5126 parent_ctx = perf_pin_task_context(parent);
5127
5128 /*
5129 * No need to check if parent_ctx != NULL here; since we saw
5130 * it non-NULL earlier, the only reason for it to become NULL
5131 * is if we exit, and since we're currently in the middle of
5132 * a fork we can't be exiting at the same time.
5133 */
5134
5135 /*
5136 * Lock the parent list. No need to lock the child - not PID
5137 * hashed yet and not running, so nobody can access it.
5138 */
5139 mutex_lock(&parent_ctx->mutex);
5140
5141 /*
5142	 * We don't have to disable NMIs - we are only looking at
5143 * the list, not manipulating it:
5144 */
5145 list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
5146
5147 if (!event->attr.inherit) {
5148 inherited_all = 0;
5149 continue;
5150 }
5151
5152 ret = inherit_group(event, parent, parent_ctx,
5153 child, child_ctx);
5154 if (ret) {
5155 inherited_all = 0;
5156 break;
5157 }
5158 }
5159
5160 if (inherited_all) {
5161 /*
5162 * Mark the child context as a clone of the parent
5163 * context, or of whatever the parent is a clone of.
5164 * Note that if the parent is a clone, it could get
5165 * uncloned at any point, but that doesn't matter
5166 * because the list of events and the generation
5167 * count can't have changed since we took the mutex.
5168 */
5169 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
5170 if (cloned_ctx) {
5171 child_ctx->parent_ctx = cloned_ctx;
5172 child_ctx->parent_gen = parent_ctx->parent_gen;
5173 } else {
5174 child_ctx->parent_ctx = parent_ctx;
5175 child_ctx->parent_gen = parent_ctx->generation;
5176 }
5177 get_ctx(child_ctx->parent_ctx);
5178 }
5179
5180 mutex_unlock(&parent_ctx->mutex);
5181
5182 perf_unpin_context(parent_ctx);
5183
5184 return ret;
5185}
5186
5187static void __cpuinit perf_event_init_cpu(int cpu)
5188{
5189 struct perf_cpu_context *cpuctx;
5190
5191 cpuctx = &per_cpu(perf_cpu_context, cpu);
5192 __perf_event_init_context(&cpuctx->ctx, NULL);
5193
5194 spin_lock(&perf_resource_lock);
5195 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5196 spin_unlock(&perf_resource_lock);
5197
5198 hw_perf_event_setup(cpu);
5199}
5200
5201#ifdef CONFIG_HOTPLUG_CPU
5202static void __perf_event_exit_cpu(void *info)
5203{
5204 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
5205 struct perf_event_context *ctx = &cpuctx->ctx;
5206 struct perf_event *event, *tmp;
5207
5208 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
5209 __perf_event_remove_from_context(event);
5210}
5211static void perf_event_exit_cpu(int cpu)
5212{
5213 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5214 struct perf_event_context *ctx = &cpuctx->ctx;
5215
5216 mutex_lock(&ctx->mutex);
5217 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5218 mutex_unlock(&ctx->mutex);
5219}
5220#else
5221static inline void perf_event_exit_cpu(int cpu) { }
5222#endif
5223
5224static int __cpuinit
5225perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5226{
5227 unsigned int cpu = (long)hcpu;
5228
5229 switch (action) {
5230
5231 case CPU_UP_PREPARE:
5232 case CPU_UP_PREPARE_FROZEN:
5233 perf_event_init_cpu(cpu);
5234 break;
5235
5236 case CPU_ONLINE:
5237 case CPU_ONLINE_FROZEN:
5238 hw_perf_event_setup_online(cpu);
5239 break;
5240
5241 case CPU_DOWN_PREPARE:
5242 case CPU_DOWN_PREPARE_FROZEN:
5243 perf_event_exit_cpu(cpu);
5244 break;
5245
5246 default:
5247 break;
5248 }
5249
5250 return NOTIFY_OK;
5251}
5252
5253/*
5254 * This has to have a higher priority than migration_notifier in sched.c.
5255 */
5256static struct notifier_block __cpuinitdata perf_cpu_nb = {
5257 .notifier_call = perf_cpu_notify,
5258 .priority = 20,
5259};
5260
5261void __init perf_event_init(void)
5262{
5263 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5264 (void *)(long)smp_processor_id());
5265 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
5266 (void *)(long)smp_processor_id());
5267 register_cpu_notifier(&perf_cpu_nb);
5268}
5269
5270static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
5271{
5272 return sprintf(buf, "%d\n", perf_reserved_percpu);
5273}
5274
5275static ssize_t
5276perf_set_reserve_percpu(struct sysdev_class *class,
5277 const char *buf,
5278 size_t count)
5279{
5280 struct perf_cpu_context *cpuctx;
5281 unsigned long val;
5282 int err, cpu, mpt;
5283
5284 err = strict_strtoul(buf, 10, &val);
5285 if (err)
5286 return err;
5287 if (val > perf_max_events)
5288 return -EINVAL;
5289
5290 spin_lock(&perf_resource_lock);
5291 perf_reserved_percpu = val;
5292 for_each_online_cpu(cpu) {
5293 cpuctx = &per_cpu(perf_cpu_context, cpu);
5294 spin_lock_irq(&cpuctx->ctx.lock);
5295 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5296 perf_max_events - perf_reserved_percpu);
5297 cpuctx->max_pertask = mpt;
5298 spin_unlock_irq(&cpuctx->ctx.lock);
5299 }
5300 spin_unlock(&perf_resource_lock);
5301
5302 return count;
5303}
5304
5305static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
5306{
5307 return sprintf(buf, "%d\n", perf_overcommit);
5308}
5309
5310static ssize_t
5311perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
5312{
5313 unsigned long val;
5314 int err;
5315
5316 err = strict_strtoul(buf, 10, &val);
5317 if (err)
5318 return err;
5319 if (val > 1)
5320 return -EINVAL;
5321
5322 spin_lock(&perf_resource_lock);
5323 perf_overcommit = val;
5324 spin_unlock(&perf_resource_lock);
5325
5326 return count;
5327}
5328
5329static SYSDEV_CLASS_ATTR(
5330 reserve_percpu,
5331 0644,
5332 perf_show_reserve_percpu,
5333 perf_set_reserve_percpu
5334 );
5335
5336static SYSDEV_CLASS_ATTR(
5337 overcommit,
5338 0644,
5339 perf_show_overcommit,
5340 perf_set_overcommit
5341 );
5342
5343static struct attribute *perfclass_attrs[] = {
5344 &attr_reserve_percpu.attr,
5345 &attr_overcommit.attr,
5346 NULL
5347};
5348
5349static struct attribute_group perfclass_attr_group = {
5350 .attrs = perfclass_attrs,
5351 .name = "perf_events",
5352};
5353
5354static int __init perf_event_sysfs_init(void)
5355{
5356 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5357 &perfclass_attr_group);
5358}
5359device_initcall(perf_event_sysfs_init);
diff --git a/kernel/pid.c b/kernel/pid.c
index 31310b5d3f50..d3f722d20f9c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -40,7 +40,7 @@
40#define pid_hashfn(nr, ns) \ 40#define pid_hashfn(nr, ns) \
41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
42static struct hlist_head *pid_hash; 42static struct hlist_head *pid_hash;
43static int pidhash_shift; 43static unsigned int pidhash_shift = 4;
44struct pid init_struct_pid = INIT_STRUCT_PID; 44struct pid init_struct_pid = INIT_STRUCT_PID;
45 45
46int pid_max = PID_MAX_DEFAULT; 46int pid_max = PID_MAX_DEFAULT;
@@ -499,19 +499,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
499void __init pidhash_init(void) 499void __init pidhash_init(void)
500{ 500{
501 int i, pidhash_size; 501 int i, pidhash_size;
502 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
503 502
504 pidhash_shift = max(4, fls(megabytes * 4)); 503 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
505 pidhash_shift = min(12, pidhash_shift); 504 HASH_EARLY | HASH_SMALL,
505 &pidhash_shift, NULL, 4096);
506 pidhash_size = 1 << pidhash_shift; 506 pidhash_size = 1 << pidhash_shift;
507 507
508 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
509 pidhash_size, pidhash_shift,
510 pidhash_size * sizeof(struct hlist_head));
511
512 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
513 if (!pid_hash)
514 panic("Could not alloc pidhash!\n");
515 for (i = 0; i < pidhash_size; i++) 508 for (i = 0; i < pidhash_size; i++)
516 INIT_HLIST_HEAD(&pid_hash[i]); 509 INIT_HLIST_HEAD(&pid_hash[i]);
517} 510}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 821722ae58a7..86b3796b0436 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
118{ 118{
119 if (!(flags & CLONE_NEWPID)) 119 if (!(flags & CLONE_NEWPID))
120 return get_pid_ns(old_ns); 120 return get_pid_ns(old_ns);
121 if (flags & CLONE_THREAD) 121 if (flags & (CLONE_THREAD|CLONE_PARENT))
122 return ERR_PTR(-EINVAL); 122 return ERR_PTR(-EINVAL);
123 return create_pid_namespace(old_ns); 123 return create_pid_namespace(old_ns);
124} 124}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e33a21cb9407..5c9dc228747b 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -8,17 +8,18 @@
8#include <linux/math64.h> 8#include <linux/math64.h>
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h>
11 12
12/* 13/*
13 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 14 * Called after updating RLIMIT_CPU to set timer expiration if necessary.
14 */ 15 */
15void update_rlimit_cpu(unsigned long rlim_new) 16void update_rlimit_cpu(unsigned long rlim_new)
16{ 17{
17 cputime_t cputime; 18 cputime_t cputime = secs_to_cputime(rlim_new);
19 struct signal_struct *const sig = current->signal;
18 20
19 cputime = secs_to_cputime(rlim_new); 21 if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) ||
20 if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || 22 cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) {
21 cputime_gt(current->signal->it_prof_expires, cputime)) {
22 spin_lock_irq(&current->sighand->siglock); 23 spin_lock_irq(&current->sighand->siglock);
23 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); 24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
24 spin_unlock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&current->sighand->siglock);
@@ -542,6 +543,17 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
542 now); 543 now);
543} 544}
544 545
546static inline int expires_gt(cputime_t expires, cputime_t new_exp)
547{
548 return cputime_eq(expires, cputime_zero) ||
549 cputime_gt(expires, new_exp);
550}
551
552static inline int expires_le(cputime_t expires, cputime_t new_exp)
553{
554 return !cputime_eq(expires, cputime_zero) &&
555 cputime_le(expires, new_exp);
556}
545/* 557/*
546 * Insert the timer on the appropriate list before any timers that 558 * Insert the timer on the appropriate list before any timers that
547 * expire later. This must be called with the tasklist_lock held 559 * expire later. This must be called with the tasklist_lock held
@@ -586,34 +598,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
586 */ 598 */
587 599
588 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 600 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
601 union cpu_time_count *exp = &nt->expires;
602
589 switch (CPUCLOCK_WHICH(timer->it_clock)) { 603 switch (CPUCLOCK_WHICH(timer->it_clock)) {
590 default: 604 default:
591 BUG(); 605 BUG();
592 case CPUCLOCK_PROF: 606 case CPUCLOCK_PROF:
593 if (cputime_eq(p->cputime_expires.prof_exp, 607 if (expires_gt(p->cputime_expires.prof_exp,
594 cputime_zero) || 608 exp->cpu))
595 cputime_gt(p->cputime_expires.prof_exp, 609 p->cputime_expires.prof_exp = exp->cpu;
596 nt->expires.cpu))
597 p->cputime_expires.prof_exp =
598 nt->expires.cpu;
599 break; 610 break;
600 case CPUCLOCK_VIRT: 611 case CPUCLOCK_VIRT:
601 if (cputime_eq(p->cputime_expires.virt_exp, 612 if (expires_gt(p->cputime_expires.virt_exp,
602 cputime_zero) || 613 exp->cpu))
603 cputime_gt(p->cputime_expires.virt_exp, 614 p->cputime_expires.virt_exp = exp->cpu;
604 nt->expires.cpu))
605 p->cputime_expires.virt_exp =
606 nt->expires.cpu;
607 break; 615 break;
608 case CPUCLOCK_SCHED: 616 case CPUCLOCK_SCHED:
609 if (p->cputime_expires.sched_exp == 0 || 617 if (p->cputime_expires.sched_exp == 0 ||
610 p->cputime_expires.sched_exp > 618 p->cputime_expires.sched_exp > exp->sched)
611 nt->expires.sched)
612 p->cputime_expires.sched_exp = 619 p->cputime_expires.sched_exp =
613 nt->expires.sched; 620 exp->sched;
614 break; 621 break;
615 } 622 }
616 } else { 623 } else {
624 struct signal_struct *const sig = p->signal;
625 union cpu_time_count *exp = &timer->it.cpu.expires;
626
617 /* 627 /*
618 * For a process timer, set the cached expiration time. 628 * For a process timer, set the cached expiration time.
619 */ 629 */
@@ -621,30 +631,23 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
621 default: 631 default:
622 BUG(); 632 BUG();
623 case CPUCLOCK_VIRT: 633 case CPUCLOCK_VIRT:
624 if (!cputime_eq(p->signal->it_virt_expires, 634 if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
625 cputime_zero) && 635 exp->cpu))
626 cputime_lt(p->signal->it_virt_expires,
627 timer->it.cpu.expires.cpu))
628 break; 636 break;
629 p->signal->cputime_expires.virt_exp = 637 sig->cputime_expires.virt_exp = exp->cpu;
630 timer->it.cpu.expires.cpu;
631 break; 638 break;
632 case CPUCLOCK_PROF: 639 case CPUCLOCK_PROF:
633 if (!cputime_eq(p->signal->it_prof_expires, 640 if (expires_le(sig->it[CPUCLOCK_PROF].expires,
634 cputime_zero) && 641 exp->cpu))
635 cputime_lt(p->signal->it_prof_expires,
636 timer->it.cpu.expires.cpu))
637 break; 642 break;
638 i = p->signal->rlim[RLIMIT_CPU].rlim_cur; 643 i = sig->rlim[RLIMIT_CPU].rlim_cur;
639 if (i != RLIM_INFINITY && 644 if (i != RLIM_INFINITY &&
640 i <= cputime_to_secs(timer->it.cpu.expires.cpu)) 645 i <= cputime_to_secs(exp->cpu))
641 break; 646 break;
642 p->signal->cputime_expires.prof_exp = 647 sig->cputime_expires.prof_exp = exp->cpu;
643 timer->it.cpu.expires.cpu;
644 break; 648 break;
645 case CPUCLOCK_SCHED: 649 case CPUCLOCK_SCHED:
646 p->signal->cputime_expires.sched_exp = 650 sig->cputime_expires.sched_exp = exp->sched;
647 timer->it.cpu.expires.sched;
648 break; 651 break;
649 } 652 }
650 } 653 }
@@ -1071,6 +1074,40 @@ static void stop_process_timers(struct task_struct *tsk)
1071 spin_unlock_irqrestore(&cputimer->lock, flags); 1074 spin_unlock_irqrestore(&cputimer->lock, flags);
1072} 1075}
1073 1076
1077static u32 onecputick;
1078
1079static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1080 cputime_t *expires, cputime_t cur_time, int signo)
1081{
1082 if (cputime_eq(it->expires, cputime_zero))
1083 return;
1084
1085 if (cputime_ge(cur_time, it->expires)) {
1086 if (!cputime_eq(it->incr, cputime_zero)) {
1087 it->expires = cputime_add(it->expires, it->incr);
1088 it->error += it->incr_error;
1089 if (it->error >= onecputick) {
1090 it->expires = cputime_sub(it->expires,
1091 cputime_one_jiffy);
1092 it->error -= onecputick;
1093 }
1094 } else {
1095 it->expires = cputime_zero;
1096 }
1097
1098 trace_itimer_expire(signo == SIGPROF ?
1099 ITIMER_PROF : ITIMER_VIRTUAL,
1100 tsk->signal->leader_pid, cur_time);
1101 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
1102 }
1103
1104 if (!cputime_eq(it->expires, cputime_zero) &&
1105 (cputime_eq(*expires, cputime_zero) ||
1106 cputime_lt(it->expires, *expires))) {
1107 *expires = it->expires;
1108 }
1109}
1110
1074/* 1111/*
1075 * Check for any per-thread CPU timers that have fired and move them 1112 * Check for any per-thread CPU timers that have fired and move them
1076 * off the tsk->*_timers list onto the firing list. Per-thread timers 1113 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1090,10 +1127,10 @@ static void check_process_timers(struct task_struct *tsk,
1090 * Don't sample the current process CPU clocks if there are no timers. 1127 * Don't sample the current process CPU clocks if there are no timers.
1091 */ 1128 */
1092 if (list_empty(&timers[CPUCLOCK_PROF]) && 1129 if (list_empty(&timers[CPUCLOCK_PROF]) &&
1093 cputime_eq(sig->it_prof_expires, cputime_zero) && 1130 cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
1094 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && 1131 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1095 list_empty(&timers[CPUCLOCK_VIRT]) && 1132 list_empty(&timers[CPUCLOCK_VIRT]) &&
1096 cputime_eq(sig->it_virt_expires, cputime_zero) && 1133 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1097 list_empty(&timers[CPUCLOCK_SCHED])) { 1134 list_empty(&timers[CPUCLOCK_SCHED])) {
1098 stop_process_timers(tsk); 1135 stop_process_timers(tsk);
1099 return; 1136 return;
@@ -1153,38 +1190,11 @@ static void check_process_timers(struct task_struct *tsk,
1153 /* 1190 /*
1154 * Check for the special case process timers. 1191 * Check for the special case process timers.
1155 */ 1192 */
1156 if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { 1193 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
1157 if (cputime_ge(ptime, sig->it_prof_expires)) { 1194 SIGPROF);
1158 /* ITIMER_PROF fires and reloads. */ 1195 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1159 sig->it_prof_expires = sig->it_prof_incr; 1196 SIGVTALRM);
1160 if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { 1197
1161 sig->it_prof_expires = cputime_add(
1162 sig->it_prof_expires, ptime);
1163 }
1164 __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk);
1165 }
1166 if (!cputime_eq(sig->it_prof_expires, cputime_zero) &&
1167 (cputime_eq(prof_expires, cputime_zero) ||
1168 cputime_lt(sig->it_prof_expires, prof_expires))) {
1169 prof_expires = sig->it_prof_expires;
1170 }
1171 }
1172 if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
1173 if (cputime_ge(utime, sig->it_virt_expires)) {
1174 /* ITIMER_VIRTUAL fires and reloads. */
1175 sig->it_virt_expires = sig->it_virt_incr;
1176 if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
1177 sig->it_virt_expires = cputime_add(
1178 sig->it_virt_expires, utime);
1179 }
1180 __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk);
1181 }
1182 if (!cputime_eq(sig->it_virt_expires, cputime_zero) &&
1183 (cputime_eq(virt_expires, cputime_zero) ||
1184 cputime_lt(sig->it_virt_expires, virt_expires))) {
1185 virt_expires = sig->it_virt_expires;
1186 }
1187 }
1188 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1198 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
1189 unsigned long psecs = cputime_to_secs(ptime); 1199 unsigned long psecs = cputime_to_secs(ptime);
1190 cputime_t x; 1200 cputime_t x;
@@ -1457,7 +1467,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1457 if (!cputime_eq(*oldval, cputime_zero)) { 1467 if (!cputime_eq(*oldval, cputime_zero)) {
1458 if (cputime_le(*oldval, now.cpu)) { 1468 if (cputime_le(*oldval, now.cpu)) {
1459 /* Just about to fire. */ 1469 /* Just about to fire. */
1460 *oldval = jiffies_to_cputime(1); 1470 *oldval = cputime_one_jiffy;
1461 } else { 1471 } else {
1462 *oldval = cputime_sub(*oldval, now.cpu); 1472 *oldval = cputime_sub(*oldval, now.cpu);
1463 } 1473 }
@@ -1703,10 +1713,15 @@ static __init int init_posix_cpu_timers(void)
1703 .nsleep = thread_cpu_nsleep, 1713 .nsleep = thread_cpu_nsleep,
1704 .nsleep_restart = thread_cpu_nsleep_restart, 1714 .nsleep_restart = thread_cpu_nsleep_restart,
1705 }; 1715 };
1716 struct timespec ts;
1706 1717
1707 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1718 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1708 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); 1719 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1709 1720
1721 cputime_to_timespec(cputime_one_jiffy, &ts);
1722 onecputick = ts.tv_nsec;
1723 WARN_ON(ts.tv_sec != 0);
1724
1710 return 0; 1725 return 0;
1711} 1726}
1712__initcall(init_posix_cpu_timers); 1727__initcall(init_posix_cpu_timers);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d089d052c4a9..495440779ce3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
242 return 0; 242 return 0;
243} 243}
244 244
245
246static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
247{
248 *tp = current_kernel_time();
249 return 0;
250}
251
252static int posix_get_monotonic_coarse(clockid_t which_clock,
253 struct timespec *tp)
254{
255 *tp = get_monotonic_coarse();
256 return 0;
257}
258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0;
263}
245/* 264/*
246 * Initialize everything, well, just everything in Posix clocks/timers ;) 265 * Initialize everything, well, just everything in Posix clocks/timers ;)
247 */ 266 */
@@ -262,10 +281,26 @@ static __init int init_posix_timers(void)
262 .timer_create = no_timer_create, 281 .timer_create = no_timer_create,
263 .nsleep = no_nsleep, 282 .nsleep = no_nsleep,
264 }; 283 };
284 struct k_clock clock_realtime_coarse = {
285 .clock_getres = posix_get_coarse_res,
286 .clock_get = posix_get_realtime_coarse,
287 .clock_set = do_posix_clock_nosettime,
288 .timer_create = no_timer_create,
289 .nsleep = no_nsleep,
290 };
291 struct k_clock clock_monotonic_coarse = {
292 .clock_getres = posix_get_coarse_res,
293 .clock_get = posix_get_monotonic_coarse,
294 .clock_set = do_posix_clock_nosettime,
295 .timer_create = no_timer_create,
296 .nsleep = no_nsleep,
297 };
265 298
266 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 299 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
267 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 300 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
268 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); 301 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
302 register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
303 register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
269 304
270 posix_timers_cache = kmem_cache_create("posix_timers_cache", 305 posix_timers_cache = kmem_cache_create("posix_timers_cache",
271 sizeof (struct k_itimer), 0, SLAB_PANIC, 306 sizeof (struct k_itimer), 0, SLAB_PANIC,
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 72067cbdb37f..91e09d3b2eb2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -208,3 +208,17 @@ config APM_EMULATION
208 random kernel OOPSes or reboots that don't seem to be related to 208 random kernel OOPSes or reboots that don't seem to be related to
209 anything, try disabling/enabling this option (or disabling/enabling 209 anything, try disabling/enabling this option (or disabling/enabling
210 APM in your BIOS). 210 APM in your BIOS).
211
212config PM_RUNTIME
213 bool "Run-time PM core functionality"
214 depends on PM
215 ---help---
216 Enable functionality allowing I/O devices to be put into energy-saving
217 (low power) states at run time (or autosuspended) after a specified
218 period of inactivity and woken up in response to a hardware-generated
219 wake-up event or a driver's request.
220
221 Hardware support is generally required for this functionality to work
222 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and
224 wake-up events.
diff --git a/kernel/power/console.c b/kernel/power/console.c
index a3961b205de7..5187136fe1de 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -14,56 +14,13 @@
14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
15 15
16static int orig_fgconsole, orig_kmsg; 16static int orig_fgconsole, orig_kmsg;
17static int disable_vt_switch;
18
19/*
20 * Normally during a suspend, we allocate a new console and switch to it.
21 * When we resume, we switch back to the original console. This switch
22 * can be slow, so on systems where the framebuffer can handle restoration
23 * of video registers anyways, there's little point in doing the console
24 * switch. This function allows you to disable it by passing it '0'.
25 */
26void pm_set_vt_switch(int do_switch)
27{
28 acquire_console_sem();
29 disable_vt_switch = !do_switch;
30 release_console_sem();
31}
32EXPORT_SYMBOL(pm_set_vt_switch);
33 17
34int pm_prepare_console(void) 18int pm_prepare_console(void)
35{ 19{
36 acquire_console_sem(); 20 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
37 21 if (orig_fgconsole < 0)
38 if (disable_vt_switch) {
39 release_console_sem();
40 return 0;
41 }
42
43 orig_fgconsole = fg_console;
44
45 if (vc_allocate(SUSPEND_CONSOLE)) {
46 /* we can't have a free VC for now. Too bad,
47 * we don't want to mess the screen for now. */
48 release_console_sem();
49 return 1; 22 return 1;
50 }
51 23
52 if (set_console(SUSPEND_CONSOLE)) {
53 /*
54 * We're unable to switch to the SUSPEND_CONSOLE.
55 * Let the calling function know so it can decide
56 * what to do.
57 */
58 release_console_sem();
59 return 1;
60 }
61 release_console_sem();
62
63 if (vt_waitactive(SUSPEND_CONSOLE)) {
64 pr_debug("Suspend: Can't switch VCs.");
65 return 1;
66 }
67 orig_kmsg = kmsg_redirect; 24 orig_kmsg = kmsg_redirect;
68 kmsg_redirect = SUSPEND_CONSOLE; 25 kmsg_redirect = SUSPEND_CONSOLE;
69 return 0; 26 return 0;
@@ -71,19 +28,9 @@ int pm_prepare_console(void)
71 28
72void pm_restore_console(void) 29void pm_restore_console(void)
73{ 30{
74 acquire_console_sem(); 31 if (orig_fgconsole >= 0) {
75 if (disable_vt_switch) { 32 vt_move_to_console(orig_fgconsole, 0);
76 release_console_sem(); 33 kmsg_redirect = orig_kmsg;
77 return;
78 }
79 set_console(orig_fgconsole);
80 release_console_sem();
81
82 if (vt_waitactive(orig_fgconsole)) {
83 pr_debug("Resume: Can't switch VCs.");
84 return;
85 } 34 }
86
87 kmsg_redirect = orig_kmsg;
88} 35}
89#endif 36#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 81d2e7464893..04a9e90d248f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -298,8 +298,8 @@ int hibernation_snapshot(int platform_mode)
298 if (error) 298 if (error)
299 return error; 299 return error;
300 300
301 /* Free memory before shutting down devices. */ 301 /* Preallocate image memory before shutting down devices. */
302 error = swsusp_shrink_memory(); 302 error = hibernate_preallocate_memory();
303 if (error) 303 if (error)
304 goto Close; 304 goto Close;
305 305
@@ -315,6 +315,10 @@ int hibernation_snapshot(int platform_mode)
315 /* Control returns here after successful restore */ 315 /* Control returns here after successful restore */
316 316
317 Resume_devices: 317 Resume_devices:
318 /* We may need to release the preallocated image pages here. */
319 if (error || !in_suspend)
320 swsusp_free();
321
318 dpm_resume_end(in_suspend ? 322 dpm_resume_end(in_suspend ?
319 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 323 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
320 resume_console(); 324 resume_console();
@@ -460,11 +464,11 @@ int hibernation_platform_enter(void)
460 464
461 error = hibernation_ops->prepare(); 465 error = hibernation_ops->prepare();
462 if (error) 466 if (error)
463 goto Platofrm_finish; 467 goto Platform_finish;
464 468
465 error = disable_nonboot_cpus(); 469 error = disable_nonboot_cpus();
466 if (error) 470 if (error)
467 goto Platofrm_finish; 471 goto Platform_finish;
468 472
469 local_irq_disable(); 473 local_irq_disable();
470 sysdev_suspend(PMSG_HIBERNATE); 474 sysdev_suspend(PMSG_HIBERNATE);
@@ -476,7 +480,7 @@ int hibernation_platform_enter(void)
476 * We don't need to reenable the nonboot CPUs or resume consoles, since 480 * We don't need to reenable the nonboot CPUs or resume consoles, since
477 * the system is going to be halted anyway. 481 * the system is going to be halted anyway.
478 */ 482 */
479 Platofrm_finish: 483 Platform_finish:
480 hibernation_ops->finish(); 484 hibernation_ops->finish();
481 485
482 dpm_suspend_noirq(PMSG_RESTORE); 486 dpm_suspend_noirq(PMSG_RESTORE);
@@ -578,7 +582,10 @@ int hibernate(void)
578 goto Thaw; 582 goto Thaw;
579 583
580 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 584 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
581 if (in_suspend && !error) { 585 if (error)
586 goto Thaw;
587
588 if (in_suspend) {
582 unsigned int flags = 0; 589 unsigned int flags = 0;
583 590
584 if (hibernation_mode == HIBERNATION_PLATFORM) 591 if (hibernation_mode == HIBERNATION_PLATFORM)
@@ -590,8 +597,8 @@ int hibernate(void)
590 power_down(); 597 power_down();
591 } else { 598 } else {
592 pr_debug("PM: Image restored successfully.\n"); 599 pr_debug("PM: Image restored successfully.\n");
593 swsusp_free();
594 } 600 }
601
595 Thaw: 602 Thaw:
596 thaw_processes(); 603 thaw_processes();
597 Finish: 604 Finish:
@@ -686,21 +693,22 @@ static int software_resume(void)
686 /* The snapshot device should not be opened while we're running */ 693 /* The snapshot device should not be opened while we're running */
687 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 694 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
688 error = -EBUSY; 695 error = -EBUSY;
696 swsusp_close(FMODE_READ);
689 goto Unlock; 697 goto Unlock;
690 } 698 }
691 699
692 pm_prepare_console(); 700 pm_prepare_console();
693 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 701 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
694 if (error) 702 if (error)
695 goto Finish; 703 goto close_finish;
696 704
697 error = usermodehelper_disable(); 705 error = usermodehelper_disable();
698 if (error) 706 if (error)
699 goto Finish; 707 goto close_finish;
700 708
701 error = create_basic_memory_bitmaps(); 709 error = create_basic_memory_bitmaps();
702 if (error) 710 if (error)
703 goto Finish; 711 goto close_finish;
704 712
705 pr_debug("PM: Preparing processes for restore.\n"); 713 pr_debug("PM: Preparing processes for restore.\n");
706 error = prepare_processes(); 714 error = prepare_processes();
@@ -712,6 +720,7 @@ static int software_resume(void)
712 pr_debug("PM: Reading hibernation image.\n"); 720 pr_debug("PM: Reading hibernation image.\n");
713 721
714 error = swsusp_read(&flags); 722 error = swsusp_read(&flags);
723 swsusp_close(FMODE_READ);
715 if (!error) 724 if (!error)
716 hibernation_restore(flags & SF_PLATFORM_MODE); 725 hibernation_restore(flags & SF_PLATFORM_MODE);
717 726
@@ -730,6 +739,9 @@ static int software_resume(void)
730 mutex_unlock(&pm_mutex); 739 mutex_unlock(&pm_mutex);
731 pr_debug("PM: Resume from disk failed.\n"); 740 pr_debug("PM: Resume from disk failed.\n");
732 return error; 741 return error;
742close_finish:
743 swsusp_close(FMODE_READ);
744 goto Finish;
733} 745}
734 746
735late_initcall(software_resume); 747late_initcall(software_resume);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f710e36930cc..347d2cc88cd0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -11,6 +11,7 @@
11#include <linux/kobject.h> 11#include <linux/kobject.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/resume-trace.h> 13#include <linux/resume-trace.h>
14#include <linux/workqueue.h>
14 15
15#include "power.h" 16#include "power.h"
16 17
@@ -217,8 +218,24 @@ static struct attribute_group attr_group = {
217 .attrs = g, 218 .attrs = g,
218}; 219};
219 220
221#ifdef CONFIG_PM_RUNTIME
222struct workqueue_struct *pm_wq;
223
224static int __init pm_start_workqueue(void)
225{
226 pm_wq = create_freezeable_workqueue("pm");
227
228 return pm_wq ? 0 : -ENOMEM;
229}
230#else
231static inline int pm_start_workqueue(void) { return 0; }
232#endif
233
220static int __init pm_init(void) 234static int __init pm_init(void)
221{ 235{
236 int error = pm_start_workqueue();
237 if (error)
238 return error;
222 power_kobj = kobject_create_and_add("power", NULL); 239 power_kobj = kobject_create_and_add("power", NULL);
223 if (!power_kobj) 240 if (!power_kobj)
224 return -ENOMEM; 241 return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 26d5a26f82e3..46c5a26630a3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
74 74
75extern int create_basic_memory_bitmaps(void); 75extern int create_basic_memory_bitmaps(void);
76extern void free_basic_memory_bitmaps(void); 76extern void free_basic_memory_bitmaps(void);
77extern int swsusp_shrink_memory(void); 77extern int hibernate_preallocate_memory(void);
78 78
79/** 79/**
80 * Auxiliary structure used for reading the snapshot image data and 80 * Auxiliary structure used for reading the snapshot image data and
diff --git a/kernel/power/process.c b/kernel/power/process.c
index da2072d73811..cc2e55373b68 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -9,6 +9,7 @@
9#undef DEBUG 9#undef DEBUG
10 10
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/oom.h>
12#include <linux/suspend.h> 13#include <linux/suspend.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/syscalls.h> 15#include <linux/syscalls.h>
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 523a451b45d3..36cb168e4330 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -233,7 +233,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
233 233
234#define BM_END_OF_MAP (~0UL) 234#define BM_END_OF_MAP (~0UL)
235 235
236#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) 236#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE)
237 237
238struct bm_block { 238struct bm_block {
239 struct list_head hook; /* hook into a list of bitmap blocks */ 239 struct list_head hook; /* hook into a list of bitmap blocks */
@@ -275,7 +275,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
275 275
276/** 276/**
277 * create_bm_block_list - create a list of block bitmap objects 277 * create_bm_block_list - create a list of block bitmap objects
278 * @nr_blocks - number of blocks to allocate 278 * @pages - number of pages to track
279 * @list - list to put the allocated blocks into 279 * @list - list to put the allocated blocks into
280 * @ca - chain allocator to be used for allocating memory 280 * @ca - chain allocator to be used for allocating memory
281 */ 281 */
@@ -619,7 +619,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
619 BUG_ON(!region); 619 BUG_ON(!region);
620 } else 620 } else
621 /* This allocation cannot fail */ 621 /* This allocation cannot fail */
622 region = alloc_bootmem_low(sizeof(struct nosave_region)); 622 region = alloc_bootmem(sizeof(struct nosave_region));
623 region->start_pfn = start_pfn; 623 region->start_pfn = start_pfn;
624 region->end_pfn = end_pfn; 624 region->end_pfn = end_pfn;
625 list_add_tail(&region->list, &nosave_regions); 625 list_add_tail(&region->list, &nosave_regions);
@@ -853,7 +853,7 @@ static unsigned int count_highmem_pages(void)
853 struct zone *zone; 853 struct zone *zone;
854 unsigned int n = 0; 854 unsigned int n = 0;
855 855
856 for_each_zone(zone) { 856 for_each_populated_zone(zone) {
857 unsigned long pfn, max_zone_pfn; 857 unsigned long pfn, max_zone_pfn;
858 858
859 if (!is_highmem(zone)) 859 if (!is_highmem(zone))
@@ -916,7 +916,7 @@ static unsigned int count_data_pages(void)
916 unsigned long pfn, max_zone_pfn; 916 unsigned long pfn, max_zone_pfn;
917 unsigned int n = 0; 917 unsigned int n = 0;
918 918
919 for_each_zone(zone) { 919 for_each_populated_zone(zone) {
920 if (is_highmem(zone)) 920 if (is_highmem(zone))
921 continue; 921 continue;
922 922
@@ -1010,7 +1010,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1010 struct zone *zone; 1010 struct zone *zone;
1011 unsigned long pfn; 1011 unsigned long pfn;
1012 1012
1013 for_each_zone(zone) { 1013 for_each_populated_zone(zone) {
1014 unsigned long max_zone_pfn; 1014 unsigned long max_zone_pfn;
1015 1015
1016 mark_free_pages(zone); 1016 mark_free_pages(zone);
@@ -1033,6 +1033,25 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1033static unsigned int nr_copy_pages; 1033static unsigned int nr_copy_pages;
1034/* Number of pages needed for saving the original pfns of the image pages */ 1034/* Number of pages needed for saving the original pfns of the image pages */
1035static unsigned int nr_meta_pages; 1035static unsigned int nr_meta_pages;
1036/*
1037 * Numbers of normal and highmem page frames allocated for hibernation image
1038 * before suspending devices.
1039 */
1040unsigned int alloc_normal, alloc_highmem;
1041/*
1042 * Memory bitmap used for marking saveable pages (during hibernation) or
1043 * hibernation image pages (during restore)
1044 */
1045static struct memory_bitmap orig_bm;
1046/*
1047 * Memory bitmap used during hibernation for marking allocated page frames that
1048 * will contain copies of saveable pages. During restore it is initially used
1049 * for marking hibernation image pages, but then the set bits from it are
1050 * duplicated in @orig_bm and it is released. On highmem systems it is next
1051 * used for marking "safe" highmem pages, but it has to be reinitialized for
1052 * this purpose.
1053 */
1054static struct memory_bitmap copy_bm;
1036 1055
1037/** 1056/**
1038 * swsusp_free - free pages allocated for the suspend. 1057 * swsusp_free - free pages allocated for the suspend.
@@ -1046,7 +1065,7 @@ void swsusp_free(void)
1046 struct zone *zone; 1065 struct zone *zone;
1047 unsigned long pfn, max_zone_pfn; 1066 unsigned long pfn, max_zone_pfn;
1048 1067
1049 for_each_zone(zone) { 1068 for_each_populated_zone(zone) {
1050 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1069 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1051 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1070 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1052 if (pfn_valid(pfn)) { 1071 if (pfn_valid(pfn)) {
@@ -1064,74 +1083,286 @@ void swsusp_free(void)
1064 nr_meta_pages = 0; 1083 nr_meta_pages = 0;
1065 restore_pblist = NULL; 1084 restore_pblist = NULL;
1066 buffer = NULL; 1085 buffer = NULL;
1086 alloc_normal = 0;
1087 alloc_highmem = 0;
1067} 1088}
1068 1089
1090/* Helper functions used for the shrinking of memory. */
1091
1092#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN)
1093
1069/** 1094/**
1070 * swsusp_shrink_memory - Try to free as much memory as needed 1095 * preallocate_image_pages - Allocate a number of pages for hibernation image
1071 * 1096 * @nr_pages: Number of page frames to allocate.
1072 * ... but do not OOM-kill anyone 1097 * @mask: GFP flags to use for the allocation.
1073 * 1098 *
1074 * Notice: all userland should be stopped before it is called, or 1099 * Return value: Number of page frames actually allocated
1075 * livelock is possible. 1100 */
1101static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
1102{
1103 unsigned long nr_alloc = 0;
1104
1105 while (nr_pages > 0) {
1106 struct page *page;
1107
1108 page = alloc_image_page(mask);
1109 if (!page)
1110 break;
1111 memory_bm_set_bit(&copy_bm, page_to_pfn(page));
1112 if (PageHighMem(page))
1113 alloc_highmem++;
1114 else
1115 alloc_normal++;
1116 nr_pages--;
1117 nr_alloc++;
1118 }
1119
1120 return nr_alloc;
1121}
1122
1123static unsigned long preallocate_image_memory(unsigned long nr_pages)
1124{
1125 return preallocate_image_pages(nr_pages, GFP_IMAGE);
1126}
1127
1128#ifdef CONFIG_HIGHMEM
1129static unsigned long preallocate_image_highmem(unsigned long nr_pages)
1130{
1131 return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM);
1132}
1133
1134/**
1135 * __fraction - Compute (an approximation of) x * (multiplier / base)
1076 */ 1136 */
1137static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
1138{
1139 x *= multiplier;
1140 do_div(x, base);
1141 return (unsigned long)x;
1142}
1143
1144static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1145 unsigned long highmem,
1146 unsigned long total)
1147{
1148 unsigned long alloc = __fraction(nr_pages, highmem, total);
1077 1149
1078#define SHRINK_BITE 10000 1150 return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM);
1079static inline unsigned long __shrink_memory(long tmp) 1151}
1152#else /* CONFIG_HIGHMEM */
1153static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
1080{ 1154{
1081 if (tmp > SHRINK_BITE) 1155 return 0;
1082 tmp = SHRINK_BITE;
1083 return shrink_all_memory(tmp);
1084} 1156}
1085 1157
1086int swsusp_shrink_memory(void) 1158static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1159 unsigned long highmem,
1160 unsigned long total)
1161{
1162 return 0;
1163}
1164#endif /* CONFIG_HIGHMEM */
1165
1166/**
1167 * free_unnecessary_pages - Release preallocated pages not needed for the image
1168 */
1169static void free_unnecessary_pages(void)
1170{
1171 unsigned long save_highmem, to_free_normal, to_free_highmem;
1172
1173 to_free_normal = alloc_normal - count_data_pages();
1174 save_highmem = count_highmem_pages();
1175 if (alloc_highmem > save_highmem) {
1176 to_free_highmem = alloc_highmem - save_highmem;
1177 } else {
1178 to_free_highmem = 0;
1179 to_free_normal -= save_highmem - alloc_highmem;
1180 }
1181
1182 memory_bm_position_reset(&copy_bm);
1183
1184 while (to_free_normal > 0 && to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn);
1187
1188 if (PageHighMem(page)) {
1189 if (!to_free_highmem)
1190 continue;
1191 to_free_highmem--;
1192 alloc_highmem--;
1193 } else {
1194 if (!to_free_normal)
1195 continue;
1196 to_free_normal--;
1197 alloc_normal--;
1198 }
1199 memory_bm_clear_bit(&copy_bm, pfn);
1200 swsusp_unset_page_forbidden(page);
1201 swsusp_unset_page_free(page);
1202 __free_page(page);
1203 }
1204}
1205
1206/**
1207 * minimum_image_size - Estimate the minimum acceptable size of an image
1208 * @saveable: Number of saveable pages in the system.
1209 *
1210 * We want to avoid attempting to free too much memory too hard, so estimate the
1211 * minimum acceptable size of a hibernation image to use as the lower limit for
1212 * preallocating memory.
1213 *
1214 * We assume that the minimum image size should be proportional to
1215 *
1216 * [number of saveable pages] - [number of pages that can be freed in theory]
1217 *
1218 * where the second term is the sum of (1) reclaimable slab pages, (2) active
1219 * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages,
1220 * minus mapped file pages.
1221 */
1222static unsigned long minimum_image_size(unsigned long saveable)
1223{
1224 unsigned long size;
1225
1226 size = global_page_state(NR_SLAB_RECLAIMABLE)
1227 + global_page_state(NR_ACTIVE_ANON)
1228 + global_page_state(NR_INACTIVE_ANON)
1229 + global_page_state(NR_ACTIVE_FILE)
1230 + global_page_state(NR_INACTIVE_FILE)
1231 - global_page_state(NR_FILE_MAPPED);
1232
1233 return saveable <= size ? 0 : saveable - size;
1234}
1235
1236/**
1237 * hibernate_preallocate_memory - Preallocate memory for hibernation image
1238 *
1239 * To create a hibernation image it is necessary to make a copy of every page
1240 * frame in use. We also need a number of page frames to be free during
1241 * hibernation for allocations made while saving the image and for device
1242 * drivers, in case they need to allocate memory from their hibernation
1243 * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES,
1244 * respectively, both of which are rough estimates). To make this happen, we
1245 * compute the total number of available page frames and allocate at least
1246 *
1247 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES
1248 *
1249 * of them, which corresponds to the maximum size of a hibernation image.
1250 *
1251 * If image_size is set below the number following from the above formula,
1252 * the preallocation of memory is continued until the total number of saveable
1253 * pages in the system is below the requested image size or the minimum
1254 * acceptable image size returned by minimum_image_size(), whichever is greater.
1255 */
1256int hibernate_preallocate_memory(void)
1087{ 1257{
1088 long tmp;
1089 struct zone *zone; 1258 struct zone *zone;
1090 unsigned long pages = 0; 1259 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1091 unsigned int i = 0; 1260 unsigned long alloc, save_highmem, pages_highmem;
1092 char *p = "-\\|/";
1093 struct timeval start, stop; 1261 struct timeval start, stop;
1262 int error;
1094 1263
1095 printk(KERN_INFO "PM: Shrinking memory... "); 1264 printk(KERN_INFO "PM: Preallocating image memory... ");
1096 do_gettimeofday(&start); 1265 do_gettimeofday(&start);
1097 do {
1098 long size, highmem_size;
1099
1100 highmem_size = count_highmem_pages();
1101 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
1102 tmp = size;
1103 size += highmem_size;
1104 for_each_populated_zone(zone) {
1105 tmp += snapshot_additional_pages(zone);
1106 if (is_highmem(zone)) {
1107 highmem_size -=
1108 zone_page_state(zone, NR_FREE_PAGES);
1109 } else {
1110 tmp -= zone_page_state(zone, NR_FREE_PAGES);
1111 tmp += zone->lowmem_reserve[ZONE_NORMAL];
1112 }
1113 }
1114 1266
1115 if (highmem_size < 0) 1267 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
1116 highmem_size = 0; 1268 if (error)
1269 goto err_out;
1117 1270
1118 tmp += highmem_size; 1271 error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
1119 if (tmp > 0) { 1272 if (error)
1120 tmp = __shrink_memory(tmp); 1273 goto err_out;
1121 if (!tmp) 1274
1122 return -ENOMEM; 1275 alloc_normal = 0;
1123 pages += tmp; 1276 alloc_highmem = 0;
1124 } else if (size > image_size / PAGE_SIZE) { 1277
1125 tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); 1278 /* Count the number of saveable data pages. */
1126 pages += tmp; 1279 save_highmem = count_highmem_pages();
1127 } 1280 saveable = count_data_pages();
1128 printk("\b%c", p[i++%4]); 1281
1129 } while (tmp > 0); 1282 /*
1283 * Compute the total number of page frames we can use (count) and the
1284 * number of pages needed for image metadata (size).
1285 */
1286 count = saveable;
1287 saveable += save_highmem;
1288 highmem = save_highmem;
1289 size = 0;
1290 for_each_populated_zone(zone) {
1291 size += snapshot_additional_pages(zone);
1292 if (is_highmem(zone))
1293 highmem += zone_page_state(zone, NR_FREE_PAGES);
1294 else
1295 count += zone_page_state(zone, NR_FREE_PAGES);
1296 }
1297 count += highmem;
1298 count -= totalreserve_pages;
1299
1300 /* Compute the maximum number of saveable pages to leave in memory. */
1301 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
1302 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1303 if (size > max_size)
1304 size = max_size;
1305 /*
1306 * If the maximum is not less than the current number of saveable pages
1307 * in memory, allocate page frames for the image and we're done.
1308 */
1309 if (size >= saveable) {
1310 pages = preallocate_image_highmem(save_highmem);
1311 pages += preallocate_image_memory(saveable - pages);
1312 goto out;
1313 }
1314
1315 /* Estimate the minimum size of the image. */
1316 pages = minimum_image_size(saveable);
1317 if (size < pages)
1318 size = min_t(unsigned long, pages, max_size);
1319
1320 /*
1321 * Let the memory management subsystem know that we're going to need a
1322 * large number of page frames to allocate and make it free some memory.
1323 * NOTE: If this is not done, performance will be hurt badly in some
1324 * test cases.
1325 */
1326 shrink_all_memory(saveable - size);
1327
1328 /*
1329 * The number of saveable pages in memory was too high, so apply some
1330 * pressure to decrease it. First, make room for the largest possible
1331 * image and fail if that doesn't work. Next, try to decrease the size
1332 * of the image as much as indicated by 'size' using allocations from
1333 * highmem and non-highmem zones separately.
1334 */
1335 pages_highmem = preallocate_image_highmem(highmem / 2);
1336 alloc = (count - max_size) - pages_highmem;
1337 pages = preallocate_image_memory(alloc);
1338 if (pages < alloc)
1339 goto err_out;
1340 size = max_size - size;
1341 alloc = size;
1342 size = preallocate_highmem_fraction(size, highmem, count);
1343 pages_highmem += size;
1344 alloc -= size;
1345 pages += preallocate_image_memory(alloc);
1346 pages += pages_highmem;
1347
1348 /*
1349 * We only need as many page frames for the image as there are saveable
1350 * pages in memory, but we have allocated more. Release the excessive
1351 * ones now.
1352 */
1353 free_unnecessary_pages();
1354
1355 out:
1130 do_gettimeofday(&stop); 1356 do_gettimeofday(&stop);
1131 printk("\bdone (%lu pages freed)\n", pages); 1357 printk(KERN_CONT "done (allocated %lu pages)\n", pages);
1132 swsusp_show_speed(&start, &stop, pages, "Freed"); 1358 swsusp_show_speed(&start, &stop, pages, "Allocated");
1133 1359
1134 return 0; 1360 return 0;
1361
1362 err_out:
1363 printk(KERN_CONT "\n");
1364 swsusp_free();
1365 return -ENOMEM;
1135} 1366}
1136 1367
1137#ifdef CONFIG_HIGHMEM 1368#ifdef CONFIG_HIGHMEM
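
To make the sizing rule from the hibernate_preallocate_memory() comment concrete, the following standalone sketch runs the same arithmetic in userspace. Every number in it is an illustrative assumption (PAGES_FOR_IO, SPARE_PAGES and the memory figures are placeholders, not the kernel's actual values); only the shape of the computation carries over:

#include <stdio.h>

#define PAGES_FOR_IO	1024	/* assumed: 4 MB worth of 4 KB pages */
#define SPARE_PAGES	256	/* assumed: 1 MB worth of 4 KB pages */

int main(void)
{
	unsigned long count = 250000;		/* usable page frames (~1 GB) */
	unsigned long meta = 1500;		/* image metadata pages, assumed */
	unsigned long image_goal = 125000;	/* image_size / PAGE_SIZE */
	unsigned long max_size, size;

	/* maximum number of saveable pages allowed to stay in memory */
	max_size = (count - (meta + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
	size = image_goal > max_size ? max_size : image_goal;

	printf("preallocate at least %lu of %lu page frames\n",
	       count - max_size, count);
	printf("shrink saveable pages toward %lu\n", size);
	return 0;
}

With these illustrative figures roughly half of the usable page frames end up preallocated, which matches the "/ 2" in the comment above.
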
@@ -1142,7 +1373,7 @@ int swsusp_shrink_memory(void)
1142 1373
1143static unsigned int count_pages_for_highmem(unsigned int nr_highmem) 1374static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
1144{ 1375{
1145 unsigned int free_highmem = count_free_highmem_pages(); 1376 unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
1146 1377
1147 if (free_highmem >= nr_highmem) 1378 if (free_highmem >= nr_highmem)
1148 nr_highmem = 0; 1379 nr_highmem = 0;
@@ -1164,19 +1395,17 @@ count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
1164static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) 1395static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
1165{ 1396{
1166 struct zone *zone; 1397 struct zone *zone;
1167 unsigned int free = 0, meta = 0; 1398 unsigned int free = alloc_normal;
1168 1399
1169 for_each_zone(zone) { 1400 for_each_populated_zone(zone)
1170 meta += snapshot_additional_pages(zone);
1171 if (!is_highmem(zone)) 1401 if (!is_highmem(zone))
1172 free += zone_page_state(zone, NR_FREE_PAGES); 1402 free += zone_page_state(zone, NR_FREE_PAGES);
1173 }
1174 1403
1175 nr_pages += count_pages_for_highmem(nr_highmem); 1404 nr_pages += count_pages_for_highmem(nr_highmem);
1176 pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n", 1405 pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
1177 nr_pages, PAGES_FOR_IO, meta, free); 1406 nr_pages, PAGES_FOR_IO, free);
1178 1407
1179 return free > nr_pages + PAGES_FOR_IO + meta; 1408 return free > nr_pages + PAGES_FOR_IO;
1180} 1409}
1181 1410
1182#ifdef CONFIG_HIGHMEM 1411#ifdef CONFIG_HIGHMEM
@@ -1198,7 +1427,7 @@ static inline int get_highmem_buffer(int safe_needed)
1198 */ 1427 */
1199 1428
1200static inline unsigned int 1429static inline unsigned int
1201alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) 1430alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1202{ 1431{
1203 unsigned int to_alloc = count_free_highmem_pages(); 1432 unsigned int to_alloc = count_free_highmem_pages();
1204 1433
@@ -1218,7 +1447,7 @@ alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1218static inline int get_highmem_buffer(int safe_needed) { return 0; } 1447static inline int get_highmem_buffer(int safe_needed) { return 0; }
1219 1448
1220static inline unsigned int 1449static inline unsigned int
1221alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } 1450alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
1222#endif /* CONFIG_HIGHMEM */ 1451#endif /* CONFIG_HIGHMEM */
1223 1452
1224/** 1453/**
@@ -1237,51 +1466,36 @@ static int
1237swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 1466swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1238 unsigned int nr_pages, unsigned int nr_highmem) 1467 unsigned int nr_pages, unsigned int nr_highmem)
1239{ 1468{
1240 int error; 1469 int error = 0;
1241
1242 error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1243 if (error)
1244 goto Free;
1245
1246 error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1247 if (error)
1248 goto Free;
1249 1470
1250 if (nr_highmem > 0) { 1471 if (nr_highmem > 0) {
1251 error = get_highmem_buffer(PG_ANY); 1472 error = get_highmem_buffer(PG_ANY);
1252 if (error) 1473 if (error)
1253 goto Free; 1474 goto err_out;
1254 1475 if (nr_highmem > alloc_highmem) {
1255 nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); 1476 nr_highmem -= alloc_highmem;
1477 nr_pages += alloc_highmem_pages(copy_bm, nr_highmem);
1478 }
1256 } 1479 }
1257 while (nr_pages-- > 0) { 1480 if (nr_pages > alloc_normal) {
1258 struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); 1481 nr_pages -= alloc_normal;
1259 1482 while (nr_pages-- > 0) {
1260 if (!page) 1483 struct page *page;
1261 goto Free;
1262 1484
1263 memory_bm_set_bit(copy_bm, page_to_pfn(page)); 1485 page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
1486 if (!page)
1487 goto err_out;
1488 memory_bm_set_bit(copy_bm, page_to_pfn(page));
1489 }
1264 } 1490 }
1491
1265 return 0; 1492 return 0;
1266 1493
1267 Free: 1494 err_out:
1268 swsusp_free(); 1495 swsusp_free();
1269 return -ENOMEM; 1496 return error;
1270} 1497}
1271 1498
1272/* Memory bitmap used for marking saveable pages (during suspend) or the
1273 * suspend image pages (during resume)
1274 */
1275static struct memory_bitmap orig_bm;
1276/* Memory bitmap used on suspend for marking allocated pages that will contain
1277 * the copies of saveable pages. During resume it is initially used for
1278 * marking the suspend image pages, but then its set bits are duplicated in
1279 * @orig_bm and it is released. Next, on systems with high memory, it may be
1280 * used for marking "safe" highmem pages, but it has to be reinitialized for
1281 * this purpose.
1282 */
1283static struct memory_bitmap copy_bm;
1284
1285asmlinkage int swsusp_save(void) 1499asmlinkage int swsusp_save(void)
1286{ 1500{
1287 unsigned int nr_pages, nr_highmem; 1501 unsigned int nr_pages, nr_highmem;
@@ -1474,7 +1688,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1474 unsigned long pfn, max_zone_pfn; 1688 unsigned long pfn, max_zone_pfn;
1475 1689
1476 /* Clear page flags */ 1690 /* Clear page flags */
1477 for_each_zone(zone) { 1691 for_each_populated_zone(zone) {
1478 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1692 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1479 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1693 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1480 if (pfn_valid(pfn)) 1694 if (pfn_valid(pfn))
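
Earlier in this file, preallocate_highmem_fraction() decides how much of the remaining allocation to take from highmem by calling __fraction(), which is just x * multiplier / base evaluated in 64 bits so the product cannot overflow before the division. A userspace analogue with hypothetical numbers (do_div() replaced by a plain 64-bit division):

#include <stdio.h>
#include <stdint.h>

static unsigned long fraction(uint64_t x, uint64_t multiplier, uint64_t base)
{
	x *= multiplier;	/* 64-bit product, no overflow for page counts */
	x /= base;		/* the kernel uses do_div() here */
	return (unsigned long)x;
}

int main(void)
{
	unsigned long to_alloc = 120000;	/* pages still to preallocate */
	unsigned long highmem = 200000;		/* highmem page frames, assumed */
	unsigned long total   = 250000;		/* total usable page frames */
	unsigned long from_highmem = fraction(to_alloc, highmem, total);

	printf("%lu pages from highmem, %lu from lowmem\n",
	       from_highmem, to_alloc - from_highmem);
	return 0;
}
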
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 17d8bb1acf9c..25596e450ac7 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -19,7 +19,7 @@
19 * The time it takes is system-specific though, so when we test this 19 * The time it takes is system-specific though, so when we test this
20 * during system bootup we allow a LOT of time. 20 * during system bootup we allow a LOT of time.
21 */ 21 */
22#define TEST_SUSPEND_SECONDS 5 22#define TEST_SUSPEND_SECONDS 10
23 23
24static unsigned long suspend_test_start_time; 24static unsigned long suspend_test_start_time;
25 25
@@ -49,7 +49,8 @@ void suspend_test_finish(const char *label)
49 * has some performance issues. The stack dump of a WARN_ON 49 * has some performance issues. The stack dump of a WARN_ON
50 * is more likely to get the right attention than a printk... 50 * is more likely to get the right attention than a printk...
51 */ 51 */
52 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); 52 WARN(msec > (TEST_SUSPEND_SECONDS * 1000),
53 "Component: %s, time: %u\n", label, msec);
53} 54}
54 55
55/* 56/*
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8ba052c86d48..890f6b11b1d3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -13,7 +13,6 @@
13 13
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/utsname.h>
17#include <linux/delay.h> 16#include <linux/delay.h>
18#include <linux/bitops.h> 17#include <linux/bitops.h>
19#include <linux/genhd.h> 18#include <linux/genhd.h>
@@ -315,7 +314,6 @@ static int save_image(struct swap_map_handle *handle,
315{ 314{
316 unsigned int m; 315 unsigned int m;
317 int ret; 316 int ret;
318 int error = 0;
319 int nr_pages; 317 int nr_pages;
320 int err2; 318 int err2;
321 struct bio *bio; 319 struct bio *bio;
@@ -330,26 +328,27 @@ static int save_image(struct swap_map_handle *handle,
330 nr_pages = 0; 328 nr_pages = 0;
331 bio = NULL; 329 bio = NULL;
332 do_gettimeofday(&start); 330 do_gettimeofday(&start);
333 do { 331 while (1) {
334 ret = snapshot_read_next(snapshot, PAGE_SIZE); 332 ret = snapshot_read_next(snapshot, PAGE_SIZE);
335 if (ret > 0) { 333 if (ret <= 0)
336 error = swap_write_page(handle, data_of(*snapshot), 334 break;
337 &bio); 335 ret = swap_write_page(handle, data_of(*snapshot), &bio);
338 if (error) 336 if (ret)
339 break; 337 break;
340 if (!(nr_pages % m)) 338 if (!(nr_pages % m))
341 printk("\b\b\b\b%3d%%", nr_pages / m); 339 printk("\b\b\b\b%3d%%", nr_pages / m);
342 nr_pages++; 340 nr_pages++;
343 } 341 }
344 } while (ret > 0);
345 err2 = wait_on_bio_chain(&bio); 342 err2 = wait_on_bio_chain(&bio);
346 do_gettimeofday(&stop); 343 do_gettimeofday(&stop);
347 if (!error) 344 if (!ret)
348 error = err2; 345 ret = err2;
349 if (!error) 346 if (!ret)
350 printk("\b\b\b\bdone\n"); 347 printk("\b\b\b\bdone\n");
348 else
349 printk("\n");
351 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 350 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
352 return error; 351 return ret;
353} 352}
354 353
355/** 354/**
@@ -537,7 +536,8 @@ static int load_image(struct swap_map_handle *handle,
537 snapshot_write_finalize(snapshot); 536 snapshot_write_finalize(snapshot);
538 if (!snapshot_image_loaded(snapshot)) 537 if (!snapshot_image_loaded(snapshot))
539 error = -ENODATA; 538 error = -ENODATA;
540 } 539 } else
540 printk("\n");
541 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 541 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
542 return error; 542 return error;
543} 543}
@@ -573,8 +573,6 @@ int swsusp_read(unsigned int *flags_p)
573 error = load_image(&handle, &snapshot, header->pages - 1); 573 error = load_image(&handle, &snapshot, header->pages - 1);
574 release_swap_reader(&handle); 574 release_swap_reader(&handle);
575 575
576 blkdev_put(resume_bdev, FMODE_READ);
577
578 if (!error) 576 if (!error)
579 pr_debug("PM: Image successfully loaded\n"); 577 pr_debug("PM: Image successfully loaded\n");
580 else 578 else
@@ -597,7 +595,7 @@ int swsusp_check(void)
597 error = bio_read_page(swsusp_resume_block, 595 error = bio_read_page(swsusp_resume_block,
598 swsusp_header, NULL); 596 swsusp_header, NULL);
599 if (error) 597 if (error)
600 return error; 598 goto put;
601 599
602 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 600 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
603 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 601 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
@@ -605,8 +603,10 @@ int swsusp_check(void)
605 error = bio_write_page(swsusp_resume_block, 603 error = bio_write_page(swsusp_resume_block,
606 swsusp_header, NULL); 604 swsusp_header, NULL);
607 } else { 605 } else {
608 return -EINVAL; 606 error = -EINVAL;
609 } 607 }
608
609put:
610 if (error) 610 if (error)
611 blkdev_put(resume_bdev, FMODE_READ); 611 blkdev_put(resume_bdev, FMODE_READ);
612 else 612 else
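
The save_image() rework above folds the old ret/error pair into a single status variable: read the next page, stop on the first failure, then merge in whatever wait_on_bio_chain() reports for the I/O already queued. A generic, runnable sketch of that pattern, with placeholder functions standing in for snapshot_read_next(), swap_write_page() and wait_on_bio_chain():

#include <stdio.h>

static int steps_left = 3;			/* simulate three pages */

static int produce_next(void)   { return steps_left-- > 0 ? 1 : 0; }
static int consume(void)        { return 0; }	/* pretend the write succeeds */
static int flush_deferred(void) { return 0; }	/* pretend the flush succeeds */

static int copy_all(void)
{
	int ret, err2;

	while (1) {
		ret = produce_next();
		if (ret <= 0)		/* 0: done, < 0: read error */
			break;
		ret = consume();
		if (ret)		/* first write error ends the loop */
			break;
	}
	err2 = flush_deferred();	/* always drain what was queued */
	if (!ret)			/* report the first error seen */
		ret = err2;
	return ret;
}

int main(void)
{
	printf("copy_all() -> %d\n", copy_all());
	return 0;
}
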
diff --git a/kernel/printk.c b/kernel/printk.c
index e10d193a833a..b5ac4d99c667 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,6 +33,7 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h>
36 37
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
38 39
@@ -206,12 +207,11 @@ __setup("log_buf_len=", log_buf_len_setup);
206#ifdef CONFIG_BOOT_PRINTK_DELAY 207#ifdef CONFIG_BOOT_PRINTK_DELAY
207 208
208static unsigned int boot_delay; /* msecs delay after each printk during bootup */ 209static unsigned int boot_delay; /* msecs delay after each printk during bootup */
209static unsigned long long printk_delay_msec; /* per msec, based on boot_delay */ 210static unsigned long long loops_per_msec; /* based on boot_delay */
210 211
211static int __init boot_delay_setup(char *str) 212static int __init boot_delay_setup(char *str)
212{ 213{
213 unsigned long lpj; 214 unsigned long lpj;
214 unsigned long long loops_per_msec;
215 215
216 lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ 216 lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */
217 loops_per_msec = (unsigned long long)lpj / 1000 * HZ; 217 loops_per_msec = (unsigned long long)lpj / 1000 * HZ;
@@ -220,10 +220,9 @@ static int __init boot_delay_setup(char *str)
220 if (boot_delay > 10 * 1000) 220 if (boot_delay > 10 * 1000)
221 boot_delay = 0; 221 boot_delay = 0;
222 222
223 printk_delay_msec = loops_per_msec; 223 pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
224 printk(KERN_DEBUG "boot_delay: %u, preset_lpj: %ld, lpj: %lu, " 224 "HZ: %d, loops_per_msec: %llu\n",
225 "HZ: %d, printk_delay_msec: %llu\n", 225 boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
226 boot_delay, preset_lpj, lpj, HZ, printk_delay_msec);
227 return 1; 226 return 1;
228} 227}
229__setup("boot_delay=", boot_delay_setup); 228__setup("boot_delay=", boot_delay_setup);
@@ -236,7 +235,7 @@ static void boot_delay_msec(void)
236 if (boot_delay == 0 || system_state != SYSTEM_BOOTING) 235 if (boot_delay == 0 || system_state != SYSTEM_BOOTING)
237 return; 236 return;
238 237
239 k = (unsigned long long)printk_delay_msec * boot_delay; 238 k = (unsigned long long)loops_per_msec * boot_delay;
240 239
241 timeout = jiffies + msecs_to_jiffies(boot_delay); 240 timeout = jiffies + msecs_to_jiffies(boot_delay);
242 while (k) { 241 while (k) {
@@ -655,6 +654,20 @@ static int recursion_bug;
655static int new_text_line = 1; 654static int new_text_line = 1;
656static char printk_buf[1024]; 655static char printk_buf[1024];
657 656
657int printk_delay_msec __read_mostly;
658
659static inline void printk_delay(void)
660{
661 if (unlikely(printk_delay_msec)) {
662 int m = printk_delay_msec;
663
664 while (m--) {
665 mdelay(1);
666 touch_nmi_watchdog();
667 }
668 }
669}
670
658asmlinkage int vprintk(const char *fmt, va_list args) 671asmlinkage int vprintk(const char *fmt, va_list args)
659{ 672{
660 int printed_len = 0; 673 int printed_len = 0;
@@ -664,6 +677,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
664 char *p; 677 char *p;
665 678
666 boot_delay_msec(); 679 boot_delay_msec();
680 printk_delay();
667 681
668 preempt_disable(); 682 preempt_disable();
669 /* This stops the holder of console_sem just where we want him */ 683 /* This stops the holder of console_sem just where we want him */
@@ -1075,12 +1089,6 @@ void __sched console_conditional_schedule(void)
1075} 1089}
1076EXPORT_SYMBOL(console_conditional_schedule); 1090EXPORT_SYMBOL(console_conditional_schedule);
1077 1091
1078void console_print(const char *s)
1079{
1080 printk(KERN_EMERG "%s", s);
1081}
1082EXPORT_SYMBOL(console_print);
1083
1084void console_unblank(void) 1092void console_unblank(void)
1085{ 1093{
1086 struct console *c; 1094 struct console *c;
@@ -1369,11 +1377,11 @@ late_initcall(disable_boot_consoles);
1369 */ 1377 */
1370DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); 1378DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
1371 1379
1372int printk_ratelimit(void) 1380int __printk_ratelimit(const char *func)
1373{ 1381{
1374 return __ratelimit(&printk_ratelimit_state); 1382 return ___ratelimit(&printk_ratelimit_state, func);
1375} 1383}
1376EXPORT_SYMBOL(printk_ratelimit); 1384EXPORT_SYMBOL(__printk_ratelimit);
1377 1385
1378/** 1386/**
1379 * printk_timed_ratelimit - caller-controlled printk ratelimiting 1387 * printk_timed_ratelimit - caller-controlled printk ratelimiting
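
Two caller-visible changes land in printk.c: printk_delay() adds an optional per-message delay controlled by printk_delay_msec (meant to be tuned from userspace, presumably through a sysctl wired up outside this file), and printk_ratelimit() becomes __printk_ratelimit(), which passes the calling function's name to ___ratelimit() so suppression messages can say who was throttled. Assuming the usual wrapper macros in kernel.h and ratelimit.h, the caller-side pattern looks like this (names here are illustrative):

#include <linux/kernel.h>
#include <linux/ratelimit.h>

/* private state: at most 10 messages per 5 * HZ window (assumed policy) */
static DEFINE_RATELIMIT_STATE(foo_ratelimit, 5 * HZ, 10);

static void foo_report_error(int err)
{
	/* global helper, shares printk_ratelimit_state with everyone */
	if (printk_ratelimit())
		printk(KERN_WARNING "foo: transient error %d\n", err);

	/* per-subsystem state; __ratelimit() records __func__ so the
	 * "callbacks suppressed" line can name this function */
	if (__ratelimit(&foo_ratelimit))
		printk(KERN_DEBUG "foo: detailed state for error %d\n", err);
}
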
diff --git a/kernel/profile.c b/kernel/profile.c
index 419250ebec4d..a55d3a367ae8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -442,48 +442,51 @@ void profile_tick(int type)
442 442
443#ifdef CONFIG_PROC_FS 443#ifdef CONFIG_PROC_FS
444#include <linux/proc_fs.h> 444#include <linux/proc_fs.h>
445#include <linux/seq_file.h>
445#include <asm/uaccess.h> 446#include <asm/uaccess.h>
446 447
447static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, 448static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
448 int count, int *eof, void *data)
449{ 449{
450 int len = cpumask_scnprintf(page, count, data); 450 seq_cpumask(m, prof_cpu_mask);
451 if (count - len < 2) 451 seq_putc(m, '\n');
452 return -EINVAL; 452 return 0;
453 len += sprintf(page + len, "\n");
454 return len;
455} 453}
456 454
457static int prof_cpu_mask_write_proc(struct file *file, 455static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
458 const char __user *buffer, unsigned long count, void *data) 456{
457 return single_open(file, prof_cpu_mask_proc_show, NULL);
458}
459
460static ssize_t prof_cpu_mask_proc_write(struct file *file,
461 const char __user *buffer, size_t count, loff_t *pos)
459{ 462{
460 struct cpumask *mask = data;
461 unsigned long full_count = count, err;
462 cpumask_var_t new_value; 463 cpumask_var_t new_value;
464 int err;
463 465
464 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 466 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
465 return -ENOMEM; 467 return -ENOMEM;
466 468
467 err = cpumask_parse_user(buffer, count, new_value); 469 err = cpumask_parse_user(buffer, count, new_value);
468 if (!err) { 470 if (!err) {
469 cpumask_copy(mask, new_value); 471 cpumask_copy(prof_cpu_mask, new_value);
470 err = full_count; 472 err = count;
471 } 473 }
472 free_cpumask_var(new_value); 474 free_cpumask_var(new_value);
473 return err; 475 return err;
474} 476}
475 477
478static const struct file_operations prof_cpu_mask_proc_fops = {
479 .open = prof_cpu_mask_proc_open,
480 .read = seq_read,
481 .llseek = seq_lseek,
482 .release = single_release,
483 .write = prof_cpu_mask_proc_write,
484};
485
476void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) 486void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
477{ 487{
478 struct proc_dir_entry *entry;
479
480 /* create /proc/irq/prof_cpu_mask */ 488 /* create /proc/irq/prof_cpu_mask */
481 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); 489 proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
482 if (!entry)
483 return;
484 entry->data = prof_cpu_mask;
485 entry->read_proc = prof_cpu_mask_read_proc;
486 entry->write_proc = prof_cpu_mask_write_proc;
487} 490}
488 491
489/* 492/*
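
The profile.c hunk above is a mechanical conversion from the legacy read_proc/write_proc hooks to proc_create() with a file_operations built on seq_file's single_open(). The same recipe works for any simple single-record /proc file; a minimal sketch with placeholder names:

#include <linux/errno.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int foo_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%d\n", 42);		/* emit the record */
	return 0;
}

static int foo_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, foo_proc_show, NULL);
}

static const struct file_operations foo_proc_fops = {
	.open		= foo_proc_open,
	.read		= seq_read,		/* seq_file handles buffering */
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init foo_proc_init(void)
{
	/* create /proc/foo, world-readable */
	return proc_create("foo", 0444, NULL, &foo_proc_fops) ? 0 : -ENOMEM;
}

static void __exit foo_proc_exit(void)
{
	remove_proc_entry("foo", NULL);
}

module_init(foo_proc_init);
module_exit(foo_proc_exit);
MODULE_LICENSE("GPL");
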
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 307c285af59e..23bd09cd042e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh)
266 * or self-reaping. Do notification now if it would have happened earlier. 266 * or self-reaping. Do notification now if it would have happened earlier.
267 * If it should reap itself, return true. 267 * If it should reap itself, return true.
268 * 268 *
269 * If it's our own child, there is no notification to do. 269 * If it's our own child, there is no notification to do. But if our normal
270 * But if our normal children self-reap, then this child 270 * children self-reap, then this child was prevented by ptrace and we must
271 * was prevented by ptrace and we must reap it now. 271 * reap it now, in that case we must also wake up sub-threads sleeping in
272 * do_wait().
272 */ 273 */
273static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) 274static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
274{ 275{
@@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
278 if (!task_detached(p) && thread_group_empty(p)) { 279 if (!task_detached(p) && thread_group_empty(p)) {
279 if (!same_thread_group(p->real_parent, tracer)) 280 if (!same_thread_group(p->real_parent, tracer))
280 do_notify_parent(p, p->exit_signal); 281 do_notify_parent(p, p->exit_signal);
281 else if (ignoring_children(tracer->sighand)) 282 else if (ignoring_children(tracer->sighand)) {
283 __wake_up_parent(p, tracer);
282 p->exit_signal = -1; 284 p->exit_signal = -1;
285 }
283 } 286 }
284 if (task_detached(p)) { 287 if (task_detached(p)) {
285 /* Mark it as in the process of being reaped. */ 288 /* Mark it as in the process of being reaped. */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index bd5d5c8e5140..9b7fd4723878 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -19,7 +19,7 @@
19 * 19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com> 20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com> 21 * Manfred Spraul <manfred@colorfullife.com>
22 * 22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com> 23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers: 25 * Papers:
@@ -27,7 +27,7 @@
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) 27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 * 28 *
29 * For detailed explanation of Read-Copy Update mechanism see - 29 * For detailed explanation of Read-Copy Update mechanism see -
30 * http://lse.sourceforge.net/locking/rcupdate.html 30 * http://lse.sourceforge.net/locking/rcupdate.html
31 * 31 *
32 */ 32 */
33#include <linux/types.h> 33#include <linux/types.h>
@@ -44,23 +44,13 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
48
49enum rcu_barrier {
50 RCU_BARRIER_STD,
51 RCU_BARRIER_BH,
52 RCU_BARRIER_SCHED,
53};
54 47
55static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 48#ifdef CONFIG_DEBUG_LOCK_ALLOC
56static atomic_t rcu_barrier_cpu_count; 49static struct lock_class_key rcu_lock_key;
57static DEFINE_MUTEX(rcu_barrier_mutex); 50struct lockdep_map rcu_lock_map =
58static struct completion rcu_barrier_completion; 51 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
59int rcu_scheduler_active __read_mostly; 52EXPORT_SYMBOL_GPL(rcu_lock_map);
60 53#endif
61static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
62static struct rcu_head rcu_migrate_head[3];
63static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
64 54
65/* 55/*
66 * Awaken the corresponding synchronize_rcu() instance now that a 56 * Awaken the corresponding synchronize_rcu() instance now that a
@@ -73,199 +63,3 @@ void wakeme_after_rcu(struct rcu_head *head)
73 rcu = container_of(head, struct rcu_synchronize, head); 63 rcu = container_of(head, struct rcu_synchronize, head);
74 complete(&rcu->completion); 64 complete(&rcu->completion);
75} 65}
76
77/**
78 * synchronize_rcu - wait until a grace period has elapsed.
79 *
80 * Control will return to the caller some time after a full grace
81 * period has elapsed, in other words after all currently executing RCU
82 * read-side critical sections have completed. RCU read-side critical
83 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
84 * and may be nested.
85 */
86void synchronize_rcu(void)
87{
88 struct rcu_synchronize rcu;
89
90 if (rcu_blocking_is_gp())
91 return;
92
93 init_completion(&rcu.completion);
94 /* Will wake me after RCU finished. */
95 call_rcu(&rcu.head, wakeme_after_rcu);
96 /* Wait for it. */
97 wait_for_completion(&rcu.completion);
98}
99EXPORT_SYMBOL_GPL(synchronize_rcu);
100
101/**
102 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
103 *
104 * Control will return to the caller some time after a full rcu_bh grace
105 * period has elapsed, in other words after all currently executing rcu_bh
106 * read-side critical sections have completed. RCU read-side critical
107 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
108 * and may be nested.
109 */
110void synchronize_rcu_bh(void)
111{
112 struct rcu_synchronize rcu;
113
114 if (rcu_blocking_is_gp())
115 return;
116
117 init_completion(&rcu.completion);
118 /* Will wake me after RCU finished. */
119 call_rcu_bh(&rcu.head, wakeme_after_rcu);
120 /* Wait for it. */
121 wait_for_completion(&rcu.completion);
122}
123EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
124
125static void rcu_barrier_callback(struct rcu_head *notused)
126{
127 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
128 complete(&rcu_barrier_completion);
129}
130
131/*
132 * Called with preemption disabled, and from cross-cpu IRQ context.
133 */
134static void rcu_barrier_func(void *type)
135{
136 int cpu = smp_processor_id();
137 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
138
139 atomic_inc(&rcu_barrier_cpu_count);
140 switch ((enum rcu_barrier)type) {
141 case RCU_BARRIER_STD:
142 call_rcu(head, rcu_barrier_callback);
143 break;
144 case RCU_BARRIER_BH:
145 call_rcu_bh(head, rcu_barrier_callback);
146 break;
147 case RCU_BARRIER_SCHED:
148 call_rcu_sched(head, rcu_barrier_callback);
149 break;
150 }
151}
152
153static inline void wait_migrated_callbacks(void)
154{
155 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
156 smp_mb(); /* In case we didn't sleep. */
157}
158
159/*
160 * Orchestrate the specified type of RCU barrier, waiting for all
161 * RCU callbacks of the specified type to complete.
162 */
163static void _rcu_barrier(enum rcu_barrier type)
164{
165 BUG_ON(in_interrupt());
166 /* Take cpucontrol mutex to protect against CPU hotplug */
167 mutex_lock(&rcu_barrier_mutex);
168 init_completion(&rcu_barrier_completion);
169 /*
170 * Initialize rcu_barrier_cpu_count to 1, then invoke
171 * rcu_barrier_func() on each CPU, so that each CPU also has
172 * incremented rcu_barrier_cpu_count. Only then is it safe to
173 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
174 * might complete its grace period before all of the other CPUs
175 * did their increment, causing this function to return too
176 * early.
177 */
178 atomic_set(&rcu_barrier_cpu_count, 1);
179 on_each_cpu(rcu_barrier_func, (void *)type, 1);
180 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
181 complete(&rcu_barrier_completion);
182 wait_for_completion(&rcu_barrier_completion);
183 mutex_unlock(&rcu_barrier_mutex);
184 wait_migrated_callbacks();
185}
186
187/**
188 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
189 */
190void rcu_barrier(void)
191{
192 _rcu_barrier(RCU_BARRIER_STD);
193}
194EXPORT_SYMBOL_GPL(rcu_barrier);
195
196/**
197 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
198 */
199void rcu_barrier_bh(void)
200{
201 _rcu_barrier(RCU_BARRIER_BH);
202}
203EXPORT_SYMBOL_GPL(rcu_barrier_bh);
204
205/**
206 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
207 */
208void rcu_barrier_sched(void)
209{
210 _rcu_barrier(RCU_BARRIER_SCHED);
211}
212EXPORT_SYMBOL_GPL(rcu_barrier_sched);
213
214static void rcu_migrate_callback(struct rcu_head *notused)
215{
216 if (atomic_dec_and_test(&rcu_migrate_type_count))
217 wake_up(&rcu_migrate_wq);
218}
219
220extern int rcu_cpu_notify(struct notifier_block *self,
221 unsigned long action, void *hcpu);
222
223static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
224 unsigned long action, void *hcpu)
225{
226 rcu_cpu_notify(self, action, hcpu);
227 if (action == CPU_DYING) {
228 /*
229 * preempt_disable() in on_each_cpu() prevents stop_machine(),
230 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
231 * returns, all online cpus have queued rcu_barrier_func(),
232 * and the dead cpu(if it exist) queues rcu_migrate_callback()s.
233 *
234 * These callbacks ensure _rcu_barrier() waits for all
235 * RCU callbacks of the specified type to complete.
236 */
237 atomic_set(&rcu_migrate_type_count, 3);
238 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
239 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
240 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
241 } else if (action == CPU_DOWN_PREPARE) {
242 /* Don't need to wait until next removal operation. */
243 /* rcu_migrate_head is protected by cpu_add_remove_lock */
244 wait_migrated_callbacks();
245 }
246
247 return NOTIFY_OK;
248}
249
250void __init rcu_init(void)
251{
252 int i;
253
254 __rcu_init();
255 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
256
257 /*
258 * We don't need protection against CPU-hotplug here because
259 * this is called early in boot, before either interrupts
260 * or the scheduler are operational.
261 */
262 for_each_online_cpu(i)
263 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
264}
265
266void rcu_scheduler_starting(void)
267{
268 WARN_ON(num_online_cpus() != 1);
269 WARN_ON(nr_context_switches() > 0);
270 rcu_scheduler_active = 1;
271}
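
After this cleanup rcupdate.c keeps only wakeme_after_rcu() and the lockdep map; the flavor-specific synchronize_*() and rcu_barrier*() implementations live with the tiny/tree variants instead. The helper's role is unchanged: it turns an asynchronous call_rcu() into a blocking wait, the same pattern the removed synchronize_rcu() above and the rcutiny rcu_barrier() below both use. A sketch of that pattern:

#include <linux/completion.h>
#include <linux/rcupdate.h>

/* Block the caller until a grace period has elapsed by queueing a
 * callback that completes a completion we then sleep on. */
static void wait_for_grace_period(void)
{
	struct rcu_synchronize rcu;

	init_completion(&rcu.completion);
	call_rcu(&rcu.head, wakeme_after_rcu);	/* fires after a grace period */
	wait_for_completion(&rcu.completion);	/* sleep until it does */
}
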
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
new file mode 100644
index 000000000000..9f6d9ff2572c
--- /dev/null
+++ b/kernel/rcutiny.c
@@ -0,0 +1,282 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 */
25#include <linux/moduleparam.h>
26#include <linux/completion.h>
27#include <linux/interrupt.h>
28#include <linux/notifier.h>
29#include <linux/rcupdate.h>
30#include <linux/kernel.h>
31#include <linux/module.h>
32#include <linux/mutex.h>
33#include <linux/sched.h>
34#include <linux/types.h>
35#include <linux/init.h>
36#include <linux/time.h>
37#include <linux/cpu.h>
38
39/* Global control variables for rcupdate callback mechanism. */
40struct rcu_ctrlblk {
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
43 struct rcu_head **curtail; /* ->next pointer of last CB. */
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_NO_HZ
58
59static long rcu_dynticks_nesting = 1;
60
61/*
62 * Enter dynticks-idle mode, which is an extended quiescent state
63 * if we have fully entered that mode (i.e., if the new value of
64 * dynticks_nesting is zero).
65 */
66void rcu_enter_nohz(void)
67{
68 if (--rcu_dynticks_nesting == 0)
69 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
70}
71
72/*
73 * Exit dynticks-idle mode, so that we are no longer in an extended
74 * quiescent state.
75 */
76void rcu_exit_nohz(void)
77{
78 rcu_dynticks_nesting++;
79}
80
81#endif /* #ifdef CONFIG_NO_HZ */
82
83/*
84 * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc().
85 * Also disable irqs to avoid confusion due to interrupt handlers
86 * invoking call_rcu().
87 */
88static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
89{
90 unsigned long flags;
91
92 local_irq_save(flags);
93 if (rcp->rcucblist != NULL &&
94 rcp->donetail != rcp->curtail) {
95 rcp->donetail = rcp->curtail;
96 local_irq_restore(flags);
97 return 1;
98 }
99 local_irq_restore(flags);
100
101 return 0;
102}
103
104/*
105 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
106 * are at it, given that any rcu quiescent state is also an rcu_bh
107 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
108 */
109void rcu_sched_qs(int cpu)
110{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ);
113}
114
115/*
116 * Record an rcu_bh quiescent state.
117 */
118void rcu_bh_qs(int cpu)
119{
120 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
121 raise_softirq(RCU_SOFTIRQ);
122}
123
124/*
125 * Check to see if the scheduling-clock interrupt came from an extended
126 * quiescent state, and, if so, tell RCU about it.
127 */
128void rcu_check_callbacks(int cpu, int user)
129{
130 if (user ||
131 (idle_cpu(cpu) &&
132 !in_softirq() &&
133 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
134 rcu_sched_qs(cpu);
135 else if (!in_softirq())
136 rcu_bh_qs(cpu);
137}
138
139/*
140 * Helper function for rcu_process_callbacks() that operates on the
 141 * specified rcu_ctrlblk structure.
142 */
143static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
144{
145 struct rcu_head *next, *list;
146 unsigned long flags;
147
148 /* If no RCU callbacks ready to invoke, just return. */
149 if (&rcp->rcucblist == rcp->donetail)
150 return;
151
152 /* Move the ready-to-invoke callbacks to a local list. */
153 local_irq_save(flags);
154 list = rcp->rcucblist;
155 rcp->rcucblist = *rcp->donetail;
156 *rcp->donetail = NULL;
157 if (rcp->curtail == rcp->donetail)
158 rcp->curtail = &rcp->rcucblist;
159 rcp->donetail = &rcp->rcucblist;
160 local_irq_restore(flags);
161
162 /* Invoke the callbacks on the local list. */
163 while (list) {
164 next = list->next;
165 prefetch(next);
166 list->func(list);
167 list = next;
168 }
169}
170
171/*
172 * Invoke any callbacks whose grace period has completed.
173 */
174static void rcu_process_callbacks(struct softirq_action *unused)
175{
176 __rcu_process_callbacks(&rcu_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178}
179
180/*
181 * Wait for a grace period to elapse. But it is illegal to invoke
182 * synchronize_sched() from within an RCU read-side critical section.
183 * Therefore, any legal call to synchronize_sched() is a quiescent
184 * state, and so on a UP system, synchronize_sched() need do nothing.
185 * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the
186 * benefits of doing might_sleep() to reduce latency.)
187 *
188 * Cool, huh? (Due to Josh Triplett.)
189 *
190 * But we want to make this a static inline later.
191 */
192void synchronize_sched(void)
193{
194 cond_resched();
195}
196EXPORT_SYMBOL_GPL(synchronize_sched);
197
198void synchronize_rcu_bh(void)
199{
200 synchronize_sched();
201}
202EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/*
205 * Helper function for call_rcu() and call_rcu_bh().
206 */
207static void __call_rcu(struct rcu_head *head,
208 void (*func)(struct rcu_head *rcu),
209 struct rcu_ctrlblk *rcp)
210{
211 unsigned long flags;
212
213 head->func = func;
214 head->next = NULL;
215
216 local_irq_save(flags);
217 *rcp->curtail = head;
218 rcp->curtail = &head->next;
219 local_irq_restore(flags);
220}
221
222/*
223 * Post an RCU callback to be invoked after the end of an RCU grace
224 * period. But since we have but one CPU, that would be after any
225 * quiescent state.
226 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{
229 __call_rcu(head, func, &rcu_ctrlblk);
230}
231EXPORT_SYMBOL_GPL(call_rcu);
232
233/*
234 * Post an RCU bottom-half callback to be invoked after any subsequent
235 * quiescent state.
236 */
237void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
238{
239 __call_rcu(head, func, &rcu_bh_ctrlblk);
240}
241EXPORT_SYMBOL_GPL(call_rcu_bh);
242
243void rcu_barrier(void)
244{
245 struct rcu_synchronize rcu;
246
247 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */
251 wait_for_completion(&rcu.completion);
252}
253EXPORT_SYMBOL_GPL(rcu_barrier);
254
255void rcu_barrier_bh(void)
256{
257 struct rcu_synchronize rcu;
258
259 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */
263 wait_for_completion(&rcu.completion);
264}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266
267void rcu_barrier_sched(void)
268{
269 struct rcu_synchronize rcu;
270
271 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */
275 wait_for_completion(&rcu.completion);
276}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278
279void __init rcu_init(void)
280{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282}
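
rcutiny.c exports the same call_rcu()/call_rcu_bh() interface as the tree implementation, just driven from RCU_SOFTIRQ on a single CPU. A usage sketch of the canonical deferred-free pattern these callbacks exist for (struct and function names are illustrative):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rcu;		/* embedded callback handle */
};

static void foo_free_rcu(struct rcu_head *head)
{
	struct foo *fp = container_of(head, struct foo, rcu);

	kfree(fp);			/* no reader can still see fp here */
}

static void foo_release(struct foo *fp)
{
	/* readers traversing under rcu_read_lock() may still hold fp,
	 * so defer the kfree() until a grace period has elapsed */
	call_rcu(&fp->rcu, foo_free_rcu);
}
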
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index b33db539a8ad..a621a67ef4e3 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -18,7 +18,7 @@
18 * Copyright (C) IBM Corporation, 2005, 2006 18 * Copyright (C) IBM Corporation, 2005, 2006
19 * 19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com> 20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * Josh Triplett <josh@freedesktop.org> 21 * Josh Triplett <josh@freedesktop.org>
22 * 22 *
23 * See also: Documentation/RCU/torture.txt 23 * See also: Documentation/RCU/torture.txt
24 */ 24 */
@@ -50,7 +50,7 @@
50 50
51MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
53 "Josh Triplett <josh@freedesktop.org>"); 53 "Josh Triplett <josh@freedesktop.org>");
54 54
55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56static int nfakewriters = 4; /* # fake writer threads */ 56static int nfakewriters = 4; /* # fake writer threads */
@@ -110,8 +110,8 @@ struct rcu_torture {
110}; 110};
111 111
112static LIST_HEAD(rcu_torture_freelist); 112static LIST_HEAD(rcu_torture_freelist);
113static struct rcu_torture *rcu_torture_current = NULL; 113static struct rcu_torture *rcu_torture_current;
114static long rcu_torture_current_version = 0; 114static long rcu_torture_current_version;
115static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 115static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
116static DEFINE_SPINLOCK(rcu_torture_lock); 116static DEFINE_SPINLOCK(rcu_torture_lock);
117static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 117static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -124,11 +124,11 @@ static atomic_t n_rcu_torture_alloc_fail;
124static atomic_t n_rcu_torture_free; 124static atomic_t n_rcu_torture_free;
125static atomic_t n_rcu_torture_mberror; 125static atomic_t n_rcu_torture_mberror;
126static atomic_t n_rcu_torture_error; 126static atomic_t n_rcu_torture_error;
127static long n_rcu_torture_timers = 0; 127static long n_rcu_torture_timers;
128static struct list_head rcu_torture_removed; 128static struct list_head rcu_torture_removed;
129static cpumask_var_t shuffle_tmp_mask; 129static cpumask_var_t shuffle_tmp_mask;
130 130
131static int stutter_pause_test = 0; 131static int stutter_pause_test;
132 132
133#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 133#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
134#define RCUTORTURE_RUNNABLE_INIT 1 134#define RCUTORTURE_RUNNABLE_INIT 1
@@ -267,7 +267,8 @@ struct rcu_torture_ops {
267 int irq_capable; 267 int irq_capable;
268 char *name; 268 char *name;
269}; 269};
270static struct rcu_torture_ops *cur_ops = NULL; 270
271static struct rcu_torture_ops *cur_ops;
271 272
272/* 273/*
273 * Definitions for rcu torture testing. 274 * Definitions for rcu torture testing.
@@ -281,14 +282,17 @@ static int rcu_torture_read_lock(void) __acquires(RCU)
281 282
282static void rcu_read_delay(struct rcu_random_state *rrsp) 283static void rcu_read_delay(struct rcu_random_state *rrsp)
283{ 284{
284 long delay; 285 const unsigned long shortdelay_us = 200;
285 const long longdelay = 200; 286 const unsigned long longdelay_ms = 50;
286 287
287 /* We want there to be long-running readers, but not all the time. */ 288 /* We want a short delay sometimes to make a reader delay the grace
289 * period, and we want a long delay occasionally to trigger
290 * force_quiescent_state. */
288 291
289 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); 292 if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
290 if (!delay) 293 mdelay(longdelay_ms);
291 udelay(longdelay); 294 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
295 udelay(shortdelay_us);
292} 296}
293 297
294static void rcu_torture_read_unlock(int idx) __releases(RCU) 298static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -323,6 +327,11 @@ rcu_torture_cb(struct rcu_head *p)
323 cur_ops->deferred_free(rp); 327 cur_ops->deferred_free(rp);
324} 328}
325 329
330static int rcu_no_completed(void)
331{
332 return 0;
333}
334
326static void rcu_torture_deferred_free(struct rcu_torture *p) 335static void rcu_torture_deferred_free(struct rcu_torture *p)
327{ 336{
328 call_rcu(&p->rtort_rcu, rcu_torture_cb); 337 call_rcu(&p->rtort_rcu, rcu_torture_cb);
@@ -339,8 +348,8 @@ static struct rcu_torture_ops rcu_ops = {
339 .sync = synchronize_rcu, 348 .sync = synchronize_rcu,
340 .cb_barrier = rcu_barrier, 349 .cb_barrier = rcu_barrier,
341 .stats = NULL, 350 .stats = NULL,
342 .irq_capable = 1, 351 .irq_capable = 1,
343 .name = "rcu" 352 .name = "rcu"
344}; 353};
345 354
346static void rcu_sync_torture_deferred_free(struct rcu_torture *p) 355static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -384,6 +393,21 @@ static struct rcu_torture_ops rcu_sync_ops = {
384 .name = "rcu_sync" 393 .name = "rcu_sync"
385}; 394};
386 395
396static struct rcu_torture_ops rcu_expedited_ops = {
397 .init = rcu_sync_torture_init,
398 .cleanup = NULL,
399 .readlock = rcu_torture_read_lock,
400 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
401 .readunlock = rcu_torture_read_unlock,
402 .completed = rcu_no_completed,
403 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL,
406 .stats = NULL,
407 .irq_capable = 1,
408 .name = "rcu_expedited"
409};
410
387/* 411/*
388 * Definitions for rcu_bh torture testing. 412 * Definitions for rcu_bh torture testing.
389 */ 413 */
@@ -543,6 +567,25 @@ static struct rcu_torture_ops srcu_ops = {
543 .name = "srcu" 567 .name = "srcu"
544}; 568};
545 569
570static void srcu_torture_synchronize_expedited(void)
571{
572 synchronize_srcu_expedited(&srcu_ctl);
573}
574
575static struct rcu_torture_ops srcu_expedited_ops = {
576 .init = srcu_torture_init,
577 .cleanup = srcu_torture_cleanup,
578 .readlock = srcu_torture_read_lock,
579 .read_delay = srcu_read_delay,
580 .readunlock = srcu_torture_read_unlock,
581 .completed = srcu_torture_completed,
582 .deferred_free = rcu_sync_torture_deferred_free,
583 .sync = srcu_torture_synchronize_expedited,
584 .cb_barrier = NULL,
585 .stats = srcu_torture_stats,
586 .name = "srcu_expedited"
587};
588
546/* 589/*
547 * Definitions for sched torture testing. 590 * Definitions for sched torture testing.
548 */ 591 */
@@ -558,11 +601,6 @@ static void sched_torture_read_unlock(int idx)
558 preempt_enable(); 601 preempt_enable();
559} 602}
560 603
561static int sched_torture_completed(void)
562{
563 return 0;
564}
565
566static void rcu_sched_torture_deferred_free(struct rcu_torture *p) 604static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
567{ 605{
568 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 606 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
@@ -579,7 +617,7 @@ static struct rcu_torture_ops sched_ops = {
579 .readlock = sched_torture_read_lock, 617 .readlock = sched_torture_read_lock,
580 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 618 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
581 .readunlock = sched_torture_read_unlock, 619 .readunlock = sched_torture_read_unlock,
582 .completed = sched_torture_completed, 620 .completed = rcu_no_completed,
583 .deferred_free = rcu_sched_torture_deferred_free, 621 .deferred_free = rcu_sched_torture_deferred_free,
584 .sync = sched_torture_synchronize, 622 .sync = sched_torture_synchronize,
585 .cb_barrier = rcu_barrier_sched, 623 .cb_barrier = rcu_barrier_sched,
@@ -588,13 +626,13 @@ static struct rcu_torture_ops sched_ops = {
588 .name = "sched" 626 .name = "sched"
589}; 627};
590 628
591static struct rcu_torture_ops sched_ops_sync = { 629static struct rcu_torture_ops sched_sync_ops = {
592 .init = rcu_sync_torture_init, 630 .init = rcu_sync_torture_init,
593 .cleanup = NULL, 631 .cleanup = NULL,
594 .readlock = sched_torture_read_lock, 632 .readlock = sched_torture_read_lock,
595 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 633 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
596 .readunlock = sched_torture_read_unlock, 634 .readunlock = sched_torture_read_unlock,
597 .completed = sched_torture_completed, 635 .completed = rcu_no_completed,
598 .deferred_free = rcu_sync_torture_deferred_free, 636 .deferred_free = rcu_sync_torture_deferred_free,
599 .sync = sched_torture_synchronize, 637 .sync = sched_torture_synchronize,
600 .cb_barrier = NULL, 638 .cb_barrier = NULL,
@@ -602,15 +640,13 @@ static struct rcu_torture_ops sched_ops_sync = {
602 .name = "sched_sync" 640 .name = "sched_sync"
603}; 641};
604 642
605extern int rcu_expedited_torture_stats(char *page);
606
607static struct rcu_torture_ops sched_expedited_ops = { 643static struct rcu_torture_ops sched_expedited_ops = {
608 .init = rcu_sync_torture_init, 644 .init = rcu_sync_torture_init,
609 .cleanup = NULL, 645 .cleanup = NULL,
610 .readlock = sched_torture_read_lock, 646 .readlock = sched_torture_read_lock,
611 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 647 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
612 .readunlock = sched_torture_read_unlock, 648 .readunlock = sched_torture_read_unlock,
613 .completed = sched_torture_completed, 649 .completed = rcu_no_completed,
614 .deferred_free = rcu_sync_torture_deferred_free, 650 .deferred_free = rcu_sync_torture_deferred_free,
615 .sync = synchronize_sched_expedited, 651 .sync = synchronize_sched_expedited,
616 .cb_barrier = NULL, 652 .cb_barrier = NULL,
@@ -638,14 +674,15 @@ rcu_torture_writer(void *arg)
638 674
639 do { 675 do {
640 schedule_timeout_uninterruptible(1); 676 schedule_timeout_uninterruptible(1);
641 if ((rp = rcu_torture_alloc()) == NULL) 677 rp = rcu_torture_alloc();
678 if (rp == NULL)
642 continue; 679 continue;
643 rp->rtort_pipe_count = 0; 680 rp->rtort_pipe_count = 0;
644 udelay(rcu_random(&rand) & 0x3ff); 681 udelay(rcu_random(&rand) & 0x3ff);
645 old_rp = rcu_torture_current; 682 old_rp = rcu_torture_current;
646 rp->rtort_mbtest = 1; 683 rp->rtort_mbtest = 1;
647 rcu_assign_pointer(rcu_torture_current, rp); 684 rcu_assign_pointer(rcu_torture_current, rp);
648 smp_wmb(); 685 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
649 if (old_rp) { 686 if (old_rp) {
650 i = old_rp->rtort_pipe_count; 687 i = old_rp->rtort_pipe_count;
651 if (i > RCU_TORTURE_PIPE_LEN) 688 if (i > RCU_TORTURE_PIPE_LEN)
@@ -1094,9 +1131,10 @@ rcu_torture_init(void)
1094 int cpu; 1131 int cpu;
1095 int firsterr = 0; 1132 int firsterr = 0;
1096 static struct rcu_torture_ops *torture_ops[] = 1133 static struct rcu_torture_ops *torture_ops[] =
1097 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1134 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1098 &sched_expedited_ops, 1135 &rcu_bh_ops, &rcu_bh_sync_ops,
1099 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1136 &srcu_ops, &srcu_expedited_ops,
1137 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1100 1138
1101 mutex_lock(&fullstop_mutex); 1139 mutex_lock(&fullstop_mutex);
1102 1140
@@ -1107,10 +1145,14 @@ rcu_torture_init(void)
1107 break; 1145 break;
1108 } 1146 }
1109 if (i == ARRAY_SIZE(torture_ops)) { 1147 if (i == ARRAY_SIZE(torture_ops)) {
1110 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1148 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
1111 torture_type); 1149 torture_type);
1150 printk(KERN_ALERT "rcu-torture types:");
1151 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1152 printk(KERN_ALERT " %s", torture_ops[i]->name);
1153 printk(KERN_ALERT "\n");
1112 mutex_unlock(&fullstop_mutex); 1154 mutex_unlock(&fullstop_mutex);
1113 return (-EINVAL); 1155 return -EINVAL;
1114 } 1156 }
1115 if (cur_ops->init) 1157 if (cur_ops->init)
1116 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1158 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
@@ -1161,7 +1203,7 @@ rcu_torture_init(void)
1161 goto unwind; 1203 goto unwind;
1162 } 1204 }
1163 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 1205 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
1164 GFP_KERNEL); 1206 GFP_KERNEL);
1165 if (fakewriter_tasks == NULL) { 1207 if (fakewriter_tasks == NULL) {
1166 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1208 VERBOSE_PRINTK_ERRSTRING("out of memory");
1167 firsterr = -ENOMEM; 1209 firsterr = -ENOMEM;
@@ -1170,7 +1212,7 @@ rcu_torture_init(void)
1170 for (i = 0; i < nfakewriters; i++) { 1212 for (i = 0; i < nfakewriters; i++) {
1171 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); 1213 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
1172 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, 1214 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
1173 "rcu_torture_fakewriter"); 1215 "rcu_torture_fakewriter");
1174 if (IS_ERR(fakewriter_tasks[i])) { 1216 if (IS_ERR(fakewriter_tasks[i])) {
1175 firsterr = PTR_ERR(fakewriter_tasks[i]); 1217 firsterr = PTR_ERR(fakewriter_tasks[i]);
1176 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); 1218 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
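The fakewriter hunk above uses the usual kthread creation pattern: kthread_run() returns either a task pointer or an ERR_PTR() value, so failure is detected with IS_ERR()/PTR_ERR() rather than a NULL check. A sketch of that pattern in module context, with illustrative names only, is:

/* Sketch of the kthread_run()/IS_ERR() pattern seen above; the thread
 * function and names are illustrative, not from the patch. */
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static int demo_thread_fn(void *unused)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static int demo_start_thread(struct task_struct **tp)
{
	*tp = kthread_run(demo_thread_fn, NULL, "demo_thread");
	if (IS_ERR(*tp)) {
		int err = PTR_ERR(*tp);	/* e.g. -ENOMEM */

		*tp = NULL;
		return err;
	}
	return 0;
}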
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6b11b07cfe7f..53ae9598f798 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -25,7 +25,7 @@
25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
26 * 26 *
27 * For detailed explanation of Read-Copy Update mechanism see - 27 * For detailed explanation of Read-Copy Update mechanism see -
28 * Documentation/RCU 28 * Documentation/RCU
29 */ 29 */
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
@@ -46,30 +46,30 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
49 50
50#include "rcutree.h" 51#include "rcutree.h"
51 52
52#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key rcu_lock_key;
54struct lockdep_map rcu_lock_map =
55 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59/* Data structures. */ 53/* Data structures. */
60 54
55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
56
61#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(name) { \
62 .level = { &name.node[0] }, \ 58 .level = { &name.node[0] }, \
63 .levelcnt = { \ 59 .levelcnt = { \
64 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
65 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
66 NUM_RCU_LVL_2, \ 62 NUM_RCU_LVL_2, \
67 NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ 63 NUM_RCU_LVL_3, \
64 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
68 }, \ 65 }, \
69 .signaled = RCU_SIGNAL_INIT, \ 66 .signaled = RCU_GP_IDLE, \
70 .gpnum = -300, \ 67 .gpnum = -300, \
71 .completed = -300, \ 68 .completed = -300, \
72 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &name.orphan_cbs_list, \
72 .orphan_qlen = 0, \
73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ 73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
74 .n_force_qs = 0, \ 74 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
@@ -81,24 +81,18 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
83 83
84extern long rcu_batches_completed_sched(void); 84static int rcu_scheduler_active __read_mostly;
85static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
86static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
87 struct rcu_node *rnp, unsigned long flags);
88static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
89#ifdef CONFIG_HOTPLUG_CPU
90static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
91#endif /* #ifdef CONFIG_HOTPLUG_CPU */
92static void __rcu_process_callbacks(struct rcu_state *rsp,
93 struct rcu_data *rdp);
94static void __call_rcu(struct rcu_head *head,
95 void (*func)(struct rcu_head *rcu),
96 struct rcu_state *rsp);
97static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
98static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
99 int preemptable);
100 85
101#include "rcutree_plugin.h" 86
87/*
88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
89 * permit this function to be invoked without holding the root rcu_node
90 * structure's ->lock, but of course results can be subject to change.
91 */
92static int rcu_gp_in_progress(struct rcu_state *rsp)
93{
94 return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
95}
102 96
103/* 97/*
104 * Note a quiescent state. Because we do not need to know 98 * Note a quiescent state. Because we do not need to know
@@ -107,27 +101,23 @@ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
107 */ 101 */
108void rcu_sched_qs(int cpu) 102void rcu_sched_qs(int cpu)
109{ 103{
110 unsigned long flags;
111 struct rcu_data *rdp; 104 struct rcu_data *rdp;
112 105
113 local_irq_save(flags);
114 rdp = &per_cpu(rcu_sched_data, cpu); 106 rdp = &per_cpu(rcu_sched_data, cpu);
107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
108 barrier();
115 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
116 rdp->passed_quiesc_completed = rdp->completed; 110 rcu_preempt_note_context_switch(cpu);
117 rcu_preempt_qs(cpu);
118 local_irq_restore(flags);
119} 111}
120 112
121void rcu_bh_qs(int cpu) 113void rcu_bh_qs(int cpu)
122{ 114{
123 unsigned long flags;
124 struct rcu_data *rdp; 115 struct rcu_data *rdp;
125 116
126 local_irq_save(flags);
127 rdp = &per_cpu(rcu_bh_data, cpu); 117 rdp = &per_cpu(rcu_bh_data, cpu);
118 rdp->passed_quiesc_completed = rdp->gpnum - 1;
119 barrier();
128 rdp->passed_quiesc = 1; 120 rdp->passed_quiesc = 1;
129 rdp->passed_quiesc_completed = rdp->completed;
130 local_irq_restore(flags);
131} 121}
132 122
133#ifdef CONFIG_NO_HZ 123#ifdef CONFIG_NO_HZ
@@ -141,6 +131,10 @@ static int blimit = 10; /* Maximum callbacks per softirq. */
141static int qhimark = 10000; /* If this many pending, ignore blimit. */ 131static int qhimark = 10000; /* If this many pending, ignore blimit. */
142static int qlowmark = 100; /* Once only this many pending, use blimit. */ 132static int qlowmark = 100; /* Once only this many pending, use blimit. */
143 133
134module_param(blimit, int, 0);
135module_param(qhimark, int, 0);
136module_param(qlowmark, int, 0);
137
144static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 138static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
145static int rcu_pending(int cpu); 139static int rcu_pending(int cpu);
146 140
@@ -177,9 +171,7 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
177static int 171static int
178cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 172cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
179{ 173{
180 /* ACCESS_ONCE() because we are accessing outside of lock. */ 174 return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp);
181 return *rdp->nxttail[RCU_DONE_TAIL] &&
182 ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
183} 175}
184 176
185/* 177/*
@@ -349,31 +341,12 @@ void rcu_irq_exit(void)
349 set_need_resched(); 341 set_need_resched();
350} 342}
351 343
352/*
353 * Record the specified "completed" value, which is later used to validate
354 * dynticks counter manipulations. Specify "rsp->completed - 1" to
355 * unconditionally invalidate any future dynticks manipulations (which is
356 * useful at the beginning of a grace period).
357 */
358static void dyntick_record_completed(struct rcu_state *rsp, long comp)
359{
360 rsp->dynticks_completed = comp;
361}
362
363#ifdef CONFIG_SMP 344#ifdef CONFIG_SMP
364 345
365/* 346/*
366 * Recall the previously recorded value of the completion for dynticks.
367 */
368static long dyntick_recall_completed(struct rcu_state *rsp)
369{
370 return rsp->dynticks_completed;
371}
372
373/*
374 * Snapshot the specified CPU's dynticks counter so that we can later 347 * Snapshot the specified CPU's dynticks counter so that we can later
375 * credit them with an implicit quiescent state. Return 1 if this CPU 348 * credit them with an implicit quiescent state. Return 1 if this CPU
376 * is already in a quiescent state courtesy of dynticks idle mode. 349 * is in dynticks idle mode, which is an extended quiescent state.
377 */ 350 */
378static int dyntick_save_progress_counter(struct rcu_data *rdp) 351static int dyntick_save_progress_counter(struct rcu_data *rdp)
379{ 352{
@@ -433,24 +406,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
433 406
434#else /* #ifdef CONFIG_NO_HZ */ 407#else /* #ifdef CONFIG_NO_HZ */
435 408
436static void dyntick_record_completed(struct rcu_state *rsp, long comp)
437{
438}
439
440#ifdef CONFIG_SMP 409#ifdef CONFIG_SMP
441 410
442/*
443 * If there are no dynticks, then the only way that a CPU can passively
444 * be in a quiescent state is to be offline. Unlike dynticks idle, which
445 * is a point in time during the prior (already finished) grace period,
446 * an offline CPU is always in a quiescent state, and thus can be
447 * unconditionally applied. So just return the current value of completed.
448 */
449static long dyntick_recall_completed(struct rcu_state *rsp)
450{
451 return rsp->completed;
452}
453
454static int dyntick_save_progress_counter(struct rcu_data *rdp) 411static int dyntick_save_progress_counter(struct rcu_data *rdp)
455{ 412{
456 return 0; 413 return 0;
@@ -479,30 +436,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
479 long delta; 436 long delta;
480 unsigned long flags; 437 unsigned long flags;
481 struct rcu_node *rnp = rcu_get_root(rsp); 438 struct rcu_node *rnp = rcu_get_root(rsp);
482 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
483 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
484 439
485 /* Only let one CPU complain about others per time interval. */ 440 /* Only let one CPU complain about others per time interval. */
486 441
487 spin_lock_irqsave(&rnp->lock, flags); 442 spin_lock_irqsave(&rnp->lock, flags);
488 delta = jiffies - rsp->jiffies_stall; 443 delta = jiffies - rsp->jiffies_stall;
489 if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) { 444 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
490 spin_unlock_irqrestore(&rnp->lock, flags); 445 spin_unlock_irqrestore(&rnp->lock, flags);
491 return; 446 return;
492 } 447 }
493 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 448 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
449
450 /*
451 * Now rat on any tasks that got kicked up to the root rcu_node
452 * due to CPU offlining.
453 */
454 rcu_print_task_stall(rnp);
494 spin_unlock_irqrestore(&rnp->lock, flags); 455 spin_unlock_irqrestore(&rnp->lock, flags);
495 456
496 /* OK, time to rat on our buddy... */ 457 /* OK, time to rat on our buddy... */
497 458
498 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 459 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
499 for (; rnp_cur < rnp_end; rnp_cur++) { 460 rcu_for_each_leaf_node(rsp, rnp) {
500 rcu_print_task_stall(rnp); 461 rcu_print_task_stall(rnp);
501 if (rnp_cur->qsmask == 0) 462 if (rnp->qsmask == 0)
502 continue; 463 continue;
503 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) 464 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
504 if (rnp_cur->qsmask & (1UL << cpu)) 465 if (rnp->qsmask & (1UL << cpu))
505 printk(" %d", rnp_cur->grplo + cpu); 466 printk(" %d", rnp->grplo + cpu);
506 } 467 }
507 printk(" (detected by %d, t=%ld jiffies)\n", 468 printk(" (detected by %d, t=%ld jiffies)\n",
508 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 469 smp_processor_id(), (long)(jiffies - rsp->gp_start));
@@ -541,8 +502,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
541 /* We haven't checked in, so go dump stack. */ 502 /* We haven't checked in, so go dump stack. */
542 print_cpu_stall(rsp); 503 print_cpu_stall(rsp);
543 504
544 } else if (rsp->gpnum != rsp->completed && 505 } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) {
545 delta >= RCU_STALL_RAT_DELAY) {
546 506
547 /* They had two time units to dump stack, so complain. */ 507 /* They had two time units to dump stack, so complain. */
548 print_other_cpu_stall(rsp); 508 print_other_cpu_stall(rsp);
@@ -564,13 +524,33 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
564/* 524/*
565 * Update CPU-local rcu_data state to record the newly noticed grace period. 525 * Update CPU-local rcu_data state to record the newly noticed grace period.
566 * This is used both when we started the grace period and when we notice 526 * This is used both when we started the grace period and when we notice
567 * that someone else started the grace period. 527 * that someone else started the grace period. The caller must hold the
528 * ->lock of the leaf rcu_node structure corresponding to the current CPU,
529 * and must have irqs disabled.
568 */ 530 */
531static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
532{
533 if (rdp->gpnum != rnp->gpnum) {
534 rdp->qs_pending = 1;
535 rdp->passed_quiesc = 0;
536 rdp->gpnum = rnp->gpnum;
537 }
538}
539
569static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) 540static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
570{ 541{
571 rdp->qs_pending = 1; 542 unsigned long flags;
572 rdp->passed_quiesc = 0; 543 struct rcu_node *rnp;
573 rdp->gpnum = rsp->gpnum; 544
545 local_irq_save(flags);
546 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
549 local_irq_restore(flags);
550 return;
551 }
552 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags);
574} 554}
575 555
576/* 556/*
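note_new_gpnum() above (and rcu_process_gp_end(), added later in this patch) follows an opportunistic pattern: disable irqs, recheck the condition with a racy read outside the lock, then spin_trylock() and simply retry later if the lock is contended rather than spinning with irqs off. A minimal sketch of that idiom, with hypothetical names:

/* Sketch of the irqs-off + spin_trylock() "retry later" idiom used by
 * note_new_gpnum(); demo_lock and demo_state are illustrative only. */
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);
static int demo_state;

static void demo_try_update(int new_state)
{
	unsigned long flags;

	local_irq_save(flags);
	if (demo_state == new_state ||		/* racy precheck, like the gpnum test */
	    !spin_trylock(&demo_lock)) {	/* contended: just try again later */
		local_irq_restore(flags);
		return;
	}
	demo_state = new_state;
	spin_unlock_irqrestore(&demo_lock, flags);
}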
@@ -594,6 +574,79 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
594} 574}
595 575
596/* 576/*
577 * Advance this CPU's callbacks, but only if the current grace period
578 * has ended. This may be called only from the CPU to whom the rdp
579 * belongs. In addition, the corresponding leaf rcu_node structure's
580 * ->lock must be held by the caller, with irqs disabled.
581 */
582static void
583__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
584{
585 /* Did another grace period end? */
586 if (rdp->completed != rnp->completed) {
587
588 /* Advance callbacks. No harm if list empty. */
589 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
590 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
591 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
592
593 /* Remember that we saw this grace-period completion. */
594 rdp->completed = rnp->completed;
595 }
596}
597
598/*
599 * Advance this CPU's callbacks, but only if the current grace period
600 * has ended. This may be called only from the CPU to whom the rdp
601 * belongs.
602 */
603static void
604rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
605{
606 unsigned long flags;
607 struct rcu_node *rnp;
608
609 local_irq_save(flags);
610 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
613 local_irq_restore(flags);
614 return;
615 }
616 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags);
618}
619
620/*
621 * Do per-CPU grace-period initialization for running CPU. The caller
622 * must hold the lock of the leaf rcu_node structure corresponding to
623 * this CPU.
624 */
625static void
626rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
627{
628 /* Prior grace period ended, so advance callbacks for current CPU. */
629 __rcu_process_gp_end(rsp, rnp, rdp);
630
631 /*
632 * Because this CPU just now started the new grace period, we know
633 * that all of its callbacks will be covered by this upcoming grace
634 * period, even the ones that were registered arbitrarily recently.
635 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
636 *
637 * Other CPUs cannot be sure exactly when the grace period started.
638 * Therefore, their recently registered callbacks must pass through
639 * an additional RCU_NEXT_READY stage, so that they will be handled
640 * by the next RCU grace period.
641 */
642 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
643 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
644
645 /* Set state so that this CPU will detect the next quiescent state. */
646 __note_new_gpnum(rsp, rnp, rdp);
647}
648
649/*
597 * Start a new RCU grace period if warranted, re-initializing the hierarchy 650 * Start a new RCU grace period if warranted, re-initializing the hierarchy
598 * in preparation for detecting the next grace period. The caller must hold 651 * in preparation for detecting the next grace period. The caller must hold
599 * the root node's ->lock, which is released before return. Hard irqs must 652 * the root node's ->lock, which is released before return. Hard irqs must
@@ -605,34 +658,43 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
605{ 658{
606 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 659 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
607 struct rcu_node *rnp = rcu_get_root(rsp); 660 struct rcu_node *rnp = rcu_get_root(rsp);
608 struct rcu_node *rnp_cur;
609 struct rcu_node *rnp_end;
610 661
611 if (!cpu_needs_another_gp(rsp, rdp)) { 662 if (!cpu_needs_another_gp(rsp, rdp)) {
612 spin_unlock_irqrestore(&rnp->lock, flags); 663 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags);
665 return;
666 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */
668
669 /*
670 * Propagate new ->completed value to rcu_node structures
671 * so that other CPUs don't have to wait until the start
672 * of the next grace period to process their callbacks.
673 */
674 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 }
679 local_irq_restore(flags);
613 return; 680 return;
614 } 681 }
615 682
616 /* Advance to a new grace period and initialize state. */ 683 /* Advance to a new grace period and initialize state. */
617 rsp->gpnum++; 684 rsp->gpnum++;
685 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
618 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 686 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
619 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 687 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
620 record_gp_stall_check_time(rsp); 688 record_gp_stall_check_time(rsp);
621 dyntick_record_completed(rsp, rsp->completed - 1);
622 note_new_gpnum(rsp, rdp);
623
624 /*
625 * Because we are first, we know that all our callbacks will
626 * be covered by this upcoming grace period, even the ones
627 * that were registered arbitrarily recently.
628 */
629 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
630 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
631 689
632 /* Special-case the common single-level case. */ 690 /* Special-case the common single-level case. */
633 if (NUM_RCU_NODES == 1) { 691 if (NUM_RCU_NODES == 1) {
692 rcu_preempt_check_blocked_tasks(rnp);
634 rnp->qsmask = rnp->qsmaskinit; 693 rnp->qsmask = rnp->qsmaskinit;
694 rnp->gpnum = rsp->gpnum;
695 rnp->completed = rsp->completed;
635 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp);
636 spin_unlock_irqrestore(&rnp->lock, flags); 698 spin_unlock_irqrestore(&rnp->lock, flags);
637 return; 699 return;
638 } 700 }
@@ -644,101 +706,71 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
644 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 706 spin_lock(&rsp->onofflock); /* irqs already disabled. */
645 707
646 /* 708 /*
647 * Set the quiescent-state-needed bits in all the non-leaf RCU 709 * Set the quiescent-state-needed bits in all the rcu_node
648 * nodes for all currently online CPUs. This operation relies 710 * structures for all currently online CPUs in breadth-first
649 * on the layout of the hierarchy within the rsp->node[] array. 711 * order, starting from the root rcu_node structure. This
650 * Note that other CPUs will access only the leaves of the 712 * operation relies on the layout of the hierarchy within the
651 * hierarchy, which still indicate that no grace period is in 713 * rsp->node[] array. Note that other CPUs will access only
652 * progress. In addition, we have excluded CPU-hotplug operations. 714 * the leaves of the hierarchy, which still indicate that no
653 * 715 * grace period is in progress, at least until the corresponding
654 * We therefore do not need to hold any locks. Any required 716 * leaf node has been initialized. In addition, we have excluded
655 * memory barriers will be supplied by the locks guarding the 717 * CPU-hotplug operations.
656 * leaf rcu_nodes in the hierarchy.
657 */
658
659 rnp_end = rsp->level[NUM_RCU_LVLS - 1];
660 for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
661 rnp_cur->qsmask = rnp_cur->qsmaskinit;
662
663 /*
664 * Now set up the leaf nodes. Here we must be careful. First,
665 * we need to hold the lock in order to exclude other CPUs, which
666 * might be contending for the leaf nodes' locks. Second, as
667 * soon as we initialize a given leaf node, its CPUs might run
668 * up the rest of the hierarchy. We must therefore acquire locks
669 * for each node that we touch during this stage. (But we still
670 * are excluding CPU-hotplug operations.)
671 * 718 *
672 * Note that the grace period cannot complete until we finish 719 * Note that the grace period cannot complete until we finish
673 * the initialization process, as there will be at least one 720 * the initialization process, as there will be at least one
674 * qsmask bit set in the root node until that time, namely the 721 * qsmask bit set in the root node until that time, namely the
675 * one corresponding to this CPU. 722 * one corresponding to this CPU, due to the fact that we have
723 * irqs disabled.
676 */ 724 */
677 rnp_end = &rsp->node[NUM_RCU_NODES]; 725 rcu_for_each_node_breadth_first(rsp, rnp) {
678 rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; 726 spin_lock(&rnp->lock); /* irqs already disabled. */
679 for (; rnp_cur < rnp_end; rnp_cur++) { 727 rcu_preempt_check_blocked_tasks(rnp);
680 spin_lock(&rnp_cur->lock); /* irqs already disabled. */ 728 rnp->qsmask = rnp->qsmaskinit;
681 rnp_cur->qsmask = rnp_cur->qsmaskinit; 729 rnp->gpnum = rsp->gpnum;
682 spin_unlock(&rnp_cur->lock); /* irqs already disabled. */ 730 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp);
733 spin_unlock(&rnp->lock); /* irqs remain disabled. */
683 } 734 }
684 735
736 rnp = rcu_get_root(rsp);
737 spin_lock(&rnp->lock); /* irqs already disabled. */
685 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 738 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
739 spin_unlock(&rnp->lock); /* irqs remain disabled. */
686 spin_unlock_irqrestore(&rsp->onofflock, flags); 740 spin_unlock_irqrestore(&rsp->onofflock, flags);
687} 741}
688 742
689/* 743/*
690 * Advance this CPU's callbacks, but only if the current grace period 744 * Report a full set of quiescent states to the specified rcu_state
691 * has ended. This may be called only from the CPU to whom the rdp 745 * data structure. This involves cleaning up after the prior grace
692 * belongs. 746 * period and letting rcu_start_gp() start up the next grace period
693 */ 747 * if one is needed. Note that the caller must hold rnp->lock, as
694static void 748 * required by rcu_start_gp(), which will release it.
695rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
696{
697 long completed_snap;
698 unsigned long flags;
699
700 local_irq_save(flags);
701 completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
702
703 /* Did another grace period end? */
704 if (rdp->completed != completed_snap) {
705
706 /* Advance callbacks. No harm if list empty. */
707 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
708 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
709 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
710
711 /* Remember that we saw this grace-period completion. */
712 rdp->completed = completed_snap;
713 }
714 local_irq_restore(flags);
715}
716
717/*
718 * Clean up after the prior grace period and let rcu_start_gp() start up
719 * the next grace period if one is needed. Note that the caller must
720 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
721 */ 749 */
722static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) 750static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
723 __releases(rnp->lock) 751 __releases(rcu_get_root(rsp)->lock)
724{ 752{
753 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
725 rsp->completed = rsp->gpnum; 754 rsp->completed = rsp->gpnum;
726 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); 755 rsp->signaled = RCU_GP_IDLE;
727 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 756 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
728} 757}
729 758
730/* 759/*
731 * Similar to cpu_quiet(), for which it is a helper function. Allows 760 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
732 * a group of CPUs to be quieted at one go, though all the CPUs in the 761 * Allows quiescent states for a group of CPUs to be reported at one go
733 * group must be represented by the same leaf rcu_node structure. 762 * to the specified rcu_node structure, though all the CPUs in the group
734 * That structure's lock must be held upon entry, and it is released 763 * must be represented by the same rcu_node structure (which need not be
735 * before return. 764 * a leaf rcu_node structure, though it often will be). That structure's
765 * lock must be held upon entry, and it is released before return.
736 */ 766 */
737static void 767static void
738cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, 768rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
739 unsigned long flags) 769 struct rcu_node *rnp, unsigned long flags)
740 __releases(rnp->lock) 770 __releases(rnp->lock)
741{ 771{
772 struct rcu_node *rnp_c;
773
742 /* Walk up the rcu_node hierarchy. */ 774 /* Walk up the rcu_node hierarchy. */
743 for (;;) { 775 for (;;) {
744 if (!(rnp->qsmask & mask)) { 776 if (!(rnp->qsmask & mask)) {
@@ -762,27 +794,31 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
762 break; 794 break;
763 } 795 }
764 spin_unlock_irqrestore(&rnp->lock, flags); 796 spin_unlock_irqrestore(&rnp->lock, flags);
797 rnp_c = rnp;
765 rnp = rnp->parent; 798 rnp = rnp->parent;
766 spin_lock_irqsave(&rnp->lock, flags); 799 spin_lock_irqsave(&rnp->lock, flags);
800 WARN_ON_ONCE(rnp_c->qsmask);
767 } 801 }
768 802
769 /* 803 /*
770 * Get here if we are the last CPU to pass through a quiescent 804 * Get here if we are the last CPU to pass through a quiescent
771 * state for this grace period. Invoke cpu_quiet_msk_finish() 805 * state for this grace period. Invoke rcu_report_qs_rsp()
772 * to clean up and start the next grace period if one is needed. 806 * to clean up and start the next grace period if one is needed.
773 */ 807 */
774 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */ 808 rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
775} 809}
776 810
777/* 811/*
778 * Record a quiescent state for the specified CPU, which must either be 812 * Record a quiescent state for the specified CPU to that CPU's rcu_data
779 * the current CPU or an offline CPU. The lastcomp argument is used to 813 * structure. This must be either called from the specified CPU, or
780 * make sure we are still in the grace period of interest. We don't want 814 * called when the specified CPU is known to be offline (and when it is
781 * to end the current grace period based on quiescent states detected in 815 * also known that no other CPU is concurrently trying to help the offline
782 * an earlier grace period! 816 * CPU). The lastcomp argument is used to make sure we are still in the
817 * grace period of interest. We don't want to end the current grace period
818 * based on quiescent states detected in an earlier grace period!
783 */ 819 */
784static void 820static void
785cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 821rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
786{ 822{
787 unsigned long flags; 823 unsigned long flags;
788 unsigned long mask; 824 unsigned long mask;
@@ -790,15 +826,15 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
790 826
791 rnp = rdp->mynode; 827 rnp = rdp->mynode;
792 spin_lock_irqsave(&rnp->lock, flags); 828 spin_lock_irqsave(&rnp->lock, flags);
793 if (lastcomp != ACCESS_ONCE(rsp->completed)) { 829 if (lastcomp != rnp->completed) {
794 830
795 /* 831 /*
796 * Someone beat us to it for this grace period, so leave. 832 * Someone beat us to it for this grace period, so leave.
797 * The race with GP start is resolved by the fact that we 833 * The race with GP start is resolved by the fact that we
798 * hold the leaf rcu_node lock, so that the per-CPU bits 834 * hold the leaf rcu_node lock, so that the per-CPU bits
799 * cannot yet be initialized -- so we would simply find our 835 * cannot yet be initialized -- so we would simply find our
800 * CPU's bit already cleared in cpu_quiet_msk() if this race 836 * CPU's bit already cleared in rcu_report_qs_rnp() if this
801 * occurred. 837 * race occurred.
802 */ 838 */
803 rdp->passed_quiesc = 0; /* try again later! */ 839 rdp->passed_quiesc = 0; /* try again later! */
804 spin_unlock_irqrestore(&rnp->lock, flags); 840 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -814,10 +850,9 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
814 * This GP can't end until cpu checks in, so all of our 850 * This GP can't end until cpu checks in, so all of our
815 * callbacks can be processed during the next GP. 851 * callbacks can be processed during the next GP.
816 */ 852 */
817 rdp = rsp->rda[smp_processor_id()];
818 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 853 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
819 854
820 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ 855 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
821 } 856 }
822} 857}
823 858
@@ -848,75 +883,113 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
848 if (!rdp->passed_quiesc) 883 if (!rdp->passed_quiesc)
849 return; 884 return;
850 885
851 /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */ 886 /*
852 cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); 887 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
888 * judge of that).
889 */
890 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
853} 891}
854 892
855#ifdef CONFIG_HOTPLUG_CPU 893#ifdef CONFIG_HOTPLUG_CPU
856 894
857/* 895/*
896 * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
897 * specified flavor of RCU. The callbacks will be adopted by the next
898 * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
899 * comes first. Because this is invoked from the CPU_DYING notifier,
900 * irqs are already disabled.
901 */
902static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
903{
904 int i;
905 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
906
907 if (rdp->nxtlist == NULL)
908 return; /* irqs disabled, so comparison is stable. */
909 spin_lock(&rsp->onofflock); /* irqs already disabled. */
910 *rsp->orphan_cbs_tail = rdp->nxtlist;
911 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
912 rdp->nxtlist = NULL;
913 for (i = 0; i < RCU_NEXT_SIZE; i++)
914 rdp->nxttail[i] = &rdp->nxtlist;
915 rsp->orphan_qlen += rdp->qlen;
916 rdp->qlen = 0;
917 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
918}
919
920/*
921 * Adopt previously orphaned RCU callbacks.
922 */
923static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
924{
925 unsigned long flags;
926 struct rcu_data *rdp;
927
928 spin_lock_irqsave(&rsp->onofflock, flags);
929 rdp = rsp->rda[smp_processor_id()];
930 if (rsp->orphan_cbs_list == NULL) {
931 spin_unlock_irqrestore(&rsp->onofflock, flags);
932 return;
933 }
934 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
935 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
936 rdp->qlen += rsp->orphan_qlen;
937 rsp->orphan_cbs_list = NULL;
938 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
939 rsp->orphan_qlen = 0;
940 spin_unlock_irqrestore(&rsp->onofflock, flags);
941}
942
943/*
858 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 944 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
859 * and move all callbacks from the outgoing CPU to the current one. 945 * and move all callbacks from the outgoing CPU to the current one.
860 */ 946 */
861static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 947static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
862{ 948{
863 int i;
864 unsigned long flags; 949 unsigned long flags;
865 long lastcomp;
866 unsigned long mask; 950 unsigned long mask;
951 int need_report = 0;
867 struct rcu_data *rdp = rsp->rda[cpu]; 952 struct rcu_data *rdp = rsp->rda[cpu];
868 struct rcu_data *rdp_me;
869 struct rcu_node *rnp; 953 struct rcu_node *rnp;
870 954
871 /* Exclude any attempts to start a new grace period. */ 955 /* Exclude any attempts to start a new grace period. */
872 spin_lock_irqsave(&rsp->onofflock, flags); 956 spin_lock_irqsave(&rsp->onofflock, flags);
873 957
874 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 958 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
875 rnp = rdp->mynode; 959 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
876 mask = rdp->grpmask; /* rnp->grplo is constant. */ 960 mask = rdp->grpmask; /* rnp->grplo is constant. */
877 do { 961 do {
878 spin_lock(&rnp->lock); /* irqs already disabled. */ 962 spin_lock(&rnp->lock); /* irqs already disabled. */
879 rnp->qsmaskinit &= ~mask; 963 rnp->qsmaskinit &= ~mask;
880 if (rnp->qsmaskinit != 0) { 964 if (rnp->qsmaskinit != 0) {
881 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 965 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */
882 break; 967 break;
883 } 968 }
884 rcu_preempt_offline_tasks(rsp, rnp); 969 if (rnp == rdp->mynode)
970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
971 else
972 spin_unlock(&rnp->lock); /* irqs remain disabled. */
885 mask = rnp->grpmask; 973 mask = rnp->grpmask;
886 spin_unlock(&rnp->lock); /* irqs remain disabled. */
887 rnp = rnp->parent; 974 rnp = rnp->parent;
888 } while (rnp != NULL); 975 } while (rnp != NULL);
889 lastcomp = rsp->completed;
890
891 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
892
893 /* Being offline is a quiescent state, so go record it. */
894 cpu_quiet(cpu, rsp, rdp, lastcomp);
895 976
896 /* 977 /*
897 * Move callbacks from the outgoing CPU to the running CPU. 978 * We still hold the leaf rcu_node structure lock here, and
898 * Note that the outgoing CPU is now quiscent, so it is now 979 * irqs are still disabled. The reason for this subterfuge is
899 * (uncharacteristically) safe to access its rcu_data structure. 980 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
900 * Note also that we must carefully retain the order of the 981 * held leads to deadlock.
901 * outgoing CPU's callbacks in order for rcu_barrier() to work
902 * correctly. Finally, note that we start all the callbacks
903 * afresh, even those that have passed through a grace period
904 * and are therefore ready to invoke. The theory is that hotplug
905 * events are rare, and that if they are frequent enough to
906 * indefinitely delay callbacks, you have far worse things to
907 * be worrying about.
908 */ 982 */
909 rdp_me = rsp->rda[smp_processor_id()]; 983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
910 if (rdp->nxtlist != NULL) { 984 rnp = rdp->mynode;
911 *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; 985 if (need_report & RCU_OFL_TASKS_NORM_GP)
912 rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 986 rcu_report_unblock_qs_rnp(rnp, flags);
913 rdp->nxtlist = NULL; 987 else
914 for (i = 0; i < RCU_NEXT_SIZE; i++) 988 spin_unlock_irqrestore(&rnp->lock, flags);
915 rdp->nxttail[i] = &rdp->nxtlist; 989 if (need_report & RCU_OFL_TASKS_EXP_GP)
916 rdp_me->qlen += rdp->qlen; 990 rcu_report_exp_rnp(rsp, rnp);
917 rdp->qlen = 0; 991
918 } 992 rcu_adopt_orphan_cbs(rsp);
919 local_irq_restore(flags);
920} 993}
921 994
922/* 995/*
@@ -934,6 +1007,14 @@ static void rcu_offline_cpu(int cpu)
934 1007
935#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1008#else /* #ifdef CONFIG_HOTPLUG_CPU */
936 1009
1010static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
1011{
1012}
1013
1014static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1015{
1016}
1017
937static void rcu_offline_cpu(int cpu) 1018static void rcu_offline_cpu(int cpu)
938{ 1019{
939} 1020}
@@ -944,7 +1025,7 @@ static void rcu_offline_cpu(int cpu)
944 * Invoke any RCU callbacks that have made it to the end of their grace 1025 * Invoke any RCU callbacks that have made it to the end of their grace
945 * period. Throttle as specified by rdp->blimit. 1026 * period. Throttle as specified by rdp->blimit.
946 */ 1027 */
947static void rcu_do_batch(struct rcu_data *rdp) 1028static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
948{ 1029{
949 unsigned long flags; 1030 unsigned long flags;
950 struct rcu_head *next, *list, **tail; 1031 struct rcu_head *next, *list, **tail;
@@ -997,6 +1078,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
997 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 1078 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
998 rdp->blimit = blimit; 1079 rdp->blimit = blimit;
999 1080
1081 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
1082 if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
1083 rdp->qlen_last_fqs_check = 0;
1084 rdp->n_force_qs_snap = rsp->n_force_qs;
1085 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
1086 rdp->qlen_last_fqs_check = rdp->qlen;
1087
1000 local_irq_restore(flags); 1088 local_irq_restore(flags);
1001 1089
1002 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1090 /* Re-raise the RCU softirq if there are callbacks remaining. */
@@ -1066,33 +1154,32 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1066 int cpu; 1154 int cpu;
1067 unsigned long flags; 1155 unsigned long flags;
1068 unsigned long mask; 1156 unsigned long mask;
1069 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; 1157 struct rcu_node *rnp;
1070 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
1071 1158
1072 for (; rnp_cur < rnp_end; rnp_cur++) { 1159 rcu_for_each_leaf_node(rsp, rnp) {
1073 mask = 0; 1160 mask = 0;
1074 spin_lock_irqsave(&rnp_cur->lock, flags); 1161 spin_lock_irqsave(&rnp->lock, flags);
1075 if (rsp->completed != lastcomp) { 1162 if (rnp->completed != lastcomp) {
1076 spin_unlock_irqrestore(&rnp_cur->lock, flags); 1163 spin_unlock_irqrestore(&rnp->lock, flags);
1077 return 1; 1164 return 1;
1078 } 1165 }
1079 if (rnp_cur->qsmask == 0) { 1166 if (rnp->qsmask == 0) {
1080 spin_unlock_irqrestore(&rnp_cur->lock, flags); 1167 spin_unlock_irqrestore(&rnp->lock, flags);
1081 continue; 1168 continue;
1082 } 1169 }
1083 cpu = rnp_cur->grplo; 1170 cpu = rnp->grplo;
1084 bit = 1; 1171 bit = 1;
1085 for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) { 1172 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
1086 if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1087 mask |= bit; 1174 mask |= bit;
1088 } 1175 }
1089 if (mask != 0 && rsp->completed == lastcomp) { 1176 if (mask != 0 && rnp->completed == lastcomp) {
1090 1177
1091 /* cpu_quiet_msk() releases rnp_cur->lock. */ 1178 /* rcu_report_qs_rnp() releases rnp->lock. */
1092 cpu_quiet_msk(mask, rsp, rnp_cur, flags); 1179 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1093 continue; 1180 continue;
1094 } 1181 }
1095 spin_unlock_irqrestore(&rnp_cur->lock, flags); 1182 spin_unlock_irqrestore(&rnp->lock, flags);
1096 } 1183 }
1097 return 0; 1184 return 0;
1098} 1185}
@@ -1107,8 +1194,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1107 long lastcomp; 1194 long lastcomp;
1108 struct rcu_node *rnp = rcu_get_root(rsp); 1195 struct rcu_node *rnp = rcu_get_root(rsp);
1109 u8 signaled; 1196 u8 signaled;
1197 u8 forcenow;
1110 1198
1111 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) 1199 if (!rcu_gp_in_progress(rsp))
1112 return; /* No grace period in progress, nothing to force. */ 1200 return; /* No grace period in progress, nothing to force. */
1113 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { 1201 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
1114 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1202 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
@@ -1119,19 +1207,20 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1119 goto unlock_ret; /* no emergency and done recently. */ 1207 goto unlock_ret; /* no emergency and done recently. */
1120 rsp->n_force_qs++; 1208 rsp->n_force_qs++;
1121 spin_lock(&rnp->lock); 1209 spin_lock(&rnp->lock);
1122 lastcomp = rsp->completed; 1210 lastcomp = rsp->gpnum - 1;
1123 signaled = rsp->signaled; 1211 signaled = rsp->signaled;
1124 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1125 if (lastcomp == rsp->gpnum) { 1213 if (!rcu_gp_in_progress(rsp)) {
1126 rsp->n_force_qs_ngp++; 1214 rsp->n_force_qs_ngp++;
1127 spin_unlock(&rnp->lock); 1215 spin_unlock(&rnp->lock);
1128 goto unlock_ret; /* no GP in progress, time updated. */ 1216 goto unlock_ret; /* no GP in progress, time updated. */
1129 } 1217 }
1130 spin_unlock(&rnp->lock); 1218 spin_unlock(&rnp->lock);
1131 switch (signaled) { 1219 switch (signaled) {
1220 case RCU_GP_IDLE:
1132 case RCU_GP_INIT: 1221 case RCU_GP_INIT:
1133 1222
1134 break; /* grace period still initializing, ignore. */ 1223 break; /* grace period idle or initializing, ignore. */
1135 1224
1136 case RCU_SAVE_DYNTICK: 1225 case RCU_SAVE_DYNTICK:
1137 1226
@@ -1142,20 +1231,29 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1142 if (rcu_process_dyntick(rsp, lastcomp, 1231 if (rcu_process_dyntick(rsp, lastcomp,
1143 dyntick_save_progress_counter)) 1232 dyntick_save_progress_counter))
1144 goto unlock_ret; 1233 goto unlock_ret;
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1145 1237
1146 /* Update state, record completion counter. */ 1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1147 spin_lock(&rnp->lock); 1240 spin_lock(&rnp->lock);
1148 if (lastcomp == rsp->completed) { 1241 if (lastcomp + 1 == rsp->gpnum &&
1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1149 rsp->signaled = RCU_FORCE_QS; 1244 rsp->signaled = RCU_FORCE_QS;
1150 dyntick_record_completed(rsp, lastcomp); 1245 rsp->completed_fqs = lastcomp;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1151 } 1247 }
1152 spin_unlock(&rnp->lock); 1248 spin_unlock(&rnp->lock);
1153 break; 1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1154 1252
1155 case RCU_FORCE_QS: 1253 case RCU_FORCE_QS:
1156 1254
1157 /* Check dyntick-idle state, send IPI to laggarts. */ 1255 /* Check dyntick-idle state, send IPI to laggarts. */
1158 if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp), 1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs,
1159 rcu_implicit_dynticks_qs)) 1257 rcu_implicit_dynticks_qs))
1160 goto unlock_ret; 1258 goto unlock_ret;
1161 1259
@@ -1211,7 +1309,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1211 } 1309 }
1212 1310
1213 /* If there are callbacks ready, invoke them. */ 1311 /* If there are callbacks ready, invoke them. */
1214 rcu_do_batch(rdp); 1312 rcu_do_batch(rsp, rdp);
1215} 1313}
1216 1314
1217/* 1315/*
@@ -1267,7 +1365,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1267 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1365 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1268 1366
1269 /* Start a new grace period if one not already started. */ 1367 /* Start a new grace period if one not already started. */
1270 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) { 1368 if (!rcu_gp_in_progress(rsp)) {
1271 unsigned long nestflag; 1369 unsigned long nestflag;
1272 struct rcu_node *rnp_root = rcu_get_root(rsp); 1370 struct rcu_node *rnp_root = rcu_get_root(rsp);
1273 1371
@@ -1275,10 +1373,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1275 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ 1373 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1276 } 1374 }
1277 1375
1278 /* Force the grace period if too many callbacks or too long waiting. */ 1376 /*
1279 if (unlikely(++rdp->qlen > qhimark)) { 1377 * Force the grace period if too many callbacks or too long waiting.
1378 * Enforce hysteresis, and don't invoke force_quiescent_state()
1379 * if some other CPU has recently done so. Also, don't bother
1380 * invoking force_quiescent_state() if the newly enqueued callback
1381 * is the only one waiting for a grace period to complete.
1382 */
1383 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1280 rdp->blimit = LONG_MAX; 1384 rdp->blimit = LONG_MAX;
1281 force_quiescent_state(rsp, 0); 1385 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1386 *rdp->nxttail[RCU_DONE_TAIL] != head)
1387 force_quiescent_state(rsp, 0);
1388 rdp->n_force_qs_snap = rsp->n_force_qs;
1389 rdp->qlen_last_fqs_check = rdp->qlen;
1282 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1390 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
1283 force_quiescent_state(rsp, 1); 1391 force_quiescent_state(rsp, 1);
1284 local_irq_restore(flags); 1392 local_irq_restore(flags);
@@ -1302,6 +1410,68 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1302} 1410}
1303EXPORT_SYMBOL_GPL(call_rcu_bh); 1411EXPORT_SYMBOL_GPL(call_rcu_bh);
1304 1412
1413/**
1414 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
1415 *
1416 * Control will return to the caller some time after a full rcu-sched
1417 * grace period has elapsed, in other words after all currently executing
1418 * rcu-sched read-side critical sections have completed. These read-side
1419 * critical sections are delimited by rcu_read_lock_sched() and
1420 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
1421 * local_irq_disable(), and so on may be used in place of
1422 * rcu_read_lock_sched().
1423 *
1424 * This means that all preempt_disable code sequences, including NMI and
1425 * hardware-interrupt handlers, in progress on entry will have completed
1426 * before this primitive returns. However, this does not guarantee that
1427 * softirq handlers will have completed, since in some kernels, these
1428 * handlers can run in process context, and can block.
1429 *
1430 * This primitive provides the guarantees made by the (now removed)
1431 * synchronize_kernel() API. In contrast, synchronize_rcu() only
1432 * guarantees that rcu_read_lock() sections will have completed.
1433 * In "classic RCU", these two guarantees happen to be one and
1434 * the same, but can differ in realtime RCU implementations.
1435 */
1436void synchronize_sched(void)
1437{
1438 struct rcu_synchronize rcu;
1439
1440 if (rcu_blocking_is_gp())
1441 return;
1442
1443 init_completion(&rcu.completion);
1444 /* Will wake me after RCU finished. */
1445 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1446 /* Wait for it. */
1447 wait_for_completion(&rcu.completion);
1448}
1449EXPORT_SYMBOL_GPL(synchronize_sched);
1450
1451/**
1452 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
1453 *
1454 * Control will return to the caller some time after a full rcu_bh grace
1455 * period has elapsed, in other words after all currently executing rcu_bh
1456 * read-side critical sections have completed. RCU read-side critical
1457 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
1458 * and may be nested.
1459 */
1460void synchronize_rcu_bh(void)
1461{
1462 struct rcu_synchronize rcu;
1463
1464 if (rcu_blocking_is_gp())
1465 return;
1466
1467 init_completion(&rcu.completion);
1468 /* Will wake me after RCU finished. */
1469 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1470 /* Wait for it. */
1471 wait_for_completion(&rcu.completion);
1472}
1473EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1474
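A hedged usage sketch for the two primitives documented above: a writer that unpublishes an item and waits for all preempt-disabled readers before freeing it. The data structure and helpers are made up for the sketch, not taken from the patch.

/* Illustrative only: demo_item, demo_current and these functions are
 * hypothetical. */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_item {
	int value;
};

static struct demo_item *demo_current;

static int demo_read(void)
{
	struct demo_item *p;
	int val = -1;

	rcu_read_lock_sched();			/* disables preemption */
	p = rcu_dereference(demo_current);
	if (p)
		val = p->value;
	rcu_read_unlock_sched();
	return val;
}

static void demo_retire(void)
{
	struct demo_item *old = demo_current;

	rcu_assign_pointer(demo_current, NULL);
	synchronize_sched();	/* all earlier preempt-disabled readers done */
	kfree(old);
}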
1305/* 1475/*
1306 * Check to see if there is any immediate RCU-related work to be done 1476 * Check to see if there is any immediate RCU-related work to be done
1307 * by the current CPU, for the specified type of RCU, returning 1 if so. 1477 * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -1311,6 +1481,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1311 */ 1481 */
1312static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) 1482static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1313{ 1483{
1484 struct rcu_node *rnp = rdp->mynode;
1485
1314 rdp->n_rcu_pending++; 1486 rdp->n_rcu_pending++;
1315 1487
1316 /* Check for CPU stalls, if enabled. */ 1488 /* Check for CPU stalls, if enabled. */
@@ -1335,19 +1507,19 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1335 } 1507 }
1336 1508
1337 /* Has another RCU grace period completed? */ 1509 /* Has another RCU grace period completed? */
1338 if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ 1510 if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
1339 rdp->n_rp_gp_completed++; 1511 rdp->n_rp_gp_completed++;
1340 return 1; 1512 return 1;
1341 } 1513 }
1342 1514
1343 /* Has a new RCU grace period started? */ 1515 /* Has a new RCU grace period started? */
1344 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ 1516 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
1345 rdp->n_rp_gp_started++; 1517 rdp->n_rp_gp_started++;
1346 return 1; 1518 return 1;
1347 } 1519 }
1348 1520
1349 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1521 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1350 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && 1522 if (rcu_gp_in_progress(rsp) &&
1351 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { 1523 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
1352 rdp->n_rp_need_fqs++; 1524 rdp->n_rp_need_fqs++;
1353 return 1; 1525 return 1;
@@ -1385,6 +1557,97 @@ int rcu_needs_cpu(int cpu)
1385} 1557}
1386 1558
1387/* 1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1575static atomic_t rcu_barrier_cpu_count;
1576static DEFINE_MUTEX(rcu_barrier_mutex);
1577static struct completion rcu_barrier_completion;
1578
1579static void rcu_barrier_callback(struct rcu_head *notused)
1580{
1581 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1582 complete(&rcu_barrier_completion);
1583}
1584
1585/*
1586 * Called with preemption disabled, and from cross-cpu IRQ context.
1587 */
1588static void rcu_barrier_func(void *type)
1589{
1590 int cpu = smp_processor_id();
1591 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
1592 void (*call_rcu_func)(struct rcu_head *head,
1593 void (*func)(struct rcu_head *head));
1594
1595 atomic_inc(&rcu_barrier_cpu_count);
1596 call_rcu_func = type;
1597 call_rcu_func(head, rcu_barrier_callback);
1598}
1599
1600/*
1601 * Orchestrate the specified type of RCU barrier, waiting for all
1602 * RCU callbacks of the specified type to complete.
1603 */
1604static void _rcu_barrier(struct rcu_state *rsp,
1605 void (*call_rcu_func)(struct rcu_head *head,
1606 void (*func)(struct rcu_head *head)))
1607{
1608 BUG_ON(in_interrupt());
1609 /* Take mutex to serialize concurrent rcu_barrier() requests. */
1610 mutex_lock(&rcu_barrier_mutex);
1611 init_completion(&rcu_barrier_completion);
1612 /*
1613 * Initialize rcu_barrier_cpu_count to 1, then invoke
1614 * rcu_barrier_func() on each CPU, so that each CPU also has
1615 * incremented rcu_barrier_cpu_count. Only then is it safe to
1616 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1617 * might complete its grace period before all of the other CPUs
1618 * did their increment, causing this function to return too
1619 * early.
1620 */
1621 atomic_set(&rcu_barrier_cpu_count, 1);
1622 preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1623 rcu_adopt_orphan_cbs(rsp);
1624 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1625 preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1626 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1627 complete(&rcu_barrier_completion);
1628 wait_for_completion(&rcu_barrier_completion);
1629 mutex_unlock(&rcu_barrier_mutex);
1630}
1631
1632/**
1633 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
1634 */
1635void rcu_barrier_bh(void)
1636{
1637 _rcu_barrier(&rcu_bh_state, call_rcu_bh);
1638}
1639EXPORT_SYMBOL_GPL(rcu_barrier_bh);
1640
1641/**
1642 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
1643 */
1644void rcu_barrier_sched(void)
1645{
1646 _rcu_barrier(&rcu_sched_state, call_rcu_sched);
1647}
1648EXPORT_SYMBOL_GPL(rcu_barrier_sched);
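The count-starts-at-one trick used by _rcu_barrier() above generalizes to any "wait for one asynchronous completion per CPU" problem. A hedged, self-contained sketch follows; queue_my_async() is a purely hypothetical stand-in for call_rcu()-style deferred work, and every my_* name is invented.

#include <linux/completion.h>
#include <linux/smp.h>
#include <asm/atomic.h>

static atomic_t my_count;
static struct completion my_done;

/* Completion handler for the deferred work queued on each CPU. */
static void my_async_done(void *unused)
{
	if (atomic_dec_and_test(&my_count))
		complete(&my_done);
}

/* Runs on every CPU via on_each_cpu(); takes a count reference before queuing. */
static void my_queue_on_this_cpu(void *unused)
{
	atomic_inc(&my_count);
	queue_my_async(my_async_done, NULL);	/* hypothetical async dispatch */
}

static void my_barrier(void)
{
	init_completion(&my_done);
	/*
	 * Start at 1: the orchestrator holds its own reference, so an early
	 * completion on the first CPU cannot reach zero before the remaining
	 * CPUs have performed their increments.
	 */
	atomic_set(&my_count, 1);
	on_each_cpu(my_queue_on_this_cpu, NULL, 1);
	if (atomic_dec_and_test(&my_count))	/* drop the orchestrator's reference */
		complete(&my_done);
	wait_for_completion(&my_done);
}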
1649
1650/*
1388 * Do boot-time initialization of a CPU's per-CPU RCU data. 1651 * Do boot-time initialization of a CPU's per-CPU RCU data.
1389 */ 1652 */
1390static void __init 1653static void __init
@@ -1419,21 +1682,18 @@ static void __cpuinit
1419rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 1682rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1420{ 1683{
1421 unsigned long flags; 1684 unsigned long flags;
1422 long lastcomp;
1423 unsigned long mask; 1685 unsigned long mask;
1424 struct rcu_data *rdp = rsp->rda[cpu]; 1686 struct rcu_data *rdp = rsp->rda[cpu];
1425 struct rcu_node *rnp = rcu_get_root(rsp); 1687 struct rcu_node *rnp = rcu_get_root(rsp);
1426 1688
1427 /* Set up local state, ensuring consistent view of global state. */ 1689 /* Set up local state, ensuring consistent view of global state. */
1428 spin_lock_irqsave(&rnp->lock, flags); 1690 spin_lock_irqsave(&rnp->lock, flags);
1429 lastcomp = rsp->completed;
1430 rdp->completed = lastcomp;
1431 rdp->gpnum = lastcomp;
1432 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1433 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1434 rdp->beenonline = 1; /* We have now been online. */ 1693 rdp->beenonline = 1; /* We have now been online. */
1435 rdp->preemptable = preemptable; 1694 rdp->preemptable = preemptable;
1436 rdp->passed_quiesc_completed = lastcomp - 1; 1695 rdp->qlen_last_fqs_check = 0;
1696 rdp->n_force_qs_snap = rsp->n_force_qs;
1437 rdp->blimit = blimit; 1697 rdp->blimit = blimit;
1438 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1698 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1439 1699
@@ -1453,24 +1713,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1453 spin_lock(&rnp->lock); /* irqs already disabled. */ 1713 spin_lock(&rnp->lock); /* irqs already disabled. */
1454 rnp->qsmaskinit |= mask; 1714 rnp->qsmaskinit |= mask;
1455 mask = rnp->grpmask; 1715 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) {
1717 rdp->gpnum = rnp->completed; /* if GP in progress... */
1718 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 }
1456 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1721 spin_unlock(&rnp->lock); /* irqs already disabled. */
1457 rnp = rnp->parent; 1722 rnp = rnp->parent;
1458 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1459 1724
1460 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1725 spin_unlock_irqrestore(&rsp->onofflock, flags);
1461
1462 /*
1463 * A new grace period might start here. If so, we will be part of
1464 * it, and its gpnum will be greater than ours, so we will
1465 * participate. It is also possible for the gpnum to have been
1466 * incremented before this function was called, and the bitmasks
1467 * to not be filled out until now, in which case we will also
1468 * participate due to our gpnum being behind.
1469 */
1470
1471 /* Since it is coming online, the CPU is in a quiescent state. */
1472 cpu_quiet(cpu, rsp, rdp, lastcomp);
1473 local_irq_restore(flags);
1474} 1726}
1475 1727
1476static void __cpuinit rcu_online_cpu(int cpu) 1728static void __cpuinit rcu_online_cpu(int cpu)
@@ -1483,8 +1735,8 @@ static void __cpuinit rcu_online_cpu(int cpu)
1483/* 1735/*
1484 * Handle CPU online/offline notification events. 1736 * Handle CPU online/offline notification events.
1485 */ 1737 */
1486int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1738static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1487 unsigned long action, void *hcpu) 1739 unsigned long action, void *hcpu)
1488{ 1740{
1489 long cpu = (long)hcpu; 1741 long cpu = (long)hcpu;
1490 1742
@@ -1493,6 +1745,22 @@ int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1493 case CPU_UP_PREPARE_FROZEN: 1745 case CPU_UP_PREPARE_FROZEN:
1494 rcu_online_cpu(cpu); 1746 rcu_online_cpu(cpu);
1495 break; 1747 break;
1748 case CPU_DYING:
1749 case CPU_DYING_FROZEN:
1750 /*
1751 * preempt_disable() in _rcu_barrier() prevents stop_machine(),
1752 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
1753 * returns, all online cpus have queued rcu_barrier_func().
1754 * The dying CPU clears its cpu_online_mask bit and
1755 * moves all of its RCU callbacks to ->orphan_cbs_list
1756 * in the context of stop_machine(), so subsequent calls
1757 * to _rcu_barrier() will adopt these callbacks and only
1758 * then queue rcu_barrier_func() on all remaining CPUs.
1759 */
1760 rcu_send_cbs_to_orphanage(&rcu_bh_state);
1761 rcu_send_cbs_to_orphanage(&rcu_sched_state);
1762 rcu_preempt_send_cbs_to_orphanage();
1763 break;
1496 case CPU_DEAD: 1764 case CPU_DEAD:
1497 case CPU_DEAD_FROZEN: 1765 case CPU_DEAD_FROZEN:
1498 case CPU_UP_CANCELED: 1766 case CPU_UP_CANCELED:
@@ -1556,6 +1824,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1556 rnp = rsp->level[i]; 1824 rnp = rsp->level[i];
1557 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1558 spin_lock_init(&rnp->lock); 1826 spin_lock_init(&rnp->lock);
1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]);
1559 rnp->gpnum = 0; 1828 rnp->gpnum = 0;
1560 rnp->qsmask = 0; 1829 rnp->qsmask = 0;
1561 rnp->qsmaskinit = 0; 1830 rnp->qsmaskinit = 0;
@@ -1576,6 +1845,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1576 rnp->level = i; 1845 rnp->level = i;
1577 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 1846 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1578 INIT_LIST_HEAD(&rnp->blocked_tasks[1]); 1847 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1848 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1579 } 1850 }
1580 } 1851 }
1581} 1852}
@@ -1587,6 +1858,10 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1587 */ 1858 */
1588#define RCU_INIT_FLAVOR(rsp, rcu_data) \ 1859#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1589do { \ 1860do { \
1861 int i; \
1862 int j; \
1863 struct rcu_node *rnp; \
1864 \
1590 rcu_init_one(rsp); \ 1865 rcu_init_one(rsp); \
1591 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ 1866 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1592 j = 0; \ 1867 j = 0; \
@@ -1599,41 +1874,30 @@ do { \
1599 } \ 1874 } \
1600} while (0) 1875} while (0)
1601 1876
1602#ifdef CONFIG_TREE_PREEMPT_RCU 1877void __init rcu_init(void)
1603
1604void __init __rcu_init_preempt(void)
1605{
1606 int i; /* All used by RCU_INIT_FLAVOR(). */
1607 int j;
1608 struct rcu_node *rnp;
1609
1610 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
1611}
1612
1613#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1614
1615void __init __rcu_init_preempt(void)
1616{ 1878{
1617} 1879 int i;
1618
1619#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1620
1621void __init __rcu_init(void)
1622{
1623 int i; /* All used by RCU_INIT_FLAVOR(). */
1624 int j;
1625 struct rcu_node *rnp;
1626 1880
1627 rcu_bootup_announce(); 1881 rcu_bootup_announce();
1628#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1629 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1630#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1631 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1632 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1633 __rcu_init_preempt(); 1890 __rcu_init_preempt();
1634 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1891 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1892
1893 /*
1894 * We don't need protection against CPU-hotplug here because
1895 * this is called early in boot, before either interrupts
1896 * or the scheduler are operational.
1897 */
1898 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i);
1635} 1901}
1636 1902
1637module_param(blimit, int, 0); 1903#include "rcutree_plugin.h"
1638module_param(qhimark, int, 0);
1639module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index bf8a6f9f134d..d2a0046f63b2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -34,10 +34,11 @@
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere. 35 * bug somewhere.
36 */ 36 */
37#define MAX_RCU_LVLS 3 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT)
41 42
42#if NR_CPUS <= RCU_FANOUT 43#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1 44# define NUM_RCU_LVLS 1
@@ -45,23 +46,33 @@
45# define NUM_RCU_LVL_1 (NR_CPUS) 46# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0 47# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0 48# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0
48#elif NR_CPUS <= RCU_FANOUT_SQ 50#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2 51# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1 52# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) 53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS) 54# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0 55# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE 57#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3 58# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1 59# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) 60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) 61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
59# define NUM_RCU_LVL_3 NR_CPUS 62# define NUM_RCU_LVL_3 NR_CPUS
63# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH
65# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
70# define NUM_RCU_LVL_4 NR_CPUS
60#else 71#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 73#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63 74
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) 75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
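Two hedged worked examples of the geometry selected by the #if ladder above (parameter values chosen purely for illustration):

/* CONFIG_RCU_FANOUT = 64, NR_CPUS = 4096: NR_CPUS <= RCU_FANOUT_SQ (4096).  */
/*   NUM_RCU_LVLS  = 2                                                       */
/*   NUM_RCU_LVL_0 = 1                        (the root rcu_node)            */
/*   NUM_RCU_LVL_1 = DIV_ROUND_UP(4096, 64)   = 64   (leaf rcu_nodes)        */
/*   NUM_RCU_LVL_2 = 4096                     (the rcu_data structures)      */
/*   RCU_SUM = 1 + 64 + 4096 = 4161, NUM_RCU_NODES = 4161 - 4096 = 65        */

/* CONFIG_RCU_FANOUT = 16, NR_CPUS = 65536: NR_CPUS <= RCU_FANOUT_FOURTH.    */
/*   NUM_RCU_LVLS  = 4, with per-level counts 1, 16, 256, 4096, 65536,       */
/*   giving NUM_RCU_NODES = 1 + 16 + 256 + 4096 = 4369 rcu_node structures.  */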
66 77
67/* 78/*
@@ -79,24 +90,67 @@ struct rcu_dynticks {
79 * Definition for node within the RCU grace-period-detection hierarchy. 90 * Definition for node within the RCU grace-period-detection hierarchy.
80 */ 91 */
81struct rcu_node { 92struct rcu_node {
82 spinlock_t lock; 93 spinlock_t lock; /* Root rcu_node's lock protects some */
94 /* rcu_state fields as well as following. */
83 long gpnum; /* Current grace period for this node. */ 95 long gpnum; /* Current grace period for this node. */
84 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
85 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */
99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */
86 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
87 /* order for current grace period to proceed.*/ 102 /* order for current grace period to proceed.*/
103 /* In leaf rcu_node, each bit corresponds to */
104 /* an rcu_data structure, otherwise, each */
105 /* bit corresponds to a child rcu_node */
106 /* structure. */
107 unsigned long expmask; /* Groups that have ->blocked_tasks[] */
108 /* elements that need to drain to allow the */
109 /* current expedited grace period to */
110 /* complete (only for TREE_PREEMPT_RCU). */
88 unsigned long qsmaskinit; 111 unsigned long qsmaskinit;
89 /* Per-GP initialization for qsmask. */ 112 /* Per-GP initial value for qsmask & expmask. */
90 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 113 unsigned long grpmask; /* Mask to apply to parent qsmask. */
114 /* Only one bit will be set in this mask. */
91 int grplo; /* lowest-numbered CPU or group here. */ 115 int grplo; /* lowest-numbered CPU or group here. */
92 int grphi; /* highest-numbered CPU or group here. */ 116 int grphi; /* highest-numbered CPU or group here. */
93 u8 grpnum; /* CPU/group number for next level up. */ 117 u8 grpnum; /* CPU/group number for next level up. */
94 u8 level; /* root is at level 0. */ 118 u8 level; /* root is at level 0. */
95 struct rcu_node *parent; 119 struct rcu_node *parent;
96 struct list_head blocked_tasks[2]; 120 struct list_head blocked_tasks[4];
97 /* Tasks blocked in RCU read-side critsect. */ 121 /* Tasks blocked in RCU read-side critsect. */
122 /* Grace period number (->gpnum) x blocked */
123 /* by tasks on the (x & 0x1) element of the */
124 /* blocked_tasks[] array. */
98} ____cacheline_internodealigned_in_smp; 125} ____cacheline_internodealigned_in_smp;
99 126
127/*
128 * Do a full breadth-first scan of the rcu_node structures for the
129 * specified rcu_state structure.
130 */
131#define rcu_for_each_node_breadth_first(rsp, rnp) \
132 for ((rnp) = &(rsp)->node[0]; \
133 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
134
135/*
136 * Do a breadth-first scan of the non-leaf rcu_node structures for the
137 * specified rcu_state structure. Note that if there is a singleton
138 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
139 */
140#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
141 for ((rnp) = &(rsp)->node[0]; \
142 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
143
144/*
145 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
146 * structure. Note that if there is a singleton rcu_node tree with but
147 * one rcu_node structure, this loop -will- visit the rcu_node structure.
148 * It is still a leaf node, even if it is also the root node.
149 */
150#define rcu_for_each_leaf_node(rsp, rnp) \
151 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
152 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
153
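A hedged sketch of how these iterators might be used for ad-hoc debugging, assuming the code is built alongside these internals; my_dump_leaves() is an invented helper:

/* Print the CPU span and initial quiescent-state mask of every leaf node. */
static void my_dump_leaves(struct rcu_state *rsp)
{
	struct rcu_node *rnp;

	rcu_for_each_leaf_node(rsp, rnp)
		printk(KERN_DEBUG "leaf CPUs %d-%d qsmaskinit=%lx\n",
		       rnp->grplo, rnp->grphi, rnp->qsmaskinit);
}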
100/* Index values for nxttail array in struct rcu_data. */ 154/* Index values for nxttail array in struct rcu_data. */
101#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ 155#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
102#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ 156#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
@@ -126,23 +180,30 @@ struct rcu_data {
126 * Any of the partitions might be empty, in which case the 180 * Any of the partitions might be empty, in which case the
127 * pointer to that partition will be equal to the pointer for 181 * pointer to that partition will be equal to the pointer for
128 * the following partition. When the list is empty, all of 182 * the following partition. When the list is empty, all of
129 * the nxttail elements point to nxtlist, which is NULL. 183 * the nxttail elements point to the ->nxtlist pointer itself,
184 * which in that case is NULL.
130 * 185 *
131 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
132 * Entries that might have arrived after current GP ended
133 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
134 * Entries known to have arrived before current GP ended
135 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
136 * Entries that batch # <= ->completed - 1: waiting for current GP
137 * [nxtlist, *nxttail[RCU_DONE_TAIL]): 186 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
138 * Entries that batch # <= ->completed 187 * Entries that batch # <= ->completed
139 * The grace period for these entries has completed, and 188 * The grace period for these entries has completed, and
140 * the other grace-period-completed entries may be moved 189 * the other grace-period-completed entries may be moved
141 * here temporarily in rcu_process_callbacks(). 190 * here temporarily in rcu_process_callbacks().
191 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
192 * Entries that batch # <= ->completed - 1: waiting for current GP
193 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
194 * Entries known to have arrived before current GP ended
195 * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
196 * Entries that might have arrived after current GP ended
197 * Note that the value of *nxttail[RCU_NEXT_TAIL] will
198 * always be NULL, as this is the end of the list.
142 */ 199 */
143 struct rcu_head *nxtlist; 200 struct rcu_head *nxtlist;
144 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 201 struct rcu_head **nxttail[RCU_NEXT_SIZE];
145 long qlen; /* # of queued callbacks */ 202 long qlen; /* # of queued callbacks */
203 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */
205 unsigned long n_force_qs_snap;
206 /* did other CPU force QS recently? */
146 long blimit; /* Upper limit on a processed batch */ 207 long blimit; /* Upper limit on a processed batch */
147 208
148#ifdef CONFIG_NO_HZ 209#ifdef CONFIG_NO_HZ
@@ -173,13 +234,15 @@ struct rcu_data {
173}; 234};
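To make the ->nxtlist/->nxttail segmentation described earlier concrete, here is a hedged illustration for a CPU with five queued callbacks; cb1..cb5 are invented and the segment boundaries show one possible state:

/*
 *   nxtlist --> cb1 --> cb2 --> cb3 --> cb4 --> cb5 --> NULL
 *
 *   nxttail[RCU_DONE_TAIL]       == &cb2->next  (cb1, cb2 ready to invoke)
 *   nxttail[RCU_WAIT_TAIL]       == &cb3->next  (cb3 waiting on current GP)
 *   nxttail[RCU_NEXT_READY_TAIL] == &cb4->next  (cb4 arrived before GP end)
 *   nxttail[RCU_NEXT_TAIL]       == &cb5->next  (cb5 might postdate the GP;
 *                                                note *nxttail[RCU_NEXT_TAIL]
 *                                                is therefore NULL)
 */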
174 235
175/* Values for signaled field in struct rcu_state. */ 236/* Values for signaled field in struct rcu_state. */
176#define RCU_GP_INIT 0 /* Grace period being initialized. */ 237#define RCU_GP_IDLE 0 /* No grace period in progress. */
177#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ 238#define RCU_GP_INIT 1 /* Grace period being initialized. */
178#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ 239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
179#ifdef CONFIG_NO_HZ 242#ifdef CONFIG_NO_HZ
180#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
181#else /* #ifdef CONFIG_NO_HZ */ 244#else /* #ifdef CONFIG_NO_HZ */
182#define RCU_SIGNAL_INIT RCU_FORCE_QS 245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED
183#endif /* #else #ifdef CONFIG_NO_HZ */ 246#endif /* #else #ifdef CONFIG_NO_HZ */
184 247
185#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
@@ -216,10 +279,23 @@ struct rcu_state {
216 /* Force QS state. */ 279 /* Force QS state. */
217 long gpnum; /* Current gp number. */ 280 long gpnum; /* Current gp number. */
218 long completed; /* # of last completed gp. */ 281 long completed; /* # of last completed gp. */
282
283 /* End of fields guarded by root rcu_node's lock. */
284
219 spinlock_t onofflock; /* exclude on/offline and */ 285 spinlock_t onofflock; /* exclude on/offline and */
220 /* starting new GP. */ 286 /* starting new GP. Also */
287 /* protects the following */
288 /* orphan_cbs fields. */
289 struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
290 /* orphaned by all CPUs in */
291 /* a given leaf rcu_node */
292 /* going offline. */
293 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
294 long orphan_qlen; /* Number of orphaned cbs. */
221 spinlock_t fqslock; /* Only one task forcing */ 295 spinlock_t fqslock; /* Only one task forcing */
222 /* quiescent states. */ 296 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
223 unsigned long jiffies_force_qs; /* Time at which to invoke */ 299 unsigned long jiffies_force_qs; /* Time at which to invoke */
224 /* force_quiescent_state(). */ 300 /* force_quiescent_state(). */
225 unsigned long n_force_qs; /* Number of calls to */ 301 unsigned long n_force_qs; /* Number of calls to */
@@ -234,11 +310,15 @@ struct rcu_state {
234 unsigned long jiffies_stall; /* Time at which to check */ 310 unsigned long jiffies_stall; /* Time at which to check */
235 /* for CPU stalls. */ 311 /* for CPU stalls. */
236#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 312#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237#ifdef CONFIG_NO_HZ
238 long dynticks_completed; /* Value of completed @ snap. */
239#endif /* #ifdef CONFIG_NO_HZ */
240}; 313};
241 314
315/* Return values for rcu_preempt_offline_tasks(). */
316
317#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
318 /* GP were moved to root. */
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */
321
242#ifdef RCU_TREE_NONCORE 322#ifdef RCU_TREE_NONCORE
243 323
244/* 324/*
@@ -255,5 +335,37 @@ extern struct rcu_state rcu_preempt_state;
255DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 335DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
256#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 336#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
257 337
258#endif /* #ifdef RCU_TREE_NONCORE */ 338#else /* #ifdef RCU_TREE_NONCORE */
339
340/* Forward declarations for rcutree_plugin.h */
341static void rcu_bootup_announce(void);
342long rcu_batches_completed(void);
343static void rcu_preempt_note_context_switch(int cpu);
344static int rcu_preempted_readers(struct rcu_node *rnp);
345#ifdef CONFIG_HOTPLUG_CPU
346static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */
349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
350static void rcu_print_task_stall(struct rcu_node *rnp);
351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
352static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
353#ifdef CONFIG_HOTPLUG_CPU
354static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
355 struct rcu_node *rnp,
356 struct rcu_data *rdp);
357static void rcu_preempt_offline_cpu(int cpu);
358#endif /* #ifdef CONFIG_HOTPLUG_CPU */
359static void rcu_preempt_check_callbacks(int cpu);
360static void rcu_preempt_process_callbacks(void);
361void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
362#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
363static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
364#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
365static int rcu_preempt_pending(int cpu);
366static int rcu_preempt_needs_cpu(int cpu);
367static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
368static void rcu_preempt_send_cbs_to_orphanage(void);
369static void __init __rcu_init_preempt(void);
259 370
371#endif /* #else #ifdef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 47789369ea59..37fbccdf41d5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -24,16 +24,19 @@
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> 24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */ 25 */
26 26
27#include <linux/delay.h>
27 28
28#ifdef CONFIG_TREE_PREEMPT_RCU 29#ifdef CONFIG_TREE_PREEMPT_RCU
29 30
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 32DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32 33
34static int rcu_preempted_readers_exp(struct rcu_node *rnp);
35
33/* 36/*
34 * Tell them what RCU they are running. 37 * Tell them what RCU they are running.
35 */ 38 */
36static inline void rcu_bootup_announce(void) 39static void __init rcu_bootup_announce(void)
37{ 40{
38 printk(KERN_INFO 41 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n"); 42 "Experimental preemptable hierarchical RCU implementation.\n");
@@ -64,22 +67,31 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
64 * not in a quiescent state. There might be any number of tasks blocked 67 * not in a quiescent state. There might be any number of tasks blocked
65 * while in an RCU read-side critical section. 68 * while in an RCU read-side critical section.
66 */ 69 */
67static void rcu_preempt_qs_record(int cpu) 70static void rcu_preempt_qs(int cpu)
68{ 71{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 72 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
73 rdp->passed_quiesc_completed = rdp->gpnum - 1;
74 barrier();
70 rdp->passed_quiesc = 1; 75 rdp->passed_quiesc = 1;
71 rdp->passed_quiesc_completed = rdp->completed;
72} 76}
73 77
74/* 78/*
75 * We have entered the scheduler or are between softirqs in ksoftirqd. 79 * We have entered the scheduler, and the current task might soon be
76 * If we are in an RCU read-side critical section, we need to reflect 80 * context-switched away from. If this task is in an RCU read-side
77 * that in the state of the rcu_node structure corresponding to this CPU. 81 * critical section, we will no longer be able to rely on the CPU to
78 * Caller must disable hardirqs. 82 * record that fact, so we enqueue the task on the appropriate entry
83 * of the blocked_tasks[] array. The task will dequeue itself when
84 * it exits the outermost enclosing RCU read-side critical section.
85 * Therefore, the current grace period cannot be permitted to complete
86 * until the blocked_tasks[] entry indexed by the low-order bit of
87 * rnp->gpnum empties.
88 *
89 * Caller must disable preemption.
79 */ 90 */
80static void rcu_preempt_qs(int cpu) 91static void rcu_preempt_note_context_switch(int cpu)
81{ 92{
82 struct task_struct *t = current; 93 struct task_struct *t = current;
94 unsigned long flags;
83 int phase; 95 int phase;
84 struct rcu_data *rdp; 96 struct rcu_data *rdp;
85 struct rcu_node *rnp; 97 struct rcu_node *rnp;
@@ -90,7 +102,7 @@ static void rcu_preempt_qs(int cpu)
90 /* Possibly blocking in an RCU read-side critical section. */ 102 /* Possibly blocking in an RCU read-side critical section. */
91 rdp = rcu_preempt_state.rda[cpu]; 103 rdp = rcu_preempt_state.rda[cpu];
92 rnp = rdp->mynode; 104 rnp = rdp->mynode;
93 spin_lock(&rnp->lock); 105 spin_lock_irqsave(&rnp->lock, flags);
94 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 106 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
95 t->rcu_blocked_node = rnp; 107 t->rcu_blocked_node = rnp;
96 108
@@ -103,11 +115,15 @@ static void rcu_preempt_qs(int cpu)
103 * state for the current grace period), then as long 115 * state for the current grace period), then as long
104 * as that task remains queued, the current grace period 116 * as that task remains queued, the current grace period
105 * cannot end. 117 * cannot end.
118 *
119 * But first, note that the current CPU must still be
120 * on line!
106 */ 121 */
107 phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1); 122 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
123 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
124 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
108 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 125 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
109 smp_mb(); /* Ensure later ctxt swtch seen after above. */ 126 spin_unlock_irqrestore(&rnp->lock, flags);
110 spin_unlock(&rnp->lock);
111 } 127 }
112 128
113 /* 129 /*
@@ -119,9 +135,10 @@ static void rcu_preempt_qs(int cpu)
119 * grace period, then the fact that the task has been enqueued 135 * grace period, then the fact that the task has been enqueued
120 * means that we continue to block the current grace period. 136 * means that we continue to block the current grace period.
121 */ 137 */
122 rcu_preempt_qs_record(cpu); 138 rcu_preempt_qs(cpu);
123 t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS | 139 local_irq_save(flags);
124 RCU_READ_UNLOCK_GOT_QS); 140 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
141 local_irq_restore(flags);
125} 142}
126 143
127/* 144/*
@@ -136,11 +153,65 @@ void __rcu_read_lock(void)
136} 153}
137EXPORT_SYMBOL_GPL(__rcu_read_lock); 154EXPORT_SYMBOL_GPL(__rcu_read_lock);
138 155
156/*
157 * Check for preempted RCU readers blocking the current grace period
158 * for the specified rcu_node structure. If the caller needs a reliable
159 * answer, it must hold the rcu_node's ->lock.
160 */
161static int rcu_preempted_readers(struct rcu_node *rnp)
162{
163 int phase = rnp->gpnum & 0x1;
164
165 return !list_empty(&rnp->blocked_tasks[phase]) ||
166 !list_empty(&rnp->blocked_tasks[phase + 2]);
167}
168
169/*
170 * Record a quiescent state for all tasks that were previously queued
171 * on the specified rcu_node structure and that were blocking the current
172 * RCU grace period. The caller must hold the specified rnp->lock with
173 * irqs disabled, and this lock is released upon return, but irqs remain
174 * disabled.
175 */
176static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
177 __releases(rnp->lock)
178{
179 unsigned long mask;
180 struct rcu_node *rnp_p;
181
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */
185 }
186
187 rnp_p = rnp->parent;
188 if (rnp_p == NULL) {
189 /*
190 * Either there is only one rcu_node in the tree,
191 * or tasks were kicked up to root rcu_node due to
192 * CPUs going offline.
193 */
194 rcu_report_qs_rsp(&rcu_preempt_state, flags);
195 return;
196 }
197
198 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
203}
204
205/*
206 * Handle special cases during rcu_read_unlock(), such as needing to
207 * notify RCU core processing or task having blocked during the RCU
208 * read-side critical section.
209 */
139static void rcu_read_unlock_special(struct task_struct *t) 210static void rcu_read_unlock_special(struct task_struct *t)
140{ 211{
141 int empty; 212 int empty;
213 int empty_exp;
142 unsigned long flags; 214 unsigned long flags;
143 unsigned long mask;
144 struct rcu_node *rnp; 215 struct rcu_node *rnp;
145 int special; 216 int special;
146 217
@@ -157,7 +228,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
157 special = t->rcu_read_unlock_special; 228 special = t->rcu_read_unlock_special;
158 if (special & RCU_READ_UNLOCK_NEED_QS) { 229 if (special & RCU_READ_UNLOCK_NEED_QS) {
159 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 230 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS; 231 rcu_preempt_qs(smp_processor_id());
161 } 232 }
162 233
163 /* Hardware IRQ handlers cannot block. */ 234 /* Hardware IRQ handlers cannot block. */
@@ -177,42 +248,36 @@ static void rcu_read_unlock_special(struct task_struct *t)
177 */ 248 */
178 for (;;) { 249 for (;;) {
179 rnp = t->rcu_blocked_node; 250 rnp = t->rcu_blocked_node;
180 spin_lock(&rnp->lock); 251 spin_lock(&rnp->lock); /* irqs already disabled. */
181 if (rnp == t->rcu_blocked_node) 252 if (rnp == t->rcu_blocked_node)
182 break; 253 break;
183 spin_unlock(&rnp->lock); 254 spin_unlock(&rnp->lock); /* irqs remain disabled. */
184 } 255 }
185 empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); 256 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp);
258 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
186 list_del_init(&t->rcu_node_entry); 259 list_del_init(&t->rcu_node_entry);
187 t->rcu_blocked_node = NULL; 260 t->rcu_blocked_node = NULL;
188 261
189 /* 262 /*
190 * If this was the last task on the current list, and if 263 * If this was the last task on the current list, and if
191 * we aren't waiting on any CPUs, report the quiescent state. 264 * we aren't waiting on any CPUs, report the quiescent state.
192 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk() 265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
193 * drop rnp->lock and restore irq.
194 */ 266 */
195 if (!empty && rnp->qsmask == 0 && 267 if (empty)
196 list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
197 t->rcu_read_unlock_special &=
198 ~(RCU_READ_UNLOCK_NEED_QS |
199 RCU_READ_UNLOCK_GOT_QS);
200 if (rnp->parent == NULL) {
201 /* Only one rcu_node in the tree. */
202 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
203 return;
204 }
205 /* Report up the rest of the hierarchy. */
206 mask = rnp->grpmask;
207 spin_unlock_irqrestore(&rnp->lock, flags); 268 spin_unlock_irqrestore(&rnp->lock, flags);
208 rnp = rnp->parent; 269 else
209 spin_lock_irqsave(&rnp->lock, flags); 270 rcu_report_unblock_qs_rnp(rnp, flags);
210 cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags); 271
211 return; 272 /*
212 } 273 * If this was the last task on the expedited lists,
213 spin_unlock(&rnp->lock); 274 * then we need to report up the rcu_node hierarchy.
275 */
276 if (!empty_exp && !rcu_preempted_readers_exp(rnp))
277 rcu_report_exp_rnp(&rcu_preempt_state, rnp);
278 } else {
279 local_irq_restore(flags);
214 } 280 }
215 local_irq_restore(flags);
216} 281}
217 282
218/* 283/*
@@ -243,12 +308,12 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
243{ 308{
244 unsigned long flags; 309 unsigned long flags;
245 struct list_head *lp; 310 struct list_head *lp;
246 int phase = rnp->gpnum & 0x1; 311 int phase;
247 struct task_struct *t; 312 struct task_struct *t;
248 313
249 if (!list_empty(&rnp->blocked_tasks[phase])) { 314 if (rcu_preempted_readers(rnp)) {
250 spin_lock_irqsave(&rnp->lock, flags); 315 spin_lock_irqsave(&rnp->lock, flags);
251 phase = rnp->gpnum & 0x1; /* re-read under lock. */ 316 phase = rnp->gpnum & 0x1;
252 lp = &rnp->blocked_tasks[phase]; 317 lp = &rnp->blocked_tasks[phase];
253 list_for_each_entry(t, lp, rcu_node_entry) 318 list_for_each_entry(t, lp, rcu_node_entry)
254 printk(" P%d", t->pid); 319 printk(" P%d", t->pid);
@@ -259,13 +324,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
259#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 324#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
260 325
261/* 326/*
262 * Check for preempted RCU readers for the specified rcu_node structure. 327 * Check that the list of blocked tasks for the newly completed grace
263 * If the caller needs a reliable answer, it must hold the rcu_node's 328 * period is in fact empty. It is a serious bug to complete a grace
264 * >lock. 329 * period that still has RCU readers blocked! This function must be
330 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
331 * must be held by the caller.
265 */ 332 */
266static int rcu_preempted_readers(struct rcu_node *rnp) 333static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
267{ 334{
268 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); 335 WARN_ON_ONCE(rcu_preempted_readers(rnp));
336 WARN_ON_ONCE(rnp->qsmask);
269} 337}
270 338
271#ifdef CONFIG_HOTPLUG_CPU 339#ifdef CONFIG_HOTPLUG_CPU
@@ -276,22 +344,34 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
276 * rcu_node. The reason for not just moving them to the immediate 344 * rcu_node. The reason for not just moving them to the immediate
277 * parent is to remove the need for rcu_read_unlock_special() to 345 * parent is to remove the need for rcu_read_unlock_special() to
278 * make more than two attempts to acquire the target rcu_node's lock. 346 * make more than two attempts to acquire the target rcu_node's lock.
347 * Returns RCU_OFL_TASKS_NORM_GP if tasks moved to the root rcu_node
348 * were blocking the current normal grace period, ORed with
349 * RCU_OFL_TASKS_EXP_GP if they were blocking the current expedited
350 * grace period; a zero return value means that no moved task was
351 * blocking either kind of grace period.
279 * 352 *
280 * The caller must hold rnp->lock with irqs disabled. 353 * The caller must hold rnp->lock with irqs disabled.
281 */ 354 */
282static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 355static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
283 struct rcu_node *rnp) 356 struct rcu_node *rnp,
357 struct rcu_data *rdp)
284{ 358{
285 int i; 359 int i;
286 struct list_head *lp; 360 struct list_head *lp;
287 struct list_head *lp_root; 361 struct list_head *lp_root;
362 int retval = 0;
288 struct rcu_node *rnp_root = rcu_get_root(rsp); 363 struct rcu_node *rnp_root = rcu_get_root(rsp);
289 struct task_struct *tp; 364 struct task_struct *tp;
290 365
291 if (rnp == rnp_root) { 366 if (rnp == rnp_root) {
292 WARN_ONCE(1, "Last CPU thought to be offlined?"); 367 WARN_ONCE(1, "Last CPU thought to be offlined?");
293 return; /* Shouldn't happen: at least one CPU online. */ 368 return 0; /* Shouldn't happen: at least one CPU online. */
294 } 369 }
370 WARN_ON_ONCE(rnp != rdp->mynode &&
371 (!list_empty(&rnp->blocked_tasks[0]) ||
372 !list_empty(&rnp->blocked_tasks[1]) ||
373 !list_empty(&rnp->blocked_tasks[2]) ||
374 !list_empty(&rnp->blocked_tasks[3])));
295 375
296 /* 376 /*
297 * Move tasks up to root rcu_node. Rely on the fact that the 377 * Move tasks up to root rcu_node. Rely on the fact that the
@@ -299,7 +379,11 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
299 * rcu_nodes in terms of gp_num value. This fact allows us to 379 * rcu_nodes in terms of gp_num value. This fact allows us to
300 * move the blocked_tasks[] array directly, element by element. 380 * move the blocked_tasks[] array directly, element by element.
301 */ 381 */
302 for (i = 0; i < 2; i++) { 382 if (rcu_preempted_readers(rnp))
383 retval |= RCU_OFL_TASKS_NORM_GP;
384 if (rcu_preempted_readers_exp(rnp))
385 retval |= RCU_OFL_TASKS_EXP_GP;
386 for (i = 0; i < 4; i++) {
303 lp = &rnp->blocked_tasks[i]; 387 lp = &rnp->blocked_tasks[i];
304 lp_root = &rnp_root->blocked_tasks[i]; 388 lp_root = &rnp_root->blocked_tasks[i];
305 while (!list_empty(lp)) { 389 while (!list_empty(lp)) {
@@ -311,6 +395,7 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
311 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
312 } 396 }
313 } 397 }
398 return retval;
314} 399}
315 400
316/* 401/*
@@ -335,20 +420,12 @@ static void rcu_preempt_check_callbacks(int cpu)
335 struct task_struct *t = current; 420 struct task_struct *t = current;
336 421
337 if (t->rcu_read_lock_nesting == 0) { 422 if (t->rcu_read_lock_nesting == 0) {
338 t->rcu_read_unlock_special &= 423 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
339 ~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS); 424 rcu_preempt_qs(cpu);
340 rcu_preempt_qs_record(cpu);
341 return; 425 return;
342 } 426 }
343 if (per_cpu(rcu_preempt_data, cpu).qs_pending) { 427 if (per_cpu(rcu_preempt_data, cpu).qs_pending)
344 if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) { 428 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
345 rcu_preempt_qs_record(cpu);
346 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS;
347 } else if (!(t->rcu_read_unlock_special &
348 RCU_READ_UNLOCK_NEED_QS)) {
349 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
350 }
351 }
352} 429}
353 430
354/* 431/*
@@ -369,6 +446,186 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
369} 446}
370EXPORT_SYMBOL_GPL(call_rcu); 447EXPORT_SYMBOL_GPL(call_rcu);
371 448
449/**
450 * synchronize_rcu - wait until a grace period has elapsed.
451 *
452 * Control will return to the caller some time after a full grace
453 * period has elapsed, in other words after all currently executing RCU
454 * read-side critical sections have completed. RCU read-side critical
455 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
456 * and may be nested.
457 */
458void synchronize_rcu(void)
459{
460 struct rcu_synchronize rcu;
461
462 if (!rcu_scheduler_active)
463 return;
464
465 init_completion(&rcu.completion);
466 /* Will wake me after RCU finished. */
467 call_rcu(&rcu.head, wakeme_after_rcu);
468 /* Wait for it. */
469 wait_for_completion(&rcu.completion);
470}
471EXPORT_SYMBOL_GPL(synchronize_rcu);
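A hedged reader/updater sketch for the preemptible flavor documented above; the list, lock and struct names are all invented:

#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_item {
	struct list_head link;
	int key;
};
static LIST_HEAD(my_items);
static DEFINE_SPINLOCK(my_items_lock);

/* Reader: under TREE_PREEMPT_RCU it may be preempted inside this section. */
static int my_item_present(int key)
{
	struct my_item *it;
	int found = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(it, &my_items, link)
		if (it->key == key) {
			found = 1;
			break;
		}
	rcu_read_unlock();
	return found;
}

/* Updater: unlink under the lock, wait for pre-existing readers, then free. */
static void my_item_remove(struct my_item *it)
{
	spin_lock(&my_items_lock);
	list_del_rcu(&it->link);
	spin_unlock(&my_items_lock);
	synchronize_rcu();
	kfree(it);
}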
472
473static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
474static long sync_rcu_preempt_exp_count;
475static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
476
477/*
478 * Return non-zero if there are any tasks in RCU read-side critical
479 * sections blocking the current preemptible-RCU expedited grace period.
480 * If there is no preemptible-RCU expedited grace period currently in
481 * progress, returns zero unconditionally.
482 */
483static int rcu_preempted_readers_exp(struct rcu_node *rnp)
484{
485 return !list_empty(&rnp->blocked_tasks[2]) ||
486 !list_empty(&rnp->blocked_tasks[3]);
487}
488
489/*
490 * Return non-zero if there is no RCU expedited grace period in progress
491 * for the specified rcu_node structure, in other words, if all CPUs and
492 * tasks covered by the specified rcu_node structure have done their bit
493 * for the current expedited grace period. Works only for preemptible
494 * RCU -- other RCU implementations use other means.
495 *
496 * Caller must hold sync_rcu_preempt_exp_mutex.
497 */
498static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
499{
500 return !rcu_preempted_readers_exp(rnp) &&
501 ACCESS_ONCE(rnp->expmask) == 0;
502}
503
504/*
505 * Report the exit from RCU read-side critical section for the last task
506 * that queued itself during or before the current expedited preemptible-RCU
507 * grace period. This event is reported either to the rcu_node structure on
508 * which the task was queued or to one of that rcu_node structure's ancestors,
509 * recursively up the tree. (Calm down, calm down, we do the recursion
510 * iteratively!)
511 *
512 * Caller must hold sync_rcu_preempt_exp_mutex.
513 */
514static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
515{
516 unsigned long flags;
517 unsigned long mask;
518
519 spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp))
522 break;
523 if (rnp->parent == NULL) {
524 wake_up(&sync_rcu_preempt_exp_wq);
525 break;
526 }
527 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask;
532 }
533 spin_unlock_irqrestore(&rnp->lock, flags);
534}
535
536/*
537 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
538 * grace period for the specified rcu_node structure. If there are no such
539 * tasks, report it up the rcu_node hierarchy.
540 *
541 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
542 */
543static void
544sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{
546 int must_wait;
547
548 spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp);
555}
556
557/*
558 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
559 * is to invoke synchronize_sched_expedited() to push all the tasks to
560 * the ->blocked_tasks[] lists, move all entries from the first set of
561 * ->blocked_tasks[] lists to the second set, and finally wait for this
562 * second set to drain.
563 */
564void synchronize_rcu_expedited(void)
565{
566 unsigned long flags;
567 struct rcu_node *rnp;
568 struct rcu_state *rsp = &rcu_preempt_state;
569 long snap;
570 int trycount = 0;
571
572 smp_mb(); /* Caller's modifications seen first by other CPUs. */
573 snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
574 smp_mb(); /* Above access cannot bleed into critical section. */
575
576 /*
577 * Acquire lock, falling back to synchronize_rcu() if too many
578 * lock-acquisition failures. Of course, if someone does the
579 * expedited grace period for us, just leave.
580 */
581 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
582 if (trycount++ < 10)
583 udelay(trycount * num_online_cpus());
584 else {
585 synchronize_rcu();
586 return;
587 }
588 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
589 goto mb_ret; /* Others did our work for us. */
590 }
591 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
592 goto unlock_mb_ret; /* Others did our work for us. */
593
594 /* force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited();
596
597 spin_lock_irqsave(&rsp->onofflock, flags);
598
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 }
605
606 /* Snapshot current state of ->blocked_tasks[] lists. */
607 rcu_for_each_leaf_node(rsp, rnp)
608 sync_rcu_preempt_exp_init(rsp, rnp);
609 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611
612 spin_unlock_irqrestore(&rsp->onofflock, flags);
613
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp);
616 wait_event(sync_rcu_preempt_exp_wq,
617 sync_rcu_preempt_exp_done(rnp));
618
619 /* Clean up and exit. */
620 smp_mb(); /* ensure expedited GP seen before counter increment. */
621 ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
622unlock_mb_ret:
623 mutex_unlock(&sync_rcu_preempt_exp_mutex);
624mb_ret:
625 smp_mb(); /* ensure subsequent action seen after grace period. */
626}
627EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
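Callers trade CPU overhead for latency when choosing between the two grace-period primitives; a hedged sketch of an update path that opts for the expedited variant (my_cfg, my_cfg_ptr and my_reconfig() are invented, single updater assumed):

#include <linux/rcupdate.h>
#include <linux/slab.h>

static struct my_cfg {
	int limit;
} *my_cfg_ptr;

/* Latency-sensitive reconfiguration: readers use rcu_read_lock() as usual. */
static void my_reconfig(struct my_cfg *newp)
{
	struct my_cfg *oldp = my_cfg_ptr;

	rcu_assign_pointer(my_cfg_ptr, newp);
	synchronize_rcu_expedited();	/* lower latency, higher CPU cost */
	kfree(oldp);
}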
628
372/* 629/*
373 * Check to see if there is any immediate preemptable-RCU-related work 630 * Check to see if there is any immediate preemptable-RCU-related work
374 * to be done. 631 * to be done.
@@ -387,6 +644,15 @@ static int rcu_preempt_needs_cpu(int cpu)
387 return !!per_cpu(rcu_preempt_data, cpu).nxtlist; 644 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
388} 645}
389 646
647/**
648 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
649 */
650void rcu_barrier(void)
651{
652 _rcu_barrier(&rcu_preempt_state, call_rcu);
653}
654EXPORT_SYMBOL_GPL(rcu_barrier);
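The classic caller of rcu_barrier() is a module-unload path with call_rcu() callbacks still in flight; a hedged sketch (struct, list and function names invented):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/slab.h>

struct my_obj {
	struct list_head link;
	struct rcu_head rcu;
};

static void my_free_cb(struct rcu_head *head)
{
	kfree(container_of(head, struct my_obj, rcu));
}

static void my_obj_delete(struct my_obj *obj)
{
	list_del_rcu(&obj->link);
	call_rcu(&obj->rcu, my_free_cb);	/* frees asynchronously after a GP */
}

static void __exit my_exit(void)
{
	/* ...unlink remaining objects, queueing callbacks as above... */
	rcu_barrier();	/* ensure no my_free_cb() is still pending or running */
	/* only now is it safe for the module text holding my_free_cb() to go away */
}
module_exit(my_exit);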
655
390/* 656/*
391 * Initialize preemptable RCU's per-CPU data. 657 * Initialize preemptable RCU's per-CPU data.
392 */ 658 */
@@ -396,6 +662,22 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
396} 662}
397 663
398/* 664/*
665 * Move preemptable RCU's callbacks to ->orphan_cbs_list.
666 */
667static void rcu_preempt_send_cbs_to_orphanage(void)
668{
669 rcu_send_cbs_to_orphanage(&rcu_preempt_state);
670}
671
672/*
673 * Initialize preemptable RCU's state structures.
674 */
675static void __init __rcu_init_preempt(void)
676{
677 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
678}
679
680/*
399 * Check for a task exiting while in a preemptable-RCU read-side 681 * Check for a task exiting while in a preemptable-RCU read-side
400 * critical section, clean up if so. No need to issue warnings, 682 * critical section, clean up if so. No need to issue warnings,
401 * as debug_check_no_locks_held() already does this if lockdep 683 * as debug_check_no_locks_held() already does this if lockdep
@@ -416,7 +698,7 @@ void exit_rcu(void)
416/* 698/*
417 * Tell them what RCU they are running. 699 * Tell them what RCU they are running.
418 */ 700 */
419static inline void rcu_bootup_announce(void) 701static void __init rcu_bootup_announce(void)
420{ 702{
421 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 703 printk(KERN_INFO "Hierarchical RCU implementation.\n");
422} 704}
@@ -434,10 +716,29 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
434 * Because preemptable RCU does not exist, we never have to check for 716 * Because preemptable RCU does not exist, we never have to check for
435 * CPUs being in quiescent states. 717 * CPUs being in quiescent states.
436 */ 718 */
437static void rcu_preempt_qs(int cpu) 719static void rcu_preempt_note_context_switch(int cpu)
720{
721}
722
723/*
724 * Because preemptable RCU does not exist, there are never any preempted
725 * RCU readers.
726 */
727static int rcu_preempted_readers(struct rcu_node *rnp)
438{ 728{
729 return 0;
730}
731
732#ifdef CONFIG_HOTPLUG_CPU
733
734/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{
737 spin_unlock_irqrestore(&rnp->lock, flags);
439} 738}
440 739
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */
741
441#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 742#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
442 743
443/* 744/*
@@ -451,23 +752,28 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
451#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 752#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
452 753
453/* 754/*
454 * Because preemptable RCU does not exist, there are never any preempted 755 * Because there is no preemptable RCU, there can be no readers blocked,
455 * RCU readers. 756 * so there is no need to check for blocked tasks. So check only for
757 * bogus qsmask values.
456 */ 758 */
457static int rcu_preempted_readers(struct rcu_node *rnp) 759static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
458{ 760{
459 return 0; 761 WARN_ON_ONCE(rnp->qsmask);
460} 762}
461 763
462#ifdef CONFIG_HOTPLUG_CPU 764#ifdef CONFIG_HOTPLUG_CPU
463 765
464/* 766/*
465 * Because preemptable RCU does not exist, it never needs to migrate 767 * Because preemptable RCU does not exist, it never needs to migrate
466 * tasks that were blocked within RCU read-side critical sections. 768 * tasks that were blocked within RCU read-side critical sections, and
769 * such non-existent tasks cannot possibly have been blocking the current
770 * grace period.
467 */ 771 */
468static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 772static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
469 struct rcu_node *rnp) 773 struct rcu_node *rnp,
774 struct rcu_data *rdp)
470{ 775{
776 return 0;
471} 777}
472 778
473/* 779/*
@@ -484,7 +790,7 @@ static void rcu_preempt_offline_cpu(int cpu)
484 * Because preemptable RCU does not exist, it never has any callbacks 790 * Because preemptable RCU does not exist, it never has any callbacks
485 * to check. 791 * to check.
486 */ 792 */
487void rcu_preempt_check_callbacks(int cpu) 793static void rcu_preempt_check_callbacks(int cpu)
488{ 794{
489} 795}
490 796
@@ -492,7 +798,7 @@ void rcu_preempt_check_callbacks(int cpu)
492 * Because preemptable RCU does not exist, it never has any callbacks 798 * Because preemptable RCU does not exist, it never has any callbacks
493 * to process. 799 * to process.
494 */ 800 */
495void rcu_preempt_process_callbacks(void) 801static void rcu_preempt_process_callbacks(void)
496{ 802{
497} 803}
498 804
@@ -506,6 +812,30 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
506EXPORT_SYMBOL_GPL(call_rcu); 812EXPORT_SYMBOL_GPL(call_rcu);
507 813
508/* 814/*
815 * Wait for an rcu-preempt grace period, but make it happen quickly.
816 * But because preemptable RCU does not exist, map to rcu-sched.
817 */
818void synchronize_rcu_expedited(void)
819{
820 synchronize_sched_expedited();
821}
822EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
823
824#ifdef CONFIG_HOTPLUG_CPU
825
826/*
827 * Because preemptable RCU does not exist, there is never any need to
828 * report on tasks preempted in RCU read-side critical sections during
829 * expedited RCU grace periods.
830 */
831static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
832{
833 return;
834}
835
836#endif /* #ifdef CONFIG_HOTPLUG_CPU */
837
838/*
509 * Because preemptable RCU does not exist, it never has any work to do. 839 * Because preemptable RCU does not exist, it never has any work to do.
510 */ 840 */
511static int rcu_preempt_pending(int cpu) 841static int rcu_preempt_pending(int cpu)
@@ -522,6 +852,16 @@ static int rcu_preempt_needs_cpu(int cpu)
522} 852}
523 853
524/* 854/*
855 * Because preemptable RCU does not exist, rcu_barrier() is just
856 * another name for rcu_barrier_sched().
857 */
858void rcu_barrier(void)
859{
860 rcu_barrier_sched();
861}
862EXPORT_SYMBOL_GPL(rcu_barrier);
863
864/*
525 * Because preemptable RCU does not exist, there is no per-CPU 865 * Because preemptable RCU does not exist, there is no per-CPU
526 * data to initialize. 866 * data to initialize.
527 */ 867 */
@@ -529,4 +869,18 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
529{ 869{
530} 870}
531 871
872/*
873 * Because there is no preemptable RCU, there are no callbacks to move.
874 */
875static void rcu_preempt_send_cbs_to_orphanage(void)
876{
877}
878
879/*
880 * Because preemptable RCU does not exist, it need not be initialized.
881 */
882static void __init __rcu_init_preempt(void)
883{
884}
885
532#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 886#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
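The !CONFIG_TREE_PREEMPT_RCU stubs above keep the RCU API uniform: rcu_barrier() falls through to rcu_barrier_sched() and synchronize_rcu_expedited() to synchronize_sched_expedited(), so callers never have to care which flavour was built in. A minimal caller sketch, purely illustrative and not part of this patch (the demo_* names are invented):

        #include <linux/rcupdate.h>
        #include <linux/slab.h>
        #include <linux/kernel.h>

        struct demo_item {
                int value;
                struct rcu_head rcu;
        };

        static struct demo_item *demo_ptr;

        static void demo_free(struct rcu_head *head)
        {
                kfree(container_of(head, struct demo_item, rcu));
        }

        static int demo_read(void)
        {
                struct demo_item *p;
                int val = -1;

                rcu_read_lock();
                p = rcu_dereference(demo_ptr);
                if (p)
                        val = p->value;
                rcu_read_unlock();
                return val;
        }

        static void demo_replace(struct demo_item *newp)
        {
                struct demo_item *old = demo_ptr;

                rcu_assign_pointer(demo_ptr, newp);
                if (old)
                        call_rcu(&old->rcu, demo_free);
        }

        static void demo_teardown(void)
        {
                /*
                 * Flush any pending demo_free() callbacks before the caller
                 * goes away; on !TREE_PREEMPT_RCU builds this is
                 * rcu_barrier_sched() by another name, exactly as the stub
                 * above shows.  A latency-sensitive update path could call
                 * synchronize_rcu_expedited() instead of using call_rcu().
                 */
                rcu_barrier();
        }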
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 0ea1bff69727..9d2c88423b31 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -20,7 +20,7 @@
20 * Papers: http://www.rdrop.com/users/paulmck/RCU 20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 * 21 *
22 * For detailed explanation of Read-Copy Update mechanism see - 22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU 23 * Documentation/RCU
24 * 24 *
25 */ 25 */
26#include <linux/types.h> 26#include <linux/types.h>
@@ -93,7 +93,7 @@ static int rcudata_open(struct inode *inode, struct file *file)
93 return single_open(file, show_rcudata, NULL); 93 return single_open(file, show_rcudata, NULL);
94} 94}
95 95
96static struct file_operations rcudata_fops = { 96static const struct file_operations rcudata_fops = {
97 .owner = THIS_MODULE, 97 .owner = THIS_MODULE,
98 .open = rcudata_open, 98 .open = rcudata_open,
99 .read = seq_read, 99 .read = seq_read,
@@ -145,7 +145,7 @@ static int rcudata_csv_open(struct inode *inode, struct file *file)
145 return single_open(file, show_rcudata_csv, NULL); 145 return single_open(file, show_rcudata_csv, NULL);
146} 146}
147 147
148static struct file_operations rcudata_csv_fops = { 148static const struct file_operations rcudata_csv_fops = {
149 .owner = THIS_MODULE, 149 .owner = THIS_MODULE,
150 .open = rcudata_csv_open, 150 .open = rcudata_csv_open,
151 .read = seq_read, 151 .read = seq_read,
@@ -155,24 +155,32 @@ static struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum;
158 int level = 0; 159 int level = 0;
160 int phase;
159 struct rcu_node *rnp; 161 struct rcu_node *rnp;
160 162
163 gpnum = rsp->gpnum;
161 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
162 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
163 rsp->completed, rsp->gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
164 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
165 (int)(jiffies & 0xffff), 168 (int)(jiffies & 0xffff),
166 rsp->n_force_qs, rsp->n_force_qs_ngp, 169 rsp->n_force_qs, rsp->n_force_qs_ngp,
167 rsp->n_force_qs - rsp->n_force_qs_ngp, 170 rsp->n_force_qs - rsp->n_force_qs_ngp,
168 rsp->n_force_qs_lh); 171 rsp->n_force_qs_lh, rsp->orphan_qlen);
169 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 172 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
170 if (rnp->level != level) { 173 if (rnp->level != level) {
171 seq_puts(m, "\n"); 174 seq_puts(m, "\n");
172 level = rnp->level; 175 level = rnp->level;
173 } 176 }
174 seq_printf(m, "%lx/%lx %d:%d ^%d ", 177 phase = gpnum & 0x1;
178 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
175 rnp->qsmask, rnp->qsmaskinit, 179 rnp->qsmask, rnp->qsmaskinit,
180 "T."[list_empty(&rnp->blocked_tasks[phase])],
181 "E."[list_empty(&rnp->blocked_tasks[phase + 2])],
182 "T."[list_empty(&rnp->blocked_tasks[!phase])],
183 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
176 rnp->grplo, rnp->grphi, rnp->grpnum); 184 rnp->grplo, rnp->grphi, rnp->grpnum);
177 } 185 }
178 seq_puts(m, "\n"); 186 seq_puts(m, "\n");
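The new per-node columns rely on indexing into a string literal: "T."[list_empty(...)] yields 'T' when list_empty() returns 0 (readers are queued) and '.' when it returns 1, giving each rcu_node four flags for blocked readers (normal and expedited, for the current grace-period phase and the other one). A tiny standalone illustration of the idiom, with made-up values:

        #include <stdio.h>

        int main(void)
        {
                int empty = 1;          /* pretend list_empty() returned 1 */
                int busy = 0;           /* pretend list_empty() returned 0 */

                /* A string literal is an array: "T."[0] == 'T', "T."[1] == '.' */
                printf("%c%c\n", "T."[busy], "T."[empty]);      /* prints "T." */
                return 0;
        }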
@@ -196,7 +204,7 @@ static int rcuhier_open(struct inode *inode, struct file *file)
196 return single_open(file, show_rcuhier, NULL); 204 return single_open(file, show_rcuhier, NULL);
197} 205}
198 206
199static struct file_operations rcuhier_fops = { 207static const struct file_operations rcuhier_fops = {
200 .owner = THIS_MODULE, 208 .owner = THIS_MODULE,
201 .open = rcuhier_open, 209 .open = rcuhier_open,
202 .read = seq_read, 210 .read = seq_read,
@@ -222,7 +230,7 @@ static int rcugp_open(struct inode *inode, struct file *file)
222 return single_open(file, show_rcugp, NULL); 230 return single_open(file, show_rcugp, NULL);
223} 231}
224 232
225static struct file_operations rcugp_fops = { 233static const struct file_operations rcugp_fops = {
226 .owner = THIS_MODULE, 234 .owner = THIS_MODULE,
227 .open = rcugp_open, 235 .open = rcugp_open,
228 .read = seq_read, 236 .read = seq_read,
@@ -276,7 +284,7 @@ static int rcu_pending_open(struct inode *inode, struct file *file)
276 return single_open(file, show_rcu_pending, NULL); 284 return single_open(file, show_rcu_pending, NULL);
277} 285}
278 286
279static struct file_operations rcu_pending_fops = { 287static const struct file_operations rcu_pending_fops = {
280 .owner = THIS_MODULE, 288 .owner = THIS_MODULE,
281 .open = rcu_pending_open, 289 .open = rcu_pending_open,
282 .read = seq_read, 290 .read = seq_read,
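All four tracing files follow the same single_open()/seq_file pattern, and constifying their file_operations lets the tables live in read-only data. A generic sketch of that pattern for a hypothetical debugfs file (demo_* names invented, not taken from this patch):

        #include <linux/module.h>
        #include <linux/debugfs.h>
        #include <linux/seq_file.h>
        #include <linux/fs.h>

        static int demo_show(struct seq_file *m, void *unused)
        {
                seq_printf(m, "hello from a read-only debugfs file\n");
                return 0;
        }

        static int demo_open(struct inode *inode, struct file *file)
        {
                return single_open(file, demo_show, NULL);
        }

        /* const: the ops never change, so they can be placed in .rodata. */
        static const struct file_operations demo_fops = {
                .owner          = THIS_MODULE,
                .open           = demo_open,
                .read           = seq_read,
                .llseek         = seq_lseek,
                .release        = single_release,
        };

        static struct dentry *demo_dentry;

        static int __init demo_init(void)
        {
                demo_dentry = debugfs_create_file("demo", 0444, NULL, NULL,
                                                  &demo_fops);
                return demo_dentry ? 0 : -ENOMEM;
        }

        static void __exit demo_exit(void)
        {
                debugfs_remove(demo_dentry);
        }

        module_init(demo_init);
        module_exit(demo_exit);
        MODULE_LICENSE("GPL");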
diff --git a/kernel/relay.c b/kernel/relay.c
index bc188549788f..760c26209a3c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -60,7 +60,7 @@ static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
60/* 60/*
61 * vm_ops for relay file mappings. 61 * vm_ops for relay file mappings.
62 */ 62 */
63static struct vm_operations_struct relay_file_mmap_ops = { 63static const struct vm_operations_struct relay_file_mmap_ops = {
64 .fault = relay_buf_fault, 64 .fault = relay_buf_fault,
65 .close = relay_file_mmap_close, 65 .close = relay_file_mmap_close,
66}; 66};
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index e1338f074314..bcdabf37c40b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = RESOURCE_MAX; 21 counter->limit = RESOURCE_MAX;
22 counter->soft_limit = RESOURCE_MAX;
22 counter->parent = parent; 23 counter->parent = parent;
23} 24}
24 25
@@ -101,6 +102,8 @@ res_counter_member(struct res_counter *counter, int member)
101 return &counter->limit; 102 return &counter->limit;
102 case RES_FAILCNT: 103 case RES_FAILCNT:
103 return &counter->failcnt; 104 return &counter->failcnt;
105 case RES_SOFT_LIMIT:
106 return &counter->soft_limit;
104 }; 107 };
105 108
106 BUG(); 109 BUG();
diff --git a/kernel/resource.c b/kernel/resource.c
index 78b087221c15..fb11a58b9594 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -223,13 +223,13 @@ int release_resource(struct resource *old)
223 223
224EXPORT_SYMBOL(release_resource); 224EXPORT_SYMBOL(release_resource);
225 225
226#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) 226#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
227/* 227/*
228 * Finds the lowest memory reosurce exists within [res->start.res->end) 228 * Finds the lowest memory reosurce exists within [res->start.res->end)
229 * the caller must specify res->start, res->end, res->flags. 229 * the caller must specify res->start, res->end, res->flags and "name".
230 * If found, returns 0, res is overwritten, if not found, returns -1. 230 * If found, returns 0, res is overwritten, if not found, returns -1.
231 */ 231 */
232static int find_next_system_ram(struct resource *res) 232static int find_next_system_ram(struct resource *res, char *name)
233{ 233{
234 resource_size_t start, end; 234 resource_size_t start, end;
235 struct resource *p; 235 struct resource *p;
@@ -245,6 +245,8 @@ static int find_next_system_ram(struct resource *res)
245 /* system ram is just marked as IORESOURCE_MEM */ 245 /* system ram is just marked as IORESOURCE_MEM */
246 if (p->flags != res->flags) 246 if (p->flags != res->flags)
247 continue; 247 continue;
248 if (name && strcmp(p->name, name))
249 continue;
248 if (p->start > end) { 250 if (p->start > end) {
249 p = NULL; 251 p = NULL;
250 break; 252 break;
@@ -262,19 +264,26 @@ static int find_next_system_ram(struct resource *res)
262 res->end = p->end; 264 res->end = p->end;
263 return 0; 265 return 0;
264} 266}
265int 267
266walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, 268/*
267 int (*func)(unsigned long, unsigned long, void *)) 269 * This function calls callback against all memory range of "System RAM"
270 * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY.
271 * Now, this function is only for "System RAM".
272 */
273int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
274 void *arg, int (*func)(unsigned long, unsigned long, void *))
268{ 275{
269 struct resource res; 276 struct resource res;
270 unsigned long pfn, len; 277 unsigned long pfn, len;
271 u64 orig_end; 278 u64 orig_end;
272 int ret = -1; 279 int ret = -1;
280
273 res.start = (u64) start_pfn << PAGE_SHIFT; 281 res.start = (u64) start_pfn << PAGE_SHIFT;
274 res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; 282 res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
275 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; 283 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
276 orig_end = res.end; 284 orig_end = res.end;
277 while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { 285 while ((res.start < res.end) &&
286 (find_next_system_ram(&res, "System RAM") >= 0)) {
278 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 287 pfn = (unsigned long)(res.start >> PAGE_SHIFT);
279 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 288 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT);
280 ret = (*func)(pfn, len, arg); 289 ret = (*func)(pfn, len, arg);
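walk_system_ram_range() hands the callback one (start_pfn, nr_pages) pair per matching "System RAM" chunk and stops early if the callback returns non-zero. A hedged caller sketch (demo_* names invented; it assumes the declaration ends up visible through <linux/ioport.h> in this series):

        #include <linux/ioport.h>
        #include <linux/kernel.h>

        static int demo_count_ram(unsigned long start_pfn, unsigned long nr_pages,
                                  void *arg)
        {
                unsigned long *total = arg;

                *total += nr_pages;
                return 0;       /* a non-zero return stops the walk */
        }

        static unsigned long demo_system_ram_pages(unsigned long start_pfn,
                                                   unsigned long nr_pages)
        {
                unsigned long total = 0;

                walk_system_ram_range(start_pfn, nr_pages, &total, demo_count_ram);
                return total;
        }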
diff --git a/kernel/sched.c b/kernel/sched.c
index e27a53685ed9..aa31244caa9f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,7 +39,7 @@
39#include <linux/completion.h> 39#include <linux/completion.h>
40#include <linux/kernel_stat.h> 40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h> 42#include <linux/perf_event.h>
43#include <linux/security.h> 43#include <linux/security.h>
44#include <linux/notifier.h> 44#include <linux/notifier.h>
45#include <linux/profile.h> 45#include <linux/profile.h>
@@ -119,8 +119,6 @@
119 */ 119 */
120#define RUNTIME_INF ((u64)~0ULL) 120#define RUNTIME_INF ((u64)~0ULL)
121 121
122static void double_rq_lock(struct rq *rq1, struct rq *rq2);
123
124static inline int rt_policy(int policy) 122static inline int rt_policy(int policy)
125{ 123{
126 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 124 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -295,12 +293,12 @@ struct task_group root_task_group;
295/* Default task group's sched entity on each cpu */ 293/* Default task group's sched entity on each cpu */
296static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
297/* Default task group's cfs_rq on each cpu */ 295/* Default task group's cfs_rq on each cpu */
298static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp; 296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
299#endif /* CONFIG_FAIR_GROUP_SCHED */ 297#endif /* CONFIG_FAIR_GROUP_SCHED */
300 298
301#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
302static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
303static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
304#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
305#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
306#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -311,6 +309,8 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
311 */ 309 */
312static DEFINE_SPINLOCK(task_group_lock); 310static DEFINE_SPINLOCK(task_group_lock);
313 311
312#ifdef CONFIG_FAIR_GROUP_SCHED
313
314#ifdef CONFIG_SMP 314#ifdef CONFIG_SMP
315static int root_task_group_empty(void) 315static int root_task_group_empty(void)
316{ 316{
@@ -318,7 +318,6 @@ static int root_task_group_empty(void)
318} 318}
319#endif 319#endif
320 320
321#ifdef CONFIG_FAIR_GROUP_SCHED
322#ifdef CONFIG_USER_SCHED 321#ifdef CONFIG_USER_SCHED
323# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
324#else /* !CONFIG_USER_SCHED */ 323#else /* !CONFIG_USER_SCHED */
@@ -378,13 +377,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
378 377
379#else 378#else
380 379
381#ifdef CONFIG_SMP
382static int root_task_group_empty(void)
383{
384 return 1;
385}
386#endif
387
388static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 380static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
389static inline struct task_group *task_group(struct task_struct *p) 381static inline struct task_group *task_group(struct task_struct *p)
390{ 382{
@@ -514,14 +506,6 @@ struct root_domain {
514#ifdef CONFIG_SMP 506#ifdef CONFIG_SMP
515 struct cpupri cpupri; 507 struct cpupri cpupri;
516#endif 508#endif
517#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
518 /*
519 * Preferred wake up cpu nominated by sched_mc balance that will be
520 * used when most cpus are idle in the system indicating overall very
521 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
522 */
523 unsigned int sched_mc_preferred_wakeup_cpu;
524#endif
525}; 509};
526 510
527/* 511/*
@@ -551,14 +535,12 @@ struct rq {
551 #define CPU_LOAD_IDX_MAX 5 535 #define CPU_LOAD_IDX_MAX 5
552 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 536 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
553#ifdef CONFIG_NO_HZ 537#ifdef CONFIG_NO_HZ
554 unsigned long last_tick_seen;
555 unsigned char in_nohz_recently; 538 unsigned char in_nohz_recently;
556#endif 539#endif
557 /* capture load from *all* tasks on this cpu: */ 540 /* capture load from *all* tasks on this cpu: */
558 struct load_weight load; 541 struct load_weight load;
559 unsigned long nr_load_updates; 542 unsigned long nr_load_updates;
560 u64 nr_switches; 543 u64 nr_switches;
561 u64 nr_migrations_in;
562 544
563 struct cfs_rq cfs; 545 struct cfs_rq cfs;
564 struct rt_rq rt; 546 struct rt_rq rt;
@@ -607,6 +589,8 @@ struct rq {
607 589
608 u64 rt_avg; 590 u64 rt_avg;
609 u64 age_stamp; 591 u64 age_stamp;
592 u64 idle_stamp;
593 u64 avg_idle;
610#endif 594#endif
611 595
612 /* calc_load related fields */ 596 /* calc_load related fields */
@@ -646,9 +630,10 @@ struct rq {
646 630
647static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 631static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
648 632
649static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) 633static inline
634void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
650{ 635{
651 rq->curr->sched_class->check_preempt_curr(rq, p, sync); 636 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
652} 637}
653 638
654static inline int cpu_of(struct rq *rq) 639static inline int cpu_of(struct rq *rq)
@@ -692,20 +677,15 @@ inline void update_rq_clock(struct rq *rq)
692 677
693/** 678/**
694 * runqueue_is_locked 679 * runqueue_is_locked
680 * @cpu: the processor in question.
695 * 681 *
696 * Returns true if the current cpu runqueue is locked. 682 * Returns true if the current cpu runqueue is locked.
697 * This interface allows printk to be called with the runqueue lock 683 * This interface allows printk to be called with the runqueue lock
698 * held and know whether or not it is OK to wake up the klogd. 684 * held and know whether or not it is OK to wake up the klogd.
699 */ 685 */
700int runqueue_is_locked(void) 686int runqueue_is_locked(int cpu)
701{ 687{
702 int cpu = get_cpu(); 688 return spin_is_locked(&cpu_rq(cpu)->lock);
703 struct rq *rq = cpu_rq(cpu);
704 int ret;
705
706 ret = spin_is_locked(&rq->lock);
707 put_cpu();
708 return ret;
709} 689}
710 690
711/* 691/*
@@ -792,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
792 if (!sched_feat_names[i]) 772 if (!sched_feat_names[i])
793 return -EINVAL; 773 return -EINVAL;
794 774
795 filp->f_pos += cnt; 775 *ppos += cnt;
796 776
797 return cnt; 777 return cnt;
798} 778}
@@ -802,7 +782,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp)
802 return single_open(filp, sched_feat_show, NULL); 782 return single_open(filp, sched_feat_show, NULL);
803} 783}
804 784
805static struct file_operations sched_feat_fops = { 785static const struct file_operations sched_feat_fops = {
806 .open = sched_feat_open, 786 .open = sched_feat_open,
807 .write = sched_feat_write, 787 .write = sched_feat_write,
808 .read = seq_read, 788 .read = seq_read,
@@ -1509,8 +1489,65 @@ static int tg_nop(struct task_group *tg, void *data)
1509#endif 1489#endif
1510 1490
1511#ifdef CONFIG_SMP 1491#ifdef CONFIG_SMP
1512static unsigned long source_load(int cpu, int type); 1492/* Used instead of source_load when we know the type == 0 */
1513static unsigned long target_load(int cpu, int type); 1493static unsigned long weighted_cpuload(const int cpu)
1494{
1495 return cpu_rq(cpu)->load.weight;
1496}
1497
1498/*
1499 * Return a low guess at the load of a migration-source cpu weighted
1500 * according to the scheduling class and "nice" value.
1501 *
1502 * We want to under-estimate the load of migration sources, to
1503 * balance conservatively.
1504 */
1505static unsigned long source_load(int cpu, int type)
1506{
1507 struct rq *rq = cpu_rq(cpu);
1508 unsigned long total = weighted_cpuload(cpu);
1509
1510 if (type == 0 || !sched_feat(LB_BIAS))
1511 return total;
1512
1513 return min(rq->cpu_load[type-1], total);
1514}
1515
1516/*
1517 * Return a high guess at the load of a migration-target cpu weighted
1518 * according to the scheduling class and "nice" value.
1519 */
1520static unsigned long target_load(int cpu, int type)
1521{
1522 struct rq *rq = cpu_rq(cpu);
1523 unsigned long total = weighted_cpuload(cpu);
1524
1525 if (type == 0 || !sched_feat(LB_BIAS))
1526 return total;
1527
1528 return max(rq->cpu_load[type-1], total);
1529}
1530
1531static struct sched_group *group_of(int cpu)
1532{
1533 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1534
1535 if (!sd)
1536 return NULL;
1537
1538 return sd->groups;
1539}
1540
1541static unsigned long power_of(int cpu)
1542{
1543 struct sched_group *group = group_of(cpu);
1544
1545 if (!group)
1546 return SCHED_LOAD_SCALE;
1547
1548 return group->cpu_power;
1549}
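source_load() and target_load() are moved up here unchanged: the source takes the minimum of the decayed cpu_load history and the instantaneous weighted load, the target takes the maximum, so the balancer under-estimates what it would pull from and over-estimates what it would push to. A standalone illustration with invented numbers:

        #include <stdio.h>

        #define MIN(a, b)       ((a) < (b) ? (a) : (b))
        #define MAX(a, b)       ((a) > (b) ? (a) : (b))

        int main(void)
        {
                /* Invented: history says 2048, but a task just left and the
                 * instantaneous weighted load is only 1024. */
                unsigned long cpu_load_hist = 2048;     /* rq->cpu_load[type-1] */
                unsigned long weighted_now = 1024;      /* weighted_cpuload(cpu) */

                printf("source_load (low guess):  %lu\n",
                       MIN(cpu_load_hist, weighted_now));
                printf("target_load (high guess): %lu\n",
                       MAX(cpu_load_hist, weighted_now));
                return 0;
        }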
1550
1514static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1551static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1515 1552
1516static unsigned long cpu_avg_load_per_task(int cpu) 1553static unsigned long cpu_avg_load_per_task(int cpu)
@@ -1528,11 +1565,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1528 1565
1529#ifdef CONFIG_FAIR_GROUP_SCHED 1566#ifdef CONFIG_FAIR_GROUP_SCHED
1530 1567
1531struct update_shares_data { 1568static __read_mostly unsigned long *update_shares_data;
1532 unsigned long rq_weight[NR_CPUS];
1533};
1534
1535static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
1536 1569
1537static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1570static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1538 1571
@@ -1542,12 +1575,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1542static void update_group_shares_cpu(struct task_group *tg, int cpu, 1575static void update_group_shares_cpu(struct task_group *tg, int cpu,
1543 unsigned long sd_shares, 1576 unsigned long sd_shares,
1544 unsigned long sd_rq_weight, 1577 unsigned long sd_rq_weight,
1545 struct update_shares_data *usd) 1578 unsigned long *usd_rq_weight)
1546{ 1579{
1547 unsigned long shares, rq_weight; 1580 unsigned long shares, rq_weight;
1548 int boost = 0; 1581 int boost = 0;
1549 1582
1550 rq_weight = usd->rq_weight[cpu]; 1583 rq_weight = usd_rq_weight[cpu];
1551 if (!rq_weight) { 1584 if (!rq_weight) {
1552 boost = 1; 1585 boost = 1;
1553 rq_weight = NICE_0_LOAD; 1586 rq_weight = NICE_0_LOAD;
@@ -1582,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1582static int tg_shares_up(struct task_group *tg, void *data) 1615static int tg_shares_up(struct task_group *tg, void *data)
1583{ 1616{
1584 unsigned long weight, rq_weight = 0, shares = 0; 1617 unsigned long weight, rq_weight = 0, shares = 0;
1585 struct update_shares_data *usd; 1618 unsigned long *usd_rq_weight;
1586 struct sched_domain *sd = data; 1619 struct sched_domain *sd = data;
1587 unsigned long flags; 1620 unsigned long flags;
1588 int i; 1621 int i;
@@ -1591,11 +1624,11 @@ static int tg_shares_up(struct task_group *tg, void *data)
1591 return 0; 1624 return 0;
1592 1625
1593 local_irq_save(flags); 1626 local_irq_save(flags);
1594 usd = &__get_cpu_var(update_shares_data); 1627 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1595 1628
1596 for_each_cpu(i, sched_domain_span(sd)) { 1629 for_each_cpu(i, sched_domain_span(sd)) {
1597 weight = tg->cfs_rq[i]->load.weight; 1630 weight = tg->cfs_rq[i]->load.weight;
1598 usd->rq_weight[i] = weight; 1631 usd_rq_weight[i] = weight;
1599 1632
1600 /* 1633 /*
1601 * If there are currently no tasks on the cpu pretend there 1634 * If there are currently no tasks on the cpu pretend there
@@ -1616,7 +1649,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1616 shares = tg->shares; 1649 shares = tg->shares;
1617 1650
1618 for_each_cpu(i, sched_domain_span(sd)) 1651 for_each_cpu(i, sched_domain_span(sd))
1619 update_group_shares_cpu(tg, i, shares, rq_weight, usd); 1652 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1620 1653
1621 local_irq_restore(flags); 1654 local_irq_restore(flags);
1622 1655
@@ -1695,6 +1728,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1695 1728
1696#ifdef CONFIG_PREEMPT 1729#ifdef CONFIG_PREEMPT
1697 1730
1731static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1732
1698/* 1733/*
1699 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1734 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1700 * way at the expense of forcing extra atomic operations in all 1735 * way at the expense of forcing extra atomic operations in all
@@ -1958,14 +1993,40 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1958 p->sched_class->prio_changed(rq, p, oldprio, running); 1993 p->sched_class->prio_changed(rq, p, oldprio, running);
1959} 1994}
1960 1995
1961#ifdef CONFIG_SMP 1996/**
1962 1997 * kthread_bind - bind a just-created kthread to a cpu.
1963/* Used instead of source_load when we know the type == 0 */ 1998 * @p: thread created by kthread_create().
1964static unsigned long weighted_cpuload(const int cpu) 1999 * @cpu: cpu (might not be online, must be possible) for @p to run on.
2000 *
2001 * Description: This function is equivalent to set_cpus_allowed(),
2002 * except that @cpu doesn't need to be online, and the thread must be
2003 * stopped (i.e., just returned from kthread_create()).
2004 *
2005 * Function lives here instead of kthread.c because it messes with
2006 * scheduler internals which require locking.
2007 */
2008void kthread_bind(struct task_struct *p, unsigned int cpu)
1965{ 2009{
1966 return cpu_rq(cpu)->load.weight; 2010 struct rq *rq = cpu_rq(cpu);
2011 unsigned long flags;
2012
2013 /* Must have done schedule() in kthread() before we set_task_cpu */
2014 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
2015 WARN_ON(1);
2016 return;
2017 }
2018
2019 spin_lock_irqsave(&rq->lock, flags);
2020 update_rq_clock(rq);
2021 set_task_cpu(p, cpu);
2022 p->cpus_allowed = cpumask_of_cpu(cpu);
2023 p->rt.nr_cpus_allowed = 1;
2024 p->flags |= PF_THREAD_BOUND;
2025 spin_unlock_irqrestore(&rq->lock, flags);
1967} 2026}
2027EXPORT_SYMBOL(kthread_bind);
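kthread_bind() lands in sched.c because it manipulates the runqueue directly, but the calling convention stays the same: create the thread, bind it while it still sits inactive in kthread(), then wake it. A hedged usage sketch (demo_* names invented):

        #include <linux/kthread.h>
        #include <linux/sched.h>
        #include <linux/err.h>

        static int demo_thread_fn(void *data)
        {
                while (!kthread_should_stop())
                        schedule_timeout_interruptible(HZ);
                return 0;
        }

        static struct task_struct *demo_start_on(unsigned int cpu)
        {
                struct task_struct *tsk;

                tsk = kthread_create(demo_thread_fn, NULL, "demo/%u", cpu);
                if (IS_ERR(tsk))
                        return tsk;

                /*
                 * Must happen before the first wakeup: the thread is still
                 * parked in kthread(), so the wait_task_inactive() check in
                 * kthread_bind() succeeds and PF_THREAD_BOUND pins it to cpu.
                 */
                kthread_bind(tsk, cpu);
                wake_up_process(tsk);
                return tsk;
        }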
1968 2028
2029#ifdef CONFIG_SMP
1969/* 2030/*
1970 * Is this task likely cache-hot: 2031 * Is this task likely cache-hot:
1971 */ 2032 */
@@ -1977,7 +2038,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1977 /* 2038 /*
1978 * Buddy candidates are cache hot: 2039 * Buddy candidates are cache hot:
1979 */ 2040 */
1980 if (sched_feat(CACHE_HOT_BUDDY) && 2041 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
1981 (&p->se == cfs_rq_of(&p->se)->next || 2042 (&p->se == cfs_rq_of(&p->se)->next ||
1982 &p->se == cfs_rq_of(&p->se)->last)) 2043 &p->se == cfs_rq_of(&p->se)->last))
1983 return 1; 2044 return 1;
@@ -2018,12 +2079,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2018#endif 2079#endif
2019 if (old_cpu != new_cpu) { 2080 if (old_cpu != new_cpu) {
2020 p->se.nr_migrations++; 2081 p->se.nr_migrations++;
2021 new_rq->nr_migrations_in++;
2022#ifdef CONFIG_SCHEDSTATS 2082#ifdef CONFIG_SCHEDSTATS
2023 if (task_hot(p, old_rq->clock, NULL)) 2083 if (task_hot(p, old_rq->clock, NULL))
2024 schedstat_inc(p, se.nr_forced2_migrations); 2084 schedstat_inc(p, se.nr_forced2_migrations);
2025#endif 2085#endif
2026 perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2086 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2027 1, 1, NULL, 0); 2087 1, 1, NULL, 0);
2028 } 2088 }
2029 p->se.vruntime -= old_cfsrq->min_vruntime - 2089 p->se.vruntime -= old_cfsrq->min_vruntime -
@@ -2055,6 +2115,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2055 * it is sufficient to simply update the task's cpu field. 2115 * it is sufficient to simply update the task's cpu field.
2056 */ 2116 */
2057 if (!p->se.on_rq && !task_running(rq, p)) { 2117 if (!p->se.on_rq && !task_running(rq, p)) {
2118 update_rq_clock(rq);
2058 set_task_cpu(p, dest_cpu); 2119 set_task_cpu(p, dest_cpu);
2059 return 0; 2120 return 0;
2060 } 2121 }
@@ -2239,185 +2300,6 @@ void kick_process(struct task_struct *p)
2239 preempt_enable(); 2300 preempt_enable();
2240} 2301}
2241EXPORT_SYMBOL_GPL(kick_process); 2302EXPORT_SYMBOL_GPL(kick_process);
2242
2243/*
2244 * Return a low guess at the load of a migration-source cpu weighted
2245 * according to the scheduling class and "nice" value.
2246 *
2247 * We want to under-estimate the load of migration sources, to
2248 * balance conservatively.
2249 */
2250static unsigned long source_load(int cpu, int type)
2251{
2252 struct rq *rq = cpu_rq(cpu);
2253 unsigned long total = weighted_cpuload(cpu);
2254
2255 if (type == 0 || !sched_feat(LB_BIAS))
2256 return total;
2257
2258 return min(rq->cpu_load[type-1], total);
2259}
2260
2261/*
2262 * Return a high guess at the load of a migration-target cpu weighted
2263 * according to the scheduling class and "nice" value.
2264 */
2265static unsigned long target_load(int cpu, int type)
2266{
2267 struct rq *rq = cpu_rq(cpu);
2268 unsigned long total = weighted_cpuload(cpu);
2269
2270 if (type == 0 || !sched_feat(LB_BIAS))
2271 return total;
2272
2273 return max(rq->cpu_load[type-1], total);
2274}
2275
2276/*
2277 * find_idlest_group finds and returns the least busy CPU group within the
2278 * domain.
2279 */
2280static struct sched_group *
2281find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2282{
2283 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2284 unsigned long min_load = ULONG_MAX, this_load = 0;
2285 int load_idx = sd->forkexec_idx;
2286 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2287
2288 do {
2289 unsigned long load, avg_load;
2290 int local_group;
2291 int i;
2292
2293 /* Skip over this group if it has no CPUs allowed */
2294 if (!cpumask_intersects(sched_group_cpus(group),
2295 &p->cpus_allowed))
2296 continue;
2297
2298 local_group = cpumask_test_cpu(this_cpu,
2299 sched_group_cpus(group));
2300
2301 /* Tally up the load of all CPUs in the group */
2302 avg_load = 0;
2303
2304 for_each_cpu(i, sched_group_cpus(group)) {
2305 /* Bias balancing toward cpus of our domain */
2306 if (local_group)
2307 load = source_load(i, load_idx);
2308 else
2309 load = target_load(i, load_idx);
2310
2311 avg_load += load;
2312 }
2313
2314 /* Adjust by relative CPU power of the group */
2315 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
2316
2317 if (local_group) {
2318 this_load = avg_load;
2319 this = group;
2320 } else if (avg_load < min_load) {
2321 min_load = avg_load;
2322 idlest = group;
2323 }
2324 } while (group = group->next, group != sd->groups);
2325
2326 if (!idlest || 100*this_load < imbalance*min_load)
2327 return NULL;
2328 return idlest;
2329}
2330
2331/*
2332 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2333 */
2334static int
2335find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2336{
2337 unsigned long load, min_load = ULONG_MAX;
2338 int idlest = -1;
2339 int i;
2340
2341 /* Traverse only the allowed CPUs */
2342 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2343 load = weighted_cpuload(i);
2344
2345 if (load < min_load || (load == min_load && i == this_cpu)) {
2346 min_load = load;
2347 idlest = i;
2348 }
2349 }
2350
2351 return idlest;
2352}
2353
2354/*
2355 * sched_balance_self: balance the current task (running on cpu) in domains
2356 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2357 * SD_BALANCE_EXEC.
2358 *
2359 * Balance, ie. select the least loaded group.
2360 *
2361 * Returns the target CPU number, or the same CPU if no balancing is needed.
2362 *
2363 * preempt must be disabled.
2364 */
2365static int sched_balance_self(int cpu, int flag)
2366{
2367 struct task_struct *t = current;
2368 struct sched_domain *tmp, *sd = NULL;
2369
2370 for_each_domain(cpu, tmp) {
2371 /*
2372 * If power savings logic is enabled for a domain, stop there.
2373 */
2374 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2375 break;
2376 if (tmp->flags & flag)
2377 sd = tmp;
2378 }
2379
2380 if (sd)
2381 update_shares(sd);
2382
2383 while (sd) {
2384 struct sched_group *group;
2385 int new_cpu, weight;
2386
2387 if (!(sd->flags & flag)) {
2388 sd = sd->child;
2389 continue;
2390 }
2391
2392 group = find_idlest_group(sd, t, cpu);
2393 if (!group) {
2394 sd = sd->child;
2395 continue;
2396 }
2397
2398 new_cpu = find_idlest_cpu(group, t, cpu);
2399 if (new_cpu == -1 || new_cpu == cpu) {
2400 /* Now try balancing at a lower domain level of cpu */
2401 sd = sd->child;
2402 continue;
2403 }
2404
2405 /* Now try balancing at a lower domain level of new_cpu */
2406 cpu = new_cpu;
2407 weight = cpumask_weight(sched_domain_span(sd));
2408 sd = NULL;
2409 for_each_domain(cpu, tmp) {
2410 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2411 break;
2412 if (tmp->flags & flag)
2413 sd = tmp;
2414 }
2415 /* while loop will break here if sd == NULL */
2416 }
2417
2418 return cpu;
2419}
2420
2421#endif /* CONFIG_SMP */ 2303#endif /* CONFIG_SMP */
2422 2304
2423/** 2305/**
@@ -2455,37 +2337,22 @@ void task_oncpu_function_call(struct task_struct *p,
2455 * 2337 *
2456 * returns failure only if the task is already active. 2338 * returns failure only if the task is already active.
2457 */ 2339 */
2458static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 2340static int try_to_wake_up(struct task_struct *p, unsigned int state,
2341 int wake_flags)
2459{ 2342{
2460 int cpu, orig_cpu, this_cpu, success = 0; 2343 int cpu, orig_cpu, this_cpu, success = 0;
2461 unsigned long flags; 2344 unsigned long flags;
2462 long old_state; 2345 struct rq *rq, *orig_rq;
2463 struct rq *rq;
2464 2346
2465 if (!sched_feat(SYNC_WAKEUPS)) 2347 if (!sched_feat(SYNC_WAKEUPS))
2466 sync = 0; 2348 wake_flags &= ~WF_SYNC;
2467
2468#ifdef CONFIG_SMP
2469 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2470 struct sched_domain *sd;
2471
2472 this_cpu = raw_smp_processor_id();
2473 cpu = task_cpu(p);
2474 2349
2475 for_each_domain(this_cpu, sd) { 2350 this_cpu = get_cpu();
2476 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2477 update_shares(sd);
2478 break;
2479 }
2480 }
2481 }
2482#endif
2483 2351
2484 smp_wmb(); 2352 smp_wmb();
2485 rq = task_rq_lock(p, &flags); 2353 rq = orig_rq = task_rq_lock(p, &flags);
2486 update_rq_clock(rq); 2354 update_rq_clock(rq);
2487 old_state = p->state; 2355 if (!(p->state & state))
2488 if (!(old_state & state))
2489 goto out; 2356 goto out;
2490 2357
2491 if (p->se.on_rq) 2358 if (p->se.on_rq)
@@ -2493,27 +2360,34 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2493 2360
2494 cpu = task_cpu(p); 2361 cpu = task_cpu(p);
2495 orig_cpu = cpu; 2362 orig_cpu = cpu;
2496 this_cpu = smp_processor_id();
2497 2363
2498#ifdef CONFIG_SMP 2364#ifdef CONFIG_SMP
2499 if (unlikely(task_running(rq, p))) 2365 if (unlikely(task_running(rq, p)))
2500 goto out_activate; 2366 goto out_activate;
2501 2367
2502 cpu = p->sched_class->select_task_rq(p, sync); 2368 /*
2369 * In order to handle concurrent wakeups and release the rq->lock
2370 * we put the task in TASK_WAKING state.
2371 *
2372 * First fix up the nr_uninterruptible count:
2373 */
2374 if (task_contributes_to_load(p))
2375 rq->nr_uninterruptible--;
2376 p->state = TASK_WAKING;
2377 task_rq_unlock(rq, &flags);
2378
2379 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2503 if (cpu != orig_cpu) { 2380 if (cpu != orig_cpu) {
2381 local_irq_save(flags);
2382 rq = cpu_rq(cpu);
2383 update_rq_clock(rq);
2504 set_task_cpu(p, cpu); 2384 set_task_cpu(p, cpu);
2505 task_rq_unlock(rq, &flags); 2385 local_irq_restore(flags);
2506 /* might preempt at this point */
2507 rq = task_rq_lock(p, &flags);
2508 old_state = p->state;
2509 if (!(old_state & state))
2510 goto out;
2511 if (p->se.on_rq)
2512 goto out_running;
2513
2514 this_cpu = smp_processor_id();
2515 cpu = task_cpu(p);
2516 } 2386 }
2387 rq = task_rq_lock(p, &flags);
2388
2389 WARN_ON(p->state != TASK_WAKING);
2390 cpu = task_cpu(p);
2517 2391
2518#ifdef CONFIG_SCHEDSTATS 2392#ifdef CONFIG_SCHEDSTATS
2519 schedstat_inc(rq, ttwu_count); 2393 schedstat_inc(rq, ttwu_count);
@@ -2533,7 +2407,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2533out_activate: 2407out_activate:
2534#endif /* CONFIG_SMP */ 2408#endif /* CONFIG_SMP */
2535 schedstat_inc(p, se.nr_wakeups); 2409 schedstat_inc(p, se.nr_wakeups);
2536 if (sync) 2410 if (wake_flags & WF_SYNC)
2537 schedstat_inc(p, se.nr_wakeups_sync); 2411 schedstat_inc(p, se.nr_wakeups_sync);
2538 if (orig_cpu != cpu) 2412 if (orig_cpu != cpu)
2539 schedstat_inc(p, se.nr_wakeups_migrate); 2413 schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2562,15 +2436,27 @@ out_activate:
2562 2436
2563out_running: 2437out_running:
2564 trace_sched_wakeup(rq, p, success); 2438 trace_sched_wakeup(rq, p, success);
2565 check_preempt_curr(rq, p, sync); 2439 check_preempt_curr(rq, p, wake_flags);
2566 2440
2567 p->state = TASK_RUNNING; 2441 p->state = TASK_RUNNING;
2568#ifdef CONFIG_SMP 2442#ifdef CONFIG_SMP
2569 if (p->sched_class->task_wake_up) 2443 if (p->sched_class->task_wake_up)
2570 p->sched_class->task_wake_up(rq, p); 2444 p->sched_class->task_wake_up(rq, p);
2445
2446 if (unlikely(rq->idle_stamp)) {
2447 u64 delta = rq->clock - rq->idle_stamp;
2448 u64 max = 2*sysctl_sched_migration_cost;
2449
2450 if (delta > max)
2451 rq->avg_idle = max;
2452 else
2453 update_avg(&rq->avg_idle, delta);
2454 rq->idle_stamp = 0;
2455 }
2571#endif 2456#endif
2572out: 2457out:
2573 task_rq_unlock(rq, &flags); 2458 task_rq_unlock(rq, &flags);
2459 put_cpu();
2574 2460
2575 return success; 2461 return success;
2576} 2462}
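The new idle_stamp/avg_idle pair estimates how long this runqueue typically stays idle: idle_balance() stamps rq->clock when the cpu goes idle, the next wakeup feeds the measured idle period (capped at 2*sysctl_sched_migration_cost) into a running average, and idle_balance() skips the newidle balance entirely while avg_idle is below sysctl_sched_migration_cost. A standalone sketch of that bookkeeping; it assumes update_avg() is the 1/8-weight running average used elsewhere in sched.c and the usual 0.5 ms default migration cost:

        #include <stdio.h>

        typedef unsigned long long u64;

        /* Assumed shape of kernel/sched.c's update_avg(): 1/8-weight average
         * (the kernel does the division with a >> 3). */
        static void update_avg(u64 *avg, u64 sample)
        {
                long long diff = (long long)sample - (long long)*avg;

                *avg += diff / 8;
        }

        int main(void)
        {
                u64 migration_cost = 500000;            /* 0.5 ms default, in ns */
                u64 avg_idle = 2 * migration_cost;      /* optimistic start */
                u64 idle_periods[] = { 100000, 2000000, 50000, 3000000 };  /* invented */
                unsigned int i;

                for (i = 0; i < sizeof(idle_periods) / sizeof(idle_periods[0]); i++) {
                        u64 delta = idle_periods[i];
                        u64 max = 2 * migration_cost;

                        if (delta > max)
                                avg_idle = max;         /* long idle: snap to the cap */
                        else
                                update_avg(&avg_idle, delta);

                        printf("idle %7llu ns -> avg_idle %7llu ns, newidle balance %s\n",
                               delta, avg_idle,
                               avg_idle < migration_cost ? "skipped" : "attempted");
                }
                return 0;
        }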
@@ -2613,6 +2499,7 @@ static void __sched_fork(struct task_struct *p)
2613 p->se.avg_overlap = 0; 2499 p->se.avg_overlap = 0;
2614 p->se.start_runtime = 0; 2500 p->se.start_runtime = 0;
2615 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2501 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2502 p->se.avg_running = 0;
2616 2503
2617#ifdef CONFIG_SCHEDSTATS 2504#ifdef CONFIG_SCHEDSTATS
2618 p->se.wait_start = 0; 2505 p->se.wait_start = 0;
@@ -2671,31 +2558,22 @@ static void __sched_fork(struct task_struct *p)
2671void sched_fork(struct task_struct *p, int clone_flags) 2558void sched_fork(struct task_struct *p, int clone_flags)
2672{ 2559{
2673 int cpu = get_cpu(); 2560 int cpu = get_cpu();
2561 unsigned long flags;
2674 2562
2675 __sched_fork(p); 2563 __sched_fork(p);
2676 2564
2677#ifdef CONFIG_SMP
2678 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2679#endif
2680 set_task_cpu(p, cpu);
2681
2682 /*
2683 * Make sure we do not leak PI boosting priority to the child.
2684 */
2685 p->prio = current->normal_prio;
2686
2687 /* 2565 /*
2688 * Revert to default priority/policy on fork if requested. 2566 * Revert to default priority/policy on fork if requested.
2689 */ 2567 */
2690 if (unlikely(p->sched_reset_on_fork)) { 2568 if (unlikely(p->sched_reset_on_fork)) {
2691 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) 2569 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2692 p->policy = SCHED_NORMAL; 2570 p->policy = SCHED_NORMAL;
2693 2571 p->normal_prio = p->static_prio;
2694 if (p->normal_prio < DEFAULT_PRIO) 2572 }
2695 p->prio = DEFAULT_PRIO;
2696 2573
2697 if (PRIO_TO_NICE(p->static_prio) < 0) { 2574 if (PRIO_TO_NICE(p->static_prio) < 0) {
2698 p->static_prio = NICE_TO_PRIO(0); 2575 p->static_prio = NICE_TO_PRIO(0);
2576 p->normal_prio = p->static_prio;
2699 set_load_weight(p); 2577 set_load_weight(p);
2700 } 2578 }
2701 2579
@@ -2706,9 +2584,22 @@ void sched_fork(struct task_struct *p, int clone_flags)
2706 p->sched_reset_on_fork = 0; 2584 p->sched_reset_on_fork = 0;
2707 } 2585 }
2708 2586
2587 /*
2588 * Make sure we do not leak PI boosting priority to the child.
2589 */
2590 p->prio = current->normal_prio;
2591
2709 if (!rt_prio(p->prio)) 2592 if (!rt_prio(p->prio))
2710 p->sched_class = &fair_sched_class; 2593 p->sched_class = &fair_sched_class;
2711 2594
2595#ifdef CONFIG_SMP
2596 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2597#endif
2598 local_irq_save(flags);
2599 update_rq_clock(cpu_rq(cpu));
2600 set_task_cpu(p, cpu);
2601 local_irq_restore(flags);
2602
2712#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2603#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2713 if (likely(sched_info_on())) 2604 if (likely(sched_info_on()))
2714 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2605 memset(&p->sched_info, 0, sizeof(p->sched_info));
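The reset-on-fork path now also rewrites normal_prio, so a child of a SCHED_FIFO or negative-nice parent really comes out as an ordinary SCHED_NORMAL task. Userspace opts in by OR-ing SCHED_RESET_ON_FORK into the policy given to sched_setscheduler(); a hedged userspace illustration (SCHED_FIFO needs the usual privilege, and the fallback #define is only there in case the libc headers predate the flag):

        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>
        #include <unistd.h>

        #ifndef SCHED_RESET_ON_FORK
        #define SCHED_RESET_ON_FORK     0x40000000
        #endif

        int main(void)
        {
                struct sched_param sp = { .sched_priority = 10 };

                /* Run as SCHED_FIFO, but ask the kernel to strip any realtime
                 * policy and boosted priority from our children. */
                if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp) < 0) {
                        perror("sched_setscheduler");
                        return 1;
                }

                if (fork() == 0) {
                        /* Child: expected to report SCHED_OTHER (0), not SCHED_FIFO. */
                        printf("child policy: %d\n", sched_getscheduler(0));
                        _exit(0);
                }
                return 0;
        }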
@@ -2741,8 +2632,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2741 BUG_ON(p->state != TASK_RUNNING); 2632 BUG_ON(p->state != TASK_RUNNING);
2742 update_rq_clock(rq); 2633 update_rq_clock(rq);
2743 2634
2744 p->prio = effective_prio(p);
2745
2746 if (!p->sched_class->task_new || !current->se.on_rq) { 2635 if (!p->sched_class->task_new || !current->se.on_rq) {
2747 activate_task(rq, p, 0); 2636 activate_task(rq, p, 0);
2748 } else { 2637 } else {
@@ -2754,7 +2643,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2754 inc_nr_running(rq); 2643 inc_nr_running(rq);
2755 } 2644 }
2756 trace_sched_wakeup_new(rq, p, 1); 2645 trace_sched_wakeup_new(rq, p, 1);
2757 check_preempt_curr(rq, p, 0); 2646 check_preempt_curr(rq, p, WF_FORK);
2758#ifdef CONFIG_SMP 2647#ifdef CONFIG_SMP
2759 if (p->sched_class->task_wake_up) 2648 if (p->sched_class->task_wake_up)
2760 p->sched_class->task_wake_up(rq, p); 2649 p->sched_class->task_wake_up(rq, p);
@@ -2878,7 +2767,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2878 */ 2767 */
2879 prev_state = prev->state; 2768 prev_state = prev->state;
2880 finish_arch_switch(prev); 2769 finish_arch_switch(prev);
2881 perf_counter_task_sched_in(current, cpu_of(rq)); 2770 perf_event_task_sched_in(current, cpu_of(rq));
2882 finish_lock_switch(rq, prev); 2771 finish_lock_switch(rq, prev);
2883 2772
2884 fire_sched_in_preempt_notifiers(current); 2773 fire_sched_in_preempt_notifiers(current);
@@ -2976,14 +2865,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2976 */ 2865 */
2977 arch_start_context_switch(prev); 2866 arch_start_context_switch(prev);
2978 2867
2979 if (unlikely(!mm)) { 2868 if (likely(!mm)) {
2980 next->active_mm = oldmm; 2869 next->active_mm = oldmm;
2981 atomic_inc(&oldmm->mm_count); 2870 atomic_inc(&oldmm->mm_count);
2982 enter_lazy_tlb(oldmm, next); 2871 enter_lazy_tlb(oldmm, next);
2983 } else 2872 } else
2984 switch_mm(oldmm, mm, next); 2873 switch_mm(oldmm, mm, next);
2985 2874
2986 if (unlikely(!prev->mm)) { 2875 if (likely(!prev->mm)) {
2987 prev->active_mm = NULL; 2876 prev->active_mm = NULL;
2988 rq->prev_mm = oldmm; 2877 rq->prev_mm = oldmm;
2989 } 2878 }
@@ -3064,6 +2953,19 @@ unsigned long nr_iowait(void)
3064 return sum; 2953 return sum;
3065} 2954}
3066 2955
2956unsigned long nr_iowait_cpu(void)
2957{
2958 struct rq *this = this_rq();
2959 return atomic_read(&this->nr_iowait);
2960}
2961
2962unsigned long this_cpu_load(void)
2963{
2964 struct rq *this = this_rq();
2965 return this->cpu_load[0];
2966}
2967
2968
3067/* Variables and functions for calc_load */ 2969/* Variables and functions for calc_load */
3068static atomic_long_t calc_load_tasks; 2970static atomic_long_t calc_load_tasks;
3069static unsigned long calc_load_update; 2971static unsigned long calc_load_update;
@@ -3133,15 +3035,6 @@ static void calc_load_account_active(struct rq *this_rq)
3133} 3035}
3134 3036
3135/* 3037/*
3136 * Externally visible per-cpu scheduler statistics:
3137 * cpu_nr_migrations(cpu) - number of migrations into that cpu
3138 */
3139u64 cpu_nr_migrations(int cpu)
3140{
3141 return cpu_rq(cpu)->nr_migrations_in;
3142}
3143
3144/*
3145 * Update rq->cpu_load[] statistics. This function is usually called every 3038 * Update rq->cpu_load[] statistics. This function is usually called every
3146 * scheduler tick (TICK_NSEC). 3039 * scheduler tick (TICK_NSEC).
3147 */ 3040 */
@@ -3263,7 +3156,7 @@ out:
3263void sched_exec(void) 3156void sched_exec(void)
3264{ 3157{
3265 int new_cpu, this_cpu = get_cpu(); 3158 int new_cpu, this_cpu = get_cpu();
3266 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 3159 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3267 put_cpu(); 3160 put_cpu();
3268 if (new_cpu != this_cpu) 3161 if (new_cpu != this_cpu)
3269 sched_migrate_task(current, new_cpu); 3162 sched_migrate_task(current, new_cpu);
@@ -3683,11 +3576,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3683 *imbalance = sds->min_load_per_task; 3576 *imbalance = sds->min_load_per_task;
3684 sds->busiest = sds->group_min; 3577 sds->busiest = sds->group_min;
3685 3578
3686 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3687 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3688 group_first_cpu(sds->group_leader);
3689 }
3690
3691 return 1; 3579 return 1;
3692 3580
3693} 3581}
@@ -3711,7 +3599,18 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3711} 3599}
3712#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3600#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3713 3601
3714unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) 3602
3603unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3604{
3605 return SCHED_LOAD_SCALE;
3606}
3607
3608unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3609{
3610 return default_scale_freq_power(sd, cpu);
3611}
3612
3613unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3715{ 3614{
3716 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 3615 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3717 unsigned long smt_gain = sd->smt_gain; 3616 unsigned long smt_gain = sd->smt_gain;
@@ -3721,6 +3620,11 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3721 return smt_gain; 3620 return smt_gain;
3722} 3621}
3723 3622
3623unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3624{
3625 return default_scale_smt_power(sd, cpu);
3626}
3627
3724unsigned long scale_rt_power(int cpu) 3628unsigned long scale_rt_power(int cpu)
3725{ 3629{
3726 struct rq *rq = cpu_rq(cpu); 3630 struct rq *rq = cpu_rq(cpu);
@@ -3745,10 +3649,19 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
3745 unsigned long power = SCHED_LOAD_SCALE; 3649 unsigned long power = SCHED_LOAD_SCALE;
3746 struct sched_group *sdg = sd->groups; 3650 struct sched_group *sdg = sd->groups;
3747 3651
3748 /* here we could scale based on cpufreq */ 3652 if (sched_feat(ARCH_POWER))
3653 power *= arch_scale_freq_power(sd, cpu);
3654 else
3655 power *= default_scale_freq_power(sd, cpu);
3656
3657 power >>= SCHED_LOAD_SHIFT;
3749 3658
3750 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 3659 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3751 power *= arch_scale_smt_power(sd, cpu); 3660 if (sched_feat(ARCH_POWER))
3661 power *= arch_scale_smt_power(sd, cpu);
3662 else
3663 power *= default_scale_smt_power(sd, cpu);
3664
3752 power >>= SCHED_LOAD_SHIFT; 3665 power >>= SCHED_LOAD_SHIFT;
3753 } 3666 }
3754 3667
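update_cpu_power() now composes its scale factors in SCHED_LOAD_SCALE fixed point: each factor is nominally at most 1024, the running power is multiplied by it and shifted back down by SCHED_LOAD_SHIFT, and the ARCH_POWER feature flag picks between the arch hook and the default. A standalone arithmetic sketch with invented factors (SCHED_LOAD_SHIFT assumed to be 10, as in sched.h):

        #include <stdio.h>

        #define SCHED_LOAD_SHIFT        10
        #define SCHED_LOAD_SCALE        (1UL << SCHED_LOAD_SHIFT)

        int main(void)
        {
                unsigned long power = SCHED_LOAD_SCALE;

                /* Invented: running at ~80% of maximum frequency... */
                unsigned long freq_scale = 819;         /* ~0.8 * 1024 */
                /* ...with an SMT sibling, so raw capacity is roughly halved
                 * (default_scale_smt_power() returns smt_gain / weight). */
                unsigned long smt_scale = 589;          /* ~1178 / 2, invented smt_gain */

                power = (power * freq_scale) >> SCHED_LOAD_SHIFT;
                printf("after frequency scaling: %lu\n", power);        /* 819 */

                power = (power * smt_scale) >> SCHED_LOAD_SHIFT;
                printf("after SMT scaling:       %lu\n", power);        /* 471 */

                return 0;
        }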
@@ -3785,6 +3698,7 @@ static void update_group_power(struct sched_domain *sd, int cpu)
3785 3698
3786/** 3699/**
3787 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3700 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3701 * @sd: The sched_domain whose statistics are to be updated.
3788 * @group: sched_group whose statistics are to be updated. 3702 * @group: sched_group whose statistics are to be updated.
3789 * @this_cpu: Cpu for which load balance is currently performed. 3703 * @this_cpu: Cpu for which load balance is currently performed.
3790 * @idle: Idle status of this_cpu 3704 * @idle: Idle status of this_cpu
@@ -4161,26 +4075,6 @@ ret:
4161 return NULL; 4075 return NULL;
4162} 4076}
4163 4077
4164static struct sched_group *group_of(int cpu)
4165{
4166 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
4167
4168 if (!sd)
4169 return NULL;
4170
4171 return sd->groups;
4172}
4173
4174static unsigned long power_of(int cpu)
4175{
4176 struct sched_group *group = group_of(cpu);
4177
4178 if (!group)
4179 return SCHED_LOAD_SCALE;
4180
4181 return group->cpu_power;
4182}
4183
4184/* 4078/*
4185 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4079 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4186 */ 4080 */
@@ -4240,7 +4134,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4240 unsigned long flags; 4134 unsigned long flags;
4241 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4135 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4242 4136
4243 cpumask_setall(cpus); 4137 cpumask_copy(cpus, cpu_online_mask);
4244 4138
4245 /* 4139 /*
4246 * When power savings policy is enabled for the parent domain, idle 4140 * When power savings policy is enabled for the parent domain, idle
@@ -4403,7 +4297,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4403 int all_pinned = 0; 4297 int all_pinned = 0;
4404 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4298 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4405 4299
4406 cpumask_setall(cpus); 4300 cpumask_copy(cpus, cpu_online_mask);
4407 4301
4408 /* 4302 /*
4409 * When power savings policy is enabled for the parent domain, idle 4303 * When power savings policy is enabled for the parent domain, idle
@@ -4543,6 +4437,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4543 int pulled_task = 0; 4437 int pulled_task = 0;
4544 unsigned long next_balance = jiffies + HZ; 4438 unsigned long next_balance = jiffies + HZ;
4545 4439
4440 this_rq->idle_stamp = this_rq->clock;
4441
4442 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4443 return;
4444
4546 for_each_domain(this_cpu, sd) { 4445 for_each_domain(this_cpu, sd) {
4547 unsigned long interval; 4446 unsigned long interval;
4548 4447
@@ -4557,8 +4456,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4557 interval = msecs_to_jiffies(sd->balance_interval); 4456 interval = msecs_to_jiffies(sd->balance_interval);
4558 if (time_after(next_balance, sd->last_balance + interval)) 4457 if (time_after(next_balance, sd->last_balance + interval))
4559 next_balance = sd->last_balance + interval; 4458 next_balance = sd->last_balance + interval;
4560 if (pulled_task) 4459 if (pulled_task) {
4460 this_rq->idle_stamp = 0;
4561 break; 4461 break;
4462 }
4562 } 4463 }
4563 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 4464 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4564 /* 4465 /*
@@ -5160,8 +5061,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
5160 p->gtime = cputime_add(p->gtime, cputime); 5061 p->gtime = cputime_add(p->gtime, cputime);
5161 5062
5162 /* Add guest time to cpustat. */ 5063 /* Add guest time to cpustat. */
5163 cpustat->user = cputime64_add(cpustat->user, tmp); 5064 if (TASK_NICE(p) > 0) {
5164 cpustat->guest = cputime64_add(cpustat->guest, tmp); 5065 cpustat->nice = cputime64_add(cpustat->nice, tmp);
5066 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
5067 } else {
5068 cpustat->user = cputime64_add(cpustat->user, tmp);
5069 cpustat->guest = cputime64_add(cpustat->guest, tmp);
5070 }
5165} 5071}
5166 5072
5167/* 5073/*
@@ -5239,17 +5145,16 @@ void account_idle_time(cputime_t cputime)
5239 */ 5145 */
5240void account_process_tick(struct task_struct *p, int user_tick) 5146void account_process_tick(struct task_struct *p, int user_tick)
5241{ 5147{
5242 cputime_t one_jiffy = jiffies_to_cputime(1); 5148 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
5243 cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
5244 struct rq *rq = this_rq(); 5149 struct rq *rq = this_rq();
5245 5150
5246 if (user_tick) 5151 if (user_tick)
5247 account_user_time(p, one_jiffy, one_jiffy_scaled); 5152 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
5248 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 5153 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
5249 account_system_time(p, HARDIRQ_OFFSET, one_jiffy, 5154 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
5250 one_jiffy_scaled); 5155 one_jiffy_scaled);
5251 else 5156 else
5252 account_idle_time(one_jiffy); 5157 account_idle_time(cputime_one_jiffy);
5253} 5158}
5254 5159
5255/* 5160/*
@@ -5277,60 +5182,86 @@ void account_idle_ticks(unsigned long ticks)
5277 * Use precise platform statistics if available: 5182 * Use precise platform statistics if available:
5278 */ 5183 */
5279#ifdef CONFIG_VIRT_CPU_ACCOUNTING 5184#ifdef CONFIG_VIRT_CPU_ACCOUNTING
5280cputime_t task_utime(struct task_struct *p) 5185void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5281{ 5186{
5282 return p->utime; 5187 *ut = p->utime;
5188 *st = p->stime;
5283} 5189}
5284 5190
5285cputime_t task_stime(struct task_struct *p) 5191void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5286{ 5192{
5287 return p->stime; 5193 struct task_cputime cputime;
5194
5195 thread_group_cputime(p, &cputime);
5196
5197 *ut = cputime.utime;
5198 *st = cputime.stime;
5288} 5199}
5289#else 5200#else
5290cputime_t task_utime(struct task_struct *p) 5201
5202#ifndef nsecs_to_cputime
5203# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
5204#endif
5205
5206void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5291{ 5207{
5292 clock_t utime = cputime_to_clock_t(p->utime), 5208 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
5293 total = utime + cputime_to_clock_t(p->stime);
5294 u64 temp;
5295 5209
5296 /* 5210 /*
5297 * Use CFS's precise accounting: 5211 * Use CFS's precise accounting:
5298 */ 5212 */
5299 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); 5213 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
5300 5214
5301 if (total) { 5215 if (total) {
5302 temp *= utime; 5216 u64 temp;
5217
5218 temp = (u64)(rtime * utime);
5303 do_div(temp, total); 5219 do_div(temp, total);
5304 } 5220 utime = (cputime_t)temp;
5305 utime = (clock_t)temp; 5221 } else
5222 utime = rtime;
5223
5224 /*
5225 * Compare with previous values, to keep monotonicity:
5226 */
5227 p->prev_utime = max(p->prev_utime, utime);
5228 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
5306 5229
5307 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); 5230 *ut = p->prev_utime;
5308 return p->prev_utime; 5231 *st = p->prev_stime;
5309} 5232}
5310 5233
5311cputime_t task_stime(struct task_struct *p) 5234/*
5235 * Must be called with siglock held.
5236 */
5237void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5312{ 5238{
5313 clock_t stime; 5239 struct signal_struct *sig = p->signal;
5240 struct task_cputime cputime;
5241 cputime_t rtime, utime, total;
5314 5242
5315 /* 5243 thread_group_cputime(p, &cputime);
5316 * Use CFS's precise accounting. (we subtract utime from
5317 * the total, to make sure the total observed by userspace
5318 * grows monotonically - apps rely on that):
5319 */
5320 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5321 cputime_to_clock_t(task_utime(p));
5322 5244
5323 if (stime >= 0) 5245 total = cputime_add(cputime.utime, cputime.stime);
5324 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); 5246 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
5325 5247
5326 return p->prev_stime; 5248 if (total) {
5327} 5249 u64 temp;
5328#endif
5329 5250
5330inline cputime_t task_gtime(struct task_struct *p) 5251 temp = (u64)(rtime * cputime.utime);
5331{ 5252 do_div(temp, total);
5332 return p->gtime; 5253 utime = (cputime_t)temp;
5254 } else
5255 utime = rtime;
5256
5257 sig->prev_utime = max(sig->prev_utime, utime);
5258 sig->prev_stime = max(sig->prev_stime,
5259 cputime_sub(rtime, sig->prev_utime));
5260
5261 *ut = sig->prev_utime;
5262 *st = sig->prev_stime;
5333} 5263}
5264#endif
5334 5265
5335/* 5266/*
5336 * This function gets called by the timer code, with HZ frequency. 5267 * This function gets called by the timer code, with HZ frequency.
@@ -5353,7 +5284,7 @@ void scheduler_tick(void)
5353 curr->sched_class->task_tick(rq, curr, 0); 5284 curr->sched_class->task_tick(rq, curr, 0);
5354 spin_unlock(&rq->lock); 5285 spin_unlock(&rq->lock);
5355 5286
5356 perf_counter_task_tick(curr, cpu); 5287 perf_event_task_tick(curr, cpu);
5357 5288
5358#ifdef CONFIG_SMP 5289#ifdef CONFIG_SMP
5359 rq->idle_at_tick = idle_cpu(cpu); 5290 rq->idle_at_tick = idle_cpu(cpu);
@@ -5465,14 +5396,13 @@ static inline void schedule_debug(struct task_struct *prev)
5465#endif 5396#endif
5466} 5397}
5467 5398
5468static void put_prev_task(struct rq *rq, struct task_struct *prev) 5399static void put_prev_task(struct rq *rq, struct task_struct *p)
5469{ 5400{
5470 if (prev->state == TASK_RUNNING) { 5401 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5471 u64 runtime = prev->se.sum_exec_runtime;
5472 5402
5473 runtime -= prev->se.prev_sum_exec_runtime; 5403 update_avg(&p->se.avg_running, runtime);
5474 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5475 5404
5405 if (p->state == TASK_RUNNING) {
5476 /* 5406 /*
5477 * In order to avoid avg_overlap growing stale when we are 5407 * In order to avoid avg_overlap growing stale when we are
5478 * indeed overlapping and hence not getting put to sleep, grow 5408 * indeed overlapping and hence not getting put to sleep, grow
@@ -5482,9 +5412,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
5482 * correlates to the amount of cache footprint a task can 5412 * correlates to the amount of cache footprint a task can
5483 * build up. 5413 * build up.
5484 */ 5414 */
5485 update_avg(&prev->se.avg_overlap, runtime); 5415 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5416 update_avg(&p->se.avg_overlap, runtime);
5417 } else {
5418 update_avg(&p->se.avg_running, 0);
5486 } 5419 }
5487 prev->sched_class->put_prev_task(rq, prev); 5420 p->sched_class->put_prev_task(rq, p);
5488} 5421}
5489 5422
5490/* 5423/*
@@ -5567,7 +5500,7 @@ need_resched_nonpreemptible:
5567 5500
5568 if (likely(prev != next)) { 5501 if (likely(prev != next)) {
5569 sched_info_switch(prev, next); 5502 sched_info_switch(prev, next);
5570 perf_counter_task_sched_out(prev, next, cpu); 5503 perf_event_task_sched_out(prev, next, cpu);
5571 5504
5572 rq->nr_switches++; 5505 rq->nr_switches++;
5573 rq->curr = next; 5506 rq->curr = next;
@@ -5594,7 +5527,7 @@ need_resched_nonpreemptible:
5594} 5527}
5595EXPORT_SYMBOL(schedule); 5528EXPORT_SYMBOL(schedule);
5596 5529
5597#ifdef CONFIG_SMP 5530#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
5598/* 5531/*
5599 * Look out! "owner" is an entirely speculative pointer 5532 * Look out! "owner" is an entirely speculative pointer
5600 * access and not reliable. 5533 * access and not reliable.
@@ -5716,10 +5649,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
5716 5649
5717#endif /* CONFIG_PREEMPT */ 5650#endif /* CONFIG_PREEMPT */
5718 5651
5719int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 5652int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5720 void *key) 5653 void *key)
5721{ 5654{
5722 return try_to_wake_up(curr->private, mode, sync); 5655 return try_to_wake_up(curr->private, mode, wake_flags);
5723} 5656}
5724EXPORT_SYMBOL(default_wake_function); 5657EXPORT_SYMBOL(default_wake_function);
5725 5658
@@ -5733,14 +5666,14 @@ EXPORT_SYMBOL(default_wake_function);
5733 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5666 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5734 */ 5667 */
5735static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5668static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5736 int nr_exclusive, int sync, void *key) 5669 int nr_exclusive, int wake_flags, void *key)
5737{ 5670{
5738 wait_queue_t *curr, *next; 5671 wait_queue_t *curr, *next;
5739 5672
5740 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 5673 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5741 unsigned flags = curr->flags; 5674 unsigned flags = curr->flags;
5742 5675
5743 if (curr->func(curr, mode, sync, key) && 5676 if (curr->func(curr, mode, wake_flags, key) &&
5744 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 5677 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5745 break; 5678 break;
5746 } 5679 }
@@ -5801,16 +5734,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5801 int nr_exclusive, void *key) 5734 int nr_exclusive, void *key)
5802{ 5735{
5803 unsigned long flags; 5736 unsigned long flags;
5804 int sync = 1; 5737 int wake_flags = WF_SYNC;
5805 5738
5806 if (unlikely(!q)) 5739 if (unlikely(!q))
5807 return; 5740 return;
5808 5741
5809 if (unlikely(!nr_exclusive)) 5742 if (unlikely(!nr_exclusive))
5810 sync = 0; 5743 wake_flags = 0;
5811 5744
5812 spin_lock_irqsave(&q->lock, flags); 5745 spin_lock_irqsave(&q->lock, flags);
5813 __wake_up_common(q, mode, nr_exclusive, sync, key); 5746 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5814 spin_unlock_irqrestore(&q->lock, flags); 5747 spin_unlock_irqrestore(&q->lock, flags);
5815} 5748}
5816EXPORT_SYMBOL_GPL(__wake_up_sync_key); 5749EXPORT_SYMBOL_GPL(__wake_up_sync_key);
@@ -6288,22 +6221,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6288 BUG_ON(p->se.on_rq); 6221 BUG_ON(p->se.on_rq);
6289 6222
6290 p->policy = policy; 6223 p->policy = policy;
6291 switch (p->policy) {
6292 case SCHED_NORMAL:
6293 case SCHED_BATCH:
6294 case SCHED_IDLE:
6295 p->sched_class = &fair_sched_class;
6296 break;
6297 case SCHED_FIFO:
6298 case SCHED_RR:
6299 p->sched_class = &rt_sched_class;
6300 break;
6301 }
6302
6303 p->rt_priority = prio; 6224 p->rt_priority = prio;
6304 p->normal_prio = normal_prio(p); 6225 p->normal_prio = normal_prio(p);
6305 /* we are holding p->pi_lock already */ 6226 /* we are holding p->pi_lock already */
6306 p->prio = rt_mutex_getprio(p); 6227 p->prio = rt_mutex_getprio(p);
6228 if (rt_prio(p->prio))
6229 p->sched_class = &rt_sched_class;
6230 else
6231 p->sched_class = &fair_sched_class;
6307 set_load_weight(p); 6232 set_load_weight(p);
6308} 6233}
6309 6234
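With the switch-on-policy block removed, __setscheduler() above derives the class purely from the effective priority, so a SCHED_NORMAL task that is PI-boosted into the RT range runs under the RT class for the duration of the boost. A tiny illustrative sketch of that decision (plain C; the MAX_RT_PRIO value of 100 mirrors the usual kernel convention but is an assumption here):

	#include <stdio.h>

	#define MAX_RT_PRIO 100		/* assumed, the common kernel value */

	static const char *class_for(int effective_prio)
	{
		return effective_prio < MAX_RT_PRIO ? "rt_sched_class"
						    : "fair_sched_class";
	}

	int main(void)
	{
		printf("prio 120 -> %s\n", class_for(120)); /* plain SCHED_NORMAL */
		printf("prio  50 -> %s\n", class_for(50));  /* RT or PI-boosted   */
		return 0;
	}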
@@ -6866,9 +6791,6 @@ EXPORT_SYMBOL(yield);
6866/* 6791/*
6867 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 6792 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6868 * that process accounting knows that this is a task in IO wait state. 6793 * that process accounting knows that this is a task in IO wait state.
6869 *
6870 * But don't do that if it is a deliberate, throttling IO wait (this task
6871 * has set its backing_dev_info: the queue against which it should throttle)
6872 */ 6794 */
6873void __sched io_schedule(void) 6795void __sched io_schedule(void)
6874{ 6796{
@@ -6977,23 +6899,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6977 if (retval) 6899 if (retval)
6978 goto out_unlock; 6900 goto out_unlock;
6979 6901
6980 /* 6902 time_slice = p->sched_class->get_rr_interval(p);
6981 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
6982 * tasks that are on an otherwise idle runqueue:
6983 */
6984 time_slice = 0;
6985 if (p->policy == SCHED_RR) {
6986 time_slice = DEF_TIMESLICE;
6987 } else if (p->policy != SCHED_FIFO) {
6988 struct sched_entity *se = &p->se;
6989 unsigned long flags;
6990 struct rq *rq;
6991 6903
6992 rq = task_rq_lock(p, &flags);
6993 if (rq->cfs.load.weight)
6994 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
6995 task_rq_unlock(rq, &flags);
6996 }
6997 read_unlock(&tasklist_lock); 6904 read_unlock(&tasklist_lock);
6998 jiffies_to_timespec(time_slice, &t); 6905 jiffies_to_timespec(time_slice, &t);
6999 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 6906 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -7066,7 +6973,7 @@ void show_state_filter(unsigned long state_filter)
7066 /* 6973 /*
7067 * Only show locks if all tasks are dumped: 6974 * Only show locks if all tasks are dumped:
7068 */ 6975 */
7069 if (state_filter == -1) 6976 if (!state_filter)
7070 debug_show_all_locks(); 6977 debug_show_all_locks();
7071} 6978}
7072 6979
@@ -7844,7 +7751,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7844/* 7751/*
7845 * Register at high priority so that task migration (migrate_all_tasks) 7752 * Register at high priority so that task migration (migrate_all_tasks)
7846 * happens before everything else. This has to be lower priority than 7753 * happens before everything else. This has to be lower priority than
7847 * the notifier in the perf_counter subsystem, though. 7754 * the notifier in the perf_event subsystem, though.
7848 */ 7755 */
7849static struct notifier_block __cpuinitdata migration_notifier = { 7756static struct notifier_block __cpuinitdata migration_notifier = {
7850 .notifier_call = migration_call, 7757 .notifier_call = migration_call,
@@ -7871,6 +7778,16 @@ early_initcall(migration_init);
7871 7778
7872#ifdef CONFIG_SCHED_DEBUG 7779#ifdef CONFIG_SCHED_DEBUG
7873 7780
7781static __read_mostly int sched_domain_debug_enabled;
7782
7783static int __init sched_domain_debug_setup(char *str)
7784{
7785 sched_domain_debug_enabled = 1;
7786
7787 return 0;
7788}
7789early_param("sched_debug", sched_domain_debug_setup);
7790
7874static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 7791static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7875 struct cpumask *groupmask) 7792 struct cpumask *groupmask)
7876{ 7793{
@@ -7957,6 +7874,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
7957 cpumask_var_t groupmask; 7874 cpumask_var_t groupmask;
7958 int level = 0; 7875 int level = 0;
7959 7876
7877 if (!sched_domain_debug_enabled)
7878 return;
7879
7960 if (!sd) { 7880 if (!sd) {
7961 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 7881 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7962 return; 7882 return;
@@ -8000,9 +7920,7 @@ static int sd_degenerate(struct sched_domain *sd)
8000 } 7920 }
8001 7921
8002 /* Following flags don't use groups */ 7922 /* Following flags don't use groups */
8003 if (sd->flags & (SD_WAKE_IDLE | 7923 if (sd->flags & (SD_WAKE_AFFINE))
8004 SD_WAKE_AFFINE |
8005 SD_WAKE_BALANCE))
8006 return 0; 7924 return 0;
8007 7925
8008 return 1; 7926 return 1;
@@ -8019,10 +7937,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
8019 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 7937 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
8020 return 0; 7938 return 0;
8021 7939
8022 /* Does parent contain flags not in child? */
8023 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
8024 if (cflags & SD_WAKE_AFFINE)
8025 pflags &= ~SD_WAKE_BALANCE;
8026 /* Flags needing groups don't count if only 1 group in parent */ 7940 /* Flags needing groups don't count if only 1 group in parent */
8027 if (parent->groups == parent->groups->next) { 7941 if (parent->groups == parent->groups->next) {
8028 pflags &= ~(SD_LOAD_BALANCE | 7942 pflags &= ~(SD_LOAD_BALANCE |
@@ -8042,6 +7956,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
8042 7956
8043static void free_rootdomain(struct root_domain *rd) 7957static void free_rootdomain(struct root_domain *rd)
8044{ 7958{
7959 synchronize_sched();
7960
8045 cpupri_cleanup(&rd->cpupri); 7961 cpupri_cleanup(&rd->cpupri);
8046 7962
8047 free_cpumask_var(rd->rto_mask); 7963 free_cpumask_var(rd->rto_mask);
@@ -8182,6 +8098,7 @@ static cpumask_var_t cpu_isolated_map;
8182/* Setup the mask of cpus configured for isolated domains */ 8098/* Setup the mask of cpus configured for isolated domains */
8183static int __init isolated_cpu_setup(char *str) 8099static int __init isolated_cpu_setup(char *str)
8184{ 8100{
8101 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8185 cpulist_parse(str, cpu_isolated_map); 8102 cpulist_parse(str, cpu_isolated_map);
8186 return 1; 8103 return 1;
8187} 8104}
@@ -8708,10 +8625,10 @@ static void set_domain_attribute(struct sched_domain *sd,
8708 request = attr->relax_domain_level; 8625 request = attr->relax_domain_level;
8709 if (request < sd->level) { 8626 if (request < sd->level) {
8710 /* turn off idle balance on this domain */ 8627 /* turn off idle balance on this domain */
8711 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); 8628 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8712 } else { 8629 } else {
8713 /* turn on idle balance on this domain */ 8630 /* turn on idle balance on this domain */
8714 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); 8631 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8715 } 8632 }
8716} 8633}
8717 8634
@@ -9018,7 +8935,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
9018 return __build_sched_domains(cpu_map, NULL); 8935 return __build_sched_domains(cpu_map, NULL);
9019} 8936}
9020 8937
9021static struct cpumask *doms_cur; /* current sched domains */ 8938static cpumask_var_t *doms_cur; /* current sched domains */
9022static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 8939static int ndoms_cur; /* number of sched domains in 'doms_cur' */
9023static struct sched_domain_attr *dattr_cur; 8940static struct sched_domain_attr *dattr_cur;
9024 /* attribues of custom domains in 'doms_cur' */ 8941 /* attribues of custom domains in 'doms_cur' */
@@ -9040,6 +8957,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
9040 return 0; 8957 return 0;
9041} 8958}
9042 8959
8960cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
8961{
8962 int i;
8963 cpumask_var_t *doms;
8964
8965 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
8966 if (!doms)
8967 return NULL;
8968 for (i = 0; i < ndoms; i++) {
8969 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
8970 free_sched_domains(doms, i);
8971 return NULL;
8972 }
8973 }
8974 return doms;
8975}
8976
8977void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
8978{
8979 unsigned int i;
8980 for (i = 0; i < ndoms; i++)
8981 free_cpumask_var(doms[i]);
8982 kfree(doms);
8983}
8984
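alloc_sched_domains()/free_sched_domains() above replace the old flat kmalloc of cpumasks with an array of individually allocated masks, unwinding cleanly if any element allocation fails; partition_sched_domains() (see the updated comment below) then takes ownership of whatever array it is handed. A user-space analogue of that allocate-or-unwind pattern, with malloc/calloc standing in for the cpumask_var_t helpers:

	#include <stdlib.h>

	typedef unsigned long *mask_t;	/* stand-in for an off-stack cpumask_var_t */
	#define MASK_LONGS 4

	static mask_t *alloc_domains(unsigned int ndoms)
	{
		mask_t *doms = malloc(ndoms * sizeof(*doms));
		unsigned int i;

		if (!doms)
			return NULL;
		for (i = 0; i < ndoms; i++) {
			doms[i] = calloc(MASK_LONGS, sizeof(unsigned long));
			if (!doms[i]) {
				while (i--)	/* unwind the partial allocation */
					free(doms[i]);
				free(doms);
				return NULL;	/* caller falls back to one domain */
			}
		}
		return doms;
	}

	static void free_domains(mask_t *doms, unsigned int ndoms)
	{
		unsigned int i;

		for (i = 0; i < ndoms; i++)
			free(doms[i]);
		free(doms);
	}

	int main(void)
	{
		mask_t *doms = alloc_domains(2);

		/* ownership would normally pass to the partitioning code here */
		if (doms)
			free_domains(doms, 2);
		return 0;
	}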
9043/* 8985/*
9044 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 8986 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
9045 * For now this just excludes isolated cpus, but could be used to 8987 * For now this just excludes isolated cpus, but could be used to
@@ -9051,12 +8993,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
9051 8993
9052 arch_update_cpu_topology(); 8994 arch_update_cpu_topology();
9053 ndoms_cur = 1; 8995 ndoms_cur = 1;
9054 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); 8996 doms_cur = alloc_sched_domains(ndoms_cur);
9055 if (!doms_cur) 8997 if (!doms_cur)
9056 doms_cur = fallback_doms; 8998 doms_cur = &fallback_doms;
9057 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); 8999 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
9058 dattr_cur = NULL; 9000 dattr_cur = NULL;
9059 err = build_sched_domains(doms_cur); 9001 err = build_sched_domains(doms_cur[0]);
9060 register_sched_domain_sysctl(); 9002 register_sched_domain_sysctl();
9061 9003
9062 return err; 9004 return err;
@@ -9106,19 +9048,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
9106 * doms_new[] to the current sched domain partitioning, doms_cur[]. 9048 * doms_new[] to the current sched domain partitioning, doms_cur[].
9107 * It destroys each deleted domain and builds each new domain. 9049 * It destroys each deleted domain and builds each new domain.
9108 * 9050 *
9109 * 'doms_new' is an array of cpumask's of length 'ndoms_new'. 9051 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
9110 * The masks don't intersect (don't overlap.) We should setup one 9052 * The masks don't intersect (don't overlap.) We should setup one
9111 * sched domain for each mask. CPUs not in any of the cpumasks will 9053 * sched domain for each mask. CPUs not in any of the cpumasks will
9112 * not be load balanced. If the same cpumask appears both in the 9054 * not be load balanced. If the same cpumask appears both in the
9113 * current 'doms_cur' domains and in the new 'doms_new', we can leave 9055 * current 'doms_cur' domains and in the new 'doms_new', we can leave
9114 * it as it is. 9056 * it as it is.
9115 * 9057 *
9116 * The passed in 'doms_new' should be kmalloc'd. This routine takes 9058 * The passed in 'doms_new' should be allocated using
9117 * ownership of it and will kfree it when done with it. If the caller 9059 * alloc_sched_domains. This routine takes ownership of it and will
9118 * failed the kmalloc call, then it can pass in doms_new == NULL && 9060 * free_sched_domains it when done with it. If the caller failed the
9119 * ndoms_new == 1, and partition_sched_domains() will fallback to 9061 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
9120 * the single partition 'fallback_doms', it also forces the domains 9062 * and partition_sched_domains() will fall back to the single partition
9121 * to be rebuilt. 9063 * 'fallback_doms', it also forces the domains to be rebuilt.
9122 * 9064 *
9123 * If doms_new == NULL it will be replaced with cpu_online_mask. 9065 * If doms_new == NULL it will be replaced with cpu_online_mask.
9124 * ndoms_new == 0 is a special case for destroying existing domains, 9066 * ndoms_new == 0 is a special case for destroying existing domains,
@@ -9126,8 +9068,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
9126 * 9068 *
9127 * Call with hotplug lock held 9069 * Call with hotplug lock held
9128 */ 9070 */
9129/* FIXME: Change to struct cpumask *doms_new[] */ 9071void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
9130void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
9131 struct sched_domain_attr *dattr_new) 9072 struct sched_domain_attr *dattr_new)
9132{ 9073{
9133 int i, j, n; 9074 int i, j, n;
@@ -9146,40 +9087,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
9146 /* Destroy deleted domains */ 9087 /* Destroy deleted domains */
9147 for (i = 0; i < ndoms_cur; i++) { 9088 for (i = 0; i < ndoms_cur; i++) {
9148 for (j = 0; j < n && !new_topology; j++) { 9089 for (j = 0; j < n && !new_topology; j++) {
9149 if (cpumask_equal(&doms_cur[i], &doms_new[j]) 9090 if (cpumask_equal(doms_cur[i], doms_new[j])
9150 && dattrs_equal(dattr_cur, i, dattr_new, j)) 9091 && dattrs_equal(dattr_cur, i, dattr_new, j))
9151 goto match1; 9092 goto match1;
9152 } 9093 }
9153 /* no match - a current sched domain not in new doms_new[] */ 9094 /* no match - a current sched domain not in new doms_new[] */
9154 detach_destroy_domains(doms_cur + i); 9095 detach_destroy_domains(doms_cur[i]);
9155match1: 9096match1:
9156 ; 9097 ;
9157 } 9098 }
9158 9099
9159 if (doms_new == NULL) { 9100 if (doms_new == NULL) {
9160 ndoms_cur = 0; 9101 ndoms_cur = 0;
9161 doms_new = fallback_doms; 9102 doms_new = &fallback_doms;
9162 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); 9103 cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map);
9163 WARN_ON_ONCE(dattr_new); 9104 WARN_ON_ONCE(dattr_new);
9164 } 9105 }
9165 9106
9166 /* Build new domains */ 9107 /* Build new domains */
9167 for (i = 0; i < ndoms_new; i++) { 9108 for (i = 0; i < ndoms_new; i++) {
9168 for (j = 0; j < ndoms_cur && !new_topology; j++) { 9109 for (j = 0; j < ndoms_cur && !new_topology; j++) {
9169 if (cpumask_equal(&doms_new[i], &doms_cur[j]) 9110 if (cpumask_equal(doms_new[i], doms_cur[j])
9170 && dattrs_equal(dattr_new, i, dattr_cur, j)) 9111 && dattrs_equal(dattr_new, i, dattr_cur, j))
9171 goto match2; 9112 goto match2;
9172 } 9113 }
9173 /* no match - add a new doms_new */ 9114 /* no match - add a new doms_new */
9174 __build_sched_domains(doms_new + i, 9115 __build_sched_domains(doms_new[i],
9175 dattr_new ? dattr_new + i : NULL); 9116 dattr_new ? dattr_new + i : NULL);
9176match2: 9117match2:
9177 ; 9118 ;
9178 } 9119 }
9179 9120
9180 /* Remember the new sched domains */ 9121 /* Remember the new sched domains */
9181 if (doms_cur != fallback_doms) 9122 if (doms_cur != &fallback_doms)
9182 kfree(doms_cur); 9123 free_sched_domains(doms_cur, ndoms_cur);
9183 kfree(dattr_cur); /* kfree(NULL) is safe */ 9124 kfree(dattr_cur); /* kfree(NULL) is safe */
9184 doms_cur = doms_new; 9125 doms_cur = doms_new;
9185 dattr_cur = dattr_new; 9126 dattr_cur = dattr_new;
@@ -9329,6 +9270,7 @@ void __init sched_init_smp(void)
9329 cpumask_var_t non_isolated_cpus; 9270 cpumask_var_t non_isolated_cpus;
9330 9271
9331 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 9272 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
9273 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9332 9274
9333#if defined(CONFIG_NUMA) 9275#if defined(CONFIG_NUMA)
9334 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 9276 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -9360,7 +9302,6 @@ void __init sched_init_smp(void)
9360 sched_init_granularity(); 9302 sched_init_granularity();
9361 free_cpumask_var(non_isolated_cpus); 9303 free_cpumask_var(non_isolated_cpus);
9362 9304
9363 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9364 init_sched_rt_class(); 9305 init_sched_rt_class();
9365} 9306}
9366#else 9307#else
@@ -9501,10 +9442,6 @@ void __init sched_init(void)
9501#ifdef CONFIG_CPUMASK_OFFSTACK 9442#ifdef CONFIG_CPUMASK_OFFSTACK
9502 alloc_size += num_possible_cpus() * cpumask_size(); 9443 alloc_size += num_possible_cpus() * cpumask_size();
9503#endif 9444#endif
9504 /*
9505 * As sched_init() is called before page_alloc is setup,
9506 * we use alloc_bootmem().
9507 */
9508 if (alloc_size) { 9445 if (alloc_size) {
9509 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 9446 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
9510 9447
@@ -9573,6 +9510,10 @@ void __init sched_init(void)
9573#endif /* CONFIG_USER_SCHED */ 9510#endif /* CONFIG_USER_SCHED */
9574#endif /* CONFIG_GROUP_SCHED */ 9511#endif /* CONFIG_GROUP_SCHED */
9575 9512
9513#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9514 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
9515 __alignof__(unsigned long));
9516#endif
9576 for_each_possible_cpu(i) { 9517 for_each_possible_cpu(i) {
9577 struct rq *rq; 9518 struct rq *rq;
9578 9519
@@ -9655,6 +9596,8 @@ void __init sched_init(void)
9655 rq->cpu = i; 9596 rq->cpu = i;
9656 rq->online = 0; 9597 rq->online = 0;
9657 rq->migration_thread = NULL; 9598 rq->migration_thread = NULL;
9599 rq->idle_stamp = 0;
9600 rq->avg_idle = 2*sysctl_sched_migration_cost;
9658 INIT_LIST_HEAD(&rq->migration_queue); 9601 INIT_LIST_HEAD(&rq->migration_queue);
9659 rq_attach_root(rq, &def_root_domain); 9602 rq_attach_root(rq, &def_root_domain);
9660#endif 9603#endif
@@ -9698,16 +9641,18 @@ void __init sched_init(void)
9698 current->sched_class = &fair_sched_class; 9641 current->sched_class = &fair_sched_class;
9699 9642
9700 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 9643 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
9701 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 9644 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
9702#ifdef CONFIG_SMP 9645#ifdef CONFIG_SMP
9703#ifdef CONFIG_NO_HZ 9646#ifdef CONFIG_NO_HZ
9704 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 9647 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9705 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 9648 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9706#endif 9649#endif
9707 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9650 /* May be allocated at isolcpus cmdline parse time */
9651 if (cpu_isolated_map == NULL)
9652 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9708#endif /* SMP */ 9653#endif /* SMP */
9709 9654
9710 perf_counter_init(); 9655 perf_event_init();
9711 9656
9712 scheduler_running = 1; 9657 scheduler_running = 1;
9713} 9658}
@@ -10479,7 +10424,7 @@ static int sched_rt_global_constraints(void)
10479#endif /* CONFIG_RT_GROUP_SCHED */ 10424#endif /* CONFIG_RT_GROUP_SCHED */
10480 10425
10481int sched_rt_handler(struct ctl_table *table, int write, 10426int sched_rt_handler(struct ctl_table *table, int write,
10482 struct file *filp, void __user *buffer, size_t *lenp, 10427 void __user *buffer, size_t *lenp,
10483 loff_t *ppos) 10428 loff_t *ppos)
10484{ 10429{
10485 int ret; 10430 int ret;
@@ -10490,7 +10435,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
10490 old_period = sysctl_sched_rt_period; 10435 old_period = sysctl_sched_rt_period;
10491 old_runtime = sysctl_sched_rt_runtime; 10436 old_runtime = sysctl_sched_rt_runtime;
10492 10437
10493 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); 10438 ret = proc_dointvec(table, write, buffer, lenp, ppos);
10494 10439
10495 if (!ret && write) { 10440 if (!ret && write) {
10496 ret = sched_rt_global_constraints(); 10441 ret = sched_rt_global_constraints();
@@ -10544,8 +10489,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
10544} 10489}
10545 10490
10546static int 10491static int
10547cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10492cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
10548 struct task_struct *tsk)
10549{ 10493{
10550#ifdef CONFIG_RT_GROUP_SCHED 10494#ifdef CONFIG_RT_GROUP_SCHED
10551 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 10495 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
@@ -10555,15 +10499,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10555 if (tsk->sched_class != &fair_sched_class) 10499 if (tsk->sched_class != &fair_sched_class)
10556 return -EINVAL; 10500 return -EINVAL;
10557#endif 10501#endif
10502 return 0;
10503}
10558 10504
10505static int
10506cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10507 struct task_struct *tsk, bool threadgroup)
10508{
10509 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
10510 if (retval)
10511 return retval;
10512 if (threadgroup) {
10513 struct task_struct *c;
10514 rcu_read_lock();
10515 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10516 retval = cpu_cgroup_can_attach_task(cgrp, c);
10517 if (retval) {
10518 rcu_read_unlock();
10519 return retval;
10520 }
10521 }
10522 rcu_read_unlock();
10523 }
10559 return 0; 10524 return 0;
10560} 10525}
10561 10526
10562static void 10527static void
10563cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10528cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10564 struct cgroup *old_cont, struct task_struct *tsk) 10529 struct cgroup *old_cont, struct task_struct *tsk,
10530 bool threadgroup)
10565{ 10531{
10566 sched_move_task(tsk); 10532 sched_move_task(tsk);
10533 if (threadgroup) {
10534 struct task_struct *c;
10535 rcu_read_lock();
10536 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10537 sched_move_task(c);
10538 }
10539 rcu_read_unlock();
10540 }
10567} 10541}
10568 10542
10569#ifdef CONFIG_FAIR_GROUP_SCHED 10543#ifdef CONFIG_FAIR_GROUP_SCHED
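The threadgroup-aware can_attach/attach callbacks above follow an all-or-nothing rule: the group leader is checked first, then every thread on ->thread_group is checked under rcu_read_lock(), and only if all of them pass does the attach path move each one with sched_move_task(). The plain-C sketch below models just that contract over an array; struct fake_task and can_attach_one() are illustrative stand-ins, not kernel API:

	#include <stdio.h>

	struct fake_task {
		const char *comm;
		int is_rt;	/* stand-in for "not in the fair class" */
	};

	static int can_attach_one(const struct fake_task *t)
	{
		return t->is_rt ? -22 /* -EINVAL */ : 0;
	}

	/* validate every thread before moving any of them */
	static int can_attach_group(const struct fake_task *grp, int n)
	{
		int i, ret;

		for (i = 0; i < n; i++) {
			ret = can_attach_one(&grp[i]);
			if (ret)
				return ret;	/* one failure rejects the whole group */
		}
		return 0;
	}

	int main(void)
	{
		struct fake_task group[] = {
			{ "leader", 0 }, { "worker", 0 }, { "rt-helper", 1 },
		};

		printf("can_attach = %d\n",
		       can_attach_group(group, sizeof(group) / sizeof(group[0])));
		return 0;
	}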
@@ -11005,6 +10979,7 @@ void synchronize_sched_expedited(void)
11005 spin_unlock_irqrestore(&rq->lock, flags); 10979 spin_unlock_irqrestore(&rq->lock, flags);
11006 } 10980 }
11007 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10981 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10982 synchronize_sched_expedited_count++;
11008 mutex_unlock(&rcu_sched_expedited_mutex); 10983 mutex_unlock(&rcu_sched_expedited_mutex);
11009 put_online_cpus(); 10984 put_online_cpus();
11010 if (need_full_sync) 10985 if (need_full_sync)
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e1d16c9a7680..479ce5682d7c 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running;
48__read_mostly int sched_clock_stable; 48__read_mostly int sched_clock_stable;
49 49
50struct sched_clock_data { 50struct sched_clock_data {
51 /*
52 * Raw spinlock - this is a special case: this might be called
53 * from within instrumentation code so we dont want to do any
54 * instrumentation ourselves.
55 */
56 raw_spinlock_t lock;
57
58 u64 tick_raw; 51 u64 tick_raw;
59 u64 tick_gtod; 52 u64 tick_gtod;
60 u64 clock; 53 u64 clock;
@@ -80,7 +73,6 @@ void sched_clock_init(void)
80 for_each_possible_cpu(cpu) { 73 for_each_possible_cpu(cpu) {
81 struct sched_clock_data *scd = cpu_sdc(cpu); 74 struct sched_clock_data *scd = cpu_sdc(cpu);
82 75
83 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
84 scd->tick_raw = 0; 76 scd->tick_raw = 0;
85 scd->tick_gtod = ktime_now; 77 scd->tick_gtod = ktime_now;
86 scd->clock = ktime_now; 78 scd->clock = ktime_now;
@@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y)
109 * - filter out backward motion 101 * - filter out backward motion
110 * - use the GTOD tick value to create a window to filter crazy TSC values 102 * - use the GTOD tick value to create a window to filter crazy TSC values
111 */ 103 */
112static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) 104static u64 sched_clock_local(struct sched_clock_data *scd)
113{ 105{
114 s64 delta = now - scd->tick_raw; 106 u64 now, clock, old_clock, min_clock, max_clock;
115 u64 clock, min_clock, max_clock; 107 s64 delta;
116 108
109again:
110 now = sched_clock();
111 delta = now - scd->tick_raw;
117 if (unlikely(delta < 0)) 112 if (unlikely(delta < 0))
118 delta = 0; 113 delta = 0;
119 114
115 old_clock = scd->clock;
116
120 /* 117 /*
121 * scd->clock = clamp(scd->tick_gtod + delta, 118 * scd->clock = clamp(scd->tick_gtod + delta,
122 * max(scd->tick_gtod, scd->clock), 119 * max(scd->tick_gtod, scd->clock),
@@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
124 */ 121 */
125 122
126 clock = scd->tick_gtod + delta; 123 clock = scd->tick_gtod + delta;
127 min_clock = wrap_max(scd->tick_gtod, scd->clock); 124 min_clock = wrap_max(scd->tick_gtod, old_clock);
128 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); 125 max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
129 126
130 clock = wrap_max(clock, min_clock); 127 clock = wrap_max(clock, min_clock);
131 clock = wrap_min(clock, max_clock); 128 clock = wrap_min(clock, max_clock);
132 129
133 scd->clock = clock; 130 if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
131 goto again;
134 132
135 return scd->clock; 133 return clock;
136} 134}
137 135
138static void lock_double_clock(struct sched_clock_data *data1, 136static u64 sched_clock_remote(struct sched_clock_data *scd)
139 struct sched_clock_data *data2)
140{ 137{
141 if (data1 < data2) { 138 struct sched_clock_data *my_scd = this_scd();
142 __raw_spin_lock(&data1->lock); 139 u64 this_clock, remote_clock;
143 __raw_spin_lock(&data2->lock); 140 u64 *ptr, old_val, val;
141
142 sched_clock_local(my_scd);
143again:
144 this_clock = my_scd->clock;
145 remote_clock = scd->clock;
146
147 /*
148 * Use the opportunity that we have both clock values
149 * in hand to couple the two clocks: we take the
150 * larger time as the latest time for both
151 * runqueues. (this creates monotonic movement)
152 */
153 if (likely((s64)(remote_clock - this_clock) < 0)) {
154 ptr = &scd->clock;
155 old_val = remote_clock;
156 val = this_clock;
144 } else { 157 } else {
145 __raw_spin_lock(&data2->lock); 158 /*
146 __raw_spin_lock(&data1->lock); 159 * Should be rare, but possible:
160 */
161 ptr = &my_scd->clock;
162 old_val = this_clock;
163 val = remote_clock;
147 } 164 }
165
166 if (cmpxchg64(ptr, old_val, val) != old_val)
167 goto again;
168
169 return val;
148} 170}
149 171
150u64 sched_clock_cpu(int cpu) 172u64 sched_clock_cpu(int cpu)
151{ 173{
152 u64 now, clock, this_clock, remote_clock;
153 struct sched_clock_data *scd; 174 struct sched_clock_data *scd;
175 u64 clock;
176
177 WARN_ON_ONCE(!irqs_disabled());
154 178
155 if (sched_clock_stable) 179 if (sched_clock_stable)
156 return sched_clock(); 180 return sched_clock();
157 181
158 scd = cpu_sdc(cpu);
159
160 /*
161 * Normally this is not called in NMI context - but if it is,
162 * trying to do any locking here is totally lethal.
163 */
164 if (unlikely(in_nmi()))
165 return scd->clock;
166
167 if (unlikely(!sched_clock_running)) 182 if (unlikely(!sched_clock_running))
168 return 0ull; 183 return 0ull;
169 184
170 WARN_ON_ONCE(!irqs_disabled()); 185 scd = cpu_sdc(cpu);
171 now = sched_clock();
172
173 if (cpu != raw_smp_processor_id()) {
174 struct sched_clock_data *my_scd = this_scd();
175
176 lock_double_clock(scd, my_scd);
177
178 this_clock = __update_sched_clock(my_scd, now);
179 remote_clock = scd->clock;
180
181 /*
182 * Use the opportunity that we have both locks
183 * taken to couple the two clocks: we take the
184 * larger time as the latest time for both
185 * runqueues. (this creates monotonic movement)
186 */
187 if (likely((s64)(remote_clock - this_clock) < 0)) {
188 clock = this_clock;
189 scd->clock = clock;
190 } else {
191 /*
192 * Should be rare, but possible:
193 */
194 clock = remote_clock;
195 my_scd->clock = remote_clock;
196 }
197
198 __raw_spin_unlock(&my_scd->lock);
199 } else {
200 __raw_spin_lock(&scd->lock);
201 clock = __update_sched_clock(scd, now);
202 }
203 186
204 __raw_spin_unlock(&scd->lock); 187 if (cpu != smp_processor_id())
188 clock = sched_clock_remote(scd);
189 else
190 clock = sched_clock_local(scd);
205 191
206 return clock; 192 return clock;
207} 193}
@@ -223,11 +209,9 @@ void sched_clock_tick(void)
223 now_gtod = ktime_to_ns(ktime_get()); 209 now_gtod = ktime_to_ns(ktime_get());
224 now = sched_clock(); 210 now = sched_clock();
225 211
226 __raw_spin_lock(&scd->lock);
227 scd->tick_raw = now; 212 scd->tick_raw = now;
228 scd->tick_gtod = now_gtod; 213 scd->tick_gtod = now_gtod;
229 __update_sched_clock(scd, now); 214 sched_clock_local(scd);
230 __raw_spin_unlock(&scd->lock);
231} 215}
232 216
233/* 217/*
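The sched_clock.c rework above drops the per-cpu raw spinlock entirely: sched_clock_local() and sched_clock_remote() recompute the clock from a snapshot and publish it with cmpxchg64(), looping again if another CPU raced in and changed the value first. A stand-alone sketch of that retry pattern using C11 atomics in place of cmpxchg64() (the monotonic-advance rule is the part being illustrated; the rest is scaffolding):

	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	static _Atomic uint64_t clock_ns;

	/*
	 * Lockless "advance the clock" update: compute the candidate from a
	 * snapshot, then retry until the compare-and-swap installs it over
	 * the value we computed from.
	 */
	static uint64_t clock_advance(uint64_t sample)
	{
		uint64_t old, new;

		do {
			old = atomic_load(&clock_ns);
			new = sample > old ? sample : old; /* filter backward motion */
		} while (!atomic_compare_exchange_strong(&clock_ns, &old, new));

		return new;
	}

	int main(void)
	{
		printf("%llu\n", (unsigned long long)clock_advance(123456789ULL));
		printf("%llu\n", (unsigned long long)clock_advance(1000ULL)); /* stays put */
		return 0;
	}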
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5ddbd0891267..6988cf08f705 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -285,12 +285,16 @@ static void print_cpu(struct seq_file *m, int cpu)
285 285
286#ifdef CONFIG_SCHEDSTATS 286#ifdef CONFIG_SCHEDSTATS
287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); 287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
288#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
288 289
289 P(yld_count); 290 P(yld_count);
290 291
291 P(sched_switch); 292 P(sched_switch);
292 P(sched_count); 293 P(sched_count);
293 P(sched_goidle); 294 P(sched_goidle);
295#ifdef CONFIG_SMP
296 P64(avg_idle);
297#endif
294 298
295 P(ttwu_count); 299 P(ttwu_count);
296 P(ttwu_local); 300 P(ttwu_local);
@@ -395,6 +399,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
395 PN(se.sum_exec_runtime); 399 PN(se.sum_exec_runtime);
396 PN(se.avg_overlap); 400 PN(se.avg_overlap);
397 PN(se.avg_wakeup); 401 PN(se.avg_wakeup);
402 PN(se.avg_running);
398 403
399 nr_switches = p->nvcsw + p->nivcsw; 404 nr_switches = p->nvcsw + p->nivcsw;
400 405
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index aa7f84121016..f61837ad336d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
384 384
385#ifdef CONFIG_SCHED_DEBUG 385#ifdef CONFIG_SCHED_DEBUG
386int sched_nr_latency_handler(struct ctl_table *table, int write, 386int sched_nr_latency_handler(struct ctl_table *table, int write,
387 struct file *filp, void __user *buffer, size_t *lenp, 387 void __user *buffer, size_t *lenp,
388 loff_t *ppos) 388 loff_t *ppos)
389{ 389{
390 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 390 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
391 391
392 if (ret || !write) 392 if (ret || !write)
393 return ret; 393 return ret;
@@ -513,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
513 if (entity_is_task(curr)) { 513 if (entity_is_task(curr)) {
514 struct task_struct *curtask = task_of(curr); 514 struct task_struct *curtask = task_of(curr);
515 515
516 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
516 cpuacct_charge(curtask, delta_exec); 517 cpuacct_charge(curtask, delta_exec);
517 account_group_exec_runtime(curtask, delta_exec); 518 account_group_exec_runtime(curtask, delta_exec);
518 } 519 }
@@ -709,24 +710,28 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
709 if (initial && sched_feat(START_DEBIT)) 710 if (initial && sched_feat(START_DEBIT))
710 vruntime += sched_vslice(cfs_rq, se); 711 vruntime += sched_vslice(cfs_rq, se);
711 712
712 if (!initial) { 713 /* sleeps up to a single latency don't count. */
713 /* sleeps upto a single latency don't count. */ 714 if (!initial && sched_feat(FAIR_SLEEPERS)) {
714 if (sched_feat(NEW_FAIR_SLEEPERS)) { 715 unsigned long thresh = sysctl_sched_latency;
715 unsigned long thresh = sysctl_sched_latency;
716 716
717 /* 717 /*
718 * Convert the sleeper threshold into virtual time. 718 * Convert the sleeper threshold into virtual time.
719 * SCHED_IDLE is a special sub-class. We care about 719 * SCHED_IDLE is a special sub-class. We care about
720 * fairness only relative to other SCHED_IDLE tasks, 720 * fairness only relative to other SCHED_IDLE tasks,
721 * all of which have the same weight. 721 * all of which have the same weight.
722 */ 722 */
723 if (sched_feat(NORMALIZED_SLEEPER) && 723 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
724 (!entity_is_task(se) || 724 task_of(se)->policy != SCHED_IDLE))
725 task_of(se)->policy != SCHED_IDLE)) 725 thresh = calc_delta_fair(thresh, se);
726 thresh = calc_delta_fair(thresh, se);
727 726
728 vruntime -= thresh; 727 /*
729 } 728 * Halve their sleep time's effect, to allow
729 * for a gentler effect of sleepers:
730 */
731 if (sched_feat(GENTLE_FAIR_SLEEPERS))
732 thresh >>= 1;
733
734 vruntime -= thresh;
730 } 735 }
731 736
732 /* ensure we never gain time by being placed backwards. */ 737 /* ensure we never gain time by being placed backwards. */
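place_entity() above now gates the sleeper credit on FAIR_SLEEPERS and, when GENTLE_FAIR_SLEEPERS is set, hands out only half of it, so a waking task is placed ahead by half a latency period rather than a full one. A quick numeric sketch of that calculation (the 20 ms latency is illustrative only, not the value on every configuration):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t sched_latency_ns = 20000000ULL;	/* 20 ms, illustrative */
		uint64_t thresh = sched_latency_ns;
		int gentle_fair_sleepers = 1;

		if (gentle_fair_sleepers)
			thresh >>= 1;	/* only half the service deficit */

		/* the waking task's vruntime is pulled back by this much */
		printf("sleeper credit: %llu ns\n", (unsigned long long)thresh);
		return 0;
	}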
@@ -757,10 +762,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
757 762
758static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 763static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
759{ 764{
760 if (cfs_rq->last == se) 765 if (!se || cfs_rq->last == se)
761 cfs_rq->last = NULL; 766 cfs_rq->last = NULL;
762 767
763 if (cfs_rq->next == se) 768 if (!se || cfs_rq->next == se)
764 cfs_rq->next = NULL; 769 cfs_rq->next = NULL;
765} 770}
766 771
@@ -817,6 +822,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
817 * re-elected due to buddy favours. 822 * re-elected due to buddy favours.
818 */ 823 */
819 clear_buddies(cfs_rq, curr); 824 clear_buddies(cfs_rq, curr);
825 return;
826 }
827
828 /*
829 * Ensure that a task that missed wakeup preemption by a
830 * narrow margin doesn't have to wait for a full slice.
831 * This also mitigates buddy induced latencies under load.
832 */
833 if (!sched_feat(WAKEUP_PREEMPT))
834 return;
835
836 if (delta_exec < sysctl_sched_min_granularity)
837 return;
838
839 if (cfs_rq->nr_running > 1) {
840 struct sched_entity *se = __pick_next_entity(cfs_rq);
841 s64 delta = curr->vruntime - se->vruntime;
842
843 if (delta > ideal_runtime)
844 resched_task(rq_of(cfs_rq)->curr);
820 } 845 }
821} 846}
822 847
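The tail added to check_preempt_tick() above preempts early once the leftmost waiter already trails the running task's vruntime by more than one ideal slice, so a task that narrowly missed wakeup preemption does not have to wait out the entire slice. A small numeric sketch of that comparison (all values invented):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		int64_t ideal_runtime = 4000000;	/* 4 ms slice, illustrative */
		int64_t curr_vruntime = 109000000;
		int64_t leftmost_vruntime = 104500000;
		int64_t delta = curr_vruntime - leftmost_vruntime;

		/* lag of 4.5 ms exceeds the 4 ms slice, so curr gets rescheduled */
		printf("preempt: %s\n", delta > ideal_runtime ? "yes" : "no");
		return 0;
	}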
@@ -856,12 +881,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
856static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 881static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
857{ 882{
858 struct sched_entity *se = __pick_next_entity(cfs_rq); 883 struct sched_entity *se = __pick_next_entity(cfs_rq);
884 struct sched_entity *left = se;
859 885
860 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) 886 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
861 return cfs_rq->next; 887 se = cfs_rq->next;
862 888
863 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) 889 /*
864 return cfs_rq->last; 890 * Prefer last buddy, try to return the CPU to a preempted task.
891 */
892 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
893 se = cfs_rq->last;
894
895 clear_buddies(cfs_rq, se);
865 896
866 return se; 897 return se;
867} 898}
@@ -1062,83 +1093,6 @@ static void yield_task_fair(struct rq *rq)
1062 se->vruntime = rightmost->vruntime + 1; 1093 se->vruntime = rightmost->vruntime + 1;
1063} 1094}
1064 1095
1065/*
1066 * wake_idle() will wake a task on an idle cpu if task->cpu is
1067 * not idle and an idle cpu is available. The span of cpus to
1068 * search starts with cpus closest then further out as needed,
1069 * so we always favor a closer, idle cpu.
1070 * Domains may include CPUs that are not usable for migration,
1071 * hence we need to mask them out (rq->rd->online)
1072 *
1073 * Returns the CPU we should wake onto.
1074 */
1075#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1076
1077#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
1078
1079static int wake_idle(int cpu, struct task_struct *p)
1080{
1081 struct sched_domain *sd;
1082 int i;
1083 unsigned int chosen_wakeup_cpu;
1084 int this_cpu;
1085 struct rq *task_rq = task_rq(p);
1086
1087 /*
1088 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
1089 * are idle and this is not a kernel thread and this task's affinity
1090 * allows it to be moved to preferred cpu, then just move!
1091 */
1092
1093 this_cpu = smp_processor_id();
1094 chosen_wakeup_cpu =
1095 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
1096
1097 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
1098 idle_cpu(cpu) && idle_cpu(this_cpu) &&
1099 p->mm && !(p->flags & PF_KTHREAD) &&
1100 cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
1101 return chosen_wakeup_cpu;
1102
1103 /*
1104 * If it is idle, then it is the best cpu to run this task.
1105 *
1106 * This cpu is also the best, if it has more than one task already.
1107 * Siblings must be also busy(in most cases) as they didn't already
1108 * pickup the extra load from this cpu and hence we need not check
1109 * sibling runqueue info. This will avoid the checks and cache miss
1110 * penalities associated with that.
1111 */
1112 if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1113 return cpu;
1114
1115 for_each_domain(cpu, sd) {
1116 if ((sd->flags & SD_WAKE_IDLE)
1117 || ((sd->flags & SD_WAKE_IDLE_FAR)
1118 && !task_hot(p, task_rq->clock, sd))) {
1119 for_each_cpu_and(i, sched_domain_span(sd),
1120 &p->cpus_allowed) {
1121 if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
1122 if (i != task_cpu(p)) {
1123 schedstat_inc(p,
1124 se.nr_wakeups_idle);
1125 }
1126 return i;
1127 }
1128 }
1129 } else {
1130 break;
1131 }
1132 }
1133 return cpu;
1134}
1135#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
1136static inline int wake_idle(int cpu, struct task_struct *p)
1137{
1138 return cpu;
1139}
1140#endif
1141
1142#ifdef CONFIG_SMP 1096#ifdef CONFIG_SMP
1143 1097
1144#ifdef CONFIG_FAIR_GROUP_SCHED 1098#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1225,25 +1179,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1225 1179
1226#endif 1180#endif
1227 1181
1228static int 1182static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1229wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1230 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1231 int idx, unsigned long load, unsigned long this_load,
1232 unsigned int imbalance)
1233{ 1183{
1234 struct task_struct *curr = this_rq->curr; 1184 struct task_struct *curr = current;
1235 struct task_group *tg; 1185 unsigned long this_load, load;
1236 unsigned long tl = this_load; 1186 int idx, this_cpu, prev_cpu;
1237 unsigned long tl_per_task; 1187 unsigned long tl_per_task;
1188 unsigned int imbalance;
1189 struct task_group *tg;
1238 unsigned long weight; 1190 unsigned long weight;
1239 int balanced; 1191 int balanced;
1240 1192
1241 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1193 idx = sd->wake_idx;
1242 return 0; 1194 this_cpu = smp_processor_id();
1195 prev_cpu = task_cpu(p);
1196 load = source_load(prev_cpu, idx);
1197 this_load = target_load(this_cpu, idx);
1243 1198
1244 if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || 1199 if (sync) {
1245 p->se.avg_overlap > sysctl_sched_migration_cost)) 1200 if (sched_feat(SYNC_LESS) &&
1246 sync = 0; 1201 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1202 p->se.avg_overlap > sysctl_sched_migration_cost))
1203 sync = 0;
1204 } else {
1205 if (sched_feat(SYNC_MORE) &&
1206 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1207 p->se.avg_overlap < sysctl_sched_migration_cost))
1208 sync = 1;
1209 }
1247 1210
1248 /* 1211 /*
1249 * If sync wakeup then subtract the (maximum possible) 1212 * If sync wakeup then subtract the (maximum possible)
@@ -1254,24 +1217,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1254 tg = task_group(current); 1217 tg = task_group(current);
1255 weight = current->se.load.weight; 1218 weight = current->se.load.weight;
1256 1219
1257 tl += effective_load(tg, this_cpu, -weight, -weight); 1220 this_load += effective_load(tg, this_cpu, -weight, -weight);
1258 load += effective_load(tg, prev_cpu, 0, -weight); 1221 load += effective_load(tg, prev_cpu, 0, -weight);
1259 } 1222 }
1260 1223
1261 tg = task_group(p); 1224 tg = task_group(p);
1262 weight = p->se.load.weight; 1225 weight = p->se.load.weight;
1263 1226
1227 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1228
1264 /* 1229 /*
1265 * In low-load situations, where prev_cpu is idle and this_cpu is idle 1230 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1266 * due to the sync cause above having dropped tl to 0, we'll always have 1231 * due to the sync cause above having dropped this_load to 0, we'll
1267 * an imbalance, but there's really nothing you can do about that, so 1232 * always have an imbalance, but there's really nothing you can do
1268 * that's good too. 1233 * about that, so that's good too.
1269 * 1234 *
1270 * Otherwise check if either cpus are near enough in load to allow this 1235 * Otherwise check if either cpus are near enough in load to allow this
1271 * task to be woken on this_cpu. 1236 * task to be woken on this_cpu.
1272 */ 1237 */
1273 balanced = !tl || 1238 balanced = !this_load ||
1274 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= 1239 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
1275 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1240 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1276 1241
1277 /* 1242 /*
@@ -1285,14 +1250,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1285 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1250 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1286 tl_per_task = cpu_avg_load_per_task(this_cpu); 1251 tl_per_task = cpu_avg_load_per_task(this_cpu);
1287 1252
1288 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= 1253 if (balanced ||
1289 tl_per_task)) { 1254 (this_load <= load &&
1255 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
1290 /* 1256 /*
1291 * This domain has SD_WAKE_AFFINE and 1257 * This domain has SD_WAKE_AFFINE and
1292 * p is cache cold in this domain, and 1258 * p is cache cold in this domain, and
1293 * there is no bad imbalance. 1259 * there is no bad imbalance.
1294 */ 1260 */
1295 schedstat_inc(this_sd, ttwu_move_affine); 1261 schedstat_inc(sd, ttwu_move_affine);
1296 schedstat_inc(p, se.nr_wakeups_affine); 1262 schedstat_inc(p, se.nr_wakeups_affine);
1297 1263
1298 return 1; 1264 return 1;
@@ -1300,65 +1266,271 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1300 return 0; 1266 return 0;
1301} 1267}
1302 1268
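wake_affine() now computes the wake index, both loads and the imbalance factor itself instead of receiving them as parameters, but the core test is unchanged: pull the wakee to the waking CPU if this CPU's load, scaled by roughly 112%, does not exceed the previous CPU's. A simplified numeric sketch with the group-scheduling effective_load() terms dropped (imbalance_pct = 125 is a typical wake-affine domain setting, assumed here):

	#include <stdio.h>

	int main(void)
	{
		unsigned long this_load = 1024, prev_load = 2048; /* load-weight units */
		unsigned int imbalance_pct = 125;			/* assumed */
		unsigned int imbalance = 100 + (imbalance_pct - 100) / 2; /* = 112 */
		int balanced = !this_load ||
			       100UL * this_load <= (unsigned long)imbalance * prev_load;

		printf("imbalance=%u -> pull to waking cpu: %s\n",
		       imbalance, balanced ? "yes" : "no");
		return 0;
	}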
1303static int select_task_rq_fair(struct task_struct *p, int sync) 1269/*
1270 * find_idlest_group finds and returns the least busy CPU group within the
1271 * domain.
1272 */
1273static struct sched_group *
1274find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1275 int this_cpu, int load_idx)
1304{ 1276{
1305 struct sched_domain *sd, *this_sd = NULL; 1277 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1306 int prev_cpu, this_cpu, new_cpu; 1278 unsigned long min_load = ULONG_MAX, this_load = 0;
1307 unsigned long load, this_load; 1279 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1308 struct rq *this_rq; 1280
1309 unsigned int imbalance; 1281 do {
1310 int idx; 1282 unsigned long load, avg_load;
1283 int local_group;
1284 int i;
1285
1286 /* Skip over this group if it has no CPUs allowed */
1287 if (!cpumask_intersects(sched_group_cpus(group),
1288 &p->cpus_allowed))
1289 continue;
1290
1291 local_group = cpumask_test_cpu(this_cpu,
1292 sched_group_cpus(group));
1293
1294 /* Tally up the load of all CPUs in the group */
1295 avg_load = 0;
1296
1297 for_each_cpu(i, sched_group_cpus(group)) {
1298 /* Bias balancing toward cpus of our domain */
1299 if (local_group)
1300 load = source_load(i, load_idx);
1301 else
1302 load = target_load(i, load_idx);
1303
1304 avg_load += load;
1305 }
1306
1307 /* Adjust by relative CPU power of the group */
1308 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1309
1310 if (local_group) {
1311 this_load = avg_load;
1312 this = group;
1313 } else if (avg_load < min_load) {
1314 min_load = avg_load;
1315 idlest = group;
1316 }
1317 } while (group = group->next, group != sd->groups);
1318
1319 if (!idlest || 100*this_load < imbalance*min_load)
1320 return NULL;
1321 return idlest;
1322}
1323
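find_idlest_group() above normalises each group's summed load by its cpu_power and keeps the local group unless a remote one beats it by the imbalance margin, the same 100*this_load versus imbalance*min_load test that the return path shows. A numeric sketch of that comparison (SCHED_LOAD_SCALE = 1024 and imbalance = 112 are assumptions matching the usual defaults):

	#include <stdio.h>

	int main(void)
	{
		const unsigned long SCHED_LOAD_SCALE = 1024;		/* assumed */
		unsigned long this_sum = 3072, this_power = 2048;	/* local group  */
		unsigned long that_sum = 1024, that_power = 1024;	/* remote group */
		unsigned long this_load = this_sum * SCHED_LOAD_SCALE / this_power;
		unsigned long min_load = that_sum * SCHED_LOAD_SCALE / that_power;
		unsigned long imbalance = 112;	/* 100 + (125 - 100) / 2 */

		/* stay local (return NULL) unless the remote group is clearly idler */
		printf("this_load=%lu min_load=%lu -> go remote: %s\n",
		       this_load, min_load,
		       100 * this_load >= imbalance * min_load ? "yes" : "no");
		return 0;
	}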
1324/*
1325 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1326 */
1327static int
1328find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1329{
1330 unsigned long load, min_load = ULONG_MAX;
1331 int idlest = -1;
1332 int i;
1333
1334 /* Traverse only the allowed CPUs */
1335 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
1336 load = weighted_cpuload(i);
1337
1338 if (load < min_load || (load == min_load && i == this_cpu)) {
1339 min_load = load;
1340 idlest = i;
1341 }
1342 }
1343
1344 return idlest;
1345}
1346
1347/*
1348 * Try and locate an idle CPU in the sched_domain.
1349 */
1350static int
1351select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1352{
1353 int cpu = smp_processor_id();
1354 int prev_cpu = task_cpu(p);
1355 int i;
1311 1356
1312 prev_cpu = task_cpu(p); 1357 /*
1313 this_cpu = smp_processor_id(); 1358 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
1314 this_rq = cpu_rq(this_cpu); 1359 * test in select_task_rq_fair) and the prev_cpu is idle then that's
1315 new_cpu = prev_cpu; 1360 * always a better target than the current cpu.
1361 */
1362 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
1363 return prev_cpu;
1316 1364
1317 /* 1365 /*
1318 * 'this_sd' is the first domain that both 1366 * Otherwise, iterate the domain and find an eligible idle cpu.
1319 * this_cpu and prev_cpu are present in:
1320 */ 1367 */
1321 for_each_domain(this_cpu, sd) { 1368 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1322 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { 1369 if (!cpu_rq(i)->cfs.nr_running) {
1323 this_sd = sd; 1370 target = i;
1324 break; 1371 break;
1325 } 1372 }
1326 } 1373 }
1327 1374
1328 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) 1375 return target;
1329 goto out; 1376}
1330 1377
1331 /* 1378/*
1332 * Check for affine wakeup and passive balancing possibilities. 1379 * sched_balance_self: balance the current task (running on cpu) in domains
1333 */ 1380 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1334 if (!this_sd) 1381 * SD_BALANCE_EXEC.
1382 *
1383 * Balance, ie. select the least loaded group.
1384 *
1385 * Returns the target CPU number, or the same CPU if no balancing is needed.
1386 *
1387 * preempt must be disabled.
1388 */
1389static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1390{
1391 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1392 int cpu = smp_processor_id();
1393 int prev_cpu = task_cpu(p);
1394 int new_cpu = cpu;
1395 int want_affine = 0;
1396 int want_sd = 1;
1397 int sync = wake_flags & WF_SYNC;
1398
1399 if (sd_flag & SD_BALANCE_WAKE) {
1400 if (sched_feat(AFFINE_WAKEUPS) &&
1401 cpumask_test_cpu(cpu, &p->cpus_allowed))
1402 want_affine = 1;
1403 new_cpu = prev_cpu;
1404 }
1405
1406 rcu_read_lock();
1407 for_each_domain(cpu, tmp) {
1408 /*
1409 * If power savings logic is enabled for a domain, see if we
1410 * are not overloaded, if so, don't balance wider.
1411 */
1412 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
1413 unsigned long power = 0;
1414 unsigned long nr_running = 0;
1415 unsigned long capacity;
1416 int i;
1417
1418 for_each_cpu(i, sched_domain_span(tmp)) {
1419 power += power_of(i);
1420 nr_running += cpu_rq(i)->cfs.nr_running;
1421 }
1422
1423 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
1424
1425 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1426 nr_running /= 2;
1427
1428 if (nr_running < capacity)
1429 want_sd = 0;
1430 }
1431
1432 /*
1433 * While iterating the domains looking for a spanning
1434 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
1435 * in cache sharing domains along the way.
1436 */
1437 if (want_affine) {
1438 int target = -1;
1439
1440 /*
1441 * If both cpu and prev_cpu are part of this domain,
1442 * cpu is a valid SD_WAKE_AFFINE target.
1443 */
1444 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1445 target = cpu;
1446
1447 /*
1448 * If there's an idle sibling in this domain, make that
1449 * the wake_affine target instead of the current cpu.
1450 */
1451 if (tmp->flags & SD_PREFER_SIBLING)
1452 target = select_idle_sibling(p, tmp, target);
1453
1454 if (target >= 0) {
1455 if (tmp->flags & SD_WAKE_AFFINE) {
1456 affine_sd = tmp;
1457 want_affine = 0;
1458 }
1459 cpu = target;
1460 }
1461 }
1462
1463 if (!want_sd && !want_affine)
1464 break;
1465
1466 if (!(tmp->flags & sd_flag))
1467 continue;
1468
1469 if (want_sd)
1470 sd = tmp;
1471 }
1472
1473 if (sched_feat(LB_SHARES_UPDATE)) {
1474 /*
1475 * Pick the largest domain to update shares over
1476 */
1477 tmp = sd;
1478 if (affine_sd && (!tmp ||
1479 cpumask_weight(sched_domain_span(affine_sd)) >
1480 cpumask_weight(sched_domain_span(sd))))
1481 tmp = affine_sd;
1482
1483 if (tmp)
1484 update_shares(tmp);
1485 }
1486
1487 if (affine_sd && wake_affine(affine_sd, p, sync)) {
1488 new_cpu = cpu;
1335 goto out; 1489 goto out;
1490 }
1336 1491
1337 idx = this_sd->wake_idx; 1492 while (sd) {
1493 int load_idx = sd->forkexec_idx;
1494 struct sched_group *group;
1495 int weight;
1338 1496
1339 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1497 if (!(sd->flags & sd_flag)) {
1498 sd = sd->child;
1499 continue;
1500 }
1340 1501
1341 load = source_load(prev_cpu, idx); 1502 if (sd_flag & SD_BALANCE_WAKE)
1342 this_load = target_load(this_cpu, idx); 1503 load_idx = sd->wake_idx;
1343 1504
1344 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1505 group = find_idlest_group(sd, p, cpu, load_idx);
1345 load, this_load, imbalance)) 1506 if (!group) {
1346 return this_cpu; 1507 sd = sd->child;
1508 continue;
1509 }
1347 1510
1348 /* 1511 new_cpu = find_idlest_cpu(group, p, cpu);
1349 * Start passive balancing when half the imbalance_pct 1512 if (new_cpu == -1 || new_cpu == cpu) {
1350 * limit is reached. 1513 /* Now try balancing at a lower domain level of cpu */
1351 */ 1514 sd = sd->child;
1352 if (this_sd->flags & SD_WAKE_BALANCE) { 1515 continue;
1353 if (imbalance*this_load <= 100*load) { 1516 }
1354 schedstat_inc(this_sd, ttwu_move_balance); 1517
1355 schedstat_inc(p, se.nr_wakeups_passive); 1518 /* Now try balancing at a lower domain level of new_cpu */
1356 return this_cpu; 1519 cpu = new_cpu;
1520 weight = cpumask_weight(sched_domain_span(sd));
1521 sd = NULL;
1522 for_each_domain(cpu, tmp) {
1523 if (weight <= cpumask_weight(sched_domain_span(tmp)))
1524 break;
1525 if (tmp->flags & sd_flag)
1526 sd = tmp;
1357 } 1527 }
1528 /* while loop will break here if sd == NULL */
1358 } 1529 }
1359 1530
1360out: 1531out:
1361 return wake_idle(new_cpu, p); 1532 rcu_read_unlock();
1533 return new_cpu;
1362} 1534}
1363#endif /* CONFIG_SMP */ 1535#endif /* CONFIG_SMP */
1364 1536
@@ -1471,11 +1643,13 @@ static void set_next_buddy(struct sched_entity *se)
1471/* 1643/*
1472 * Preempt the current task with a newly woken task if needed: 1644 * Preempt the current task with a newly woken task if needed:
1473 */ 1645 */
1474static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) 1646static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1475{ 1647{
1476 struct task_struct *curr = rq->curr; 1648 struct task_struct *curr = rq->curr;
1477 struct sched_entity *se = &curr->se, *pse = &p->se; 1649 struct sched_entity *se = &curr->se, *pse = &p->se;
1478 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1650 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1651 int sync = wake_flags & WF_SYNC;
1652 int scale = cfs_rq->nr_running >= sched_nr_latency;
1479 1653
1480 update_curr(cfs_rq); 1654 update_curr(cfs_rq);
1481 1655
@@ -1490,18 +1664,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1490 if (unlikely(se == pse)) 1664 if (unlikely(se == pse))
1491 return; 1665 return;
1492 1666
1493 /* 1667 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
1494 * Only set the backward buddy when the current task is still on the 1668 set_next_buddy(pse);
1495 * rq. This can happen when a wakeup gets interleaved with schedule on
1496 * the ->pre_schedule() or idle_balance() point, either of which can
1497 * drop the rq lock.
1498 *
1499 * Also, during early boot the idle thread is in the fair class, for
1500 * obvious reasons its a bad idea to schedule back to the idle thread.
1501 */
1502 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1503 set_last_buddy(se);
1504 set_next_buddy(pse);
1505 1669
1506 /* 1670 /*
1507 * We can come here with TIF_NEED_RESCHED already set from new task 1671 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1523,22 +1687,45 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1523 return; 1687 return;
1524 } 1688 }
1525 1689
1526 if (!sched_feat(WAKEUP_PREEMPT)) 1690 if ((sched_feat(WAKEUP_SYNC) && sync) ||
1527 return; 1691 (sched_feat(WAKEUP_OVERLAP) &&
1528 1692 (se->avg_overlap < sysctl_sched_migration_cost &&
1529 if (sched_feat(WAKEUP_OVERLAP) && (sync || 1693 pse->avg_overlap < sysctl_sched_migration_cost))) {
1530 (se->avg_overlap < sysctl_sched_migration_cost &&
1531 pse->avg_overlap < sysctl_sched_migration_cost))) {
1532 resched_task(curr); 1694 resched_task(curr);
1533 return; 1695 return;
1534 } 1696 }
1535 1697
1698 if (sched_feat(WAKEUP_RUNNING)) {
1699 if (pse->avg_running < se->avg_running) {
1700 set_next_buddy(pse);
1701 resched_task(curr);
1702 return;
1703 }
1704 }
1705
1706 if (!sched_feat(WAKEUP_PREEMPT))
1707 return;
1708
1536 find_matching_se(&se, &pse); 1709 find_matching_se(&se, &pse);
1537 1710
1538 BUG_ON(!pse); 1711 BUG_ON(!pse);
1539 1712
1540 if (wakeup_preempt_entity(se, pse) == 1) 1713 if (wakeup_preempt_entity(se, pse) == 1) {
1541 resched_task(curr); 1714 resched_task(curr);
1715 /*
1716 * Only set the backward buddy when the current task is still
1717 * on the rq. This can happen when a wakeup gets interleaved
1718 * with schedule on the ->pre_schedule() or idle_balance()
1719 * point, either of which can drop the rq lock.
1720 *
1721 * Also, during early boot the idle thread is in the fair class,
1722 * for obvious reasons it's a bad idea to schedule back to it.
1723 */
1724 if (unlikely(!se->on_rq || curr == rq->idle))
1725 return;
1726 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1727 set_last_buddy(se);
1728 }
1542} 1729}
1543 1730
1544static struct task_struct *pick_next_task_fair(struct rq *rq) 1731static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1547,16 +1734,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1547 struct cfs_rq *cfs_rq = &rq->cfs; 1734 struct cfs_rq *cfs_rq = &rq->cfs;
1548 struct sched_entity *se; 1735 struct sched_entity *se;
1549 1736
1550 if (unlikely(!cfs_rq->nr_running)) 1737 if (!cfs_rq->nr_running)
1551 return NULL; 1738 return NULL;
1552 1739
1553 do { 1740 do {
1554 se = pick_next_entity(cfs_rq); 1741 se = pick_next_entity(cfs_rq);
1555 /*
1556 * If se was a buddy, clear it so that it will have to earn
1557 * the favour again.
1558 */
1559 __clear_buddies(cfs_rq, se);
1560 set_next_entity(cfs_rq, se); 1742 set_next_entity(cfs_rq, se);
1561 cfs_rq = group_cfs_rq(se); 1743 cfs_rq = group_cfs_rq(se);
1562 } while (cfs_rq); 1744 } while (cfs_rq);
@@ -1832,6 +2014,25 @@ static void moved_group_fair(struct task_struct *p)
1832} 2014}
1833#endif 2015#endif
1834 2016
2017unsigned int get_rr_interval_fair(struct task_struct *task)
2018{
2019 struct sched_entity *se = &task->se;
2020 unsigned long flags;
2021 struct rq *rq;
2022 unsigned int rr_interval = 0;
2023
2024 /*
2025 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
2026 * idle runqueue:
2027 */
2028 rq = task_rq_lock(task, &flags);
2029 if (rq->cfs.load.weight)
2030 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
2031 task_rq_unlock(rq, &flags);
2032
2033 return rr_interval;
2034}
2035
1835/* 2036/*
1836 * All the scheduling class methods: 2037 * All the scheduling class methods:
1837 */ 2038 */
@@ -1860,6 +2061,8 @@ static const struct sched_class fair_sched_class = {
1860 .prio_changed = prio_changed_fair, 2061 .prio_changed = prio_changed_fair,
1861 .switched_to = switched_to_fair, 2062 .switched_to = switched_to_fair,
1862 2063
2064 .get_rr_interval = get_rr_interval_fair,
2065
1863#ifdef CONFIG_FAIR_GROUP_SCHED 2066#ifdef CONFIG_FAIR_GROUP_SCHED
1864 .moved_group = moved_group_fair, 2067 .moved_group = moved_group_fair,
1865#endif 2068#endif
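
The new ->get_rr_interval hook wired into fair_sched_class above is what the sched_rr_get_interval(2) syscall path uses to report a timeslice; for SCHED_OTHER it is the CFS slice converted to jiffies via NS_TO_JIFFIES(). A small, hedged userspace sketch for observing it; it only assumes a Linux system with the POSIX scheduling API, and the printed value depends on the running kernel, the policy and runqueue load (0 on an otherwise idle runqueue, per the comment in get_rr_interval_fair()):

/* Userspace sketch: read the timeslice reported through
 * sched_rr_get_interval(2), which the ->get_rr_interval() hooks in this
 * series feed.  Values depend on kernel, policy and load. */
#include <sched.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        struct timespec ts;

        if (sched_rr_get_interval(getpid(), &ts) != 0) {
                perror("sched_rr_get_interval");
                return 1;
        }
        printf("reported timeslice: %ld.%09ld s\n",
               (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}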
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index e2dc63a5815d..0d94083582c7 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,17 +1,123 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 0) 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart.
12 */
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14
15/*
16 * By not normalizing the sleep time, heavy tasks get an effectively
17 * longer period, and lighter tasks an effectively shorter period, during
18 * which they are considered running.
19 */
2SCHED_FEAT(NORMALIZED_SLEEPER, 0) 20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1) 21
4SCHED_FEAT(WAKEUP_PREEMPT, 1) 22/*
23 * Place new tasks ahead so that they do not starve already running
24 * tasks
25 */
5SCHED_FEAT(START_DEBIT, 1) 26SCHED_FEAT(START_DEBIT, 1)
27
28/*
29 * Should wakeups try to preempt running tasks.
30 */
31SCHED_FEAT(WAKEUP_PREEMPT, 1)
32
33/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint, pipes and the like use this to indicate
63 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see
65 * also AFFINE_WAKEUPS.
66 */
67SCHED_FEAT(SYNC_WAKEUPS, 1)
68
69/*
70 * Based on load and program behaviour, see if it makes sense to place
71 * a newly woken task on the same cpu as the task that woke it --
72 * improve cache locality. Typically used with SYNC wakeups as
73 * generated by pipes and the like, see also SYNC_WAKEUPS.
74 */
6SCHED_FEAT(AFFINE_WAKEUPS, 1) 75SCHED_FEAT(AFFINE_WAKEUPS, 1)
76
77/*
78 * Weaken SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_LESS, 1)
81
82/*
83 * Add SYNC hint based on overlap
84 */
85SCHED_FEAT(SYNC_MORE, 0)
86
87/*
88 * Prefer to schedule the task we woke last (assuming it failed
89 * wakeup-preemption), since it's likely going to consume data we
90 * touched, increases cache locality.
91 */
92SCHED_FEAT(NEXT_BUDDY, 0)
93
94/*
95 * Prefer to schedule the task that ran last (when we did
96 * wake-preempt) as that likely will touch the same data, increases
97 * cache locality.
98 */
99SCHED_FEAT(LAST_BUDDY, 1)
100
101/*
102 * Consider buddies to be cache hot, decreases the likelihood of a
103 * cache buddy being migrated away, increases cache locality.
104 */
7SCHED_FEAT(CACHE_HOT_BUDDY, 1) 105SCHED_FEAT(CACHE_HOT_BUDDY, 1)
8SCHED_FEAT(SYNC_WAKEUPS, 1) 106
107/*
108 * Use arch dependent cpu power functions
109 */
110SCHED_FEAT(ARCH_POWER, 0)
111
9SCHED_FEAT(HRTICK, 0) 112SCHED_FEAT(HRTICK, 0)
10SCHED_FEAT(DOUBLE_TICK, 0) 113SCHED_FEAT(DOUBLE_TICK, 0)
11SCHED_FEAT(ASYM_GRAN, 1)
12SCHED_FEAT(LB_BIAS, 1) 114SCHED_FEAT(LB_BIAS, 1)
13SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 115SCHED_FEAT(LB_SHARES_UPDATE, 1)
14SCHED_FEAT(ASYM_EFF_LOAD, 1) 116SCHED_FEAT(ASYM_EFF_LOAD, 1)
15SCHED_FEAT(WAKEUP_OVERLAP, 0) 117
16SCHED_FEAT(LAST_BUDDY, 1) 118/*
119 * Spin-wait on mutex acquisition when the mutex owner is running on
120 * another cpu -- assumes that when the owner is running, it will soon
121 * release the lock. Decreases scheduling overhead.
122 */
17SCHED_FEAT(OWNER_SPIN, 1) 123SCHED_FEAT(OWNER_SPIN, 1)
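
The SCHED_FEAT() entries above are not used directly; kernel/sched.c typically includes this header more than once with different definitions of the macro, building an enum of feature bits and a default feature mask, and sched_feat(x) then tests the corresponding bit (with CONFIG_SCHED_DEBUG the bits can usually be flipped at runtime through /sys/kernel/debug/sched_features, e.g. by writing NO_NEXT_BUDDY). A minimal sketch of that expansion pattern, assumed rather than copied from sched.c:

/* Sketch of the usual SCHED_FEAT() consumption pattern: one include pass
 * builds the feature bit numbers, a second builds the default enabled
 * mask, and sched_feat() tests a bit in that mask. */
#define SCHED_FEAT(name, enabled)       __SCHED_FEAT_##name,
enum {
#include "sched_features.h"
};
#undef SCHED_FEAT

#define SCHED_FEAT(name, enabled)       ((enabled) << __SCHED_FEAT_##name) |
static unsigned int sysctl_sched_features =
#include "sched_features.h"
        0;
#undef SCHED_FEAT

#define sched_feat(x)   (sysctl_sched_features & (1U << __SCHED_FEAT_##x))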
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 499672c10cbd..b133a28fcde3 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sync) 9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
10{ 10{
 11 return task_cpu(p); /* IDLE tasks are never migrated */ 11 return task_cpu(p); /* IDLE tasks are never migrated */
12} 12}
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
@@ -97,6 +97,11 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 97 check_preempt_curr(rq, p, 0);
98} 98}
99 99
100unsigned int get_rr_interval_idle(struct task_struct *task)
101{
102 return 0;
103}
104
100/* 105/*
101 * Simple, special scheduling class for the per-CPU idle tasks: 106 * Simple, special scheduling class for the per-CPU idle tasks:
102 */ 107 */
@@ -122,6 +127,8 @@ static const struct sched_class idle_sched_class = {
122 .set_curr_task = set_curr_task_idle, 127 .set_curr_task = set_curr_task_idle,
123 .task_tick = task_tick_idle, 128 .task_tick = task_tick_idle,
124 129
130 .get_rr_interval = get_rr_interval_idle,
131
125 .prio_changed = prio_changed_idle, 132 .prio_changed = prio_changed_idle,
126 .switched_to = switched_to_idle, 133 .switched_to = switched_to_idle,
127 134
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2eb4bd6a526c..5c5fef378415 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -938,10 +938,13 @@ static void yield_task_rt(struct rq *rq)
938#ifdef CONFIG_SMP 938#ifdef CONFIG_SMP
939static int find_lowest_rq(struct task_struct *task); 939static int find_lowest_rq(struct task_struct *task);
940 940
941static int select_task_rq_rt(struct task_struct *p, int sync) 941static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
942{ 942{
943 struct rq *rq = task_rq(p); 943 struct rq *rq = task_rq(p);
944 944
945 if (sd_flag != SD_BALANCE_WAKE)
946 return smp_processor_id();
947
945 /* 948 /*
946 * If the current task is an RT task, then 949 * If the current task is an RT task, then
947 * try to see if we can wake this RT task up on another 950 * try to see if we can wake this RT task up on another
@@ -999,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
999/* 1002/*
1000 * Preempt the current task with a newly woken task if needed: 1003 * Preempt the current task with a newly woken task if needed:
1001 */ 1004 */
1002static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) 1005static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1003{ 1006{
1004 if (p->prio < rq->curr->prio) { 1007 if (p->prio < rq->curr->prio) {
1005 resched_task(rq->curr); 1008 resched_task(rq->curr);
@@ -1150,29 +1153,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1150 1153
1151static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1154static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1152 1155
1153static inline int pick_optimal_cpu(int this_cpu,
1154 const struct cpumask *mask)
1155{
1156 int first;
1157
1158 /* "this_cpu" is cheaper to preempt than a remote processor */
1159 if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
1160 return this_cpu;
1161
1162 first = cpumask_first(mask);
1163 if (first < nr_cpu_ids)
1164 return first;
1165
1166 return -1;
1167}
1168
1169static int find_lowest_rq(struct task_struct *task) 1156static int find_lowest_rq(struct task_struct *task)
1170{ 1157{
1171 struct sched_domain *sd; 1158 struct sched_domain *sd;
1172 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); 1159 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1173 int this_cpu = smp_processor_id(); 1160 int this_cpu = smp_processor_id();
1174 int cpu = task_cpu(task); 1161 int cpu = task_cpu(task);
1175 cpumask_var_t domain_mask;
1176 1162
1177 if (task->rt.nr_cpus_allowed == 1) 1163 if (task->rt.nr_cpus_allowed == 1)
1178 return -1; /* No other targets possible */ 1164 return -1; /* No other targets possible */
@@ -1195,28 +1181,26 @@ static int find_lowest_rq(struct task_struct *task)
1195 * Otherwise, we consult the sched_domains span maps to figure 1181 * Otherwise, we consult the sched_domains span maps to figure
1196 * out which cpu is logically closest to our hot cache data. 1182 * out which cpu is logically closest to our hot cache data.
1197 */ 1183 */
1198 if (this_cpu == cpu) 1184 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1199 this_cpu = -1; /* Skip this_cpu opt if the same */ 1185 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1200
1201 if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
1202 for_each_domain(cpu, sd) {
1203 if (sd->flags & SD_WAKE_AFFINE) {
1204 int best_cpu;
1205 1186
1206 cpumask_and(domain_mask, 1187 for_each_domain(cpu, sd) {
1207 sched_domain_span(sd), 1188 if (sd->flags & SD_WAKE_AFFINE) {
1208 lowest_mask); 1189 int best_cpu;
1209 1190
1210 best_cpu = pick_optimal_cpu(this_cpu, 1191 /*
1211 domain_mask); 1192 * "this_cpu" is cheaper to preempt than a
1212 1193 * remote processor.
1213 if (best_cpu != -1) { 1194 */
1214 free_cpumask_var(domain_mask); 1195 if (this_cpu != -1 &&
1215 return best_cpu; 1196 cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
1216 } 1197 return this_cpu;
1217 } 1198
1199 best_cpu = cpumask_first_and(lowest_mask,
1200 sched_domain_span(sd));
1201 if (best_cpu < nr_cpu_ids)
1202 return best_cpu;
1218 } 1203 }
1219 free_cpumask_var(domain_mask);
1220 } 1204 }
1221 1205
1222 /* 1206 /*
@@ -1224,7 +1208,13 @@ static int find_lowest_rq(struct task_struct *task)
1224 * just give the caller *something* to work with from the compatible 1208 * just give the caller *something* to work with from the compatible
1225 * locations. 1209 * locations.
1226 */ 1210 */
1227 return pick_optimal_cpu(this_cpu, lowest_mask); 1211 if (this_cpu != -1)
1212 return this_cpu;
1213
1214 cpu = cpumask_any(lowest_mask);
1215 if (cpu < nr_cpu_ids)
1216 return cpu;
1217 return -1;
1228} 1218}
1229 1219
1230/* Will lock the rq it finds */ 1220/* Will lock the rq it finds */
@@ -1731,6 +1721,17 @@ static void set_curr_task_rt(struct rq *rq)
1731 dequeue_pushable_task(rq, p); 1721 dequeue_pushable_task(rq, p);
1732} 1722}
1733 1723
1724unsigned int get_rr_interval_rt(struct task_struct *task)
1725{
1726 /*
1727 * Time slice is 0 for SCHED_FIFO tasks
1728 */
1729 if (task->policy == SCHED_RR)
1730 return DEF_TIMESLICE;
1731 else
1732 return 0;
1733}
1734
1734static const struct sched_class rt_sched_class = { 1735static const struct sched_class rt_sched_class = {
1735 .next = &fair_sched_class, 1736 .next = &fair_sched_class,
1736 .enqueue_task = enqueue_task_rt, 1737 .enqueue_task = enqueue_task_rt,
@@ -1759,6 +1760,8 @@ static const struct sched_class rt_sched_class = {
1759 .set_curr_task = set_curr_task_rt, 1760 .set_curr_task = set_curr_task_rt,
1760 .task_tick = task_tick_rt, 1761 .task_tick = task_tick_rt,
1761 1762
1763 .get_rr_interval = get_rr_interval_rt,
1764
1762 .prio_changed = prio_changed_rt, 1765 .prio_changed = prio_changed_rt,
1763 .switched_to = switched_to_rt, 1766 .switched_to = switched_to_rt,
1764}; 1767};
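
In the reworked find_lowest_rq() above, the temporary cpumask and its GFP_ATOMIC allocation disappear because cpumask_first_and() can scan the conjunction of the lowest-priority mask and the domain span directly. A minimal sketch of that idiom; the function and parameter names are illustrative only:

/* Sketch of the cpumask idiom used by the new find_lowest_rq():
 * cpumask_first_and() walks the AND of two masks without needing a
 * temporary cpumask_var_t. */
#include <linux/cpumask.h>

static int first_lowest_cpu_in_span(const struct cpumask *lowest_mask,
                                    const struct cpumask *domain_span)
{
        int cpu = cpumask_first_and(lowest_mask, domain_span);

        return cpu < nr_cpu_ids ? cpu : -1;     /* -1: no suitable CPU here */
}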
diff --git a/kernel/signal.c b/kernel/signal.c
index 64c5deeaca5d..6b982f2cf524 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,12 +22,14 @@
22#include <linux/ptrace.h> 22#include <linux/ptrace.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/signalfd.h> 24#include <linux/signalfd.h>
25#include <linux/ratelimit.h>
25#include <linux/tracehook.h> 26#include <linux/tracehook.h>
26#include <linux/capability.h> 27#include <linux/capability.h>
27#include <linux/freezer.h> 28#include <linux/freezer.h>
28#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
29#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
30#include <trace/events/sched.h> 31#define CREATE_TRACE_POINTS
32#include <trace/events/signal.h>
31 33
32#include <asm/param.h> 34#include <asm/param.h>
33#include <asm/uaccess.h> 35#include <asm/uaccess.h>
@@ -41,6 +43,8 @@
41 43
42static struct kmem_cache *sigqueue_cachep; 44static struct kmem_cache *sigqueue_cachep;
43 45
46int print_fatal_signals __read_mostly;
47
44static void __user *sig_handler(struct task_struct *t, int sig) 48static void __user *sig_handler(struct task_struct *t, int sig)
45{ 49{
46 return t->sighand->action[sig - 1].sa.sa_handler; 50 return t->sighand->action[sig - 1].sa.sa_handler;
@@ -159,7 +163,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
159{ 163{
160 unsigned long i, *s, *m, x; 164 unsigned long i, *s, *m, x;
161 int sig = 0; 165 int sig = 0;
162 166
163 s = pending->signal.sig; 167 s = pending->signal.sig;
164 m = mask->sig; 168 m = mask->sig;
165 switch (_NSIG_WORDS) { 169 switch (_NSIG_WORDS) {
@@ -184,17 +188,31 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
184 sig = ffz(~x) + 1; 188 sig = ffz(~x) + 1;
185 break; 189 break;
186 } 190 }
187 191
188 return sig; 192 return sig;
189} 193}
190 194
195static inline void print_dropped_signal(int sig)
196{
197 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
198
199 if (!print_fatal_signals)
200 return;
201
202 if (!__ratelimit(&ratelimit_state))
203 return;
204
205 printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
206 current->comm, current->pid, sig);
207}
208
191/* 209/*
192 * allocate a new signal queue record 210 * allocate a new signal queue record
193 * - this may be called without locks if and only if t == current, otherwise an 211 * - this may be called without locks if and only if t == current, otherwise an
 194 * appropriate lock must be held to stop the target task from exiting 212 * appropriate lock must be held to stop the target task from exiting
195 */ 213 */
196static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, 214static struct sigqueue *
197 int override_rlimit) 215__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
198{ 216{
199 struct sigqueue *q = NULL; 217 struct sigqueue *q = NULL;
200 struct user_struct *user; 218 struct user_struct *user;
@@ -207,10 +225,15 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
207 */ 225 */
208 user = get_uid(__task_cred(t)->user); 226 user = get_uid(__task_cred(t)->user);
209 atomic_inc(&user->sigpending); 227 atomic_inc(&user->sigpending);
228
210 if (override_rlimit || 229 if (override_rlimit ||
211 atomic_read(&user->sigpending) <= 230 atomic_read(&user->sigpending) <=
212 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) 231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) {
213 q = kmem_cache_alloc(sigqueue_cachep, flags); 232 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else {
234 print_dropped_signal(sig);
235 }
236
214 if (unlikely(q == NULL)) { 237 if (unlikely(q == NULL)) {
215 atomic_dec(&user->sigpending); 238 atomic_dec(&user->sigpending);
216 free_uid(user); 239 free_uid(user);
@@ -705,7 +728,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
705 728
706 if (why) { 729 if (why) {
707 /* 730 /*
708 * The first thread which returns from finish_stop() 731 * The first thread which returns from do_signal_stop()
709 * will take ->siglock, notice SIGNAL_CLD_MASK, and 732 * will take ->siglock, notice SIGNAL_CLD_MASK, and
710 * notify its parent. See get_signal_to_deliver(). 733 * notify its parent. See get_signal_to_deliver().
711 */ 734 */
@@ -834,7 +857,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
834 struct sigqueue *q; 857 struct sigqueue *q;
835 int override_rlimit; 858 int override_rlimit;
836 859
837 trace_sched_signal_send(sig, t); 860 trace_signal_generate(sig, info, t);
838 861
839 assert_spin_locked(&t->sighand->siglock); 862 assert_spin_locked(&t->sighand->siglock);
840 863
@@ -869,7 +892,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
869 else 892 else
870 override_rlimit = 0; 893 override_rlimit = 0;
871 894
872 q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, 895 q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
873 override_rlimit); 896 override_rlimit);
874 if (q) { 897 if (q) {
875 list_add_tail(&q->list, &pending->list); 898 list_add_tail(&q->list, &pending->list);
@@ -896,12 +919,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
896 break; 919 break;
897 } 920 }
898 } else if (!is_si_special(info)) { 921 } else if (!is_si_special(info)) {
899 if (sig >= SIGRTMIN && info->si_code != SI_USER) 922 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
900 /* 923 /*
901 * Queue overflow, abort. We may abort if the signal was rt 924 * Queue overflow, abort. We may abort if the
902 * and sent by user using something other than kill(). 925 * signal was rt and sent by user using something
903 */ 926 * other than kill().
927 */
928 trace_signal_overflow_fail(sig, group, info);
904 return -EAGAIN; 929 return -EAGAIN;
930 } else {
931 /*
932 * This is a silent loss of information. We still
933 * send the signal, but the *info bits are lost.
934 */
935 trace_signal_lose_info(sig, group, info);
936 }
905 } 937 }
906 938
907out_set: 939out_set:
@@ -925,8 +957,6 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
925 return __send_signal(sig, info, t, group, from_ancestor_ns); 957 return __send_signal(sig, info, t, group, from_ancestor_ns);
926} 958}
927 959
928int print_fatal_signals;
929
930static void print_fatal_signal(struct pt_regs *regs, int signr) 960static void print_fatal_signal(struct pt_regs *regs, int signr)
931{ 961{
932 printk("%s/%d: potentially unexpected fatal signal %d.\n", 962 printk("%s/%d: potentially unexpected fatal signal %d.\n",
@@ -971,6 +1001,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
971 return send_signal(sig, info, t, 0); 1001 return send_signal(sig, info, t, 0);
972} 1002}
973 1003
1004int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
1005 bool group)
1006{
1007 unsigned long flags;
1008 int ret = -ESRCH;
1009
1010 if (lock_task_sighand(p, &flags)) {
1011 ret = send_signal(sig, info, p, group);
1012 unlock_task_sighand(p, &flags);
1013 }
1014
1015 return ret;
1016}
1017
974/* 1018/*
975 * Force a signal that the process can't ignore: if necessary 1019 * Force a signal that the process can't ignore: if necessary
976 * we unblock the signal and change any SIG_IGN to SIG_DFL. 1020 * we unblock the signal and change any SIG_IGN to SIG_DFL.
@@ -1036,12 +1080,6 @@ void zap_other_threads(struct task_struct *p)
1036 } 1080 }
1037} 1081}
1038 1082
1039int __fatal_signal_pending(struct task_struct *tsk)
1040{
1041 return sigismember(&tsk->pending.signal, SIGKILL);
1042}
1043EXPORT_SYMBOL(__fatal_signal_pending);
1044
1045struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1083struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
1046{ 1084{
1047 struct sighand_struct *sighand; 1085 struct sighand_struct *sighand;
@@ -1068,18 +1106,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1068 */ 1106 */
1069int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1107int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1070{ 1108{
1071 unsigned long flags; 1109 int ret = check_kill_permission(sig, info, p);
1072 int ret;
1073
1074 ret = check_kill_permission(sig, info, p);
1075 1110
1076 if (!ret && sig) { 1111 if (!ret && sig)
1077 ret = -ESRCH; 1112 ret = do_send_sig_info(sig, info, p, true);
1078 if (lock_task_sighand(p, &flags)) {
1079 ret = __group_send_sig_info(sig, info, p);
1080 unlock_task_sighand(p, &flags);
1081 }
1082 }
1083 1113
1084 return ret; 1114 return ret;
1085} 1115}
@@ -1224,15 +1254,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1224 * These are for backward compatibility with the rest of the kernel source. 1254 * These are for backward compatibility with the rest of the kernel source.
1225 */ 1255 */
1226 1256
1227/*
1228 * The caller must ensure the task can't exit.
1229 */
1230int 1257int
1231send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1258send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1232{ 1259{
1233 int ret;
1234 unsigned long flags;
1235
1236 /* 1260 /*
1237 * Make sure legacy kernel users don't send in bad values 1261 * Make sure legacy kernel users don't send in bad values
1238 * (normal paths check this in check_kill_permission). 1262 * (normal paths check this in check_kill_permission).
@@ -1240,10 +1264,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1240 if (!valid_signal(sig)) 1264 if (!valid_signal(sig))
1241 return -EINVAL; 1265 return -EINVAL;
1242 1266
1243 spin_lock_irqsave(&p->sighand->siglock, flags); 1267 return do_send_sig_info(sig, info, p, false);
1244 ret = specific_send_sig_info(sig, info, p);
1245 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1246 return ret;
1247} 1268}
1248 1269
1249#define __si_special(priv) \ 1270#define __si_special(priv) \
@@ -1302,19 +1323,19 @@ EXPORT_SYMBOL(kill_pid);
1302 * These functions support sending signals using preallocated sigqueue 1323 * These functions support sending signals using preallocated sigqueue
1303 * structures. This is needed "because realtime applications cannot 1324 * structures. This is needed "because realtime applications cannot
1304 * afford to lose notifications of asynchronous events, like timer 1325 * afford to lose notifications of asynchronous events, like timer
1305 * expirations or I/O completions". In the case of Posix Timers 1326 * expirations or I/O completions". In the case of Posix Timers
1306 * we allocate the sigqueue structure from the timer_create. If this 1327 * we allocate the sigqueue structure from the timer_create. If this
1307 * allocation fails we are able to report the failure to the application 1328 * allocation fails we are able to report the failure to the application
1308 * with an EAGAIN error. 1329 * with an EAGAIN error.
1309 */ 1330 */
1310
1311struct sigqueue *sigqueue_alloc(void) 1331struct sigqueue *sigqueue_alloc(void)
1312{ 1332{
1313 struct sigqueue *q; 1333 struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
1314 1334
1315 if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) 1335 if (q)
1316 q->flags |= SIGQUEUE_PREALLOC; 1336 q->flags |= SIGQUEUE_PREALLOC;
1317 return(q); 1337
1338 return q;
1318} 1339}
1319 1340
1320void sigqueue_free(struct sigqueue *q) 1341void sigqueue_free(struct sigqueue *q)
@@ -1383,15 +1404,6 @@ ret:
1383} 1404}
1384 1405
1385/* 1406/*
1386 * Wake up any threads in the parent blocked in wait* syscalls.
1387 */
1388static inline void __wake_up_parent(struct task_struct *p,
1389 struct task_struct *parent)
1390{
1391 wake_up_interruptible_sync(&parent->signal->wait_chldexit);
1392}
1393
1394/*
1395 * Let a parent know about the death of a child. 1407 * Let a parent know about the death of a child.
1396 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1408 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1397 * 1409 *
@@ -1673,29 +1685,6 @@ void ptrace_notify(int exit_code)
1673 spin_unlock_irq(&current->sighand->siglock); 1685 spin_unlock_irq(&current->sighand->siglock);
1674} 1686}
1675 1687
1676static void
1677finish_stop(int stop_count)
1678{
1679 /*
1680 * If there are no other threads in the group, or if there is
1681 * a group stop in progress and we are the last to stop,
1682 * report to the parent. When ptraced, every thread reports itself.
1683 */
1684 if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
1685 read_lock(&tasklist_lock);
1686 do_notify_parent_cldstop(current, CLD_STOPPED);
1687 read_unlock(&tasklist_lock);
1688 }
1689
1690 do {
1691 schedule();
1692 } while (try_to_freeze());
1693 /*
1694 * Now we don't run again until continued.
1695 */
1696 current->exit_code = 0;
1697}
1698
1699/* 1688/*
1700 * This performs the stopping for SIGSTOP and other stop signals. 1689 * This performs the stopping for SIGSTOP and other stop signals.
1701 * We have to stop all threads in the thread group. 1690 * We have to stop all threads in the thread group.
@@ -1705,15 +1694,9 @@ finish_stop(int stop_count)
1705static int do_signal_stop(int signr) 1694static int do_signal_stop(int signr)
1706{ 1695{
1707 struct signal_struct *sig = current->signal; 1696 struct signal_struct *sig = current->signal;
1708 int stop_count; 1697 int notify;
1709 1698
1710 if (sig->group_stop_count > 0) { 1699 if (!sig->group_stop_count) {
1711 /*
1712 * There is a group stop in progress. We don't need to
1713 * start another one.
1714 */
1715 stop_count = --sig->group_stop_count;
1716 } else {
1717 struct task_struct *t; 1700 struct task_struct *t;
1718 1701
1719 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1702 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
@@ -1725,7 +1708,7 @@ static int do_signal_stop(int signr)
1725 */ 1708 */
1726 sig->group_exit_code = signr; 1709 sig->group_exit_code = signr;
1727 1710
1728 stop_count = 0; 1711 sig->group_stop_count = 1;
1729 for (t = next_thread(current); t != current; t = next_thread(t)) 1712 for (t = next_thread(current); t != current; t = next_thread(t))
1730 /* 1713 /*
1731 * Setting state to TASK_STOPPED for a group 1714 * Setting state to TASK_STOPPED for a group
@@ -1734,19 +1717,44 @@ static int do_signal_stop(int signr)
1734 */ 1717 */
1735 if (!(t->flags & PF_EXITING) && 1718 if (!(t->flags & PF_EXITING) &&
1736 !task_is_stopped_or_traced(t)) { 1719 !task_is_stopped_or_traced(t)) {
1737 stop_count++; 1720 sig->group_stop_count++;
1738 signal_wake_up(t, 0); 1721 signal_wake_up(t, 0);
1739 } 1722 }
1740 sig->group_stop_count = stop_count;
1741 } 1723 }
1724 /*
1725 * If there are no other threads in the group, or if there is
1726 * a group stop in progress and we are the last to stop, report
1727 * to the parent. When ptraced, every thread reports itself.
1728 */
1729 notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0;
1730 notify = tracehook_notify_jctl(notify, CLD_STOPPED);
1731 /*
1732 * tracehook_notify_jctl() can drop and reacquire siglock, so
1733 * we keep ->group_stop_count != 0 before the call. If SIGCONT
1734 * or SIGKILL comes in between, ->group_stop_count will be 0.
1735 */
1736 if (sig->group_stop_count) {
1737 if (!--sig->group_stop_count)
1738 sig->flags = SIGNAL_STOP_STOPPED;
1739 current->exit_code = sig->group_exit_code;
1740 __set_current_state(TASK_STOPPED);
1741 }
1742 spin_unlock_irq(&current->sighand->siglock);
1742 1743
1743 if (stop_count == 0) 1744 if (notify) {
1744 sig->flags = SIGNAL_STOP_STOPPED; 1745 read_lock(&tasklist_lock);
1745 current->exit_code = sig->group_exit_code; 1746 do_notify_parent_cldstop(current, notify);
1746 __set_current_state(TASK_STOPPED); 1747 read_unlock(&tasklist_lock);
1748 }
1749
1750 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1751 do {
1752 schedule();
1753 } while (try_to_freeze());
1754
1755 tracehook_finish_jctl();
1756 current->exit_code = 0;
1747 1757
1748 spin_unlock_irq(&current->sighand->siglock);
1749 finish_stop(stop_count);
1750 return 1; 1758 return 1;
1751} 1759}
1752 1760
@@ -1815,14 +1823,15 @@ relock:
1815 int why = (signal->flags & SIGNAL_STOP_CONTINUED) 1823 int why = (signal->flags & SIGNAL_STOP_CONTINUED)
1816 ? CLD_CONTINUED : CLD_STOPPED; 1824 ? CLD_CONTINUED : CLD_STOPPED;
1817 signal->flags &= ~SIGNAL_CLD_MASK; 1825 signal->flags &= ~SIGNAL_CLD_MASK;
1818 spin_unlock_irq(&sighand->siglock);
1819 1826
1820 if (unlikely(!tracehook_notify_jctl(1, why))) 1827 why = tracehook_notify_jctl(why, CLD_CONTINUED);
1821 goto relock; 1828 spin_unlock_irq(&sighand->siglock);
1822 1829
1823 read_lock(&tasklist_lock); 1830 if (why) {
1824 do_notify_parent_cldstop(current->group_leader, why); 1831 read_lock(&tasklist_lock);
1825 read_unlock(&tasklist_lock); 1832 do_notify_parent_cldstop(current->group_leader, why);
1833 read_unlock(&tasklist_lock);
1834 }
1826 goto relock; 1835 goto relock;
1827 } 1836 }
1828 1837
@@ -1860,6 +1869,9 @@ relock:
1860 ka = &sighand->action[signr-1]; 1869 ka = &sighand->action[signr-1];
1861 } 1870 }
1862 1871
1872 /* Trace actually delivered signals. */
1873 trace_signal_deliver(signr, info, ka);
1874
1863 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1875 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1864 continue; 1876 continue;
1865 if (ka->sa.sa_handler != SIG_DFL) { 1877 if (ka->sa.sa_handler != SIG_DFL) {
@@ -1987,14 +1999,14 @@ void exit_signals(struct task_struct *tsk)
1987 if (unlikely(tsk->signal->group_stop_count) && 1999 if (unlikely(tsk->signal->group_stop_count) &&
1988 !--tsk->signal->group_stop_count) { 2000 !--tsk->signal->group_stop_count) {
1989 tsk->signal->flags = SIGNAL_STOP_STOPPED; 2001 tsk->signal->flags = SIGNAL_STOP_STOPPED;
1990 group_stop = 1; 2002 group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
1991 } 2003 }
1992out: 2004out:
1993 spin_unlock_irq(&tsk->sighand->siglock); 2005 spin_unlock_irq(&tsk->sighand->siglock);
1994 2006
1995 if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { 2007 if (unlikely(group_stop)) {
1996 read_lock(&tasklist_lock); 2008 read_lock(&tasklist_lock);
1997 do_notify_parent_cldstop(tsk, CLD_STOPPED); 2009 do_notify_parent_cldstop(tsk, group_stop);
1998 read_unlock(&tasklist_lock); 2010 read_unlock(&tasklist_lock);
1999 } 2011 }
2000} 2012}
@@ -2290,7 +2302,6 @@ static int
2290do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) 2302do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2291{ 2303{
2292 struct task_struct *p; 2304 struct task_struct *p;
2293 unsigned long flags;
2294 int error = -ESRCH; 2305 int error = -ESRCH;
2295 2306
2296 rcu_read_lock(); 2307 rcu_read_lock();
@@ -2300,14 +2311,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2300 /* 2311 /*
2301 * The null signal is a permissions and process existence 2312 * The null signal is a permissions and process existence
2302 * probe. No signal is actually delivered. 2313 * probe. No signal is actually delivered.
2303 *
2304 * If lock_task_sighand() fails we pretend the task dies
2305 * after receiving the signal. The window is tiny, and the
2306 * signal is private anyway.
2307 */ 2314 */
2308 if (!error && sig && lock_task_sighand(p, &flags)) { 2315 if (!error && sig) {
2309 error = specific_send_sig_info(sig, info, p); 2316 error = do_send_sig_info(sig, info, p, false);
2310 unlock_task_sighand(p, &flags); 2317 /*
2318 * If lock_task_sighand() failed we pretend the task
2319 * dies after receiving the signal. The window is tiny,
2320 * and the signal is private anyway.
2321 */
2322 if (unlikely(error == -ESRCH))
2323 error = 0;
2311 } 2324 }
2312 } 2325 }
2313 rcu_read_unlock(); 2326 rcu_read_unlock();
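
The print_dropped_signal() helper added near the top of this file is the standard <linux/ratelimit.h> throttle: at most `burst` messages are emitted per `interval` jiffies and the rest are silently suppressed. A minimal sketch of the same pattern with illustrative names:

/* Sketch of the ratelimit pattern used by print_dropped_signal():
 * DEFINE_RATELIMIT_STATE(name, interval, burst) plus __ratelimit()
 * allows at most `burst` messages per `interval`. */
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ratelimit.h>

static void example_report_drop(int value)
{
        static DEFINE_RATELIMIT_STATE(rs, 5 * HZ, 10);

        if (!__ratelimit(&rs))
                return;

        printk(KERN_INFO "example: dropped item %d\n", value);
}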
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
new file mode 100644
index 000000000000..e45c43645298
--- /dev/null
+++ b/kernel/slow-work-debugfs.c
@@ -0,0 +1,227 @@
1/* Slow work debugging
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/slow-work.h>
14#include <linux/fs.h>
15#include <linux/time.h>
16#include <linux/seq_file.h>
17#include "slow-work.h"
18
19#define ITERATOR_SHIFT (BITS_PER_LONG - 4)
20#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT)
21#define ITERATOR_COUNTER (~ITERATOR_SELECTOR)
22
23void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
24{
25 seq_puts(m, "Slow-work: New thread");
26}
27
28/*
29 * Render the time mark field on a work item into a 5-char time with units plus
30 * a space
31 */
32static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
33{
34 struct timespec now, diff;
35
36 now = CURRENT_TIME;
37 diff = timespec_sub(now, work->mark);
38
39 if (diff.tv_sec < 0)
40 seq_puts(m, " -ve ");
41 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
42 seq_printf(m, "%3luns ", diff.tv_nsec);
43 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
44 seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
45 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
46 seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
47 else if (diff.tv_sec <= 1)
48 seq_puts(m, " 1s ");
49 else if (diff.tv_sec < 60)
50 seq_printf(m, "%4lus ", diff.tv_sec);
51 else if (diff.tv_sec < 60 * 60)
52 seq_printf(m, "%4lum ", diff.tv_sec / 60);
53 else if (diff.tv_sec < 60 * 60 * 24)
54 seq_printf(m, "%4luh ", diff.tv_sec / 3600);
55 else
56 seq_puts(m, "exces ");
57}
58
59/*
60 * Describe a slow work item for debugfs
61 */
62static int slow_work_runqueue_show(struct seq_file *m, void *v)
63{
64 struct slow_work *work;
65 struct list_head *p = v;
66 unsigned long id;
67
68 switch ((unsigned long) v) {
69 case 1:
70 seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n");
71 return 0;
72 case 2:
73 seq_puts(m, "=== ===== ================ == ===== ==========\n");
74 return 0;
75
76 case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
77 id = (unsigned long) v - 3;
78
79 read_lock(&slow_work_execs_lock);
80 work = slow_work_execs[id];
81 if (work) {
82 smp_read_barrier_depends();
83
84 seq_printf(m, "%3lu %5d %16p %2lx ",
85 id, slow_work_pids[id], work, work->flags);
86 slow_work_print_mark(m, work);
87
88 if (work->ops->desc)
89 work->ops->desc(work, m);
90 seq_putc(m, '\n');
91 }
92 read_unlock(&slow_work_execs_lock);
93 return 0;
94
95 default:
96 work = list_entry(p, struct slow_work, link);
97 seq_printf(m, "%3s - %16p %2lx ",
98 work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
99 work, work->flags);
100 slow_work_print_mark(m, work);
101
102 if (work->ops->desc)
103 work->ops->desc(work, m);
104 seq_putc(m, '\n');
105 return 0;
106 }
107}
108
109/*
110 * map the iterator to a work item
111 */
112static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
113{
114 struct list_head *p;
115 unsigned long count, id;
116
117 switch (*_pos >> ITERATOR_SHIFT) {
118 case 0x0:
119 if (*_pos == 0)
120 *_pos = 1;
121 if (*_pos < 3)
122 return (void *)(unsigned long) *_pos;
123 if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
124 for (id = *_pos - 3;
125 id < SLOW_WORK_THREAD_LIMIT;
126 id++, (*_pos)++)
127 if (slow_work_execs[id])
128 return (void *)(unsigned long) *_pos;
129 *_pos = 0x1UL << ITERATOR_SHIFT;
130
131 case 0x1:
132 count = *_pos & ITERATOR_COUNTER;
133 list_for_each(p, &slow_work_queue) {
134 if (count == 0)
135 return p;
136 count--;
137 }
138 *_pos = 0x2UL << ITERATOR_SHIFT;
139
140 case 0x2:
141 count = *_pos & ITERATOR_COUNTER;
142 list_for_each(p, &vslow_work_queue) {
143 if (count == 0)
144 return p;
145 count--;
146 }
147 *_pos = 0x3UL << ITERATOR_SHIFT;
148
149 default:
150 return NULL;
151 }
152}
153
154/*
155 * set up the iterator to start reading from the first line
156 */
157static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
158{
159 spin_lock_irq(&slow_work_queue_lock);
160 return slow_work_runqueue_index(m, _pos);
161}
162
163/*
164 * move to the next line
165 */
166static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
167{
168 struct list_head *p = v;
169 unsigned long selector = *_pos >> ITERATOR_SHIFT;
170
171 (*_pos)++;
172 switch (selector) {
173 case 0x0:
174 return slow_work_runqueue_index(m, _pos);
175
176 case 0x1:
177 if (*_pos >> ITERATOR_SHIFT == 0x1) {
178 p = p->next;
179 if (p != &slow_work_queue)
180 return p;
181 }
182 *_pos = 0x2UL << ITERATOR_SHIFT;
183 p = &vslow_work_queue;
184
185 case 0x2:
186 if (*_pos >> ITERATOR_SHIFT == 0x2) {
187 p = p->next;
188 if (p != &vslow_work_queue)
189 return p;
190 }
191 *_pos = 0x3UL << ITERATOR_SHIFT;
192
193 default:
194 return NULL;
195 }
196}
197
198/*
199 * clean up after reading
200 */
201static void slow_work_runqueue_stop(struct seq_file *m, void *v)
202{
203 spin_unlock_irq(&slow_work_queue_lock);
204}
205
206static const struct seq_operations slow_work_runqueue_ops = {
207 .start = slow_work_runqueue_start,
208 .stop = slow_work_runqueue_stop,
209 .next = slow_work_runqueue_next,
210 .show = slow_work_runqueue_show,
211};
212
213/*
214 * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
215 */
216static int slow_work_runqueue_open(struct inode *inode, struct file *file)
217{
218 return seq_open(file, &slow_work_runqueue_ops);
219}
220
221const struct file_operations slow_work_runqueue_fops = {
222 .owner = THIS_MODULE,
223 .open = slow_work_runqueue_open,
224 .read = seq_read,
225 .llseek = seq_lseek,
226 .release = seq_release,
227};
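
The runqueue file above follows the usual seq_file contract: ->start() takes the relevant lock and maps *pos to a record, ->next() advances, ->show() renders one line, ->stop() drops the lock, and seq_open() binds the seq_operations to the file. A self-contained sketch of that contract with purely illustrative names (no slow-work internals); the "records" are just the numbers 0..3, offset by one so ->start() never returns NULL for a valid position:

/* Sketch of the seq_file iterator contract used by the runqueue file. */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/seq_file.h>

static void *example_start(struct seq_file *m, loff_t *pos)
{
        /* a real implementation would take its lock here */
        return *pos < 4 ? (void *)(unsigned long)(*pos + 1) : NULL;
}

static void *example_next(struct seq_file *m, void *v, loff_t *pos)
{
        (*pos)++;
        return example_start(m, pos);
}

static void example_stop(struct seq_file *m, void *v)
{
        /* ... and drop the lock here */
}

static int example_show(struct seq_file *m, void *v)
{
        seq_printf(m, "record %lu\n", (unsigned long)v - 1);
        return 0;
}

static const struct seq_operations example_seq_ops = {
        .start  = example_start,
        .next   = example_next,
        .stop   = example_stop,
        .show   = example_show,
};

static int example_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &example_seq_ops);
}

static const struct file_operations example_fops = {
        .owner          = THIS_MODULE,
        .open           = example_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
};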
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 09d7519557d3..00889bd3c590 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -16,20 +16,17 @@
16#include <linux/kthread.h> 16#include <linux/kthread.h>
17#include <linux/freezer.h> 17#include <linux/freezer.h>
18#include <linux/wait.h> 18#include <linux/wait.h>
19 19#include <linux/debugfs.h>
20#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of 20#include "slow-work.h"
21 * things to do */
22#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
23 * OOM */
24 21
25static void slow_work_cull_timeout(unsigned long); 22static void slow_work_cull_timeout(unsigned long);
26static void slow_work_oom_timeout(unsigned long); 23static void slow_work_oom_timeout(unsigned long);
27 24
28#ifdef CONFIG_SYSCTL 25#ifdef CONFIG_SYSCTL
29static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, 26static int slow_work_min_threads_sysctl(struct ctl_table *, int,
30 void __user *, size_t *, loff_t *); 27 void __user *, size_t *, loff_t *);
31 28
32static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, 29static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
33 void __user *, size_t *, loff_t *); 30 void __user *, size_t *, loff_t *);
34#endif 31#endif
35 32
@@ -46,7 +43,7 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process
46 43
47#ifdef CONFIG_SYSCTL 44#ifdef CONFIG_SYSCTL
48static const int slow_work_min_min_threads = 2; 45static const int slow_work_min_min_threads = 2;
49static int slow_work_max_max_threads = 255; 46static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
50static const int slow_work_min_vslow = 1; 47static const int slow_work_min_vslow = 1;
51static const int slow_work_max_vslow = 99; 48static const int slow_work_max_vslow = 99;
52 49
@@ -98,6 +95,56 @@ static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
98static struct slow_work slow_work_new_thread; /* new thread starter */ 95static struct slow_work slow_work_new_thread; /* new thread starter */
99 96
100/* 97/*
98 * slow work ID allocation (use slow_work_queue_lock)
99 */
100static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
101
102/*
103 * Unregistration tracking to prevent put_ref() from disappearing during module
104 * unload
105 */
106#ifdef CONFIG_MODULES
107static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
108static struct module *slow_work_unreg_module;
109static struct slow_work *slow_work_unreg_work_item;
110static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
111static DEFINE_MUTEX(slow_work_unreg_sync_lock);
112
113static void slow_work_set_thread_processing(int id, struct slow_work *work)
114{
115 if (work)
116 slow_work_thread_processing[id] = work->owner;
117}
118static void slow_work_done_thread_processing(int id, struct slow_work *work)
119{
120 struct module *module = slow_work_thread_processing[id];
121
122 slow_work_thread_processing[id] = NULL;
123 smp_mb();
124 if (slow_work_unreg_work_item == work ||
125 slow_work_unreg_module == module)
126 wake_up_all(&slow_work_unreg_wq);
127}
128static void slow_work_clear_thread_processing(int id)
129{
130 slow_work_thread_processing[id] = NULL;
131}
132#else
133static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
134static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
135static void slow_work_clear_thread_processing(int id) {}
136#endif
137
138/*
139 * Data for tracking currently executing items for indication through /proc
140 */
141#ifdef CONFIG_SLOW_WORK_DEBUG
142struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
143pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
144DEFINE_RWLOCK(slow_work_execs_lock);
145#endif
146
147/*
101 * The queues of work items and the lock governing access to them. These are 148 * The queues of work items and the lock governing access to them. These are
102 * shared between all the CPUs. It doesn't make sense to have per-CPU queues 149 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
103 * as the number of threads bears no relation to the number of CPUs. 150 * as the number of threads bears no relation to the number of CPUs.
@@ -105,9 +152,18 @@ static struct slow_work slow_work_new_thread; /* new thread starter */
105 * There are two queues of work items: one for slow work items, and one for 152 * There are two queues of work items: one for slow work items, and one for
106 * very slow work items. 153 * very slow work items.
107 */ 154 */
108static LIST_HEAD(slow_work_queue); 155LIST_HEAD(slow_work_queue);
109static LIST_HEAD(vslow_work_queue); 156LIST_HEAD(vslow_work_queue);
110static DEFINE_SPINLOCK(slow_work_queue_lock); 157DEFINE_SPINLOCK(slow_work_queue_lock);
158
159/*
160 * The following are two wait queues that get pinged when a work item is placed
161 * on an empty queue. These allow work items that are hogging a thread by
162 * sleeping in a way that could be deferred to yield their thread and enqueue
163 * themselves.
164 */
165static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
166static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
111 167
112/* 168/*
113 * The thread controls. A variable used to signal to the threads that they 169 * The thread controls. A variable used to signal to the threads that they
@@ -126,6 +182,20 @@ static DECLARE_COMPLETION(slow_work_last_thread_exited);
126static int slow_work_user_count; 182static int slow_work_user_count;
127static DEFINE_MUTEX(slow_work_user_lock); 183static DEFINE_MUTEX(slow_work_user_lock);
128 184
185static inline int slow_work_get_ref(struct slow_work *work)
186{
187 if (work->ops->get_ref)
188 return work->ops->get_ref(work);
189
190 return 0;
191}
192
193static inline void slow_work_put_ref(struct slow_work *work)
194{
195 if (work->ops->put_ref)
196 work->ops->put_ref(work);
197}
198
129/* 199/*
130 * Calculate the maximum number of active threads in the pool that are 200 * Calculate the maximum number of active threads in the pool that are
131 * permitted to process very slow work items. 201 * permitted to process very slow work items.
@@ -149,7 +219,7 @@ static unsigned slow_work_calc_vsmax(void)
149 * Attempt to execute stuff queued on a slow thread. Return true if we managed 219 * Attempt to execute stuff queued on a slow thread. Return true if we managed
150 * it, false if there was nothing to do. 220 * it, false if there was nothing to do.
151 */ 221 */
152static bool slow_work_execute(void) 222static noinline bool slow_work_execute(int id)
153{ 223{
154 struct slow_work *work = NULL; 224 struct slow_work *work = NULL;
155 unsigned vsmax; 225 unsigned vsmax;
@@ -186,6 +256,13 @@ static bool slow_work_execute(void)
186 } else { 256 } else {
187 very_slow = false; /* avoid the compiler warning */ 257 very_slow = false; /* avoid the compiler warning */
188 } 258 }
259
260 slow_work_set_thread_processing(id, work);
261 if (work) {
262 slow_work_mark_time(work);
263 slow_work_begin_exec(id, work);
264 }
265
189 spin_unlock_irq(&slow_work_queue_lock); 266 spin_unlock_irq(&slow_work_queue_lock);
190 267
191 if (!work) 268 if (!work)
@@ -194,12 +271,19 @@ static bool slow_work_execute(void)
194 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) 271 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
195 BUG(); 272 BUG();
196 273
197 work->ops->execute(work); 274 /* don't execute if the work is in the process of being cancelled */
275 if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
276 work->ops->execute(work);
198 277
199 if (very_slow) 278 if (very_slow)
200 atomic_dec(&vslow_work_executing_count); 279 atomic_dec(&vslow_work_executing_count);
201 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); 280 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
202 281
282 /* wake up anyone waiting for this work to be complete */
283 wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
284
285 slow_work_end_exec(id, work);
286
203 /* if someone tried to enqueue the item whilst we were executing it, 287 /* if someone tried to enqueue the item whilst we were executing it,
204 * then it'll be left unenqueued to avoid multiple threads trying to 288 * then it'll be left unenqueued to avoid multiple threads trying to
205 * execute it simultaneously 289 * execute it simultaneously
@@ -219,7 +303,10 @@ static bool slow_work_execute(void)
219 spin_unlock_irq(&slow_work_queue_lock); 303 spin_unlock_irq(&slow_work_queue_lock);
220 } 304 }
221 305
222 work->ops->put_ref(work); 306 /* sort out the race between module unloading and put_ref() */
307 slow_work_put_ref(work);
308 slow_work_done_thread_processing(id, work);
309
223 return true; 310 return true;
224 311
225auto_requeue: 312auto_requeue:
@@ -227,15 +314,61 @@ auto_requeue:
227 * - we transfer our ref on the item back to the appropriate queue 314 * - we transfer our ref on the item back to the appropriate queue
228 * - don't wake another thread up as we're awake already 315 * - don't wake another thread up as we're awake already
229 */ 316 */
317 slow_work_mark_time(work);
230 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) 318 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
231 list_add_tail(&work->link, &vslow_work_queue); 319 list_add_tail(&work->link, &vslow_work_queue);
232 else 320 else
233 list_add_tail(&work->link, &slow_work_queue); 321 list_add_tail(&work->link, &slow_work_queue);
234 spin_unlock_irq(&slow_work_queue_lock); 322 spin_unlock_irq(&slow_work_queue_lock);
323 slow_work_clear_thread_processing(id);
235 return true; 324 return true;
236} 325}
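
slow_work_execute() now pairs clear_bit_unlock() on SLOW_WORK_EXECUTING with wake_up_bit(), which is what lets slow_work_cancel() further down sleep in wait_on_bit() until execution completes. A minimal sketch of that generic wait/wake-on-bit pairing; the bit number, flag word and function names are illustrative:

/* Sketch of the wait_on_bit()/wake_up_bit() pairing behind the
 * EXECUTING flag handling. */
#include <linux/bitops.h>
#include <linux/sched.h>
#include <linux/wait.h>

#define EXAMPLE_BUSY    0       /* illustrative bit in the flags word */

static int example_bit_wait(void *word)
{
        schedule();
        return 0;
}

/* waiter: sleep until EXAMPLE_BUSY is cleared by the owner */
static void example_wait_until_idle(unsigned long *flags)
{
        wait_on_bit(flags, EXAMPLE_BUSY, example_bit_wait,
                    TASK_UNINTERRUPTIBLE);
}

/* owner: release the bit and wake any waiters */
static void example_set_idle(unsigned long *flags)
{
        clear_bit_unlock(EXAMPLE_BUSY, flags);
        smp_mb__after_clear_bit();
        wake_up_bit(flags, EXAMPLE_BUSY);
}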
237 326
238/** 327/**
328 * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
329 * work: The work item under execution that wants to sleep
330 * _timeout: Scheduler sleep timeout
331 *
332 * Allow a requeueable work item to sleep on a slow-work processor thread until
333 * that thread is needed to do some other work or the sleep is interrupted by
334 * some other event.
335 *
336 * The caller must set up a wake up event before calling this and must have set
337 * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
338 * condition before calling this function as no test is made here.
339 *
340 * False is returned if there is nothing on the queue; true is returned if the
341 * work item should be requeued
342 */
343bool slow_work_sleep_till_thread_needed(struct slow_work *work,
344 signed long *_timeout)
345{
346 wait_queue_head_t *wfo_wq;
347 struct list_head *queue;
348
349 DEFINE_WAIT(wait);
350
351 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
352 wfo_wq = &vslow_work_queue_waits_for_occupation;
353 queue = &vslow_work_queue;
354 } else {
355 wfo_wq = &slow_work_queue_waits_for_occupation;
356 queue = &slow_work_queue;
357 }
358
359 if (!list_empty(queue))
360 return true;
361
362 add_wait_queue_exclusive(wfo_wq, &wait);
363 if (list_empty(queue))
364 *_timeout = schedule_timeout(*_timeout);
365 finish_wait(wfo_wq, &wait);
366
367 return !list_empty(queue);
368}
369EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
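
Per the kerneldoc above, a requeueable item's execute routine can offer its thread back to the pool: prepare a wait on its own event, test its condition, and if slow_work_sleep_till_thread_needed() reports that the thread is wanted, back off and requeue itself. The sketch below is an assumption drawn from that contract, not a copy of any in-tree user; the wait queue, the readiness flag and the requeue step are all illustrative:

/* Hypothetical ->execute() that yields its thread when other work is
 * queued.  example_wq, example_ready and the requeue via
 * slow_work_enqueue() are illustrative assumptions. */
#include <linux/jiffies.h>
#include <linux/sched.h>
#include <linux/slow-work.h>
#include <linux/types.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static bool example_ready;

static void example_execute(struct slow_work *work)
{
        signed long timeout = HZ;
        DEFINE_WAIT(wait);

        prepare_to_wait(&example_wq, &wait, TASK_UNINTERRUPTIBLE);
        if (!example_ready &&
            slow_work_sleep_till_thread_needed(work, &timeout)) {
                /* another item wants this thread: requeue and back off */
                finish_wait(&example_wq, &wait);
                slow_work_enqueue(work);
                return;
        }
        finish_wait(&example_wq, &wait);

        /* ... perform the actual slow work here ... */
}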
370
371/**
239 * slow_work_enqueue - Schedule a slow work item for processing 372 * slow_work_enqueue - Schedule a slow work item for processing
240 * @work: The work item to queue 373 * @work: The work item to queue
241 * 374 *
@@ -260,16 +393,22 @@ auto_requeue:
260 * allowed to pick items to execute. This ensures that very slow items won't 393 * allowed to pick items to execute. This ensures that very slow items won't
261 * overly block ones that are just ordinarily slow. 394 * overly block ones that are just ordinarily slow.
262 * 395 *
263 * Returns 0 if successful, -EAGAIN if not. 396 * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is
397 * attempted to be queued)
264 */ 398 */
265int slow_work_enqueue(struct slow_work *work) 399int slow_work_enqueue(struct slow_work *work)
266{ 400{
401 wait_queue_head_t *wfo_wq;
402 struct list_head *queue;
267 unsigned long flags; 403 unsigned long flags;
404 int ret;
405
406 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
407 return -ECANCELED;
268 408
269 BUG_ON(slow_work_user_count <= 0); 409 BUG_ON(slow_work_user_count <= 0);
270 BUG_ON(!work); 410 BUG_ON(!work);
271 BUG_ON(!work->ops); 411 BUG_ON(!work->ops);
272 BUG_ON(!work->ops->get_ref);
273 412
274 /* when honouring an enqueue request, we only promise that we will run 413 /* when honouring an enqueue request, we only promise that we will run
275 * the work function in the future; we do not promise to run it once 414 * the work function in the future; we do not promise to run it once
@@ -280,8 +419,19 @@ int slow_work_enqueue(struct slow_work *work)
280 * maintaining our promise 419 * maintaining our promise
281 */ 420 */
282 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { 421 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
422 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
423 wfo_wq = &vslow_work_queue_waits_for_occupation;
424 queue = &vslow_work_queue;
425 } else {
426 wfo_wq = &slow_work_queue_waits_for_occupation;
427 queue = &slow_work_queue;
428 }
429
283 spin_lock_irqsave(&slow_work_queue_lock, flags); 430 spin_lock_irqsave(&slow_work_queue_lock, flags);
284 431
432 if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
433 goto cancelled;
434
285 /* we promise that we will not attempt to execute the work 435 /* we promise that we will not attempt to execute the work
286 * function in more than one thread simultaneously 436 * function in more than one thread simultaneously
287 * 437 *
@@ -299,25 +449,221 @@ int slow_work_enqueue(struct slow_work *work)
299 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { 449 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
300 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); 450 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
301 } else { 451 } else {
302 if (work->ops->get_ref(work) < 0) 452 ret = slow_work_get_ref(work);
303 goto cant_get_ref; 453 if (ret < 0)
304 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) 454 goto failed;
305 list_add_tail(&work->link, &vslow_work_queue); 455 slow_work_mark_time(work);
306 else 456 list_add_tail(&work->link, queue);
307 list_add_tail(&work->link, &slow_work_queue);
308 wake_up(&slow_work_thread_wq); 457 wake_up(&slow_work_thread_wq);
458
459 /* if someone who could be requeued is sleeping on a
460 * thread, then ask them to yield their thread */
461 if (work->link.prev == queue)
462 wake_up(wfo_wq);
309 } 463 }
310 464
311 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 465 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
312 } 466 }
313 return 0; 467 return 0;
314 468
315cant_get_ref: 469cancelled:
470 ret = -ECANCELED;
471failed:
316 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 472 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
317 return -EAGAIN; 473 return ret;
318} 474}
319EXPORT_SYMBOL(slow_work_enqueue); 475EXPORT_SYMBOL(slow_work_enqueue);
320 476
477static int slow_work_wait(void *word)
478{
479 schedule();
480 return 0;
481}
482
483/**
484 * slow_work_cancel - Cancel a slow work item
485 * @work: The work item to cancel
486 *
487 * This function will cancel a previously enqueued work item. If we cannot
488 * cancel the work item, it is guaranteed to have run when this function
489 * returns.
490 */
491void slow_work_cancel(struct slow_work *work)
492{
493 bool wait = true, put = false;
494
495 set_bit(SLOW_WORK_CANCELLING, &work->flags);
496 smp_mb();
497
498 /* if the work item is a delayed work item with an active timer, we
499 * need to wait for the timer to finish _before_ getting the spinlock,
500 * lest we deadlock against the timer routine
501 *
502 * the timer routine will leave DELAYED set if it notices the
503 * CANCELLING flag in time
504 */
505 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
506 struct delayed_slow_work *dwork =
507 container_of(work, struct delayed_slow_work, work);
508 del_timer_sync(&dwork->timer);
509 }
510
511 spin_lock_irq(&slow_work_queue_lock);
512
513 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
514 /* the timer routine aborted or never happened, so we are left
515 * holding the timer's reference on the item and should just
516 * drop the pending flag and wait for any ongoing execution to
517 * finish */
518 struct delayed_slow_work *dwork =
519 container_of(work, struct delayed_slow_work, work);
520
521 BUG_ON(timer_pending(&dwork->timer));
522 BUG_ON(!list_empty(&work->link));
523
524 clear_bit(SLOW_WORK_DELAYED, &work->flags);
525 put = true;
526 clear_bit(SLOW_WORK_PENDING, &work->flags);
527
528 } else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
529 !list_empty(&work->link)) {
530 /* the link in the pending queue holds a reference on the item
531 * that we will need to release */
532 list_del_init(&work->link);
533 wait = false;
534 put = true;
535 clear_bit(SLOW_WORK_PENDING, &work->flags);
536
537 } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
538 /* the executor is holding our only reference on the item, so
539 * we merely need to wait for it to finish executing */
540 clear_bit(SLOW_WORK_PENDING, &work->flags);
541 }
542
543 spin_unlock_irq(&slow_work_queue_lock);
544
545 /* the EXECUTING flag is set by the executor whilst the spinlock is set
546 * and before the item is dequeued - so assuming the above doesn't
547 * actually dequeue it, simply waiting for the EXECUTING flag to be
548 * released here should be sufficient */
549 if (wait)
550 wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
551 TASK_UNINTERRUPTIBLE);
552
553 clear_bit(SLOW_WORK_CANCELLING, &work->flags);
554 if (put)
555 slow_work_put_ref(work);
556}
557EXPORT_SYMBOL(slow_work_cancel);
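For context, a minimal caller-side sketch of the enqueue/cancel pair shown above. The structure and callback names (my_object, my_get_ref, my_put_ref, my_execute) are hypothetical; only slow_work_init(), slow_work_enqueue(), slow_work_cancel() and the new .owner field of slow_work_ops are taken from this diff, and error handling is abbreviated.

	/* Hedged sketch of a hypothetical slow-work user after this patch. */
	#include <linux/module.h>
	#include <linux/slow-work.h>

	struct my_object {
		struct slow_work work;
		/* ... user data ... */
	};

	static int my_get_ref(struct slow_work *work)
	{
		/* pin the containing object; return 0 on success */
		return 0;
	}

	static void my_put_ref(struct slow_work *work)
	{
		/* drop the reference taken in my_get_ref() */
	}

	static void my_execute(struct slow_work *work)
	{
		struct my_object *obj = container_of(work, struct my_object, work);
		/* long-running work happens here; the thread may sleep */
		(void)obj;
	}

	static const struct slow_work_ops my_slow_work_ops = {
		.owner   = THIS_MODULE,		/* field added by this series */
		.get_ref = my_get_ref,
		.put_ref = my_put_ref,
		.execute = my_execute,
	};

	static void my_submit(struct my_object *obj)
	{
		int ret;

		slow_work_init(&obj->work, &my_slow_work_ops);
		ret = slow_work_enqueue(&obj->work);
		if (ret == -ECANCELED)
			;	/* a concurrent slow_work_cancel() beat us to it */
		else if (ret < 0)
			;	/* get_ref() failed; the item was not queued */
	}

	static void my_teardown(struct my_object *obj)
	{
		/* blocks until the item is off the queues and not executing */
		slow_work_cancel(&obj->work);
	}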
558
559/*
560 * Handle expiry of the delay timer, indicating that a delayed slow work item
561 * should now be queued if not cancelled
562 */
563static void delayed_slow_work_timer(unsigned long data)
564{
565 wait_queue_head_t *wfo_wq;
566 struct list_head *queue;
567 struct slow_work *work = (struct slow_work *) data;
568 unsigned long flags;
569 bool queued = false, put = false, first = false;
570
571 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
572 wfo_wq = &vslow_work_queue_waits_for_occupation;
573 queue = &vslow_work_queue;
574 } else {
575 wfo_wq = &slow_work_queue_waits_for_occupation;
576 queue = &slow_work_queue;
577 }
578
579 spin_lock_irqsave(&slow_work_queue_lock, flags);
580 if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
581 clear_bit(SLOW_WORK_DELAYED, &work->flags);
582
583 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
584 /* we discard the reference the timer was holding in
585 * favour of the one the executor holds */
586 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
587 put = true;
588 } else {
589 slow_work_mark_time(work);
590 list_add_tail(&work->link, queue);
591 queued = true;
592 if (work->link.prev == queue)
593 first = true;
594 }
595 }
596
597 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
598 if (put)
599 slow_work_put_ref(work);
600 if (first)
601 wake_up(wfo_wq);
602 if (queued)
603 wake_up(&slow_work_thread_wq);
604}
605
606/**
607 * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
608 * @dwork: The delayed work item to queue
609 * @delay: When to start executing the work, in jiffies from now
610 *
611 * This is similar to slow_work_enqueue(), but it adds a delay before the work
612 * is actually queued for processing.
613 *
614 * The item can have delayed processing requested on it whilst it is being
615 * executed. The delay will begin immediately, and if it expires before the
616 * item finishes executing, the item will be placed back on the queue when it
617 * has done executing.
618 */
619int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
620 unsigned long delay)
621{
622 struct slow_work *work = &dwork->work;
623 unsigned long flags;
624 int ret;
625
626 if (delay == 0)
627 return slow_work_enqueue(&dwork->work);
628
629 BUG_ON(slow_work_user_count <= 0);
630 BUG_ON(!work);
631 BUG_ON(!work->ops);
632
633 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
634 return -ECANCELED;
635
636 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
637 spin_lock_irqsave(&slow_work_queue_lock, flags);
638
639 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
640 goto cancelled;
641
642 /* the timer holds a reference whilst it is pending */
643 ret = work->ops->get_ref(work);
644 if (ret < 0)
645 goto cant_get_ref;
646
647 if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
648 BUG();
649 dwork->timer.expires = jiffies + delay;
650 dwork->timer.data = (unsigned long) work;
651 dwork->timer.function = delayed_slow_work_timer;
652 add_timer(&dwork->timer);
653
654 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
655 }
656
657 return 0;
658
659cancelled:
660 ret = -ECANCELED;
661cant_get_ref:
662 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
663 return ret;
664}
665EXPORT_SYMBOL(delayed_slow_work_enqueue);
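And a companion sketch for the delayed variant, reusing the hypothetical my_slow_work_ops from the earlier sketch. delayed_slow_work_init() is assumed to exist with this two-argument form in the matching header; only delayed_slow_work_enqueue() and its -ECANCELED behaviour come from the hunk above.

	#include <linux/jiffies.h>
	#include <linux/slow-work.h>

	static struct delayed_slow_work my_dwork;	/* hypothetical item */

	static void my_kick_later(void)
	{
		int ret;

		delayed_slow_work_init(&my_dwork, &my_slow_work_ops);

		/* run roughly two seconds from now; a delay of 0 falls straight
		 * through to slow_work_enqueue() as shown above */
		ret = delayed_slow_work_enqueue(&my_dwork, msecs_to_jiffies(2000));
		if (ret == -ECANCELED)
			;	/* item is being cancelled; the timer was not armed */
	}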
666
321/* 667/*
322 * Schedule a cull of the thread pool at some time in the near future 668 * Schedule a cull of the thread pool at some time in the near future
323 */ 669 */
@@ -368,13 +714,23 @@ static inline bool slow_work_available(int vsmax)
368 */ 714 */
369static int slow_work_thread(void *_data) 715static int slow_work_thread(void *_data)
370{ 716{
371 int vsmax; 717 int vsmax, id;
372 718
373 DEFINE_WAIT(wait); 719 DEFINE_WAIT(wait);
374 720
375 set_freezable(); 721 set_freezable();
376 set_user_nice(current, -5); 722 set_user_nice(current, -5);
377 723
724 /* allocate ourselves an ID */
725 spin_lock_irq(&slow_work_queue_lock);
726 id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
727 BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
728 __set_bit(id, slow_work_ids);
729 slow_work_set_thread_pid(id, current->pid);
730 spin_unlock_irq(&slow_work_queue_lock);
731
732 sprintf(current->comm, "kslowd%03u", id);
733
378 for (;;) { 734 for (;;) {
379 vsmax = vslow_work_proportion; 735 vsmax = vslow_work_proportion;
380 vsmax *= atomic_read(&slow_work_thread_count); 736 vsmax *= atomic_read(&slow_work_thread_count);
@@ -395,7 +751,7 @@ static int slow_work_thread(void *_data)
395 vsmax *= atomic_read(&slow_work_thread_count); 751 vsmax *= atomic_read(&slow_work_thread_count);
396 vsmax /= 100; 752 vsmax /= 100;
397 753
398 if (slow_work_available(vsmax) && slow_work_execute()) { 754 if (slow_work_available(vsmax) && slow_work_execute(id)) {
399 cond_resched(); 755 cond_resched();
400 if (list_empty(&slow_work_queue) && 756 if (list_empty(&slow_work_queue) &&
401 list_empty(&vslow_work_queue) && 757 list_empty(&vslow_work_queue) &&
@@ -412,6 +768,11 @@ static int slow_work_thread(void *_data)
412 break; 768 break;
413 } 769 }
414 770
771 spin_lock_irq(&slow_work_queue_lock);
772 slow_work_set_thread_pid(id, 0);
773 __clear_bit(id, slow_work_ids);
774 spin_unlock_irq(&slow_work_queue_lock);
775
415 if (atomic_dec_and_test(&slow_work_thread_count)) 776 if (atomic_dec_and_test(&slow_work_thread_count))
416 complete_and_exit(&slow_work_last_thread_exited, 0); 777 complete_and_exit(&slow_work_last_thread_exited, 0);
417 return 0; 778 return 0;
@@ -427,21 +788,6 @@ static void slow_work_cull_timeout(unsigned long data)
427} 788}
428 789
429/* 790/*
430 * Get a reference on slow work thread starter
431 */
432static int slow_work_new_thread_get_ref(struct slow_work *work)
433{
434 return 0;
435}
436
437/*
438 * Drop a reference on slow work thread starter
439 */
440static void slow_work_new_thread_put_ref(struct slow_work *work)
441{
442}
443
444/*
445 * Start a new slow work thread 791 * Start a new slow work thread
446 */ 792 */
447static void slow_work_new_thread_execute(struct slow_work *work) 793static void slow_work_new_thread_execute(struct slow_work *work)
@@ -475,9 +821,11 @@ static void slow_work_new_thread_execute(struct slow_work *work)
475} 821}
476 822
477static const struct slow_work_ops slow_work_new_thread_ops = { 823static const struct slow_work_ops slow_work_new_thread_ops = {
478 .get_ref = slow_work_new_thread_get_ref, 824 .owner = THIS_MODULE,
479 .put_ref = slow_work_new_thread_put_ref,
480 .execute = slow_work_new_thread_execute, 825 .execute = slow_work_new_thread_execute,
826#ifdef CONFIG_SLOW_WORK_DEBUG
827 .desc = slow_work_new_thread_desc,
828#endif
481}; 829};
482 830
483/* 831/*
@@ -493,10 +841,10 @@ static void slow_work_oom_timeout(unsigned long data)
493 * Handle adjustment of the minimum number of threads 841 * Handle adjustment of the minimum number of threads
494 */ 842 */
495static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, 843static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
496 struct file *filp, void __user *buffer, 844 void __user *buffer,
497 size_t *lenp, loff_t *ppos) 845 size_t *lenp, loff_t *ppos)
498{ 846{
499 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 847 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
500 int n; 848 int n;
501 849
502 if (ret == 0) { 850 if (ret == 0) {
@@ -521,10 +869,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
521 * Handle adjustment of the maximum number of threads 869 * Handle adjustment of the maximum number of threads
522 */ 870 */
523static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, 871static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
524 struct file *filp, void __user *buffer, 872 void __user *buffer,
525 size_t *lenp, loff_t *ppos) 873 size_t *lenp, loff_t *ppos)
526{ 874{
527 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 875 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
528 int n; 876 int n;
529 877
530 if (ret == 0) { 878 if (ret == 0) {
@@ -546,12 +894,13 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
546 894
547/** 895/**
548 * slow_work_register_user - Register a user of the facility 896 * slow_work_register_user - Register a user of the facility
897 * @module: The module about to make use of the facility
549 * 898 *
550 * Register a user of the facility, starting up the initial threads if there 899 * Register a user of the facility, starting up the initial threads if there
551 * aren't any other users at this point. This will return 0 if successful, or 900 * aren't any other users at this point. This will return 0 if successful, or
552 * an error if not. 901 * an error if not.
553 */ 902 */
554int slow_work_register_user(void) 903int slow_work_register_user(struct module *module)
555{ 904{
556 struct task_struct *p; 905 struct task_struct *p;
557 int loop; 906 int loop;
@@ -598,14 +947,81 @@ error:
598} 947}
599EXPORT_SYMBOL(slow_work_register_user); 948EXPORT_SYMBOL(slow_work_register_user);
600 949
950/*
951 * wait for all outstanding items from the calling module to complete
952 * - note that more items may be queued whilst we're waiting
953 */
954static void slow_work_wait_for_items(struct module *module)
955{
956#ifdef CONFIG_MODULES
957 DECLARE_WAITQUEUE(myself, current);
958 struct slow_work *work;
959 int loop;
960
961 mutex_lock(&slow_work_unreg_sync_lock);
962 add_wait_queue(&slow_work_unreg_wq, &myself);
963
964 for (;;) {
965 spin_lock_irq(&slow_work_queue_lock);
966
967 /* first of all, we wait for the last queued item in each list
968 * to be processed */
969 list_for_each_entry_reverse(work, &vslow_work_queue, link) {
970 if (work->owner == module) {
971 set_current_state(TASK_UNINTERRUPTIBLE);
972 slow_work_unreg_work_item = work;
973 goto do_wait;
974 }
975 }
976 list_for_each_entry_reverse(work, &slow_work_queue, link) {
977 if (work->owner == module) {
978 set_current_state(TASK_UNINTERRUPTIBLE);
979 slow_work_unreg_work_item = work;
980 goto do_wait;
981 }
982 }
983
984 /* then we wait for the items being processed to finish */
985 slow_work_unreg_module = module;
986 smp_mb();
987 for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
988 if (slow_work_thread_processing[loop] == module)
989 goto do_wait;
990 }
991 spin_unlock_irq(&slow_work_queue_lock);
992 break; /* okay, we're done */
993
994 do_wait:
995 spin_unlock_irq(&slow_work_queue_lock);
996 schedule();
997 slow_work_unreg_work_item = NULL;
998 slow_work_unreg_module = NULL;
999 }
1000
1001 remove_wait_queue(&slow_work_unreg_wq, &myself);
1002 mutex_unlock(&slow_work_unreg_sync_lock);
1003#endif /* CONFIG_MODULES */
1004}
1005
601/** 1006/**
602 * slow_work_unregister_user - Unregister a user of the facility 1007 * slow_work_unregister_user - Unregister a user of the facility
1008 * @module: The module whose items should be cleared
603 * 1009 *
604 * Unregister a user of the facility, killing all the threads if this was the 1010 * Unregister a user of the facility, killing all the threads if this was the
605 * last one. 1011 * last one.
1012 *
1013 * This waits for all the work items belonging to the nominated module to go
1014 * away before proceeding.
606 */ 1015 */
607void slow_work_unregister_user(void) 1016void slow_work_unregister_user(struct module *module)
608{ 1017{
1018 /* first of all, wait for all outstanding items from the calling module
1019 * to complete */
1020 if (module)
1021 slow_work_wait_for_items(module);
1022
1023 /* then we can actually go about shutting down the facility if need
1024 * be */
609 mutex_lock(&slow_work_user_lock); 1025 mutex_lock(&slow_work_user_lock);
610 1026
611 BUG_ON(slow_work_user_count <= 0); 1027 BUG_ON(slow_work_user_count <= 0);
@@ -639,6 +1055,16 @@ static int __init init_slow_work(void)
639 if (slow_work_max_max_threads < nr_cpus * 2) 1055 if (slow_work_max_max_threads < nr_cpus * 2)
640 slow_work_max_max_threads = nr_cpus * 2; 1056 slow_work_max_max_threads = nr_cpus * 2;
641#endif 1057#endif
1058#ifdef CONFIG_SLOW_WORK_DEBUG
1059 {
1060 struct dentry *dbdir;
1061
1062 dbdir = debugfs_create_dir("slow_work", NULL);
1063 if (dbdir && !IS_ERR(dbdir))
1064 debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
1065 NULL, &slow_work_runqueue_fops);
1066 }
1067#endif
642 return 0; 1068 return 0;
643} 1069}
644 1070
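The registration interface now takes a module pointer, so a user's init/exit paths pair up roughly as in the sketch below. The module_init/module_exit boilerplate is conventional; only the changed slow_work_register_user()/slow_work_unregister_user() signatures are taken from this diff.

	#include <linux/module.h>
	#include <linux/slow-work.h>

	static int __init my_mod_init(void)
	{
		/* starts the pool threads if we are the first user */
		return slow_work_register_user(THIS_MODULE);
	}

	static void __exit my_mod_exit(void)
	{
		/* waits for all of this module's queued and executing items
		 * before (possibly) tearing the thread pool down */
		slow_work_unregister_user(THIS_MODULE);
	}

	module_init(my_mod_init);
	module_exit(my_mod_exit);
	MODULE_LICENSE("GPL");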
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
new file mode 100644
index 000000000000..321f3c59d732
--- /dev/null
+++ b/kernel/slow-work.h
@@ -0,0 +1,72 @@
1/* Slow work private definitions
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
13 * things to do */
14#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
15 * OOM */
16
17#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */
18
19/*
20 * slow-work.c
21 */
22#ifdef CONFIG_SLOW_WORK_DEBUG
23extern struct slow_work *slow_work_execs[];
24extern pid_t slow_work_pids[];
25extern rwlock_t slow_work_execs_lock;
26#endif
27
28extern struct list_head slow_work_queue;
29extern struct list_head vslow_work_queue;
30extern spinlock_t slow_work_queue_lock;
31
32/*
33 * slow-work-debugfs.c
34 */
35#ifdef CONFIG_SLOW_WORK_DEBUG
36extern const struct file_operations slow_work_runqueue_fops;
37
38extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
39#endif
40
41/*
42 * Helper functions
43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{
46#ifdef CONFIG_SLOW_WORK_PROC
47 slow_work_pids[id] = pid;
48#endif
49}
50
51static inline void slow_work_mark_time(struct slow_work *work)
52{
53#ifdef CONFIG_SLOW_WORK_PROC
54 work->mark = CURRENT_TIME;
55#endif
56}
57
58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{
60#ifdef CONFIG_SLOW_WORK_PROC
61 slow_work_execs[id] = work;
62#endif
63}
64
65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{
67#ifdef CONFIG_SLOW_WORK_PROC
68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock);
71#endif
72}
diff --git a/kernel/smp.c b/kernel/smp.c
index 94188b8ecc33..a8c76069cf50 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -29,8 +29,7 @@ enum {
29 29
30struct call_function_data { 30struct call_function_data {
31 struct call_single_data csd; 31 struct call_single_data csd;
32 spinlock_t lock; 32 atomic_t refs;
33 unsigned int refs;
34 cpumask_var_t cpumask; 33 cpumask_var_t cpumask;
35}; 34};
36 35
@@ -39,9 +38,7 @@ struct call_single_queue {
39 spinlock_t lock; 38 spinlock_t lock;
40}; 39};
41 40
42static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { 41static DEFINE_PER_CPU(struct call_function_data, cfd_data);
43 .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
44};
45 42
46static int 43static int
47hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -177,6 +174,11 @@ void generic_smp_call_function_interrupt(void)
177 int cpu = get_cpu(); 174 int cpu = get_cpu();
178 175
179 /* 176 /*
177 * Shouldn't receive this interrupt on a cpu that is not yet online.
178 */
179 WARN_ON_ONCE(!cpu_online(cpu));
180
181 /*
180 * Ensure entry is visible on call_function_queue after we have 182 * Ensure entry is visible on call_function_queue after we have
181 * entered the IPI. See comment in smp_call_function_many. 183 * entered the IPI. See comment in smp_call_function_many.
182 * If we don't have this, then we may miss an entry on the list 184 * If we don't have this, then we may miss an entry on the list
@@ -191,25 +193,18 @@ void generic_smp_call_function_interrupt(void)
191 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 193 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
192 int refs; 194 int refs;
193 195
194 spin_lock(&data->lock); 196 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
195 if (!cpumask_test_cpu(cpu, data->cpumask)) {
196 spin_unlock(&data->lock);
197 continue; 197 continue;
198 }
199 cpumask_clear_cpu(cpu, data->cpumask);
200 spin_unlock(&data->lock);
201 198
202 data->csd.func(data->csd.info); 199 data->csd.func(data->csd.info);
203 200
204 spin_lock(&data->lock); 201 refs = atomic_dec_return(&data->refs);
205 WARN_ON(data->refs == 0); 202 WARN_ON(refs < 0);
206 refs = --data->refs;
207 if (!refs) { 203 if (!refs) {
208 spin_lock(&call_function.lock); 204 spin_lock(&call_function.lock);
209 list_del_rcu(&data->csd.list); 205 list_del_rcu(&data->csd.list);
210 spin_unlock(&call_function.lock); 206 spin_unlock(&call_function.lock);
211 } 207 }
212 spin_unlock(&data->lock);
213 208
214 if (refs) 209 if (refs)
215 continue; 210 continue;
@@ -230,6 +225,11 @@ void generic_smp_call_function_single_interrupt(void)
230 unsigned int data_flags; 225 unsigned int data_flags;
231 LIST_HEAD(list); 226 LIST_HEAD(list);
232 227
228 /*
229 * Shouldn't receive this interrupt on a cpu that is not yet online.
230 */
231 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
232
233 spin_lock(&q->lock); 233 spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 234 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 235 spin_unlock(&q->lock);
@@ -265,9 +265,7 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
265 * @info: An arbitrary pointer to pass to the function. 265 * @info: An arbitrary pointer to pass to the function.
266 * @wait: If true, wait until function has completed on other CPUs. 266 * @wait: If true, wait until function has completed on other CPUs.
267 * 267 *
268 * Returns 0 on success, else a negative status code. Note that @wait 268 * Returns 0 on success, else a negative status code.
269 * will be implicitly turned on in case of allocation failures, since
270 * we fall back to on-stack allocation.
271 */ 269 */
272int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 270int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
273 int wait) 271 int wait)
@@ -285,8 +283,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
285 */ 283 */
286 this_cpu = get_cpu(); 284 this_cpu = get_cpu();
287 285
288 /* Can deadlock when called with interrupts disabled */ 286 /*
289 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 287 * Can deadlock when called with interrupts disabled.
288 * We allow cpu's that are not yet online though, as no one else can
289 * send smp call function interrupt to this cpu and as such deadlocks
290 * can't happen.
291 */
292 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
293 && !oops_in_progress);
290 294
291 if (cpu == this_cpu) { 295 if (cpu == this_cpu) {
292 local_irq_save(flags); 296 local_irq_save(flags);
@@ -315,6 +319,51 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
315} 319}
316EXPORT_SYMBOL(smp_call_function_single); 320EXPORT_SYMBOL(smp_call_function_single);
317 321
322/*
323 * smp_call_function_any - Run a function on any of the given cpus
324 * @mask: The mask of cpus it can run on.
325 * @func: The function to run. This must be fast and non-blocking.
326 * @info: An arbitrary pointer to pass to the function.
327 * @wait: If true, wait until function has completed.
328 *
329 * Returns 0 on success, else a negative status code (if no cpus were online).
330 * Note that @wait will be implicitly turned on in case of allocation failures,
331 * since we fall back to on-stack allocation.
332 *
333 * Selection preference:
334 * 1) current cpu if in @mask
335 * 2) any cpu of current node if in @mask
336 * 3) any other online cpu in @mask
337 */
338int smp_call_function_any(const struct cpumask *mask,
339 void (*func)(void *info), void *info, int wait)
340{
341 unsigned int cpu;
342 const struct cpumask *nodemask;
343 int ret;
344
345 /* Try for same CPU (cheapest) */
346 cpu = get_cpu();
347 if (cpumask_test_cpu(cpu, mask))
348 goto call;
349
350 /* Try for same node. */
351 nodemask = cpumask_of_node(cpu);
352 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
353 cpu = cpumask_next_and(cpu, nodemask, mask)) {
354 if (cpu_online(cpu))
355 goto call;
356 }
357
358 /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
359 cpu = cpumask_any_and(mask, cpu_online_mask);
360call:
361 ret = smp_call_function_single(cpu, func, info, wait);
362 put_cpu();
363 return ret;
364}
365EXPORT_SYMBOL_GPL(smp_call_function_any);
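A hedged caller-side sketch of the new helper: the callback and mask are hypothetical, and the callback must be fast and non-blocking since it may run from IPI context, as the kerneldoc above states.

	#include <linux/cpumask.h>
	#include <linux/smp.h>

	static void my_read_counter(void *info)
	{
		/* runs in IPI (or local) context - must not sleep */
		*(unsigned long *)info = 42;	/* placeholder payload */
	}

	static int my_sample(const struct cpumask *mask)
	{
		unsigned long value = 0;

		/* wait=1: returns only after my_read_counter() has completed,
		 * on the current CPU, a same-node CPU or any online CPU in
		 * @mask, in that order of preference */
		return smp_call_function_any(mask, my_read_counter, &value, 1);
	}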
366
318/** 367/**
319 * __smp_call_function_single(): Run a function on another CPU 368 * __smp_call_function_single(): Run a function on another CPU
320 * @cpu: The CPU to run on. 369 * @cpu: The CPU to run on.
@@ -329,19 +378,18 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
329{ 378{
330 csd_lock(data); 379 csd_lock(data);
331 380
332 /* Can deadlock when called with interrupts disabled */ 381 /*
333 WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); 382 * Can deadlock when called with interrupts disabled.
383 * We allow cpu's that are not yet online though, as no one else can
384 * send smp call function interrupt to this cpu and as such deadlocks
385 * can't happen.
386 */
387 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
388 && !oops_in_progress);
334 389
335 generic_exec_single(cpu, data, wait); 390 generic_exec_single(cpu, data, wait);
336} 391}
337 392
338/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
339
340#ifndef arch_send_call_function_ipi_mask
341# define arch_send_call_function_ipi_mask(maskp) \
342 arch_send_call_function_ipi(*(maskp))
343#endif
344
345/** 393/**
346 * smp_call_function_many(): Run a function on a set of other CPUs. 394 * smp_call_function_many(): Run a function on a set of other CPUs.
347 * @mask: The set of cpus to run on (only runs on online subset). 395 * @mask: The set of cpus to run on (only runs on online subset).
@@ -350,9 +398,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
350 * @wait: If true, wait (atomically) until function has completed 398 * @wait: If true, wait (atomically) until function has completed
351 * on other CPUs. 399 * on other CPUs.
352 * 400 *
353 * If @wait is true, then returns once @func has returned. Note that @wait 401 * If @wait is true, then returns once @func has returned.
354 * will be implicitly turned on in case of allocation failures, since
355 * we fall back to on-stack allocation.
356 * 402 *
357 * You must not call this function with disabled interrupts or from a 403 * You must not call this function with disabled interrupts or from a
358 * hardware interrupt handler or from a bottom half handler. Preemption 404 * hardware interrupt handler or from a bottom half handler. Preemption
@@ -365,8 +411,14 @@ void smp_call_function_many(const struct cpumask *mask,
365 unsigned long flags; 411 unsigned long flags;
366 int cpu, next_cpu, this_cpu = smp_processor_id(); 412 int cpu, next_cpu, this_cpu = smp_processor_id();
367 413
368 /* Can deadlock when called with interrupts disabled */ 414 /*
369 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 415 * Can deadlock when called with interrupts disabled.
416 * We allow cpu's that are not yet online though, as no one else can
417 * send smp call function interrupt to this cpu and as such deadlocks
418 * can't happen.
419 */
420 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
421 && !oops_in_progress);
370 422
371 /* So, what's a CPU they want? Ignoring this one. */ 423 /* So, what's a CPU they want? Ignoring this one. */
372 cpu = cpumask_first_and(mask, cpu_online_mask); 424 cpu = cpumask_first_and(mask, cpu_online_mask);
@@ -391,23 +443,20 @@ void smp_call_function_many(const struct cpumask *mask,
391 data = &__get_cpu_var(cfd_data); 443 data = &__get_cpu_var(cfd_data);
392 csd_lock(&data->csd); 444 csd_lock(&data->csd);
393 445
394 spin_lock_irqsave(&data->lock, flags);
395 data->csd.func = func; 446 data->csd.func = func;
396 data->csd.info = info; 447 data->csd.info = info;
397 cpumask_and(data->cpumask, mask, cpu_online_mask); 448 cpumask_and(data->cpumask, mask, cpu_online_mask);
398 cpumask_clear_cpu(this_cpu, data->cpumask); 449 cpumask_clear_cpu(this_cpu, data->cpumask);
399 data->refs = cpumask_weight(data->cpumask); 450 atomic_set(&data->refs, cpumask_weight(data->cpumask));
400 451
401 spin_lock(&call_function.lock); 452 spin_lock_irqsave(&call_function.lock, flags);
402 /* 453 /*
403 * Place entry at the _HEAD_ of the list, so that any cpu still 454 * Place entry at the _HEAD_ of the list, so that any cpu still
404 * observing the entry in generic_smp_call_function_interrupt() 455 * observing the entry in generic_smp_call_function_interrupt()
405 * will not miss any other list entries: 456 * will not miss any other list entries:
406 */ 457 */
407 list_add_rcu(&data->csd.list, &call_function.queue); 458 list_add_rcu(&data->csd.list, &call_function.queue);
408 spin_unlock(&call_function.lock); 459 spin_unlock_irqrestore(&call_function.lock, flags);
409
410 spin_unlock_irqrestore(&data->lock, flags);
411 460
412 /* 461 /*
413 * Make the list addition visible before sending the ipi. 462 * Make the list addition visible before sending the ipi.
@@ -435,8 +484,7 @@ EXPORT_SYMBOL(smp_call_function_many);
435 * Returns 0. 484 * Returns 0.
436 * 485 *
437 * If @wait is true, then returns once @func has returned; otherwise 486 * If @wait is true, then returns once @func has returned; otherwise
438 * it returns just before the target cpu calls @func. In case of allocation 487 * it returns just before the target cpu calls @func.
439 * failure, @wait will be implicitly turned on.
440 * 488 *
441 * You must not call this function with disabled interrupts or from a 489 * You must not call this function with disabled interrupts or from a
442 * hardware interrupt handler or from a bottom half handler. 490 * hardware interrupt handler or from a bottom half handler.
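For completeness, a sketch of the many-CPU path whose locking this hunk reworks. The callback is hypothetical; the preempt_disable()/preempt_enable() bracket reflects the documented requirement that the caller not be preemptible, and wait=1 merely spins rather than sleeping.

	#include <linux/cpumask.h>
	#include <linux/smp.h>

	static void my_flush(void *info)
	{
		/* per-CPU side of the operation; IPI context, must not sleep */
	}

	static void my_flush_others(const struct cpumask *mask)
	{
		preempt_disable();	/* pin ourselves to a CPU for the call */
		smp_call_function_many(mask, my_flush, NULL, 1 /* wait */);
		preempt_enable();
	}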
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7db25067cd2d..21939d9e830e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -57,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62}; 62};
63 63
@@ -302,9 +302,9 @@ void irq_exit(void)
302 if (!in_interrupt() && local_softirq_pending()) 302 if (!in_interrupt() && local_softirq_pending())
303 invoke_softirq(); 303 invoke_softirq();
304 304
305 rcu_irq_exit();
305#ifdef CONFIG_NO_HZ 306#ifdef CONFIG_NO_HZ
306 /* Make sure that timer wheel updates are propagated */ 307 /* Make sure that timer wheel updates are propagated */
307 rcu_irq_exit();
308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
309 tick_nohz_stop_sched_tick(0); 309 tick_nohz_stop_sched_tick(0);
310#endif 310#endif
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 88796c330838..81324d12eb35 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void)
90EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 90EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
91 91
92int proc_dosoftlockup_thresh(struct ctl_table *table, int write, 92int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
93 struct file *filp, void __user *buffer, 93 void __user *buffer,
94 size_t *lenp, loff_t *ppos) 94 size_t *lenp, loff_t *ppos)
95{ 95{
96 touch_all_softlockup_watchdogs(); 96 touch_all_softlockup_watchdogs();
97 return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 97 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
98} 98}
99 99
100/* 100/*
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5ddab730cb2f..41e042219ff6 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,145 +21,28 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/module.h>
23 23
24#ifndef _spin_trylock
25int __lockfunc _spin_trylock(spinlock_t *lock)
26{
27 return __spin_trylock(lock);
28}
29EXPORT_SYMBOL(_spin_trylock);
30#endif
31
32#ifndef _read_trylock
33int __lockfunc _read_trylock(rwlock_t *lock)
34{
35 return __read_trylock(lock);
36}
37EXPORT_SYMBOL(_read_trylock);
38#endif
39
40#ifndef _write_trylock
41int __lockfunc _write_trylock(rwlock_t *lock)
42{
43 return __write_trylock(lock);
44}
45EXPORT_SYMBOL(_write_trylock);
46#endif
47
48/* 24/*
49 * If lockdep is enabled then we use the non-preemption spin-ops 25 * If lockdep is enabled then we use the non-preemption spin-ops
50 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are 26 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
51 * not re-enabled during lock-acquire (which the preempt-spin-ops do): 27 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
52 */ 28 */
53#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) 29#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
54
55#ifndef _read_lock
56void __lockfunc _read_lock(rwlock_t *lock)
57{
58 __read_lock(lock);
59}
60EXPORT_SYMBOL(_read_lock);
61#endif
62
63#ifndef _spin_lock_irqsave
64unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
65{
66 return __spin_lock_irqsave(lock);
67}
68EXPORT_SYMBOL(_spin_lock_irqsave);
69#endif
70
71#ifndef _spin_lock_irq
72void __lockfunc _spin_lock_irq(spinlock_t *lock)
73{
74 __spin_lock_irq(lock);
75}
76EXPORT_SYMBOL(_spin_lock_irq);
77#endif
78
79#ifndef _spin_lock_bh
80void __lockfunc _spin_lock_bh(spinlock_t *lock)
81{
82 __spin_lock_bh(lock);
83}
84EXPORT_SYMBOL(_spin_lock_bh);
85#endif
86
87#ifndef _read_lock_irqsave
88unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
89{
90 return __read_lock_irqsave(lock);
91}
92EXPORT_SYMBOL(_read_lock_irqsave);
93#endif
94
95#ifndef _read_lock_irq
96void __lockfunc _read_lock_irq(rwlock_t *lock)
97{
98 __read_lock_irq(lock);
99}
100EXPORT_SYMBOL(_read_lock_irq);
101#endif
102
103#ifndef _read_lock_bh
104void __lockfunc _read_lock_bh(rwlock_t *lock)
105{
106 __read_lock_bh(lock);
107}
108EXPORT_SYMBOL(_read_lock_bh);
109#endif
110
111#ifndef _write_lock_irqsave
112unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
113{
114 return __write_lock_irqsave(lock);
115}
116EXPORT_SYMBOL(_write_lock_irqsave);
117#endif
118
119#ifndef _write_lock_irq
120void __lockfunc _write_lock_irq(rwlock_t *lock)
121{
122 __write_lock_irq(lock);
123}
124EXPORT_SYMBOL(_write_lock_irq);
125#endif
126
127#ifndef _write_lock_bh
128void __lockfunc _write_lock_bh(rwlock_t *lock)
129{
130 __write_lock_bh(lock);
131}
132EXPORT_SYMBOL(_write_lock_bh);
133#endif
134
135#ifndef _spin_lock
136void __lockfunc _spin_lock(spinlock_t *lock)
137{
138 __spin_lock(lock);
139}
140EXPORT_SYMBOL(_spin_lock);
141#endif
142
143#ifndef _write_lock
144void __lockfunc _write_lock(rwlock_t *lock)
145{
146 __write_lock(lock);
147}
148EXPORT_SYMBOL(_write_lock);
149#endif
150
151#else /* CONFIG_PREEMPT: */
152
153/* 30/*
31 * The __lock_function inlines are taken from
32 * include/linux/spinlock_api_smp.h
33 */
34#else
35/*
36 * We build the __lock_function inlines here. They are too large for
37 * inlining all over the place, but here is only one user per function
                                                              38 * which embeds them into the calling _lock_function below.
39 *
154 * This could be a long-held lock. We both prepare to spin for a long 40 * This could be a long-held lock. We both prepare to spin for a long
155 * time (making _this_ CPU preemptable if possible), and we also signal 41 * time (making _this_ CPU preemptable if possible), and we also signal
156 * towards that other CPU that it should break the lock ASAP. 42 * towards that other CPU that it should break the lock ASAP.
157 *
158 * (We do this in a function because inlining it would be excessive.)
159 */ 43 */
160
161#define BUILD_LOCK_OPS(op, locktype) \ 44#define BUILD_LOCK_OPS(op, locktype) \
162void __lockfunc _##op##_lock(locktype##_t *lock) \ 45void __lockfunc __##op##_lock(locktype##_t *lock) \
163{ \ 46{ \
164 for (;;) { \ 47 for (;;) { \
165 preempt_disable(); \ 48 preempt_disable(); \
@@ -175,9 +58,7 @@ void __lockfunc _##op##_lock(locktype##_t *lock) \
175 (lock)->break_lock = 0; \ 58 (lock)->break_lock = 0; \
176} \ 59} \
177 \ 60 \
178EXPORT_SYMBOL(_##op##_lock); \ 61unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \
179 \
180unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
181{ \ 62{ \
182 unsigned long flags; \ 63 unsigned long flags; \
183 \ 64 \
@@ -198,16 +79,12 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
198 return flags; \ 79 return flags; \
199} \ 80} \
200 \ 81 \
201EXPORT_SYMBOL(_##op##_lock_irqsave); \ 82void __lockfunc __##op##_lock_irq(locktype##_t *lock) \
202 \
203void __lockfunc _##op##_lock_irq(locktype##_t *lock) \
204{ \ 83{ \
205 _##op##_lock_irqsave(lock); \ 84 _##op##_lock_irqsave(lock); \
206} \ 85} \
207 \ 86 \
208EXPORT_SYMBOL(_##op##_lock_irq); \ 87void __lockfunc __##op##_lock_bh(locktype##_t *lock) \
209 \
210void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
211{ \ 88{ \
212 unsigned long flags; \ 89 unsigned long flags; \
213 \ 90 \
@@ -220,23 +97,21 @@ void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
220 local_bh_disable(); \ 97 local_bh_disable(); \
221 local_irq_restore(flags); \ 98 local_irq_restore(flags); \
222} \ 99} \
223 \
224EXPORT_SYMBOL(_##op##_lock_bh)
225 100
226/* 101/*
227 * Build preemption-friendly versions of the following 102 * Build preemption-friendly versions of the following
228 * lock-spinning functions: 103 * lock-spinning functions:
229 * 104 *
230 * _[spin|read|write]_lock() 105 * __[spin|read|write]_lock()
231 * _[spin|read|write]_lock_irq() 106 * __[spin|read|write]_lock_irq()
232 * _[spin|read|write]_lock_irqsave() 107 * __[spin|read|write]_lock_irqsave()
233 * _[spin|read|write]_lock_bh() 108 * __[spin|read|write]_lock_bh()
234 */ 109 */
235BUILD_LOCK_OPS(spin, spinlock); 110BUILD_LOCK_OPS(spin, spinlock);
236BUILD_LOCK_OPS(read, rwlock); 111BUILD_LOCK_OPS(read, rwlock);
237BUILD_LOCK_OPS(write, rwlock); 112BUILD_LOCK_OPS(write, rwlock);
238 113
239#endif /* CONFIG_PREEMPT */ 114#endif
240 115
241#ifdef CONFIG_DEBUG_LOCK_ALLOC 116#ifdef CONFIG_DEBUG_LOCK_ALLOC
242 117
@@ -248,7 +123,8 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
248} 123}
249EXPORT_SYMBOL(_spin_lock_nested); 124EXPORT_SYMBOL(_spin_lock_nested);
250 125
251unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) 126unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock,
127 int subclass)
252{ 128{
253 unsigned long flags; 129 unsigned long flags;
254 130
@@ -272,7 +148,127 @@ EXPORT_SYMBOL(_spin_lock_nest_lock);
272 148
273#endif 149#endif
274 150
275#ifndef _spin_unlock 151#ifndef CONFIG_INLINE_SPIN_TRYLOCK
152int __lockfunc _spin_trylock(spinlock_t *lock)
153{
154 return __spin_trylock(lock);
155}
156EXPORT_SYMBOL(_spin_trylock);
157#endif
158
159#ifndef CONFIG_INLINE_READ_TRYLOCK
160int __lockfunc _read_trylock(rwlock_t *lock)
161{
162 return __read_trylock(lock);
163}
164EXPORT_SYMBOL(_read_trylock);
165#endif
166
167#ifndef CONFIG_INLINE_WRITE_TRYLOCK
168int __lockfunc _write_trylock(rwlock_t *lock)
169{
170 return __write_trylock(lock);
171}
172EXPORT_SYMBOL(_write_trylock);
173#endif
174
175#ifndef CONFIG_INLINE_READ_LOCK
176void __lockfunc _read_lock(rwlock_t *lock)
177{
178 __read_lock(lock);
179}
180EXPORT_SYMBOL(_read_lock);
181#endif
182
183#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
184unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
185{
186 return __spin_lock_irqsave(lock);
187}
188EXPORT_SYMBOL(_spin_lock_irqsave);
189#endif
190
191#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
192void __lockfunc _spin_lock_irq(spinlock_t *lock)
193{
194 __spin_lock_irq(lock);
195}
196EXPORT_SYMBOL(_spin_lock_irq);
197#endif
198
199#ifndef CONFIG_INLINE_SPIN_LOCK_BH
200void __lockfunc _spin_lock_bh(spinlock_t *lock)
201{
202 __spin_lock_bh(lock);
203}
204EXPORT_SYMBOL(_spin_lock_bh);
205#endif
206
207#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
208unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
209{
210 return __read_lock_irqsave(lock);
211}
212EXPORT_SYMBOL(_read_lock_irqsave);
213#endif
214
215#ifndef CONFIG_INLINE_READ_LOCK_IRQ
216void __lockfunc _read_lock_irq(rwlock_t *lock)
217{
218 __read_lock_irq(lock);
219}
220EXPORT_SYMBOL(_read_lock_irq);
221#endif
222
223#ifndef CONFIG_INLINE_READ_LOCK_BH
224void __lockfunc _read_lock_bh(rwlock_t *lock)
225{
226 __read_lock_bh(lock);
227}
228EXPORT_SYMBOL(_read_lock_bh);
229#endif
230
231#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
232unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
233{
234 return __write_lock_irqsave(lock);
235}
236EXPORT_SYMBOL(_write_lock_irqsave);
237#endif
238
239#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
240void __lockfunc _write_lock_irq(rwlock_t *lock)
241{
242 __write_lock_irq(lock);
243}
244EXPORT_SYMBOL(_write_lock_irq);
245#endif
246
247#ifndef CONFIG_INLINE_WRITE_LOCK_BH
248void __lockfunc _write_lock_bh(rwlock_t *lock)
249{
250 __write_lock_bh(lock);
251}
252EXPORT_SYMBOL(_write_lock_bh);
253#endif
254
255#ifndef CONFIG_INLINE_SPIN_LOCK
256void __lockfunc _spin_lock(spinlock_t *lock)
257{
258 __spin_lock(lock);
259}
260EXPORT_SYMBOL(_spin_lock);
261#endif
262
263#ifndef CONFIG_INLINE_WRITE_LOCK
264void __lockfunc _write_lock(rwlock_t *lock)
265{
266 __write_lock(lock);
267}
268EXPORT_SYMBOL(_write_lock);
269#endif
270
271#ifndef CONFIG_INLINE_SPIN_UNLOCK
276void __lockfunc _spin_unlock(spinlock_t *lock) 272void __lockfunc _spin_unlock(spinlock_t *lock)
277{ 273{
278 __spin_unlock(lock); 274 __spin_unlock(lock);
@@ -280,7 +276,7 @@ void __lockfunc _spin_unlock(spinlock_t *lock)
280EXPORT_SYMBOL(_spin_unlock); 276EXPORT_SYMBOL(_spin_unlock);
281#endif 277#endif
282 278
283#ifndef _write_unlock 279#ifndef CONFIG_INLINE_WRITE_UNLOCK
284void __lockfunc _write_unlock(rwlock_t *lock) 280void __lockfunc _write_unlock(rwlock_t *lock)
285{ 281{
286 __write_unlock(lock); 282 __write_unlock(lock);
@@ -288,7 +284,7 @@ void __lockfunc _write_unlock(rwlock_t *lock)
288EXPORT_SYMBOL(_write_unlock); 284EXPORT_SYMBOL(_write_unlock);
289#endif 285#endif
290 286
291#ifndef _read_unlock 287#ifndef CONFIG_INLINE_READ_UNLOCK
292void __lockfunc _read_unlock(rwlock_t *lock) 288void __lockfunc _read_unlock(rwlock_t *lock)
293{ 289{
294 __read_unlock(lock); 290 __read_unlock(lock);
@@ -296,7 +292,7 @@ void __lockfunc _read_unlock(rwlock_t *lock)
296EXPORT_SYMBOL(_read_unlock); 292EXPORT_SYMBOL(_read_unlock);
297#endif 293#endif
298 294
299#ifndef _spin_unlock_irqrestore 295#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
300void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 296void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
301{ 297{
302 __spin_unlock_irqrestore(lock, flags); 298 __spin_unlock_irqrestore(lock, flags);
@@ -304,7 +300,7 @@ void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
304EXPORT_SYMBOL(_spin_unlock_irqrestore); 300EXPORT_SYMBOL(_spin_unlock_irqrestore);
305#endif 301#endif
306 302
307#ifndef _spin_unlock_irq 303#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
308void __lockfunc _spin_unlock_irq(spinlock_t *lock) 304void __lockfunc _spin_unlock_irq(spinlock_t *lock)
309{ 305{
310 __spin_unlock_irq(lock); 306 __spin_unlock_irq(lock);
@@ -312,7 +308,7 @@ void __lockfunc _spin_unlock_irq(spinlock_t *lock)
312EXPORT_SYMBOL(_spin_unlock_irq); 308EXPORT_SYMBOL(_spin_unlock_irq);
313#endif 309#endif
314 310
315#ifndef _spin_unlock_bh 311#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
316void __lockfunc _spin_unlock_bh(spinlock_t *lock) 312void __lockfunc _spin_unlock_bh(spinlock_t *lock)
317{ 313{
318 __spin_unlock_bh(lock); 314 __spin_unlock_bh(lock);
@@ -320,7 +316,7 @@ void __lockfunc _spin_unlock_bh(spinlock_t *lock)
320EXPORT_SYMBOL(_spin_unlock_bh); 316EXPORT_SYMBOL(_spin_unlock_bh);
321#endif 317#endif
322 318
323#ifndef _read_unlock_irqrestore 319#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
324void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 320void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
325{ 321{
326 __read_unlock_irqrestore(lock, flags); 322 __read_unlock_irqrestore(lock, flags);
@@ -328,7 +324,7 @@ void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
328EXPORT_SYMBOL(_read_unlock_irqrestore); 324EXPORT_SYMBOL(_read_unlock_irqrestore);
329#endif 325#endif
330 326
331#ifndef _read_unlock_irq 327#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
332void __lockfunc _read_unlock_irq(rwlock_t *lock) 328void __lockfunc _read_unlock_irq(rwlock_t *lock)
333{ 329{
334 __read_unlock_irq(lock); 330 __read_unlock_irq(lock);
@@ -336,7 +332,7 @@ void __lockfunc _read_unlock_irq(rwlock_t *lock)
336EXPORT_SYMBOL(_read_unlock_irq); 332EXPORT_SYMBOL(_read_unlock_irq);
337#endif 333#endif
338 334
339#ifndef _read_unlock_bh 335#ifndef CONFIG_INLINE_READ_UNLOCK_BH
340void __lockfunc _read_unlock_bh(rwlock_t *lock) 336void __lockfunc _read_unlock_bh(rwlock_t *lock)
341{ 337{
342 __read_unlock_bh(lock); 338 __read_unlock_bh(lock);
@@ -344,7 +340,7 @@ void __lockfunc _read_unlock_bh(rwlock_t *lock)
344EXPORT_SYMBOL(_read_unlock_bh); 340EXPORT_SYMBOL(_read_unlock_bh);
345#endif 341#endif
346 342
347#ifndef _write_unlock_irqrestore 343#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
348void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 344void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
349{ 345{
350 __write_unlock_irqrestore(lock, flags); 346 __write_unlock_irqrestore(lock, flags);
@@ -352,7 +348,7 @@ void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
352EXPORT_SYMBOL(_write_unlock_irqrestore); 348EXPORT_SYMBOL(_write_unlock_irqrestore);
353#endif 349#endif
354 350
355#ifndef _write_unlock_irq 351#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
356void __lockfunc _write_unlock_irq(rwlock_t *lock) 352void __lockfunc _write_unlock_irq(rwlock_t *lock)
357{ 353{
358 __write_unlock_irq(lock); 354 __write_unlock_irq(lock);
@@ -360,7 +356,7 @@ void __lockfunc _write_unlock_irq(rwlock_t *lock)
360EXPORT_SYMBOL(_write_unlock_irq); 356EXPORT_SYMBOL(_write_unlock_irq);
361#endif 357#endif
362 358
363#ifndef _write_unlock_bh 359#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
364void __lockfunc _write_unlock_bh(rwlock_t *lock) 360void __lockfunc _write_unlock_bh(rwlock_t *lock)
365{ 361{
366 __write_unlock_bh(lock); 362 __write_unlock_bh(lock);
@@ -368,7 +364,7 @@ void __lockfunc _write_unlock_bh(rwlock_t *lock)
368EXPORT_SYMBOL(_write_unlock_bh); 364EXPORT_SYMBOL(_write_unlock_bh);
369#endif 365#endif
370 366
371#ifndef _spin_trylock_bh 367#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
372int __lockfunc _spin_trylock_bh(spinlock_t *lock) 368int __lockfunc _spin_trylock_bh(spinlock_t *lock)
373{ 369{
374 return __spin_trylock_bh(lock); 370 return __spin_trylock_bh(lock);
diff --git a/kernel/srcu.c b/kernel/srcu.c
index b0aeeaf22ce4..818d7d9aa03c 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -49,6 +49,7 @@ int init_srcu_struct(struct srcu_struct *sp)
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); 49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM); 50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 51}
52EXPORT_SYMBOL_GPL(init_srcu_struct);
52 53
53/* 54/*
54 * srcu_readers_active_idx -- returns approximate number of readers 55 * srcu_readers_active_idx -- returns approximate number of readers
@@ -97,6 +98,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
97 free_percpu(sp->per_cpu_ref); 98 free_percpu(sp->per_cpu_ref);
98 sp->per_cpu_ref = NULL; 99 sp->per_cpu_ref = NULL;
99} 100}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
100 102
101/** 103/**
102 * srcu_read_lock - register a new reader for an SRCU-protected structure. 104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
@@ -118,6 +120,7 @@ int srcu_read_lock(struct srcu_struct *sp)
118 preempt_enable(); 120 preempt_enable();
119 return idx; 121 return idx;
120} 122}
123EXPORT_SYMBOL_GPL(srcu_read_lock);
121 124
122/** 125/**
123 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. 126 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
@@ -136,22 +139,12 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx)
136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
137 preempt_enable(); 140 preempt_enable();
138} 141}
142EXPORT_SYMBOL_GPL(srcu_read_unlock);
139 143
140/** 144/*
141 * synchronize_srcu - wait for prior SRCU read-side critical-section completion 145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
142 * @sp: srcu_struct with which to synchronize.
143 *
144 * Flip the completed counter, and wait for the old count to drain to zero.
145 * As with classic RCU, the updater must use some separate means of
146 * synchronizing concurrent updates. Can block; must be called from
147 * process context.
148 *
149 * Note that it is illegal to call synchornize_srcu() from the corresponding
150 * SRCU read-side critical section; doing so will result in deadlock.
151 * However, it is perfectly legal to call synchronize_srcu() on one
152 * srcu_struct from some other srcu_struct's read-side critical section.
153 */ 146 */
154void synchronize_srcu(struct srcu_struct *sp) 147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
155{ 148{
156 int idx; 149 int idx;
157 150
@@ -173,7 +166,7 @@ void synchronize_srcu(struct srcu_struct *sp)
173 return; 166 return;
174 } 167 }
175 168
176 synchronize_sched(); /* Force memory barrier on all CPUs. */ 169 sync_func(); /* Force memory barrier on all CPUs. */
177 170
178 /* 171 /*
179 * The preceding synchronize_sched() ensures that any CPU that 172 * The preceding synchronize_sched() ensures that any CPU that
@@ -190,7 +183,7 @@ void synchronize_srcu(struct srcu_struct *sp)
190 idx = sp->completed & 0x1; 183 idx = sp->completed & 0x1;
191 sp->completed++; 184 sp->completed++;
192 185
193 synchronize_sched(); /* Force memory barrier on all CPUs. */ 186 sync_func(); /* Force memory barrier on all CPUs. */
194 187
195 /* 188 /*
196 * At this point, because of the preceding synchronize_sched(), 189 * At this point, because of the preceding synchronize_sched(),
@@ -203,7 +196,7 @@ void synchronize_srcu(struct srcu_struct *sp)
203 while (srcu_readers_active_idx(sp, idx)) 196 while (srcu_readers_active_idx(sp, idx))
204 schedule_timeout_interruptible(1); 197 schedule_timeout_interruptible(1);
205 198
206 synchronize_sched(); /* Force memory barrier on all CPUs. */ 199 sync_func(); /* Force memory barrier on all CPUs. */
207 200
208 /* 201 /*
209 * The preceding synchronize_sched() forces all srcu_read_unlock() 202 * The preceding synchronize_sched() forces all srcu_read_unlock()
@@ -237,6 +230,47 @@ void synchronize_srcu(struct srcu_struct *sp)
237} 230}
238 231
239/** 232/**
233 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
234 * @sp: srcu_struct with which to synchronize.
235 *
236 * Flip the completed counter, and wait for the old count to drain to zero.
237 * As with classic RCU, the updater must use some separate means of
238 * synchronizing concurrent updates. Can block; must be called from
239 * process context.
240 *
241 * Note that it is illegal to call synchronize_srcu() from the corresponding
242 * SRCU read-side critical section; doing so will result in deadlock.
243 * However, it is perfectly legal to call synchronize_srcu() on one
244 * srcu_struct from some other srcu_struct's read-side critical section.
245 */
246void synchronize_srcu(struct srcu_struct *sp)
247{
248 __synchronize_srcu(sp, synchronize_sched);
249}
250EXPORT_SYMBOL_GPL(synchronize_srcu);
251
252/**
253 * synchronize_srcu_expedited - like synchronize_srcu, but less patient
254 * @sp: srcu_struct with which to synchronize.
255 *
256 * Flip the completed counter, and wait for the old count to drain to zero.
257 * As with classic RCU, the updater must use some separate means of
258 * synchronizing concurrent updates. Can block; must be called from
259 * process context.
260 *
261 * Note that it is illegal to call synchronize_srcu_expedited()
262 * from the corresponding SRCU read-side critical section; doing so
263 * will result in deadlock. However, it is perfectly legal to call
264 * synchronize_srcu_expedited() on one srcu_struct from some other
265 * srcu_struct's read-side critical section.
266 */
267void synchronize_srcu_expedited(struct srcu_struct *sp)
268{
269 __synchronize_srcu(sp, synchronize_sched_expedited);
270}
271EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
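A hedged sketch of how the expedited primitive slots into the usual SRCU reader/updater pattern; the protected structure and pointer are hypothetical, and the srcu_struct is assumed to have been set up with init_srcu_struct().

	#include <linux/rcupdate.h>
	#include <linux/slab.h>
	#include <linux/srcu.h>

	struct my_data {
		int value;
	};

	static struct srcu_struct my_srcu;	/* init_srcu_struct() at setup */
	static struct my_data *my_ptr;		/* hypothetical SRCU-protected pointer */

	static int my_reader(void)
	{
		int idx, v;

		idx = srcu_read_lock(&my_srcu);
		v = rcu_dereference(my_ptr)->value;	/* readers may sleep here */
		srcu_read_unlock(&my_srcu, idx);
		return v;
	}

	static void my_update(struct my_data *newp)
	{
		struct my_data *old = my_ptr;

		rcu_assign_pointer(my_ptr, newp);
		/* same guarantee as synchronize_srcu(), but trades extra CPU
		 * work (synchronize_sched_expedited) for a much shorter wait */
		synchronize_srcu_expedited(&my_srcu);
		kfree(old);
	}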
272
273/**
240 * srcu_batches_completed - return batches completed. 274 * srcu_batches_completed - return batches completed.
241 * @sp: srcu_struct on which to report batch completion. 275 * @sp: srcu_struct on which to report batch completion.
242 * 276 *
@@ -248,10 +282,4 @@ long srcu_batches_completed(struct srcu_struct *sp)
248{ 282{
249 return sp->completed; 283 return sp->completed;
250} 284}
251
252EXPORT_SYMBOL_GPL(init_srcu_struct);
253EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
254EXPORT_SYMBOL_GPL(srcu_read_lock);
255EXPORT_SYMBOL_GPL(srcu_read_unlock);
256EXPORT_SYMBOL_GPL(synchronize_srcu);
257EXPORT_SYMBOL_GPL(srcu_batches_completed); 285EXPORT_SYMBOL_GPL(srcu_batches_completed);
diff --git a/kernel/sys.c b/kernel/sys.c
index b3f1097c76fa..9968c5fb55b9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,7 +14,7 @@
14#include <linux/prctl.h> 14#include <linux/prctl.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/perf_counter.h> 17#include <linux/perf_event.h>
18#include <linux/resource.h> 18#include <linux/resource.h>
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/kexec.h> 20#include <linux/kexec.h>
@@ -911,16 +911,15 @@ change_okay:
911 911
912void do_sys_times(struct tms *tms) 912void do_sys_times(struct tms *tms)
913{ 913{
914 struct task_cputime cputime; 914 cputime_t tgutime, tgstime, cutime, cstime;
915 cputime_t cutime, cstime;
916 915
917 thread_group_cputime(current, &cputime);
918 spin_lock_irq(&current->sighand->siglock); 916 spin_lock_irq(&current->sighand->siglock);
917 thread_group_times(current, &tgutime, &tgstime);
919 cutime = current->signal->cutime; 918 cutime = current->signal->cutime;
920 cstime = current->signal->cstime; 919 cstime = current->signal->cstime;
921 spin_unlock_irq(&current->sighand->siglock); 920 spin_unlock_irq(&current->sighand->siglock);
922 tms->tms_utime = cputime_to_clock_t(cputime.utime); 921 tms->tms_utime = cputime_to_clock_t(tgutime);
923 tms->tms_stime = cputime_to_clock_t(cputime.stime); 922 tms->tms_stime = cputime_to_clock_t(tgstime);
924 tms->tms_cutime = cputime_to_clock_t(cutime); 923 tms->tms_cutime = cputime_to_clock_t(cutime);
925 tms->tms_cstime = cputime_to_clock_t(cstime); 924 tms->tms_cstime = cputime_to_clock_t(cstime);
926} 925}
@@ -1110,6 +1109,8 @@ SYSCALL_DEFINE0(setsid)
1110 err = session; 1109 err = session;
1111out: 1110out:
1112 write_unlock_irq(&tasklist_lock); 1111 write_unlock_irq(&tasklist_lock);
1112 if (err > 0)
1113 proc_sid_connector(group_leader);
1113 return err; 1114 return err;
1114} 1115}
1115 1116
@@ -1336,16 +1337,16 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1336{ 1337{
1337 struct task_struct *t; 1338 struct task_struct *t;
1338 unsigned long flags; 1339 unsigned long flags;
1339 cputime_t utime, stime; 1340 cputime_t tgutime, tgstime, utime, stime;
1340 struct task_cputime cputime; 1341 unsigned long maxrss = 0;
1341 1342
1342 memset((char *) r, 0, sizeof *r); 1343 memset((char *) r, 0, sizeof *r);
1343 utime = stime = cputime_zero; 1344 utime = stime = cputime_zero;
1344 1345
1345 if (who == RUSAGE_THREAD) { 1346 if (who == RUSAGE_THREAD) {
1346 utime = task_utime(current); 1347 task_times(current, &utime, &stime);
1347 stime = task_stime(current);
1348 accumulate_thread_rusage(p, r); 1348 accumulate_thread_rusage(p, r);
1349 maxrss = p->signal->maxrss;
1349 goto out; 1350 goto out;
1350 } 1351 }
1351 1352
@@ -1363,20 +1364,23 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1363 r->ru_majflt = p->signal->cmaj_flt; 1364 r->ru_majflt = p->signal->cmaj_flt;
1364 r->ru_inblock = p->signal->cinblock; 1365 r->ru_inblock = p->signal->cinblock;
1365 r->ru_oublock = p->signal->coublock; 1366 r->ru_oublock = p->signal->coublock;
1367 maxrss = p->signal->cmaxrss;
1366 1368
1367 if (who == RUSAGE_CHILDREN) 1369 if (who == RUSAGE_CHILDREN)
1368 break; 1370 break;
1369 1371
1370 case RUSAGE_SELF: 1372 case RUSAGE_SELF:
1371 thread_group_cputime(p, &cputime); 1373 thread_group_times(p, &tgutime, &tgstime);
1372 utime = cputime_add(utime, cputime.utime); 1374 utime = cputime_add(utime, tgutime);
1373 stime = cputime_add(stime, cputime.stime); 1375 stime = cputime_add(stime, tgstime);
1374 r->ru_nvcsw += p->signal->nvcsw; 1376 r->ru_nvcsw += p->signal->nvcsw;
1375 r->ru_nivcsw += p->signal->nivcsw; 1377 r->ru_nivcsw += p->signal->nivcsw;
1376 r->ru_minflt += p->signal->min_flt; 1378 r->ru_minflt += p->signal->min_flt;
1377 r->ru_majflt += p->signal->maj_flt; 1379 r->ru_majflt += p->signal->maj_flt;
1378 r->ru_inblock += p->signal->inblock; 1380 r->ru_inblock += p->signal->inblock;
1379 r->ru_oublock += p->signal->oublock; 1381 r->ru_oublock += p->signal->oublock;
1382 if (maxrss < p->signal->maxrss)
1383 maxrss = p->signal->maxrss;
1380 t = p; 1384 t = p;
1381 do { 1385 do {
1382 accumulate_thread_rusage(t, r); 1386 accumulate_thread_rusage(t, r);
@@ -1392,6 +1396,15 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1392out: 1396out:
1393 cputime_to_timeval(utime, &r->ru_utime); 1397 cputime_to_timeval(utime, &r->ru_utime);
1394 cputime_to_timeval(stime, &r->ru_stime); 1398 cputime_to_timeval(stime, &r->ru_stime);
1399
1400 if (who != RUSAGE_CHILDREN) {
1401 struct mm_struct *mm = get_task_mm(p);
1402 if (mm) {
1403 setmax_mm_hiwater_rss(&maxrss, mm);
1404 mmput(mm);
1405 }
1406 }
1407 r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
1395} 1408}
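The new ru_maxrss field is reported in kilobytes: pages multiplied by PAGE_SIZE/1024, so 300 peak-RSS pages on a 4 KiB-page machine shows up as 1200. A minimal userspace sketch reading it back:

	#include <stdio.h>
	#include <sys/resource.h>

	int main(void)
	{
		struct rusage ru;

		/* peak resident set size of the calling process, in kB */
		if (getrusage(RUSAGE_SELF, &ru) == 0)
			printf("peak RSS: %ld kB\n", ru.ru_maxrss);
		return 0;
	}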
1396 1409
1397int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1410int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
@@ -1511,11 +1524,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1511 case PR_SET_TSC: 1524 case PR_SET_TSC:
1512 error = SET_TSC_CTL(arg2); 1525 error = SET_TSC_CTL(arg2);
1513 break; 1526 break;
1514 case PR_TASK_PERF_COUNTERS_DISABLE: 1527 case PR_TASK_PERF_EVENTS_DISABLE:
1515 error = perf_counter_task_disable(); 1528 error = perf_event_task_disable();
1516 break; 1529 break;
1517 case PR_TASK_PERF_COUNTERS_ENABLE: 1530 case PR_TASK_PERF_EVENTS_ENABLE:
1518 error = perf_counter_task_enable(); 1531 error = perf_event_task_enable();
1519 break; 1532 break;
1520 case PR_GET_TIMERSLACK: 1533 case PR_GET_TIMERSLACK:
1521 error = current->timer_slack_ns; 1534 error = current->timer_slack_ns;
@@ -1528,6 +1541,41 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1528 current->timer_slack_ns = arg2; 1541 current->timer_slack_ns = arg2;
1529 error = 0; 1542 error = 0;
1530 break; 1543 break;
1544 case PR_MCE_KILL:
1545 if (arg4 | arg5)
1546 return -EINVAL;
1547 switch (arg2) {
1548 case PR_MCE_KILL_CLEAR:
1549 if (arg3 != 0)
1550 return -EINVAL;
1551 current->flags &= ~PF_MCE_PROCESS;
1552 break;
1553 case PR_MCE_KILL_SET:
1554 current->flags |= PF_MCE_PROCESS;
1555 if (arg3 == PR_MCE_KILL_EARLY)
1556 current->flags |= PF_MCE_EARLY;
1557 else if (arg3 == PR_MCE_KILL_LATE)
1558 current->flags &= ~PF_MCE_EARLY;
1559 else if (arg3 == PR_MCE_KILL_DEFAULT)
1560 current->flags &=
1561 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
1562 else
1563 return -EINVAL;
1564 break;
1565 default:
1566 return -EINVAL;
1567 }
1568 error = 0;
1569 break;
1570 case PR_MCE_KILL_GET:
1571 if (arg2 | arg3 | arg4 | arg5)
1572 return -EINVAL;
1573 if (current->flags & PF_MCE_PROCESS)
1574 error = (current->flags & PF_MCE_EARLY) ?
1575 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
1576 else
1577 error = PR_MCE_KILL_DEFAULT;
1578 break;
1531 default: 1579 default:
1532 error = -EINVAL; 1580 error = -EINVAL;
1533 break; 1581 break;
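The new PR_MCE_KILL and PR_MCE_KILL_GET options above select the per-process machine-check kill policy (early, late, or the system-wide default) by toggling PF_MCE_PROCESS and PF_MCE_EARLY. A userspace sketch follows; the numeric constants match include/linux/prctl.h of this era and are guarded with #ifndef in case the installed headers predate the change, so treat them as assumptions:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL		33
#define PR_MCE_KILL_CLEAR	0
#define PR_MCE_KILL_SET		1
#define PR_MCE_KILL_LATE	0
#define PR_MCE_KILL_EARLY	1
#define PR_MCE_KILL_DEFAULT	2
#define PR_MCE_KILL_GET		34
#endif

int main(void)
{
	/* Ask for early (synchronous) kills on uncorrected memory errors. */
	if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0) != 0)
		perror("PR_MCE_KILL");

	/* Read the current per-process policy back. */
	printf("MCE kill policy: %d\n", prctl(PR_MCE_KILL_GET, 0, 0, 0, 0));

	/* Drop back to the process-wide default. */
	prctl(PR_MCE_KILL, PR_MCE_KILL_CLEAR, 0, 0, 0);
	return 0;
}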
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 68320f6b07b5..e06d0b8d1951 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -49,6 +49,7 @@ cond_syscall(sys_sendmsg);
49cond_syscall(compat_sys_sendmsg); 49cond_syscall(compat_sys_sendmsg);
50cond_syscall(sys_recvmsg); 50cond_syscall(sys_recvmsg);
51cond_syscall(compat_sys_recvmsg); 51cond_syscall(compat_sys_recvmsg);
52cond_syscall(compat_sys_recvfrom);
52cond_syscall(sys_socketcall); 53cond_syscall(sys_socketcall);
53cond_syscall(sys_futex); 54cond_syscall(sys_futex);
54cond_syscall(compat_sys_futex); 55cond_syscall(compat_sys_futex);
@@ -177,4 +178,4 @@ cond_syscall(sys_eventfd);
177cond_syscall(sys_eventfd2); 178cond_syscall(sys_eventfd2);
178 179
179/* performance counters: */ 180/* performance counters: */
180cond_syscall(sys_perf_counter_open); 181cond_syscall(sys_perf_event_open);
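Alongside the stub rename above, the syscall itself is now sys_perf_event_open. A userspace sketch that counts CPU cycles through the renamed interface; it assumes the post-rename <linux/perf_event.h> uapi header and issues a raw syscall() since no libc wrapper existed at the time:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.disabled = 1;
	attr.exclude_kernel = 1;

	/* Count cycles for the current task on any CPU. */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... the code being measured would run here ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
	if (read(fd, &count, sizeof(count)) != sizeof(count))
		count = -1;
	printf("cycles: %lld\n", count);
	close(fd);
	return 0;
}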
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3125cff1c570..4dbf93a52ee9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -26,7 +26,6 @@
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h>
30#include <linux/kmemcheck.h> 29#include <linux/kmemcheck.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/fs.h> 31#include <linux/fs.h>
@@ -37,6 +36,7 @@
37#include <linux/sysrq.h> 36#include <linux/sysrq.h>
38#include <linux/highuid.h> 37#include <linux/highuid.h>
39#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/ratelimit.h>
40#include <linux/hugetlb.h> 40#include <linux/hugetlb.h>
41#include <linux/initrd.h> 41#include <linux/initrd.h>
42#include <linux/key.h> 42#include <linux/key.h>
@@ -50,7 +50,7 @@
50#include <linux/reboot.h> 50#include <linux/reboot.h>
51#include <linux/ftrace.h> 51#include <linux/ftrace.h>
52#include <linux/slow-work.h> 52#include <linux/slow-work.h>
53#include <linux/perf_counter.h> 53#include <linux/perf_event.h>
54 54
55#include <asm/uaccess.h> 55#include <asm/uaccess.h>
56#include <asm/processor.h> 56#include <asm/processor.h>
@@ -77,6 +77,7 @@ extern int max_threads;
77extern int core_uses_pid; 77extern int core_uses_pid;
78extern int suid_dumpable; 78extern int suid_dumpable;
79extern char core_pattern[]; 79extern char core_pattern[];
80extern unsigned int core_pipe_limit;
80extern int pid_max; 81extern int pid_max;
81extern int min_free_kbytes; 82extern int min_free_kbytes;
82extern int pid_max_min, pid_max_max; 83extern int pid_max_min, pid_max_max;
@@ -91,6 +92,9 @@ extern int sysctl_nr_trim_pages;
91#ifdef CONFIG_RCU_TORTURE_TEST 92#ifdef CONFIG_RCU_TORTURE_TEST
92extern int rcutorture_runnable; 93extern int rcutorture_runnable;
93#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 94#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
95#ifdef CONFIG_BLOCK
96extern int blk_iopoll_enabled;
97#endif
94 98
95/* Constants used for minimum and maximum */ 99/* Constants used for minimum and maximum */
96#ifdef CONFIG_DETECT_SOFTLOCKUP 100#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -103,6 +107,9 @@ static int __maybe_unused one = 1;
103static int __maybe_unused two = 2; 107static int __maybe_unused two = 2;
104static unsigned long one_ul = 1; 108static unsigned long one_ul = 1;
105static int one_hundred = 100; 109static int one_hundred = 100;
110#ifdef CONFIG_PRINTK
111static int ten_thousand = 10000;
112#endif
106 113
107/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ 114/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
108static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; 115static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -152,14 +159,16 @@ extern int no_unaligned_warning;
152extern int unaligned_dump_stack; 159extern int unaligned_dump_stack;
153#endif 160#endif
154 161
162extern struct ratelimit_state printk_ratelimit_state;
163
155#ifdef CONFIG_RT_MUTEXES 164#ifdef CONFIG_RT_MUTEXES
156extern int max_lock_depth; 165extern int max_lock_depth;
157#endif 166#endif
158 167
159#ifdef CONFIG_PROC_SYSCTL 168#ifdef CONFIG_PROC_SYSCTL
160static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 169static int proc_do_cad_pid(struct ctl_table *table, int write,
161 void __user *buffer, size_t *lenp, loff_t *ppos); 170 void __user *buffer, size_t *lenp, loff_t *ppos);
162static int proc_taint(struct ctl_table *table, int write, struct file *filp, 171static int proc_taint(struct ctl_table *table, int write,
163 void __user *buffer, size_t *lenp, loff_t *ppos); 172 void __user *buffer, size_t *lenp, loff_t *ppos);
164#endif 173#endif
165 174
@@ -418,6 +427,14 @@ static struct ctl_table kern_table[] = {
418 .proc_handler = &proc_dostring, 427 .proc_handler = &proc_dostring,
419 .strategy = &sysctl_string, 428 .strategy = &sysctl_string,
420 }, 429 },
430 {
431 .ctl_name = CTL_UNNUMBERED,
432 .procname = "core_pipe_limit",
433 .data = &core_pipe_limit,
434 .maxlen = sizeof(unsigned int),
435 .mode = 0644,
436 .proc_handler = &proc_dointvec,
437 },
421#ifdef CONFIG_PROC_SYSCTL 438#ifdef CONFIG_PROC_SYSCTL
422 { 439 {
423 .procname = "tainted", 440 .procname = "tainted",
@@ -719,6 +736,17 @@ static struct ctl_table kern_table[] = {
719 .mode = 0644, 736 .mode = 0644,
720 .proc_handler = &proc_dointvec, 737 .proc_handler = &proc_dointvec,
721 }, 738 },
739 {
740 .ctl_name = CTL_UNNUMBERED,
741 .procname = "printk_delay",
742 .data = &printk_delay_msec,
743 .maxlen = sizeof(int),
744 .mode = 0644,
745 .proc_handler = &proc_dointvec_minmax,
746 .strategy = &sysctl_intvec,
747 .extra1 = &zero,
748 .extra2 = &ten_thousand,
749 },
722#endif 750#endif
723 { 751 {
724 .ctl_name = KERN_NGROUPS_MAX, 752 .ctl_name = KERN_NGROUPS_MAX,
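The printk_delay entry above is the template for an unnumbered integer sysctl clamped through extra1/extra2. A minimal sketch of the same pattern; my_knob and its bounds are hypothetical placeholders, not names from the patch:

#include <linux/sysctl.h>

static int my_knob;
static int my_knob_min;			/* 0 */
static int my_knob_max = 10000;

/* Registered elsewhere, e.g. via register_sysctl_paths(). */
static struct ctl_table my_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "my_knob",
		.data		= &my_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_minmax,
		.strategy	= &sysctl_intvec,
		.extra1		= &my_knob_min,
		.extra2		= &my_knob_max,
	},
	{ .ctl_name = 0 }
};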
@@ -961,28 +989,28 @@ static struct ctl_table kern_table[] = {
961 .child = slow_work_sysctls, 989 .child = slow_work_sysctls,
962 }, 990 },
963#endif 991#endif
964#ifdef CONFIG_PERF_COUNTERS 992#ifdef CONFIG_PERF_EVENTS
965 { 993 {
966 .ctl_name = CTL_UNNUMBERED, 994 .ctl_name = CTL_UNNUMBERED,
967 .procname = "perf_counter_paranoid", 995 .procname = "perf_event_paranoid",
968 .data = &sysctl_perf_counter_paranoid, 996 .data = &sysctl_perf_event_paranoid,
969 .maxlen = sizeof(sysctl_perf_counter_paranoid), 997 .maxlen = sizeof(sysctl_perf_event_paranoid),
970 .mode = 0644, 998 .mode = 0644,
971 .proc_handler = &proc_dointvec, 999 .proc_handler = &proc_dointvec,
972 }, 1000 },
973 { 1001 {
974 .ctl_name = CTL_UNNUMBERED, 1002 .ctl_name = CTL_UNNUMBERED,
975 .procname = "perf_counter_mlock_kb", 1003 .procname = "perf_event_mlock_kb",
976 .data = &sysctl_perf_counter_mlock, 1004 .data = &sysctl_perf_event_mlock,
977 .maxlen = sizeof(sysctl_perf_counter_mlock), 1005 .maxlen = sizeof(sysctl_perf_event_mlock),
978 .mode = 0644, 1006 .mode = 0644,
979 .proc_handler = &proc_dointvec, 1007 .proc_handler = &proc_dointvec,
980 }, 1008 },
981 { 1009 {
982 .ctl_name = CTL_UNNUMBERED, 1010 .ctl_name = CTL_UNNUMBERED,
983 .procname = "perf_counter_max_sample_rate", 1011 .procname = "perf_event_max_sample_rate",
984 .data = &sysctl_perf_counter_sample_rate, 1012 .data = &sysctl_perf_event_sample_rate,
985 .maxlen = sizeof(sysctl_perf_counter_sample_rate), 1013 .maxlen = sizeof(sysctl_perf_event_sample_rate),
986 .mode = 0644, 1014 .mode = 0644,
987 .proc_handler = &proc_dointvec, 1015 .proc_handler = &proc_dointvec,
988 }, 1016 },
@@ -997,7 +1025,16 @@ static struct ctl_table kern_table[] = {
997 .proc_handler = &proc_dointvec, 1025 .proc_handler = &proc_dointvec,
998 }, 1026 },
999#endif 1027#endif
1000 1028#ifdef CONFIG_BLOCK
1029 {
1030 .ctl_name = CTL_UNNUMBERED,
1031 .procname = "blk_iopoll",
1032 .data = &blk_iopoll_enabled,
1033 .maxlen = sizeof(int),
1034 .mode = 0644,
1035 .proc_handler = &proc_dointvec,
1036 },
1037#endif
1001/* 1038/*
1002 * NOTE: do not add new entries to this table unless you have read 1039 * NOTE: do not add new entries to this table unless you have read
1003 * Documentation/sysctl/ctl_unnumbered.txt 1040 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1364,6 +1401,31 @@ static struct ctl_table vm_table[] = {
1364 .mode = 0644, 1401 .mode = 0644,
1365 .proc_handler = &scan_unevictable_handler, 1402 .proc_handler = &scan_unevictable_handler,
1366 }, 1403 },
1404#ifdef CONFIG_MEMORY_FAILURE
1405 {
1406 .ctl_name = CTL_UNNUMBERED,
1407 .procname = "memory_failure_early_kill",
1408 .data = &sysctl_memory_failure_early_kill,
1409 .maxlen = sizeof(sysctl_memory_failure_early_kill),
1410 .mode = 0644,
1411 .proc_handler = &proc_dointvec_minmax,
1412 .strategy = &sysctl_intvec,
1413 .extra1 = &zero,
1414 .extra2 = &one,
1415 },
1416 {
1417 .ctl_name = CTL_UNNUMBERED,
1418 .procname = "memory_failure_recovery",
1419 .data = &sysctl_memory_failure_recovery,
1420 .maxlen = sizeof(sysctl_memory_failure_recovery),
1421 .mode = 0644,
1422 .proc_handler = &proc_dointvec_minmax,
1423 .strategy = &sysctl_intvec,
1424 .extra1 = &zero,
1425 .extra2 = &one,
1426 },
1427#endif
1428
1367/* 1429/*
1368 * NOTE: do not add new entries to this table unless you have read 1430 * NOTE: do not add new entries to this table unless you have read
1369 * Documentation/sysctl/ctl_unnumbered.txt 1431 * Documentation/sysctl/ctl_unnumbered.txt
@@ -2192,7 +2254,7 @@ void sysctl_head_put(struct ctl_table_header *head)
2192#ifdef CONFIG_PROC_SYSCTL 2254#ifdef CONFIG_PROC_SYSCTL
2193 2255
2194static int _proc_do_string(void* data, int maxlen, int write, 2256static int _proc_do_string(void* data, int maxlen, int write,
2195 struct file *filp, void __user *buffer, 2257 void __user *buffer,
2196 size_t *lenp, loff_t *ppos) 2258 size_t *lenp, loff_t *ppos)
2197{ 2259{
2198 size_t len; 2260 size_t len;
@@ -2253,7 +2315,6 @@ static int _proc_do_string(void* data, int maxlen, int write,
2253 * proc_dostring - read a string sysctl 2315 * proc_dostring - read a string sysctl
2254 * @table: the sysctl table 2316 * @table: the sysctl table
2255 * @write: %TRUE if this is a write to the sysctl file 2317 * @write: %TRUE if this is a write to the sysctl file
2256 * @filp: the file structure
2257 * @buffer: the user buffer 2318 * @buffer: the user buffer
2258 * @lenp: the size of the user buffer 2319 * @lenp: the size of the user buffer
2259 * @ppos: file position 2320 * @ppos: file position
@@ -2267,10 +2328,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
2267 * 2328 *
2268 * Returns 0 on success. 2329 * Returns 0 on success.
2269 */ 2330 */
2270int proc_dostring(struct ctl_table *table, int write, struct file *filp, 2331int proc_dostring(struct ctl_table *table, int write,
2271 void __user *buffer, size_t *lenp, loff_t *ppos) 2332 void __user *buffer, size_t *lenp, loff_t *ppos)
2272{ 2333{
2273 return _proc_do_string(table->data, table->maxlen, write, filp, 2334 return _proc_do_string(table->data, table->maxlen, write,
2274 buffer, lenp, ppos); 2335 buffer, lenp, ppos);
2275} 2336}
2276 2337
@@ -2295,7 +2356,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
2295} 2356}
2296 2357
2297static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, 2358static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2298 int write, struct file *filp, void __user *buffer, 2359 int write, void __user *buffer,
2299 size_t *lenp, loff_t *ppos, 2360 size_t *lenp, loff_t *ppos,
2300 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2361 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
2301 int write, void *data), 2362 int write, void *data),
@@ -2402,13 +2463,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2402#undef TMPBUFLEN 2463#undef TMPBUFLEN
2403} 2464}
2404 2465
2405static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2466static int do_proc_dointvec(struct ctl_table *table, int write,
2406 void __user *buffer, size_t *lenp, loff_t *ppos, 2467 void __user *buffer, size_t *lenp, loff_t *ppos,
2407 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2468 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
2408 int write, void *data), 2469 int write, void *data),
2409 void *data) 2470 void *data)
2410{ 2471{
2411 return __do_proc_dointvec(table->data, table, write, filp, 2472 return __do_proc_dointvec(table->data, table, write,
2412 buffer, lenp, ppos, conv, data); 2473 buffer, lenp, ppos, conv, data);
2413} 2474}
2414 2475
@@ -2416,7 +2477,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
2416 * proc_dointvec - read a vector of integers 2477 * proc_dointvec - read a vector of integers
2417 * @table: the sysctl table 2478 * @table: the sysctl table
2418 * @write: %TRUE if this is a write to the sysctl file 2479 * @write: %TRUE if this is a write to the sysctl file
2419 * @filp: the file structure
2420 * @buffer: the user buffer 2480 * @buffer: the user buffer
2421 * @lenp: the size of the user buffer 2481 * @lenp: the size of the user buffer
2422 * @ppos: file position 2482 * @ppos: file position
@@ -2426,10 +2486,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
2426 * 2486 *
2427 * Returns 0 on success. 2487 * Returns 0 on success.
2428 */ 2488 */
2429int proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2489int proc_dointvec(struct ctl_table *table, int write,
2430 void __user *buffer, size_t *lenp, loff_t *ppos) 2490 void __user *buffer, size_t *lenp, loff_t *ppos)
2431{ 2491{
2432 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2492 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2433 NULL,NULL); 2493 NULL,NULL);
2434} 2494}
2435 2495
@@ -2437,7 +2497,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2437 * Taint values can only be increased 2497 * Taint values can only be increased
2438 * This means we can safely use a temporary. 2498 * This means we can safely use a temporary.
2439 */ 2499 */
2440static int proc_taint(struct ctl_table *table, int write, struct file *filp, 2500static int proc_taint(struct ctl_table *table, int write,
2441 void __user *buffer, size_t *lenp, loff_t *ppos) 2501 void __user *buffer, size_t *lenp, loff_t *ppos)
2442{ 2502{
2443 struct ctl_table t; 2503 struct ctl_table t;
@@ -2449,7 +2509,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp,
2449 2509
2450 t = *table; 2510 t = *table;
2451 t.data = &tmptaint; 2511 t.data = &tmptaint;
2452 err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); 2512 err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
2453 if (err < 0) 2513 if (err < 0)
2454 return err; 2514 return err;
2455 2515
@@ -2501,7 +2561,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2501 * proc_dointvec_minmax - read a vector of integers with min/max values 2561 * proc_dointvec_minmax - read a vector of integers with min/max values
2502 * @table: the sysctl table 2562 * @table: the sysctl table
2503 * @write: %TRUE if this is a write to the sysctl file 2563 * @write: %TRUE if this is a write to the sysctl file
2504 * @filp: the file structure
2505 * @buffer: the user buffer 2564 * @buffer: the user buffer
2506 * @lenp: the size of the user buffer 2565 * @lenp: the size of the user buffer
2507 * @ppos: file position 2566 * @ppos: file position
@@ -2514,19 +2573,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2514 * 2573 *
2515 * Returns 0 on success. 2574 * Returns 0 on success.
2516 */ 2575 */
2517int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2576int proc_dointvec_minmax(struct ctl_table *table, int write,
2518 void __user *buffer, size_t *lenp, loff_t *ppos) 2577 void __user *buffer, size_t *lenp, loff_t *ppos)
2519{ 2578{
2520 struct do_proc_dointvec_minmax_conv_param param = { 2579 struct do_proc_dointvec_minmax_conv_param param = {
2521 .min = (int *) table->extra1, 2580 .min = (int *) table->extra1,
2522 .max = (int *) table->extra2, 2581 .max = (int *) table->extra2,
2523 }; 2582 };
2524 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2583 return do_proc_dointvec(table, write, buffer, lenp, ppos,
2525 do_proc_dointvec_minmax_conv, &param); 2584 do_proc_dointvec_minmax_conv, &param);
2526} 2585}
2527 2586
2528static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2587static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2529 struct file *filp,
2530 void __user *buffer, 2588 void __user *buffer,
2531 size_t *lenp, loff_t *ppos, 2589 size_t *lenp, loff_t *ppos,
2532 unsigned long convmul, 2590 unsigned long convmul,
@@ -2631,21 +2689,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2631} 2689}
2632 2690
2633static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, 2691static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2634 struct file *filp,
2635 void __user *buffer, 2692 void __user *buffer,
2636 size_t *lenp, loff_t *ppos, 2693 size_t *lenp, loff_t *ppos,
2637 unsigned long convmul, 2694 unsigned long convmul,
2638 unsigned long convdiv) 2695 unsigned long convdiv)
2639{ 2696{
2640 return __do_proc_doulongvec_minmax(table->data, table, write, 2697 return __do_proc_doulongvec_minmax(table->data, table, write,
2641 filp, buffer, lenp, ppos, convmul, convdiv); 2698 buffer, lenp, ppos, convmul, convdiv);
2642} 2699}
2643 2700
2644/** 2701/**
2645 * proc_doulongvec_minmax - read a vector of long integers with min/max values 2702 * proc_doulongvec_minmax - read a vector of long integers with min/max values
2646 * @table: the sysctl table 2703 * @table: the sysctl table
2647 * @write: %TRUE if this is a write to the sysctl file 2704 * @write: %TRUE if this is a write to the sysctl file
2648 * @filp: the file structure
2649 * @buffer: the user buffer 2705 * @buffer: the user buffer
2650 * @lenp: the size of the user buffer 2706 * @lenp: the size of the user buffer
2651 * @ppos: file position 2707 * @ppos: file position
@@ -2658,17 +2714,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2658 * 2714 *
2659 * Returns 0 on success. 2715 * Returns 0 on success.
2660 */ 2716 */
2661int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, 2717int proc_doulongvec_minmax(struct ctl_table *table, int write,
2662 void __user *buffer, size_t *lenp, loff_t *ppos) 2718 void __user *buffer, size_t *lenp, loff_t *ppos)
2663{ 2719{
2664 return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); 2720 return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
2665} 2721}
2666 2722
2667/** 2723/**
2668 * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values 2724 * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
2669 * @table: the sysctl table 2725 * @table: the sysctl table
2670 * @write: %TRUE if this is a write to the sysctl file 2726 * @write: %TRUE if this is a write to the sysctl file
2671 * @filp: the file structure
2672 * @buffer: the user buffer 2727 * @buffer: the user buffer
2673 * @lenp: the size of the user buffer 2728 * @lenp: the size of the user buffer
2674 * @ppos: file position 2729 * @ppos: file position
@@ -2683,11 +2738,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp
2683 * Returns 0 on success. 2738 * Returns 0 on success.
2684 */ 2739 */
2685int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, 2740int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2686 struct file *filp,
2687 void __user *buffer, 2741 void __user *buffer,
2688 size_t *lenp, loff_t *ppos) 2742 size_t *lenp, loff_t *ppos)
2689{ 2743{
2690 return do_proc_doulongvec_minmax(table, write, filp, buffer, 2744 return do_proc_doulongvec_minmax(table, write, buffer,
2691 lenp, ppos, HZ, 1000l); 2745 lenp, ppos, HZ, 1000l);
2692} 2746}
2693 2747
@@ -2763,7 +2817,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2763 * proc_dointvec_jiffies - read a vector of integers as seconds 2817 * proc_dointvec_jiffies - read a vector of integers as seconds
2764 * @table: the sysctl table 2818 * @table: the sysctl table
2765 * @write: %TRUE if this is a write to the sysctl file 2819 * @write: %TRUE if this is a write to the sysctl file
2766 * @filp: the file structure
2767 * @buffer: the user buffer 2820 * @buffer: the user buffer
2768 * @lenp: the size of the user buffer 2821 * @lenp: the size of the user buffer
2769 * @ppos: file position 2822 * @ppos: file position
@@ -2775,10 +2828,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2775 * 2828 *
2776 * Returns 0 on success. 2829 * Returns 0 on success.
2777 */ 2830 */
2778int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, 2831int proc_dointvec_jiffies(struct ctl_table *table, int write,
2779 void __user *buffer, size_t *lenp, loff_t *ppos) 2832 void __user *buffer, size_t *lenp, loff_t *ppos)
2780{ 2833{
2781 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2834 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2782 do_proc_dointvec_jiffies_conv,NULL); 2835 do_proc_dointvec_jiffies_conv,NULL);
2783} 2836}
2784 2837
@@ -2786,7 +2839,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2786 * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds 2839 * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
2787 * @table: the sysctl table 2840 * @table: the sysctl table
2788 * @write: %TRUE if this is a write to the sysctl file 2841 * @write: %TRUE if this is a write to the sysctl file
2789 * @filp: the file structure
2790 * @buffer: the user buffer 2842 * @buffer: the user buffer
2791 * @lenp: the size of the user buffer 2843 * @lenp: the size of the user buffer
2792 * @ppos: pointer to the file position 2844 * @ppos: pointer to the file position
@@ -2798,10 +2850,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2798 * 2850 *
2799 * Returns 0 on success. 2851 * Returns 0 on success.
2800 */ 2852 */
2801int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, 2853int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
2802 void __user *buffer, size_t *lenp, loff_t *ppos) 2854 void __user *buffer, size_t *lenp, loff_t *ppos)
2803{ 2855{
2804 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2856 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2805 do_proc_dointvec_userhz_jiffies_conv,NULL); 2857 do_proc_dointvec_userhz_jiffies_conv,NULL);
2806} 2858}
2807 2859
@@ -2809,7 +2861,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
2809 * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds 2861 * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
2810 * @table: the sysctl table 2862 * @table: the sysctl table
2811 * @write: %TRUE if this is a write to the sysctl file 2863 * @write: %TRUE if this is a write to the sysctl file
2812 * @filp: the file structure
2813 * @buffer: the user buffer 2864 * @buffer: the user buffer
2814 * @lenp: the size of the user buffer 2865 * @lenp: the size of the user buffer
2815 * @ppos: file position 2866 * @ppos: file position
@@ -2822,14 +2873,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
2822 * 2873 *
2823 * Returns 0 on success. 2874 * Returns 0 on success.
2824 */ 2875 */
2825int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, 2876int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
2826 void __user *buffer, size_t *lenp, loff_t *ppos) 2877 void __user *buffer, size_t *lenp, loff_t *ppos)
2827{ 2878{
2828 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2879 return do_proc_dointvec(table, write, buffer, lenp, ppos,
2829 do_proc_dointvec_ms_jiffies_conv, NULL); 2880 do_proc_dointvec_ms_jiffies_conv, NULL);
2830} 2881}
2831 2882
2832static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 2883static int proc_do_cad_pid(struct ctl_table *table, int write,
2833 void __user *buffer, size_t *lenp, loff_t *ppos) 2884 void __user *buffer, size_t *lenp, loff_t *ppos)
2834{ 2885{
2835 struct pid *new_pid; 2886 struct pid *new_pid;
@@ -2838,7 +2889,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2838 2889
2839 tmp = pid_vnr(cad_pid); 2890 tmp = pid_vnr(cad_pid);
2840 2891
2841 r = __do_proc_dointvec(&tmp, table, write, filp, buffer, 2892 r = __do_proc_dointvec(&tmp, table, write, buffer,
2842 lenp, ppos, NULL, NULL); 2893 lenp, ppos, NULL, NULL);
2843 if (r || !write) 2894 if (r || !write)
2844 return r; 2895 return r;
@@ -2853,50 +2904,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2853 2904
2854#else /* CONFIG_PROC_FS */ 2905#else /* CONFIG_PROC_FS */
2855 2906
2856int proc_dostring(struct ctl_table *table, int write, struct file *filp, 2907int proc_dostring(struct ctl_table *table, int write,
2857 void __user *buffer, size_t *lenp, loff_t *ppos) 2908 void __user *buffer, size_t *lenp, loff_t *ppos)
2858{ 2909{
2859 return -ENOSYS; 2910 return -ENOSYS;
2860} 2911}
2861 2912
2862int proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2913int proc_dointvec(struct ctl_table *table, int write,
2863 void __user *buffer, size_t *lenp, loff_t *ppos) 2914 void __user *buffer, size_t *lenp, loff_t *ppos)
2864{ 2915{
2865 return -ENOSYS; 2916 return -ENOSYS;
2866} 2917}
2867 2918
2868int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2919int proc_dointvec_minmax(struct ctl_table *table, int write,
2869 void __user *buffer, size_t *lenp, loff_t *ppos) 2920 void __user *buffer, size_t *lenp, loff_t *ppos)
2870{ 2921{
2871 return -ENOSYS; 2922 return -ENOSYS;
2872} 2923}
2873 2924
2874int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, 2925int proc_dointvec_jiffies(struct ctl_table *table, int write,
2875 void __user *buffer, size_t *lenp, loff_t *ppos) 2926 void __user *buffer, size_t *lenp, loff_t *ppos)
2876{ 2927{
2877 return -ENOSYS; 2928 return -ENOSYS;
2878} 2929}
2879 2930
2880int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, 2931int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
2881 void __user *buffer, size_t *lenp, loff_t *ppos) 2932 void __user *buffer, size_t *lenp, loff_t *ppos)
2882{ 2933{
2883 return -ENOSYS; 2934 return -ENOSYS;
2884} 2935}
2885 2936
2886int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, 2937int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
2887 void __user *buffer, size_t *lenp, loff_t *ppos) 2938 void __user *buffer, size_t *lenp, loff_t *ppos)
2888{ 2939{
2889 return -ENOSYS; 2940 return -ENOSYS;
2890} 2941}
2891 2942
2892int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, 2943int proc_doulongvec_minmax(struct ctl_table *table, int write,
2893 void __user *buffer, size_t *lenp, loff_t *ppos) 2944 void __user *buffer, size_t *lenp, loff_t *ppos)
2894{ 2945{
2895 return -ENOSYS; 2946 return -ENOSYS;
2896} 2947}
2897 2948
2898int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, 2949int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2899 struct file *filp,
2900 void __user *buffer, 2950 void __user *buffer,
2901 size_t *lenp, loff_t *ppos) 2951 size_t *lenp, loff_t *ppos)
2902{ 2952{
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b38423ca711a..b6e7aaea4604 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1521,7 +1521,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1521 if (!table->ctl_name && table->strategy) 1521 if (!table->ctl_name && table->strategy)
1522 set_fail(&fail, table, "Strategy without ctl_name"); 1522 set_fail(&fail, table, "Strategy without ctl_name");
1523#endif 1523#endif
1524#ifdef CONFIG_PROC_FS 1524#ifdef CONFIG_PROC_SYSCTL
1525 if (table->procname && !table->proc_handler) 1525 if (table->procname && !table->proc_handler)
1526 set_fail(&fail, table, "No proc_handler"); 1526 set_fail(&fail, table, "No proc_handler");
1527#endif 1527#endif
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 888adbcca30c..ea8384d3caa7 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -108,7 +108,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
108/* 108/*
109 * Send taskstats data in @skb to listener with nl_pid @pid 109 * Send taskstats data in @skb to listener with nl_pid @pid
110 */ 110 */
111static int send_reply(struct sk_buff *skb, pid_t pid) 111static int send_reply(struct sk_buff *skb, struct genl_info *info)
112{ 112{
113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
114 void *reply = genlmsg_data(genlhdr); 114 void *reply = genlmsg_data(genlhdr);
@@ -120,7 +120,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
120 return rc; 120 return rc;
121 } 121 }
122 122
123 return genlmsg_unicast(skb, pid); 123 return genlmsg_reply(skb, info);
124} 124}
125 125
126/* 126/*
@@ -150,7 +150,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
150 if (!skb_next) 150 if (!skb_next)
151 break; 151 break;
152 } 152 }
153 rc = genlmsg_unicast(skb_cur, s->pid); 153 rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
154 if (rc == -ECONNREFUSED) { 154 if (rc == -ECONNREFUSED) {
155 s->valid = 0; 155 s->valid = 0;
156 delcount++; 156 delcount++;
@@ -418,7 +418,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
418 goto err; 418 goto err;
419 } 419 }
420 420
421 rc = send_reply(rep_skb, info->snd_pid); 421 rc = send_reply(rep_skb, info);
422 422
423err: 423err:
424 fput_light(file, fput_needed); 424 fput_light(file, fput_needed);
@@ -487,7 +487,7 @@ free_return_rc:
487 } else 487 } else
488 goto err; 488 goto err;
489 489
490 return send_reply(rep_skb, info->snd_pid); 490 return send_reply(rep_skb, info);
491err: 491err:
492 nlmsg_free(rep_skb); 492 nlmsg_free(rep_skb);
493 return rc; 493 return rc;
diff --git a/kernel/time.c b/kernel/time.c
index 29511943871a..804798005d19 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime);
370 * 0 <= tv_nsec < NSEC_PER_SEC 370 * 0 <= tv_nsec < NSEC_PER_SEC
371 * For negative values only the tv_sec field is negative ! 371 * For negative values only the tv_sec field is negative !
372 */ 372 */
373void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) 373void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
374{ 374{
375 while (nsec >= NSEC_PER_SEC) { 375 while (nsec >= NSEC_PER_SEC) {
376 /*
377 * The following asm() prevents the compiler from
378 * optimising this loop into a modulo operation. See
379 * also __iter_div_u64_rem() in include/linux/time.h
380 */
381 asm("" : "+rm"(nsec));
376 nsec -= NSEC_PER_SEC; 382 nsec -= NSEC_PER_SEC;
377 ++sec; 383 ++sec;
378 } 384 }
379 while (nsec < 0) { 385 while (nsec < 0) {
386 asm("" : "+rm"(nsec));
380 nsec += NSEC_PER_SEC; 387 nsec += NSEC_PER_SEC;
381 --sec; 388 --sec;
382 } 389 }
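The empty asm() with a "+rm" constraint makes nsec opaque to the optimizer on each iteration, so the normalization loop cannot be strength-reduced into a 64-bit divide/modulo (expensive on 32-bit targets when nsec is usually small). A standalone userspace illustration of the idiom, for reference only:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000LL

static void normalize(long long *sec, long long *nsec)
{
	while (*nsec >= NSEC_PER_SEC) {
		asm("" : "+rm"(*nsec));	/* opaque to the optimizer */
		*nsec -= NSEC_PER_SEC;
		++*sec;
	}
}

int main(void)
{
	long long sec = 0, nsec = 2500000000LL;

	normalize(&sec, &nsec);
	printf("%lld.%09lld\n", sec, nsec);	/* prints 2.500000000 */
	return 0;
}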
@@ -655,6 +662,36 @@ u64 nsec_to_clock_t(u64 x)
655#endif 662#endif
656} 663}
657 664
665/**
666 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
667 *
668 * @n: nsecs in u64
669 *
670 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
671 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
672 * for scheduler, not for use in device drivers to calculate timeout value.
673 *
674 * note:
675 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
676 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
677 */
678unsigned long nsecs_to_jiffies(u64 n)
679{
680#if (NSEC_PER_SEC % HZ) == 0
681 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
682 return div_u64(n, NSEC_PER_SEC / HZ);
683#elif (HZ % 512) == 0
684 /* overflow after 292 years if HZ = 1024 */
685 return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
686#else
687 /*
688 * Generic case - optimized for cases where HZ is a multiple of 3.
689 * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
690 */
691 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
692#endif
693}
694
658#if (BITS_PER_LONG < 64) 695#if (BITS_PER_LONG < 64)
659u64 get_jiffies_64(void) 696u64 get_jiffies_64(void)
660{ 697{
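The three nsecs_to_jiffies() branches above trade precision against overflow range depending on how HZ divides NSEC_PER_SEC. A small userspace check that mirrors the three expressions with plain arithmetic (not kernel code) and shows what they compute for 1 ms worth of nanoseconds:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

static unsigned long long to_jiffies(unsigned long long n, unsigned long hz)
{
	if (NSEC_PER_SEC % hz == 0)			/* e.g. HZ = 100, 250, 1000 */
		return n / (NSEC_PER_SEC / hz);
	if (hz % 512 == 0)				/* e.g. HZ = 1024 */
		return n * hz / 512 / (NSEC_PER_SEC / 512);
	/* generic case, e.g. HZ = 60, 72, 90, 120 */
	return n * 9 / ((9ULL * NSEC_PER_SEC + hz / 2) / hz);
}

int main(void)
{
	printf("HZ=1000: %llu jiffies\n", to_jiffies(1000000ULL, 1000));	/* 1 */
	printf("HZ=1024: %llu jiffies\n", to_jiffies(1000000ULL, 1024));	/* 1 */
	printf("HZ=100 : %llu jiffies\n", to_jiffies(1000000ULL, 100));	/* 0 */
	return 0;
}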
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 0b0a6366c9d4..ee266620b06c 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7466cb811251..5e18c6ab2c6a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -21,7 +21,6 @@
21 * 21 *
22 * TODO WishList: 22 * TODO WishList:
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 * o get rid of clocksource_jiffies extern
25 */ 24 */
26 25
27#include <linux/clocksource.h> 26#include <linux/clocksource.h>
@@ -30,6 +29,7 @@
30#include <linux/module.h> 29#include <linux/module.h>
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h> 31#include <linux/tick.h>
32#include <linux/kthread.h>
33 33
34void timecounter_init(struct timecounter *tc, 34void timecounter_init(struct timecounter *tc,
35 const struct cyclecounter *cc, 35 const struct cyclecounter *cc,
@@ -107,50 +107,35 @@ u64 timecounter_cyc2time(struct timecounter *tc,
107} 107}
108EXPORT_SYMBOL(timecounter_cyc2time); 108EXPORT_SYMBOL(timecounter_cyc2time);
109 109
110/* XXX - Would like a better way for initializing curr_clocksource */
111extern struct clocksource clocksource_jiffies;
112
113/*[Clocksource internal variables]--------- 110/*[Clocksource internal variables]---------
114 * curr_clocksource: 111 * curr_clocksource:
115 * currently selected clocksource. Initialized to clocksource_jiffies. 112 * currently selected clocksource.
116 * next_clocksource:
117 * pending next selected clocksource.
118 * clocksource_list: 113 * clocksource_list:
119 * linked list with the registered clocksources 114 * linked list with the registered clocksources
120 * clocksource_lock: 115 * clocksource_mutex:
121 * protects manipulations to curr_clocksource and next_clocksource 116 * protects manipulations to curr_clocksource and the clocksource_list
122 * and the clocksource_list
123 * override_name: 117 * override_name:
124 * Name of the user-specified clocksource. 118 * Name of the user-specified clocksource.
125 */ 119 */
126static struct clocksource *curr_clocksource = &clocksource_jiffies; 120static struct clocksource *curr_clocksource;
127static struct clocksource *next_clocksource;
128static struct clocksource *clocksource_override;
129static LIST_HEAD(clocksource_list); 121static LIST_HEAD(clocksource_list);
130static DEFINE_SPINLOCK(clocksource_lock); 122static DEFINE_MUTEX(clocksource_mutex);
131static char override_name[32]; 123static char override_name[32];
132static int finished_booting; 124static int finished_booting;
133 125
134/* clocksource_done_booting - Called near the end of core bootup
135 *
136 * Hack to avoid lots of clocksource churn at boot time.
137 * We use fs_initcall because we want this to start before
138 * device_initcall but after subsys_initcall.
139 */
140static int __init clocksource_done_booting(void)
141{
142 finished_booting = 1;
143 return 0;
144}
145fs_initcall(clocksource_done_booting);
146
147#ifdef CONFIG_CLOCKSOURCE_WATCHDOG 126#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
127static void clocksource_watchdog_work(struct work_struct *work);
128
148static LIST_HEAD(watchdog_list); 129static LIST_HEAD(watchdog_list);
149static struct clocksource *watchdog; 130static struct clocksource *watchdog;
150static struct timer_list watchdog_timer; 131static struct timer_list watchdog_timer;
132static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
151static DEFINE_SPINLOCK(watchdog_lock); 133static DEFINE_SPINLOCK(watchdog_lock);
152static cycle_t watchdog_last; 134static cycle_t watchdog_last;
153static unsigned long watchdog_resumed; 135static int watchdog_running;
136
137static int clocksource_watchdog_kthread(void *data);
138static void __clocksource_change_rating(struct clocksource *cs, int rating);
154 139
155/* 140/*
156 * Interval: 0.5sec Threshold: 0.0625s 141 * Interval: 0.5sec Threshold: 0.0625s
@@ -158,135 +143,249 @@ static unsigned long watchdog_resumed;
158#define WATCHDOG_INTERVAL (HZ >> 1) 143#define WATCHDOG_INTERVAL (HZ >> 1)
159#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) 144#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
160 145
161static void clocksource_ratewd(struct clocksource *cs, int64_t delta) 146static void clocksource_watchdog_work(struct work_struct *work)
162{ 147{
163 if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) 148 /*
164 return; 149 * If kthread_run fails the next watchdog scan over the
150 * watchdog_list will find the unstable clock again.
151 */
152 kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
153}
154
155static void __clocksource_unstable(struct clocksource *cs)
156{
157 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
158 cs->flags |= CLOCK_SOURCE_UNSTABLE;
159 if (finished_booting)
160 schedule_work(&watchdog_work);
161}
165 162
163static void clocksource_unstable(struct clocksource *cs, int64_t delta)
164{
166 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", 165 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
167 cs->name, delta); 166 cs->name, delta);
168 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); 167 __clocksource_unstable(cs);
169 clocksource_change_rating(cs, 0); 168}
170 list_del(&cs->wd_list); 169
170/**
171 * clocksource_mark_unstable - mark clocksource unstable via watchdog
172 * @cs: clocksource to be marked unstable
173 *
174 * This function is called instead of clocksource_change_rating from
175 * cpu hotplug code to avoid a deadlock between the clocksource mutex
176 * and the cpu hotplug mutex. It defers the update of the clocksource
177 * to the watchdog thread.
178 */
179void clocksource_mark_unstable(struct clocksource *cs)
180{
181 unsigned long flags;
182
183 spin_lock_irqsave(&watchdog_lock, flags);
184 if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
185 if (list_empty(&cs->wd_list))
186 list_add(&cs->wd_list, &watchdog_list);
187 __clocksource_unstable(cs);
188 }
189 spin_unlock_irqrestore(&watchdog_lock, flags);
171} 190}
172 191
173static void clocksource_watchdog(unsigned long data) 192static void clocksource_watchdog(unsigned long data)
174{ 193{
175 struct clocksource *cs, *tmp; 194 struct clocksource *cs;
176 cycle_t csnow, wdnow; 195 cycle_t csnow, wdnow;
177 int64_t wd_nsec, cs_nsec; 196 int64_t wd_nsec, cs_nsec;
178 int resumed; 197 int next_cpu;
179 198
180 spin_lock(&watchdog_lock); 199 spin_lock(&watchdog_lock);
181 200 if (!watchdog_running)
182 resumed = test_and_clear_bit(0, &watchdog_resumed); 201 goto out;
183 202
184 wdnow = watchdog->read(watchdog); 203 wdnow = watchdog->read(watchdog);
185 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); 204 wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
205 watchdog->mult, watchdog->shift);
186 watchdog_last = wdnow; 206 watchdog_last = wdnow;
187 207
188 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { 208 list_for_each_entry(cs, &watchdog_list, wd_list) {
189 csnow = cs->read(cs);
190 209
191 if (unlikely(resumed)) { 210 /* Clocksource already marked unstable? */
192 cs->wd_last = csnow; 211 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
212 if (finished_booting)
213 schedule_work(&watchdog_work);
193 continue; 214 continue;
194 } 215 }
195 216
196 /* Initialized ? */ 217 csnow = cs->read(cs);
218
219 /* Clocksource initialized ? */
197 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 220 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
198 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
199 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
200 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
201 /*
202 * We just marked the clocksource as
203 * highres-capable, notify the rest of the
204 * system as well so that we transition
205 * into high-res mode:
206 */
207 tick_clock_notify();
208 }
209 cs->flags |= CLOCK_SOURCE_WATCHDOG; 221 cs->flags |= CLOCK_SOURCE_WATCHDOG;
210 cs->wd_last = csnow; 222 cs->wd_last = csnow;
211 } else { 223 continue;
212 cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
213 cs->wd_last = csnow;
214 /* Check the delta. Might remove from the list ! */
215 clocksource_ratewd(cs, cs_nsec - wd_nsec);
216 } 224 }
217 }
218 225
219 if (!list_empty(&watchdog_list)) { 226 /* Check the deviation from the watchdog clocksource. */
220 /* 227 cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
221 * Cycle through CPUs to check if the CPUs stay 228 cs->mask, cs->mult, cs->shift);
222 * synchronized to each other. 229 cs->wd_last = csnow;
223 */ 230 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
224 int next_cpu = cpumask_next(raw_smp_processor_id(), 231 clocksource_unstable(cs, cs_nsec - wd_nsec);
225 cpu_online_mask); 232 continue;
233 }
226 234
227 if (next_cpu >= nr_cpu_ids) 235 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
228 next_cpu = cpumask_first(cpu_online_mask); 236 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
229 watchdog_timer.expires += WATCHDOG_INTERVAL; 237 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
230 add_timer_on(&watchdog_timer, next_cpu); 238 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
239 /*
240 * We just marked the clocksource as highres-capable,
241 * notify the rest of the system as well so that we
242 * transition into high-res mode:
243 */
244 tick_clock_notify();
245 }
231 } 246 }
247
248 /*
249 * Cycle through CPUs to check if the CPUs stay synchronized
250 * to each other.
251 */
252 next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
253 if (next_cpu >= nr_cpu_ids)
254 next_cpu = cpumask_first(cpu_online_mask);
255 watchdog_timer.expires += WATCHDOG_INTERVAL;
256 add_timer_on(&watchdog_timer, next_cpu);
257out:
232 spin_unlock(&watchdog_lock); 258 spin_unlock(&watchdog_lock);
233} 259}
260
261static inline void clocksource_start_watchdog(void)
262{
263 if (watchdog_running || !watchdog || list_empty(&watchdog_list))
264 return;
265 init_timer(&watchdog_timer);
266 watchdog_timer.function = clocksource_watchdog;
267 watchdog_last = watchdog->read(watchdog);
268 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
269 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
270 watchdog_running = 1;
271}
272
273static inline void clocksource_stop_watchdog(void)
274{
275 if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
276 return;
277 del_timer(&watchdog_timer);
278 watchdog_running = 0;
279}
280
281static inline void clocksource_reset_watchdog(void)
282{
283 struct clocksource *cs;
284
285 list_for_each_entry(cs, &watchdog_list, wd_list)
286 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
287}
288
234static void clocksource_resume_watchdog(void) 289static void clocksource_resume_watchdog(void)
235{ 290{
236 set_bit(0, &watchdog_resumed); 291 unsigned long flags;
292
293 spin_lock_irqsave(&watchdog_lock, flags);
294 clocksource_reset_watchdog();
295 spin_unlock_irqrestore(&watchdog_lock, flags);
237} 296}
238 297
239static void clocksource_check_watchdog(struct clocksource *cs) 298static void clocksource_enqueue_watchdog(struct clocksource *cs)
240{ 299{
241 struct clocksource *cse;
242 unsigned long flags; 300 unsigned long flags;
243 301
244 spin_lock_irqsave(&watchdog_lock, flags); 302 spin_lock_irqsave(&watchdog_lock, flags);
245 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { 303 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
246 int started = !list_empty(&watchdog_list); 304 /* cs is a clocksource to be watched. */
247
248 list_add(&cs->wd_list, &watchdog_list); 305 list_add(&cs->wd_list, &watchdog_list);
249 if (!started && watchdog) { 306 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
250 watchdog_last = watchdog->read(watchdog);
251 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
252 add_timer_on(&watchdog_timer,
253 cpumask_first(cpu_online_mask));
254 }
255 } else { 307 } else {
308 /* cs is a watchdog. */
256 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 309 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
257 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 310 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
258 311 /* Pick the best watchdog. */
259 if (!watchdog || cs->rating > watchdog->rating) { 312 if (!watchdog || cs->rating > watchdog->rating) {
260 if (watchdog)
261 del_timer(&watchdog_timer);
262 watchdog = cs; 313 watchdog = cs;
263 init_timer(&watchdog_timer);
264 watchdog_timer.function = clocksource_watchdog;
265
266 /* Reset watchdog cycles */ 314 /* Reset watchdog cycles */
267 list_for_each_entry(cse, &watchdog_list, wd_list) 315 clocksource_reset_watchdog();
268 cse->flags &= ~CLOCK_SOURCE_WATCHDOG; 316 }
269 /* Start if list is not empty */ 317 }
270 if (!list_empty(&watchdog_list)) { 318 /* Check if the watchdog timer needs to be started. */
271 watchdog_last = watchdog->read(watchdog); 319 clocksource_start_watchdog();
272 watchdog_timer.expires = 320 spin_unlock_irqrestore(&watchdog_lock, flags);
273 jiffies + WATCHDOG_INTERVAL; 321}
274 add_timer_on(&watchdog_timer, 322
275 cpumask_first(cpu_online_mask)); 323static void clocksource_dequeue_watchdog(struct clocksource *cs)
276 } 324{
325 struct clocksource *tmp;
326 unsigned long flags;
327
328 spin_lock_irqsave(&watchdog_lock, flags);
329 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
330 /* cs is a watched clocksource. */
331 list_del_init(&cs->wd_list);
332 } else if (cs == watchdog) {
333 /* Reset watchdog cycles */
334 clocksource_reset_watchdog();
335 /* Current watchdog is removed. Find an alternative. */
336 watchdog = NULL;
337 list_for_each_entry(tmp, &clocksource_list, list) {
338 if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
339 continue;
340 if (!watchdog || tmp->rating > watchdog->rating)
341 watchdog = tmp;
277 } 342 }
278 } 343 }
344 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
345 /* Check if the watchdog timer needs to be stopped. */
346 clocksource_stop_watchdog();
279 spin_unlock_irqrestore(&watchdog_lock, flags); 347 spin_unlock_irqrestore(&watchdog_lock, flags);
280} 348}
281#else 349
282static void clocksource_check_watchdog(struct clocksource *cs) 350static int clocksource_watchdog_kthread(void *data)
351{
352 struct clocksource *cs, *tmp;
353 unsigned long flags;
354 LIST_HEAD(unstable);
355
356 mutex_lock(&clocksource_mutex);
357 spin_lock_irqsave(&watchdog_lock, flags);
358 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
359 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
360 list_del_init(&cs->wd_list);
361 list_add(&cs->wd_list, &unstable);
362 }
363 /* Check if the watchdog timer needs to be stopped. */
364 clocksource_stop_watchdog();
365 spin_unlock_irqrestore(&watchdog_lock, flags);
366
367 /* Needs to be done outside of watchdog lock */
368 list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
369 list_del_init(&cs->wd_list);
370 __clocksource_change_rating(cs, 0);
371 }
372 mutex_unlock(&clocksource_mutex);
373 return 0;
374}
375
376#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
377
378static void clocksource_enqueue_watchdog(struct clocksource *cs)
283{ 379{
284 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 380 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
285 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 381 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
286} 382}
287 383
384static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
288static inline void clocksource_resume_watchdog(void) { } 385static inline void clocksource_resume_watchdog(void) { }
289#endif 386static inline int clocksource_watchdog_kthread(void *data) { return 0; }
387
388#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
290 389
291/** 390/**
292 * clocksource_resume - resume the clocksource(s) 391 * clocksource_resume - resume the clocksource(s)
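The watchdog rework above replaces inline re-rating from timer context with a two-stage deferral: the timer (atomic, cannot sleep) schedules a work item, and the work item spawns a kthread that may sleep and take clocksource_mutex. A generic sketch of that pattern; all my_* names are hypothetical:

#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(my_mutex);

static int my_kthread(void *data)
{
	mutex_lock(&my_mutex);		/* process context: may sleep and lock */
	/* ... heavy cleanup, e.g. re-rating an unstable clocksource ... */
	mutex_unlock(&my_mutex);
	return 0;
}

static void my_work_fn(struct work_struct *work)
{
	/* If kthread_run() fails, the next timer tick simply retries. */
	kthread_run(my_kthread, NULL, "my_kthread");
}

static DECLARE_WORK(my_work, my_work_fn);

static struct timer_list my_timer;

static void my_timer_fn(unsigned long data)
{
	schedule_work(&my_work);	/* atomic context: defer the real work */
	mod_timer(&my_timer, jiffies + HZ);
}

/* Called from the driver's init path. */
static int __init my_init(void)
{
	setup_timer(&my_timer, my_timer_fn, 0);
	mod_timer(&my_timer, jiffies + HZ);
	return 0;
}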
@@ -294,18 +393,12 @@ static inline void clocksource_resume_watchdog(void) { }
294void clocksource_resume(void) 393void clocksource_resume(void)
295{ 394{
296 struct clocksource *cs; 395 struct clocksource *cs;
297 unsigned long flags;
298 396
299 spin_lock_irqsave(&clocksource_lock, flags); 397 list_for_each_entry(cs, &clocksource_list, list)
300
301 list_for_each_entry(cs, &clocksource_list, list) {
302 if (cs->resume) 398 if (cs->resume)
303 cs->resume(); 399 cs->resume();
304 }
305 400
306 clocksource_resume_watchdog(); 401 clocksource_resume_watchdog();
307
308 spin_unlock_irqrestore(&clocksource_lock, flags);
309} 402}
310 403
311/** 404/**
@@ -320,75 +413,94 @@ void clocksource_touch_watchdog(void)
320 clocksource_resume_watchdog(); 413 clocksource_resume_watchdog();
321} 414}
322 415
416#ifdef CONFIG_GENERIC_TIME
417
323/** 418/**
324 * clocksource_get_next - Returns the selected clocksource 419 * clocksource_select - Select the best clocksource available
325 * 420 *
421 * Private function. Must hold clocksource_mutex when called.
422 *
423 * Select the clocksource with the best rating, or the clocksource,
424 * which is selected by userspace override.
326 */ 425 */
327struct clocksource *clocksource_get_next(void) 426static void clocksource_select(void)
328{ 427{
329 unsigned long flags; 428 struct clocksource *best, *cs;
330 429
331 spin_lock_irqsave(&clocksource_lock, flags); 430 if (!finished_booting || list_empty(&clocksource_list))
332 if (next_clocksource && finished_booting) { 431 return;
333 curr_clocksource = next_clocksource; 432 /* First clocksource on the list has the best rating. */
334 next_clocksource = NULL; 433 best = list_first_entry(&clocksource_list, struct clocksource, list);
434 /* Check for the override clocksource. */
435 list_for_each_entry(cs, &clocksource_list, list) {
436 if (strcmp(cs->name, override_name) != 0)
437 continue;
438 /*
439 * Check to make sure we don't switch to a non-highres
440 * capable clocksource if the tick code is in oneshot
441 * mode (highres or nohz)
442 */
443 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
444 tick_oneshot_mode_active()) {
445 /* Override clocksource cannot be used. */
446 printk(KERN_WARNING "Override clocksource %s is not "
447 "HRT compatible. Cannot switch while in "
448 "HRT/NOHZ mode\n", cs->name);
449 override_name[0] = 0;
450 } else
451 /* Override clocksource can be used. */
452 best = cs;
453 break;
454 }
455 if (curr_clocksource != best) {
456 printk(KERN_INFO "Switching to clocksource %s\n", best->name);
457 curr_clocksource = best;
458 timekeeping_notify(curr_clocksource);
335 } 459 }
336 spin_unlock_irqrestore(&clocksource_lock, flags);
337
338 return curr_clocksource;
339} 460}
340 461
341/** 462#else /* CONFIG_GENERIC_TIME */
342 * select_clocksource - Selects the best registered clocksource. 463
343 * 464static inline void clocksource_select(void) { }
344 * Private function. Must hold clocksource_lock when called. 465
466#endif
467
468/*
469 * clocksource_done_booting - Called near the end of core bootup
345 * 470 *
346 * Select the clocksource with the best rating, or the clocksource, 471 * Hack to avoid lots of clocksource churn at boot time.
347 * which is selected by userspace override. 472 * We use fs_initcall because we want this to start before
473 * device_initcall but after subsys_initcall.
348 */ 474 */
349static struct clocksource *select_clocksource(void) 475static int __init clocksource_done_booting(void)
350{ 476{
351 struct clocksource *next; 477 finished_booting = 1;
352
353 if (list_empty(&clocksource_list))
354 return NULL;
355
356 if (clocksource_override)
357 next = clocksource_override;
358 else
359 next = list_entry(clocksource_list.next, struct clocksource,
360 list);
361 478
362 if (next == curr_clocksource) 479 /*
363 return NULL; 480 * Run the watchdog first to eliminate unstable clock sources
481 */
482 clocksource_watchdog_kthread(NULL);
364 483
365 return next; 484 mutex_lock(&clocksource_mutex);
485 clocksource_select();
486 mutex_unlock(&clocksource_mutex);
487 return 0;
366} 488}
489fs_initcall(clocksource_done_booting);
367 490
368/* 491/*
369 * Enqueue the clocksource sorted by rating 492 * Enqueue the clocksource sorted by rating
370 */ 493 */
371static int clocksource_enqueue(struct clocksource *c) 494static void clocksource_enqueue(struct clocksource *cs)
372{ 495{
373 struct list_head *tmp, *entry = &clocksource_list; 496 struct list_head *entry = &clocksource_list;
497 struct clocksource *tmp;
374 498
375 list_for_each(tmp, &clocksource_list) { 499 list_for_each_entry(tmp, &clocksource_list, list)
376 struct clocksource *cs;
377
378 cs = list_entry(tmp, struct clocksource, list);
379 if (cs == c)
380 return -EBUSY;
381 /* Keep track of the place, where to insert */ 500 /* Keep track of the place, where to insert */
382 if (cs->rating >= c->rating) 501 if (tmp->rating >= cs->rating)
383 entry = tmp; 502 entry = &tmp->list;
384 } 503 list_add(&cs->list, entry);
385 list_add(&c->list, entry);
386
387 if (strlen(c->name) == strlen(override_name) &&
388 !strcmp(c->name, override_name))
389 clocksource_override = c;
390
391 return 0;
392} 504}
393 505
394/** 506/**
@@ -397,52 +509,48 @@ static int clocksource_enqueue(struct clocksource *c)
397 * 509 *
398 * Returns -EBUSY if registration fails, zero otherwise. 510 * Returns -EBUSY if registration fails, zero otherwise.
399 */ 511 */
400int clocksource_register(struct clocksource *c) 512int clocksource_register(struct clocksource *cs)
401{ 513{
402 unsigned long flags; 514 mutex_lock(&clocksource_mutex);
403 int ret; 515 clocksource_enqueue(cs);
404 516 clocksource_select();
405 spin_lock_irqsave(&clocksource_lock, flags); 517 clocksource_enqueue_watchdog(cs);
406 ret = clocksource_enqueue(c); 518 mutex_unlock(&clocksource_mutex);
407 if (!ret) 519 return 0;
408 next_clocksource = select_clocksource();
409 spin_unlock_irqrestore(&clocksource_lock, flags);
410 if (!ret)
411 clocksource_check_watchdog(c);
412 return ret;
413} 520}
414EXPORT_SYMBOL(clocksource_register); 521EXPORT_SYMBOL(clocksource_register);
415 522
523static void __clocksource_change_rating(struct clocksource *cs, int rating)
524{
525 list_del(&cs->list);
526 cs->rating = rating;
527 clocksource_enqueue(cs);
528 clocksource_select();
529}
530
416/** 531/**
417 * clocksource_change_rating - Change the rating of a registered clocksource 532 * clocksource_change_rating - Change the rating of a registered clocksource
418 *
419 */ 533 */
420void clocksource_change_rating(struct clocksource *cs, int rating) 534void clocksource_change_rating(struct clocksource *cs, int rating)
421{ 535{
422 unsigned long flags; 536 mutex_lock(&clocksource_mutex);
423 537 __clocksource_change_rating(cs, rating);
424 spin_lock_irqsave(&clocksource_lock, flags); 538 mutex_unlock(&clocksource_mutex);
425 list_del(&cs->list);
426 cs->rating = rating;
427 clocksource_enqueue(cs);
428 next_clocksource = select_clocksource();
429 spin_unlock_irqrestore(&clocksource_lock, flags);
430} 539}
540EXPORT_SYMBOL(clocksource_change_rating);
431 541
432/** 542/**
433 * clocksource_unregister - remove a registered clocksource 543 * clocksource_unregister - remove a registered clocksource
434 */ 544 */
435void clocksource_unregister(struct clocksource *cs) 545void clocksource_unregister(struct clocksource *cs)
436{ 546{
437 unsigned long flags; 547 mutex_lock(&clocksource_mutex);
438 548 clocksource_dequeue_watchdog(cs);
439 spin_lock_irqsave(&clocksource_lock, flags);
440 list_del(&cs->list); 549 list_del(&cs->list);
441 if (clocksource_override == cs) 550 clocksource_select();
442 clocksource_override = NULL; 551 mutex_unlock(&clocksource_mutex);
443 next_clocksource = select_clocksource();
444 spin_unlock_irqrestore(&clocksource_lock, flags);
445} 552}
553EXPORT_SYMBOL(clocksource_unregister);
446 554
447#ifdef CONFIG_SYSFS 555#ifdef CONFIG_SYSFS
448/** 556/**
@@ -458,9 +566,9 @@ sysfs_show_current_clocksources(struct sys_device *dev,
458{ 566{
459 ssize_t count = 0; 567 ssize_t count = 0;
460 568
461 spin_lock_irq(&clocksource_lock); 569 mutex_lock(&clocksource_mutex);
462 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); 570 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
463 spin_unlock_irq(&clocksource_lock); 571 mutex_unlock(&clocksource_mutex);
464 572
465 return count; 573 return count;
466} 574}
@@ -478,9 +586,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
478 struct sysdev_attribute *attr, 586 struct sysdev_attribute *attr,
479 const char *buf, size_t count) 587 const char *buf, size_t count)
480{ 588{
481 struct clocksource *ovr = NULL;
482 size_t ret = count; 589 size_t ret = count;
483 int len;
484 590
485 /* strings from sysfs write are not 0 terminated! */ 591 /* strings from sysfs write are not 0 terminated! */
486 if (count >= sizeof(override_name)) 592 if (count >= sizeof(override_name))
@@ -490,44 +596,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
490 if (buf[count-1] == '\n') 596 if (buf[count-1] == '\n')
491 count--; 597 count--;
492 598
493 spin_lock_irq(&clocksource_lock); 599 mutex_lock(&clocksource_mutex);
494 600
495 if (count > 0) 601 if (count > 0)
496 memcpy(override_name, buf, count); 602 memcpy(override_name, buf, count);
497 override_name[count] = 0; 603 override_name[count] = 0;
604 clocksource_select();
498 605
499 len = strlen(override_name); 606 mutex_unlock(&clocksource_mutex);
500 if (len) {
501 struct clocksource *cs;
502
503 ovr = clocksource_override;
504 /* try to select it: */
505 list_for_each_entry(cs, &clocksource_list, list) {
506 if (strlen(cs->name) == len &&
507 !strcmp(cs->name, override_name))
508 ovr = cs;
509 }
510 }
511
512 /*
513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */
516 if (tick_oneshot_mode_active() && ovr &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
520 ovr = NULL;
521 override_name[0] = 0;
522 }
523
524 /* Reselect, when the override name has changed */
525 if (ovr != clocksource_override) {
526 clocksource_override = ovr;
527 next_clocksource = select_clocksource();
528 }
529
530 spin_unlock_irq(&clocksource_lock);
531 607
532 return ret; 608 return ret;
533} 609}
@@ -547,7 +623,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
547 struct clocksource *src; 623 struct clocksource *src;
548 ssize_t count = 0; 624 ssize_t count = 0;
549 625
550 spin_lock_irq(&clocksource_lock); 626 mutex_lock(&clocksource_mutex);
551 list_for_each_entry(src, &clocksource_list, list) { 627 list_for_each_entry(src, &clocksource_list, list) {
552 /* 628 /*
553 * Don't show non-HRES clocksource if the tick code is 629 * Don't show non-HRES clocksource if the tick code is
@@ -559,7 +635,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
559 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), 635 max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
560 "%s ", src->name); 636 "%s ", src->name);
561 } 637 }
562 spin_unlock_irq(&clocksource_lock); 638 mutex_unlock(&clocksource_mutex);
563 639
564 count += snprintf(buf + count, 640 count += snprintf(buf + count,
565 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); 641 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
@@ -614,11 +690,10 @@ device_initcall(init_clocksource_sysfs);
614 */ 690 */
615static int __init boot_override_clocksource(char* str) 691static int __init boot_override_clocksource(char* str)
616{ 692{
617 unsigned long flags; 693 mutex_lock(&clocksource_mutex);
618 spin_lock_irqsave(&clocksource_lock, flags);
619 if (str) 694 if (str)
620 strlcpy(override_name, str, sizeof(override_name)); 695 strlcpy(override_name, str, sizeof(override_name));
621 spin_unlock_irqrestore(&clocksource_lock, flags); 696 mutex_unlock(&clocksource_mutex);
622 return 1; 697 return 1;
623} 698}
624 699
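
With the watchdog hookup and reselection folded behind clocksource_mutex, a driver only has to fill in the cycle-to-nanosecond conversion fields and call clocksource_register(). A hypothetical driver sketch of that flow (example_read, example_counter_base and EXAMPLE_COUNTER_HZ are assumptions, not part of this patch):

#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/io.h>

static void __iomem *example_counter_base;      /* assumed, mapped elsewhere */

static cycle_t example_read(struct clocksource *cs)
{
        return (cycle_t)readl(example_counter_base);
}

static struct clocksource example_clocksource = {
        .name   = "example",
        .rating = 200,                          /* better than jiffies, worse than TSC */
        .read   = example_read,
        .mask   = CLOCKSOURCE_MASK(32),
        .shift  = 20,
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init example_clocksource_init(void)
{
        /* ns = (cycles * mult) >> shift, derived from the counter frequency */
        example_clocksource.mult =
                clocksource_hz2mult(EXAMPLE_COUNTER_HZ, example_clocksource.shift);

        /* enqueues, reselects and enqueues on the watchdog, all under clocksource_mutex */
        return clocksource_register(&example_clocksource);
}
device_initcall(example_clocksource_init);
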
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index c3f6c30816e3..5404a8456909 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = {
61 .read = jiffies_read, 61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
65 .shift = JIFFIES_SHIFT, 64 .shift = JIFFIES_SHIFT,
66}; 65};
67 66
@@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void)
71} 70}
72 71
73core_initcall(init_jiffies_clocksource); 72core_initcall(init_jiffies_clocksource);
73
74struct clocksource * __init __weak clocksource_default_clock(void)
75{
76 return &clocksource_jiffies;
77}
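
clocksource_default_clock() above is deliberately __weak: timekeeping_init() (later in this series) calls it to pick the clocksource used at boot, and an architecture with an always-usable counter can override it. A hypothetical strong definition (clocksource_arch_timer is an assumed arch-provided clocksource; s390 does the equivalent with its TOD clock):

/* In architecture code (sketch only): boot timekeeping on a real counter
 * instead of the jiffies clocksource. */
struct clocksource * __init clocksource_default_clock(void)
{
        return &clocksource_arch_timer;         /* assumed arch clocksource */
}
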
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7fc64375ff43..4800f933910e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
194 case TIME_OK: 194 case TIME_OK:
195 break; 195 break;
196 case TIME_INS: 196 case TIME_INS:
197 xtime.tv_sec--; 197 timekeeping_leap_insert(-1);
198 wall_to_monotonic.tv_sec++;
199 time_state = TIME_OOP; 198 time_state = TIME_OOP;
200 printk(KERN_NOTICE 199 printk(KERN_NOTICE
201 "Clock: inserting leap second 23:59:60 UTC\n"); 200 "Clock: inserting leap second 23:59:60 UTC\n");
@@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
203 res = HRTIMER_RESTART; 202 res = HRTIMER_RESTART;
204 break; 203 break;
205 case TIME_DEL: 204 case TIME_DEL:
206 xtime.tv_sec++; 205 timekeeping_leap_insert(1);
207 time_tai--; 206 time_tai--;
208 wall_to_monotonic.tv_sec--;
209 time_state = TIME_WAIT; 207 time_state = TIME_WAIT;
210 printk(KERN_NOTICE 208 printk(KERN_NOTICE
211 "Clock: deleting leap second 23:59:59 UTC\n"); 209 "Clock: deleting leap second 23:59:59 UTC\n");
@@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
219 time_state = TIME_OK; 217 time_state = TIME_OK;
220 break; 218 break;
221 } 219 }
222 update_vsyscall(&xtime, clock);
223 220
224 write_sequnlock(&xtime_lock); 221 write_sequnlock(&xtime_lock);
225 222
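
The sign passed to timekeeping_leap_insert() is easy to get backwards; a worked sketch of the TIME_INS case above:

/*
 * timekeeping_leap_insert(-1) on leap-second insertion:
 *
 *   xtime.tv_sec             -= 1   (the last second of the UTC day repeats)
 *   wall_to_monotonic.tv_sec += 1
 *
 * so the sum xtime + wall_to_monotonic (the monotonic clock) is unchanged,
 * and update_vsyscall() republishes the stepped xtime for userspace.
 */
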
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e0f59a21c061..89aed5933ed4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -231,6 +231,13 @@ void tick_nohz_stop_sched_tick(int inidle)
231 if (!inidle && !ts->inidle) 231 if (!inidle && !ts->inidle)
232 goto end; 232 goto end;
233 233
234 /*
235 * Set ts->inidle unconditionally. Even if the system did not
 236 * switch to NOHZ mode, the cpu frequency governors rely on the
237 * update of the idle time accounting in tick_nohz_start_idle().
238 */
239 ts->inidle = 1;
240
234 now = tick_nohz_start_idle(ts); 241 now = tick_nohz_start_idle(ts);
235 242
236 /* 243 /*
@@ -248,8 +255,6 @@ void tick_nohz_stop_sched_tick(int inidle)
248 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 255 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
249 goto end; 256 goto end;
250 257
251 ts->inidle = 1;
252
253 if (need_resched()) 258 if (need_resched())
254 goto end; 259 goto end;
255 260
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
new file mode 100644
index 000000000000..86628e755f38
--- /dev/null
+++ b/kernel/time/timeconv.c
@@ -0,0 +1,127 @@
1/*
2 * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
3 * This file is part of the GNU C Library.
4 * Contributed by Paul Eggert (eggert@twinsun.com).
5 *
6 * The GNU C Library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
10 *
11 * The GNU C Library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public
17 * License along with the GNU C Library; see the file COPYING.LIB. If not,
18 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22/*
23 * Converts the calendar time to broken-down time representation
24 * Based on code from glibc-2.6
25 *
26 * 2009-7-14:
27 * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
28 */
29
30#include <linux/time.h>
31#include <linux/module.h>
32
33/*
34 * Nonzero if YEAR is a leap year (every 4 years,
35 * except every 100th isn't, and every 400th is).
36 */
37static int __isleap(long year)
38{
39 return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0);
40}
41
 42/* floored division for long type (divisors used here are positive) */
43static long math_div(long a, long b)
44{
45 return a / b - (a % b < 0);
46}
47
 48/* How many leap years between y1 and y2; y1 must be less than or equal to y2 */
49static long leaps_between(long y1, long y2)
50{
51 long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100)
52 + math_div(y1 - 1, 400);
53 long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100)
54 + math_div(y2 - 1, 400);
55 return leaps2 - leaps1;
56}
57
58/* How many days come before each month (0-12). */
59static const unsigned short __mon_yday[2][13] = {
60 /* Normal years. */
61 {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
62 /* Leap years. */
63 {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
64};
65
66#define SECS_PER_HOUR (60 * 60)
67#define SECS_PER_DAY (SECS_PER_HOUR * 24)
68
69/**
70 * time_to_tm - converts the calendar time to local broken-down time
71 *
72 * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970,
73 * Coordinated Universal Time (UTC).
 74 * @offset offset seconds added to totalsecs.
75 * @result pointer to struct tm variable to receive broken-down time
76 */
77void time_to_tm(time_t totalsecs, int offset, struct tm *result)
78{
79 long days, rem, y;
80 const unsigned short *ip;
81
82 days = totalsecs / SECS_PER_DAY;
83 rem = totalsecs % SECS_PER_DAY;
84 rem += offset;
85 while (rem < 0) {
86 rem += SECS_PER_DAY;
87 --days;
88 }
89 while (rem >= SECS_PER_DAY) {
90 rem -= SECS_PER_DAY;
91 ++days;
92 }
93
94 result->tm_hour = rem / SECS_PER_HOUR;
95 rem %= SECS_PER_HOUR;
96 result->tm_min = rem / 60;
97 result->tm_sec = rem % 60;
98
99 /* January 1, 1970 was a Thursday. */
100 result->tm_wday = (4 + days) % 7;
101 if (result->tm_wday < 0)
102 result->tm_wday += 7;
103
104 y = 1970;
105
106 while (days < 0 || days >= (__isleap(y) ? 366 : 365)) {
107 /* Guess a corrected year, assuming 365 days per year. */
108 long yg = y + math_div(days, 365);
109
110 /* Adjust DAYS and Y to match the guessed year. */
111 days -= (yg - y) * 365 + leaps_between(y, yg);
112 y = yg;
113 }
114
115 result->tm_year = y - 1900;
116
117 result->tm_yday = days;
118
119 ip = __mon_yday[__isleap(y)];
120 for (y = 11; days < ip[y]; y--)
121 continue;
122 days -= ip[y];
123
124 result->tm_mon = y;
125 result->tm_mday = days + 1;
126}
127EXPORT_SYMBOL(time_to_tm);
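
A minimal usage sketch of the new helper (kernel context assumed; the wrapper and its printk format are hypothetical, and struct tm is the declaration added to linux/time.h alongside this file):

#include <linux/kernel.h>
#include <linux/time.h>

/* Print a UNIX timestamp in broken-down UTC form. */
static void example_print_utc(time_t stamp)
{
        struct tm tm;

        time_to_tm(stamp, 0, &tm);              /* offset 0 => UTC */
        printk(KERN_INFO "%04ld-%02d-%02d %02d:%02d:%02d UTC (weekday %d)\n",
               tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
               tm.tm_hour, tm.tm_min, tm.tm_sec, tm.tm_wday);
}

/* example_print_utc(0) prints "1970-01-01 00:00:00 UTC (weekday 4)",
 * the epoch, a Thursday, matching the comment inside time_to_tm(). */
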
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e8c77d9c633a..c3a4e2907eaa 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -13,12 +13,123 @@
13#include <linux/percpu.h> 13#include <linux/percpu.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/sched.h>
16#include <linux/sysdev.h> 17#include <linux/sysdev.h>
17#include <linux/clocksource.h> 18#include <linux/clocksource.h>
18#include <linux/jiffies.h> 19#include <linux/jiffies.h>
19#include <linux/time.h> 20#include <linux/time.h>
20#include <linux/tick.h> 21#include <linux/tick.h>
22#include <linux/stop_machine.h>
23
24/* Structure holding internal timekeeping values. */
25struct timekeeper {
26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock;
28 /* The shift value of the current clocksource. */
29 int shift;
30
31 /* Number of clock cycles in one NTP interval. */
32 cycle_t cycle_interval;
33 /* Number of clock shifted nano seconds in one NTP interval. */
34 u64 xtime_interval;
35 /* Raw nano seconds accumulated per NTP interval. */
36 u32 raw_interval;
37
38 /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
39 u64 xtime_nsec;
40 /* Difference between accumulated time and NTP time in ntp
41 * shifted nano seconds. */
42 s64 ntp_error;
43 /* Shift conversion between clock shifted nano seconds and
44 * ntp shifted nano seconds. */
45 int ntp_error_shift;
46 /* NTP adjusted clock multiplier */
47 u32 mult;
48};
49
50struct timekeeper timekeeper;
51
52/**
53 * timekeeper_setup_internals - Set up internals to use clocksource clock.
54 *
55 * @clock: Pointer to clocksource.
56 *
57 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
58 * pair and interval request.
59 *
60 * Unless you're the timekeeping code, you should not be using this!
61 */
62static void timekeeper_setup_internals(struct clocksource *clock)
63{
64 cycle_t interval;
65 u64 tmp;
66
67 timekeeper.clock = clock;
68 clock->cycle_last = clock->read(clock);
21 69
70 /* Do the ns -> cycle conversion first, using original mult */
71 tmp = NTP_INTERVAL_LENGTH;
72 tmp <<= clock->shift;
73 tmp += clock->mult/2;
74 do_div(tmp, clock->mult);
75 if (tmp == 0)
76 tmp = 1;
77
78 interval = (cycle_t) tmp;
79 timekeeper.cycle_interval = interval;
80
81 /* Go back from cycles -> shifted ns */
82 timekeeper.xtime_interval = (u64) interval * clock->mult;
83 timekeeper.raw_interval =
84 ((u64) interval * clock->mult) >> clock->shift;
85
86 timekeeper.xtime_nsec = 0;
87 timekeeper.shift = clock->shift;
88
89 timekeeper.ntp_error = 0;
90 timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
91
92 /*
93 * The timekeeper keeps its own mult values for the currently
 94 * active clocksource. These values will be adjusted via NTP
 95 * to counteract clock drift.
96 */
97 timekeeper.mult = clock->mult;
98}
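
To make the interval arithmetic above concrete, a small userspace sketch with assumed numbers (a 1 MHz counter, shift = 22, and a 4 ms NTP interval, roughly one tick at HZ=250); none of these values come from the patch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t shift = 22;
        uint32_t mult  = 1000u << shift;     /* 1 cycle = 1000 ns => ns = (cycles*mult) >> shift */
        uint64_t ntp_interval_ns = 4000000;  /* ~one tick at HZ=250 (assumed) */

        uint64_t tmp = (ntp_interval_ns << shift) + mult / 2;   /* round to nearest cycle */
        uint64_t cycle_interval = tmp / mult;                   /* -> 4000 cycles          */
        uint64_t xtime_interval = cycle_interval * mult;        /* shifted ns per interval */
        uint64_t raw_interval   = xtime_interval >> shift;      /* -> 4000000 ns           */

        printf("cycle_interval=%llu xtime_interval=%llu raw_interval=%llu\n",
               (unsigned long long)cycle_interval,
               (unsigned long long)xtime_interval,
               (unsigned long long)raw_interval);
        return 0;
}

Accumulating xtime_interval shifted nanoseconds every cycle_interval cycles is exactly what update_wall_time() does further down in this file.
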
99
100/* Timekeeper helper functions. */
101static inline s64 timekeeping_get_ns(void)
102{
103 cycle_t cycle_now, cycle_delta;
104 struct clocksource *clock;
105
106 /* read clocksource: */
107 clock = timekeeper.clock;
108 cycle_now = clock->read(clock);
109
110 /* calculate the delta since the last update_wall_time: */
111 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
112
 113 /* return the delta converted to nanoseconds using the NTP-adjusted mult. */
114 return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
115 timekeeper.shift);
116}
117
118static inline s64 timekeeping_get_ns_raw(void)
119{
120 cycle_t cycle_now, cycle_delta;
121 struct clocksource *clock;
122
123 /* read clocksource: */
124 clock = timekeeper.clock;
125 cycle_now = clock->read(clock);
126
127 /* calculate the delta since the last update_wall_time: */
128 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
129
 130 /* return the delta converted to nanoseconds using the raw clock mult/shift. */
131 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
132}
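
The "& clock->mask" in both helpers is what keeps the delta correct when the hardware counter wraps. A tiny userspace sketch with assumed values for a 32-bit counter:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t mask       = 0xffffffffULL;    /* CLOCKSOURCE_MASK(32) */
        uint64_t cycle_last = 0xfffffff0;       /* read before the wrap */
        uint64_t cycle_now  = 0x00000010;       /* read after the wrap  */

        /* The subtraction underflows in 64 bits, but masking keeps only the
         * low 32 bits, which are exactly the elapsed cycles. */
        uint64_t cycle_delta = (cycle_now - cycle_last) & mask;

        printf("delta = %llu cycles\n", (unsigned long long)cycle_delta); /* 32 */
        return 0;
}
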
22 133
23/* 134/*
24 * This read-write spinlock protects us from races in SMP while 135 * This read-write spinlock protects us from races in SMP while
@@ -44,7 +155,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
44 */ 155 */
45struct timespec xtime __attribute__ ((aligned (16))); 156struct timespec xtime __attribute__ ((aligned (16)));
46struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 157struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
47static unsigned long total_sleep_time; /* seconds */ 158static struct timespec total_sleep_time;
159
160/*
161 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
162 */
163struct timespec raw_time;
48 164
49/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
50int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
@@ -56,35 +172,44 @@ void update_xtime_cache(u64 nsec)
56 timespec_add_ns(&xtime_cache, nsec); 172 timespec_add_ns(&xtime_cache, nsec);
57} 173}
58 174
59struct clocksource *clock; 175/* must hold xtime_lock */
60 176void timekeeping_leap_insert(int leapsecond)
177{
178 xtime.tv_sec += leapsecond;
179 wall_to_monotonic.tv_sec -= leapsecond;
180 update_vsyscall(&xtime, timekeeper.clock);
181}
61 182
62#ifdef CONFIG_GENERIC_TIME 183#ifdef CONFIG_GENERIC_TIME
184
63/** 185/**
64 * clocksource_forward_now - update clock to the current time 186 * timekeeping_forward_now - update clock to the current time
65 * 187 *
66 * Forward the current clock to update its state since the last call to 188 * Forward the current clock to update its state since the last call to
67 * update_wall_time(). This is useful before significant clock changes, 189 * update_wall_time(). This is useful before significant clock changes,
68 * as it avoids having to deal with this time offset explicitly. 190 * as it avoids having to deal with this time offset explicitly.
69 */ 191 */
70static void clocksource_forward_now(void) 192static void timekeeping_forward_now(void)
71{ 193{
72 cycle_t cycle_now, cycle_delta; 194 cycle_t cycle_now, cycle_delta;
195 struct clocksource *clock;
73 s64 nsec; 196 s64 nsec;
74 197
75 cycle_now = clocksource_read(clock); 198 clock = timekeeper.clock;
199 cycle_now = clock->read(clock);
76 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 200 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
77 clock->cycle_last = cycle_now; 201 clock->cycle_last = cycle_now;
78 202
79 nsec = cyc2ns(clock, cycle_delta); 203 nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
204 timekeeper.shift);
80 205
81 /* If arch requires, add in gettimeoffset() */ 206 /* If arch requires, add in gettimeoffset() */
82 nsec += arch_gettimeoffset(); 207 nsec += arch_gettimeoffset();
83 208
84 timespec_add_ns(&xtime, nsec); 209 timespec_add_ns(&xtime, nsec);
85 210
86 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; 211 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
87 clock->raw_time.tv_nsec += nsec; 212 timespec_add_ns(&raw_time, nsec);
88} 213}
89 214
90/** 215/**
@@ -95,7 +220,6 @@ static void clocksource_forward_now(void)
95 */ 220 */
96void getnstimeofday(struct timespec *ts) 221void getnstimeofday(struct timespec *ts)
97{ 222{
98 cycle_t cycle_now, cycle_delta;
99 unsigned long seq; 223 unsigned long seq;
100 s64 nsecs; 224 s64 nsecs;
101 225
@@ -105,15 +229,7 @@ void getnstimeofday(struct timespec *ts)
105 seq = read_seqbegin(&xtime_lock); 229 seq = read_seqbegin(&xtime_lock);
106 230
107 *ts = xtime; 231 *ts = xtime;
108 232 nsecs = timekeeping_get_ns();
109 /* read clocksource: */
110 cycle_now = clocksource_read(clock);
111
112 /* calculate the delta since the last update_wall_time: */
113 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
114
115 /* convert to nanoseconds: */
116 nsecs = cyc2ns(clock, cycle_delta);
117 233
118 /* If arch requires, add in gettimeoffset() */ 234 /* If arch requires, add in gettimeoffset() */
119 nsecs += arch_gettimeoffset(); 235 nsecs += arch_gettimeoffset();
@@ -125,6 +241,57 @@ void getnstimeofday(struct timespec *ts)
125 241
126EXPORT_SYMBOL(getnstimeofday); 242EXPORT_SYMBOL(getnstimeofday);
127 243
244ktime_t ktime_get(void)
245{
246 unsigned int seq;
247 s64 secs, nsecs;
248
249 WARN_ON(timekeeping_suspended);
250
251 do {
252 seq = read_seqbegin(&xtime_lock);
253 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
254 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
255 nsecs += timekeeping_get_ns();
256
257 } while (read_seqretry(&xtime_lock, seq));
258 /*
259 * Use ktime_set/ktime_add_ns to create a proper ktime on
260 * 32-bit architectures without CONFIG_KTIME_SCALAR.
261 */
262 return ktime_add_ns(ktime_set(secs, 0), nsecs);
263}
264EXPORT_SYMBOL_GPL(ktime_get);
265
266/**
267 * ktime_get_ts - get the monotonic clock in timespec format
268 * @ts: pointer to timespec variable
269 *
270 * The function calculates the monotonic clock from the realtime
271 * clock and the wall_to_monotonic offset and stores the result
272 * in normalized timespec format in the variable pointed to by @ts.
273 */
274void ktime_get_ts(struct timespec *ts)
275{
276 struct timespec tomono;
277 unsigned int seq;
278 s64 nsecs;
279
280 WARN_ON(timekeeping_suspended);
281
282 do {
283 seq = read_seqbegin(&xtime_lock);
284 *ts = xtime;
285 tomono = wall_to_monotonic;
286 nsecs = timekeeping_get_ns();
287
288 } while (read_seqretry(&xtime_lock, seq));
289
290 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
291 ts->tv_nsec + tomono.tv_nsec + nsecs);
292}
293EXPORT_SYMBOL_GPL(ktime_get_ts);
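
A short usage sketch of the accessors exported above (do_work() is a placeholder, not a real kernel function):

#include <linux/ktime.h>

extern void do_work(void);                      /* assumed workload */

/* Measure how long a section takes on the monotonic clock. */
static s64 example_time_section(void)
{
        ktime_t start, end;

        start = ktime_get();
        do_work();
        end = ktime_get();

        return ktime_to_ns(ktime_sub(end, start));
}
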
294
128/** 295/**
129 * do_gettimeofday - Returns the time of day in a timeval 296 * do_gettimeofday - Returns the time of day in a timeval
130 * @tv: pointer to the timeval to be set 297 * @tv: pointer to the timeval to be set
@@ -157,7 +324,7 @@ int do_settimeofday(struct timespec *tv)
157 324
158 write_seqlock_irqsave(&xtime_lock, flags); 325 write_seqlock_irqsave(&xtime_lock, flags);
159 326
160 clocksource_forward_now(); 327 timekeeping_forward_now();
161 328
162 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; 329 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
163 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; 330 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
@@ -167,10 +334,10 @@ int do_settimeofday(struct timespec *tv)
167 334
168 update_xtime_cache(0); 335 update_xtime_cache(0);
169 336
170 clock->error = 0; 337 timekeeper.ntp_error = 0;
171 ntp_clear(); 338 ntp_clear();
172 339
173 update_vsyscall(&xtime, clock); 340 update_vsyscall(&xtime, timekeeper.clock);
174 341
175 write_sequnlock_irqrestore(&xtime_lock, flags); 342 write_sequnlock_irqrestore(&xtime_lock, flags);
176 343
@@ -187,44 +354,97 @@ EXPORT_SYMBOL(do_settimeofday);
187 * 354 *
188 * Accumulates current time interval and initializes new clocksource 355 * Accumulates current time interval and initializes new clocksource
189 */ 356 */
190static void change_clocksource(void) 357static int change_clocksource(void *data)
191{ 358{
192 struct clocksource *new, *old; 359 struct clocksource *new, *old;
193 360
194 new = clocksource_get_next(); 361 new = (struct clocksource *) data;
362
363 timekeeping_forward_now();
364 if (!new->enable || new->enable(new) == 0) {
365 old = timekeeper.clock;
366 timekeeper_setup_internals(new);
367 if (old->disable)
368 old->disable(old);
369 }
370 return 0;
371}
195 372
196 if (clock == new) 373/**
374 * timekeeping_notify - Install a new clock source
375 * @clock: pointer to the clock source
376 *
377 * This function is called from clocksource.c after a new, better clock
378 * source has been registered. The caller holds the clocksource_mutex.
379 */
380void timekeeping_notify(struct clocksource *clock)
381{
382 if (timekeeper.clock == clock)
197 return; 383 return;
384 stop_machine(change_clocksource, clock, NULL);
385 tick_clock_notify();
386}
198 387
199 clocksource_forward_now(); 388#else /* GENERIC_TIME */
200 389
201 if (clocksource_enable(new)) 390static inline void timekeeping_forward_now(void) { }
202 return;
203 391
204 new->raw_time = clock->raw_time; 392/**
205 old = clock; 393 * ktime_get - get the monotonic time in ktime_t format
206 clock = new; 394 *
207 clocksource_disable(old); 395 * returns the time in ktime_t format
396 */
397ktime_t ktime_get(void)
398{
399 struct timespec now;
208 400
209 clock->cycle_last = 0; 401 ktime_get_ts(&now);
210 clock->cycle_last = clocksource_read(clock);
211 clock->error = 0;
212 clock->xtime_nsec = 0;
213 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
214 402
215 tick_clock_notify(); 403 return timespec_to_ktime(now);
404}
405EXPORT_SYMBOL_GPL(ktime_get);
216 406
217 /* 407/**
218 * We're holding xtime lock and waking up klogd would deadlock 408 * ktime_get_ts - get the monotonic clock in timespec format
219 * us on enqueue. So no printing! 409 * @ts: pointer to timespec variable
220 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 410 *
221 clock->name); 411 * The function calculates the monotonic clock from the realtime
222 */ 412 * clock and the wall_to_monotonic offset and stores the result
413 * in normalized timespec format in the variable pointed to by @ts.
414 */
415void ktime_get_ts(struct timespec *ts)
416{
417 struct timespec tomono;
418 unsigned long seq;
419
420 do {
421 seq = read_seqbegin(&xtime_lock);
422 getnstimeofday(ts);
423 tomono = wall_to_monotonic;
424
425 } while (read_seqretry(&xtime_lock, seq));
426
427 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
428 ts->tv_nsec + tomono.tv_nsec);
223} 429}
224#else 430EXPORT_SYMBOL_GPL(ktime_get_ts);
225static inline void clocksource_forward_now(void) { } 431
226static inline void change_clocksource(void) { } 432#endif /* !GENERIC_TIME */
227#endif 433
434/**
435 * ktime_get_real - get the real (wall-) time in ktime_t format
436 *
437 * returns the time in ktime_t format
438 */
439ktime_t ktime_get_real(void)
440{
441 struct timespec now;
442
443 getnstimeofday(&now);
444
445 return timespec_to_ktime(now);
446}
447EXPORT_SYMBOL_GPL(ktime_get_real);
228 448
229/** 449/**
230 * getrawmonotonic - Returns the raw monotonic time in a timespec 450 * getrawmonotonic - Returns the raw monotonic time in a timespec
@@ -236,21 +456,11 @@ void getrawmonotonic(struct timespec *ts)
236{ 456{
237 unsigned long seq; 457 unsigned long seq;
238 s64 nsecs; 458 s64 nsecs;
239 cycle_t cycle_now, cycle_delta;
240 459
241 do { 460 do {
242 seq = read_seqbegin(&xtime_lock); 461 seq = read_seqbegin(&xtime_lock);
243 462 nsecs = timekeeping_get_ns_raw();
244 /* read clocksource: */ 463 *ts = raw_time;
245 cycle_now = clocksource_read(clock);
246
247 /* calculate the delta since the last update_wall_time: */
248 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
249
250 /* convert to nanoseconds: */
251 nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
252
253 *ts = clock->raw_time;
254 464
255 } while (read_seqretry(&xtime_lock, seq)); 465 } while (read_seqretry(&xtime_lock, seq));
256 466
@@ -270,7 +480,7 @@ int timekeeping_valid_for_hres(void)
270 do { 480 do {
271 seq = read_seqbegin(&xtime_lock); 481 seq = read_seqbegin(&xtime_lock);
272 482
273 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 483 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
274 484
275 } while (read_seqretry(&xtime_lock, seq)); 485 } while (read_seqretry(&xtime_lock, seq));
276 486
@@ -278,17 +488,33 @@ int timekeeping_valid_for_hres(void)
278} 488}
279 489
280/** 490/**
281 * read_persistent_clock - Return time in seconds from the persistent clock. 491 * read_persistent_clock - Return time from the persistent clock.
282 * 492 *
283 * Weak dummy function for arches that do not yet support it. 493 * Weak dummy function for arches that do not yet support it.
284 * Returns seconds from epoch using the battery backed persistent clock. 494 * Reads the time from the battery backed persistent clock.
285 * Returns zero if unsupported. 495 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
286 * 496 *
287 * XXX - Do be sure to remove it once all arches implement it. 497 * XXX - Do be sure to remove it once all arches implement it.
288 */ 498 */
289unsigned long __attribute__((weak)) read_persistent_clock(void) 499void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
290{ 500{
291 return 0; 501 ts->tv_sec = 0;
502 ts->tv_nsec = 0;
503}
504
505/**
506 * read_boot_clock - Return time of the system start.
507 *
508 * Weak dummy function for arches that do not yet support it.
509 * Function to read the exact time the system has been started.
510 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
511 *
512 * XXX - Do be sure to remove it once all arches implement it.
513 */
514void __attribute__((weak)) read_boot_clock(struct timespec *ts)
515{
516 ts->tv_sec = 0;
517 ts->tv_nsec = 0;
292} 518}
293 519
294/* 520/*
@@ -296,29 +522,40 @@ unsigned long __attribute__((weak)) read_persistent_clock(void)
296 */ 522 */
297void __init timekeeping_init(void) 523void __init timekeeping_init(void)
298{ 524{
525 struct clocksource *clock;
299 unsigned long flags; 526 unsigned long flags;
300 unsigned long sec = read_persistent_clock(); 527 struct timespec now, boot;
528
529 read_persistent_clock(&now);
530 read_boot_clock(&boot);
301 531
302 write_seqlock_irqsave(&xtime_lock, flags); 532 write_seqlock_irqsave(&xtime_lock, flags);
303 533
304 ntp_init(); 534 ntp_init();
305 535
306 clock = clocksource_get_next(); 536 clock = clocksource_default_clock();
307 clocksource_enable(clock); 537 if (clock->enable)
308 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 538 clock->enable(clock);
309 clock->cycle_last = clocksource_read(clock); 539 timekeeper_setup_internals(clock);
310 540
311 xtime.tv_sec = sec; 541 xtime.tv_sec = now.tv_sec;
312 xtime.tv_nsec = 0; 542 xtime.tv_nsec = now.tv_nsec;
543 raw_time.tv_sec = 0;
544 raw_time.tv_nsec = 0;
545 if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
546 boot.tv_sec = xtime.tv_sec;
547 boot.tv_nsec = xtime.tv_nsec;
548 }
313 set_normalized_timespec(&wall_to_monotonic, 549 set_normalized_timespec(&wall_to_monotonic,
314 -xtime.tv_sec, -xtime.tv_nsec); 550 -boot.tv_sec, -boot.tv_nsec);
315 update_xtime_cache(0); 551 update_xtime_cache(0);
316 total_sleep_time = 0; 552 total_sleep_time.tv_sec = 0;
553 total_sleep_time.tv_nsec = 0;
317 write_sequnlock_irqrestore(&xtime_lock, flags); 554 write_sequnlock_irqrestore(&xtime_lock, flags);
318} 555}
319 556
320/* time in seconds when suspend began */ 557/* time in seconds when suspend began */
321static unsigned long timekeeping_suspend_time; 558static struct timespec timekeeping_suspend_time;
322 559
323/** 560/**
324 * timekeeping_resume - Resumes the generic timekeeping subsystem. 561 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -331,24 +568,24 @@ static unsigned long timekeeping_suspend_time;
331static int timekeeping_resume(struct sys_device *dev) 568static int timekeeping_resume(struct sys_device *dev)
332{ 569{
333 unsigned long flags; 570 unsigned long flags;
334 unsigned long now = read_persistent_clock(); 571 struct timespec ts;
572
573 read_persistent_clock(&ts);
335 574
336 clocksource_resume(); 575 clocksource_resume();
337 576
338 write_seqlock_irqsave(&xtime_lock, flags); 577 write_seqlock_irqsave(&xtime_lock, flags);
339 578
340 if (now && (now > timekeeping_suspend_time)) { 579 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
341 unsigned long sleep_length = now - timekeeping_suspend_time; 580 ts = timespec_sub(ts, timekeeping_suspend_time);
342 581 xtime = timespec_add_safe(xtime, ts);
343 xtime.tv_sec += sleep_length; 582 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
344 wall_to_monotonic.tv_sec -= sleep_length; 583 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
345 total_sleep_time += sleep_length;
346 } 584 }
347 update_xtime_cache(0); 585 update_xtime_cache(0);
348 /* re-base the last cycle value */ 586 /* re-base the last cycle value */
349 clock->cycle_last = 0; 587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
350 clock->cycle_last = clocksource_read(clock); 588 timekeeper.ntp_error = 0;
351 clock->error = 0;
352 timekeeping_suspended = 0; 589 timekeeping_suspended = 0;
353 write_sequnlock_irqrestore(&xtime_lock, flags); 590 write_sequnlock_irqrestore(&xtime_lock, flags);
354 591
@@ -366,10 +603,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
366{ 603{
367 unsigned long flags; 604 unsigned long flags;
368 605
369 timekeeping_suspend_time = read_persistent_clock(); 606 read_persistent_clock(&timekeeping_suspend_time);
370 607
371 write_seqlock_irqsave(&xtime_lock, flags); 608 write_seqlock_irqsave(&xtime_lock, flags);
372 clocksource_forward_now(); 609 timekeeping_forward_now();
373 timekeeping_suspended = 1; 610 timekeeping_suspended = 1;
374 write_sequnlock_irqrestore(&xtime_lock, flags); 611 write_sequnlock_irqrestore(&xtime_lock, flags);
375 612
@@ -404,7 +641,7 @@ device_initcall(timekeeping_init_device);
404 * If the error is already larger, we look ahead even further 641 * If the error is already larger, we look ahead even further
405 * to compensate for late or lost adjustments. 642 * to compensate for late or lost adjustments.
406 */ 643 */
407static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, 644static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
408 s64 *offset) 645 s64 *offset)
409{ 646{
410 s64 tick_error, i; 647 s64 tick_error, i;
@@ -420,7 +657,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
420 * here. This is tuned so that an error of about 1 msec is adjusted 657 * here. This is tuned so that an error of about 1 msec is adjusted
421 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 658 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
422 */ 659 */
423 error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); 660 error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
424 error2 = abs(error2); 661 error2 = abs(error2);
425 for (look_ahead = 0; error2 > 0; look_ahead++) 662 for (look_ahead = 0; error2 > 0; look_ahead++)
426 error2 >>= 2; 663 error2 >>= 2;
@@ -429,8 +666,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
429 * Now calculate the error in (1 << look_ahead) ticks, but first 666 * Now calculate the error in (1 << look_ahead) ticks, but first
430 * remove the single look ahead already included in the error. 667 * remove the single look ahead already included in the error.
431 */ 668 */
432 tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); 669 tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
433 tick_error -= clock->xtime_interval >> 1; 670 tick_error -= timekeeper.xtime_interval >> 1;
434 error = ((error - tick_error) >> look_ahead) + tick_error; 671 error = ((error - tick_error) >> look_ahead) + tick_error;
435 672
436 /* Finally calculate the adjustment shift value. */ 673 /* Finally calculate the adjustment shift value. */
@@ -455,18 +692,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
455 * this is optimized for the most common adjustments of -1,0,1, 692 * this is optimized for the most common adjustments of -1,0,1,
456 * for other values we can do a bit more work. 693 * for other values we can do a bit more work.
457 */ 694 */
458static void clocksource_adjust(s64 offset) 695static void timekeeping_adjust(s64 offset)
459{ 696{
460 s64 error, interval = clock->cycle_interval; 697 s64 error, interval = timekeeper.cycle_interval;
461 int adj; 698 int adj;
462 699
463 error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); 700 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
464 if (error > interval) { 701 if (error > interval) {
465 error >>= 2; 702 error >>= 2;
466 if (likely(error <= interval)) 703 if (likely(error <= interval))
467 adj = 1; 704 adj = 1;
468 else 705 else
469 adj = clocksource_bigadjust(error, &interval, &offset); 706 adj = timekeeping_bigadjust(error, &interval, &offset);
470 } else if (error < -interval) { 707 } else if (error < -interval) {
471 error >>= 2; 708 error >>= 2;
472 if (likely(error >= -interval)) { 709 if (likely(error >= -interval)) {
@@ -474,15 +711,15 @@ static void clocksource_adjust(s64 offset)
474 interval = -interval; 711 interval = -interval;
475 offset = -offset; 712 offset = -offset;
476 } else 713 } else
477 adj = clocksource_bigadjust(error, &interval, &offset); 714 adj = timekeeping_bigadjust(error, &interval, &offset);
478 } else 715 } else
479 return; 716 return;
480 717
481 clock->mult += adj; 718 timekeeper.mult += adj;
482 clock->xtime_interval += interval; 719 timekeeper.xtime_interval += interval;
483 clock->xtime_nsec -= offset; 720 timekeeper.xtime_nsec -= offset;
484 clock->error -= (interval - offset) << 721 timekeeper.ntp_error -= (interval - offset) <<
485 (NTP_SCALE_SHIFT - clock->shift); 722 timekeeper.ntp_error_shift;
486} 723}
487 724
488/** 725/**
@@ -492,53 +729,59 @@ static void clocksource_adjust(s64 offset)
492 */ 729 */
493void update_wall_time(void) 730void update_wall_time(void)
494{ 731{
732 struct clocksource *clock;
495 cycle_t offset; 733 cycle_t offset;
734 u64 nsecs;
496 735
497 /* Make sure we're fully resumed: */ 736 /* Make sure we're fully resumed: */
498 if (unlikely(timekeeping_suspended)) 737 if (unlikely(timekeeping_suspended))
499 return; 738 return;
500 739
740 clock = timekeeper.clock;
501#ifdef CONFIG_GENERIC_TIME 741#ifdef CONFIG_GENERIC_TIME
502 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; 742 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
503#else 743#else
504 offset = clock->cycle_interval; 744 offset = timekeeper.cycle_interval;
505#endif 745#endif
506 clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; 746 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
507 747
508 /* normally this loop will run just once, however in the 748 /* normally this loop will run just once, however in the
509 * case of lost or late ticks, it will accumulate correctly. 749 * case of lost or late ticks, it will accumulate correctly.
510 */ 750 */
511 while (offset >= clock->cycle_interval) { 751 while (offset >= timekeeper.cycle_interval) {
752 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
753
512 /* accumulate one interval */ 754 /* accumulate one interval */
513 offset -= clock->cycle_interval; 755 offset -= timekeeper.cycle_interval;
514 clock->cycle_last += clock->cycle_interval; 756 clock->cycle_last += timekeeper.cycle_interval;
515 757
516 clock->xtime_nsec += clock->xtime_interval; 758 timekeeper.xtime_nsec += timekeeper.xtime_interval;
517 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { 759 if (timekeeper.xtime_nsec >= nsecps) {
518 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; 760 timekeeper.xtime_nsec -= nsecps;
519 xtime.tv_sec++; 761 xtime.tv_sec++;
520 second_overflow(); 762 second_overflow();
521 } 763 }
522 764
523 clock->raw_time.tv_nsec += clock->raw_interval; 765 raw_time.tv_nsec += timekeeper.raw_interval;
524 if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { 766 if (raw_time.tv_nsec >= NSEC_PER_SEC) {
525 clock->raw_time.tv_nsec -= NSEC_PER_SEC; 767 raw_time.tv_nsec -= NSEC_PER_SEC;
526 clock->raw_time.tv_sec++; 768 raw_time.tv_sec++;
527 } 769 }
528 770
529 /* accumulate error between NTP and clock interval */ 771 /* accumulate error between NTP and clock interval */
530 clock->error += tick_length; 772 timekeeper.ntp_error += tick_length;
531 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); 773 timekeeper.ntp_error -= timekeeper.xtime_interval <<
774 timekeeper.ntp_error_shift;
532 } 775 }
533 776
534 /* correct the clock when NTP error is too big */ 777 /* correct the clock when NTP error is too big */
535 clocksource_adjust(offset); 778 timekeeping_adjust(offset);
536 779
537 /* 780 /*
538 * Since in the loop above, we accumulate any amount of time 781 * Since in the loop above, we accumulate any amount of time
539 * in xtime_nsec over a second into xtime.tv_sec, its possible for 782 * in xtime_nsec over a second into xtime.tv_sec, its possible for
540 * xtime_nsec to be fairly small after the loop. Further, if we're 783 * xtime_nsec to be fairly small after the loop. Further, if we're
541 * slightly speeding the clocksource up in clocksource_adjust(), 784 * slightly speeding the clocksource up in timekeeping_adjust(),
542 * its possible the required corrective factor to xtime_nsec could 785 * its possible the required corrective factor to xtime_nsec could
543 * cause it to underflow. 786 * cause it to underflow.
544 * 787 *
@@ -550,24 +793,25 @@ void update_wall_time(void)
550 * We'll correct this error next time through this function, when 793 * We'll correct this error next time through this function, when
551 * xtime_nsec is not as small. 794 * xtime_nsec is not as small.
552 */ 795 */
553 if (unlikely((s64)clock->xtime_nsec < 0)) { 796 if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
554 s64 neg = -(s64)clock->xtime_nsec; 797 s64 neg = -(s64)timekeeper.xtime_nsec;
555 clock->xtime_nsec = 0; 798 timekeeper.xtime_nsec = 0;
556 clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); 799 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
557 } 800 }
558 801
559 /* store full nanoseconds into xtime after rounding it up and 802 /* store full nanoseconds into xtime after rounding it up and
560 * add the remainder to the error difference. 803 * add the remainder to the error difference.
561 */ 804 */
562 xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; 805 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
563 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 806 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
564 clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); 807 timekeeper.ntp_error += timekeeper.xtime_nsec <<
808 timekeeper.ntp_error_shift;
565 809
566 update_xtime_cache(cyc2ns(clock, offset)); 810 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
811 update_xtime_cache(nsecs);
567 812
568 /* check to see if there is a new clocksource to use */ 813 /* check to see if there is a new clocksource to use */
569 change_clocksource(); 814 update_vsyscall(&xtime, timekeeper.clock);
570 update_vsyscall(&xtime, clock);
571} 815}
572 816
573/** 817/**
@@ -583,9 +827,12 @@ void update_wall_time(void)
583 */ 827 */
584void getboottime(struct timespec *ts) 828void getboottime(struct timespec *ts)
585{ 829{
586 set_normalized_timespec(ts, 830 struct timespec boottime = {
587 - (wall_to_monotonic.tv_sec + total_sleep_time), 831 .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec,
588 - wall_to_monotonic.tv_nsec); 832 .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec
833 };
834
835 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
589} 836}
590 837
591/** 838/**
@@ -594,7 +841,7 @@ void getboottime(struct timespec *ts)
594 */ 841 */
595void monotonic_to_bootbased(struct timespec *ts) 842void monotonic_to_bootbased(struct timespec *ts)
596{ 843{
597 ts->tv_sec += total_sleep_time; 844 *ts = timespec_add_safe(*ts, total_sleep_time);
598} 845}
599 846
600unsigned long get_seconds(void) 847unsigned long get_seconds(void)
@@ -603,6 +850,10 @@ unsigned long get_seconds(void)
603} 850}
604EXPORT_SYMBOL(get_seconds); 851EXPORT_SYMBOL(get_seconds);
605 852
853struct timespec __current_kernel_time(void)
854{
855 return xtime_cache;
856}
606 857
607struct timespec current_kernel_time(void) 858struct timespec current_kernel_time(void)
608{ 859{
@@ -618,3 +869,20 @@ struct timespec current_kernel_time(void)
618 return now; 869 return now;
619} 870}
620EXPORT_SYMBOL(current_kernel_time); 871EXPORT_SYMBOL(current_kernel_time);
872
873struct timespec get_monotonic_coarse(void)
874{
875 struct timespec now, mono;
876 unsigned long seq;
877
878 do {
879 seq = read_seqbegin(&xtime_lock);
880
881 now = xtime_cache;
882 mono = wall_to_monotonic;
883 } while (read_seqretry(&xtime_lock, seq));
884
885 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
886 now.tv_nsec + mono.tv_nsec);
887 return now;
888}
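
For reference, a summary of how the state used throughout this file fits together (a sketch derived from the functions above, not code from the patch):

/*
 *   monotonic time  = xtime + wall_to_monotonic
 *   boot-based time = monotonic time + total_sleep_time
 *   getboottime()   = -(wall_to_monotonic + total_sleep_time)
 *                     (the wall-clock time at which the system booted)
 *
 * Suspend/resume moves time from wall_to_monotonic into total_sleep_time,
 * so the boot time stays fixed while the monotonic clock keeps excluding
 * time spent asleep.
 */
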
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index fddd69d16e03..1b5b7aa2fdfd 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -275,7 +275,7 @@ static int timer_list_open(struct inode *inode, struct file *filp)
275 return single_open(filp, timer_list_show, NULL); 275 return single_open(filp, timer_list_show, NULL);
276} 276}
277 277
278static struct file_operations timer_list_fops = { 278static const struct file_operations timer_list_fops = {
279 .open = timer_list_open, 279 .open = timer_list_open,
280 .read = seq_read, 280 .read = seq_read,
281 .llseek = seq_lseek, 281 .llseek = seq_lseek,
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 4cde8b9c716f..ee5681f8d7ec 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -395,7 +395,7 @@ static int tstats_open(struct inode *inode, struct file *filp)
395 return single_open(filp, tstats_show, NULL); 395 return single_open(filp, tstats_show, NULL);
396} 396}
397 397
398static struct file_operations tstats_fops = { 398static const struct file_operations tstats_fops = {
399 .open = tstats_open, 399 .open = tstats_open,
400 .read = seq_read, 400 .read = seq_read,
401 .write = tstats_write, 401 .write = tstats_write,
diff --git a/kernel/timer.c b/kernel/timer.c
index a3d25f415019..5db5a8d26811 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_counter.h> 40#include <linux/perf_event.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42 42
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -46,6 +46,9 @@
46#include <asm/timex.h> 46#include <asm/timex.h>
47#include <asm/io.h> 47#include <asm/io.h>
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/timer.h>
51
49u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; 52u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
50 53
51EXPORT_SYMBOL(jiffies_64); 54EXPORT_SYMBOL(jiffies_64);
@@ -72,6 +75,7 @@ struct tvec_base {
72 spinlock_t lock; 75 spinlock_t lock;
73 struct timer_list *running_timer; 76 struct timer_list *running_timer;
74 unsigned long timer_jiffies; 77 unsigned long timer_jiffies;
78 unsigned long next_timer;
75 struct tvec_root tv1; 79 struct tvec_root tv1;
76 struct tvec tv2; 80 struct tvec tv2;
77 struct tvec tv3; 81 struct tvec tv3;
@@ -520,6 +524,25 @@ static inline void debug_timer_activate(struct timer_list *timer) { }
520static inline void debug_timer_deactivate(struct timer_list *timer) { } 524static inline void debug_timer_deactivate(struct timer_list *timer) { }
521#endif 525#endif
522 526
527static inline void debug_init(struct timer_list *timer)
528{
529 debug_timer_init(timer);
530 trace_timer_init(timer);
531}
532
533static inline void
534debug_activate(struct timer_list *timer, unsigned long expires)
535{
536 debug_timer_activate(timer);
537 trace_timer_start(timer, expires);
538}
539
540static inline void debug_deactivate(struct timer_list *timer)
541{
542 debug_timer_deactivate(timer);
543 trace_timer_cancel(timer);
544}
545
523static void __init_timer(struct timer_list *timer, 546static void __init_timer(struct timer_list *timer,
524 const char *name, 547 const char *name,
525 struct lock_class_key *key) 548 struct lock_class_key *key)
@@ -548,7 +571,7 @@ void init_timer_key(struct timer_list *timer,
548 const char *name, 571 const char *name,
549 struct lock_class_key *key) 572 struct lock_class_key *key)
550{ 573{
551 debug_timer_init(timer); 574 debug_init(timer);
552 __init_timer(timer, name, key); 575 __init_timer(timer, name, key);
553} 576}
554EXPORT_SYMBOL(init_timer_key); 577EXPORT_SYMBOL(init_timer_key);
@@ -567,7 +590,7 @@ static inline void detach_timer(struct timer_list *timer,
567{ 590{
568 struct list_head *entry = &timer->entry; 591 struct list_head *entry = &timer->entry;
569 592
570 debug_timer_deactivate(timer); 593 debug_deactivate(timer);
571 594
572 __list_del(entry->prev, entry->next); 595 __list_del(entry->prev, entry->next);
573 if (clear_pending) 596 if (clear_pending)
@@ -622,13 +645,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
622 645
623 if (timer_pending(timer)) { 646 if (timer_pending(timer)) {
624 detach_timer(timer, 0); 647 detach_timer(timer, 0);
648 if (timer->expires == base->next_timer &&
649 !tbase_get_deferrable(timer->base))
650 base->next_timer = base->timer_jiffies;
625 ret = 1; 651 ret = 1;
626 } else { 652 } else {
627 if (pending_only) 653 if (pending_only)
628 goto out_unlock; 654 goto out_unlock;
629 } 655 }
630 656
631 debug_timer_activate(timer); 657 debug_activate(timer, expires);
632 658
633 new_base = __get_cpu_var(tvec_bases); 659 new_base = __get_cpu_var(tvec_bases);
634 660
@@ -663,6 +689,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
663 } 689 }
664 690
665 timer->expires = expires; 691 timer->expires = expires;
692 if (time_before(timer->expires, base->next_timer) &&
693 !tbase_get_deferrable(timer->base))
694 base->next_timer = timer->expires;
666 internal_add_timer(base, timer); 695 internal_add_timer(base, timer);
667 696
668out_unlock: 697out_unlock:
@@ -780,7 +809,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
780 BUG_ON(timer_pending(timer) || !timer->function); 809 BUG_ON(timer_pending(timer) || !timer->function);
781 spin_lock_irqsave(&base->lock, flags); 810 spin_lock_irqsave(&base->lock, flags);
782 timer_set_base(timer, base); 811 timer_set_base(timer, base);
783 debug_timer_activate(timer); 812 debug_activate(timer, timer->expires);
813 if (time_before(timer->expires, base->next_timer) &&
814 !tbase_get_deferrable(timer->base))
815 base->next_timer = timer->expires;
784 internal_add_timer(base, timer); 816 internal_add_timer(base, timer);
785 /* 817 /*
786 * Check whether the other CPU is idle and needs to be 818 * Check whether the other CPU is idle and needs to be
@@ -817,6 +849,9 @@ int del_timer(struct timer_list *timer)
817 base = lock_timer_base(timer, &flags); 849 base = lock_timer_base(timer, &flags);
818 if (timer_pending(timer)) { 850 if (timer_pending(timer)) {
819 detach_timer(timer, 1); 851 detach_timer(timer, 1);
852 if (timer->expires == base->next_timer &&
853 !tbase_get_deferrable(timer->base))
854 base->next_timer = base->timer_jiffies;
820 ret = 1; 855 ret = 1;
821 } 856 }
822 spin_unlock_irqrestore(&base->lock, flags); 857 spin_unlock_irqrestore(&base->lock, flags);
@@ -850,6 +885,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
850 ret = 0; 885 ret = 0;
851 if (timer_pending(timer)) { 886 if (timer_pending(timer)) {
852 detach_timer(timer, 1); 887 detach_timer(timer, 1);
888 if (timer->expires == base->next_timer &&
889 !tbase_get_deferrable(timer->base))
890 base->next_timer = base->timer_jiffies;
853 ret = 1; 891 ret = 1;
854 } 892 }
855out: 893out:
@@ -984,7 +1022,9 @@ static inline void __run_timers(struct tvec_base *base)
984 */ 1022 */
985 lock_map_acquire(&lockdep_map); 1023 lock_map_acquire(&lockdep_map);
986 1024
1025 trace_timer_expire_entry(timer);
987 fn(data); 1026 fn(data);
1027 trace_timer_expire_exit(timer);
988 1028
989 lock_map_release(&lockdep_map); 1029 lock_map_release(&lockdep_map);
990 1030
@@ -1007,8 +1047,8 @@ static inline void __run_timers(struct tvec_base *base)
1007#ifdef CONFIG_NO_HZ 1047#ifdef CONFIG_NO_HZ
1008/* 1048/*
1009 * Find out when the next timer event is due to happen. This 1049 * Find out when the next timer event is due to happen. This
1010 * is used on S/390 to stop all activity when a cpus is idle. 1050 * is used on S/390 to stop all activity when a CPU is idle.
1011 * This functions needs to be called disabled. 1051 * This function needs to be called with interrupts disabled.
1012 */ 1052 */
1013static unsigned long __next_timer_interrupt(struct tvec_base *base) 1053static unsigned long __next_timer_interrupt(struct tvec_base *base)
1014{ 1054{
@@ -1134,7 +1174,9 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1134 unsigned long expires; 1174 unsigned long expires;
1135 1175
1136 spin_lock(&base->lock); 1176 spin_lock(&base->lock);
1137 expires = __next_timer_interrupt(base); 1177 if (time_before_eq(base->next_timer, base->timer_jiffies))
1178 base->next_timer = __next_timer_interrupt(base);
1179 expires = base->next_timer;
1138 spin_unlock(&base->lock); 1180 spin_unlock(&base->lock);
1139 1181
1140 if (time_before_eq(expires, now)) 1182 if (time_before_eq(expires, now))
@@ -1169,7 +1211,7 @@ static void run_timer_softirq(struct softirq_action *h)
1169{ 1211{
1170 struct tvec_base *base = __get_cpu_var(tvec_bases); 1212 struct tvec_base *base = __get_cpu_var(tvec_bases);
1171 1213
1172 perf_counter_do_pending(); 1214 perf_event_do_pending();
1173 1215
1174 hrtimer_run_pending(); 1216 hrtimer_run_pending();
1175 1217
@@ -1522,6 +1564,7 @@ static int __cpuinit init_timers_cpu(int cpu)
1522 INIT_LIST_HEAD(base->tv1.vec + j); 1564 INIT_LIST_HEAD(base->tv1.vec + j);
1523 1565
1524 base->timer_jiffies = jiffies; 1566 base->timer_jiffies = jiffies;
1567 base->next_timer = base->timer_jiffies;
1525 return 0; 1568 return 0;
1526} 1569}
1527 1570
@@ -1534,6 +1577,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1534 timer = list_first_entry(head, struct timer_list, entry); 1577 timer = list_first_entry(head, struct timer_list, entry);
1535 detach_timer(timer, 0); 1578 detach_timer(timer, 0);
1536 timer_set_base(timer, new_base); 1579 timer_set_base(timer, new_base);
1580 if (time_before(timer->expires, new_base->next_timer) &&
1581 !tbase_get_deferrable(timer->base))
1582 new_base->next_timer = timer->expires;
1537 internal_add_timer(new_base, timer); 1583 internal_add_timer(new_base, timer);
1538 } 1584 }
1539} 1585}
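
The recurring three-line hunks above all maintain one invariant: base->next_timer caches the earliest non-deferrable expiry and falls back to base->timer_jiffies ("unknown") when that timer is removed, so get_next_timer_interrupt() only rescans the wheel when the cache is stale. A compact userspace sketch of the same caching idea (plain comparisons instead of the time_before() jiffies helpers; a fragment, illustrative only):

struct base {
        unsigned long timer_jiffies;    /* current time cursor            */
        unsigned long next_timer;       /* cached earliest pending expiry */
};

static void cache_add(struct base *b, unsigned long expires)
{
        if (expires < b->next_timer)    /* the kernel uses time_before() here */
                b->next_timer = expires;
}

static void cache_del(struct base *b, unsigned long expires)
{
        if (expires == b->next_timer)               /* earliest timer removed:  */
                b->next_timer = b->timer_jiffies;   /* mark the cache stale     */
}

static unsigned long next_event(struct base *b,
                                unsigned long (*rescan)(struct base *))
{
        if (b->next_timer <= b->timer_jiffies)      /* stale: do the full scan  */
                b->next_timer = rescan(b);
        return b->next_timer;
}
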
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1ea0d1234f4a..d006554888dc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -11,12 +11,18 @@ config NOP_TRACER
11 11
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help
15 See Documentation/trace/ftrace-implementation.txt
14 16
15config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
16 bool 18 bool
19 help
20 See Documentation/trace/ftrace-implementation.txt
17 21
18config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
19 bool 23 bool
24 help
25 See Documentation/trace/ftrace-implementation.txt
20 26
21config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
22 bool 28 bool
@@ -28,21 +34,25 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
28config HAVE_FUNCTION_TRACE_MCOUNT_TEST 34config HAVE_FUNCTION_TRACE_MCOUNT_TEST
29 bool 35 bool
30 help 36 help
31 This gets selected when the arch tests the function_trace_stop 37 See Documentation/trace/ftrace-implementation.txt
32 variable at the mcount call site. Otherwise, this variable
33 is tested by the called function.
34 38
35config HAVE_DYNAMIC_FTRACE 39config HAVE_DYNAMIC_FTRACE
36 bool 40 bool
41 help
42 See Documentation/trace/ftrace-implementation.txt
37 43
38config HAVE_FTRACE_MCOUNT_RECORD 44config HAVE_FTRACE_MCOUNT_RECORD
39 bool 45 bool
46 help
47 See Documentation/trace/ftrace-implementation.txt
40 48
41config HAVE_HW_BRANCH_TRACER 49config HAVE_HW_BRANCH_TRACER
42 bool 50 bool
43 51
44config HAVE_SYSCALL_TRACEPOINTS 52config HAVE_SYSCALL_TRACEPOINTS
45 bool 53 bool
54 help
55 See Documentation/trace/ftrace-implementation.txt
46 56
47config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
48 bool 58 bool
@@ -73,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP
73# This allows those options to appear when no other tracer is selected. But the 83# This allows those options to appear when no other tracer is selected. But the
74# options do not appear when something else selects it. We need the two options 84# options do not appear when something else selects it. We need the two options
75# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the 85# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
76# hidding of the automatic options options. 86# hiding of the automatic options.
77 87
78config TRACING 88config TRACING
79 bool 89 bool
@@ -329,6 +339,27 @@ config POWER_TRACER
329 power management decisions, specifically the C-state and P-state 339 power management decisions, specifically the C-state and P-state
330 behavior. 340 behavior.
331 341
342config KSYM_TRACER
343 bool "Trace read and write access on kernel memory locations"
344 depends on HAVE_HW_BREAKPOINT
345 select TRACING
346 help
347 This tracer helps find read and write operations on any given kernel
348 symbol, i.e. any symbol listed in /proc/kallsyms.
349
350config PROFILE_KSYM_TRACER
351 bool "Profile all kernel memory accesses on 'watched' variables"
352 depends on KSYM_TRACER
353 help
354 This tracer profiles kernel accesses to variables watched through the
355 ksym tracer ftrace plugin. Depending on the hardware, read and/or
356 write operations on the watched kernel variables can be monitored
357 and counted.
358
359 The results will be displayed in:
360 /debugfs/tracing/profile_ksym
361
362 Say N if unsure.
332 363
333config STACK_TRACER 364config STACK_TRACER
334 bool "Trace max stack" 365 bool "Trace max stack"
@@ -418,6 +449,23 @@ config BLK_DEV_IO_TRACE
418 449
419 If unsure, say N. 450 If unsure, say N.
420 451
452config KPROBE_EVENT
453 depends on KPROBES
454 depends on X86
455 bool "Enable kprobes-based dynamic events"
456 select TRACING
457 default y
458 help
459 This allows the user to add tracing events (similar to tracepoints) on the fly
460 via the ftrace interface. See Documentation/trace/kprobetrace.txt
461 for more details.
462
463 Those events can be inserted wherever kprobes can probe, and record
464 various register and memory values.
465
466 This option is also required by the perf-probe subcommand of perf tools.
467 If you want to use perf tools, this option is strongly recommended.
468
421config DYNAMIC_FTRACE 469config DYNAMIC_FTRACE
422 bool "enable/disable ftrace tracepoints dynamically" 470 bool "enable/disable ftrace tracepoints dynamically"
423 depends on FUNCTION_TRACER 471 depends on FUNCTION_TRACER
@@ -469,6 +517,18 @@ config FTRACE_STARTUP_TEST
469 functioning properly. It will do tests on all the configured 517 functioning properly. It will do tests on all the configured
470 tracers of ftrace. 518 tracers of ftrace.
471 519
520config EVENT_TRACE_TEST_SYSCALLS
521 bool "Run selftest on syscall events"
522 depends on FTRACE_STARTUP_TEST
523 help
524 This option also enables testing of every syscall event.
525 For each event it enables the event, runs various loads with the
526 event enabled, and then disables it again. This adds a bit more time
527 to kernel boot-up, since it does this for every system call defined.
528
529 TBD - enable a way to actually call the syscalls as we test their
530 events
531
472config MMIOTRACE 532config MMIOTRACE
473 bool "Memory mapped IO tracing" 533 bool "Memory mapped IO tracing"
474 depends on HAVE_MMIOTRACE_SUPPORT && PCI 534 depends on HAVE_MMIOTRACE_SUPPORT && PCI
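
The KPROBE_EVENT option above layers dynamic trace events on top of the ordinary kprobes
API. As a rough, hedged illustration of what "wherever kprobes can probe" means, a minimal
in-kernel probe looks roughly like this (the probed symbol and the message are arbitrary
examples, not part of the patch):

	#include <linux/module.h>
	#include <linux/kprobes.h>

	static int sketch_pre(struct kprobe *p, struct pt_regs *regs)
	{
		/* a kprobe-events probe would record registers/memory here instead */
		pr_info("hit %s\n", p->symbol_name);
		return 0;
	}

	static struct kprobe sketch_kp = {
		.symbol_name = "do_sys_open",	/* example target; any kprobe-able symbol works */
		.pre_handler = sketch_pre,
	};

	static int __init sketch_init(void)
	{
		return register_kprobe(&sketch_kp);
	}

	static void __exit sketch_exit(void)
	{
		unregister_kprobe(&sketch_kp);
	}

	module_init(sketch_init);
	module_exit(sketch_exit);
	MODULE_LICENSE("GPL");

KPROBE_EVENT exposes the same capability through the ftrace debugfs interface, so no module
needs to be written; see the Documentation/trace/kprobetrace.txt file referenced in the
help text.
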
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 844164dca90a..cd9ecd89ec77 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o 44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_POWER_TRACER) += trace_power.o
46obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 45obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
47obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 46obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
48obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 47obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
@@ -54,5 +53,8 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
54obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
55obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
58obj-$(CONFIG_EVENT_TRACING) += power-traces.o
57 59
58libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3eb159c277c8..d9d6206e0b14 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -856,6 +856,37 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
856} 856}
857 857
858/** 858/**
859 * blk_add_trace_rq_remap - Add a trace for a request-remap operation
860 * @q: queue the io is for
861 * @rq: the source request
862 * @dev: target device
863 * @from: source sector
864 *
865 * Description:
866 * The device mapper remaps requests to other devices.
867 * Add a trace for that action.
868 *
869 **/
870static void blk_add_trace_rq_remap(struct request_queue *q,
871 struct request *rq, dev_t dev,
872 sector_t from)
873{
874 struct blk_trace *bt = q->blk_trace;
875 struct blk_io_trace_remap r;
876
877 if (likely(!bt))
878 return;
879
880 r.device_from = cpu_to_be32(dev);
881 r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
882 r.sector_from = cpu_to_be64(from);
883
884 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
885 rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
886 sizeof(r), &r);
887}
888
889/**
859 * blk_add_driver_data - Add binary message with driver-specific data 890 * blk_add_driver_data - Add binary message with driver-specific data
860 * @q: queue the io is for 891 * @q: queue the io is for
861 * @rq: io request 892 * @rq: io request
@@ -922,10 +953,13 @@ static void blk_register_tracepoints(void)
922 WARN_ON(ret); 953 WARN_ON(ret);
923 ret = register_trace_block_remap(blk_add_trace_remap); 954 ret = register_trace_block_remap(blk_add_trace_remap);
924 WARN_ON(ret); 955 WARN_ON(ret);
956 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap);
957 WARN_ON(ret);
925} 958}
926 959
927static void blk_unregister_tracepoints(void) 960static void blk_unregister_tracepoints(void)
928{ 961{
962 unregister_trace_block_rq_remap(blk_add_trace_rq_remap);
929 unregister_trace_block_remap(blk_add_trace_remap); 963 unregister_trace_block_remap(blk_add_trace_remap);
930 unregister_trace_block_split(blk_add_trace_split); 964 unregister_trace_block_split(blk_add_trace_split);
931 unregister_trace_block_unplug_io(blk_add_trace_unplug_io); 965 unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
@@ -1657,6 +1691,11 @@ int blk_trace_init_sysfs(struct device *dev)
1657 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); 1691 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
1658} 1692}
1659 1693
1694void blk_trace_remove_sysfs(struct device *dev)
1695{
1696 sysfs_remove_group(&dev->kobj, &blk_trace_attr_group);
1697}
1698
1660#endif /* CONFIG_BLK_DEV_IO_TRACE */ 1699#endif /* CONFIG_BLK_DEV_IO_TRACE */
1661 1700
1662#ifdef CONFIG_EVENT_TRACING 1701#ifdef CONFIG_EVENT_TRACING
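
The new blk_add_trace_rq_remap() handler above is only reached through the block_rq_remap
tracepoint that blk_register_tracepoints() now hooks. A hedged sketch of the caller side,
as a request-based remapper such as device-mapper would use it (argument names are
illustrative; the tracepoint itself is assumed to be defined in
include/trace/events/block.h as part of the same series):

	#include <trace/events/block.h>

	/* Sketch: after deciding that 'rq' on queue 'q' is being forwarded to another
	 * device, record where it came from so blktrace can correlate the two halves. */
	static void sketch_remap_request(struct request_queue *q, struct request *rq,
					 dev_t old_dev, sector_t old_sector)
	{
		/* fires blk_add_trace_rq_remap() whenever blktrace has registered its probe */
		trace_block_rq_remap(q, rq, old_dev, old_sector);
	}
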
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8c804e24f96f..e51a1bcb7bed 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -60,6 +60,13 @@ static int last_ftrace_enabled;
60/* Quick disabling of function tracer. */ 60/* Quick disabling of function tracer. */
61int function_trace_stop; 61int function_trace_stop;
62 62
63/* List for set_ftrace_pid's pids. */
64LIST_HEAD(ftrace_pids);
65struct ftrace_pid {
66 struct list_head list;
67 struct pid *pid;
68};
69
63/* 70/*
64 * ftrace_disabled is set when an anomaly is discovered. 71 * ftrace_disabled is set when an anomaly is discovered.
65 * ftrace_disabled is much stronger than ftrace_enabled. 72 * ftrace_disabled is much stronger than ftrace_enabled.
@@ -78,6 +85,10 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
78ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 85ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
79ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 86ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
80 87
88#ifdef CONFIG_FUNCTION_GRAPH_TRACER
89static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
90#endif
91
81static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 92static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
82{ 93{
83 struct ftrace_ops *op = ftrace_list; 94 struct ftrace_ops *op = ftrace_list;
@@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
155 else 166 else
156 func = ftrace_list_func; 167 func = ftrace_list_func;
157 168
158 if (ftrace_pid_trace) { 169 if (!list_empty(&ftrace_pids)) {
159 set_ftrace_pid_function(func); 170 set_ftrace_pid_function(func);
160 func = ftrace_pid_func; 171 func = ftrace_pid_func;
161 } 172 }
@@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
203 if (ftrace_list->next == &ftrace_list_end) { 214 if (ftrace_list->next == &ftrace_list_end) {
204 ftrace_func_t func = ftrace_list->func; 215 ftrace_func_t func = ftrace_list->func;
205 216
206 if (ftrace_pid_trace) { 217 if (!list_empty(&ftrace_pids)) {
207 set_ftrace_pid_function(func); 218 set_ftrace_pid_function(func);
208 func = ftrace_pid_func; 219 func = ftrace_pid_func;
209 } 220 }
@@ -225,9 +236,13 @@ static void ftrace_update_pid_func(void)
225 if (ftrace_trace_function == ftrace_stub) 236 if (ftrace_trace_function == ftrace_stub)
226 return; 237 return;
227 238
239#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
228 func = ftrace_trace_function; 240 func = ftrace_trace_function;
241#else
242 func = __ftrace_trace_function;
243#endif
229 244
230 if (ftrace_pid_trace) { 245 if (!list_empty(&ftrace_pids)) {
231 set_ftrace_pid_function(func); 246 set_ftrace_pid_function(func);
232 func = ftrace_pid_func; 247 func = ftrace_pid_func;
233 } else { 248 } else {
@@ -736,7 +751,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
736 out: 751 out:
737 mutex_unlock(&ftrace_profile_lock); 752 mutex_unlock(&ftrace_profile_lock);
738 753
739 filp->f_pos += cnt; 754 *ppos += cnt;
740 755
741 return cnt; 756 return cnt;
742} 757}
@@ -817,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
817} 832}
818#endif /* CONFIG_FUNCTION_PROFILER */ 833#endif /* CONFIG_FUNCTION_PROFILER */
819 834
820/* set when tracing only a pid */
821struct pid *ftrace_pid_trace;
822static struct pid * const ftrace_swapper_pid = &init_struct_pid; 835static struct pid * const ftrace_swapper_pid = &init_struct_pid;
823 836
824#ifdef CONFIG_DYNAMIC_FTRACE 837#ifdef CONFIG_DYNAMIC_FTRACE
@@ -1074,14 +1087,9 @@ static void ftrace_replace_code(int enable)
1074 failed = __ftrace_replace_code(rec, enable); 1087 failed = __ftrace_replace_code(rec, enable);
1075 if (failed) { 1088 if (failed) {
1076 rec->flags |= FTRACE_FL_FAILED; 1089 rec->flags |= FTRACE_FL_FAILED;
1077 if ((system_state == SYSTEM_BOOTING) || 1090 ftrace_bug(failed, rec->ip);
1078 !core_kernel_text(rec->ip)) { 1091 /* Stop processing */
1079 ftrace_free_rec(rec); 1092 return;
1080 } else {
1081 ftrace_bug(failed, rec->ip);
1082 /* Stop processing */
1083 return;
1084 }
1085 } 1093 }
1086 } while_for_each_ftrace_rec(); 1094 } while_for_each_ftrace_rec();
1087} 1095}
@@ -1262,12 +1270,34 @@ static int ftrace_update_code(struct module *mod)
1262 ftrace_new_addrs = p->newlist; 1270 ftrace_new_addrs = p->newlist;
1263 p->flags = 0L; 1271 p->flags = 0L;
1264 1272
1265 /* convert record (i.e, patch mcount-call with NOP) */ 1273 /*
1266 if (ftrace_code_disable(mod, p)) { 1274 * Do the initial record conversion from mcount jump
1267 p->flags |= FTRACE_FL_CONVERTED; 1275 * to the NOP instructions.
1268 ftrace_update_cnt++; 1276 */
1269 } else 1277 if (!ftrace_code_disable(mod, p)) {
1270 ftrace_free_rec(p); 1278 ftrace_free_rec(p);
1279 continue;
1280 }
1281
1282 p->flags |= FTRACE_FL_CONVERTED;
1283 ftrace_update_cnt++;
1284
1285 /*
1286 * If the tracing is enabled, go ahead and enable the record.
1287 *
1288 * The reason not to enable the record immediately is the
1289 * inherent check of ftrace_make_nop/ftrace_make_call for
1290 * correct previous instructions. Doing the NOP conversion
1291 * first puts the module into the correct state, thus
1292 * passing the ftrace_make_call check.
1293 */
1294 if (ftrace_start_up) {
1295 int failed = __ftrace_replace_code(p, 1);
1296 if (failed) {
1297 ftrace_bug(failed, p->ip);
1298 ftrace_free_rec(p);
1299 }
1300 }
1271 } 1301 }
1272 1302
1273 stop = ftrace_now(raw_smp_processor_id()); 1303 stop = ftrace_now(raw_smp_processor_id());
@@ -1323,11 +1353,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1323 1353
1324enum { 1354enum {
1325 FTRACE_ITER_FILTER = (1 << 0), 1355 FTRACE_ITER_FILTER = (1 << 0),
1326 FTRACE_ITER_CONT = (1 << 1), 1356 FTRACE_ITER_NOTRACE = (1 << 1),
1327 FTRACE_ITER_NOTRACE = (1 << 2), 1357 FTRACE_ITER_FAILURES = (1 << 2),
1328 FTRACE_ITER_FAILURES = (1 << 3), 1358 FTRACE_ITER_PRINTALL = (1 << 3),
1329 FTRACE_ITER_PRINTALL = (1 << 4), 1359 FTRACE_ITER_HASH = (1 << 4),
1330 FTRACE_ITER_HASH = (1 << 5),
1331}; 1360};
1332 1361
1333#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1362#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1337,8 +1366,7 @@ struct ftrace_iterator {
1337 int hidx; 1366 int hidx;
1338 int idx; 1367 int idx;
1339 unsigned flags; 1368 unsigned flags;
1340 unsigned char buffer[FTRACE_BUFF_MAX+1]; 1369 struct trace_parser parser;
1341 unsigned buffer_idx;
1342}; 1370};
1343 1371
1344static void * 1372static void *
@@ -1407,7 +1435,7 @@ static int t_hash_show(struct seq_file *m, void *v)
1407 if (rec->ops->print) 1435 if (rec->ops->print)
1408 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1436 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
1409 1437
1410 seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func); 1438 seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);
1411 1439
1412 if (rec->data) 1440 if (rec->data)
1413 seq_printf(m, ":%p", rec->data); 1441 seq_printf(m, ":%p", rec->data);
@@ -1517,12 +1545,12 @@ static int t_show(struct seq_file *m, void *v)
1517 if (!rec) 1545 if (!rec)
1518 return 0; 1546 return 0;
1519 1547
1520 seq_printf(m, "%pf\n", (void *)rec->ip); 1548 seq_printf(m, "%ps\n", (void *)rec->ip);
1521 1549
1522 return 0; 1550 return 0;
1523} 1551}
1524 1552
1525static struct seq_operations show_ftrace_seq_ops = { 1553static const struct seq_operations show_ftrace_seq_ops = {
1526 .start = t_start, 1554 .start = t_start,
1527 .next = t_next, 1555 .next = t_next,
1528 .stop = t_stop, 1556 .stop = t_stop,
@@ -1604,6 +1632,11 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1604 if (!iter) 1632 if (!iter)
1605 return -ENOMEM; 1633 return -ENOMEM;
1606 1634
1635 if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
1636 kfree(iter);
1637 return -ENOMEM;
1638 }
1639
1607 mutex_lock(&ftrace_regex_lock); 1640 mutex_lock(&ftrace_regex_lock);
1608 if ((file->f_mode & FMODE_WRITE) && 1641 if ((file->f_mode & FMODE_WRITE) &&
1609 (file->f_flags & O_TRUNC)) 1642 (file->f_flags & O_TRUNC))
@@ -1618,8 +1651,10 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1618 if (!ret) { 1651 if (!ret) {
1619 struct seq_file *m = file->private_data; 1652 struct seq_file *m = file->private_data;
1620 m->private = iter; 1653 m->private = iter;
1621 } else 1654 } else {
1655 trace_parser_put(&iter->parser);
1622 kfree(iter); 1656 kfree(iter);
1657 }
1623 } else 1658 } else
1624 file->private_data = iter; 1659 file->private_data = iter;
1625 mutex_unlock(&ftrace_regex_lock); 1660 mutex_unlock(&ftrace_regex_lock);
@@ -1652,60 +1687,6 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1652 return ret; 1687 return ret;
1653} 1688}
1654 1689
1655enum {
1656 MATCH_FULL,
1657 MATCH_FRONT_ONLY,
1658 MATCH_MIDDLE_ONLY,
1659 MATCH_END_ONLY,
1660};
1661
1662/*
1663 * (static function - no need for kernel doc)
1664 *
1665 * Pass in a buffer containing a glob and this function will
1666 * set search to point to the search part of the buffer and
1667 * return the type of search it is (see enum above).
1668 * This does modify buff.
1669 *
1670 * Returns enum type.
1671 * search returns the pointer to use for comparison.
1672 * not returns 1 if buff started with a '!'
1673 * 0 otherwise.
1674 */
1675static int
1676ftrace_setup_glob(char *buff, int len, char **search, int *not)
1677{
1678 int type = MATCH_FULL;
1679 int i;
1680
1681 if (buff[0] == '!') {
1682 *not = 1;
1683 buff++;
1684 len--;
1685 } else
1686 *not = 0;
1687
1688 *search = buff;
1689
1690 for (i = 0; i < len; i++) {
1691 if (buff[i] == '*') {
1692 if (!i) {
1693 *search = buff + 1;
1694 type = MATCH_END_ONLY;
1695 } else {
1696 if (type == MATCH_END_ONLY)
1697 type = MATCH_MIDDLE_ONLY;
1698 else
1699 type = MATCH_FRONT_ONLY;
1700 buff[i] = 0;
1701 break;
1702 }
1703 }
1704 }
1705
1706 return type;
1707}
1708
1709static int ftrace_match(char *str, char *regex, int len, int type) 1690static int ftrace_match(char *str, char *regex, int len, int type)
1710{ 1691{
1711 int matched = 0; 1692 int matched = 0;
@@ -1754,7 +1735,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
1754 int not; 1735 int not;
1755 1736
1756 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1737 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1757 type = ftrace_setup_glob(buff, len, &search, &not); 1738 type = filter_parse_regex(buff, len, &search, &not);
1758 1739
1759 search_len = strlen(search); 1740 search_len = strlen(search);
1760 1741
@@ -1822,7 +1803,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1822 } 1803 }
1823 1804
1824 if (strlen(buff)) { 1805 if (strlen(buff)) {
1825 type = ftrace_setup_glob(buff, strlen(buff), &search, &not); 1806 type = filter_parse_regex(buff, strlen(buff), &search, &not);
1826 search_len = strlen(search); 1807 search_len = strlen(search);
1827 } 1808 }
1828 1809
@@ -1987,7 +1968,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1987 int count = 0; 1968 int count = 0;
1988 char *search; 1969 char *search;
1989 1970
1990 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 1971 type = filter_parse_regex(glob, strlen(glob), &search, &not);
1991 len = strlen(search); 1972 len = strlen(search);
1992 1973
1993 /* we do not support '!' for function probes */ 1974 /* we do not support '!' for function probes */
@@ -2059,12 +2040,12 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2059 int i, len = 0; 2040 int i, len = 0;
2060 char *search; 2041 char *search;
2061 2042
2062 if (glob && (strcmp(glob, "*") || !strlen(glob))) 2043 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
2063 glob = NULL; 2044 glob = NULL;
2064 else { 2045 else if (glob) {
2065 int not; 2046 int not;
2066 2047
2067 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 2048 type = filter_parse_regex(glob, strlen(glob), &search, &not);
2068 len = strlen(search); 2049 len = strlen(search);
2069 2050
2070 /* we do not support '!' for function probes */ 2051 /* we do not support '!' for function probes */
@@ -2196,11 +2177,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2196 size_t cnt, loff_t *ppos, int enable) 2177 size_t cnt, loff_t *ppos, int enable)
2197{ 2178{
2198 struct ftrace_iterator *iter; 2179 struct ftrace_iterator *iter;
2199 char ch; 2180 struct trace_parser *parser;
2200 size_t read = 0; 2181 ssize_t ret, read;
2201 ssize_t ret;
2202 2182
2203 if (!cnt || cnt < 0) 2183 if (!cnt)
2204 return 0; 2184 return 0;
2205 2185
2206 mutex_lock(&ftrace_regex_lock); 2186 mutex_lock(&ftrace_regex_lock);
@@ -2211,70 +2191,21 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2211 } else 2191 } else
2212 iter = file->private_data; 2192 iter = file->private_data;
2213 2193
2214 if (!*ppos) { 2194 parser = &iter->parser;
2215 iter->flags &= ~FTRACE_ITER_CONT; 2195 read = trace_get_user(parser, ubuf, cnt, ppos);
2216 iter->buffer_idx = 0;
2217 }
2218
2219 ret = get_user(ch, ubuf++);
2220 if (ret)
2221 goto out;
2222 read++;
2223 cnt--;
2224 2196
2225 /* 2197 if (read >= 0 && trace_parser_loaded(parser) &&
2226 * If the parser haven't finished with the last write, 2198 !trace_parser_cont(parser)) {
2227 * continue reading the user input without skipping spaces. 2199 ret = ftrace_process_regex(parser->buffer,
2228 */ 2200 parser->idx, enable);
2229 if (!(iter->flags & FTRACE_ITER_CONT)) {
2230 /* skip white space */
2231 while (cnt && isspace(ch)) {
2232 ret = get_user(ch, ubuf++);
2233 if (ret)
2234 goto out;
2235 read++;
2236 cnt--;
2237 }
2238
2239 /* only spaces were written */
2240 if (isspace(ch)) {
2241 *ppos += read;
2242 ret = read;
2243 goto out;
2244 }
2245
2246 iter->buffer_idx = 0;
2247 }
2248
2249 while (cnt && !isspace(ch)) {
2250 if (iter->buffer_idx < FTRACE_BUFF_MAX)
2251 iter->buffer[iter->buffer_idx++] = ch;
2252 else {
2253 ret = -EINVAL;
2254 goto out;
2255 }
2256 ret = get_user(ch, ubuf++);
2257 if (ret) 2201 if (ret)
2258 goto out; 2202 goto out_unlock;
2259 read++;
2260 cnt--;
2261 }
2262 2203
2263 if (isspace(ch)) { 2204 trace_parser_clear(parser);
2264 iter->buffer[iter->buffer_idx] = 0;
2265 ret = ftrace_process_regex(iter->buffer,
2266 iter->buffer_idx, enable);
2267 if (ret)
2268 goto out;
2269 iter->buffer_idx = 0;
2270 } else {
2271 iter->flags |= FTRACE_ITER_CONT;
2272 iter->buffer[iter->buffer_idx++] = ch;
2273 } 2205 }
2274 2206
2275 *ppos += read;
2276 ret = read; 2207 ret = read;
2277 out: 2208out_unlock:
2278 mutex_unlock(&ftrace_regex_lock); 2209 mutex_unlock(&ftrace_regex_lock);
2279 2210
2280 return ret; 2211 return ret;
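
The rewritten ftrace_regex_write() above delegates all of the byte-by-byte input handling
to the new trace_parser helpers, and the same pattern generalizes to any "one token per
write" control file. A hedged sketch of the idiom (process_token() is a placeholder for
whatever the file does with the parsed word; the 128-byte buffer size is arbitrary):

	/* Sketch of the trace_parser write-handler idiom used by the hunk above. */
	static ssize_t sketch_token_write(struct file *file, const char __user *ubuf,
					  size_t cnt, loff_t *ppos)
	{
		struct trace_parser parser;
		ssize_t read, ret = 0;

		if (trace_parser_get_init(&parser, 128))
			return -ENOMEM;

		read = trace_get_user(&parser, ubuf, cnt, ppos);

		/* a token is complete once it is loaded and not flagged as continuing */
		if (read >= 0 && trace_parser_loaded(&parser) &&
		    !trace_parser_cont(&parser)) {
			parser.buffer[parser.idx] = 0;
			ret = process_token(parser.buffer);	/* hypothetical consumer */
			trace_parser_clear(&parser);
		}

		trace_parser_put(&parser);
		return ret ? ret : read;
	}
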
@@ -2358,6 +2289,32 @@ static int __init set_ftrace_filter(char *str)
2358} 2289}
2359__setup("ftrace_filter=", set_ftrace_filter); 2290__setup("ftrace_filter=", set_ftrace_filter);
2360 2291
2292#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2293static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2294static int __init set_graph_function(char *str)
2295{
2296 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
2297 return 1;
2298}
2299__setup("ftrace_graph_filter=", set_graph_function);
2300
2301static void __init set_ftrace_early_graph(char *buf)
2302{
2303 int ret;
2304 char *func;
2305
2306 while (buf) {
2307 func = strsep(&buf, ",");
2308 /* we allow only one expression at a time */
2309 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2310 func);
2311 if (ret)
2312 printk(KERN_DEBUG "ftrace: function %s not "
2313 "traceable\n", func);
2314 }
2315}
2316#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2317
2361static void __init set_ftrace_early_filter(char *buf, int enable) 2318static void __init set_ftrace_early_filter(char *buf, int enable)
2362{ 2319{
2363 char *func; 2320 char *func;
@@ -2374,6 +2331,10 @@ static void __init set_ftrace_early_filters(void)
2374 set_ftrace_early_filter(ftrace_filter_buf, 1); 2331 set_ftrace_early_filter(ftrace_filter_buf, 1);
2375 if (ftrace_notrace_buf[0]) 2332 if (ftrace_notrace_buf[0])
2376 set_ftrace_early_filter(ftrace_notrace_buf, 0); 2333 set_ftrace_early_filter(ftrace_notrace_buf, 0);
2334#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2335 if (ftrace_graph_buf[0])
2336 set_ftrace_early_graph(ftrace_graph_buf);
2337#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2377} 2338}
2378 2339
2379static int 2340static int
@@ -2381,6 +2342,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2381{ 2342{
2382 struct seq_file *m = (struct seq_file *)file->private_data; 2343 struct seq_file *m = (struct seq_file *)file->private_data;
2383 struct ftrace_iterator *iter; 2344 struct ftrace_iterator *iter;
2345 struct trace_parser *parser;
2384 2346
2385 mutex_lock(&ftrace_regex_lock); 2347 mutex_lock(&ftrace_regex_lock);
2386 if (file->f_mode & FMODE_READ) { 2348 if (file->f_mode & FMODE_READ) {
@@ -2390,9 +2352,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2390 } else 2352 } else
2391 iter = file->private_data; 2353 iter = file->private_data;
2392 2354
2393 if (iter->buffer_idx) { 2355 parser = &iter->parser;
2394 iter->buffer[iter->buffer_idx] = 0; 2356 if (trace_parser_loaded(parser)) {
2395 ftrace_match_records(iter->buffer, iter->buffer_idx, enable); 2357 parser->buffer[parser->idx] = 0;
2358 ftrace_match_records(parser->buffer, parser->idx, enable);
2396 } 2359 }
2397 2360
2398 mutex_lock(&ftrace_lock); 2361 mutex_lock(&ftrace_lock);
@@ -2400,7 +2363,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2400 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 2363 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2401 mutex_unlock(&ftrace_lock); 2364 mutex_unlock(&ftrace_lock);
2402 2365
2366 trace_parser_put(parser);
2403 kfree(iter); 2367 kfree(iter);
2368
2404 mutex_unlock(&ftrace_regex_lock); 2369 mutex_unlock(&ftrace_regex_lock);
2405 return 0; 2370 return 0;
2406} 2371}
@@ -2457,11 +2422,9 @@ unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2457static void * 2422static void *
2458__g_next(struct seq_file *m, loff_t *pos) 2423__g_next(struct seq_file *m, loff_t *pos)
2459{ 2424{
2460 unsigned long *array = m->private;
2461
2462 if (*pos >= ftrace_graph_count) 2425 if (*pos >= ftrace_graph_count)
2463 return NULL; 2426 return NULL;
2464 return &array[*pos]; 2427 return &ftrace_graph_funcs[*pos];
2465} 2428}
2466 2429
2467static void * 2430static void *
@@ -2499,12 +2462,12 @@ static int g_show(struct seq_file *m, void *v)
2499 return 0; 2462 return 0;
2500 } 2463 }
2501 2464
2502 seq_printf(m, "%pf\n", v); 2465 seq_printf(m, "%ps\n", (void *)*ptr);
2503 2466
2504 return 0; 2467 return 0;
2505} 2468}
2506 2469
2507static struct seq_operations ftrace_graph_seq_ops = { 2470static const struct seq_operations ftrace_graph_seq_ops = {
2508 .start = g_start, 2471 .start = g_start,
2509 .next = g_next, 2472 .next = g_next,
2510 .stop = g_stop, 2473 .stop = g_stop,
@@ -2525,16 +2488,10 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2525 ftrace_graph_count = 0; 2488 ftrace_graph_count = 0;
2526 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2489 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2527 } 2490 }
2491 mutex_unlock(&graph_lock);
2528 2492
2529 if (file->f_mode & FMODE_READ) { 2493 if (file->f_mode & FMODE_READ)
2530 ret = seq_open(file, &ftrace_graph_seq_ops); 2494 ret = seq_open(file, &ftrace_graph_seq_ops);
2531 if (!ret) {
2532 struct seq_file *m = file->private_data;
2533 m->private = ftrace_graph_funcs;
2534 }
2535 } else
2536 file->private_data = ftrace_graph_funcs;
2537 mutex_unlock(&graph_lock);
2538 2495
2539 return ret; 2496 return ret;
2540} 2497}
@@ -2563,7 +2520,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2563 return -ENODEV; 2520 return -ENODEV;
2564 2521
2565 /* decode regex */ 2522 /* decode regex */
2566 type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not); 2523 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2567 if (not) 2524 if (not)
2568 return -EINVAL; 2525 return -EINVAL;
2569 2526
@@ -2602,12 +2559,8 @@ static ssize_t
2602ftrace_graph_write(struct file *file, const char __user *ubuf, 2559ftrace_graph_write(struct file *file, const char __user *ubuf,
2603 size_t cnt, loff_t *ppos) 2560 size_t cnt, loff_t *ppos)
2604{ 2561{
2605 unsigned char buffer[FTRACE_BUFF_MAX+1]; 2562 struct trace_parser parser;
2606 unsigned long *array; 2563 ssize_t read, ret;
2607 size_t read = 0;
2608 ssize_t ret;
2609 int index = 0;
2610 char ch;
2611 2564
2612 if (!cnt || cnt < 0) 2565 if (!cnt || cnt < 0)
2613 return 0; 2566 return 0;
@@ -2616,60 +2569,31 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2616 2569
2617 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { 2570 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2618 ret = -EBUSY; 2571 ret = -EBUSY;
2619 goto out; 2572 goto out_unlock;
2620 } 2573 }
2621 2574
2622 if (file->f_mode & FMODE_READ) { 2575 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2623 struct seq_file *m = file->private_data; 2576 ret = -ENOMEM;
2624 array = m->private; 2577 goto out_unlock;
2625 } else
2626 array = file->private_data;
2627
2628 ret = get_user(ch, ubuf++);
2629 if (ret)
2630 goto out;
2631 read++;
2632 cnt--;
2633
2634 /* skip white space */
2635 while (cnt && isspace(ch)) {
2636 ret = get_user(ch, ubuf++);
2637 if (ret)
2638 goto out;
2639 read++;
2640 cnt--;
2641 } 2578 }
2642 2579
2643 if (isspace(ch)) { 2580 read = trace_get_user(&parser, ubuf, cnt, ppos);
2644 *ppos += read;
2645 ret = read;
2646 goto out;
2647 }
2648 2581
2649 while (cnt && !isspace(ch)) { 2582 if (read >= 0 && trace_parser_loaded((&parser))) {
2650 if (index < FTRACE_BUFF_MAX) 2583 parser.buffer[parser.idx] = 0;
2651 buffer[index++] = ch; 2584
2652 else { 2585 /* we allow only one expression at a time */
2653 ret = -EINVAL; 2586 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2654 goto out; 2587 parser.buffer);
2655 }
2656 ret = get_user(ch, ubuf++);
2657 if (ret) 2588 if (ret)
2658 goto out; 2589 goto out_free;
2659 read++;
2660 cnt--;
2661 } 2590 }
2662 buffer[index] = 0;
2663
2664 /* we allow only one expression at a time */
2665 ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
2666 if (ret)
2667 goto out;
2668
2669 file->f_pos += read;
2670 2591
2671 ret = read; 2592 ret = read;
2672 out: 2593
2594out_free:
2595 trace_parser_put(&parser);
2596out_unlock:
2673 mutex_unlock(&graph_lock); 2597 mutex_unlock(&graph_lock);
2674 2598
2675 return ret; 2599 return ret;
@@ -2707,7 +2631,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2707 return 0; 2631 return 0;
2708} 2632}
2709 2633
2710static int ftrace_convert_nops(struct module *mod, 2634static int ftrace_process_locs(struct module *mod,
2711 unsigned long *start, 2635 unsigned long *start,
2712 unsigned long *end) 2636 unsigned long *end)
2713{ 2637{
@@ -2740,19 +2664,17 @@ static int ftrace_convert_nops(struct module *mod,
2740} 2664}
2741 2665
2742#ifdef CONFIG_MODULES 2666#ifdef CONFIG_MODULES
2743void ftrace_release(void *start, void *end) 2667void ftrace_release_mod(struct module *mod)
2744{ 2668{
2745 struct dyn_ftrace *rec; 2669 struct dyn_ftrace *rec;
2746 struct ftrace_page *pg; 2670 struct ftrace_page *pg;
2747 unsigned long s = (unsigned long)start;
2748 unsigned long e = (unsigned long)end;
2749 2671
2750 if (ftrace_disabled || !start || start == end) 2672 if (ftrace_disabled)
2751 return; 2673 return;
2752 2674
2753 mutex_lock(&ftrace_lock); 2675 mutex_lock(&ftrace_lock);
2754 do_for_each_ftrace_rec(pg, rec) { 2676 do_for_each_ftrace_rec(pg, rec) {
2755 if ((rec->ip >= s) && (rec->ip < e)) { 2677 if (within_module_core(rec->ip, mod)) {
2756 /* 2678 /*
2757 * rec->ip is changed in ftrace_free_rec() 2679 * rec->ip is changed in ftrace_free_rec()
2758 * It should not between s and e if record was freed. 2680 * It should not between s and e if record was freed.
@@ -2769,7 +2691,7 @@ static void ftrace_init_module(struct module *mod,
2769{ 2691{
2770 if (ftrace_disabled || start == end) 2692 if (ftrace_disabled || start == end)
2771 return; 2693 return;
2772 ftrace_convert_nops(mod, start, end); 2694 ftrace_process_locs(mod, start, end);
2773} 2695}
2774 2696
2775static int ftrace_module_notify(struct notifier_block *self, 2697static int ftrace_module_notify(struct notifier_block *self,
@@ -2784,9 +2706,7 @@ static int ftrace_module_notify(struct notifier_block *self,
2784 mod->num_ftrace_callsites); 2706 mod->num_ftrace_callsites);
2785 break; 2707 break;
2786 case MODULE_STATE_GOING: 2708 case MODULE_STATE_GOING:
2787 ftrace_release(mod->ftrace_callsites, 2709 ftrace_release_mod(mod);
2788 mod->ftrace_callsites +
2789 mod->num_ftrace_callsites);
2790 break; 2710 break;
2791 } 2711 }
2792 2712
@@ -2832,7 +2752,7 @@ void __init ftrace_init(void)
2832 2752
2833 last_ftrace_enabled = ftrace_enabled = 1; 2753 last_ftrace_enabled = ftrace_enabled = 1;
2834 2754
2835 ret = ftrace_convert_nops(NULL, 2755 ret = ftrace_process_locs(NULL,
2836 __start_mcount_loc, 2756 __start_mcount_loc,
2837 __stop_mcount_loc); 2757 __stop_mcount_loc);
2838 2758
@@ -2865,23 +2785,6 @@ static inline void ftrace_startup_enable(int command) { }
2865# define ftrace_shutdown_sysctl() do { } while (0) 2785# define ftrace_shutdown_sysctl() do { } while (0)
2866#endif /* CONFIG_DYNAMIC_FTRACE */ 2786#endif /* CONFIG_DYNAMIC_FTRACE */
2867 2787
2868static ssize_t
2869ftrace_pid_read(struct file *file, char __user *ubuf,
2870 size_t cnt, loff_t *ppos)
2871{
2872 char buf[64];
2873 int r;
2874
2875 if (ftrace_pid_trace == ftrace_swapper_pid)
2876 r = sprintf(buf, "swapper tasks\n");
2877 else if (ftrace_pid_trace)
2878 r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
2879 else
2880 r = sprintf(buf, "no pid\n");
2881
2882 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2883}
2884
2885static void clear_ftrace_swapper(void) 2788static void clear_ftrace_swapper(void)
2886{ 2789{
2887 struct task_struct *p; 2790 struct task_struct *p;
@@ -2932,14 +2835,12 @@ static void set_ftrace_pid(struct pid *pid)
2932 rcu_read_unlock(); 2835 rcu_read_unlock();
2933} 2836}
2934 2837
2935static void clear_ftrace_pid_task(struct pid **pid) 2838static void clear_ftrace_pid_task(struct pid *pid)
2936{ 2839{
2937 if (*pid == ftrace_swapper_pid) 2840 if (pid == ftrace_swapper_pid)
2938 clear_ftrace_swapper(); 2841 clear_ftrace_swapper();
2939 else 2842 else
2940 clear_ftrace_pid(*pid); 2843 clear_ftrace_pid(pid);
2941
2942 *pid = NULL;
2943} 2844}
2944 2845
2945static void set_ftrace_pid_task(struct pid *pid) 2846static void set_ftrace_pid_task(struct pid *pid)
@@ -2950,74 +2851,184 @@ static void set_ftrace_pid_task(struct pid *pid)
2950 set_ftrace_pid(pid); 2851 set_ftrace_pid(pid);
2951} 2852}
2952 2853
2953static ssize_t 2854static int ftrace_pid_add(int p)
2954ftrace_pid_write(struct file *filp, const char __user *ubuf,
2955 size_t cnt, loff_t *ppos)
2956{ 2855{
2957 struct pid *pid; 2856 struct pid *pid;
2958 char buf[64]; 2857 struct ftrace_pid *fpid;
2959 long val; 2858 int ret = -EINVAL;
2960 int ret;
2961 2859
2962 if (cnt >= sizeof(buf)) 2860 mutex_lock(&ftrace_lock);
2963 return -EINVAL;
2964 2861
2965 if (copy_from_user(&buf, ubuf, cnt)) 2862 if (!p)
2966 return -EFAULT; 2863 pid = ftrace_swapper_pid;
2864 else
2865 pid = find_get_pid(p);
2967 2866
2968 buf[cnt] = 0; 2867 if (!pid)
2868 goto out;
2969 2869
2970 ret = strict_strtol(buf, 10, &val); 2870 ret = 0;
2971 if (ret < 0)
2972 return ret;
2973 2871
2974 mutex_lock(&ftrace_lock); 2872 list_for_each_entry(fpid, &ftrace_pids, list)
2975 if (val < 0) { 2873 if (fpid->pid == pid)
2976 /* disable pid tracing */ 2874 goto out_put;
2977 if (!ftrace_pid_trace)
2978 goto out;
2979 2875
2980 clear_ftrace_pid_task(&ftrace_pid_trace); 2876 ret = -ENOMEM;
2981 2877
2982 } else { 2878 fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
2983 /* swapper task is special */ 2879 if (!fpid)
2984 if (!val) { 2880 goto out_put;
2985 pid = ftrace_swapper_pid;
2986 if (pid == ftrace_pid_trace)
2987 goto out;
2988 } else {
2989 pid = find_get_pid(val);
2990 2881
2991 if (pid == ftrace_pid_trace) { 2882 list_add(&fpid->list, &ftrace_pids);
2992 put_pid(pid); 2883 fpid->pid = pid;
2993 goto out;
2994 }
2995 }
2996 2884
2997 if (ftrace_pid_trace) 2885 set_ftrace_pid_task(pid);
2998 clear_ftrace_pid_task(&ftrace_pid_trace);
2999 2886
3000 if (!pid) 2887 ftrace_update_pid_func();
3001 goto out; 2888 ftrace_startup_enable(0);
2889
2890 mutex_unlock(&ftrace_lock);
2891 return 0;
3002 2892
3003 ftrace_pid_trace = pid; 2893out_put:
2894 if (pid != ftrace_swapper_pid)
2895 put_pid(pid);
3004 2896
3005 set_ftrace_pid_task(ftrace_pid_trace); 2897out:
2898 mutex_unlock(&ftrace_lock);
2899 return ret;
2900}
2901
2902static void ftrace_pid_reset(void)
2903{
2904 struct ftrace_pid *fpid, *safe;
2905
2906 mutex_lock(&ftrace_lock);
2907 list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
2908 struct pid *pid = fpid->pid;
2909
2910 clear_ftrace_pid_task(pid);
2911
2912 list_del(&fpid->list);
2913 kfree(fpid);
3006 } 2914 }
3007 2915
3008 /* update the function call */
3009 ftrace_update_pid_func(); 2916 ftrace_update_pid_func();
3010 ftrace_startup_enable(0); 2917 ftrace_startup_enable(0);
3011 2918
3012 out:
3013 mutex_unlock(&ftrace_lock); 2919 mutex_unlock(&ftrace_lock);
2920}
3014 2921
3015 return cnt; 2922static void *fpid_start(struct seq_file *m, loff_t *pos)
2923{
2924 mutex_lock(&ftrace_lock);
2925
2926 if (list_empty(&ftrace_pids) && (!*pos))
2927 return (void *) 1;
2928
2929 return seq_list_start(&ftrace_pids, *pos);
2930}
2931
2932static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
2933{
2934 if (v == (void *)1)
2935 return NULL;
2936
2937 return seq_list_next(v, &ftrace_pids, pos);
2938}
2939
2940static void fpid_stop(struct seq_file *m, void *p)
2941{
2942 mutex_unlock(&ftrace_lock);
2943}
2944
2945static int fpid_show(struct seq_file *m, void *v)
2946{
2947 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
2948
2949 if (v == (void *)1) {
2950 seq_printf(m, "no pid\n");
2951 return 0;
2952 }
2953
2954 if (fpid->pid == ftrace_swapper_pid)
2955 seq_printf(m, "swapper tasks\n");
2956 else
2957 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
2958
2959 return 0;
2960}
2961
2962static const struct seq_operations ftrace_pid_sops = {
2963 .start = fpid_start,
2964 .next = fpid_next,
2965 .stop = fpid_stop,
2966 .show = fpid_show,
2967};
2968
2969static int
2970ftrace_pid_open(struct inode *inode, struct file *file)
2971{
2972 int ret = 0;
2973
2974 if ((file->f_mode & FMODE_WRITE) &&
2975 (file->f_flags & O_TRUNC))
2976 ftrace_pid_reset();
2977
2978 if (file->f_mode & FMODE_READ)
2979 ret = seq_open(file, &ftrace_pid_sops);
2980
2981 return ret;
2982}
2983
2984static ssize_t
2985ftrace_pid_write(struct file *filp, const char __user *ubuf,
2986 size_t cnt, loff_t *ppos)
2987{
2988 char buf[64], *tmp;
2989 long val;
2990 int ret;
2991
2992 if (cnt >= sizeof(buf))
2993 return -EINVAL;
2994
2995 if (copy_from_user(&buf, ubuf, cnt))
2996 return -EFAULT;
2997
2998 buf[cnt] = 0;
2999
3000 /*
3001 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
3002 * to clean the filter quietly.
3003 */
3004 tmp = strstrip(buf);
3005 if (strlen(tmp) == 0)
3006 return 1;
3007
3008 ret = strict_strtol(tmp, 10, &val);
3009 if (ret < 0)
3010 return ret;
3011
3012 ret = ftrace_pid_add(val);
3013
3014 return ret ? ret : cnt;
3015}
3016
3017static int
3018ftrace_pid_release(struct inode *inode, struct file *file)
3019{
3020 if (file->f_mode & FMODE_READ)
3021 seq_release(inode, file);
3022
3023 return 0;
3016} 3024}
3017 3025
3018static const struct file_operations ftrace_pid_fops = { 3026static const struct file_operations ftrace_pid_fops = {
3019 .read = ftrace_pid_read, 3027 .open = ftrace_pid_open,
3020 .write = ftrace_pid_write, 3028 .write = ftrace_pid_write,
3029 .read = seq_read,
3030 .llseek = seq_lseek,
3031 .release = ftrace_pid_release,
3021}; 3032};
3022 3033
3023static __init int ftrace_init_debugfs(void) 3034static __init int ftrace_init_debugfs(void)
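
With set_ftrace_pid now backed by the ftrace_pids list rather than a single
ftrace_pid_trace pointer, the filtering question becomes "is the current task covered by
any registered pid?". A simplified sketch of that test follows; it is illustrative only,
since the real code tags each matching task when the pid is added (set_ftrace_pid_task()
above), so the function-entry hot path checks a per-task flag and never walks this list:

	/* Sketch: membership test behind the set_ftrace_pid list above. */
	static bool sketch_pid_is_traced(struct task_struct *tsk)
	{
		struct ftrace_pid *fpid;

		list_for_each_entry(fpid, &ftrace_pids, list) {
			if (fpid->pid == ftrace_swapper_pid)
				continue;	/* idle tasks are handled per-cpu elsewhere */
			if (fpid->pid == task_pid(tsk))
				return true;
		}
		return false;
	}
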
@@ -3100,7 +3111,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
3100 3111
3101int 3112int
3102ftrace_enable_sysctl(struct ctl_table *table, int write, 3113ftrace_enable_sysctl(struct ctl_table *table, int write,
3103 struct file *file, void __user *buffer, size_t *lenp, 3114 void __user *buffer, size_t *lenp,
3104 loff_t *ppos) 3115 loff_t *ppos)
3105{ 3116{
3106 int ret; 3117 int ret;
@@ -3110,7 +3121,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3110 3121
3111 mutex_lock(&ftrace_lock); 3122 mutex_lock(&ftrace_lock);
3112 3123
3113 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3124 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3114 3125
3115 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 3126 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3116 goto out; 3127 goto out;
@@ -3380,4 +3391,3 @@ void ftrace_graph_stop(void)
3380 ftrace_stop(); 3391 ftrace_stop();
3381} 3392}
3382#endif 3393#endif
3383
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 81b1645c8549..a91da69f153a 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -501,7 +501,7 @@ static int __init init_kmem_tracer(void)
501 return 1; 501 return 1;
502 } 502 }
503 503
504 if (!register_tracer(&kmem_tracer)) { 504 if (register_tracer(&kmem_tracer) != 0) {
505 pr_warning("Warning: could not register the kmem tracer\n"); 505 pr_warning("Warning: could not register the kmem tracer\n");
506 return 1; 506 return 1;
507 } 507 }
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
new file mode 100644
index 000000000000..e06c6e3d56a3
--- /dev/null
+++ b/kernel/trace/power-traces.c
@@ -0,0 +1,20 @@
1/*
2 * Power trace points
3 *
4 * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com>
5 */
6
7#include <linux/string.h>
8#include <linux/types.h>
9#include <linux/workqueue.h>
10#include <linux/sched.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/power.h>
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
19EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
20
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 454e74e718cf..a1ca4956ab5e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -201,8 +201,6 @@ int tracing_is_on(void)
201} 201}
202EXPORT_SYMBOL_GPL(tracing_is_on); 202EXPORT_SYMBOL_GPL(tracing_is_on);
203 203
204#include "trace.h"
205
206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 204#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
207#define RB_ALIGNMENT 4U 205#define RB_ALIGNMENT 4U
208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 206#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -399,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s)
399 int ret; 397 int ret;
400 398
401 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" 399 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
402 "offset:0;\tsize:%u;\n", 400 "offset:0;\tsize:%u;\tsigned:%u;\n",
403 (unsigned int)sizeof(field.time_stamp)); 401 (unsigned int)sizeof(field.time_stamp),
402 (unsigned int)is_signed_type(u64));
404 403
405 ret = trace_seq_printf(s, "\tfield: local_t commit;\t" 404 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
406 "offset:%u;\tsize:%u;\n", 405 "offset:%u;\tsize:%u;\tsigned:%u;\n",
407 (unsigned int)offsetof(typeof(field), commit), 406 (unsigned int)offsetof(typeof(field), commit),
408 (unsigned int)sizeof(field.commit)); 407 (unsigned int)sizeof(field.commit),
408 (unsigned int)is_signed_type(long));
409 409
410 ret = trace_seq_printf(s, "\tfield: char data;\t" 410 ret = trace_seq_printf(s, "\tfield: char data;\t"
411 "offset:%u;\tsize:%u;\n", 411 "offset:%u;\tsize:%u;\tsigned:%u;\n",
412 (unsigned int)offsetof(typeof(field), data), 412 (unsigned int)offsetof(typeof(field), data),
413 (unsigned int)BUF_PAGE_SIZE); 413 (unsigned int)BUF_PAGE_SIZE,
414 (unsigned int)is_signed_type(char));
414 415
415 return ret; 416 return ret;
416} 417}
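
The extra "signed:%u" field exported above tells userspace parsers whether each value
needs sign extension. The is_signed_type() helper relied on here is, in essence, a
compile-time check of whether -1 compares below zero in the given type; something along
these lines (the exact macro lives in the tracing headers, so treat this as an assumption):

	/* Rough shape of the helper assumed by the hunk above. */
	#define is_signed_type(type)	(((type)(-1)) < (type)0)

	/* e.g. is_signed_type(u64) is 0, is_signed_type(long) is 1, and
	 * is_signed_type(char) depends on the architecture's plain-char signedness. */
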
@@ -485,7 +486,7 @@ struct ring_buffer_iter {
485/* Up this if you want to test the TIME_EXTENTS and normalization */ 486/* Up this if you want to test the TIME_EXTENTS and normalization */
486#define DEBUG_SHIFT 0 487#define DEBUG_SHIFT 0
487 488
488static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) 489static inline u64 rb_time_stamp(struct ring_buffer *buffer)
489{ 490{
490 /* shift to debug/test normalization and TIME_EXTENTS */ 491 /* shift to debug/test normalization and TIME_EXTENTS */
491 return buffer->clock() << DEBUG_SHIFT; 492 return buffer->clock() << DEBUG_SHIFT;
@@ -496,7 +497,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
496 u64 time; 497 u64 time;
497 498
498 preempt_disable_notrace(); 499 preempt_disable_notrace();
499 time = rb_time_stamp(buffer, cpu); 500 time = rb_time_stamp(buffer);
500 preempt_enable_no_resched_notrace(); 501 preempt_enable_no_resched_notrace();
501 502
502 return time; 503 return time;
@@ -601,7 +602,7 @@ static struct list_head *rb_list_head(struct list_head *list)
601} 602}
602 603
603/* 604/*
604 * rb_is_head_page - test if the give page is the head page 605 * rb_is_head_page - test if the given page is the head page
605 * 606 *
606 * Because the reader may move the head_page pointer, we can 607 * Because the reader may move the head_page pointer, we can
607 * not trust what the head page is (it may be pointing to 608 * not trust what the head page is (it may be pointing to
@@ -701,8 +702,8 @@ static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
701 702
702 val &= ~RB_FLAG_MASK; 703 val &= ~RB_FLAG_MASK;
703 704
704 ret = (unsigned long)cmpxchg(&list->next, 705 ret = cmpxchg((unsigned long *)&list->next,
705 val | old_flag, val | new_flag); 706 val | old_flag, val | new_flag);
706 707
707 /* check if the reader took the page */ 708 /* check if the reader took the page */
708 if ((ret & ~RB_FLAG_MASK) != val) 709 if ((ret & ~RB_FLAG_MASK) != val)
@@ -794,7 +795,7 @@ static int rb_head_page_replace(struct buffer_page *old,
794 val = *ptr & ~RB_FLAG_MASK; 795 val = *ptr & ~RB_FLAG_MASK;
795 val |= RB_PAGE_HEAD; 796 val |= RB_PAGE_HEAD;
796 797
797 ret = cmpxchg(ptr, val, &new->list); 798 ret = cmpxchg(ptr, val, (unsigned long)&new->list);
798 799
799 return ret == val; 800 return ret == val;
800} 801}
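
Both cmpxchg() fixes above are about type consistency when the low bits of a list pointer
double as state flags: the location, the expected value and the replacement must all be
plain unsigned longs. A hedged sketch of the tagging scheme (SKETCH_FLAG_MASK and the
return convention are simplified stand-ins for the ring buffer's RB_FLAG_MASK logic):

	#define SKETCH_FLAG_MASK	3UL

	/* Sketch: atomically swap the flag bits stored in the low bits of ->next. */
	static int sketch_set_flag(struct list_head *list, unsigned long old_flag,
				   unsigned long new_flag)
	{
		unsigned long val = (unsigned long)list->next & ~SKETCH_FLAG_MASK;
		unsigned long ret;

		/* operate on the pointer as an unsigned long, as the hunk above does */
		ret = cmpxchg((unsigned long *)&list->next,
			      val | old_flag, val | new_flag);

		/* someone else (e.g. the reader) may have moved the page under us */
		return (ret & ~SKETCH_FLAG_MASK) == val;
	}
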
@@ -1195,6 +1196,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1195 atomic_inc(&cpu_buffer->record_disabled); 1196 atomic_inc(&cpu_buffer->record_disabled);
1196 synchronize_sched(); 1197 synchronize_sched();
1197 1198
1199 spin_lock_irq(&cpu_buffer->reader_lock);
1198 rb_head_page_deactivate(cpu_buffer); 1200 rb_head_page_deactivate(cpu_buffer);
1199 1201
1200 for (i = 0; i < nr_pages; i++) { 1202 for (i = 0; i < nr_pages; i++) {
@@ -1209,6 +1211,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1209 return; 1211 return;
1210 1212
1211 rb_reset_cpu(cpu_buffer); 1213 rb_reset_cpu(cpu_buffer);
1214 spin_unlock_irq(&cpu_buffer->reader_lock);
1212 1215
1213 rb_check_pages(cpu_buffer); 1216 rb_check_pages(cpu_buffer);
1214 1217
@@ -1787,9 +1790,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1787static struct ring_buffer_event * 1790static struct ring_buffer_event *
1788rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1791rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1789 unsigned long length, unsigned long tail, 1792 unsigned long length, unsigned long tail,
1790 struct buffer_page *commit_page,
1791 struct buffer_page *tail_page, u64 *ts) 1793 struct buffer_page *tail_page, u64 *ts)
1792{ 1794{
1795 struct buffer_page *commit_page = cpu_buffer->commit_page;
1793 struct ring_buffer *buffer = cpu_buffer->buffer; 1796 struct ring_buffer *buffer = cpu_buffer->buffer;
1794 struct buffer_page *next_page; 1797 struct buffer_page *next_page;
1795 int ret; 1798 int ret;
@@ -1870,7 +1873,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1870 * Nested commits always have zero deltas, so 1873 * Nested commits always have zero deltas, so
1871 * just reread the time stamp 1874 * just reread the time stamp
1872 */ 1875 */
1873 *ts = rb_time_stamp(buffer, cpu_buffer->cpu); 1876 *ts = rb_time_stamp(buffer);
1874 next_page->page->time_stamp = *ts; 1877 next_page->page->time_stamp = *ts;
1875 } 1878 }
1876 1879
@@ -1892,13 +1895,10 @@ static struct ring_buffer_event *
1892__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1895__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1893 unsigned type, unsigned long length, u64 *ts) 1896 unsigned type, unsigned long length, u64 *ts)
1894{ 1897{
1895 struct buffer_page *tail_page, *commit_page; 1898 struct buffer_page *tail_page;
1896 struct ring_buffer_event *event; 1899 struct ring_buffer_event *event;
1897 unsigned long tail, write; 1900 unsigned long tail, write;
1898 1901
1899 commit_page = cpu_buffer->commit_page;
1900 /* we just need to protect against interrupts */
1901 barrier();
1902 tail_page = cpu_buffer->tail_page; 1902 tail_page = cpu_buffer->tail_page;
1903 write = local_add_return(length, &tail_page->write); 1903 write = local_add_return(length, &tail_page->write);
1904 1904
@@ -1909,7 +1909,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1909 /* See if we shot pass the end of this buffer page */ 1909 /* See if we shot pass the end of this buffer page */
1910 if (write > BUF_PAGE_SIZE) 1910 if (write > BUF_PAGE_SIZE)
1911 return rb_move_tail(cpu_buffer, length, tail, 1911 return rb_move_tail(cpu_buffer, length, tail,
1912 commit_page, tail_page, ts); 1912 tail_page, ts);
1913 1913
1914 /* We reserved something on the buffer */ 1914 /* We reserved something on the buffer */
1915 1915
@@ -2113,7 +2113,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2113 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 2113 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
2114 goto out_fail; 2114 goto out_fail;
2115 2115
2116 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); 2116 ts = rb_time_stamp(cpu_buffer->buffer);
2117 2117
2118 /* 2118 /*
2119 * Only the first commit can update the timestamp. 2119 * Only the first commit can update the timestamp.
@@ -2683,7 +2683,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2683EXPORT_SYMBOL_GPL(ring_buffer_entries); 2683EXPORT_SYMBOL_GPL(ring_buffer_entries);
2684 2684
2685/** 2685/**
2686 * ring_buffer_overrun_cpu - get the number of overruns in buffer 2686 * ring_buffer_overruns - get the number of overruns in buffer
2687 * @buffer: The ring buffer 2687 * @buffer: The ring buffer
2688 * 2688 *
2689 * Returns the total number of overruns in the ring buffer 2689 * Returns the total number of overruns in the ring buffer
@@ -2997,15 +2997,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2997} 2997}
2998 2998
2999static struct ring_buffer_event * 2999static struct ring_buffer_event *
3000rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3000rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3001{ 3001{
3002 struct ring_buffer_per_cpu *cpu_buffer;
3003 struct ring_buffer_event *event; 3002 struct ring_buffer_event *event;
3004 struct buffer_page *reader; 3003 struct buffer_page *reader;
3005 int nr_loops = 0; 3004 int nr_loops = 0;
3006 3005
3007 cpu_buffer = buffer->buffers[cpu];
3008
3009 again: 3006 again:
3010 /* 3007 /*
3011 * We repeat when a timestamp is encountered. It is possible 3008 * We repeat when a timestamp is encountered. It is possible
@@ -3049,7 +3046,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3049 case RINGBUF_TYPE_DATA: 3046 case RINGBUF_TYPE_DATA:
3050 if (ts) { 3047 if (ts) {
3051 *ts = cpu_buffer->read_stamp + event->time_delta; 3048 *ts = cpu_buffer->read_stamp + event->time_delta;
3052 ring_buffer_normalize_time_stamp(buffer, 3049 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3053 cpu_buffer->cpu, ts); 3050 cpu_buffer->cpu, ts);
3054 } 3051 }
3055 return event; 3052 return event;
@@ -3168,7 +3165,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3168 local_irq_save(flags); 3165 local_irq_save(flags);
3169 if (dolock) 3166 if (dolock)
3170 spin_lock(&cpu_buffer->reader_lock); 3167 spin_lock(&cpu_buffer->reader_lock);
3171 event = rb_buffer_peek(buffer, cpu, ts); 3168 event = rb_buffer_peek(cpu_buffer, ts);
3172 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3169 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3173 rb_advance_reader(cpu_buffer); 3170 rb_advance_reader(cpu_buffer);
3174 if (dolock) 3171 if (dolock)
@@ -3237,7 +3234,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3237 if (dolock) 3234 if (dolock)
3238 spin_lock(&cpu_buffer->reader_lock); 3235 spin_lock(&cpu_buffer->reader_lock);
3239 3236
3240 event = rb_buffer_peek(buffer, cpu, ts); 3237 event = rb_buffer_peek(cpu_buffer, ts);
3241 if (event) 3238 if (event)
3242 rb_advance_reader(cpu_buffer); 3239 rb_advance_reader(cpu_buffer);
3243 3240
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 573d3cc762c3..b2477caf09c2 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -35,6 +35,28 @@ static int disable_reader;
35module_param(disable_reader, uint, 0644); 35module_param(disable_reader, uint, 0644);
36MODULE_PARM_DESC(disable_reader, "only run producer"); 36MODULE_PARM_DESC(disable_reader, "only run producer");
37 37
38static int write_iteration = 50;
39module_param(write_iteration, uint, 0644);
40MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
41
42static int producer_nice = 19;
43static int consumer_nice = 19;
44
45static int producer_fifo = -1;
46static int consumer_fifo = -1;
47
48module_param(producer_nice, uint, 0644);
49MODULE_PARM_DESC(producer_nice, "nice prio for producer");
50
51module_param(consumer_nice, uint, 0644);
52MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
53
54module_param(producer_fifo, uint, 0644);
55MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
56
57module_param(consumer_fifo, uint, 0644);
58MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
59
38static int read_events; 60static int read_events;
39 61
40static int kill_test; 62static int kill_test;
@@ -208,15 +230,18 @@ static void ring_buffer_producer(void)
208 do { 230 do {
209 struct ring_buffer_event *event; 231 struct ring_buffer_event *event;
210 int *entry; 232 int *entry;
211 233 int i;
212 event = ring_buffer_lock_reserve(buffer, 10); 234
213 if (!event) { 235 for (i = 0; i < write_iteration; i++) {
214 missed++; 236 event = ring_buffer_lock_reserve(buffer, 10);
215 } else { 237 if (!event) {
216 hit++; 238 missed++;
217 entry = ring_buffer_event_data(event); 239 } else {
218 *entry = smp_processor_id(); 240 hit++;
219 ring_buffer_unlock_commit(buffer, event); 241 entry = ring_buffer_event_data(event);
242 *entry = smp_processor_id();
243 ring_buffer_unlock_commit(buffer, event);
244 }
220 } 245 }
221 do_gettimeofday(&end_tv); 246 do_gettimeofday(&end_tv);
222 247
@@ -263,6 +288,27 @@ static void ring_buffer_producer(void)
263 288
264 if (kill_test) 289 if (kill_test)
265 trace_printk("ERROR!\n"); 290 trace_printk("ERROR!\n");
291
292 if (!disable_reader) {
293 if (consumer_fifo < 0)
294 trace_printk("Running Consumer at nice: %d\n",
295 consumer_nice);
296 else
297 trace_printk("Running Consumer at SCHED_FIFO %d\n",
298 consumer_fifo);
299 }
300 if (producer_fifo < 0)
301 trace_printk("Running Producer at nice: %d\n",
302 producer_nice);
303 else
304 trace_printk("Running Producer at SCHED_FIFO %d\n",
305 producer_fifo);
306
307 /* Let the user know that the test is running at low priority */
308 if (producer_fifo < 0 && consumer_fifo < 0 &&
309 producer_nice == 19 && consumer_nice == 19)
310 trace_printk("WARNING!!! This test is running at lowest priority.\n");
311
266 trace_printk("Time: %lld (usecs)\n", time); 312 trace_printk("Time: %lld (usecs)\n", time);
267 trace_printk("Overruns: %lld\n", overruns); 313 trace_printk("Overruns: %lld\n", overruns);
268 if (disable_reader) 314 if (disable_reader)
@@ -392,6 +438,27 @@ static int __init ring_buffer_benchmark_init(void)
392 if (IS_ERR(producer)) 438 if (IS_ERR(producer))
393 goto out_kill; 439 goto out_kill;
394 440
441 /*
442 * Run them as low-prio background tasks by default:
443 */
444 if (!disable_reader) {
445 if (consumer_fifo >= 0) {
446 struct sched_param param = {
447 .sched_priority = consumer_fifo
448 };
449 sched_setscheduler(consumer, SCHED_FIFO, &param);
450 } else
451 set_user_nice(consumer, consumer_nice);
452 }
453
454 if (producer_fifo >= 0) {
455 struct sched_param param = {
456 .sched_priority = producer_fifo
457 };
458 sched_setscheduler(producer, SCHED_FIFO, &param);
459 } else
460 set_user_nice(producer, producer_nice);
461
395 return 0; 462 return 0;
396 463
397 out_kill: 464 out_kill:
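Both kthreads follow the same priority policy: a non-negative *_fifo parameter switches the thread to SCHED_FIFO at that priority, otherwise it stays in the normal class and only the nice value is applied. A hedged sketch of that selection factored into a helper (set_bench_prio is a hypothetical name, not something the module defines):

	#include <linux/sched.h>

	/* Hypothetical helper: apply either a FIFO priority or a nice value. */
	static void set_bench_prio(struct task_struct *task, int fifo_prio, int nice)
	{
		if (fifo_prio >= 0) {
			struct sched_param param = {
				.sched_priority = fifo_prio,
			};
			/* Promote the kthread to the real-time FIFO class. */
			sched_setscheduler(task, SCHED_FIFO, &param);
		} else {
			/* Stay in the normal class; just adjust the nice level. */
			set_user_nice(task, nice);
		}
	}

With such a helper the init path would reduce to set_bench_prio(producer, producer_fifo, producer_nice) and, when the reader is enabled, set_bench_prio(consumer, consumer_fifo, consumer_nice), which makes the producer/consumer copy-paste slip corrected above harder to reintroduce.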
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5c75deeefe30..874f2893cff0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -125,19 +125,19 @@ int ftrace_dump_on_oops;
125 125
126static int tracing_set_tracer(const char *buf); 126static int tracing_set_tracer(const char *buf);
127 127
128#define BOOTUP_TRACER_SIZE 100 128#define MAX_TRACER_SIZE 100
129static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; 129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
130static char *default_bootup_tracer; 130static char *default_bootup_tracer;
131 131
132static int __init set_ftrace(char *str) 132static int __init set_cmdline_ftrace(char *str)
133{ 133{
134 strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); 134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
135 default_bootup_tracer = bootup_tracer_buf; 135 default_bootup_tracer = bootup_tracer_buf;
136 /* We are using ftrace early, expand it */ 136 /* We are using ftrace early, expand it */
137 ring_buffer_expanded = 1; 137 ring_buffer_expanded = 1;
138 return 1; 138 return 1;
139} 139}
140__setup("ftrace=", set_ftrace); 140__setup("ftrace=", set_cmdline_ftrace);
141 141
142static int __init set_ftrace_dump_on_oops(char *str) 142static int __init set_ftrace_dump_on_oops(char *str)
143{ 143{
@@ -242,13 +242,6 @@ static struct tracer *trace_types __read_mostly;
242static struct tracer *current_trace __read_mostly; 242static struct tracer *current_trace __read_mostly;
243 243
244/* 244/*
245 * max_tracer_type_len is used to simplify the allocating of
246 * buffers to read userspace tracer names. We keep track of
247 * the longest tracer name registered.
248 */
249static int max_tracer_type_len;
250
251/*
252 * trace_types_lock is used to protect the trace_types list. 245 * trace_types_lock is used to protect the trace_types list.
253 * This lock is also used to keep user access serialized. 246 * This lock is also used to keep user access serialized.
254 * Accesses from userspace will grab this lock while userspace 247 * Accesses from userspace will grab this lock while userspace
@@ -275,12 +268,18 @@ static DEFINE_SPINLOCK(tracing_start_lock);
275 */ 268 */
276void trace_wake_up(void) 269void trace_wake_up(void)
277{ 270{
271 int cpu;
272
273 if (trace_flags & TRACE_ITER_BLOCK)
274 return;
278 /* 275 /*
279 * The runqueue_is_locked() can fail, but this is the best we 276 * The runqueue_is_locked() can fail, but this is the best we
280 * have for now: 277 * have for now:
281 */ 278 */
282 if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) 279 cpu = get_cpu();
280 if (!runqueue_is_locked(cpu))
283 wake_up(&trace_wait); 281 wake_up(&trace_wait);
282 put_cpu();
284} 283}
285 284
286static int __init set_buf_size(char *str) 285static int __init set_buf_size(char *str)
@@ -339,6 +338,112 @@ static struct {
339 338
340int trace_clock_id; 339int trace_clock_id;
341 340
341/*
342 * trace_parser_get_init - gets the buffer for trace parser
343 */
344int trace_parser_get_init(struct trace_parser *parser, int size)
345{
346 memset(parser, 0, sizeof(*parser));
347
348 parser->buffer = kmalloc(size, GFP_KERNEL);
349 if (!parser->buffer)
350 return 1;
351
352 parser->size = size;
353 return 0;
354}
355
356/*
357 * trace_parser_put - frees the buffer for trace parser
358 */
359void trace_parser_put(struct trace_parser *parser)
360{
361 kfree(parser->buffer);
362}
363
364/*
365 * trace_get_user - reads the user input string separated by space
366 * (matched by isspace(ch))
367 *
368 * For each string found the 'struct trace_parser' is updated,
369 * and the function returns.
370 *
371 * Returns number of bytes read.
372 *
373 * See kernel/trace/trace.h for 'struct trace_parser' details.
374 */
375int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
376 size_t cnt, loff_t *ppos)
377{
378 char ch;
379 size_t read = 0;
380 ssize_t ret;
381
382 if (!*ppos)
383 trace_parser_clear(parser);
384
385 ret = get_user(ch, ubuf++);
386 if (ret)
387 goto out;
388
389 read++;
390 cnt--;
391
392 /*
393 * If the parser did not finish with the last write,
394 * continue reading the user input without skipping leading spaces.
395 */
396 if (!parser->cont) {
397 /* skip white space */
398 while (cnt && isspace(ch)) {
399 ret = get_user(ch, ubuf++);
400 if (ret)
401 goto out;
402 read++;
403 cnt--;
404 }
405
406 /* only spaces were written */
407 if (isspace(ch)) {
408 *ppos += read;
409 ret = read;
410 goto out;
411 }
412
413 parser->idx = 0;
414 }
415
416 /* read the non-space input */
417 while (cnt && !isspace(ch)) {
418 if (parser->idx < parser->size - 1)
419 parser->buffer[parser->idx++] = ch;
420 else {
421 ret = -EINVAL;
422 goto out;
423 }
424 ret = get_user(ch, ubuf++);
425 if (ret)
426 goto out;
427 read++;
428 cnt--;
429 }
430
431 /* Either we finished reading the input or we must wait for another call. */
432 if (isspace(ch)) {
433 parser->buffer[parser->idx] = 0;
434 parser->cont = false;
435 } else {
436 parser->cont = true;
437 parser->buffer[parser->idx++] = ch;
438 }
439
440 *ppos += read;
441 ret = read;
442
443out:
444 return ret;
445}
446
342ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) 447ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
343{ 448{
344 int len; 449 int len;
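trace_get_user() is intended to be called from a debugfs write handler: each call extracts at most one whitespace-delimited token from the user buffer and sets parser->cont when a token is split across writes. Below is a minimal, hypothetical handler sketch built only on the helpers added here (the name example_tokens_write, the 128-byte parser size, and the pr_info() consumer are assumptions; real users such as the ftrace filter files keep the parser alive across writes, typically from open to release, so the cont state survives partial tokens):

	static ssize_t
	example_tokens_write(struct file *filp, const char __user *ubuf,
			     size_t cnt, loff_t *ppos)
	{
		struct trace_parser parser;
		int read;

		if (trace_parser_get_init(&parser, 128))
			return -ENOMEM;

		read = trace_get_user(&parser, ubuf, cnt, ppos);

		/* A complete token is present only if parsing did not stop mid-word. */
		if (read >= 0 && trace_parser_loaded(&parser) && !trace_parser_cont(&parser))
			pr_info("token: %s\n", parser.buffer);

		trace_parser_put(&parser);
		return read;
	}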
@@ -513,7 +618,6 @@ __releases(kernel_lock)
513__acquires(kernel_lock) 618__acquires(kernel_lock)
514{ 619{
515 struct tracer *t; 620 struct tracer *t;
516 int len;
517 int ret = 0; 621 int ret = 0;
518 622
519 if (!type->name) { 623 if (!type->name) {
@@ -521,6 +625,11 @@ __acquires(kernel_lock)
521 return -1; 625 return -1;
522 } 626 }
523 627
628 if (strlen(type->name) > MAX_TRACER_SIZE) {
629 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
630 return -1;
631 }
632
524 /* 633 /*
525 * When this gets called we hold the BKL which means that 634 * When this gets called we hold the BKL which means that
526 * preemption is disabled. Various trace selftests however 635 * preemption is disabled. Various trace selftests however
@@ -535,7 +644,7 @@ __acquires(kernel_lock)
535 for (t = trace_types; t; t = t->next) { 644 for (t = trace_types; t; t = t->next) {
536 if (strcmp(type->name, t->name) == 0) { 645 if (strcmp(type->name, t->name) == 0) {
537 /* already found */ 646 /* already found */
538 pr_info("Trace %s already registered\n", 647 pr_info("Tracer %s already registered\n",
539 type->name); 648 type->name);
540 ret = -1; 649 ret = -1;
541 goto out; 650 goto out;
@@ -586,9 +695,6 @@ __acquires(kernel_lock)
586 695
587 type->next = trace_types; 696 type->next = trace_types;
588 trace_types = type; 697 trace_types = type;
589 len = strlen(type->name);
590 if (len > max_tracer_type_len)
591 max_tracer_type_len = len;
592 698
593 out: 699 out:
594 tracing_selftest_running = false; 700 tracing_selftest_running = false;
@@ -597,7 +703,7 @@ __acquires(kernel_lock)
597 if (ret || !default_bootup_tracer) 703 if (ret || !default_bootup_tracer)
598 goto out_unlock; 704 goto out_unlock;
599 705
600 if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE)) 706 if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
601 goto out_unlock; 707 goto out_unlock;
602 708
603 printk(KERN_INFO "Starting tracer '%s'\n", type->name); 709 printk(KERN_INFO "Starting tracer '%s'\n", type->name);
@@ -619,14 +725,13 @@ __acquires(kernel_lock)
619void unregister_tracer(struct tracer *type) 725void unregister_tracer(struct tracer *type)
620{ 726{
621 struct tracer **t; 727 struct tracer **t;
622 int len;
623 728
624 mutex_lock(&trace_types_lock); 729 mutex_lock(&trace_types_lock);
625 for (t = &trace_types; *t; t = &(*t)->next) { 730 for (t = &trace_types; *t; t = &(*t)->next) {
626 if (*t == type) 731 if (*t == type)
627 goto found; 732 goto found;
628 } 733 }
629 pr_info("Trace %s not registered\n", type->name); 734 pr_info("Tracer %s not registered\n", type->name);
630 goto out; 735 goto out;
631 736
632 found: 737 found:
@@ -639,17 +744,7 @@ void unregister_tracer(struct tracer *type)
639 current_trace->stop(&global_trace); 744 current_trace->stop(&global_trace);
640 current_trace = &nop_trace; 745 current_trace = &nop_trace;
641 } 746 }
642 747out:
643 if (strlen(type->name) != max_tracer_type_len)
644 goto out;
645
646 max_tracer_type_len = 0;
647 for (t = &trace_types; *t; t = &(*t)->next) {
648 len = strlen((*t)->name);
649 if (len > max_tracer_type_len)
650 max_tracer_type_len = len;
651 }
652 out:
653 mutex_unlock(&trace_types_lock); 748 mutex_unlock(&trace_types_lock);
654} 749}
655 750
@@ -719,6 +814,11 @@ static void trace_init_cmdlines(void)
719 cmdline_idx = 0; 814 cmdline_idx = 0;
720} 815}
721 816
817int is_tracing_stopped(void)
818{
819 return trace_stop_count;
820}
821
722/** 822/**
723 * ftrace_off_permanent - disable all ftrace code permanently 823 * ftrace_off_permanent - disable all ftrace code permanently
724 * 824 *
@@ -886,7 +986,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
886 986
887 entry->preempt_count = pc & 0xff; 987 entry->preempt_count = pc & 0xff;
888 entry->pid = (tsk) ? tsk->pid : 0; 988 entry->pid = (tsk) ? tsk->pid : 0;
889 entry->tgid = (tsk) ? tsk->tgid : 0; 989 entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
890 entry->flags = 990 entry->flags =
891#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 991#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
892 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 992 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1068,6 +1168,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1068 return; 1168 return;
1069 entry = ring_buffer_event_data(event); 1169 entry = ring_buffer_event_data(event);
1070 1170
1171 entry->tgid = current->tgid;
1071 memset(&entry->caller, 0, sizeof(entry->caller)); 1172 memset(&entry->caller, 0, sizeof(entry->caller));
1072 1173
1073 trace.nr_entries = 0; 1174 trace.nr_entries = 0;
@@ -1094,6 +1195,7 @@ ftrace_trace_special(void *__tr,
1094 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1195 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1095 int pc) 1196 int pc)
1096{ 1197{
1198 struct ftrace_event_call *call = &event_special;
1097 struct ring_buffer_event *event; 1199 struct ring_buffer_event *event;
1098 struct trace_array *tr = __tr; 1200 struct trace_array *tr = __tr;
1099 struct ring_buffer *buffer = tr->buffer; 1201 struct ring_buffer *buffer = tr->buffer;
@@ -1107,7 +1209,9 @@ ftrace_trace_special(void *__tr,
1107 entry->arg1 = arg1; 1209 entry->arg1 = arg1;
1108 entry->arg2 = arg2; 1210 entry->arg2 = arg2;
1109 entry->arg3 = arg3; 1211 entry->arg3 = arg3;
1110 trace_buffer_unlock_commit(buffer, event, 0, pc); 1212
1213 if (!filter_check_discard(call, entry, buffer, event))
1214 trace_buffer_unlock_commit(buffer, event, 0, pc);
1111} 1215}
1112 1216
1113void 1217void
@@ -1257,10 +1361,11 @@ int trace_array_vprintk(struct trace_array *tr,
1257 pause_graph_tracing(); 1361 pause_graph_tracing();
1258 raw_local_irq_save(irq_flags); 1362 raw_local_irq_save(irq_flags);
1259 __raw_spin_lock(&trace_buf_lock); 1363 __raw_spin_lock(&trace_buf_lock);
1260 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); 1364 if (args == NULL) {
1261 1365 strncpy(trace_buf, fmt, TRACE_BUF_SIZE);
1262 len = min(len, TRACE_BUF_SIZE-1); 1366 len = strlen(trace_buf);
1263 trace_buf[len] = 0; 1367 } else
1368 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1264 1369
1265 size = sizeof(*entry) + len + 1; 1370 size = sizeof(*entry) + len + 1;
1266 buffer = tr->buffer; 1371 buffer = tr->buffer;
@@ -1269,10 +1374,10 @@ int trace_array_vprintk(struct trace_array *tr,
1269 if (!event) 1374 if (!event)
1270 goto out_unlock; 1375 goto out_unlock;
1271 entry = ring_buffer_event_data(event); 1376 entry = ring_buffer_event_data(event);
1272 entry->ip = ip; 1377 entry->ip = ip;
1273 1378
1274 memcpy(&entry->buf, trace_buf, len); 1379 memcpy(&entry->buf, trace_buf, len);
1275 entry->buf[len] = 0; 1380 entry->buf[len] = '\0';
1276 if (!filter_check_discard(call, entry, buffer, event)) 1381 if (!filter_check_discard(call, entry, buffer, event))
1277 ring_buffer_unlock_commit(buffer, event); 1382 ring_buffer_unlock_commit(buffer, event);
1278 1383
@@ -1289,7 +1394,7 @@ int trace_array_vprintk(struct trace_array *tr,
1289 1394
1290int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 1395int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1291{ 1396{
1292 return trace_array_printk(&global_trace, ip, fmt, args); 1397 return trace_array_vprintk(&global_trace, ip, fmt, args);
1293} 1398}
1294EXPORT_SYMBOL_GPL(trace_vprintk); 1399EXPORT_SYMBOL_GPL(trace_vprintk);
1295 1400
@@ -1530,10 +1635,10 @@ static void print_lat_help_header(struct seq_file *m)
1530 seq_puts(m, "# | / _----=> need-resched \n"); 1635 seq_puts(m, "# | / _----=> need-resched \n");
1531 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1636 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1532 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1637 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1533 seq_puts(m, "# |||| / \n"); 1638 seq_puts(m, "# |||| /_--=> lock-depth \n");
1534 seq_puts(m, "# ||||| delay \n"); 1639 seq_puts(m, "# |||||/ delay \n");
1535 seq_puts(m, "# cmd pid ||||| time | caller \n"); 1640 seq_puts(m, "# cmd pid |||||| time | caller \n");
1536 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1641 seq_puts(m, "# \\ / |||||| \\ | / \n");
1537} 1642}
1538 1643
1539static void print_func_help_header(struct seq_file *m) 1644static void print_func_help_header(struct seq_file *m)
@@ -1845,7 +1950,7 @@ static int s_show(struct seq_file *m, void *v)
1845 return 0; 1950 return 0;
1846} 1951}
1847 1952
1848static struct seq_operations tracer_seq_ops = { 1953static const struct seq_operations tracer_seq_ops = {
1849 .start = s_start, 1954 .start = s_start,
1850 .next = s_next, 1955 .next = s_next,
1851 .stop = s_stop, 1956 .stop = s_stop,
@@ -1880,11 +1985,9 @@ __tracing_open(struct inode *inode, struct file *file)
1880 if (current_trace) 1985 if (current_trace)
1881 *iter->trace = *current_trace; 1986 *iter->trace = *current_trace;
1882 1987
1883 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) 1988 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
1884 goto fail; 1989 goto fail;
1885 1990
1886 cpumask_clear(iter->started);
1887
1888 if (current_trace && current_trace->print_max) 1991 if (current_trace && current_trace->print_max)
1889 iter->tr = &max_tr; 1992 iter->tr = &max_tr;
1890 else 1993 else
@@ -2059,7 +2162,7 @@ static int t_show(struct seq_file *m, void *v)
2059 return 0; 2162 return 0;
2060} 2163}
2061 2164
2062static struct seq_operations show_traces_seq_ops = { 2165static const struct seq_operations show_traces_seq_ops = {
2063 .start = t_start, 2166 .start = t_start,
2064 .next = t_next, 2167 .next = t_next,
2065 .stop = t_stop, 2168 .stop = t_stop,
@@ -2338,7 +2441,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2338 return ret; 2441 return ret;
2339 } 2442 }
2340 2443
2341 filp->f_pos += cnt; 2444 *ppos += cnt;
2342 2445
2343 return cnt; 2446 return cnt;
2344} 2447}
@@ -2480,7 +2583,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2480 } 2583 }
2481 mutex_unlock(&trace_types_lock); 2584 mutex_unlock(&trace_types_lock);
2482 2585
2483 filp->f_pos += cnt; 2586 *ppos += cnt;
2484 2587
2485 return cnt; 2588 return cnt;
2486} 2589}
@@ -2489,7 +2592,7 @@ static ssize_t
2489tracing_set_trace_read(struct file *filp, char __user *ubuf, 2592tracing_set_trace_read(struct file *filp, char __user *ubuf,
2490 size_t cnt, loff_t *ppos) 2593 size_t cnt, loff_t *ppos)
2491{ 2594{
2492 char buf[max_tracer_type_len+2]; 2595 char buf[MAX_TRACER_SIZE+2];
2493 int r; 2596 int r;
2494 2597
2495 mutex_lock(&trace_types_lock); 2598 mutex_lock(&trace_types_lock);
@@ -2639,15 +2742,15 @@ static ssize_t
2639tracing_set_trace_write(struct file *filp, const char __user *ubuf, 2742tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2640 size_t cnt, loff_t *ppos) 2743 size_t cnt, loff_t *ppos)
2641{ 2744{
2642 char buf[max_tracer_type_len+1]; 2745 char buf[MAX_TRACER_SIZE+1];
2643 int i; 2746 int i;
2644 size_t ret; 2747 size_t ret;
2645 int err; 2748 int err;
2646 2749
2647 ret = cnt; 2750 ret = cnt;
2648 2751
2649 if (cnt > max_tracer_type_len) 2752 if (cnt > MAX_TRACER_SIZE)
2650 cnt = max_tracer_type_len; 2753 cnt = MAX_TRACER_SIZE;
2651 2754
2652 if (copy_from_user(&buf, ubuf, cnt)) 2755 if (copy_from_user(&buf, ubuf, cnt))
2653 return -EFAULT; 2756 return -EFAULT;
@@ -2662,7 +2765,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2662 if (err) 2765 if (err)
2663 return err; 2766 return err;
2664 2767
2665 filp->f_pos += ret; 2768 *ppos += ret;
2666 2769
2667 return ret; 2770 return ret;
2668} 2771}
@@ -3197,7 +3300,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3197 } 3300 }
3198 } 3301 }
3199 3302
3200 filp->f_pos += cnt; 3303 *ppos += cnt;
3201 3304
3202 /* If check pages failed, return ENOMEM */ 3305 /* If check pages failed, return ENOMEM */
3203 if (tracing_disabled) 3306 if (tracing_disabled)
@@ -3217,22 +3320,11 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3217 return cnt; 3320 return cnt;
3218} 3321}
3219 3322
3220static int mark_printk(const char *fmt, ...)
3221{
3222 int ret;
3223 va_list args;
3224 va_start(args, fmt);
3225 ret = trace_vprintk(0, fmt, args);
3226 va_end(args);
3227 return ret;
3228}
3229
3230static ssize_t 3323static ssize_t
3231tracing_mark_write(struct file *filp, const char __user *ubuf, 3324tracing_mark_write(struct file *filp, const char __user *ubuf,
3232 size_t cnt, loff_t *fpos) 3325 size_t cnt, loff_t *fpos)
3233{ 3326{
3234 char *buf; 3327 char *buf;
3235 char *end;
3236 3328
3237 if (tracing_disabled) 3329 if (tracing_disabled)
3238 return -EINVAL; 3330 return -EINVAL;
@@ -3240,7 +3332,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3240 if (cnt > TRACE_BUF_SIZE) 3332 if (cnt > TRACE_BUF_SIZE)
3241 cnt = TRACE_BUF_SIZE; 3333 cnt = TRACE_BUF_SIZE;
3242 3334
3243 buf = kmalloc(cnt + 1, GFP_KERNEL); 3335 buf = kmalloc(cnt + 2, GFP_KERNEL);
3244 if (buf == NULL) 3336 if (buf == NULL)
3245 return -ENOMEM; 3337 return -ENOMEM;
3246 3338
@@ -3248,14 +3340,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3248 kfree(buf); 3340 kfree(buf);
3249 return -EFAULT; 3341 return -EFAULT;
3250 } 3342 }
3343 if (buf[cnt-1] != '\n') {
3344 buf[cnt] = '\n';
3345 buf[cnt+1] = '\0';
3346 } else
3347 buf[cnt] = '\0';
3251 3348
3252 /* Cut from the first nil or newline. */ 3349 cnt = trace_vprintk(0, buf, NULL);
3253 buf[cnt] = '\0';
3254 end = strchr(buf, '\n');
3255 if (end)
3256 *end = '\0';
3257
3258 cnt = mark_printk("%s\n", buf);
3259 kfree(buf); 3350 kfree(buf);
3260 *fpos += cnt; 3351 *fpos += cnt;
3261 3352
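After this change a write to the trace_marker file is handed directly to trace_vprintk() with a NULL va_list, and a trailing newline is appended when the caller did not supply one. A small userspace sketch of injecting a marker from a test program (assuming debugfs is mounted at /sys/kernel/debug and tracing is enabled):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *msg = "hello from userspace";	/* trailing newline is now optional */
		int fd;

		fd = open("/sys/kernel/debug/tracing/trace_marker", O_WRONLY);
		if (fd < 0)
			return 1;

		if (write(fd, msg, strlen(msg)) < 0) {
			close(fd);
			return 1;
		}
		close(fd);
		return 0;	/* the string shows up in the trace as a print event */
	}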
@@ -3628,7 +3719,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3628 3719
3629 s = kmalloc(sizeof(*s), GFP_KERNEL); 3720 s = kmalloc(sizeof(*s), GFP_KERNEL);
3630 if (!s) 3721 if (!s)
3631 return ENOMEM; 3722 return -ENOMEM;
3632 3723
3633 trace_seq_init(s); 3724 trace_seq_init(s);
3634 3725
@@ -4285,7 +4376,7 @@ __init static int tracer_alloc_buffers(void)
4285 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4376 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4286 goto out_free_buffer_mask; 4377 goto out_free_buffer_mask;
4287 4378
4288 if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) 4379 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4289 goto out_free_tracing_cpumask; 4380 goto out_free_tracing_cpumask;
4290 4381
4291 /* To save memory, keep the ring buffer size to its minimum */ 4382 /* To save memory, keep the ring buffer size to its minimum */
@@ -4296,7 +4387,6 @@ __init static int tracer_alloc_buffers(void)
4296 4387
4297 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4388 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4298 cpumask_copy(tracing_cpumask, cpu_all_mask); 4389 cpumask_copy(tracing_cpumask, cpu_all_mask);
4299 cpumask_clear(tracing_reader_cpumask);
4300 4390
4301 /* TODO: make the number of buffers hot pluggable with CPUS */ 4391 /* TODO: make the number of buffers hot pluggable with CPUS */
4302 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4392 global_trace.buffer = ring_buffer_alloc(ring_buf_size,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fa1dccb579d5..1d7f4830a80d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -7,10 +7,11 @@
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h>
10#include <linux/ftrace.h> 11#include <linux/ftrace.h>
11#include <trace/boot.h> 12#include <trace/boot.h>
12#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
13#include <trace/power.h> 14#include <linux/hw_breakpoint.h>
14 15
15#include <linux/trace_seq.h> 16#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h> 17#include <linux/ftrace_event.h>
@@ -36,182 +37,101 @@ enum trace_type {
36 TRACE_HW_BRANCHES, 37 TRACE_HW_BRANCHES,
37 TRACE_KMEM_ALLOC, 38 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE, 39 TRACE_KMEM_FREE,
39 TRACE_POWER,
40 TRACE_BLK, 40 TRACE_BLK,
41 TRACE_KSYM,
41 42
42 __TRACE_LAST_TYPE, 43 __TRACE_LAST_TYPE,
43}; 44};
44 45
45/* 46enum kmemtrace_type_id {
46 * Function trace entry - function address and parent function addres: 47 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
47 */ 48 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
48struct ftrace_entry { 49 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
49 struct trace_entry ent;
50 unsigned long ip;
51 unsigned long parent_ip;
52};
53
54/* Function call entry */
55struct ftrace_graph_ent_entry {
56 struct trace_entry ent;
57 struct ftrace_graph_ent graph_ent;
58}; 50};
59 51
60/* Function return entry */
61struct ftrace_graph_ret_entry {
62 struct trace_entry ent;
63 struct ftrace_graph_ret ret;
64};
65extern struct tracer boot_tracer; 52extern struct tracer boot_tracer;
66 53
67/* 54#undef __field
68 * Context switch trace entry - which task (and prio) we switched from/to: 55#define __field(type, item) type item;
69 */
70struct ctx_switch_entry {
71 struct trace_entry ent;
72 unsigned int prev_pid;
73 unsigned char prev_prio;
74 unsigned char prev_state;
75 unsigned int next_pid;
76 unsigned char next_prio;
77 unsigned char next_state;
78 unsigned int next_cpu;
79};
80
81/*
82 * Special (free-form) trace entry:
83 */
84struct special_entry {
85 struct trace_entry ent;
86 unsigned long arg1;
87 unsigned long arg2;
88 unsigned long arg3;
89};
90
91/*
92 * Stack-trace entry:
93 */
94
95#define FTRACE_STACK_ENTRIES 8
96
97struct stack_entry {
98 struct trace_entry ent;
99 unsigned long caller[FTRACE_STACK_ENTRIES];
100};
101 56
102struct userstack_entry { 57#undef __field_struct
103 struct trace_entry ent; 58#define __field_struct(type, item) __field(type, item)
104 unsigned long caller[FTRACE_STACK_ENTRIES];
105};
106 59
107/* 60#undef __field_desc
108 * trace_printk entry: 61#define __field_desc(type, container, item)
109 */
110struct bprint_entry {
111 struct trace_entry ent;
112 unsigned long ip;
113 const char *fmt;
114 u32 buf[];
115};
116 62
117struct print_entry { 63#undef __array
118 struct trace_entry ent; 64#define __array(type, item, size) type item[size];
119 unsigned long ip;
120 char buf[];
121};
122 65
123#define TRACE_OLD_SIZE 88 66#undef __array_desc
67#define __array_desc(type, container, item, size)
124 68
125struct trace_field_cont { 69#undef __dynamic_array
126 unsigned char type; 70#define __dynamic_array(type, item) type item[];
127 /* Temporary till we get rid of this completely */
128 char buf[TRACE_OLD_SIZE - 1];
129};
130 71
131struct trace_mmiotrace_rw { 72#undef F_STRUCT
132 struct trace_entry ent; 73#define F_STRUCT(args...) args
133 struct mmiotrace_rw rw;
134};
135 74
136struct trace_mmiotrace_map { 75#undef FTRACE_ENTRY
137 struct trace_entry ent; 76#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
138 struct mmiotrace_map map; 77 struct struct_name { \
139}; 78 struct trace_entry ent; \
79 tstruct \
80 }
140 81
141struct trace_boot_call { 82#undef TP_ARGS
142 struct trace_entry ent; 83#define TP_ARGS(args...) args
143 struct boot_trace_call boot_call;
144};
145 84
146struct trace_boot_ret { 85#undef FTRACE_ENTRY_DUP
147 struct trace_entry ent; 86#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
148 struct boot_trace_ret boot_ret;
149};
150 87
151#define TRACE_FUNC_SIZE 30 88#include "trace_entries.h"
152#define TRACE_FILE_SIZE 20
153struct trace_branch {
154 struct trace_entry ent;
155 unsigned line;
156 char func[TRACE_FUNC_SIZE+1];
157 char file[TRACE_FILE_SIZE+1];
158 char correct;
159};
160 89
161struct hw_branch_entry { 90/*
91 * syscalls are special, and need special handling, this is why
92 * they are not included in trace_entries.h
93 */
94struct syscall_trace_enter {
162 struct trace_entry ent; 95 struct trace_entry ent;
163 u64 from; 96 int nr;
164 u64 to; 97 unsigned long args[];
165}; 98};
166 99
167struct trace_power { 100struct syscall_trace_exit {
168 struct trace_entry ent; 101 struct trace_entry ent;
169 struct power_trace state_data; 102 int nr;
170}; 103 long ret;
171
172enum kmemtrace_type_id {
173 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
174 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
175 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
176}; 104};
177 105
178struct kmemtrace_alloc_entry { 106struct kprobe_trace_entry {
179 struct trace_entry ent; 107 struct trace_entry ent;
180 enum kmemtrace_type_id type_id; 108 unsigned long ip;
181 unsigned long call_site; 109 int nargs;
182 const void *ptr; 110 unsigned long args[];
183 size_t bytes_req;
184 size_t bytes_alloc;
185 gfp_t gfp_flags;
186 int node;
187}; 111};
188 112
189struct kmemtrace_free_entry { 113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \
190 struct trace_entry ent; 114 (offsetof(struct kprobe_trace_entry, args) + \
191 enum kmemtrace_type_id type_id; 115 (sizeof(unsigned long) * (n)))
192 unsigned long call_site;
193 const void *ptr;
194};
195 116
196struct syscall_trace_enter { 117struct kretprobe_trace_entry {
197 struct trace_entry ent; 118 struct trace_entry ent;
198 int nr; 119 unsigned long func;
120 unsigned long ret_ip;
121 int nargs;
199 unsigned long args[]; 122 unsigned long args[];
200}; 123};
201 124
202struct syscall_trace_exit { 125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
203 struct trace_entry ent; 126 (offsetof(struct kretprobe_trace_entry, args) + \
204 int nr; 127 (sizeof(unsigned long) * (n)))
205 unsigned long ret;
206};
207
208 128
209/* 129/*
210 * trace_flag_type is an enumeration that holds different 130 * trace_flag_type is an enumeration that holds different
211 * states when a trace occurs. These are: 131 * states when a trace occurs. These are:
212 * IRQS_OFF - interrupts were disabled 132 * IRQS_OFF - interrupts were disabled
213 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags 133 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
214 * NEED_RESCED - reschedule is requested 134 * NEED_RESCHED - reschedule is requested
215 * HARDIRQ - inside an interrupt handler 135 * HARDIRQ - inside an interrupt handler
216 * SOFTIRQ - inside a softirq handler 136 * SOFTIRQ - inside a softirq handler
217 */ 137 */
@@ -310,11 +230,11 @@ extern void __ftrace_bad_type(void);
310 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 230 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
311 TRACE_GRAPH_RET); \ 231 TRACE_GRAPH_RET); \
312 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ 232 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
313 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
314 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ 233 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
315 TRACE_KMEM_ALLOC); \ 234 TRACE_KMEM_ALLOC); \
316 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
317 TRACE_KMEM_FREE); \ 236 TRACE_KMEM_FREE); \
237 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
318 __ftrace_bad_type(); \ 238 __ftrace_bad_type(); \
319 } while (0) 239 } while (0)
320 240
@@ -390,7 +310,6 @@ struct tracer {
390 struct tracer *next; 310 struct tracer *next;
391 int print_max; 311 int print_max;
392 struct tracer_flags *flags; 312 struct tracer_flags *flags;
393 struct tracer_stat *stats;
394}; 313};
395 314
396 315
@@ -469,6 +388,9 @@ void tracing_stop_sched_switch_record(void);
469void tracing_start_sched_switch_record(void); 388void tracing_start_sched_switch_record(void);
470int register_tracer(struct tracer *type); 389int register_tracer(struct tracer *type);
471void unregister_tracer(struct tracer *type); 390void unregister_tracer(struct tracer *type);
391int is_tracing_stopped(void);
392
393extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
472 394
473extern unsigned long nsecs_to_usecs(unsigned long nsecs); 395extern unsigned long nsecs_to_usecs(unsigned long nsecs);
474 396
@@ -509,20 +431,6 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
509 431
510extern cycle_t ftrace_now(int cpu); 432extern cycle_t ftrace_now(int cpu);
511 433
512#ifdef CONFIG_CONTEXT_SWITCH_TRACER
513typedef void
514(*tracer_switch_func_t)(void *private,
515 void *__rq,
516 struct task_struct *prev,
517 struct task_struct *next);
518
519struct tracer_switch_ops {
520 tracer_switch_func_t func;
521 void *private;
522 struct tracer_switch_ops *next;
523};
524#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
525
526extern void trace_find_cmdline(int pid, char comm[]); 434extern void trace_find_cmdline(int pid, char comm[]);
527 435
528#ifdef CONFIG_DYNAMIC_FTRACE 436#ifdef CONFIG_DYNAMIC_FTRACE
@@ -558,6 +466,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
558 struct trace_array *tr); 466 struct trace_array *tr);
559extern int trace_selftest_startup_hw_branches(struct tracer *trace, 467extern int trace_selftest_startup_hw_branches(struct tracer *trace,
560 struct trace_array *tr); 468 struct trace_array *tr);
469extern int trace_selftest_startup_ksym(struct tracer *trace,
470 struct trace_array *tr);
561#endif /* CONFIG_FTRACE_STARTUP_TEST */ 471#endif /* CONFIG_FTRACE_STARTUP_TEST */
562 472
563extern void *head_page(struct trace_array_cpu *data); 473extern void *head_page(struct trace_array_cpu *data);
@@ -603,10 +513,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
603 return 0; 513 return 0;
604} 514}
605#else 515#else
606static inline int ftrace_trace_addr(unsigned long addr)
607{
608 return 1;
609}
610static inline int ftrace_graph_addr(unsigned long addr) 516static inline int ftrace_graph_addr(unsigned long addr)
611{ 517{
612 return 1; 518 return 1;
@@ -620,12 +526,12 @@ print_graph_function(struct trace_iterator *iter)
620} 526}
621#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 527#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
622 528
623extern struct pid *ftrace_pid_trace; 529extern struct list_head ftrace_pids;
624 530
625#ifdef CONFIG_FUNCTION_TRACER 531#ifdef CONFIG_FUNCTION_TRACER
626static inline int ftrace_trace_task(struct task_struct *task) 532static inline int ftrace_trace_task(struct task_struct *task)
627{ 533{
628 if (!ftrace_pid_trace) 534 if (list_empty(&ftrace_pids))
629 return 1; 535 return 1;
630 536
631 return test_tsk_trace_trace(task); 537 return test_tsk_trace_trace(task);
@@ -638,6 +544,41 @@ static inline int ftrace_trace_task(struct task_struct *task)
638#endif 544#endif
639 545
640/* 546/*
547 * struct trace_parser - helper for reading the user input separated by spaces
548 * @cont: set if the input is not complete - no final space char was found
549 * @buffer: holds the parsed user input
550 * @idx: user input length
551 * @size: buffer size
552 */
553struct trace_parser {
554 bool cont;
555 char *buffer;
556 unsigned idx;
557 unsigned size;
558};
559
560static inline bool trace_parser_loaded(struct trace_parser *parser)
561{
562 return (parser->idx != 0);
563}
564
565static inline bool trace_parser_cont(struct trace_parser *parser)
566{
567 return parser->cont;
568}
569
570static inline void trace_parser_clear(struct trace_parser *parser)
571{
572 parser->cont = false;
573 parser->idx = 0;
574}
575
576extern int trace_parser_get_init(struct trace_parser *parser, int size);
577extern void trace_parser_put(struct trace_parser *parser);
578extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
579 size_t cnt, loff_t *ppos);
580
581/*
641 * trace_iterator_flags is an enumeration that defines bit 582 * trace_iterator_flags is an enumeration that defines bit
642 * positions into trace_flags that controls the output. 583 * positions into trace_flags that controls the output.
643 * 584 *
@@ -772,7 +713,6 @@ struct event_filter {
772 int n_preds; 713 int n_preds;
773 struct filter_pred **preds; 714 struct filter_pred **preds;
774 char *filter_string; 715 char *filter_string;
775 bool no_reset;
776}; 716};
777 717
778struct event_subsystem { 718struct event_subsystem {
@@ -784,22 +724,40 @@ struct event_subsystem {
784}; 724};
785 725
786struct filter_pred; 726struct filter_pred;
727struct regex;
787 728
788typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 729typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
789 int val1, int val2); 730 int val1, int val2);
790 731
791struct filter_pred { 732typedef int (*regex_match_func)(char *str, struct regex *r, int len);
792 filter_pred_fn_t fn; 733
793 u64 val; 734enum regex_type {
794 char str_val[MAX_FILTER_STR_VAL]; 735 MATCH_FULL = 0,
795 int str_len; 736 MATCH_FRONT_ONLY,
796 char *field_name; 737 MATCH_MIDDLE_ONLY,
797 int offset; 738 MATCH_END_ONLY,
798 int not; 739};
799 int op; 740
800 int pop_n; 741struct regex {
742 char pattern[MAX_FILTER_STR_VAL];
743 int len;
744 int field_len;
745 regex_match_func match;
801}; 746};
802 747
748struct filter_pred {
749 filter_pred_fn_t fn;
750 u64 val;
751 struct regex regex;
752 char *field_name;
753 int offset;
754 int not;
755 int op;
756 int pop_n;
757};
758
759extern enum regex_type
760filter_parse_regex(char *buff, int len, char **search, int *not);
803extern void print_event_filter(struct ftrace_event_call *call, 761extern void print_event_filter(struct ftrace_event_call *call,
804 struct trace_seq *s); 762 struct trace_seq *s);
805extern int apply_event_filter(struct ftrace_event_call *call, 763extern int apply_event_filter(struct ftrace_event_call *call,
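The new filter matching is glob-style rather than full regular expressions: filter_parse_regex() looks at a leading '!' and at leading/trailing '*' characters, classifies the pattern into one of the MATCH_* types, and hands back the bare search string. A hedged sketch of a caller (classify_filter_token is an illustrative name, and the glob-to-MATCH_* mapping in the comment reflects the usual ftrace filter behaviour rather than anything stated in this hunk):

	/* Illustrative: classify one user-supplied filter token. */
	static void classify_filter_token(char *buf, int len)
	{
		char *search;
		int not;
		enum regex_type type;

		type = filter_parse_regex(buf, len, &search, &not);
		/*
		 * Assumed mapping (usual ftrace glob rules):
		 *   "func"    -> MATCH_FULL          exact match
		 *   "func*"   -> MATCH_FRONT_ONLY    match at the start
		 *   "*func"   -> MATCH_END_ONLY      match at the end
		 *   "*func*"  -> MATCH_MIDDLE_ONLY   match anywhere
		 * A leading '!' sets 'not' to request a negated predicate, and
		 * 'search' points at the pattern with the glob characters stripped.
		 */
		(void)type;
	}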
@@ -815,7 +773,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
815 struct ring_buffer *buffer, 773 struct ring_buffer *buffer,
816 struct ring_buffer_event *event) 774 struct ring_buffer_event *event)
817{ 775{
818 if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { 776 if (unlikely(call->filter_active) &&
777 !filter_match_preds(call->filter, rec)) {
819 ring_buffer_discard_commit(buffer, event); 778 ring_buffer_discard_commit(buffer, event);
820 return 1; 779 return 1;
821 } 780 }
@@ -823,58 +782,18 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
823 return 0; 782 return 0;
824} 783}
825 784
826#define DEFINE_COMPARISON_PRED(type) \
827static int filter_pred_##type(struct filter_pred *pred, void *event, \
828 int val1, int val2) \
829{ \
830 type *addr = (type *)(event + pred->offset); \
831 type val = (type)pred->val; \
832 int match = 0; \
833 \
834 switch (pred->op) { \
835 case OP_LT: \
836 match = (*addr < val); \
837 break; \
838 case OP_LE: \
839 match = (*addr <= val); \
840 break; \
841 case OP_GT: \
842 match = (*addr > val); \
843 break; \
844 case OP_GE: \
845 match = (*addr >= val); \
846 break; \
847 default: \
848 break; \
849 } \
850 \
851 return match; \
852}
853
854#define DEFINE_EQUALITY_PRED(size) \
855static int filter_pred_##size(struct filter_pred *pred, void *event, \
856 int val1, int val2) \
857{ \
858 u##size *addr = (u##size *)(event + pred->offset); \
859 u##size val = (u##size)pred->val; \
860 int match; \
861 \
862 match = (val == *addr) ^ pred->not; \
863 \
864 return match; \
865}
866
867extern struct mutex event_mutex; 785extern struct mutex event_mutex;
868extern struct list_head ftrace_events; 786extern struct list_head ftrace_events;
869 787
870extern const char *__start___trace_bprintk_fmt[]; 788extern const char *__start___trace_bprintk_fmt[];
871extern const char *__stop___trace_bprintk_fmt[]; 789extern const char *__stop___trace_bprintk_fmt[];
872 790
873#undef TRACE_EVENT_FORMAT 791#undef FTRACE_ENTRY
874#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 792#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
875 extern struct ftrace_event_call event_##call; 793 extern struct ftrace_event_call event_##call;
876#undef TRACE_EVENT_FORMAT_NOFILTER 794#undef FTRACE_ENTRY_DUP
877#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) 795#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
878#include "trace_event_types.h" 796 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
797#include "trace_entries.h"
879 798
880#endif /* _LINUX_KERNEL_TRACE_H */ 799#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 19bfc75d467e..c21d5f3956ad 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -129,6 +129,7 @@ struct tracer boot_tracer __read_mostly =
129 129
130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) 130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
131{ 131{
132 struct ftrace_event_call *call = &event_boot_call;
132 struct ring_buffer_event *event; 133 struct ring_buffer_event *event;
133 struct ring_buffer *buffer; 134 struct ring_buffer *buffer;
134 struct trace_boot_call *entry; 135 struct trace_boot_call *entry;
@@ -150,13 +151,15 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
150 goto out; 151 goto out;
151 entry = ring_buffer_event_data(event); 152 entry = ring_buffer_event_data(event);
152 entry->boot_call = *bt; 153 entry->boot_call = *bt;
153 trace_buffer_unlock_commit(buffer, event, 0, 0); 154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
154 out: 156 out:
155 preempt_enable(); 157 preempt_enable();
156} 158}
157 159
158void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) 160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
159{ 161{
162 struct ftrace_event_call *call = &event_boot_ret;
160 struct ring_buffer_event *event; 163 struct ring_buffer_event *event;
161 struct ring_buffer *buffer; 164 struct ring_buffer *buffer;
162 struct trace_boot_ret *entry; 165 struct trace_boot_ret *entry;
@@ -175,7 +178,8 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
175 goto out; 178 goto out;
176 entry = ring_buffer_event_data(event); 179 entry = ring_buffer_event_data(event);
177 entry->boot_ret = *bt; 180 entry->boot_ret = *bt;
178 trace_buffer_unlock_commit(buffer, event, 0, 0); 181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
179 out: 183 out:
180 preempt_enable(); 184 preempt_enable();
181} 185}
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 7a7a9fd249a9..4a194f08f88c 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -34,6 +34,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
34 struct trace_array *tr = branch_tracer; 34 struct trace_array *tr = branch_tracer;
35 struct ring_buffer_event *event; 35 struct ring_buffer_event *event;
36 struct trace_branch *entry; 36 struct trace_branch *entry;
37 struct ring_buffer *buffer;
37 unsigned long flags; 38 unsigned long flags;
38 int cpu, pc; 39 int cpu, pc;
39 const char *p; 40 const char *p;
@@ -54,7 +55,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
54 goto out; 55 goto out;
55 56
56 pc = preempt_count(); 57 pc = preempt_count();
57 event = trace_buffer_lock_reserve(tr, TRACE_BRANCH, 58 buffer = tr->buffer;
59 event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
58 sizeof(*entry), flags, pc); 60 sizeof(*entry), flags, pc);
59 if (!event) 61 if (!event)
60 goto out; 62 goto out;
@@ -74,8 +76,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
74 entry->line = f->line; 76 entry->line = f->line;
75 entry->correct = val == expect; 77 entry->correct = val == expect;
76 78
77 if (!filter_check_discard(call, entry, tr->buffer, event)) 79 if (!filter_check_discard(call, entry, buffer, event))
78 ring_buffer_unlock_commit(tr->buffer, event); 80 ring_buffer_unlock_commit(buffer, event);
79 81
80 out: 82 out:
81 atomic_dec(&tr->data[cpu]->disabled); 83 atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index b588fd81f7f9..878c03f386ba 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -20,6 +20,8 @@
20#include <linux/ktime.h> 20#include <linux/ktime.h>
21#include <linux/trace_clock.h> 21#include <linux/trace_clock.h>
22 22
23#include "trace.h"
24
23/* 25/*
24 * trace_clock_local(): the simplest and least coherent tracing clock. 26 * trace_clock_local(): the simplest and least coherent tracing clock.
25 * 27 *
@@ -28,17 +30,17 @@
28 */ 30 */
29u64 notrace trace_clock_local(void) 31u64 notrace trace_clock_local(void)
30{ 32{
31 unsigned long flags;
32 u64 clock; 33 u64 clock;
34 int resched;
33 35
34 /* 36 /*
35 * sched_clock() is an architecture implemented, fast, scalable, 37 * sched_clock() is an architecture implemented, fast, scalable,
36 * lockless clock. It is not guaranteed to be coherent across 38 * lockless clock. It is not guaranteed to be coherent across
37 * CPUs, nor across CPU idle events. 39 * CPUs, nor across CPU idle events.
38 */ 40 */
39 raw_local_irq_save(flags); 41 resched = ftrace_preempt_disable();
40 clock = sched_clock(); 42 clock = sched_clock();
41 raw_local_irq_restore(flags); 43 ftrace_preempt_enable(resched);
42 44
43 return clock; 45 return clock;
44} 46}
@@ -66,10 +68,14 @@ u64 notrace trace_clock(void)
66 * Used by plugins that need globally coherent timestamps. 68 * Used by plugins that need globally coherent timestamps.
67 */ 69 */
68 70
69static u64 prev_trace_clock_time; 71/* keep prev_time and lock in the same cacheline. */
70 72static struct {
71static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = 73 u64 prev_time;
72 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 74 raw_spinlock_t lock;
75} trace_clock_struct ____cacheline_aligned_in_smp =
76 {
77 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED,
78 };
73 79
74u64 notrace trace_clock_global(void) 80u64 notrace trace_clock_global(void)
75{ 81{
@@ -88,19 +94,19 @@ u64 notrace trace_clock_global(void)
88 if (unlikely(in_nmi())) 94 if (unlikely(in_nmi()))
89 goto out; 95 goto out;
90 96
91 __raw_spin_lock(&trace_clock_lock); 97 __raw_spin_lock(&trace_clock_struct.lock);
92 98
93 /* 99 /*
94 * TODO: if this happens often then maybe we should reset 100 * TODO: if this happens often then maybe we should reset
95 * my_scd->clock to prev_trace_clock_time+1, to make sure 101 * my_scd->clock to prev_time+1, to make sure
96 * we start ticking with the local clock from now on? 102 * we start ticking with the local clock from now on?
97 */ 103 */
98 if ((s64)(now - prev_trace_clock_time) < 0) 104 if ((s64)(now - trace_clock_struct.prev_time) < 0)
99 now = prev_trace_clock_time + 1; 105 now = trace_clock_struct.prev_time + 1;
100 106
101 prev_trace_clock_time = now; 107 trace_clock_struct.prev_time = now;
102 108
103 __raw_spin_unlock(&trace_clock_lock); 109 __raw_spin_unlock(&trace_clock_struct.lock);
104 110
105 out: 111 out:
106 raw_local_irq_restore(flags); 112 raw_local_irq_restore(flags);
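Packing prev_time and its lock into one cacheline-aligned struct keeps the hot data together, but the monotonicity guarantee still comes from the signed-difference test: casting the unsigned delta to s64 makes "now is behind prev_time" show up as a negative value, and the comparison stays correct even if the u64 counters wrap. A small illustration of the idiom in isolation (clock_monotonize is a hypothetical name; callers are expected to serialize, as trace_clock_global() does with trace_clock_struct.lock):

	#include <linux/types.h>

	/* Return a timestamp that never moves backwards relative to *prev. */
	static u64 clock_monotonize(u64 now, u64 *prev)
	{
		/* True when 'now' is behind *prev, even across u64 wraparound. */
		if ((s64)(now - *prev) < 0)
			now = *prev + 1;

		*prev = now;
		return now;
	}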
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
new file mode 100644
index 000000000000..c16a08f399df
--- /dev/null
+++ b/kernel/trace/trace_entries.h
@@ -0,0 +1,382 @@
1/*
2 * This file defines the trace event structures that go into the ring
3 * buffer directly. They are created via macros so that changes for them
4 * appear in the format file. Using macros will automate this process.
5 *
6 * The macro used to create a ftrace data structure is:
7 *
8 * FTRACE_ENTRY( name, struct_name, id, structure, print )
9 *
10 * @name: the name used as the event name, as well as the name of
11 * the directory that holds the format file.
12 *
13 * @struct_name: the name of the structure that is created.
14 *
15 * @id: The event identifier that is used to detect what event
16 * this is from the ring buffer.
17 *
18 * @structure: the structure layout
19 *
20 * - __field( type, item )
21 * This is equivalent to declaring
22 * type item;
23 * in the structure.
24 * - __array( type, item, size )
25 * This is equivalent to declaring
26 * type item[size];
27 * in the structure.
28 *
29 * * for structures within structures, the format of the internal
30 * structure is laid out. This allows the internal structure
31 * to be deciphered for the format file. Although these macros
32 * may become out of sync with the internal structure, they
33 * will create a compile error if it happens. Since the
34 * internal structures are just tracing helpers, this is not
35 * an issue.
36 *
37 * When an internal structure is used, it should use:
38 *
39 * __field_struct( type, item )
40 *
41 * instead of __field. This will prevent it from being shown in
42 * the output file. The fields in the structure should use:
43 *
44 * __field_desc( type, container, item )
45 * __array_desc( type, container, item, len )
46 *
47 * type, item and len are the same as __field and __array, but
48 * container is added. This is the name of the item in
49 * __field_struct that this is describing.
50 *
51 *
52 * @print: the print format shown to users in the format file.
53 */
54
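To make the macro layer described above concrete: when this file is included from kernel/trace/trace.h, the FTRACE_ENTRY()/F_STRUCT()/__field() definitions there expand each entry into a plain C structure that always begins with struct trace_entry. For the first entry below, FTRACE_ENTRY(function, ftrace_entry, TRACE_FN, ...), the generated structure is equivalent to the hand-written sketch that follows (spelled out here only for illustration; it matches the open-coded struct ftrace_entry this series removes from trace.h):

	/* Result of expanding FTRACE_ENTRY(function, ftrace_entry, ...) via trace.h: */
	struct ftrace_entry {
		struct trace_entry	ent;		/* common header: type, flags, preempt_count, pid, lock_depth */
		unsigned long		ip;		/* address of the traced function */
		unsigned long		parent_ip;	/* address of its caller */
	};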
55/*
56 * Function trace entry - function address and parent function address:
57 */
58FTRACE_ENTRY(function, ftrace_entry,
59
60 TRACE_FN,
61
62 F_STRUCT(
63 __field( unsigned long, ip )
64 __field( unsigned long, parent_ip )
65 ),
66
67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
68);
69
70/* Function call entry */
71FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
72
73 TRACE_GRAPH_ENT,
74
75 F_STRUCT(
76 __field_struct( struct ftrace_graph_ent, graph_ent )
77 __field_desc( unsigned long, graph_ent, func )
78 __field_desc( int, graph_ent, depth )
79 ),
80
81 F_printk("--> %lx (%d)", __entry->func, __entry->depth)
82);
83
84/* Function return entry */
85FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
86
87 TRACE_GRAPH_RET,
88
89 F_STRUCT(
90 __field_struct( struct ftrace_graph_ret, ret )
91 __field_desc( unsigned long, ret, func )
92 __field_desc( unsigned long long, ret, calltime)
93 __field_desc( unsigned long long, ret, rettime )
94 __field_desc( unsigned long, ret, overrun )
95 __field_desc( int, ret, depth )
96 ),
97
98 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
99 __entry->func, __entry->depth,
100 __entry->calltime, __entry->rettime,
101 __entry->depth)
102);
103
104/*
105 * Context switch trace entry - which task (and prio) we switched from/to:
106 *
107 * This is used for both wakeup and context switches. We only want
108 * to create one structure, but we need two outputs for it.
109 */
110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \
112 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \
117 __field( unsigned int, next_cpu )
118
119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120
121 TRACE_CTX,
122
123 F_STRUCT(
124 FTRACE_CTX_FIELDS
125 ),
126
127 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
128 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
129 __entry->next_pid, __entry->next_prio, __entry->next_state,
130 __entry->next_cpu
131 )
132);
133
134/*
135 * FTRACE_ENTRY_DUP only creates the format file, it will not
136 * create another structure.
137 */
138FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
139
140 TRACE_WAKE,
141
142 F_STRUCT(
143 FTRACE_CTX_FIELDS
144 ),
145
146 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
147 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
148 __entry->next_pid, __entry->next_prio, __entry->next_state,
149 __entry->next_cpu
150 )
151);
152
153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry:
172 */
173
174#define FTRACE_STACK_ENTRIES 8
175
176FTRACE_ENTRY(kernel_stack, stack_entry,
177
178 TRACE_STACK,
179
180 F_STRUCT(
181 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
182 ),
183
184 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
185 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
186 __entry->caller[0], __entry->caller[1], __entry->caller[2],
187 __entry->caller[3], __entry->caller[4], __entry->caller[5],
188 __entry->caller[6], __entry->caller[7])
189);
190
191FTRACE_ENTRY(user_stack, userstack_entry,
192
193 TRACE_USER_STACK,
194
195 F_STRUCT(
196 __field( unsigned int, tgid )
197 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
198 ),
199
200 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
201 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
202 __entry->caller[0], __entry->caller[1], __entry->caller[2],
203 __entry->caller[3], __entry->caller[4], __entry->caller[5],
204 __entry->caller[6], __entry->caller[7])
205);
206
207/*
208 * trace_printk entry:
209 */
210FTRACE_ENTRY(bprint, bprint_entry,
211
212 TRACE_BPRINT,
213
214 F_STRUCT(
215 __field( unsigned long, ip )
216 __field( const char *, fmt )
217 __dynamic_array( u32, buf )
218 ),
219
220 F_printk("%08lx fmt:%p",
221 __entry->ip, __entry->fmt)
222);
223
224FTRACE_ENTRY(print, print_entry,
225
226 TRACE_PRINT,
227
228 F_STRUCT(
229 __field( unsigned long, ip )
230 __dynamic_array( char, buf )
231 ),
232
233 F_printk("%08lx %s",
234 __entry->ip, __entry->buf)
235);
236
237FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
238
239 TRACE_MMIO_RW,
240
241 F_STRUCT(
242 __field_struct( struct mmiotrace_rw, rw )
243 __field_desc( resource_size_t, rw, phys )
244 __field_desc( unsigned long, rw, value )
245 __field_desc( unsigned long, rw, pc )
246 __field_desc( int, rw, map_id )
247 __field_desc( unsigned char, rw, opcode )
248 __field_desc( unsigned char, rw, width )
249 ),
250
251 F_printk("%lx %lx %lx %d %x %x",
252 (unsigned long)__entry->phys, __entry->value, __entry->pc,
253 __entry->map_id, __entry->opcode, __entry->width)
254);
255
256FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
257
258 TRACE_MMIO_MAP,
259
260 F_STRUCT(
261 __field_struct( struct mmiotrace_map, map )
262 __field_desc( resource_size_t, map, phys )
263 __field_desc( unsigned long, map, virt )
264 __field_desc( unsigned long, map, len )
265 __field_desc( int, map, map_id )
266 __field_desc( unsigned char, map, opcode )
267 ),
268
269 F_printk("%lx %lx %lx %d %x",
270 (unsigned long)__entry->phys, __entry->virt, __entry->len,
271 __entry->map_id, __entry->opcode)
272);
273
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301
302#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20
304
305FTRACE_ENTRY(branch, trace_branch,
306
307 TRACE_BRANCH,
308
309 F_STRUCT(
310 __field( unsigned int, line )
311 __array( char, func, TRACE_FUNC_SIZE+1 )
312 __array( char, file, TRACE_FILE_SIZE+1 )
313 __field( char, correct )
314 ),
315
316 F_printk("%u:%s:%s (%u)",
317 __entry->line,
318 __entry->func, __entry->file, __entry->correct)
319);
320
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
334
335 TRACE_KMEM_ALLOC,
336
337 F_STRUCT(
338 __field( enum kmemtrace_type_id, type_id )
339 __field( unsigned long, call_site )
340 __field( const void *, ptr )
341 __field( size_t, bytes_req )
342 __field( size_t, bytes_alloc )
343 __field( gfp_t, gfp_flags )
344 __field( int, node )
345 ),
346
347 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
348 " flags:%x node:%d",
349 __entry->type_id, __entry->call_site, __entry->ptr,
350 __entry->bytes_req, __entry->bytes_alloc,
351 __entry->gfp_flags, __entry->node)
352);
353
354FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
355
356 TRACE_KMEM_FREE,
357
358 F_STRUCT(
359 __field( enum kmemtrace_type_id, type_id )
360 __field( unsigned long, call_site )
361 __field( const void *, ptr )
362 ),
363
364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr)
366);
367
368FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
369
370 TRACE_KSYM,
371
372 F_STRUCT(
373 __field( unsigned long, ip )
374 __field( unsigned char, type )
375 __array( char , cmd, TASK_COMM_LEN )
376 __field( unsigned long, addr )
377 ),
378
379 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
380 (void *)__entry->ip, (unsigned int)__entry->type,
381 (void *)__entry->addr, __entry->cmd)
382);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 11ba5bb4ed0a..d9c60f80aa0d 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -5,8 +5,62 @@
5 * 5 *
6 */ 6 */
7 7
8#include <linux/module.h>
8#include "trace.h" 9#include "trace.h"
9 10
11
12char *perf_trace_buf;
13EXPORT_SYMBOL_GPL(perf_trace_buf);
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
17
18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t;
19
20/* Count the events in use (per event id, not per instance) */
21static int total_profile_count;
22
23static int ftrace_profile_enable_event(struct ftrace_event_call *event)
24{
25 char *buf;
26 int ret = -ENOMEM;
27
28 if (atomic_inc_return(&event->profile_count))
29 return 0;
30
31 if (!total_profile_count) {
32 buf = (char *)alloc_percpu(perf_trace_t);
33 if (!buf)
34 goto fail_buf;
35
36 rcu_assign_pointer(perf_trace_buf, buf);
37
38 buf = (char *)alloc_percpu(perf_trace_t);
39 if (!buf)
40 goto fail_buf_nmi;
41
42 rcu_assign_pointer(perf_trace_buf_nmi, buf);
43 }
44
45 ret = event->profile_enable(event);
46 if (!ret) {
47 total_profile_count++;
48 return 0;
49 }
50
51fail_buf_nmi:
52 if (!total_profile_count) {
53 free_percpu(perf_trace_buf_nmi);
54 free_percpu(perf_trace_buf);
55 perf_trace_buf_nmi = NULL;
56 perf_trace_buf = NULL;
57 }
58fail_buf:
59 atomic_dec(&event->profile_count);
60
61 return ret;
62}
63
10int ftrace_profile_enable(int event_id) 64int ftrace_profile_enable(int event_id)
11{ 65{
12 struct ftrace_event_call *event; 66 struct ftrace_event_call *event;
@@ -14,8 +68,9 @@ int ftrace_profile_enable(int event_id)
14 68
15 mutex_lock(&event_mutex); 69 mutex_lock(&event_mutex);
16 list_for_each_entry(event, &ftrace_events, list) { 70 list_for_each_entry(event, &ftrace_events, list) {
17 if (event->id == event_id && event->profile_enable) { 71 if (event->id == event_id && event->profile_enable &&
18 ret = event->profile_enable(event); 72 try_module_get(event->mod)) {
73 ret = ftrace_profile_enable_event(event);
19 break; 74 break;
20 } 75 }
21 } 76 }
@@ -24,6 +79,33 @@ int ftrace_profile_enable(int event_id)
24 return ret; 79 return ret;
25} 80}
26 81
82static void ftrace_profile_disable_event(struct ftrace_event_call *event)
83{
84 char *buf, *nmi_buf;
85
86 if (!atomic_add_negative(-1, &event->profile_count))
87 return;
88
89 event->profile_disable(event);
90
91 if (!--total_profile_count) {
92 buf = perf_trace_buf;
93 rcu_assign_pointer(perf_trace_buf, NULL);
94
95 nmi_buf = perf_trace_buf_nmi;
96 rcu_assign_pointer(perf_trace_buf_nmi, NULL);
97
98 /*
 99 * Ensure all events being profiled have finished before
100 * releasing the buffers
101 */
102 synchronize_sched();
103
104 free_percpu(buf);
105 free_percpu(nmi_buf);
106 }
107}
108
27void ftrace_profile_disable(int event_id) 109void ftrace_profile_disable(int event_id)
28{ 110{
29 struct ftrace_event_call *event; 111 struct ftrace_event_call *event;
@@ -31,7 +113,8 @@ void ftrace_profile_disable(int event_id)
31 mutex_lock(&event_mutex); 113 mutex_lock(&event_mutex);
32 list_for_each_entry(event, &ftrace_events, list) { 114 list_for_each_entry(event, &ftrace_events, list) {
33 if (event->id == event_id) { 115 if (event->id == event_id) {
34 event->profile_disable(event); 116 ftrace_profile_disable_event(event);
117 module_put(event->mod);
35 break; 118 break;
36 } 119 }
37 } 120 }
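
The enable/disable pair above follows a common first-user-allocates / last-user-frees pattern for RCU-published per-cpu buffers. A condensed sketch of that pattern, with hypothetical demo_* names and the caller assumed to hold a mutex, as the event code above does:

static char *demo_buf;			/* hypothetical published pointer */
static int demo_users;			/* serialized by the caller's mutex */

static int demo_get(void)
{
	char *buf;

	if (!demo_users) {
		buf = (char *)alloc_percpu(char [256]);
		if (!buf)
			return -ENOMEM;
		/* publish before anyone can dereference it */
		rcu_assign_pointer(demo_buf, buf);
	}
	demo_users++;
	return 0;
}

static void demo_put(void)
{
	char *buf;

	if (--demo_users)
		return;

	buf = demo_buf;
	rcu_assign_pointer(demo_buf, NULL);
	/* wait for all preempt-disabled users before freeing */
	synchronize_sched();
	free_percpu(buf);
}
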
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
deleted file mode 100644
index 6db005e12487..000000000000
--- a/kernel/trace/trace_event_types.h
+++ /dev/null
@@ -1,178 +0,0 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ftrace
3
4/*
5 * We cheat and use the proto type field as the ID
6 * and args as the entry type (minus 'struct')
7 */
8TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
9 TRACE_STRUCT(
10 TRACE_FIELD(unsigned long, ip, ip)
11 TRACE_FIELD(unsigned long, parent_ip, parent_ip)
12 ),
13 TP_RAW_FMT(" %lx <-- %lx")
14);
15
16TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
17 ftrace_graph_ent_entry, ignore,
18 TRACE_STRUCT(
19 TRACE_FIELD(unsigned long, graph_ent.func, func)
20 TRACE_FIELD(int, graph_ent.depth, depth)
21 ),
22 TP_RAW_FMT("--> %lx (%d)")
23);
24
25TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(unsigned long long, ret.calltime, calltime)
30 TRACE_FIELD(unsigned long long, ret.rettime, rettime)
31 TRACE_FIELD(unsigned long, ret.overrun, overrun)
32 TRACE_FIELD(int, ret.depth, depth)
33 ),
34 TP_RAW_FMT("<-- %lx (%d)")
35);
36
37TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
38 TRACE_STRUCT(
39 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
40 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
41 TRACE_FIELD(unsigned char, prev_state, prev_state)
42 TRACE_FIELD(unsigned int, next_pid, next_pid)
43 TRACE_FIELD(unsigned char, next_prio, next_prio)
44 TRACE_FIELD(unsigned char, next_state, next_state)
45 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
46 ),
47 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
48);
49
50TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
51 TRACE_STRUCT(
52 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
53 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
54 TRACE_FIELD(unsigned char, prev_state, prev_state)
55 TRACE_FIELD(unsigned int, next_pid, next_pid)
56 TRACE_FIELD(unsigned char, next_prio, next_prio)
57 TRACE_FIELD(unsigned char, next_state, next_state)
58 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
59 ),
60 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
61);
62
63TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
64 TRACE_STRUCT(
65 TRACE_FIELD(unsigned long, arg1, arg1)
66 TRACE_FIELD(unsigned long, arg2, arg2)
67 TRACE_FIELD(unsigned long, arg3, arg3)
68 ),
69 TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
70);
71
72/*
73 * Stack-trace entry:
74 */
75
76/* #define FTRACE_STACK_ENTRIES 8 */
77
78TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
79 TRACE_STRUCT(
80 TRACE_FIELD(unsigned long, caller[0], stack0)
81 TRACE_FIELD(unsigned long, caller[1], stack1)
82 TRACE_FIELD(unsigned long, caller[2], stack2)
83 TRACE_FIELD(unsigned long, caller[3], stack3)
84 TRACE_FIELD(unsigned long, caller[4], stack4)
85 TRACE_FIELD(unsigned long, caller[5], stack5)
86 TRACE_FIELD(unsigned long, caller[6], stack6)
87 TRACE_FIELD(unsigned long, caller[7], stack7)
88 ),
89 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
90 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
91);
92
93TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
94 TRACE_STRUCT(
95 TRACE_FIELD(unsigned long, caller[0], stack0)
96 TRACE_FIELD(unsigned long, caller[1], stack1)
97 TRACE_FIELD(unsigned long, caller[2], stack2)
98 TRACE_FIELD(unsigned long, caller[3], stack3)
99 TRACE_FIELD(unsigned long, caller[4], stack4)
100 TRACE_FIELD(unsigned long, caller[5], stack5)
101 TRACE_FIELD(unsigned long, caller[6], stack6)
102 TRACE_FIELD(unsigned long, caller[7], stack7)
103 ),
104 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
105 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
106);
107
108TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
109 TRACE_STRUCT(
110 TRACE_FIELD(unsigned long, ip, ip)
111 TRACE_FIELD(char *, fmt, fmt)
112 TRACE_FIELD_ZERO_CHAR(buf)
113 ),
114 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
115);
116
117TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
118 TRACE_STRUCT(
119 TRACE_FIELD(unsigned long, ip, ip)
120 TRACE_FIELD_ZERO_CHAR(buf)
121 ),
122 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
123);
124
125TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
126 TRACE_STRUCT(
127 TRACE_FIELD(unsigned int, line, line)
128 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
129 TRACE_FUNC_SIZE+1, func)
130 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
131 TRACE_FUNC_SIZE+1, file)
132 TRACE_FIELD(char, correct, correct)
133 ),
134 TP_RAW_FMT("%u:%s:%s (%u)")
135);
136
137TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
138 TRACE_STRUCT(
139 TRACE_FIELD(u64, from, from)
140 TRACE_FIELD(u64, to, to)
141 ),
142 TP_RAW_FMT("from: %llx to: %llx")
143);
144
145TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
146 TRACE_STRUCT(
147 TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
148 TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
149 TRACE_FIELD(int, state_data.type, type)
150 TRACE_FIELD(int, state_data.state, state)
151 ),
152 TP_RAW_FMT("%llx->%llx type:%u state:%u")
153);
154
155TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
156 TRACE_STRUCT(
157 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
158 TRACE_FIELD(unsigned long, call_site, call_site)
159 TRACE_FIELD(const void *, ptr, ptr)
160 TRACE_FIELD(size_t, bytes_req, bytes_req)
161 TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
162 TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
163 TRACE_FIELD(int, node, node)
164 ),
165 TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
166 " flags:%x node:%d")
167);
168
169TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
170 TRACE_STRUCT(
171 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
172 TRACE_FIELD(unsigned long, call_site, call_site)
173 TRACE_FIELD(const void *, ptr, ptr)
174 ),
175 TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
176);
177
178#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 78b1ed230177..1d18315dc836 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -21,6 +21,7 @@
21 21
22#include "trace_output.h" 22#include "trace_output.h"
23 23
24#undef TRACE_SYSTEM
24#define TRACE_SYSTEM "TRACE_SYSTEM" 25#define TRACE_SYSTEM "TRACE_SYSTEM"
25 26
26DEFINE_MUTEX(event_mutex); 27DEFINE_MUTEX(event_mutex);
@@ -86,15 +87,13 @@ int trace_define_common_fields(struct ftrace_event_call *call)
86 __common_field(unsigned char, flags); 87 __common_field(unsigned char, flags);
87 __common_field(unsigned char, preempt_count); 88 __common_field(unsigned char, preempt_count);
88 __common_field(int, pid); 89 __common_field(int, pid);
89 __common_field(int, tgid); 90 __common_field(int, lock_depth);
90 91
91 return ret; 92 return ret;
92} 93}
93EXPORT_SYMBOL_GPL(trace_define_common_fields); 94EXPORT_SYMBOL_GPL(trace_define_common_fields);
94 95
95#ifdef CONFIG_MODULES 96void trace_destroy_fields(struct ftrace_event_call *call)
96
97static void trace_destroy_fields(struct ftrace_event_call *call)
98{ 97{
99 struct ftrace_event_field *field, *next; 98 struct ftrace_event_field *field, *next;
100 99
@@ -106,8 +105,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
106 } 105 }
107} 106}
108 107
109#endif /* CONFIG_MODULES */
110
111static void ftrace_event_enable_disable(struct ftrace_event_call *call, 108static void ftrace_event_enable_disable(struct ftrace_event_call *call,
112 int enable) 109 int enable)
113{ 110{
@@ -116,14 +113,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
116 if (call->enabled) { 113 if (call->enabled) {
117 call->enabled = 0; 114 call->enabled = 0;
118 tracing_stop_cmdline_record(); 115 tracing_stop_cmdline_record();
119 call->unregfunc(call->data); 116 call->unregfunc(call);
120 } 117 }
121 break; 118 break;
122 case 1: 119 case 1:
123 if (!call->enabled) { 120 if (!call->enabled) {
124 call->enabled = 1; 121 call->enabled = 1;
125 tracing_start_cmdline_record(); 122 tracing_start_cmdline_record();
126 call->regfunc(call->data); 123 call->regfunc(call);
127 } 124 }
128 break; 125 break;
129 } 126 }
@@ -230,73 +227,38 @@ static ssize_t
230ftrace_event_write(struct file *file, const char __user *ubuf, 227ftrace_event_write(struct file *file, const char __user *ubuf,
231 size_t cnt, loff_t *ppos) 228 size_t cnt, loff_t *ppos)
232{ 229{
233 size_t read = 0; 230 struct trace_parser parser;
234 int i, set = 1; 231 ssize_t read, ret;
235 ssize_t ret;
236 char *buf;
237 char ch;
238 232
239 if (!cnt || cnt < 0) 233 if (!cnt)
240 return 0; 234 return 0;
241 235
242 ret = tracing_update_buffers(); 236 ret = tracing_update_buffers();
243 if (ret < 0) 237 if (ret < 0)
244 return ret; 238 return ret;
245 239
246 ret = get_user(ch, ubuf++); 240 if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
247 if (ret)
248 return ret;
249 read++;
250 cnt--;
251
252 /* skip white space */
253 while (cnt && isspace(ch)) {
254 ret = get_user(ch, ubuf++);
255 if (ret)
256 return ret;
257 read++;
258 cnt--;
259 }
260
261 /* Only white space found? */
262 if (isspace(ch)) {
263 file->f_pos += read;
264 ret = read;
265 return ret;
266 }
267
268 buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
269 if (!buf)
270 return -ENOMEM; 241 return -ENOMEM;
271 242
272 if (cnt > EVENT_BUF_SIZE) 243 read = trace_get_user(&parser, ubuf, cnt, ppos);
273 cnt = EVENT_BUF_SIZE; 244
245 if (read >= 0 && trace_parser_loaded((&parser))) {
246 int set = 1;
274 247
275 i = 0; 248 if (*parser.buffer == '!')
276 while (cnt && !isspace(ch)) {
277 if (!i && ch == '!')
278 set = 0; 249 set = 0;
279 else
280 buf[i++] = ch;
281 250
282 ret = get_user(ch, ubuf++); 251 parser.buffer[parser.idx] = 0;
252
253 ret = ftrace_set_clr_event(parser.buffer + !set, set);
283 if (ret) 254 if (ret)
284 goto out_free; 255 goto out_put;
285 read++;
286 cnt--;
287 } 256 }
288 buf[i] = 0;
289
290 file->f_pos += read;
291
292 ret = ftrace_set_clr_event(buf, set);
293 if (ret)
294 goto out_free;
295 257
296 ret = read; 258 ret = read;
297 259
298 out_free: 260 out_put:
299 kfree(buf); 261 trace_parser_put(&parser);
300 262
301 return ret; 263 return ret;
302} 264}
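
The rewritten ftrace_event_write() above delegates user-space copying and tokenizing to the trace_parser helpers. A minimal sketch of how a write handler uses them (demo_write() and demo_handle_token() are hypothetical; the buffer size is arbitrary):

static int demo_handle_token(const char *tok);	/* hypothetical consumer */

static ssize_t demo_write(struct file *file, const char __user *ubuf,
			  size_t cnt, loff_t *ppos)
{
	struct trace_parser parser;
	ssize_t read, ret = 0;

	if (trace_parser_get_init(&parser, 128 + 1))
		return -ENOMEM;

	read = trace_get_user(&parser, ubuf, cnt, ppos);

	if (read >= 0 && trace_parser_loaded(&parser)) {
		parser.buffer[parser.idx] = 0;	/* NUL-terminate the token */
		ret = demo_handle_token(parser.buffer);
	}

	trace_parser_put(&parser);

	return ret ? ret : read;
}
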
@@ -304,42 +266,32 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
304static void * 266static void *
305t_next(struct seq_file *m, void *v, loff_t *pos) 267t_next(struct seq_file *m, void *v, loff_t *pos)
306{ 268{
307 struct list_head *list = m->private; 269 struct ftrace_event_call *call = v;
308 struct ftrace_event_call *call;
309 270
310 (*pos)++; 271 (*pos)++;
311 272
312 for (;;) { 273 list_for_each_entry_continue(call, &ftrace_events, list) {
313 if (list == &ftrace_events)
314 return NULL;
315
316 call = list_entry(list, struct ftrace_event_call, list);
317
318 /* 274 /*
319 * The ftrace subsystem is for showing formats only. 275 * The ftrace subsystem is for showing formats only.
320 * They can not be enabled or disabled via the event files. 276 * They can not be enabled or disabled via the event files.
321 */ 277 */
322 if (call->regfunc) 278 if (call->regfunc)
323 break; 279 return call;
324
325 list = list->next;
326 } 280 }
327 281
328 m->private = list->next; 282 return NULL;
329
330 return call;
331} 283}
332 284
333static void *t_start(struct seq_file *m, loff_t *pos) 285static void *t_start(struct seq_file *m, loff_t *pos)
334{ 286{
335 struct ftrace_event_call *call = NULL; 287 struct ftrace_event_call *call;
336 loff_t l; 288 loff_t l;
337 289
338 mutex_lock(&event_mutex); 290 mutex_lock(&event_mutex);
339 291
340 m->private = ftrace_events.next; 292 call = list_entry(&ftrace_events, struct ftrace_event_call, list);
341 for (l = 0; l <= *pos; ) { 293 for (l = 0; l <= *pos; ) {
342 call = t_next(m, NULL, &l); 294 call = t_next(m, call, &l);
343 if (!call) 295 if (!call)
344 break; 296 break;
345 } 297 }
@@ -349,37 +301,28 @@ static void *t_start(struct seq_file *m, loff_t *pos)
349static void * 301static void *
350s_next(struct seq_file *m, void *v, loff_t *pos) 302s_next(struct seq_file *m, void *v, loff_t *pos)
351{ 303{
352 struct list_head *list = m->private; 304 struct ftrace_event_call *call = v;
353 struct ftrace_event_call *call;
354 305
355 (*pos)++; 306 (*pos)++;
356 307
357 retry: 308 list_for_each_entry_continue(call, &ftrace_events, list) {
358 if (list == &ftrace_events) 309 if (call->enabled)
359 return NULL; 310 return call;
360
361 call = list_entry(list, struct ftrace_event_call, list);
362
363 if (!call->enabled) {
364 list = list->next;
365 goto retry;
366 } 311 }
367 312
368 m->private = list->next; 313 return NULL;
369
370 return call;
371} 314}
372 315
373static void *s_start(struct seq_file *m, loff_t *pos) 316static void *s_start(struct seq_file *m, loff_t *pos)
374{ 317{
375 struct ftrace_event_call *call = NULL; 318 struct ftrace_event_call *call;
376 loff_t l; 319 loff_t l;
377 320
378 mutex_lock(&event_mutex); 321 mutex_lock(&event_mutex);
379 322
380 m->private = ftrace_events.next; 323 call = list_entry(&ftrace_events, struct ftrace_event_call, list);
381 for (l = 0; l <= *pos; ) { 324 for (l = 0; l <= *pos; ) {
382 call = s_next(m, NULL, &l); 325 call = s_next(m, call, &l);
383 if (!call) 326 if (!call)
384 break; 327 break;
385 } 328 }
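
The t_next()/s_next() rewrites above replace open-coded list walking with list_for_each_entry_continue(), priming the cursor at the list head so iteration starts at the first node. A compact sketch of that idiom with a hypothetical demo_item list:

struct demo_item {
	struct list_head list;
	/* ... payload ... */
};
static LIST_HEAD(demo_list);

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct demo_item *it = v;

	(*pos)++;
	list_for_each_entry_continue(it, &demo_list, list)
		return it;	/* next node, if any */

	return NULL;
}

static void *demo_start(struct seq_file *m, loff_t *pos)
{
	/* a head-based "entry" makes _continue begin at the first real node */
	struct demo_item *it = list_entry(&demo_list, struct demo_item, list);
	loff_t l;

	for (l = 0; l <= *pos; ) {
		it = demo_next(m, it, &l);
		if (!it)
			break;
	}
	return it;
}
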
@@ -560,7 +503,7 @@ extern char *__bad_type_size(void);
560#define FIELD(type, name) \ 503#define FIELD(type, name) \
561 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ 504 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
562 #type, "common_" #name, offsetof(typeof(field), name), \ 505 #type, "common_" #name, offsetof(typeof(field), name), \
563 sizeof(field.name) 506 sizeof(field.name), is_signed_type(type)
564 507
565static int trace_write_header(struct trace_seq *s) 508static int trace_write_header(struct trace_seq *s)
566{ 509{
@@ -568,17 +511,17 @@ static int trace_write_header(struct trace_seq *s)
568 511
569 /* struct trace_entry */ 512 /* struct trace_entry */
570 return trace_seq_printf(s, 513 return trace_seq_printf(s,
571 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 514 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
572 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 515 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
573 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 516 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
574 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 517 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
575 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 518 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
576 "\n", 519 "\n",
577 FIELD(unsigned short, type), 520 FIELD(unsigned short, type),
578 FIELD(unsigned char, flags), 521 FIELD(unsigned char, flags),
579 FIELD(unsigned char, preempt_count), 522 FIELD(unsigned char, preempt_count),
580 FIELD(int, pid), 523 FIELD(int, pid),
581 FIELD(int, tgid)); 524 FIELD(int, lock_depth));
582} 525}
583 526
584static ssize_t 527static ssize_t
@@ -931,9 +874,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
931 "'%s/filter' entry\n", name); 874 "'%s/filter' entry\n", name);
932 } 875 }
933 876
934 entry = trace_create_file("enable", 0644, system->entry, 877 trace_create_file("enable", 0644, system->entry,
935 (void *)system->name, 878 (void *)system->name,
936 &ftrace_system_enable_fops); 879 &ftrace_system_enable_fops);
937 880
938 return system->entry; 881 return system->entry;
939} 882}
@@ -945,7 +888,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
945 const struct file_operations *filter, 888 const struct file_operations *filter,
946 const struct file_operations *format) 889 const struct file_operations *format)
947{ 890{
948 struct dentry *entry;
949 int ret; 891 int ret;
950 892
951 /* 893 /*
@@ -963,12 +905,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
963 } 905 }
964 906
965 if (call->regfunc) 907 if (call->regfunc)
966 entry = trace_create_file("enable", 0644, call->dir, call, 908 trace_create_file("enable", 0644, call->dir, call,
967 enable); 909 enable);
968 910
969 if (call->id && call->profile_enable) 911 if (call->id && call->profile_enable)
970 entry = trace_create_file("id", 0444, call->dir, call, 912 trace_create_file("id", 0444, call->dir, call,
971 id); 913 id);
972 914
973 if (call->define_fields) { 915 if (call->define_fields) {
974 ret = call->define_fields(call); 916 ret = call->define_fields(call);
@@ -977,41 +919,60 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
977 " events/%s\n", call->name); 919 " events/%s\n", call->name);
978 return ret; 920 return ret;
979 } 921 }
980 entry = trace_create_file("filter", 0644, call->dir, call, 922 trace_create_file("filter", 0644, call->dir, call,
981 filter); 923 filter);
982 } 924 }
983 925
984 /* A trace may not want to export its format */ 926 /* A trace may not want to export its format */
985 if (!call->show_format) 927 if (!call->show_format)
986 return 0; 928 return 0;
987 929
988 entry = trace_create_file("format", 0444, call->dir, call, 930 trace_create_file("format", 0444, call->dir, call,
989 format); 931 format);
990 932
991 return 0; 933 return 0;
992} 934}
993 935
994#define for_each_event(event, start, end) \ 936static int __trace_add_event_call(struct ftrace_event_call *call)
995 for (event = start; \ 937{
996 (unsigned long)event < (unsigned long)end; \ 938 struct dentry *d_events;
997 event++) 939 int ret;
998 940
999#ifdef CONFIG_MODULES 941 if (!call->name)
942 return -EINVAL;
1000 943
1001static LIST_HEAD(ftrace_module_file_list); 944 if (call->raw_init) {
945 ret = call->raw_init(call);
946 if (ret < 0) {
947 if (ret != -ENOSYS)
948 pr_warning("Could not initialize trace "
949 "events/%s\n", call->name);
950 return ret;
951 }
952 }
1002 953
1003/* 954 d_events = event_trace_events_dir();
1004 * Modules must own their file_operations to keep up with 955 if (!d_events)
1005 * reference counting. 956 return -ENOENT;
1006 */ 957
1007struct ftrace_module_file_ops { 958 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1008 struct list_head list; 959 &ftrace_enable_fops, &ftrace_event_filter_fops,
1009 struct module *mod; 960 &ftrace_event_format_fops);
1010 struct file_operations id; 961 if (!ret)
1011 struct file_operations enable; 962 list_add(&call->list, &ftrace_events);
1012 struct file_operations format; 963
1013 struct file_operations filter; 964 return ret;
1014}; 965}
966
967/* Add an additional event_call dynamically */
968int trace_add_event_call(struct ftrace_event_call *call)
969{
970 int ret;
971 mutex_lock(&event_mutex);
972 ret = __trace_add_event_call(call);
973 mutex_unlock(&event_mutex);
974 return ret;
975}
1015 976
1016static void remove_subsystem_dir(const char *name) 977static void remove_subsystem_dir(const char *name)
1017{ 978{
@@ -1039,6 +1000,53 @@ static void remove_subsystem_dir(const char *name)
1039 } 1000 }
1040} 1001}
1041 1002
1003/*
1004 * Must be called under locking both of event_mutex and trace_event_mutex.
1005 */
1006static void __trace_remove_event_call(struct ftrace_event_call *call)
1007{
1008 ftrace_event_enable_disable(call, 0);
1009 if (call->event)
1010 __unregister_ftrace_event(call->event);
1011 debugfs_remove_recursive(call->dir);
1012 list_del(&call->list);
1013 trace_destroy_fields(call);
1014 destroy_preds(call);
1015 remove_subsystem_dir(call->system);
1016}
1017
1018/* Remove an event_call */
1019void trace_remove_event_call(struct ftrace_event_call *call)
1020{
1021 mutex_lock(&event_mutex);
1022 down_write(&trace_event_mutex);
1023 __trace_remove_event_call(call);
1024 up_write(&trace_event_mutex);
1025 mutex_unlock(&event_mutex);
1026}
1027
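
trace_add_event_call()/trace_remove_event_call() above give other subsystems a way to register event calls at run time (dynamically created events are the intended users). A hedged sketch of a caller, with every demo_* name hypothetical and most callbacks elided:

static struct ftrace_event_call demo_call = {
	.name	= "demo_event",
	.system	= "demo",
	/* .raw_init, .regfunc, .unregfunc, .define_fields, ... set elsewhere */
};

static int __init demo_init(void)
{
	/* takes event_mutex internally and creates the debugfs entries */
	return trace_add_event_call(&demo_call);
}

static void __exit demo_exit(void)
{
	trace_remove_event_call(&demo_call);
}
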
1028#define for_each_event(event, start, end) \
1029 for (event = start; \
1030 (unsigned long)event < (unsigned long)end; \
1031 event++)
1032
1033#ifdef CONFIG_MODULES
1034
1035static LIST_HEAD(ftrace_module_file_list);
1036
1037/*
1038 * Modules must own their file_operations to keep up with
1039 * reference counting.
1040 */
1041struct ftrace_module_file_ops {
1042 struct list_head list;
1043 struct module *mod;
1044 struct file_operations id;
1045 struct file_operations enable;
1046 struct file_operations format;
1047 struct file_operations filter;
1048};
1049
1042static struct ftrace_module_file_ops * 1050static struct ftrace_module_file_ops *
1043trace_create_file_ops(struct module *mod) 1051trace_create_file_ops(struct module *mod)
1044{ 1052{
@@ -1096,7 +1104,7 @@ static void trace_module_add_events(struct module *mod)
1096 if (!call->name) 1104 if (!call->name)
1097 continue; 1105 continue;
1098 if (call->raw_init) { 1106 if (call->raw_init) {
1099 ret = call->raw_init(); 1107 ret = call->raw_init(call);
1100 if (ret < 0) { 1108 if (ret < 0) {
1101 if (ret != -ENOSYS) 1109 if (ret != -ENOSYS)
1102 pr_warning("Could not initialize trace " 1110 pr_warning("Could not initialize trace "
@@ -1114,10 +1122,11 @@ static void trace_module_add_events(struct module *mod)
1114 return; 1122 return;
1115 } 1123 }
1116 call->mod = mod; 1124 call->mod = mod;
1117 list_add(&call->list, &ftrace_events); 1125 ret = event_create_dir(call, d_events,
1118 event_create_dir(call, d_events, 1126 &file_ops->id, &file_ops->enable,
1119 &file_ops->id, &file_ops->enable, 1127 &file_ops->filter, &file_ops->format);
1120 &file_ops->filter, &file_ops->format); 1128 if (!ret)
1129 list_add(&call->list, &ftrace_events);
1121 } 1130 }
1122} 1131}
1123 1132
@@ -1131,14 +1140,7 @@ static void trace_module_remove_events(struct module *mod)
1131 list_for_each_entry_safe(call, p, &ftrace_events, list) { 1140 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1132 if (call->mod == mod) { 1141 if (call->mod == mod) {
1133 found = true; 1142 found = true;
1134 ftrace_event_enable_disable(call, 0); 1143 __trace_remove_event_call(call);
1135 if (call->event)
1136 __unregister_ftrace_event(call->event);
1137 debugfs_remove_recursive(call->dir);
1138 list_del(&call->list);
1139 trace_destroy_fields(call);
1140 destroy_preds(call);
1141 remove_subsystem_dir(call->system);
1142 } 1144 }
1143 } 1145 }
1144 1146
@@ -1187,7 +1189,7 @@ static int trace_module_notify(struct notifier_block *self,
1187} 1189}
1188#endif /* CONFIG_MODULES */ 1190#endif /* CONFIG_MODULES */
1189 1191
1190struct notifier_block trace_module_nb = { 1192static struct notifier_block trace_module_nb = {
1191 .notifier_call = trace_module_notify, 1193 .notifier_call = trace_module_notify,
1192 .priority = 0, 1194 .priority = 0,
1193}; 1195};
@@ -1256,7 +1258,7 @@ static __init int event_trace_init(void)
1256 if (!call->name) 1258 if (!call->name)
1257 continue; 1259 continue;
1258 if (call->raw_init) { 1260 if (call->raw_init) {
1259 ret = call->raw_init(); 1261 ret = call->raw_init(call);
1260 if (ret < 0) { 1262 if (ret < 0) {
1261 if (ret != -ENOSYS) 1263 if (ret != -ENOSYS)
1262 pr_warning("Could not initialize trace " 1264 pr_warning("Could not initialize trace "
@@ -1264,10 +1266,12 @@ static __init int event_trace_init(void)
1264 continue; 1266 continue;
1265 } 1267 }
1266 } 1268 }
1267 list_add(&call->list, &ftrace_events); 1269 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1268 event_create_dir(call, d_events, &ftrace_event_id_fops, 1270 &ftrace_enable_fops,
1269 &ftrace_enable_fops, &ftrace_event_filter_fops, 1271 &ftrace_event_filter_fops,
1270 &ftrace_event_format_fops); 1272 &ftrace_event_format_fops);
1273 if (!ret)
1274 list_add(&call->list, &ftrace_events);
1271 } 1275 }
1272 1276
1273 while (true) { 1277 while (true) {
@@ -1359,6 +1363,18 @@ static __init void event_trace_self_tests(void)
1359 if (!call->regfunc) 1363 if (!call->regfunc)
1360 continue; 1364 continue;
1361 1365
1366/*
1367 * Testing syscall events here is of limited value, but
1368 * we still do it if configured. It is also time consuming;
1369 * what we really need is a user thread to perform the
1370 * syscalls as we test.
1371 */
1372#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1373 if (call->system &&
1374 strcmp(call->system, "syscalls") == 0)
1375 continue;
1376#endif
1377
1362 pr_info("Testing event %s: ", call->name); 1378 pr_info("Testing event %s: ", call->name);
1363 1379
1364 /* 1380 /*
@@ -1432,7 +1448,7 @@ static __init void event_trace_self_tests(void)
1432 1448
1433#ifdef CONFIG_FUNCTION_TRACER 1449#ifdef CONFIG_FUNCTION_TRACER
1434 1450
1435static DEFINE_PER_CPU(atomic_t, test_event_disable); 1451static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
1436 1452
1437static void 1453static void
1438function_test_events_call(unsigned long ip, unsigned long parent_ip) 1454function_test_events_call(unsigned long ip, unsigned long parent_ip)
@@ -1449,7 +1465,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1449 pc = preempt_count(); 1465 pc = preempt_count();
1450 resched = ftrace_preempt_disable(); 1466 resched = ftrace_preempt_disable();
1451 cpu = raw_smp_processor_id(); 1467 cpu = raw_smp_processor_id();
1452 disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); 1468 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1453 1469
1454 if (disabled != 1) 1470 if (disabled != 1)
1455 goto out; 1471 goto out;
@@ -1468,7 +1484,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1468 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); 1484 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
1469 1485
1470 out: 1486 out:
1471 atomic_dec(&per_cpu(test_event_disable, cpu)); 1487 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1472 ftrace_preempt_enable(resched); 1488 ftrace_preempt_enable(resched);
1473} 1489}
1474 1490
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 93660fbbf629..50504cb228de 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -18,11 +18,10 @@
18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> 18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
19 */ 19 */
20 20
21#include <linux/debugfs.h>
22#include <linux/uaccess.h>
23#include <linux/module.h> 21#include <linux/module.h>
24#include <linux/ctype.h> 22#include <linux/ctype.h>
25#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/perf_event.h>
26 25
27#include "trace.h" 26#include "trace.h"
28#include "trace_output.h" 27#include "trace_output.h"
@@ -31,6 +30,7 @@ enum filter_op_ids
31{ 30{
32 OP_OR, 31 OP_OR,
33 OP_AND, 32 OP_AND,
33 OP_GLOB,
34 OP_NE, 34 OP_NE,
35 OP_EQ, 35 OP_EQ,
36 OP_LT, 36 OP_LT,
@@ -48,16 +48,17 @@ struct filter_op {
48}; 48};
49 49
50static struct filter_op filter_ops[] = { 50static struct filter_op filter_ops[] = {
51 { OP_OR, "||", 1 }, 51 { OP_OR, "||", 1 },
52 { OP_AND, "&&", 2 }, 52 { OP_AND, "&&", 2 },
53 { OP_NE, "!=", 4 }, 53 { OP_GLOB, "~", 4 },
54 { OP_EQ, "==", 4 }, 54 { OP_NE, "!=", 4 },
55 { OP_LT, "<", 5 }, 55 { OP_EQ, "==", 4 },
56 { OP_LE, "<=", 5 }, 56 { OP_LT, "<", 5 },
57 { OP_GT, ">", 5 }, 57 { OP_LE, "<=", 5 },
58 { OP_GE, ">=", 5 }, 58 { OP_GT, ">", 5 },
59 { OP_NONE, "OP_NONE", 0 }, 59 { OP_GE, ">=", 5 },
60 { OP_OPEN_PAREN, "(", 0 }, 60 { OP_NONE, "OP_NONE", 0 },
61 { OP_OPEN_PAREN, "(", 0 },
61}; 62};
62 63
63enum { 64enum {
@@ -121,6 +122,47 @@ struct filter_parse_state {
121 } operand; 122 } operand;
122}; 123};
123 124
125#define DEFINE_COMPARISON_PRED(type) \
126static int filter_pred_##type(struct filter_pred *pred, void *event, \
127 int val1, int val2) \
128{ \
129 type *addr = (type *)(event + pred->offset); \
130 type val = (type)pred->val; \
131 int match = 0; \
132 \
133 switch (pred->op) { \
134 case OP_LT: \
135 match = (*addr < val); \
136 break; \
137 case OP_LE: \
138 match = (*addr <= val); \
139 break; \
140 case OP_GT: \
141 match = (*addr > val); \
142 break; \
143 case OP_GE: \
144 match = (*addr >= val); \
145 break; \
146 default: \
147 break; \
148 } \
149 \
150 return match; \
151}
152
153#define DEFINE_EQUALITY_PRED(size) \
154static int filter_pred_##size(struct filter_pred *pred, void *event, \
155 int val1, int val2) \
156{ \
157 u##size *addr = (u##size *)(event + pred->offset); \
158 u##size val = (u##size)pred->val; \
159 int match; \
160 \
161 match = (val == *addr) ^ pred->not; \
162 \
163 return match; \
164}
165
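
For reference, DEFINE_EQUALITY_PRED(64) above expands (hand-expanded here, so treat the exact text as illustrative) to:

static int filter_pred_64(struct filter_pred *pred, void *event,
			  int val1, int val2)
{
	u64 *addr = (u64 *)(event + pred->offset);
	u64 val = (u64)pred->val;
	int match;

	/* equality test, optionally inverted by the '!=' operator */
	match = (val == *addr) ^ pred->not;

	return match;
}
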
124DEFINE_COMPARISON_PRED(s64); 166DEFINE_COMPARISON_PRED(s64);
125DEFINE_COMPARISON_PRED(u64); 167DEFINE_COMPARISON_PRED(u64);
126DEFINE_COMPARISON_PRED(s32); 168DEFINE_COMPARISON_PRED(s32);
@@ -156,9 +198,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
156 char *addr = (char *)(event + pred->offset); 198 char *addr = (char *)(event + pred->offset);
157 int cmp, match; 199 int cmp, match;
158 200
159 cmp = strncmp(addr, pred->str_val, pred->str_len); 201 cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len);
160 202
161 match = (!cmp) ^ pred->not; 203 match = cmp ^ pred->not;
162 204
163 return match; 205 return match;
164} 206}
@@ -170,9 +212,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
170 char **addr = (char **)(event + pred->offset); 212 char **addr = (char **)(event + pred->offset);
171 int cmp, match; 213 int cmp, match;
172 214
173 cmp = strncmp(*addr, pred->str_val, pred->str_len); 215 cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len);
174 216
175 match = (!cmp) ^ pred->not; 217 match = cmp ^ pred->not;
176 218
177 return match; 219 return match;
178} 220}
@@ -196,9 +238,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
196 char *addr = (char *)(event + str_loc); 238 char *addr = (char *)(event + str_loc);
197 int cmp, match; 239 int cmp, match;
198 240
199 cmp = strncmp(addr, pred->str_val, str_len); 241 cmp = pred->regex.match(addr, &pred->regex, str_len);
200 242
201 match = (!cmp) ^ pred->not; 243 match = cmp ^ pred->not;
202 244
203 return match; 245 return match;
204} 246}
@@ -209,10 +251,121 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
209 return 0; 251 return 0;
210} 252}
211 253
254/* Basic regex callbacks */
255static int regex_match_full(char *str, struct regex *r, int len)
256{
257 if (strncmp(str, r->pattern, len) == 0)
258 return 1;
259 return 0;
260}
261
262static int regex_match_front(char *str, struct regex *r, int len)
263{
264 if (strncmp(str, r->pattern, len) == 0)
265 return 1;
266 return 0;
267}
268
269static int regex_match_middle(char *str, struct regex *r, int len)
270{
271 if (strstr(str, r->pattern))
272 return 1;
273 return 0;
274}
275
276static int regex_match_end(char *str, struct regex *r, int len)
277{
278 char *ptr = strstr(str, r->pattern);
279
280 if (ptr && (ptr[r->len] == 0))
281 return 1;
282 return 0;
283}
284
285/**
286 * filter_parse_regex - parse a basic regex
287 * @buff: the raw regex
288 * @len: length of the regex
289 * @search: will point to the beginning of the string to compare
290 * @not: tell whether the match will have to be inverted
291 *
 292 * The caller passes in a buffer containing a regex; this function
 293 * sets search to point to the search part of the buffer and
 294 * returns the type of search it is (see enum above).
 295 * Note that buff is modified in place.
296 *
297 * Returns enum type.
298 * search returns the pointer to use for comparison.
299 * not returns 1 if buff started with a '!'
300 * 0 otherwise.
301 */
302enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
303{
304 int type = MATCH_FULL;
305 int i;
306
307 if (buff[0] == '!') {
308 *not = 1;
309 buff++;
310 len--;
311 } else
312 *not = 0;
313
314 *search = buff;
315
316 for (i = 0; i < len; i++) {
317 if (buff[i] == '*') {
318 if (!i) {
319 *search = buff + 1;
320 type = MATCH_END_ONLY;
321 } else {
322 if (type == MATCH_END_ONLY)
323 type = MATCH_MIDDLE_ONLY;
324 else
325 type = MATCH_FRONT_ONLY;
326 buff[i] = 0;
327 break;
328 }
329 }
330 }
331
332 return type;
333}
334
335static void filter_build_regex(struct filter_pred *pred)
336{
337 struct regex *r = &pred->regex;
338 char *search;
339 enum regex_type type = MATCH_FULL;
340 int not = 0;
341
342 if (pred->op == OP_GLOB) {
343 type = filter_parse_regex(r->pattern, r->len, &search, &not);
344 r->len = strlen(search);
345 memmove(r->pattern, search, r->len+1);
346 }
347
348 switch (type) {
349 case MATCH_FULL:
350 r->match = regex_match_full;
351 break;
352 case MATCH_FRONT_ONLY:
353 r->match = regex_match_front;
354 break;
355 case MATCH_MIDDLE_ONLY:
356 r->match = regex_match_middle;
357 break;
358 case MATCH_END_ONLY:
359 r->match = regex_match_end;
360 break;
361 }
362
363 pred->not ^= not;
364}
365
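
A small worked example of filter_parse_regex() as defined above (the input string is hypothetical):

static void demo_parse(void)
{
	char buf[] = "!sched_*";	/* hypothetical glob from a filter string */
	char *search;
	int not;
	enum regex_type type;

	type = filter_parse_regex(buf, strlen(buf), &search, &not);
	/*
	 * type   == MATCH_FRONT_ONLY ('*' at the end of the pattern)
	 * search == "sched_"         (the '*' has been cut off in place)
	 * not    == 1                (leading '!' inverts the match)
	 */
}
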
212/* return 1 if event matches, 0 otherwise (discard) */ 366/* return 1 if event matches, 0 otherwise (discard) */
213int filter_match_preds(struct ftrace_event_call *call, void *rec) 367int filter_match_preds(struct event_filter *filter, void *rec)
214{ 368{
215 struct event_filter *filter = call->filter;
216 int match, top = 0, val1 = 0, val2 = 0; 369 int match, top = 0, val1 = 0, val2 = 0;
217 int stack[MAX_FILTER_PRED]; 370 int stack[MAX_FILTER_PRED];
218 struct filter_pred *pred; 371 struct filter_pred *pred;
@@ -355,7 +508,7 @@ static void filter_clear_pred(struct filter_pred *pred)
355{ 508{
356 kfree(pred->field_name); 509 kfree(pred->field_name);
357 pred->field_name = NULL; 510 pred->field_name = NULL;
358 pred->str_len = 0; 511 pred->regex.len = 0;
359} 512}
360 513
361static int filter_set_pred(struct filter_pred *dest, 514static int filter_set_pred(struct filter_pred *dest,
@@ -385,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call)
385 filter->preds[i]->fn = filter_pred_none; 538 filter->preds[i]->fn = filter_pred_none;
386} 539}
387 540
388void destroy_preds(struct ftrace_event_call *call) 541static void __free_preds(struct event_filter *filter)
389{ 542{
390 struct event_filter *filter = call->filter;
391 int i; 543 int i;
392 544
393 if (!filter) 545 if (!filter)
@@ -400,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call)
400 kfree(filter->preds); 552 kfree(filter->preds);
401 kfree(filter->filter_string); 553 kfree(filter->filter_string);
402 kfree(filter); 554 kfree(filter);
555}
556
557void destroy_preds(struct ftrace_event_call *call)
558{
559 __free_preds(call->filter);
403 call->filter = NULL; 560 call->filter = NULL;
561 call->filter_active = 0;
404} 562}
405 563
406static int init_preds(struct ftrace_event_call *call) 564static struct event_filter *__alloc_preds(void)
407{ 565{
408 struct event_filter *filter; 566 struct event_filter *filter;
409 struct filter_pred *pred; 567 struct filter_pred *pred;
410 int i; 568 int i;
411 569
412 if (call->filter) 570 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
413 return 0; 571 if (!filter)
414 572 return ERR_PTR(-ENOMEM);
415 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
416 if (!call->filter)
417 return -ENOMEM;
418 573
419 filter->n_preds = 0; 574 filter->n_preds = 0;
420 575
@@ -430,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call)
430 filter->preds[i] = pred; 585 filter->preds[i] = pred;
431 } 586 }
432 587
433 return 0; 588 return filter;
434 589
435oom: 590oom:
436 destroy_preds(call); 591 __free_preds(filter);
592 return ERR_PTR(-ENOMEM);
593}
437 594
438 return -ENOMEM; 595static int init_preds(struct ftrace_event_call *call)
596{
597 if (call->filter)
598 return 0;
599
600 call->filter_active = 0;
601 call->filter = __alloc_preds();
602 if (IS_ERR(call->filter))
603 return PTR_ERR(call->filter);
604
605 return 0;
439} 606}
440 607
441static int init_subsystem_preds(struct event_subsystem *system) 608static int init_subsystem_preds(struct event_subsystem *system)
@@ -458,14 +625,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
458 return 0; 625 return 0;
459} 626}
460 627
461enum { 628static void filter_free_subsystem_preds(struct event_subsystem *system)
462 FILTER_DISABLE_ALL,
463 FILTER_INIT_NO_RESET,
464 FILTER_SKIP_NO_RESET,
465};
466
467static void filter_free_subsystem_preds(struct event_subsystem *system,
468 int flag)
469{ 629{
470 struct ftrace_event_call *call; 630 struct ftrace_event_call *call;
471 631
@@ -476,14 +636,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
476 if (strcmp(call->system, system->name) != 0) 636 if (strcmp(call->system, system->name) != 0)
477 continue; 637 continue;
478 638
479 if (flag == FILTER_INIT_NO_RESET) {
480 call->filter->no_reset = false;
481 continue;
482 }
483
484 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
485 continue;
486
487 filter_disable_preds(call); 639 filter_disable_preds(call);
488 remove_filter_string(call->filter); 640 remove_filter_string(call->filter);
489 } 641 }
@@ -491,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
491 643
492static int filter_add_pred_fn(struct filter_parse_state *ps, 644static int filter_add_pred_fn(struct filter_parse_state *ps,
493 struct ftrace_event_call *call, 645 struct ftrace_event_call *call,
646 struct event_filter *filter,
494 struct filter_pred *pred, 647 struct filter_pred *pred,
495 filter_pred_fn_t fn) 648 filter_pred_fn_t fn)
496{ 649{
497 struct event_filter *filter = call->filter;
498 int idx, err; 650 int idx, err;
499 651
500 if (filter->n_preds == MAX_FILTER_PRED) { 652 if (filter->n_preds == MAX_FILTER_PRED) {
@@ -509,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
509 return err; 661 return err;
510 662
511 filter->n_preds++; 663 filter->n_preds++;
512 call->filter_active = 1;
513 664
514 return 0; 665 return 0;
515} 666}
@@ -534,7 +685,10 @@ static bool is_string_field(struct ftrace_event_field *field)
534 685
535static int is_legal_op(struct ftrace_event_field *field, int op) 686static int is_legal_op(struct ftrace_event_field *field, int op)
536{ 687{
537 if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) 688 if (is_string_field(field) &&
689 (op != OP_EQ && op != OP_NE && op != OP_GLOB))
690 return 0;
691 if (!is_string_field(field) && op == OP_GLOB)
538 return 0; 692 return 0;
539 693
540 return 1; 694 return 1;
@@ -585,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
585 739
586static int filter_add_pred(struct filter_parse_state *ps, 740static int filter_add_pred(struct filter_parse_state *ps,
587 struct ftrace_event_call *call, 741 struct ftrace_event_call *call,
742 struct event_filter *filter,
588 struct filter_pred *pred, 743 struct filter_pred *pred,
589 bool dry_run) 744 bool dry_run)
590{ 745{
@@ -619,21 +774,22 @@ static int filter_add_pred(struct filter_parse_state *ps,
619 } 774 }
620 775
621 if (is_string_field(field)) { 776 if (is_string_field(field)) {
622 pred->str_len = field->size; 777 filter_build_regex(pred);
623 778
624 if (field->filter_type == FILTER_STATIC_STRING) 779 if (field->filter_type == FILTER_STATIC_STRING) {
625 fn = filter_pred_string; 780 fn = filter_pred_string;
626 else if (field->filter_type == FILTER_DYN_STRING) 781 pred->regex.field_len = field->size;
782 } else if (field->filter_type == FILTER_DYN_STRING)
627 fn = filter_pred_strloc; 783 fn = filter_pred_strloc;
628 else { 784 else {
629 fn = filter_pred_pchar; 785 fn = filter_pred_pchar;
630 pred->str_len = strlen(pred->str_val); 786 pred->regex.field_len = strlen(pred->regex.pattern);
631 } 787 }
632 } else { 788 } else {
633 if (field->is_signed) 789 if (field->is_signed)
634 ret = strict_strtoll(pred->str_val, 0, &val); 790 ret = strict_strtoll(pred->regex.pattern, 0, &val);
635 else 791 else
636 ret = strict_strtoull(pred->str_val, 0, &val); 792 ret = strict_strtoull(pred->regex.pattern, 0, &val);
637 if (ret) { 793 if (ret) {
638 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 794 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
639 return -EINVAL; 795 return -EINVAL;
@@ -653,45 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
653 809
654add_pred_fn: 810add_pred_fn:
655 if (!dry_run) 811 if (!dry_run)
656 return filter_add_pred_fn(ps, call, pred, fn); 812 return filter_add_pred_fn(ps, call, filter, pred, fn);
657 return 0;
658}
659
660static int filter_add_subsystem_pred(struct filter_parse_state *ps,
661 struct event_subsystem *system,
662 struct filter_pred *pred,
663 char *filter_string,
664 bool dry_run)
665{
666 struct ftrace_event_call *call;
667 int err = 0;
668 bool fail = true;
669
670 list_for_each_entry(call, &ftrace_events, list) {
671
672 if (!call->define_fields)
673 continue;
674
675 if (strcmp(call->system, system->name))
676 continue;
677
678 if (call->filter->no_reset)
679 continue;
680
681 err = filter_add_pred(ps, call, pred, dry_run);
682 if (err)
683 call->filter->no_reset = true;
684 else
685 fail = false;
686
687 if (!dry_run)
688 replace_filter_string(call->filter, filter_string);
689 }
690
691 if (fail) {
692 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
693 return err;
694 }
695 return 0; 813 return 0;
696} 814}
697 815
@@ -892,8 +1010,9 @@ static void postfix_clear(struct filter_parse_state *ps)
892 1010
893 while (!list_empty(&ps->postfix)) { 1011 while (!list_empty(&ps->postfix)) {
894 elt = list_first_entry(&ps->postfix, struct postfix_elt, list); 1012 elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
895 kfree(elt->operand);
896 list_del(&elt->list); 1013 list_del(&elt->list);
1014 kfree(elt->operand);
1015 kfree(elt);
897 } 1016 }
898} 1017}
899 1018
@@ -1003,8 +1122,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
1003 return NULL; 1122 return NULL;
1004 } 1123 }
1005 1124
1006 strcpy(pred->str_val, operand2); 1125 strcpy(pred->regex.pattern, operand2);
1007 pred->str_len = strlen(operand2); 1126 pred->regex.len = strlen(pred->regex.pattern);
1008 1127
1009 pred->op = op; 1128 pred->op = op;
1010 1129
@@ -1048,8 +1167,8 @@ static int check_preds(struct filter_parse_state *ps)
1048 return 0; 1167 return 0;
1049} 1168}
1050 1169
1051static int replace_preds(struct event_subsystem *system, 1170static int replace_preds(struct ftrace_event_call *call,
1052 struct ftrace_event_call *call, 1171 struct event_filter *filter,
1053 struct filter_parse_state *ps, 1172 struct filter_parse_state *ps,
1054 char *filter_string, 1173 char *filter_string,
1055 bool dry_run) 1174 bool dry_run)
@@ -1096,11 +1215,7 @@ static int replace_preds(struct event_subsystem *system,
1096add_pred: 1215add_pred:
1097 if (!pred) 1216 if (!pred)
1098 return -ENOMEM; 1217 return -ENOMEM;
1099 if (call) 1218 err = filter_add_pred(ps, call, filter, pred, dry_run);
1100 err = filter_add_pred(ps, call, pred, false);
1101 else
1102 err = filter_add_subsystem_pred(ps, system, pred,
1103 filter_string, dry_run);
1104 filter_free_pred(pred); 1219 filter_free_pred(pred);
1105 if (err) 1220 if (err)
1106 return err; 1221 return err;
@@ -1111,10 +1226,50 @@ add_pred:
1111 return 0; 1226 return 0;
1112} 1227}
1113 1228
1114int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1229static int replace_system_preds(struct event_subsystem *system,
1230 struct filter_parse_state *ps,
1231 char *filter_string)
1115{ 1232{
1233 struct ftrace_event_call *call;
1234 bool fail = true;
1116 int err; 1235 int err;
1117 1236
1237 list_for_each_entry(call, &ftrace_events, list) {
1238 struct event_filter *filter = call->filter;
1239
1240 if (!call->define_fields)
1241 continue;
1242
1243 if (strcmp(call->system, system->name) != 0)
1244 continue;
1245
1246 /* try to see if the filter can be applied */
1247 err = replace_preds(call, filter, ps, filter_string, true);
1248 if (err)
1249 continue;
1250
1251 /* really apply the filter */
1252 filter_disable_preds(call);
1253 err = replace_preds(call, filter, ps, filter_string, false);
1254 if (err)
1255 filter_disable_preds(call);
1256 else {
1257 call->filter_active = 1;
1258 replace_filter_string(filter, filter_string);
1259 }
1260 fail = false;
1261 }
1262
1263 if (fail) {
1264 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1265 return -EINVAL;
1266 }
1267 return 0;
1268}
1269
1270int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1271{
1272 int err;
1118 struct filter_parse_state *ps; 1273 struct filter_parse_state *ps;
1119 1274
1120 mutex_lock(&event_mutex); 1275 mutex_lock(&event_mutex);
@@ -1126,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1126 if (!strcmp(strstrip(filter_string), "0")) { 1281 if (!strcmp(strstrip(filter_string), "0")) {
1127 filter_disable_preds(call); 1282 filter_disable_preds(call);
1128 remove_filter_string(call->filter); 1283 remove_filter_string(call->filter);
1129 mutex_unlock(&event_mutex); 1284 goto out_unlock;
1130 return 0;
1131 } 1285 }
1132 1286
1133 err = -ENOMEM; 1287 err = -ENOMEM;
@@ -1145,10 +1299,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1145 goto out; 1299 goto out;
1146 } 1300 }
1147 1301
1148 err = replace_preds(NULL, call, ps, filter_string, false); 1302 err = replace_preds(call, call->filter, ps, filter_string, false);
1149 if (err) 1303 if (err)
1150 append_filter_err(ps, call->filter); 1304 append_filter_err(ps, call->filter);
1151 1305 else
1306 call->filter_active = 1;
1152out: 1307out:
1153 filter_opstack_clear(ps); 1308 filter_opstack_clear(ps);
1154 postfix_clear(ps); 1309 postfix_clear(ps);
@@ -1163,7 +1318,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1163 char *filter_string) 1318 char *filter_string)
1164{ 1319{
1165 int err; 1320 int err;
1166
1167 struct filter_parse_state *ps; 1321 struct filter_parse_state *ps;
1168 1322
1169 mutex_lock(&event_mutex); 1323 mutex_lock(&event_mutex);
@@ -1173,10 +1327,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1173 goto out_unlock; 1327 goto out_unlock;
1174 1328
1175 if (!strcmp(strstrip(filter_string), "0")) { 1329 if (!strcmp(strstrip(filter_string), "0")) {
1176 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); 1330 filter_free_subsystem_preds(system);
1177 remove_filter_string(system->filter); 1331 remove_filter_string(system->filter);
1178 mutex_unlock(&event_mutex); 1332 goto out_unlock;
1179 return 0;
1180 } 1333 }
1181 1334
1182 err = -ENOMEM; 1335 err = -ENOMEM;
@@ -1193,31 +1346,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1193 goto out; 1346 goto out;
1194 } 1347 }
1195 1348
1196 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); 1349 err = replace_system_preds(system, ps, filter_string);
1197 1350 if (err)
1198 /* try to see the filter can be applied to which events */
1199 err = replace_preds(system, NULL, ps, filter_string, true);
1200 if (err) {
1201 append_filter_err(ps, system->filter); 1351 append_filter_err(ps, system->filter);
1202 goto out; 1352
1353out:
1354 filter_opstack_clear(ps);
1355 postfix_clear(ps);
1356 kfree(ps);
1357out_unlock:
1358 mutex_unlock(&event_mutex);
1359
1360 return err;
1361}
1362
1363#ifdef CONFIG_EVENT_PROFILE
1364
1365void ftrace_profile_free_filter(struct perf_event *event)
1366{
1367 struct event_filter *filter = event->filter;
1368
1369 event->filter = NULL;
1370 __free_preds(filter);
1371}
1372
1373int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1374 char *filter_str)
1375{
1376 int err;
1377 struct event_filter *filter;
1378 struct filter_parse_state *ps;
1379 struct ftrace_event_call *call = NULL;
1380
1381 mutex_lock(&event_mutex);
1382
1383 list_for_each_entry(call, &ftrace_events, list) {
1384 if (call->id == event_id)
1385 break;
1203 } 1386 }
1204 1387
1205 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); 1388 err = -EINVAL;
1389 if (!call)
1390 goto out_unlock;
1391
1392 err = -EEXIST;
1393 if (event->filter)
1394 goto out_unlock;
1206 1395
1207 /* really apply the filter to the events */ 1396 filter = __alloc_preds();
1208 err = replace_preds(system, NULL, ps, filter_string, false); 1397 if (IS_ERR(filter)) {
1209 if (err) { 1398 err = PTR_ERR(filter);
1210 append_filter_err(ps, system->filter); 1399 goto out_unlock;
1211 filter_free_subsystem_preds(system, 2);
1212 } 1400 }
1213 1401
1214out: 1402 err = -ENOMEM;
1403 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1404 if (!ps)
1405 goto free_preds;
1406
1407 parse_init(ps, filter_ops, filter_str);
1408 err = filter_parse(ps);
1409 if (err)
1410 goto free_ps;
1411
1412 err = replace_preds(call, filter, ps, filter_str, false);
1413 if (!err)
1414 event->filter = filter;
1415
1416free_ps:
1215 filter_opstack_clear(ps); 1417 filter_opstack_clear(ps);
1216 postfix_clear(ps); 1418 postfix_clear(ps);
1217 kfree(ps); 1419 kfree(ps);
1420
1421free_preds:
1422 if (err)
1423 __free_preds(filter);
1424
1218out_unlock: 1425out_unlock:
1219 mutex_unlock(&event_mutex); 1426 mutex_unlock(&event_mutex);
1220 1427
1221 return err; 1428 return err;
1222} 1429}
1223 1430
1431#endif /* CONFIG_EVENT_PROFILE */
1432
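
The CONFIG_EVENT_PROFILE block above is what lets perf attach an event filter from user space. A hedged user-space sketch (the fd setup is elided, and the field names assume the kmem_alloc event format shown earlier):

#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* attach a tracepoint filter string to an already-opened perf event fd */
static int demo_set_filter(int perf_fd)
{
	return ioctl(perf_fd, PERF_EVENT_IOC_SET_FILTER,
		     "bytes_req > 1024 && call_site != 0");
}
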
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index df1bf6e48bb9..dff8c84ddf17 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -15,146 +15,127 @@
15 15
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM ftrace
18 20
19#undef TRACE_STRUCT 21/* not needed for this file */
20#define TRACE_STRUCT(args...) args 22#undef __field_struct
23#define __field_struct(type, item)
21 24
22extern void __bad_type_size(void); 25#undef __field
26#define __field(type, item) type item;
23 27
24#undef TRACE_FIELD 28#undef __field_desc
25#define TRACE_FIELD(type, item, assign) \ 29#define __field_desc(type, container, item) type item;
26 if (sizeof(type) != sizeof(field.item)) \ 30
27 __bad_type_size(); \ 31#undef __array
32#define __array(type, item, size) type item[size];
33
34#undef __array_desc
35#define __array_desc(type, container, item, size) type item[size];
36
37#undef __dynamic_array
38#define __dynamic_array(type, item) type item[];
39
40#undef F_STRUCT
41#define F_STRUCT(args...) args
42
43#undef F_printk
44#define F_printk(fmt, args...) fmt, args
45
46#undef FTRACE_ENTRY
47#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
48struct ____ftrace_##name { \
49 tstruct \
50}; \
51static void __always_unused ____ftrace_check_##name(void) \
52{ \
53 struct ____ftrace_##name *__entry = NULL; \
54 \
55 /* force compile-time check on F_printk() */ \
56 printk(print); \
57}
58
59#undef FTRACE_ENTRY_DUP
60#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
61 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
62
63#include "trace_entries.h"
64
65
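
trace_export.c re-includes trace_entries.h several times with __field()/__array()/FTRACE_ENTRY() redefined, so one description file generates both the struct checks and the format/field code. A toy sketch of that multi-include technique with hypothetical demo_* names:

/* demo_entries.h (hypothetical) would contain only descriptions, e.g.:
 *	DEMO_ENTRY(foo, __field(int, a) __field(long, b))
 */

/* pass 1: generate the structure definitions */
#undef __field
#define __field(type, item)		type item;
#undef DEMO_ENTRY
#define DEMO_ENTRY(name, fields)	struct demo_##name { fields };
#include "demo_entries.h"

/* pass 2: generate a format printer from the very same descriptions */
#undef __field
#define __field(type, item)						\
	printk("\tfield:" #type " " #item ";\tsize:%zu;\n",		\
	       sizeof(field.item));
#undef DEMO_ENTRY
#define DEMO_ENTRY(name, fields)					\
static void demo_show_##name##_format(void)				\
{									\
	struct demo_##name field;					\
	fields								\
}
#include "demo_entries.h"
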
66#undef __field
67#define __field(type, item) \
28 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
29 "offset:%u;\tsize:%u;\n", \ 69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
30 (unsigned int)offsetof(typeof(field), item), \ 70 offsetof(typeof(field), item), \
31 (unsigned int)sizeof(field.item)); \ 71 sizeof(field.item), is_signed_type(type)); \
32 if (!ret) \ 72 if (!ret) \
33 return 0; 73 return 0;
34 74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item), \
81 is_signed_type(type)); \
82 if (!ret) \
83 return 0;
35 84
36#undef TRACE_FIELD_SPECIAL 85#undef __array
37#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ 86#define __array(type, item, len) \
38 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ 87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
39 "offset:%u;\tsize:%u;\n", \ 88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
40 (unsigned int)offsetof(typeof(field), item), \ 89 offsetof(typeof(field), item), \
41 (unsigned int)sizeof(field.item)); \ 90 sizeof(field.item), is_signed_type(type)); \
42 if (!ret) \ 91 if (!ret) \
43 return 0; 92 return 0;
44 93
45#undef TRACE_FIELD_ZERO_CHAR 94#undef __array_desc
46#define TRACE_FIELD_ZERO_CHAR(item) \ 95#define __array_desc(type, container, item, len) \
47 ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \ 96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
48 "offset:%u;\tsize:0;\n", \ 97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
49 (unsigned int)offsetof(typeof(field), item)); \ 98 offsetof(typeof(field), container.item), \
99 sizeof(field.container.item), \
100 is_signed_type(type)); \
50 if (!ret) \ 101 if (!ret) \
51 return 0; 102 return 0;
52 103
53#undef TRACE_FIELD_SIGN 104#undef __dynamic_array
54#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 105#define __dynamic_array(type, item) \
55 TRACE_FIELD(type, item, assign) 106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
110 if (!ret) \
111 return 0;
56 112
57#undef TP_RAW_FMT 113#undef F_printk
58#define TP_RAW_FMT(args...) args 114#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
59 115
60#undef TRACE_EVENT_FORMAT 116#undef __entry
61#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 117#define __entry REC
62static int \
63ftrace_format_##call(struct ftrace_event_call *unused, \
64 struct trace_seq *s) \
65{ \
66 struct args field; \
67 int ret; \
68 \
69 tstruct; \
70 \
71 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
72 \
73 return ret; \
74}
75 118
76#undef TRACE_EVENT_FORMAT_NOFILTER 119#undef FTRACE_ENTRY
77#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
78 tpfmt) \
79static int \ 121static int \
80ftrace_format_##call(struct ftrace_event_call *unused, \ 122ftrace_format_##name(struct ftrace_event_call *unused, \
81 struct trace_seq *s) \ 123 struct trace_seq *s) \
82{ \ 124{ \
83 struct args field; \ 125 struct struct_name field __attribute__((unused)); \
84 int ret; \ 126 int ret = 0; \
85 \ 127 \
86 tstruct; \ 128 tstruct; \
87 \ 129 \
88 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ 130 trace_seq_printf(s, "\nprint fmt: " print); \
89 \ 131 \
90 return ret; \ 132 return ret; \
91} 133}
92 134
93#include "trace_event_types.h" 135#include "trace_entries.h"
94
95#undef TRACE_ZERO_CHAR
96#define TRACE_ZERO_CHAR(arg)
97
98#undef TRACE_FIELD
99#define TRACE_FIELD(type, item, assign)\
100 entry->item = assign;
101
102#undef TRACE_FIELD
103#define TRACE_FIELD(type, item, assign)\
104 entry->item = assign;
105 136
106#undef TRACE_FIELD_SIGN 137#undef __field
107#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 138#define __field(type, item) \
108 TRACE_FIELD(type, item, assign)
109
110#undef TP_CMD
111#define TP_CMD(cmd...) cmd
112
113#undef TRACE_ENTRY
114#define TRACE_ENTRY entry
115
116#undef TRACE_FIELD_SPECIAL
117#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
118 cmd;
119
120#undef TRACE_EVENT_FORMAT
121#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
122int ftrace_define_fields_##call(struct ftrace_event_call *event_call); \
123static int ftrace_raw_init_event_##call(void); \
124 \
125struct ftrace_event_call __used \
126__attribute__((__aligned__(4))) \
127__attribute__((section("_ftrace_events"))) event_##call = { \
128 .name = #call, \
129 .id = proto, \
130 .system = __stringify(TRACE_SYSTEM), \
131 .raw_init = ftrace_raw_init_event_##call, \
132 .show_format = ftrace_format_##call, \
133 .define_fields = ftrace_define_fields_##call, \
134}; \
135static int ftrace_raw_init_event_##call(void) \
136{ \
137 INIT_LIST_HEAD(&event_##call.fields); \
138 return 0; \
139} \
140
141#undef TRACE_EVENT_FORMAT_NOFILTER
142#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
143 tpfmt) \
144 \
145struct ftrace_event_call __used \
146__attribute__((__aligned__(4))) \
147__attribute__((section("_ftrace_events"))) event_##call = { \
148 .name = #call, \
149 .id = proto, \
150 .system = __stringify(TRACE_SYSTEM), \
151 .show_format = ftrace_format_##call, \
152};
153
154#include "trace_event_types.h"
155
156#undef TRACE_FIELD
157#define TRACE_FIELD(type, item, assign) \
158 ret = trace_define_field(event_call, #type, #item, \ 139 ret = trace_define_field(event_call, #type, #item, \
159 offsetof(typeof(field), item), \ 140 offsetof(typeof(field), item), \
160 sizeof(field.item), \ 141 sizeof(field.item), \
@@ -162,32 +143,45 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
162 if (ret) \ 143 if (ret) \
163 return ret; 144 return ret;
164 145
165#undef TRACE_FIELD_SPECIAL 146#undef __field_desc
166#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ 147#define __field_desc(type, container, item) \
148 ret = trace_define_field(event_call, #type, #item, \
149 offsetof(typeof(field), \
150 container.item), \
151 sizeof(field.container.item), \
152 is_signed_type(type), FILTER_OTHER); \
153 if (ret) \
154 return ret;
155
156#undef __array
157#define __array(type, item, len) \
158 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
167 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 159 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
168 offsetof(typeof(field), item), \ 160 offsetof(typeof(field), item), \
169 sizeof(field.item), 0, FILTER_OTHER); \ 161 sizeof(field.item), 0, FILTER_OTHER); \
170 if (ret) \ 162 if (ret) \
171 return ret; 163 return ret;
172 164
173#undef TRACE_FIELD_SIGN 165#undef __array_desc
174#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 166#define __array_desc(type, container, item, len) \
175 ret = trace_define_field(event_call, #type, #item, \ 167 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
176 offsetof(typeof(field), item), \ 168 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
177 sizeof(field.item), is_signed, \ 169 offsetof(typeof(field), \
170 container.item), \
171 sizeof(field.container.item), 0, \
178 FILTER_OTHER); \ 172 FILTER_OTHER); \
179 if (ret) \ 173 if (ret) \
180 return ret; 174 return ret;
181 175
182#undef TRACE_FIELD_ZERO_CHAR 176#undef __dynamic_array
183#define TRACE_FIELD_ZERO_CHAR(item) 177#define __dynamic_array(type, item)
184 178
185#undef TRACE_EVENT_FORMAT 179#undef FTRACE_ENTRY
186#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 180#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
187int \ 181int \
188ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ 182ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
189{ \ 183{ \
190 struct args field; \ 184 struct struct_name field; \
191 int ret; \ 185 int ret; \
192 \ 186 \
193 ret = trace_define_common_fields(event_call); \ 187 ret = trace_define_common_fields(event_call); \
@@ -199,8 +193,41 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
199 return ret; \ 193 return ret; \
200} 194}
201 195
202#undef TRACE_EVENT_FORMAT_NOFILTER 196#include "trace_entries.h"
203#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 197
204 tpfmt) 198static int ftrace_raw_init_event(struct ftrace_event_call *call)
199{
200 INIT_LIST_HEAD(&call->fields);
201 return 0;
202}
203
204#undef __field
205#define __field(type, item)
206
207#undef __field_desc
208#define __field_desc(type, container, item)
209
210#undef __array
211#define __array(type, item, len)
212
213#undef __array_desc
214#define __array_desc(type, container, item, len)
215
216#undef __dynamic_array
217#define __dynamic_array(type, item)
218
219#undef FTRACE_ENTRY
220#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
221 \
222struct ftrace_event_call __used \
223__attribute__((__aligned__(4))) \
224__attribute__((section("_ftrace_events"))) event_##call = { \
225 .name = #call, \
226 .id = type, \
227 .system = __stringify(TRACE_SYSTEM), \
228 .raw_init = ftrace_raw_init_event, \
229 .show_format = ftrace_format_##call, \
230 .define_fields = ftrace_define_fields_##call, \
231}; \
205 232
206#include "trace_event_types.h" 233#include "trace_entries.h"
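For orientation, the FTRACE_ENTRY() machinery above is exercised by the event definitions in trace_entries.h. A minimal sketch of one such definition follows; it is loosely modeled on the function-entry event, and the TRACE_FN id, the F_STRUCT() wrapper and the exact field list are recalled from memory rather than taken from this hunk:

FTRACE_ENTRY(function, ftrace_entry,

	TRACE_FN,

	F_STRUCT(
		__field(	unsigned long,	ip		)
		__field(	unsigned long,	parent_ip	)
	),

	F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
);

Run through the macro redefinitions above, a single FTRACE_ENTRY() expands into ftrace_format_function(), ftrace_define_fields_function() and a statically allocated event_function placed in the _ftrace_events section.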
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5b01b94518fc..b3f3776b0cd6 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -290,7 +290,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
290{ 290{
291 long count = (long)data; 291 long count = (long)data;
292 292
293 seq_printf(m, "%pf:", (void *)ip); 293 seq_printf(m, "%ps:", (void *)ip);
294 294
295 if (ops == &traceon_probe_ops) 295 if (ops == &traceon_probe_ops)
296 seq_printf(m, "traceon"); 296 seq_printf(m, "traceon");
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b3749a2c3132..45e6c01b2e4d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -124,7 +124,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) { 124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop(); 125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n" 126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %pF return to %lx\n", 127 " from func %ps return to %lx\n",
128 current->ret_stack[index].fp, 128 current->ret_stack[index].fp,
129 frame_pointer, 129 frame_pointer,
130 (void *)current->ret_stack[index].func, 130 (void *)current->ret_stack[index].func,
@@ -364,6 +364,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
364} 364}
365 365
366 366
367static enum print_line_t
368print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
369{
370 if (!trace_seq_putc(s, ' '))
371 return 0;
372
373 return trace_print_lat_fmt(s, entry);
374}
375
367/* If the pid changed since the last trace, output this event */ 376/* If the pid changed since the last trace, output this event */
368static enum print_line_t 377static enum print_line_t
369verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) 378verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
@@ -521,6 +530,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
521 if (ret == TRACE_TYPE_PARTIAL_LINE) 530 if (ret == TRACE_TYPE_PARTIAL_LINE)
522 return TRACE_TYPE_PARTIAL_LINE; 531 return TRACE_TYPE_PARTIAL_LINE;
523 } 532 }
533
524 /* Proc */ 534 /* Proc */
525 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 535 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
526 ret = print_graph_proc(s, pid); 536 ret = print_graph_proc(s, pid);
@@ -659,7 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
659 return TRACE_TYPE_PARTIAL_LINE; 669 return TRACE_TYPE_PARTIAL_LINE;
660 } 670 }
661 671
662 ret = trace_seq_printf(s, "%pf();\n", (void *)call->func); 672 ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);
663 if (!ret) 673 if (!ret)
664 return TRACE_TYPE_PARTIAL_LINE; 674 return TRACE_TYPE_PARTIAL_LINE;
665 675
@@ -702,7 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
702 return TRACE_TYPE_PARTIAL_LINE; 712 return TRACE_TYPE_PARTIAL_LINE;
703 } 713 }
704 714
705 ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func); 715 ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);
706 if (!ret) 716 if (!ret)
707 return TRACE_TYPE_PARTIAL_LINE; 717 return TRACE_TYPE_PARTIAL_LINE;
708 718
@@ -758,6 +768,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
758 return TRACE_TYPE_PARTIAL_LINE; 768 return TRACE_TYPE_PARTIAL_LINE;
759 } 769 }
760 770
771 /* Latency format */
772 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
773 ret = print_graph_lat_fmt(s, ent);
774 if (ret == TRACE_TYPE_PARTIAL_LINE)
775 return TRACE_TYPE_PARTIAL_LINE;
776 }
777
761 return 0; 778 return 0;
762} 779}
763 780
@@ -952,28 +969,59 @@ print_graph_function(struct trace_iterator *iter)
952 return TRACE_TYPE_HANDLED; 969 return TRACE_TYPE_HANDLED;
953} 970}
954 971
972static void print_lat_header(struct seq_file *s)
973{
974 static const char spaces[] = " " /* 16 spaces */
975 " " /* 4 spaces */
976 " "; /* 17 spaces */
977 int size = 0;
978
979 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
980 size += 16;
981 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
982 size += 4;
983 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
984 size += 17;
985
986 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
987 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
988 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
989 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
990 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces);
991 seq_printf(s, "#%.*s|||| / \n", size, spaces);
992}
993
955static void print_graph_headers(struct seq_file *s) 994static void print_graph_headers(struct seq_file *s)
956{ 995{
996 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
997
998 if (lat)
999 print_lat_header(s);
1000
957 /* 1st line */ 1001 /* 1st line */
958 seq_printf(s, "# "); 1002 seq_printf(s, "#");
959 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1003 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
960 seq_printf(s, " TIME "); 1004 seq_printf(s, " TIME ");
961 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1005 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
962 seq_printf(s, "CPU"); 1006 seq_printf(s, " CPU");
963 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1007 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
964 seq_printf(s, " TASK/PID "); 1008 seq_printf(s, " TASK/PID ");
1009 if (lat)
1010 seq_printf(s, "|||||");
965 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1011 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
966 seq_printf(s, " DURATION "); 1012 seq_printf(s, " DURATION ");
967 seq_printf(s, " FUNCTION CALLS\n"); 1013 seq_printf(s, " FUNCTION CALLS\n");
968 1014
969 /* 2nd line */ 1015 /* 2nd line */
970 seq_printf(s, "# "); 1016 seq_printf(s, "#");
971 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1017 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
972 seq_printf(s, " | "); 1018 seq_printf(s, " | ");
973 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1019 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
974 seq_printf(s, "| "); 1020 seq_printf(s, " | ");
975 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1021 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
976 seq_printf(s, " | | "); 1022 seq_printf(s, " | | ");
1023 if (lat)
1024 seq_printf(s, "|||||");
977 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1025 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
978 seq_printf(s, " | | "); 1026 seq_printf(s, " | | ");
979 seq_printf(s, " | | | |\n"); 1027 seq_printf(s, " | | | |\n");
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index ca7d7c4d0c2a..69543a905cd5 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -155,7 +155,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
155 seq_print_ip_sym(seq, it->from, symflags) && 155 seq_print_ip_sym(seq, it->from, symflags) &&
156 trace_seq_printf(seq, "\n")) 156 trace_seq_printf(seq, "\n"))
157 return TRACE_TYPE_HANDLED; 157 return TRACE_TYPE_HANDLED;
158 return TRACE_TYPE_PARTIAL_LINE;; 158 return TRACE_TYPE_PARTIAL_LINE;
159 } 159 }
160 return TRACE_TYPE_UNHANDLED; 160 return TRACE_TYPE_UNHANDLED;
161} 161}
@@ -165,6 +165,7 @@ void trace_hw_branch(u64 from, u64 to)
165 struct ftrace_event_call *call = &event_hw_branch; 165 struct ftrace_event_call *call = &event_hw_branch;
166 struct trace_array *tr = hw_branch_trace; 166 struct trace_array *tr = hw_branch_trace;
167 struct ring_buffer_event *event; 167 struct ring_buffer_event *event;
168 struct ring_buffer *buf;
168 struct hw_branch_entry *entry; 169 struct hw_branch_entry *entry;
169 unsigned long irq1; 170 unsigned long irq1;
170 int cpu; 171 int cpu;
@@ -180,7 +181,8 @@ void trace_hw_branch(u64 from, u64 to)
180 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) 181 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
181 goto out; 182 goto out;
182 183
183 event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES, 184 buf = tr->buffer;
185 event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
184 sizeof(*entry), 0, 0); 186 sizeof(*entry), 0, 0);
185 if (!event) 187 if (!event)
186 goto out; 188 goto out;
@@ -189,8 +191,8 @@ void trace_hw_branch(u64 from, u64 to)
189 entry->ent.type = TRACE_HW_BRANCHES; 191 entry->ent.type = TRACE_HW_BRANCHES;
190 entry->from = from; 192 entry->from = from;
191 entry->to = to; 193 entry->to = to;
192 if (!filter_check_discard(call, entry, tr->buffer, event)) 194 if (!filter_check_discard(call, entry, buf, event))
193 trace_buffer_unlock_commit(tr, event, 0, 0); 195 trace_buffer_unlock_commit(buf, event, 0, 0);
194 196
195 out: 197 out:
196 atomic_dec(&tr->data[cpu]->disabled); 198 atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5555b75a0d12..3aa7eaa2114c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -129,15 +129,10 @@ check_critical_timing(struct trace_array *tr,
129 unsigned long parent_ip, 129 unsigned long parent_ip,
130 int cpu) 130 int cpu)
131{ 131{
132 unsigned long latency, t0, t1;
133 cycle_t T0, T1, delta; 132 cycle_t T0, T1, delta;
134 unsigned long flags; 133 unsigned long flags;
135 int pc; 134 int pc;
136 135
137 /*
138 * usecs conversion is slow so we try to delay the conversion
139 * as long as possible:
140 */
141 T0 = data->preempt_timestamp; 136 T0 = data->preempt_timestamp;
142 T1 = ftrace_now(cpu); 137 T1 = ftrace_now(cpu);
143 delta = T1-T0; 138 delta = T1-T0;
@@ -157,18 +152,15 @@ check_critical_timing(struct trace_array *tr,
157 152
158 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
159 154
160 latency = nsecs_to_usecs(delta);
161
162 if (data->critical_sequence != max_sequence) 155 if (data->critical_sequence != max_sequence)
163 goto out_unlock; 156 goto out_unlock;
164 157
165 tracing_max_latency = delta;
166 t0 = nsecs_to_usecs(T0);
167 t1 = nsecs_to_usecs(T1);
168
169 data->critical_end = parent_ip; 158 data->critical_end = parent_ip;
170 159
171 update_max_tr_single(tr, current, cpu); 160 if (likely(!is_tracing_stopped())) {
161 tracing_max_latency = delta;
162 update_max_tr_single(tr, current, cpu);
163 }
172 164
173 max_sequence++; 165 max_sequence++;
174 166
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644
index 000000000000..aff5f80b59b8
--- /dev/null
+++ b/kernel/trace/trace_kprobe.c
@@ -0,0 +1,1523 @@
1/*
2 * Kprobes-based tracing events
3 *
4 * Created by Masami Hiramatsu <mhiramat@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/module.h>
21#include <linux/uaccess.h>
22#include <linux/kprobes.h>
23#include <linux/seq_file.h>
24#include <linux/slab.h>
25#include <linux/smp.h>
26#include <linux/debugfs.h>
27#include <linux/types.h>
28#include <linux/string.h>
29#include <linux/ctype.h>
30#include <linux/ptrace.h>
31#include <linux/perf_event.h>
32
33#include "trace.h"
34#include "trace_output.h"
35
36#define MAX_TRACE_ARGS 128
37#define MAX_ARGSTR_LEN 63
38#define MAX_EVENT_NAME_LEN 64
39#define KPROBE_EVENT_SYSTEM "kprobes"
40
41/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func"
46
47const char *reserved_field_names[] = {
48 "common_type",
49 "common_flags",
50 "common_preempt_count",
51 "common_pid",
52 "common_tgid",
53 "common_lock_depth",
54 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC,
58};
59
60struct fetch_func {
61 unsigned long (*func)(struct pt_regs *, void *);
62 void *data;
63};
64
65static __kprobes unsigned long call_fetch(struct fetch_func *f,
66 struct pt_regs *regs)
67{
68 return f->func(regs, f->data);
69}
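/*
 * Each probe argument ends up as one { handler, data } pair; for example a
 * "%ax"-style argument is parsed into { fetch_register, (void *)reg_offset }
 * and an "@symbol" argument into { fetch_symbol, symbol_cache } (see the
 * argument parsers further below). The register name here is illustrative only.
 */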
70
71/* fetch handlers */
72static __kprobes unsigned long fetch_register(struct pt_regs *regs,
73 void *offset)
74{
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset));
76}
77
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
79 void *num)
80{
81 return regs_get_kernel_stack_nth(regs,
82 (unsigned int)((unsigned long)num));
83}
84
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
86{
87 unsigned long retval;
88
89 if (probe_kernel_address(addr, retval))
90 return 0;
91 return retval;
92}
93
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
95{
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
97}
98
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
100 void *dummy)
101{
102 return regs_return_value(regs);
103}
104
105static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
106 void *dummy)
107{
108 return kernel_stack_pointer(regs);
109}
110
111/* Memory fetching by symbol */
112struct symbol_cache {
113 char *symbol;
114 long offset;
115 unsigned long addr;
116};
117
118static unsigned long update_symbol_cache(struct symbol_cache *sc)
119{
120 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
121 if (sc->addr)
122 sc->addr += sc->offset;
123 return sc->addr;
124}
125
126static void free_symbol_cache(struct symbol_cache *sc)
127{
128 kfree(sc->symbol);
129 kfree(sc);
130}
131
132static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
133{
134 struct symbol_cache *sc;
135
136 if (!sym || strlen(sym) == 0)
137 return NULL;
138 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
139 if (!sc)
140 return NULL;
141
142 sc->symbol = kstrdup(sym, GFP_KERNEL);
143 if (!sc->symbol) {
144 kfree(sc);
145 return NULL;
146 }
147 sc->offset = offset;
148
149 update_symbol_cache(sc);
150 return sc;
151}
152
153static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
154{
155 struct symbol_cache *sc = data;
156
157 if (sc->addr)
158 return fetch_memory(regs, (void *)sc->addr);
159 else
160 return 0;
161}
162
163/* Special indirect memory access interface */
164struct indirect_fetch_data {
165 struct fetch_func orig;
166 long offset;
167};
168
169static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
170{
171 struct indirect_fetch_data *ind = data;
172 unsigned long addr;
173
174 addr = call_fetch(&ind->orig, regs);
175 if (addr) {
176 addr += ind->offset;
177 return fetch_memory(regs, (void *)addr);
178 } else
179 return 0;
180}
181
182static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
183{
184 if (data->orig.func == fetch_indirect)
185 free_indirect_fetch_data(data->orig.data);
186 else if (data->orig.func == fetch_symbol)
187 free_symbol_cache(data->orig.data);
188 kfree(data);
189}
190
191/**
192 * Kprobe event core functions
193 */
194
195struct probe_arg {
196 struct fetch_func fetch;
197 const char *name;
198};
199
200/* Flags for trace_probe */
201#define TP_FLAG_TRACE 1
202#define TP_FLAG_PROFILE 2
203
204struct trace_probe {
205 struct list_head list;
206 struct kretprobe rp; /* Use rp.kp for kprobe use */
207 unsigned long nhit;
208 unsigned int flags; /* For TP_FLAG_* */
209 const char *symbol; /* symbol name */
210 struct ftrace_event_call call;
211 struct trace_event event;
212 unsigned int nr_args;
213 struct probe_arg args[];
214};
215
216#define SIZEOF_TRACE_PROBE(n) \
217 (offsetof(struct trace_probe, args) + \
218 (sizeof(struct probe_arg) * (n)))
219
220static __kprobes int probe_is_return(struct trace_probe *tp)
221{
222 return tp->rp.handler != NULL;
223}
224
225static __kprobes const char *probe_symbol(struct trace_probe *tp)
226{
227 return tp->symbol ? tp->symbol : "unknown";
228}
229
230static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{
232 int ret = -EINVAL;
233
234 if (ff->func == fetch_argument)
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name);
240 } else if (ff->func == fetch_stack)
241 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
242 else if (ff->func == fetch_memory)
243 ret = snprintf(buf, n, "@0x%p", ff->data);
244 else if (ff->func == fetch_symbol) {
245 struct symbol_cache *sc = ff->data;
246 if (sc->offset)
247 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
248 sc->offset);
249 else
250 ret = snprintf(buf, n, "@%s", sc->symbol);
251 } else if (ff->func == fetch_retvalue)
252 ret = snprintf(buf, n, "$retval");
253 else if (ff->func == fetch_stack_address)
254 ret = snprintf(buf, n, "$stack");
255 else if (ff->func == fetch_indirect) {
256 struct indirect_fetch_data *id = ff->data;
257 size_t l = 0;
258 ret = snprintf(buf, n, "%+ld(", id->offset);
259 if (ret >= n)
260 goto end;
261 l += ret;
262 ret = probe_arg_string(buf + l, n - l, &id->orig);
263 if (ret < 0)
264 goto end;
265 l += ret;
266 ret = snprintf(buf + l, n - l, ")");
267 ret += l;
268 }
269end:
270 if (ret >= n)
271 return -ENOSPC;
272 return ret;
273}
274
275static int register_probe_event(struct trace_probe *tp);
276static void unregister_probe_event(struct trace_probe *tp);
277
278static DEFINE_MUTEX(probe_lock);
279static LIST_HEAD(probe_list);
280
281static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
282static int kretprobe_dispatcher(struct kretprobe_instance *ri,
283 struct pt_regs *regs);
284
285/*
286 * Allocate new trace_probe and initialize it (including kprobes).
287 */
288static struct trace_probe *alloc_trace_probe(const char *group,
289 const char *event,
290 void *addr,
291 const char *symbol,
292 unsigned long offs,
293 int nargs, int is_return)
294{
295 struct trace_probe *tp;
296
297 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
298 if (!tp)
299 return ERR_PTR(-ENOMEM);
300
301 if (symbol) {
302 tp->symbol = kstrdup(symbol, GFP_KERNEL);
303 if (!tp->symbol)
304 goto error;
305 tp->rp.kp.symbol_name = tp->symbol;
306 tp->rp.kp.offset = offs;
307 } else
308 tp->rp.kp.addr = addr;
309
310 if (is_return)
311 tp->rp.handler = kretprobe_dispatcher;
312 else
313 tp->rp.kp.pre_handler = kprobe_dispatcher;
314
315 if (!event)
316 goto error;
317 tp->call.name = kstrdup(event, GFP_KERNEL);
318 if (!tp->call.name)
319 goto error;
320
321 if (!group)
322 goto error;
323 tp->call.system = kstrdup(group, GFP_KERNEL);
324 if (!tp->call.system)
325 goto error;
326
327 INIT_LIST_HEAD(&tp->list);
328 return tp;
329error:
330 kfree(tp->call.name);
331 kfree(tp->symbol);
332 kfree(tp);
333 return ERR_PTR(-ENOMEM);
334}
335
336static void free_probe_arg(struct probe_arg *arg)
337{
338 if (arg->fetch.func == fetch_symbol)
339 free_symbol_cache(arg->fetch.data);
340 else if (arg->fetch.func == fetch_indirect)
341 free_indirect_fetch_data(arg->fetch.data);
342 kfree(arg->name);
343}
344
345static void free_trace_probe(struct trace_probe *tp)
346{
347 int i;
348
349 for (i = 0; i < tp->nr_args; i++)
350 free_probe_arg(&tp->args[i]);
351
352 kfree(tp->call.system);
353 kfree(tp->call.name);
354 kfree(tp->symbol);
355 kfree(tp);
356}
357
358static struct trace_probe *find_probe_event(const char *event,
359 const char *group)
360{
361 struct trace_probe *tp;
362
363 list_for_each_entry(tp, &probe_list, list)
364 if (strcmp(tp->call.name, event) == 0 &&
365 strcmp(tp->call.system, group) == 0)
366 return tp;
367 return NULL;
368}
369
370/* Unregister a trace_probe and probe_event: call with locking probe_lock */
371static void unregister_trace_probe(struct trace_probe *tp)
372{
373 if (probe_is_return(tp))
374 unregister_kretprobe(&tp->rp);
375 else
376 unregister_kprobe(&tp->rp.kp);
377 list_del(&tp->list);
378 unregister_probe_event(tp);
379}
380
381/* Register a trace_probe and probe_event */
382static int register_trace_probe(struct trace_probe *tp)
383{
384 struct trace_probe *old_tp;
385 int ret;
386
387 mutex_lock(&probe_lock);
388
389 /* register as an event */
390 old_tp = find_probe_event(tp->call.name, tp->call.system);
391 if (old_tp) {
392 /* delete old event */
393 unregister_trace_probe(old_tp);
394 free_trace_probe(old_tp);
395 }
396 ret = register_probe_event(tp);
397 if (ret) {
398		pr_warning("Failed to register probe event (%d)\n", ret);
399 goto end;
400 }
401
402 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
403 if (probe_is_return(tp))
404 ret = register_kretprobe(&tp->rp);
405 else
406 ret = register_kprobe(&tp->rp.kp);
407
408 if (ret) {
409 pr_warning("Could not insert probe(%d)\n", ret);
410 if (ret == -EILSEQ) {
411 pr_warning("Probing address(0x%p) is not an "
412 "instruction boundary.\n",
413 tp->rp.kp.addr);
414 ret = -EINVAL;
415 }
416 unregister_probe_event(tp);
417 } else
418 list_add_tail(&tp->list, &probe_list);
419end:
420 mutex_unlock(&probe_lock);
421 return ret;
422}
423
424/* Split symbol and offset. */
425static int split_symbol_offset(char *symbol, unsigned long *offset)
426{
427 char *tmp;
428 int ret;
429
430 if (!offset)
431 return -EINVAL;
432
433 tmp = strchr(symbol, '+');
434 if (tmp) {
435 /* skip sign because strict_strtol doesn't accept '+' */
436 ret = strict_strtoul(tmp + 1, 0, offset);
437 if (ret)
438 return ret;
439 *tmp = '\0';
440 } else
441 *offset = 0;
442 return 0;
443}
444
445#define PARAM_MAX_ARGS 16
446#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
447
448static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
449{
450 int ret = 0;
451 unsigned long param;
452
453 if (strcmp(arg, "retval") == 0) {
454 if (is_return) {
455 ff->func = fetch_retvalue;
456 ff->data = NULL;
457 } else
458 ret = -EINVAL;
459 } else if (strncmp(arg, "stack", 5) == 0) {
460 if (arg[5] == '\0') {
461 ff->func = fetch_stack_address;
462 ff->data = NULL;
463 } else if (isdigit(arg[5])) {
464 ret = strict_strtoul(arg + 5, 10, &param);
465 if (ret || param > PARAM_MAX_STACK)
466 ret = -EINVAL;
467 else {
468 ff->func = fetch_stack;
469 ff->data = (void *)param;
470 }
471 } else
472 ret = -EINVAL;
473 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
474 ret = strict_strtoul(arg + 3, 10, &param);
475 if (ret || param > PARAM_MAX_ARGS)
476 ret = -EINVAL;
477 else {
478 ff->func = fetch_argument;
479 ff->data = (void *)param;
480 }
481 } else
482 ret = -EINVAL;
483 return ret;
484}
485
486/* Recursive argument parser */
487static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
488{
489 int ret = 0;
490 unsigned long param;
491 long offset;
492 char *tmp;
493
494 switch (arg[0]) {
495 case '$':
496 ret = parse_probe_vars(arg + 1, ff, is_return);
497 break;
498 case '%': /* named register */
499 ret = regs_query_register_offset(arg + 1);
500 if (ret >= 0) {
501 ff->func = fetch_register;
502 ff->data = (void *)(unsigned long)ret;
503 ret = 0;
504 }
505 break;
506 case '@': /* memory or symbol */
507 if (isdigit(arg[1])) {
508 ret = strict_strtoul(arg + 1, 0, &param);
509 if (ret)
510 break;
511 ff->func = fetch_memory;
512 ff->data = (void *)param;
513 } else {
514 ret = split_symbol_offset(arg + 1, &offset);
515 if (ret)
516 break;
517 ff->data = alloc_symbol_cache(arg + 1, offset);
518 if (ff->data)
519 ff->func = fetch_symbol;
520 else
521 ret = -EINVAL;
522 }
523 break;
524 case '+': /* indirect memory */
525 case '-':
526 tmp = strchr(arg, '(');
527 if (!tmp) {
528 ret = -EINVAL;
529 break;
530 }
531 *tmp = '\0';
532 ret = strict_strtol(arg + 1, 0, &offset);
533 if (ret)
534 break;
535 if (arg[0] == '-')
536 offset = -offset;
537 arg = tmp + 1;
538 tmp = strrchr(arg, ')');
539 if (tmp) {
540 struct indirect_fetch_data *id;
541 *tmp = '\0';
542 id = kzalloc(sizeof(struct indirect_fetch_data),
543 GFP_KERNEL);
544 if (!id)
545 return -ENOMEM;
546 id->offset = offset;
547 ret = __parse_probe_arg(arg, &id->orig, is_return);
548 if (ret)
549 kfree(id);
550 else {
551 ff->func = fetch_indirect;
552 ff->data = (void *)id;
553 }
554 } else
555 ret = -EINVAL;
556 break;
557 default:
558 /* TODO: support custom handler */
559 ret = -EINVAL;
560 }
561 return ret;
562}
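/*
 * Example of the recursion above (the argument string is hypothetical):
 * parsing "+8($stack)" splits off the "+8(" prefix, recurses on "$stack"
 * (which resolves to fetch_stack_address), and wraps the result in an
 * indirect_fetch_data with offset 8, so the probe reads the word at
 * (stack pointer + 8) when it fires.
 */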
563
564/* String length checking wrapper */
565static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
566{
567 if (strlen(arg) > MAX_ARGSTR_LEN) {
568		pr_info("Argument is too long: %s\n", arg);
569 return -ENOSPC;
570 }
571 return __parse_probe_arg(arg, ff, is_return);
572}
573
574/* Return 1 if name is reserved or already used by another argument */
575static int conflict_field_name(const char *name,
576 struct probe_arg *args, int narg)
577{
578 int i;
579 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
580 if (strcmp(reserved_field_names[i], name) == 0)
581 return 1;
582 for (i = 0; i < narg; i++)
583 if (strcmp(args[i].name, name) == 0)
584 return 1;
585 return 0;
586}
587
588static int create_trace_probe(int argc, char **argv)
589{
590 /*
591 * Argument syntax:
592 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
593 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
594 * Fetch args:
595 *  $argN	: fetch the Nth function argument. (N:0-)
596 *  $retval	: fetch the return value
597 *  $stack	: fetch the stack address
598 *  $stackN	: fetch the Nth entry of the stack (N:0-)
599 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
600 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
601 * %REG : fetch register REG
602 * Indirect memory fetch:
603 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
604 * Alias name of args:
605 * NAME=FETCHARG : set NAME as alias of FETCHARG.
606 */
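	/*
	 * Illustrative definitions as written to <debugfs>/tracing/kprobe_events
	 * (the probed symbol and the argument names are made-up examples):
	 *
	 *   p:myprobe do_sys_open dfd=$arg0 filename=$arg1 flags=$arg2 mode=$arg3
	 *   r:myretprobe do_sys_open $retval
	 */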
607 struct trace_probe *tp;
608 int i, ret = 0;
609 int is_return = 0;
610 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
611 unsigned long offset = 0;
612 void *addr = NULL;
613 char buf[MAX_EVENT_NAME_LEN];
614
615 if (argc < 2) {
616 pr_info("Probe point is not specified.\n");
617 return -EINVAL;
618 }
619
620 if (argv[0][0] == 'p')
621 is_return = 0;
622 else if (argv[0][0] == 'r')
623 is_return = 1;
624 else {
625		pr_info("Probe definition must start with 'p' or 'r'.\n");
626 return -EINVAL;
627 }
628
629 if (argv[0][1] == ':') {
630 event = &argv[0][2];
631 if (strchr(event, '/')) {
632 group = event;
633 event = strchr(group, '/') + 1;
634 event[-1] = '\0';
635 if (strlen(group) == 0) {
636				pr_info("Group name is not specified\n");
637 return -EINVAL;
638 }
639 }
640 if (strlen(event) == 0) {
641			pr_info("Event name is not specified\n");
642 return -EINVAL;
643 }
644 }
645
646 if (isdigit(argv[1][0])) {
647 if (is_return) {
648 pr_info("Return probe point must be a symbol.\n");
649 return -EINVAL;
650 }
651 /* an address specified */
652		ret = strict_strtoul(argv[1], 0, (unsigned long *)&addr);
653 if (ret) {
654 pr_info("Failed to parse address.\n");
655 return ret;
656 }
657 } else {
658 /* a symbol specified */
659 symbol = argv[1];
660 /* TODO: support .init module functions */
661 ret = split_symbol_offset(symbol, &offset);
662 if (ret) {
663 pr_info("Failed to parse symbol.\n");
664 return ret;
665 }
666 if (offset && is_return) {
667 pr_info("Return probe must be used without offset.\n");
668 return -EINVAL;
669 }
670 }
671 argc -= 2; argv += 2;
672
673 /* setup a probe */
674 if (!group)
675 group = KPROBE_EVENT_SYSTEM;
676 if (!event) {
677 /* Make a new event name */
678 if (symbol)
679 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld",
680 is_return ? 'r' : 'p', symbol, offset);
681 else
682 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p",
683 is_return ? 'r' : 'p', addr);
684 event = buf;
685 }
686 tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
687 is_return);
688 if (IS_ERR(tp)) {
689		pr_info("Failed to allocate trace_probe (%d)\n",
690 (int)PTR_ERR(tp));
691 return PTR_ERR(tp);
692 }
693
694 /* parse arguments */
695 ret = 0;
696 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
697 /* Parse argument name */
698 arg = strchr(argv[i], '=');
699 if (arg)
700 *arg++ = '\0';
701 else
702 arg = argv[i];
703
704 if (conflict_field_name(argv[i], tp->args, i)) {
705 pr_info("Argument%d name '%s' conflicts with "
706 "another field.\n", i, argv[i]);
707 ret = -EINVAL;
708 goto error;
709 }
710
711 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
712 if (!tp->args[i].name) {
713 pr_info("Failed to allocate argument%d name '%s'.\n",
714 i, argv[i]);
715 ret = -ENOMEM;
716 goto error;
717 }
718
719 /* Parse fetch argument */
720 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
721 if (ret) {
722 pr_info("Parse error at argument%d. (%d)\n", i, ret);
723 kfree(tp->args[i].name);
724 goto error;
725 }
726
727 tp->nr_args++;
728 }
729
730 ret = register_trace_probe(tp);
731 if (ret)
732 goto error;
733 return 0;
734
735error:
736 free_trace_probe(tp);
737 return ret;
738}
739
740static void cleanup_all_probes(void)
741{
742 struct trace_probe *tp;
743
744 mutex_lock(&probe_lock);
745 /* TODO: Use batch unregistration */
746 while (!list_empty(&probe_list)) {
747 tp = list_entry(probe_list.next, struct trace_probe, list);
748 unregister_trace_probe(tp);
749 free_trace_probe(tp);
750 }
751 mutex_unlock(&probe_lock);
752}
753
754
755/* Probes listing interfaces */
756static void *probes_seq_start(struct seq_file *m, loff_t *pos)
757{
758 mutex_lock(&probe_lock);
759 return seq_list_start(&probe_list, *pos);
760}
761
762static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
763{
764 return seq_list_next(v, &probe_list, pos);
765}
766
767static void probes_seq_stop(struct seq_file *m, void *v)
768{
769 mutex_unlock(&probe_lock);
770}
771
772static int probes_seq_show(struct seq_file *m, void *v)
773{
774 struct trace_probe *tp = v;
775 int i, ret;
776 char buf[MAX_ARGSTR_LEN + 1];
777
778 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
779 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
780
781 if (!tp->symbol)
782 seq_printf(m, " 0x%p", tp->rp.kp.addr);
783 else if (tp->rp.kp.offset)
784 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
785 else
786 seq_printf(m, " %s", probe_symbol(tp));
787
788 for (i = 0; i < tp->nr_args; i++) {
789 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
790 if (ret < 0) {
791 pr_warning("Argument%d decoding error(%d).\n", i, ret);
792 return ret;
793 }
794 seq_printf(m, " %s=%s", tp->args[i].name, buf);
795 }
796 seq_printf(m, "\n");
797 return 0;
798}
799
800static const struct seq_operations probes_seq_op = {
801 .start = probes_seq_start,
802 .next = probes_seq_next,
803 .stop = probes_seq_stop,
804 .show = probes_seq_show
805};
806
807static int probes_open(struct inode *inode, struct file *file)
808{
809 if ((file->f_mode & FMODE_WRITE) &&
810 (file->f_flags & O_TRUNC))
811 cleanup_all_probes();
812
813 return seq_open(file, &probes_seq_op);
814}
815
816static int command_trace_probe(const char *buf)
817{
818 char **argv;
819 int argc = 0, ret = 0;
820
821 argv = argv_split(GFP_KERNEL, buf, &argc);
822 if (!argv)
823 return -ENOMEM;
824
825 if (argc)
826 ret = create_trace_probe(argc, argv);
827
828 argv_free(argv);
829 return ret;
830}
831
832#define WRITE_BUFSIZE 128
833
834static ssize_t probes_write(struct file *file, const char __user *buffer,
835 size_t count, loff_t *ppos)
836{
837 char *kbuf, *tmp;
838 int ret;
839 size_t done;
840 size_t size;
841
842 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
843 if (!kbuf)
844 return -ENOMEM;
845
846 ret = done = 0;
847 while (done < count) {
848 size = count - done;
849 if (size >= WRITE_BUFSIZE)
850 size = WRITE_BUFSIZE - 1;
851 if (copy_from_user(kbuf, buffer + done, size)) {
852 ret = -EFAULT;
853 goto out;
854 }
855 kbuf[size] = '\0';
856 tmp = strchr(kbuf, '\n');
857 if (tmp) {
858 *tmp = '\0';
859 size = tmp - kbuf + 1;
860 } else if (done + size < count) {
861			pr_warning("Line is too long: "
862				   "should be less than %d.", WRITE_BUFSIZE);
863 ret = -EINVAL;
864 goto out;
865 }
866 done += size;
867 /* Remove comments */
868 tmp = strchr(kbuf, '#');
869 if (tmp)
870 *tmp = '\0';
871
872 ret = command_trace_probe(kbuf);
873 if (ret)
874 goto out;
875 }
876 ret = done;
877out:
878 kfree(kbuf);
879 return ret;
880}
881
882static const struct file_operations kprobe_events_ops = {
883 .owner = THIS_MODULE,
884 .open = probes_open,
885 .read = seq_read,
886 .llseek = seq_lseek,
887 .release = seq_release,
888 .write = probes_write,
889};
890
891/* Probes profiling interfaces */
892static int probes_profile_seq_show(struct seq_file *m, void *v)
893{
894 struct trace_probe *tp = v;
895
896 seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
897 tp->rp.kp.nmissed);
898
899 return 0;
900}
901
902static const struct seq_operations profile_seq_op = {
903 .start = probes_seq_start,
904 .next = probes_seq_next,
905 .stop = probes_seq_stop,
906 .show = probes_profile_seq_show
907};
908
909static int profile_open(struct inode *inode, struct file *file)
910{
911 return seq_open(file, &profile_seq_op);
912}
913
914static const struct file_operations kprobe_profile_ops = {
915 .owner = THIS_MODULE,
916 .open = profile_open,
917 .read = seq_read,
918 .llseek = seq_lseek,
919 .release = seq_release,
920};
921
922/* Kprobe handler */
923static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
924{
925 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
926 struct kprobe_trace_entry *entry;
927 struct ring_buffer_event *event;
928 struct ring_buffer *buffer;
929 int size, i, pc;
930 unsigned long irq_flags;
931 struct ftrace_event_call *call = &tp->call;
932
933 tp->nhit++;
934
935 local_save_flags(irq_flags);
936 pc = preempt_count();
937
938 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
939
940 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
941 irq_flags, pc);
942 if (!event)
943 return 0;
944
945 entry = ring_buffer_event_data(event);
946 entry->nargs = tp->nr_args;
947 entry->ip = (unsigned long)kp->addr;
948 for (i = 0; i < tp->nr_args; i++)
949 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
950
951 if (!filter_current_check_discard(buffer, call, entry, event))
952 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
953 return 0;
954}
955
956/* Kretprobe handler */
957static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
958 struct pt_regs *regs)
959{
960 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
961 struct kretprobe_trace_entry *entry;
962 struct ring_buffer_event *event;
963 struct ring_buffer *buffer;
964 int size, i, pc;
965 unsigned long irq_flags;
966 struct ftrace_event_call *call = &tp->call;
967
968 local_save_flags(irq_flags);
969 pc = preempt_count();
970
971 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
972
973 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
974 irq_flags, pc);
975 if (!event)
976 return 0;
977
978 entry = ring_buffer_event_data(event);
979 entry->nargs = tp->nr_args;
980 entry->func = (unsigned long)tp->rp.kp.addr;
981 entry->ret_ip = (unsigned long)ri->ret_addr;
982 for (i = 0; i < tp->nr_args; i++)
983 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
984
985 if (!filter_current_check_discard(buffer, call, entry, event))
986 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
987
988 return 0;
989}
990
991/* Event entry printers */
992enum print_line_t
993print_kprobe_event(struct trace_iterator *iter, int flags)
994{
995 struct kprobe_trace_entry *field;
996 struct trace_seq *s = &iter->seq;
997 struct trace_event *event;
998 struct trace_probe *tp;
999 int i;
1000
1001 field = (struct kprobe_trace_entry *)iter->ent;
1002 event = ftrace_find_event(field->ent.type);
1003 tp = container_of(event, struct trace_probe, event);
1004
1005 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1006 goto partial;
1007
1008 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
1009 goto partial;
1010
1011 if (!trace_seq_puts(s, ")"))
1012 goto partial;
1013
1014 for (i = 0; i < field->nargs; i++)
1015 if (!trace_seq_printf(s, " %s=%lx",
1016 tp->args[i].name, field->args[i]))
1017 goto partial;
1018
1019 if (!trace_seq_puts(s, "\n"))
1020 goto partial;
1021
1022 return TRACE_TYPE_HANDLED;
1023partial:
1024 return TRACE_TYPE_PARTIAL_LINE;
1025}
1026
1027enum print_line_t
1028print_kretprobe_event(struct trace_iterator *iter, int flags)
1029{
1030 struct kretprobe_trace_entry *field;
1031 struct trace_seq *s = &iter->seq;
1032 struct trace_event *event;
1033 struct trace_probe *tp;
1034 int i;
1035
1036 field = (struct kretprobe_trace_entry *)iter->ent;
1037 event = ftrace_find_event(field->ent.type);
1038 tp = container_of(event, struct trace_probe, event);
1039
1040 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1041 goto partial;
1042
1043 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
1044 goto partial;
1045
1046 if (!trace_seq_puts(s, " <- "))
1047 goto partial;
1048
1049 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
1050 goto partial;
1051
1052 if (!trace_seq_puts(s, ")"))
1053 goto partial;
1054
1055 for (i = 0; i < field->nargs; i++)
1056 if (!trace_seq_printf(s, " %s=%lx",
1057 tp->args[i].name, field->args[i]))
1058 goto partial;
1059
1060 if (!trace_seq_puts(s, "\n"))
1061 goto partial;
1062
1063 return TRACE_TYPE_HANDLED;
1064partial:
1065 return TRACE_TYPE_PARTIAL_LINE;
1066}
1067
1068static int probe_event_enable(struct ftrace_event_call *call)
1069{
1070 struct trace_probe *tp = (struct trace_probe *)call->data;
1071
1072 tp->flags |= TP_FLAG_TRACE;
1073 if (probe_is_return(tp))
1074 return enable_kretprobe(&tp->rp);
1075 else
1076 return enable_kprobe(&tp->rp.kp);
1077}
1078
1079static void probe_event_disable(struct ftrace_event_call *call)
1080{
1081 struct trace_probe *tp = (struct trace_probe *)call->data;
1082
1083 tp->flags &= ~TP_FLAG_TRACE;
1084 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1085 if (probe_is_return(tp))
1086 disable_kretprobe(&tp->rp);
1087 else
1088 disable_kprobe(&tp->rp.kp);
1089 }
1090}
1091
1092static int probe_event_raw_init(struct ftrace_event_call *event_call)
1093{
1094 INIT_LIST_HEAD(&event_call->fields);
1095
1096 return 0;
1097}
1098
1099#undef DEFINE_FIELD
1100#define DEFINE_FIELD(type, item, name, is_signed) \
1101 do { \
1102 ret = trace_define_field(event_call, #type, name, \
1103 offsetof(typeof(field), item), \
1104 sizeof(field.item), is_signed, \
1105 FILTER_OTHER); \
1106 if (ret) \
1107 return ret; \
1108 } while (0)
1109
1110static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1111{
1112 int ret, i;
1113 struct kprobe_trace_entry field;
1114 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1115
1116 ret = trace_define_common_fields(event_call);
1117	if (ret)
1118 return ret;
1119
1120 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1121 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1122 /* Set argument names as fields */
1123 for (i = 0; i < tp->nr_args; i++)
1124 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1125 return 0;
1126}
1127
1128static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1129{
1130 int ret, i;
1131 struct kretprobe_trace_entry field;
1132 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1133
1134 ret = trace_define_common_fields(event_call);
1135	if (ret)
1136 return ret;
1137
1138 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1139 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1140 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1141 /* Set argument names as fields */
1142 for (i = 0; i < tp->nr_args; i++)
1143 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1144 return 0;
1145}
1146
1147static int __probe_event_show_format(struct trace_seq *s,
1148 struct trace_probe *tp, const char *fmt,
1149 const char *arg)
1150{
1151 int i;
1152
1153 /* Show format */
1154 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1155 return 0;
1156
1157 for (i = 0; i < tp->nr_args; i++)
1158 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name))
1159 return 0;
1160
1161 if (!trace_seq_printf(s, "\", %s", arg))
1162 return 0;
1163
1164 for (i = 0; i < tp->nr_args; i++)
1165 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1166 return 0;
1167
1168 return trace_seq_puts(s, "\n");
1169}
1170
1171#undef SHOW_FIELD
1172#define SHOW_FIELD(type, item, name) \
1173 do { \
1174 ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \
1175 "offset:%u;\tsize:%u;\n", name, \
1176 (unsigned int)offsetof(typeof(field), item),\
1177 (unsigned int)sizeof(type)); \
1178 if (!ret) \
1179 return 0; \
1180 } while (0)
1181
1182static int kprobe_event_show_format(struct ftrace_event_call *call,
1183 struct trace_seq *s)
1184{
1185 struct kprobe_trace_entry field __attribute__((unused));
1186 int ret, i;
1187 struct trace_probe *tp = (struct trace_probe *)call->data;
1188
1189 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
1190 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1191
1192 /* Show fields */
1193 for (i = 0; i < tp->nr_args; i++)
1194 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1195 trace_seq_puts(s, "\n");
1196
1197 return __probe_event_show_format(s, tp, "(%lx)",
1198 "REC->" FIELD_STRING_IP);
1199}
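/*
 * For a hypothetical probe "p:myprobe do_sys_open dfd=$arg0", the helpers
 * above would emit roughly the following (a sketch; offsets and sizes are
 * elided as <N>):
 *
 *	field: unsigned long __probe_ip;	offset:<N>;	size:<N>;
 *	field: int __probe_nargs;	offset:<N>;	size:<N>;
 *	field: unsigned long dfd;	offset:<N>;	size:<N>;
 *
 *	print fmt: "(%lx) dfd=%lx", REC->__probe_ip, REC->dfd
 */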
1200
1201static int kretprobe_event_show_format(struct ftrace_event_call *call,
1202 struct trace_seq *s)
1203{
1204 struct kretprobe_trace_entry field __attribute__((unused));
1205 int ret, i;
1206 struct trace_probe *tp = (struct trace_probe *)call->data;
1207
1208 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC);
1209 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP);
1210 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1211
1212 /* Show fields */
1213 for (i = 0; i < tp->nr_args; i++)
1214 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1215 trace_seq_puts(s, "\n");
1216
1217 return __probe_event_show_format(s, tp, "(%lx <- %lx)",
1218 "REC->" FIELD_STRING_FUNC
1219 ", REC->" FIELD_STRING_RETIP);
1220}
1221
1222#ifdef CONFIG_EVENT_PROFILE
1223
1224/* Kprobe profile handler */
1225static __kprobes int kprobe_profile_func(struct kprobe *kp,
1226 struct pt_regs *regs)
1227{
1228 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1229 struct ftrace_event_call *call = &tp->call;
1230 struct kprobe_trace_entry *entry;
1231 struct trace_entry *ent;
1232 int size, __size, i, pc, __cpu;
1233 unsigned long irq_flags;
1234 char *trace_buf;
1235 char *raw_data;
1236 int rctx;
1237
1238 pc = preempt_count();
1239 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1240 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1241 size -= sizeof(u32);
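	/*
	 * Worked example (the 30-byte figure is made up): a 30-byte raw entry
	 * gives ALIGN(30 + 4, 8) = 40 and size = 36, so the record stays
	 * u64-aligned once a u32 size header is accounted for.
	 */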
1242 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1243 "profile buffer not large enough"))
1244 return 0;
1245
1246 /*
1247 * Protect the non nmi buffer
1248 * This also protects the rcu read side
1249 */
1250 local_irq_save(irq_flags);
1251
1252 rctx = perf_swevent_get_recursion_context();
1253 if (rctx < 0)
1254 goto end_recursion;
1255
1256 __cpu = smp_processor_id();
1257
1258 if (in_nmi())
1259 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1260 else
1261 trace_buf = rcu_dereference(perf_trace_buf);
1262
1263 if (!trace_buf)
1264 goto end;
1265
1266 raw_data = per_cpu_ptr(trace_buf, __cpu);
1267
1268 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1269 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1270 entry = (struct kprobe_trace_entry *)raw_data;
1271 ent = &entry->ent;
1272
1273 tracing_generic_entry_update(ent, irq_flags, pc);
1274 ent->type = call->id;
1275 entry->nargs = tp->nr_args;
1276 entry->ip = (unsigned long)kp->addr;
1277 for (i = 0; i < tp->nr_args; i++)
1278 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1279 perf_tp_event(call->id, entry->ip, 1, entry, size);
1280
1281end:
1282 perf_swevent_put_recursion_context(rctx);
1283end_recursion:
1284 local_irq_restore(irq_flags);
1285
1286 return 0;
1287}
1288
1289/* Kretprobe profile handler */
1290static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1291 struct pt_regs *regs)
1292{
1293 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1294 struct ftrace_event_call *call = &tp->call;
1295 struct kretprobe_trace_entry *entry;
1296 struct trace_entry *ent;
1297 int size, __size, i, pc, __cpu;
1298 unsigned long irq_flags;
1299 char *trace_buf;
1300 char *raw_data;
1301 int rctx;
1302
1303 pc = preempt_count();
1304 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1305 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1306 size -= sizeof(u32);
1307 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1308 "profile buffer not large enough"))
1309 return 0;
1310
1311 /*
1312 * Protect the non nmi buffer
1313 * This also protects the rcu read side
1314 */
1315 local_irq_save(irq_flags);
1316
1317 rctx = perf_swevent_get_recursion_context();
1318 if (rctx < 0)
1319 goto end_recursion;
1320
1321 __cpu = smp_processor_id();
1322
1323 if (in_nmi())
1324 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1325 else
1326 trace_buf = rcu_dereference(perf_trace_buf);
1327
1328 if (!trace_buf)
1329 goto end;
1330
1331 raw_data = per_cpu_ptr(trace_buf, __cpu);
1332
1333 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1334 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1335 entry = (struct kretprobe_trace_entry *)raw_data;
1336 ent = &entry->ent;
1337
1338 tracing_generic_entry_update(ent, irq_flags, pc);
1339 ent->type = call->id;
1340 entry->nargs = tp->nr_args;
1341 entry->func = (unsigned long)tp->rp.kp.addr;
1342 entry->ret_ip = (unsigned long)ri->ret_addr;
1343 for (i = 0; i < tp->nr_args; i++)
1344 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1345 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1346
1347end:
1348 perf_swevent_put_recursion_context(rctx);
1349end_recursion:
1350 local_irq_restore(irq_flags);
1351
1352 return 0;
1353}
1354
1355static int probe_profile_enable(struct ftrace_event_call *call)
1356{
1357 struct trace_probe *tp = (struct trace_probe *)call->data;
1358
1359 tp->flags |= TP_FLAG_PROFILE;
1360
1361 if (probe_is_return(tp))
1362 return enable_kretprobe(&tp->rp);
1363 else
1364 return enable_kprobe(&tp->rp.kp);
1365}
1366
1367static void probe_profile_disable(struct ftrace_event_call *call)
1368{
1369 struct trace_probe *tp = (struct trace_probe *)call->data;
1370
1371 tp->flags &= ~TP_FLAG_PROFILE;
1372
1373 if (!(tp->flags & TP_FLAG_TRACE)) {
1374 if (probe_is_return(tp))
1375 disable_kretprobe(&tp->rp);
1376 else
1377 disable_kprobe(&tp->rp.kp);
1378 }
1379}
1380#endif /* CONFIG_EVENT_PROFILE */
1381
1382
1383static __kprobes
1384int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1385{
1386 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1387
1388 if (tp->flags & TP_FLAG_TRACE)
1389 kprobe_trace_func(kp, regs);
1390#ifdef CONFIG_EVENT_PROFILE
1391 if (tp->flags & TP_FLAG_PROFILE)
1392 kprobe_profile_func(kp, regs);
1393#endif /* CONFIG_EVENT_PROFILE */
1394 return 0; /* We don't tweek kernel, so just return 0 */
1395}
1396
1397static __kprobes
1398int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1399{
1400 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1401
1402 if (tp->flags & TP_FLAG_TRACE)
1403 kretprobe_trace_func(ri, regs);
1404#ifdef CONFIG_EVENT_PROFILE
1405 if (tp->flags & TP_FLAG_PROFILE)
1406 kretprobe_profile_func(ri, regs);
1407#endif /* CONFIG_EVENT_PROFILE */
1408 return 0; /* We don't tweek kernel, so just return 0 */
1409}
1410
1411static int register_probe_event(struct trace_probe *tp)
1412{
1413 struct ftrace_event_call *call = &tp->call;
1414 int ret;
1415
1416 /* Initialize ftrace_event_call */
1417 if (probe_is_return(tp)) {
1418 tp->event.trace = print_kretprobe_event;
1419 call->raw_init = probe_event_raw_init;
1420 call->show_format = kretprobe_event_show_format;
1421 call->define_fields = kretprobe_event_define_fields;
1422 } else {
1423 tp->event.trace = print_kprobe_event;
1424 call->raw_init = probe_event_raw_init;
1425 call->show_format = kprobe_event_show_format;
1426 call->define_fields = kprobe_event_define_fields;
1427 }
1428 call->event = &tp->event;
1429 call->id = register_ftrace_event(&tp->event);
1430 if (!call->id)
1431 return -ENODEV;
1432 call->enabled = 0;
1433 call->regfunc = probe_event_enable;
1434 call->unregfunc = probe_event_disable;
1435
1436#ifdef CONFIG_EVENT_PROFILE
1437 atomic_set(&call->profile_count, -1);
1438 call->profile_enable = probe_profile_enable;
1439 call->profile_disable = probe_profile_disable;
1440#endif
1441 call->data = tp;
1442 ret = trace_add_event_call(call);
1443 if (ret) {
1444 pr_info("Failed to register kprobe event: %s\n", call->name);
1445 unregister_ftrace_event(&tp->event);
1446 }
1447 return ret;
1448}
1449
1450static void unregister_probe_event(struct trace_probe *tp)
1451{
1452 /* tp->event is unregistered in trace_remove_event_call() */
1453 trace_remove_event_call(&tp->call);
1454}
1455
1456/* Make a debugfs interface for controlling probe points */
1457static __init int init_kprobe_trace(void)
1458{
1459 struct dentry *d_tracer;
1460 struct dentry *entry;
1461
1462 d_tracer = tracing_init_dentry();
1463 if (!d_tracer)
1464 return 0;
1465
1466 entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
1467 NULL, &kprobe_events_ops);
1468
1469 /* Event list interface */
1470 if (!entry)
1471 pr_warning("Could not create debugfs "
1472 "'kprobe_events' entry\n");
1473
1474 /* Profile interface */
1475 entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
1476 NULL, &kprobe_profile_ops);
1477
1478 if (!entry)
1479 pr_warning("Could not create debugfs "
1480 "'kprobe_profile' entry\n");
1481 return 0;
1482}
1483fs_initcall(init_kprobe_trace);
1484
1485
1486#ifdef CONFIG_FTRACE_STARTUP_TEST
1487
1488static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1489 int a4, int a5, int a6)
1490{
1491 return a1 + a2 + a3 + a4 + a5 + a6;
1492}
1493
1494static __init int kprobe_trace_self_tests_init(void)
1495{
1496 int ret;
1497 int (*target)(int, int, int, int, int, int);
1498
1499 target = kprobe_trace_selftest_target;
1500
1501 pr_info("Testing kprobe tracing: ");
1502
1503 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1504 "$arg1 $arg2 $arg3 $arg4 $stack $stack0");
1505 if (WARN_ON_ONCE(ret))
1506 pr_warning("error enabling function entry\n");
1507
1508 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1509 "$retval");
1510 if (WARN_ON_ONCE(ret))
1511 pr_warning("error enabling function return\n");
1512
1513 ret = target(1, 2, 3, 4, 5, 6);
1514
1515 cleanup_all_probes();
1516
1517 pr_cont("OK\n");
1518 return 0;
1519}
1520
1521late_initcall(kprobe_trace_self_tests_init);
1522
1523#endif
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
new file mode 100644
index 000000000000..ddfa0fd43bc0
--- /dev/null
+++ b/kernel/trace/trace_ksym.c
@@ -0,0 +1,550 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/fs.h>
27
28#include "trace_output.h"
29#include "trace_stat.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35/*
36 * For now, restrict the number of symbols traced simultaneously to the number
37 * of available hardware breakpoint registers.
38 */
39#define KSYM_TRACER_MAX HBP_NUM
40
41#define KSYM_TRACER_OP_LEN 3 /* rw- */
42
43struct trace_ksym {
44 struct perf_event **ksym_hbp;
45 struct perf_event_attr attr;
46#ifdef CONFIG_PROFILE_KSYM_TRACER
47 unsigned long counter;
48#endif
49 struct hlist_node ksym_hlist;
50};
51
52static struct trace_array *ksym_trace_array;
53
54static unsigned int ksym_filter_entry_count;
55static unsigned int ksym_tracing_enabled;
56
57static HLIST_HEAD(ksym_filter_head);
58
59static DEFINE_MUTEX(ksym_tracer_mutex);
60
61#ifdef CONFIG_PROFILE_KSYM_TRACER
62
63#define MAX_UL_INT 0xffffffff
64
65void ksym_collect_stats(unsigned long hbp_hit_addr)
66{
67 struct hlist_node *node;
68 struct trace_ksym *entry;
69
70 rcu_read_lock();
71 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
72 if ((entry->attr.bp_addr == hbp_hit_addr) &&
73 (entry->counter <= MAX_UL_INT)) {
74 entry->counter++;
75 break;
76 }
77 }
78 rcu_read_unlock();
79}
80#endif /* CONFIG_PROFILE_KSYM_TRACER */
81
82void ksym_hbp_handler(struct perf_event *hbp, void *data)
83{
84 struct ring_buffer_event *event;
85 struct ksym_trace_entry *entry;
86 struct pt_regs *regs = data;
87 struct ring_buffer *buffer;
88 int pc;
89
90 if (!ksym_tracing_enabled)
91 return;
92
93 buffer = ksym_trace_array->buffer;
94
95 pc = preempt_count();
96
97 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
98 sizeof(*entry), 0, pc);
99 if (!event)
100 return;
101
102 entry = ring_buffer_event_data(event);
103 entry->ip = instruction_pointer(regs);
104 entry->type = hw_breakpoint_type(hbp);
105 entry->addr = hw_breakpoint_addr(hbp);
106 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
107
108#ifdef CONFIG_PROFILE_KSYM_TRACER
109 ksym_collect_stats(hw_breakpoint_addr(hbp));
110#endif /* CONFIG_PROFILE_KSYM_TRACER */
111
112 trace_buffer_unlock_commit(buffer, event, 0, pc);
113}
114
115/* Valid access types are represented as
116 *
117 * rw- : Set Read/Write Access Breakpoint
118 * -w- : Set Write Access Breakpoint
119 * --- : Clear Breakpoints
120 * --x : Set Execution Breakpoint (Not available yet)
121 *
122 */
123static int ksym_trace_get_access_type(char *str)
124{
125 int access = 0;
126
127 if (str[0] == 'r')
128 access |= HW_BREAKPOINT_R;
129
130 if (str[1] == 'w')
131 access |= HW_BREAKPOINT_W;
132
133 if (str[2] == 'x')
134 access |= HW_BREAKPOINT_X;
135
136 switch (access) {
137 case HW_BREAKPOINT_R:
138 case HW_BREAKPOINT_W:
139 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
140 return access;
141 default:
142 return -EINVAL;
143 }
144}
145
146/*
147 * There can be several possible malformed requests and we attempt to capture
148 * all of them. We enumerate some of the rules:
149 * 1. We will not allow kernel symbols containing ':', since it is used as the
150 *    delimiter, i.e. multiple ':' symbols are disallowed. Possible uses are of
151 *    the form <module>:<ksym_name>:<op>.
152 * 2. No delimiter symbol ':' in the input string
153 * 3. Spurious operator symbols or symbols not in their respective positions
154 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
155 * 5. Kernel symbol not a part of /proc/kallsyms
156 * 6. Duplicate requests
157 */
158static int parse_ksym_trace_str(char *input_string, char **ksymname,
159 unsigned long *addr)
160{
161 int ret;
162
163 *ksymname = strsep(&input_string, ":");
164 *addr = kallsyms_lookup_name(*ksymname);
165
166 /* Check for malformed request: (2), (1) and (5) */
167 if ((!input_string) ||
168 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
169 (*addr == 0))
170 return -EINVAL;
171
172 ret = ksym_trace_get_access_type(input_string);
173
174 return ret;
175}
176
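
For illustration only (not from this patch): the <ksym_name>:<op> format parsed above can be exercised from user space by writing to the ksym_trace_filter file that is registered later in this file. The symbol name and the debugfs mount point below are assumptions.

/*
 * Sketch: request a read/write hardware breakpoint on a kernel symbol,
 * then clear all breakpoints again.  "pid_max" is only an example symbol.
 */
#include <stdio.h>

static int write_filter(const char *cmd)
{
        FILE *f = fopen("/sys/kernel/debug/tracing/ksym_trace_filter", "w");

        if (!f)
                return -1;
        fprintf(f, "%s\n", cmd);
        return fclose(f);
}

int main(void)
{
        /* <ksym_name>:<op>, where <op> is rw-, -w- or --- */
        if (write_filter("pid_max:rw-"))
                return 1;
        /* clear every registered breakpoint */
        write_filter("*:---");
        return 0;
}
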
177int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
178{
179 struct trace_ksym *entry;
180 int ret = -ENOMEM;
181
182 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
183 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
184 " new requests for tracing can be accepted now.\n",
185 KSYM_TRACER_MAX);
186 return -ENOSPC;
187 }
188
189 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
190 if (!entry)
191 return -ENOMEM;
192
193 hw_breakpoint_init(&entry->attr);
194
195 entry->attr.bp_type = op;
196 entry->attr.bp_addr = addr;
197 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
198
199 ret = -EAGAIN;
200 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
201 ksym_hbp_handler);
202
203 if (IS_ERR(entry->ksym_hbp)) {
204 ret = PTR_ERR(entry->ksym_hbp);
205 printk(KERN_INFO "ksym_tracer request failed. Try again"
206 " later!!\n");
207 goto err;
208 }
209
210 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
211 ksym_filter_entry_count++;
212
213 return 0;
214
215err:
216 kfree(entry);
217
218 return ret;
219}
220
221static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
222 size_t count, loff_t *ppos)
223{
224 struct trace_ksym *entry;
225 struct hlist_node *node;
226 struct trace_seq *s;
227 ssize_t cnt = 0;
228 int ret;
229
230 s = kmalloc(sizeof(*s), GFP_KERNEL);
231 if (!s)
232 return -ENOMEM;
233 trace_seq_init(s);
234
235 mutex_lock(&ksym_tracer_mutex);
236
237 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
238 ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr);
239 if (entry->attr.bp_type == HW_BREAKPOINT_R)
240 ret = trace_seq_puts(s, "r--\n");
241 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
242 ret = trace_seq_puts(s, "-w-\n");
243 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
244 ret = trace_seq_puts(s, "rw-\n");
245 WARN_ON_ONCE(!ret);
246 }
247
248 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
249
250 mutex_unlock(&ksym_tracer_mutex);
251
252 kfree(s);
253
254 return cnt;
255}
256
257static void __ksym_trace_reset(void)
258{
259 struct trace_ksym *entry;
260 struct hlist_node *node, *node1;
261
262 mutex_lock(&ksym_tracer_mutex);
263 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
264 ksym_hlist) {
265 unregister_wide_hw_breakpoint(entry->ksym_hbp);
266 ksym_filter_entry_count--;
267 hlist_del_rcu(&(entry->ksym_hlist));
268 synchronize_rcu();
269 kfree(entry);
270 }
271 mutex_unlock(&ksym_tracer_mutex);
272}
273
274static ssize_t ksym_trace_filter_write(struct file *file,
275 const char __user *buffer,
276 size_t count, loff_t *ppos)
277{
278 struct trace_ksym *entry;
279 struct hlist_node *node;
280 char *input_string, *ksymname = NULL;
281 unsigned long ksym_addr = 0;
282 int ret, op, changed = 0;
283
284 input_string = kzalloc(count + 1, GFP_KERNEL);
285 if (!input_string)
286 return -ENOMEM;
287
288 if (copy_from_user(input_string, buffer, count)) {
289 kfree(input_string);
290 return -EFAULT;
291 }
292 input_string[count] = '\0';
293
294 strstrip(input_string);
295
296 /*
297 * Clear all breakpoints if:
298 * 1: echo > ksym_trace_filter
299 * 2: echo 0 > ksym_trace_filter
300 * 3: echo "*:---" > ksym_trace_filter
301 */
302 if (!input_string[0] || !strcmp(input_string, "0") ||
303 !strcmp(input_string, "*:---")) {
304 __ksym_trace_reset();
305 kfree(input_string);
306 return count;
307 }
308
309 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
310 if (ret < 0) {
311 kfree(input_string);
312 return ret;
313 }
314
315 mutex_lock(&ksym_tracer_mutex);
316
317 ret = -EINVAL;
318 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
319 if (entry->attr.bp_addr == ksym_addr) {
320 /* Check for malformed request: (6) */
321 if (entry->attr.bp_type != op)
322 changed = 1;
323 else
324 goto out;
325 break;
326 }
327 }
328 if (changed) {
329 unregister_wide_hw_breakpoint(entry->ksym_hbp);
330 entry->attr.bp_type = op;
331 ret = 0;
332 if (op > 0) {
333 entry->ksym_hbp =
334 register_wide_hw_breakpoint(&entry->attr,
335 ksym_hbp_handler);
336 if (IS_ERR(entry->ksym_hbp))
337 ret = PTR_ERR(entry->ksym_hbp);
338 else
339 goto out;
340 }
341 /* Error or "symbol:---" case: drop it */
342 ksym_filter_entry_count--;
343 hlist_del_rcu(&(entry->ksym_hlist));
344 synchronize_rcu();
345 kfree(entry);
346 goto out;
347 } else {
348 /* Check for malformed request: (4) */
349 if (op == 0)
350 goto out;
351 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
352 }
353out:
354 mutex_unlock(&ksym_tracer_mutex);
355
356 kfree(input_string);
357
358 if (!ret)
359 ret = count;
360 return ret;
361}
362
363static const struct file_operations ksym_tracing_fops = {
364 .open = tracing_open_generic,
365 .read = ksym_trace_filter_read,
366 .write = ksym_trace_filter_write,
367};
368
369static void ksym_trace_reset(struct trace_array *tr)
370{
371 ksym_tracing_enabled = 0;
372 __ksym_trace_reset();
373}
374
375static int ksym_trace_init(struct trace_array *tr)
376{
377 int cpu, ret = 0;
378
379 for_each_online_cpu(cpu)
380 tracing_reset(tr, cpu);
381 ksym_tracing_enabled = 1;
382 ksym_trace_array = tr;
383
384 return ret;
385}
386
387static void ksym_trace_print_header(struct seq_file *m)
388{
389 seq_puts(m,
390 "# TASK-PID CPU# Symbol "
391 "Type Function\n");
392 seq_puts(m,
393 "# | | | "
394 " | |\n");
395}
396
397static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
398{
399 struct trace_entry *entry = iter->ent;
400 struct trace_seq *s = &iter->seq;
401 struct ksym_trace_entry *field;
402 char str[KSYM_SYMBOL_LEN];
403 int ret;
404
405 if (entry->type != TRACE_KSYM)
406 return TRACE_TYPE_UNHANDLED;
407
408 trace_assign_type(field, entry);
409
410 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
411 entry->pid, iter->cpu, (char *)field->addr);
412 if (!ret)
413 return TRACE_TYPE_PARTIAL_LINE;
414
415 switch (field->type) {
416 case HW_BREAKPOINT_R:
417 ret = trace_seq_printf(s, " R ");
418 break;
419 case HW_BREAKPOINT_W:
420 ret = trace_seq_printf(s, " W ");
421 break;
422 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
423 ret = trace_seq_printf(s, " RW ");
424 break;
425 default:
426 return TRACE_TYPE_PARTIAL_LINE;
427 }
428
429 if (!ret)
430 return TRACE_TYPE_PARTIAL_LINE;
431
432 sprint_symbol(str, field->ip);
433 ret = trace_seq_printf(s, "%s\n", str);
434 if (!ret)
435 return TRACE_TYPE_PARTIAL_LINE;
436
437 return TRACE_TYPE_HANDLED;
438}
439
440struct tracer ksym_tracer __read_mostly =
441{
442 .name = "ksym_tracer",
443 .init = ksym_trace_init,
444 .reset = ksym_trace_reset,
445#ifdef CONFIG_FTRACE_SELFTEST
446 .selftest = trace_selftest_startup_ksym,
447#endif
448 .print_header = ksym_trace_print_header,
449 .print_line = ksym_trace_output
450};
451
452__init static int init_ksym_trace(void)
453{
454 struct dentry *d_tracer;
455 struct dentry *entry;
456
457 d_tracer = tracing_init_dentry();
458 ksym_filter_entry_count = 0;
459
460 entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
461 NULL, &ksym_tracing_fops);
462 if (!entry)
463 pr_warning("Could not create debugfs "
464 "'ksym_trace_filter' file\n");
465
466 return register_tracer(&ksym_tracer);
467}
468device_initcall(init_ksym_trace);
469
470
471#ifdef CONFIG_PROFILE_KSYM_TRACER
472static int ksym_tracer_stat_headers(struct seq_file *m)
473{
474 seq_puts(m, " Access Type ");
475 seq_puts(m, " Symbol Counter\n");
476 seq_puts(m, " ----------- ");
477 seq_puts(m, " ------ -------\n");
478 return 0;
479}
480
481static int ksym_tracer_stat_show(struct seq_file *m, void *v)
482{
483 struct hlist_node *stat = v;
484 struct trace_ksym *entry;
485 int access_type = 0;
486 char fn_name[KSYM_NAME_LEN];
487
488 entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
489
490 access_type = entry->attr.bp_type;
491
492 switch (access_type) {
493 case HW_BREAKPOINT_R:
494 seq_puts(m, " R ");
495 break;
496 case HW_BREAKPOINT_W:
497 seq_puts(m, " W ");
498 break;
499 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
500 seq_puts(m, " RW ");
501 break;
502 default:
503 seq_puts(m, " NA ");
504 }
505
506 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
507 seq_printf(m, " %-36s", fn_name);
508 else
509 seq_printf(m, " %-36s", "<NA>");
510 seq_printf(m, " %15lu\n", entry->counter);
511
512 return 0;
513}
514
515static void *ksym_tracer_stat_start(struct tracer_stat *trace)
516{
517 return ksym_filter_head.first;
518}
519
520static void *
521ksym_tracer_stat_next(void *v, int idx)
522{
523 struct hlist_node *stat = v;
524
525 return stat->next;
526}
527
528static struct tracer_stat ksym_tracer_stats = {
529 .name = "ksym_tracer",
530 .stat_start = ksym_tracer_stat_start,
531 .stat_next = ksym_tracer_stat_next,
532 .stat_headers = ksym_tracer_stat_headers,
533 .stat_show = ksym_tracer_stat_show
534};
535
536__init static int ksym_tracer_stat_init(void)
537{
538 int ret;
539
540 ret = register_stat_tracer(&ksym_tracer_stats);
541 if (ret) {
542 printk(KERN_WARNING "Warning: could not register "
543 "ksym tracer stats\n");
544 return 1;
545 }
546
547 return 0;
548}
549fs_initcall(ksym_tracer_stat_init);
550#endif /* CONFIG_PROFILE_KSYM_TRACER */
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index c4c9bbda53d3..0acd834659ed 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,6 +307,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
307 struct trace_array_cpu *data, 307 struct trace_array_cpu *data,
308 struct mmiotrace_rw *rw) 308 struct mmiotrace_rw *rw)
309{ 309{
310 struct ftrace_event_call *call = &event_mmiotrace_rw;
310 struct ring_buffer *buffer = tr->buffer; 311 struct ring_buffer *buffer = tr->buffer;
311 struct ring_buffer_event *event; 312 struct ring_buffer_event *event;
312 struct trace_mmiotrace_rw *entry; 313 struct trace_mmiotrace_rw *entry;
@@ -320,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
320 } 321 }
321 entry = ring_buffer_event_data(event); 322 entry = ring_buffer_event_data(event);
322 entry->rw = *rw; 323 entry->rw = *rw;
323 trace_buffer_unlock_commit(buffer, event, 0, pc); 324
325 if (!filter_check_discard(call, entry, buffer, event))
326 trace_buffer_unlock_commit(buffer, event, 0, pc);
324} 327}
325 328
326void mmio_trace_rw(struct mmiotrace_rw *rw) 329void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -334,6 +337,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
334 struct trace_array_cpu *data, 337 struct trace_array_cpu *data,
335 struct mmiotrace_map *map) 338 struct mmiotrace_map *map)
336{ 339{
340 struct ftrace_event_call *call = &event_mmiotrace_map;
337 struct ring_buffer *buffer = tr->buffer; 341 struct ring_buffer *buffer = tr->buffer;
338 struct ring_buffer_event *event; 342 struct ring_buffer_event *event;
339 struct trace_mmiotrace_map *entry; 343 struct trace_mmiotrace_map *entry;
@@ -347,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
347 } 351 }
348 entry = ring_buffer_event_data(event); 352 entry = ring_buffer_event_data(event);
349 entry->map = *map; 353 entry->map = *map;
350 trace_buffer_unlock_commit(buffer, event, 0, pc); 354
355 if (!filter_check_discard(call, entry, buffer, event))
356 trace_buffer_unlock_commit(buffer, event, 0, pc);
351} 357}
352 358
353void mmio_trace_mapping(struct mmiotrace_map *map) 359void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e0c2545622e8..b6c12c6a1bcd 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -69,6 +69,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
69 * @s: trace sequence descriptor 69 * @s: trace sequence descriptor
70 * @fmt: printf format string 70 * @fmt: printf format string
71 * 71 *
 72 * It returns 0 if the output exceeds the buffer's free
 73 * space, and 1 otherwise.
74 *
72 * The tracer may use either sequence operations or its own 75 * The tracer may use either sequence operations or its own
73 * copy to user routines. To simplify formating of a trace 76 * copy to user routines. To simplify formating of a trace
74 * trace_seq_printf is used to store strings into a special 77 * trace_seq_printf is used to store strings into a special
@@ -95,7 +98,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
95 98
96 s->len += ret; 99 s->len += ret;
97 100
98 return len; 101 return 1;
99} 102}
100EXPORT_SYMBOL_GPL(trace_seq_printf); 103EXPORT_SYMBOL_GPL(trace_seq_printf);
101 104
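
For illustration only (not from this patch): with the return convention documented above, an output callback can bail out as soon as trace_seq_printf() reports that the sequence buffer is full. The function and fields below are assumptions; the headers are the local ones used throughout this directory.

/* Sketch of the caller-side pattern enabled by the 0/1 return value. */
#include "trace.h"
#include "trace_output.h"

static enum print_line_t example_print_line(struct trace_iterator *iter)
{
        struct trace_seq *s = &iter->seq;

        if (!trace_seq_printf(s, "cpu=%d pid=%d\n", iter->cpu, iter->ent->pid))
                return TRACE_TYPE_PARTIAL_LINE;

        return TRACE_TYPE_HANDLED;
}
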
@@ -407,7 +410,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
407 * since individual threads might have already quit! 410 * since individual threads might have already quit!
408 */ 411 */
409 rcu_read_lock(); 412 rcu_read_lock();
410 task = find_task_by_vpid(entry->ent.tgid); 413 task = find_task_by_vpid(entry->tgid);
411 if (task) 414 if (task)
412 mm = get_task_mm(task); 415 mm = get_task_mm(task);
413 rcu_read_unlock(); 416 rcu_read_unlock();
@@ -460,18 +463,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
460 return ret; 463 return ret;
461} 464}
462 465
463static int 466/**
464lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) 467 * trace_print_lat_fmt - print the irq, preempt and lockdep fields
468 * @s: trace seq struct to write to
469 * @entry: The trace entry field from the ring buffer
470 *
471 * Prints the generic fields of irqs off, in hard or softirq, preempt
472 * count and lock depth.
473 */
474int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
465{ 475{
466 int hardirq, softirq; 476 int hardirq, softirq;
467 char comm[TASK_COMM_LEN]; 477 int ret;
468 478
469 trace_find_cmdline(entry->pid, comm);
470 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 479 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
471 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 480 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
472 481
473 if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", 482 if (!trace_seq_printf(s, "%c%c%c",
474 comm, entry->pid, cpu,
475 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 483 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
476 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 484 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
477 'X' : '.', 485 'X' : '.',
@@ -482,8 +490,31 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
482 return 0; 490 return 0;
483 491
484 if (entry->preempt_count) 492 if (entry->preempt_count)
485 return trace_seq_printf(s, "%x", entry->preempt_count); 493 ret = trace_seq_printf(s, "%x", entry->preempt_count);
486 return trace_seq_puts(s, "."); 494 else
495 ret = trace_seq_putc(s, '.');
496
497 if (!ret)
498 return 0;
499
500 if (entry->lock_depth < 0)
501 return trace_seq_putc(s, '.');
502
503 return trace_seq_printf(s, "%d", entry->lock_depth);
504}
505
506static int
507lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
508{
509 char comm[TASK_COMM_LEN];
510
511 trace_find_cmdline(entry->pid, comm);
512
513 if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
514 comm, entry->pid, cpu))
515 return 0;
516
517 return trace_print_lat_fmt(s, entry);
487} 518}
488 519
489static unsigned long preempt_mark_thresh = 100; 520static unsigned long preempt_mark_thresh = 100;
@@ -857,7 +888,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
857 trace_assign_type(field, iter->ent); 888 trace_assign_type(field, iter->ent);
858 889
859 if (!S) 890 if (!S)
860 task_state_char(field->prev_state); 891 S = task_state_char(field->prev_state);
861 T = task_state_char(field->next_state); 892 T = task_state_char(field->next_state);
862 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", 893 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
863 field->prev_pid, 894 field->prev_pid,
@@ -892,7 +923,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
892 trace_assign_type(field, iter->ent); 923 trace_assign_type(field, iter->ent);
893 924
894 if (!S) 925 if (!S)
895 task_state_char(field->prev_state); 926 S = task_state_char(field->prev_state);
896 T = task_state_char(field->next_state); 927 T = task_state_char(field->next_state);
897 928
898 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); 929 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index d38bec4a9c30..9d91c72ba38b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags);
29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
29 31
30/* used by module unregistering */ 32/* used by module unregistering */
31extern int __unregister_ftrace_event(struct trace_event *event); 33extern int __unregister_ftrace_event(struct trace_event *event);
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
deleted file mode 100644
index fe1a00f1445a..000000000000
--- a/kernel/trace/trace_power.c
+++ /dev/null
@@ -1,218 +0,0 @@
1/*
2 * ring buffer based C-state tracer
3 *
4 * Arjan van de Ven <arjan@linux.intel.com>
5 * Copyright (C) 2008 Intel Corporation
6 *
7 * Much is borrowed from trace_boot.c which is
8 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
9 *
10 */
11
12#include <linux/init.h>
13#include <linux/debugfs.h>
14#include <trace/power.h>
15#include <linux/kallsyms.h>
16#include <linux/module.h>
17
18#include "trace.h"
19#include "trace_output.h"
20
21static struct trace_array *power_trace;
22static int __read_mostly trace_power_enabled;
23
24static void probe_power_start(struct power_trace *it, unsigned int type,
25 unsigned int level)
26{
27 if (!trace_power_enabled)
28 return;
29
30 memset(it, 0, sizeof(struct power_trace));
31 it->state = level;
32 it->type = type;
33 it->stamp = ktime_get();
34}
35
36
37static void probe_power_end(struct power_trace *it)
38{
39 struct ftrace_event_call *call = &event_power;
40 struct ring_buffer_event *event;
41 struct ring_buffer *buffer;
42 struct trace_power *entry;
43 struct trace_array_cpu *data;
44 struct trace_array *tr = power_trace;
45
46 if (!trace_power_enabled)
47 return;
48
49 buffer = tr->buffer;
50
51 preempt_disable();
52 it->end = ktime_get();
53 data = tr->data[smp_processor_id()];
54
55 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
56 sizeof(*entry), 0, 0);
57 if (!event)
58 goto out;
59 entry = ring_buffer_event_data(event);
60 entry->state_data = *it;
61 if (!filter_check_discard(call, entry, buffer, event))
62 trace_buffer_unlock_commit(buffer, event, 0, 0);
63 out:
64 preempt_enable();
65}
66
67static void probe_power_mark(struct power_trace *it, unsigned int type,
68 unsigned int level)
69{
70 struct ftrace_event_call *call = &event_power;
71 struct ring_buffer_event *event;
72 struct ring_buffer *buffer;
73 struct trace_power *entry;
74 struct trace_array_cpu *data;
75 struct trace_array *tr = power_trace;
76
77 if (!trace_power_enabled)
78 return;
79
80 buffer = tr->buffer;
81
82 memset(it, 0, sizeof(struct power_trace));
83 it->state = level;
84 it->type = type;
85 it->stamp = ktime_get();
86 preempt_disable();
87 it->end = it->stamp;
88 data = tr->data[smp_processor_id()];
89
90 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
91 sizeof(*entry), 0, 0);
92 if (!event)
93 goto out;
94 entry = ring_buffer_event_data(event);
95 entry->state_data = *it;
96 if (!filter_check_discard(call, entry, buffer, event))
97 trace_buffer_unlock_commit(buffer, event, 0, 0);
98 out:
99 preempt_enable();
100}
101
102static int tracing_power_register(void)
103{
104 int ret;
105
106 ret = register_trace_power_start(probe_power_start);
107 if (ret) {
108 pr_info("power trace: Couldn't activate tracepoint"
109 " probe to trace_power_start\n");
110 return ret;
111 }
112 ret = register_trace_power_end(probe_power_end);
113 if (ret) {
114 pr_info("power trace: Couldn't activate tracepoint"
115 " probe to trace_power_end\n");
116 goto fail_start;
117 }
118 ret = register_trace_power_mark(probe_power_mark);
119 if (ret) {
120 pr_info("power trace: Couldn't activate tracepoint"
121 " probe to trace_power_mark\n");
122 goto fail_end;
123 }
124 return ret;
125fail_end:
126 unregister_trace_power_end(probe_power_end);
127fail_start:
128 unregister_trace_power_start(probe_power_start);
129 return ret;
130}
131
132static void start_power_trace(struct trace_array *tr)
133{
134 trace_power_enabled = 1;
135}
136
137static void stop_power_trace(struct trace_array *tr)
138{
139 trace_power_enabled = 0;
140}
141
142static void power_trace_reset(struct trace_array *tr)
143{
144 trace_power_enabled = 0;
145 unregister_trace_power_start(probe_power_start);
146 unregister_trace_power_end(probe_power_end);
147 unregister_trace_power_mark(probe_power_mark);
148}
149
150
151static int power_trace_init(struct trace_array *tr)
152{
153 power_trace = tr;
154
155 trace_power_enabled = 1;
156 tracing_power_register();
157
158 tracing_reset_online_cpus(tr);
159 return 0;
160}
161
162static enum print_line_t power_print_line(struct trace_iterator *iter)
163{
164 int ret = 0;
165 struct trace_entry *entry = iter->ent;
166 struct trace_power *field ;
167 struct power_trace *it;
168 struct trace_seq *s = &iter->seq;
169 struct timespec stamp;
170 struct timespec duration;
171
172 trace_assign_type(field, entry);
173 it = &field->state_data;
174 stamp = ktime_to_timespec(it->stamp);
175 duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
176
177 if (entry->type == TRACE_POWER) {
178 if (it->type == POWER_CSTATE)
179 ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
180 stamp.tv_sec,
181 stamp.tv_nsec,
182 it->state, iter->cpu,
183 duration.tv_sec,
184 duration.tv_nsec);
185 if (it->type == POWER_PSTATE)
186 ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
187 stamp.tv_sec,
188 stamp.tv_nsec,
189 it->state, iter->cpu);
190 if (!ret)
191 return TRACE_TYPE_PARTIAL_LINE;
192 return TRACE_TYPE_HANDLED;
193 }
194 return TRACE_TYPE_UNHANDLED;
195}
196
197static void power_print_header(struct seq_file *s)
198{
199 seq_puts(s, "# TIMESTAMP STATE EVENT\n");
200 seq_puts(s, "# | | |\n");
201}
202
203static struct tracer power_tracer __read_mostly =
204{
205 .name = "power",
206 .init = power_trace_init,
207 .start = start_power_trace,
208 .stop = stop_power_trace,
209 .reset = power_trace_reset,
210 .print_line = power_print_line,
211 .print_header = power_print_header,
212};
213
214static int init_power_trace(void)
215{
216 return register_tracer(&power_tracer);
217}
218device_initcall(init_power_trace);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 687699d365ae..2547d8813cf0 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -11,7 +11,6 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/marker.h>
15#include <linux/mutex.h> 14#include <linux/mutex.h>
16#include <linux/ctype.h> 15#include <linux/ctype.h>
17#include <linux/list.h> 16#include <linux/list.h>
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ad69f105a7c6..26185d727676 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -24,6 +24,7 @@ static int __read_mostly tracer_enabled;
24 24
25static struct task_struct *wakeup_task; 25static struct task_struct *wakeup_task;
26static int wakeup_cpu; 26static int wakeup_cpu;
27static int wakeup_current_cpu;
27static unsigned wakeup_prio = -1; 28static unsigned wakeup_prio = -1;
28static int wakeup_rt; 29static int wakeup_rt;
29 30
@@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
56 resched = ftrace_preempt_disable(); 57 resched = ftrace_preempt_disable();
57 58
58 cpu = raw_smp_processor_id(); 59 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu)
61 goto out_enable;
62
59 data = tr->data[cpu]; 63 data = tr->data[cpu];
60 disabled = atomic_inc_return(&data->disabled); 64 disabled = atomic_inc_return(&data->disabled);
61 if (unlikely(disabled != 1)) 65 if (unlikely(disabled != 1))
62 goto out; 66 goto out;
63 67
64 local_irq_save(flags); 68 local_irq_save(flags);
65 __raw_spin_lock(&wakeup_lock);
66
67 if (unlikely(!wakeup_task))
68 goto unlock;
69
70 /*
71 * The task can't disappear because it needs to
72 * wake up first, and we have the wakeup_lock.
73 */
74 if (task_cpu(wakeup_task) != cpu)
75 goto unlock;
76 69
77 trace_function(tr, ip, parent_ip, flags, pc); 70 trace_function(tr, ip, parent_ip, flags, pc);
78 71
79 unlock:
80 __raw_spin_unlock(&wakeup_lock);
81 local_irq_restore(flags); 72 local_irq_restore(flags);
82 73
83 out: 74 out:
84 atomic_dec(&data->disabled); 75 atomic_dec(&data->disabled);
85 76 out_enable:
86 ftrace_preempt_enable(resched); 77 ftrace_preempt_enable(resched);
87} 78}
88 79
@@ -107,11 +98,18 @@ static int report_latency(cycle_t delta)
107 return 1; 98 return 1;
108} 99}
109 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
102{
103 if (task != wakeup_task)
104 return;
105
106 wakeup_current_cpu = cpu;
107}
108
110static void notrace 109static void notrace
111probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
112 struct task_struct *next) 111 struct task_struct *next)
113{ 112{
114 unsigned long latency = 0, t0 = 0, t1 = 0;
115 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
116 cycle_t T0, T1, delta; 114 cycle_t T0, T1, delta;
117 unsigned long flags; 115 unsigned long flags;
@@ -157,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 155 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 156 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
159 157
160 /*
161 * usecs conversion is slow so we try to delay the conversion
162 * as long as possible:
163 */
164 T0 = data->preempt_timestamp; 158 T0 = data->preempt_timestamp;
165 T1 = ftrace_now(cpu); 159 T1 = ftrace_now(cpu);
166 delta = T1-T0; 160 delta = T1-T0;
@@ -168,13 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
168 if (!report_latency(delta)) 162 if (!report_latency(delta))
169 goto out_unlock; 163 goto out_unlock;
170 164
171 latency = nsecs_to_usecs(delta); 165 if (likely(!is_tracing_stopped())) {
172 166 tracing_max_latency = delta;
173 tracing_max_latency = delta; 167 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
174 t0 = nsecs_to_usecs(T0); 168 }
175 t1 = nsecs_to_usecs(T1);
176
177 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
178 169
179out_unlock: 170out_unlock:
180 __wakeup_reset(wakeup_trace); 171 __wakeup_reset(wakeup_trace);
@@ -244,6 +235,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
244 __wakeup_reset(wakeup_trace); 235 __wakeup_reset(wakeup_trace);
245 236
246 wakeup_cpu = task_cpu(p); 237 wakeup_cpu = task_cpu(p);
238 wakeup_current_cpu = wakeup_cpu;
247 wakeup_prio = p->prio; 239 wakeup_prio = p->prio;
248 240
249 wakeup_task = p; 241 wakeup_task = p;
@@ -293,6 +285,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
293 goto fail_deprobe_wake_new; 285 goto fail_deprobe_wake_new;
294 } 286 }
295 287
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
289 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n");
292 return;
293 }
294
296 wakeup_reset(tr); 295 wakeup_reset(tr);
297 296
298 /* 297 /*
@@ -325,6 +324,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
325 unregister_trace_sched_switch(probe_wakeup_sched_switch); 324 unregister_trace_sched_switch(probe_wakeup_sched_switch);
326 unregister_trace_sched_wakeup_new(probe_wakeup); 325 unregister_trace_sched_wakeup_new(probe_wakeup);
327 unregister_trace_sched_wakeup(probe_wakeup); 326 unregister_trace_sched_wakeup(probe_wakeup);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task);
328} 328}
329 329
330static int __wakeup_tracer_init(struct trace_array *tr) 330static int __wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d2cdbabb4ead..dc98309e839a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
17 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES: 19 case TRACE_HW_BRANCHES:
20 case TRACE_KSYM:
20 return 1; 21 return 1;
21 } 22 }
22 return 0; 23 return 0;
@@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
808 return ret; 809 return ret;
809} 810}
810#endif /* CONFIG_HW_BRANCH_TRACER */ 811#endif /* CONFIG_HW_BRANCH_TRACER */
812
813#ifdef CONFIG_KSYM_TRACER
814static int ksym_selftest_dummy;
815
816int
817trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
818{
819 unsigned long count;
820 int ret;
821
822 /* start the tracing */
823 ret = tracer_init(trace, tr);
824 if (ret) {
825 warn_failed_init_tracer(trace, ret);
826 return ret;
827 }
828
829 ksym_selftest_dummy = 0;
830 /* Register the read-write tracing request */
831
832 ret = process_new_ksym_entry("ksym_selftest_dummy",
833 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
834 (unsigned long)(&ksym_selftest_dummy));
835
836 if (ret < 0) {
837 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
838 goto ret_path;
839 }
840 /* Perform a read and a write operation over the dummy variable to
841 * trigger the tracer
842 */
843 if (ksym_selftest_dummy == 0)
844 ksym_selftest_dummy++;
845
846 /* stop the tracing. */
847 tracing_stop();
848 /* check the trace buffer */
849 ret = trace_test_buffer(tr, &count);
850 trace->reset(tr);
851 tracing_start();
852
853 /* read & write operations - one each is performed on the dummy variable
854 * triggering two entries in the trace buffer
855 */
856 if (!ret && count != 2) {
857 printk(KERN_CONT "Ksym tracer startup test failed");
858 ret = -1;
859 }
860
861ret_path:
862 return ret;
863}
864#endif /* CONFIG_KSYM_TRACER */
865
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0f6facb050a1..8504ac71e4e8 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = {
296 296
297int 297int
298stack_trace_sysctl(struct ctl_table *table, int write, 298stack_trace_sysctl(struct ctl_table *table, int write,
299 struct file *file, void __user *buffer, size_t *lenp, 299 void __user *buffer, size_t *lenp,
300 loff_t *ppos) 300 loff_t *ppos)
301{ 301{
302 int ret; 302 int ret;
303 303
304 mutex_lock(&stack_sysctl_mutex); 304 mutex_lock(&stack_sysctl_mutex);
305 305
306 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 306 ret = proc_dointvec(table, write, buffer, lenp, ppos);
307 307
308 if (ret || !write || 308 if (ret || !write ||
309 (last_stack_tracer_enabled == !!stack_tracer_enabled)) 309 (last_stack_tracer_enabled == !!stack_tracer_enabled))
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8712ce3c6a0e..57501d90096a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -2,7 +2,7 @@
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/ftrace.h> 4#include <linux/ftrace.h>
5#include <linux/perf_counter.h> 5#include <linux/perf_event.h>
6#include <asm/syscall.h> 6#include <asm/syscall.h>
7 7
8#include "trace_output.h" 8#include "trace_output.h"
@@ -14,6 +14,43 @@ static int sys_refcount_exit;
14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
16 16
17extern unsigned long __start_syscalls_metadata[];
18extern unsigned long __stop_syscalls_metadata[];
19
20static struct syscall_metadata **syscalls_metadata;
21
22static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
23{
24 struct syscall_metadata *start;
25 struct syscall_metadata *stop;
26 char str[KSYM_SYMBOL_LEN];
27
28
29 start = (struct syscall_metadata *)__start_syscalls_metadata;
30 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
31 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
32
33 for ( ; start < stop; start++) {
34 /*
35 * Only compare after the "sys" prefix. Archs that use
 36 * syscall wrappers may have syscall symbol aliases prefixed
37 * with "SyS" instead of "sys", leading to an unwanted
38 * mismatch.
39 */
40 if (start->name && !strcmp(start->name + 3, str + 3))
41 return start;
42 }
43 return NULL;
44}
45
46static struct syscall_metadata *syscall_nr_to_meta(int nr)
47{
48 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
49 return NULL;
50
51 return syscalls_metadata[nr];
52}
53
17enum print_line_t 54enum print_line_t
18print_syscall_enter(struct trace_iterator *iter, int flags) 55print_syscall_enter(struct trace_iterator *iter, int flags)
19{ 56{
@@ -30,7 +67,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
30 if (!entry) 67 if (!entry)
31 goto end; 68 goto end;
32 69
33 if (entry->enter_id != ent->type) { 70 if (entry->enter_event->id != ent->type) {
34 WARN_ON_ONCE(1); 71 WARN_ON_ONCE(1);
35 goto end; 72 goto end;
36 } 73 }
@@ -85,7 +122,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
85 return TRACE_TYPE_HANDLED; 122 return TRACE_TYPE_HANDLED;
86 } 123 }
87 124
88 if (entry->exit_id != ent->type) { 125 if (entry->exit_event->id != ent->type) {
89 WARN_ON_ONCE(1); 126 WARN_ON_ONCE(1);
90 return TRACE_TYPE_UNHANDLED; 127 return TRACE_TYPE_UNHANDLED;
91 } 128 }
@@ -103,24 +140,19 @@ extern char *__bad_type_size(void);
103#define SYSCALL_FIELD(type, name) \ 140#define SYSCALL_FIELD(type, name) \
104 sizeof(type) != sizeof(trace.name) ? \ 141 sizeof(type) != sizeof(trace.name) ? \
105 __bad_type_size() : \ 142 __bad_type_size() : \
106 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) 143 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type)
107 145
108int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
109{ 147{
110 int i; 148 int i;
111 int nr;
112 int ret; 149 int ret;
113 struct syscall_metadata *entry; 150 struct syscall_metadata *entry = call->data;
114 struct syscall_trace_enter trace; 151 struct syscall_trace_enter trace;
115 int offset = offsetof(struct syscall_trace_enter, args); 152 int offset = offsetof(struct syscall_trace_enter, args);
116 153
117 nr = syscall_name_to_nr(call->data); 154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
118 entry = syscall_nr_to_meta(nr); 155 "\tsigned:%u;\n",
119
120 if (!entry)
121 return 0;
122
123 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
124 SYSCALL_FIELD(int, nr)); 156 SYSCALL_FIELD(int, nr));
125 if (!ret) 157 if (!ret)
126 return 0; 158 return 0;
@@ -130,8 +162,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
130 entry->args[i]); 162 entry->args[i]);
131 if (!ret) 163 if (!ret)
132 return 0; 164 return 0;
133 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, 165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
134 sizeof(unsigned long)); 166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
135 if (!ret) 169 if (!ret)
136 return 0; 170 return 0;
137 offset += sizeof(unsigned long); 171 offset += sizeof(unsigned long);
@@ -163,10 +197,12 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
163 struct syscall_trace_exit trace; 197 struct syscall_trace_exit trace;
164 198
165 ret = trace_seq_printf(s, 199 ret = trace_seq_printf(s,
166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", 201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
168 SYSCALL_FIELD(int, nr), 204 SYSCALL_FIELD(int, nr),
169 SYSCALL_FIELD(unsigned long, ret)); 205 SYSCALL_FIELD(long, ret));
170 if (!ret) 206 if (!ret)
171 return 0; 207 return 0;
172 208
@@ -176,22 +212,19 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
176int syscall_enter_define_fields(struct ftrace_event_call *call) 212int syscall_enter_define_fields(struct ftrace_event_call *call)
177{ 213{
178 struct syscall_trace_enter trace; 214 struct syscall_trace_enter trace;
179 struct syscall_metadata *meta; 215 struct syscall_metadata *meta = call->data;
180 int ret; 216 int ret;
181 int nr;
182 int i; 217 int i;
183 int offset = offsetof(typeof(trace), args); 218 int offset = offsetof(typeof(trace), args);
184 219
185 nr = syscall_name_to_nr(call->data);
186 meta = syscall_nr_to_meta(nr);
187
188 if (!meta)
189 return 0;
190
191 ret = trace_define_common_fields(call); 220 ret = trace_define_common_fields(call);
192 if (ret) 221 if (ret)
193 return ret; 222 return ret;
194 223
224 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
225 if (ret)
226 return ret;
227
195 for (i = 0; i < meta->nb_args; i++) { 228 for (i = 0; i < meta->nb_args; i++) {
196 ret = trace_define_field(call, meta->types[i], 229 ret = trace_define_field(call, meta->types[i],
197 meta->args[i], offset, 230 meta->args[i], offset,
@@ -212,7 +245,11 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
212 if (ret) 245 if (ret)
213 return ret; 246 return ret;
214 247
215 ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0, 248 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
249 if (ret)
250 return ret;
251
252 ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
216 FILTER_OTHER); 253 FILTER_OTHER);
217 254
218 return ret; 255 return ret;
@@ -239,8 +276,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
239 276
240 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 277 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
241 278
242 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id, 279 event = trace_current_buffer_lock_reserve(&buffer,
243 size, 0, 0); 280 sys_data->enter_event->id, size, 0, 0);
244 if (!event) 281 if (!event)
245 return; 282 return;
246 283
@@ -271,8 +308,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
271 if (!sys_data) 308 if (!sys_data)
272 return; 309 return;
273 310
274 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id, 311 event = trace_current_buffer_lock_reserve(&buffer,
275 sizeof(*entry), 0, 0); 312 sys_data->exit_event->id, sizeof(*entry), 0, 0);
276 if (!event) 313 if (!event)
277 return; 314 return;
278 315
@@ -285,14 +322,12 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
285 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 322 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
286} 323}
287 324
288int reg_event_syscall_enter(void *ptr) 325int reg_event_syscall_enter(struct ftrace_event_call *call)
289{ 326{
290 int ret = 0; 327 int ret = 0;
291 int num; 328 int num;
292 char *name;
293 329
294 name = (char *)ptr; 330 num = ((struct syscall_metadata *)call->data)->syscall_nr;
295 num = syscall_name_to_nr(name);
296 if (num < 0 || num >= NR_syscalls) 331 if (num < 0 || num >= NR_syscalls)
297 return -ENOSYS; 332 return -ENOSYS;
298 mutex_lock(&syscall_trace_lock); 333 mutex_lock(&syscall_trace_lock);
@@ -309,13 +344,11 @@ int reg_event_syscall_enter(void *ptr)
309 return ret; 344 return ret;
310} 345}
311 346
312void unreg_event_syscall_enter(void *ptr) 347void unreg_event_syscall_enter(struct ftrace_event_call *call)
313{ 348{
314 int num; 349 int num;
315 char *name;
316 350
317 name = (char *)ptr; 351 num = ((struct syscall_metadata *)call->data)->syscall_nr;
318 num = syscall_name_to_nr(name);
319 if (num < 0 || num >= NR_syscalls) 352 if (num < 0 || num >= NR_syscalls)
320 return; 353 return;
321 mutex_lock(&syscall_trace_lock); 354 mutex_lock(&syscall_trace_lock);
@@ -326,14 +359,12 @@ void unreg_event_syscall_enter(void *ptr)
326 mutex_unlock(&syscall_trace_lock); 359 mutex_unlock(&syscall_trace_lock);
327} 360}
328 361
329int reg_event_syscall_exit(void *ptr) 362int reg_event_syscall_exit(struct ftrace_event_call *call)
330{ 363{
331 int ret = 0; 364 int ret = 0;
332 int num; 365 int num;
333 char *name;
334 366
335 name = (char *)ptr; 367 num = ((struct syscall_metadata *)call->data)->syscall_nr;
336 num = syscall_name_to_nr(name);
337 if (num < 0 || num >= NR_syscalls) 368 if (num < 0 || num >= NR_syscalls)
338 return -ENOSYS; 369 return -ENOSYS;
339 mutex_lock(&syscall_trace_lock); 370 mutex_lock(&syscall_trace_lock);
@@ -350,13 +381,11 @@ int reg_event_syscall_exit(void *ptr)
350 return ret; 381 return ret;
351} 382}
352 383
353void unreg_event_syscall_exit(void *ptr) 384void unreg_event_syscall_exit(struct ftrace_event_call *call)
354{ 385{
355 int num; 386 int num;
356 char *name;
357 387
358 name = (char *)ptr; 388 num = ((struct syscall_metadata *)call->data)->syscall_nr;
359 num = syscall_name_to_nr(name);
360 if (num < 0 || num >= NR_syscalls) 389 if (num < 0 || num >= NR_syscalls)
361 return; 390 return;
362 mutex_lock(&syscall_trace_lock); 391 mutex_lock(&syscall_trace_lock);
@@ -367,13 +396,44 @@ void unreg_event_syscall_exit(void *ptr)
367 mutex_unlock(&syscall_trace_lock); 396 mutex_unlock(&syscall_trace_lock);
368} 397}
369 398
370struct trace_event event_syscall_enter = { 399int init_syscall_trace(struct ftrace_event_call *call)
371 .trace = print_syscall_enter, 400{
372}; 401 int id;
402
403 id = register_ftrace_event(call->event);
404 if (!id)
405 return -ENODEV;
406 call->id = id;
407 INIT_LIST_HEAD(&call->fields);
408 return 0;
409}
410
411int __init init_ftrace_syscalls(void)
412{
413 struct syscall_metadata *meta;
414 unsigned long addr;
415 int i;
416
417 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
418 NR_syscalls, GFP_KERNEL);
419 if (!syscalls_metadata) {
420 WARN_ON(1);
421 return -ENOMEM;
422 }
373 423
374struct trace_event event_syscall_exit = { 424 for (i = 0; i < NR_syscalls; i++) {
375 .trace = print_syscall_exit, 425 addr = arch_syscall_addr(i);
376}; 426 meta = find_syscall_meta(addr);
427 if (!meta)
428 continue;
429
430 meta->syscall_nr = i;
431 syscalls_metadata[i] = meta;
432 }
433
434 return 0;
435}
436core_initcall(init_ftrace_syscalls);
377 437
378#ifdef CONFIG_EVENT_PROFILE 438#ifdef CONFIG_EVENT_PROFILE
379 439
@@ -384,10 +444,15 @@ static int sys_prof_refcount_exit;
384 444
385static void prof_syscall_enter(struct pt_regs *regs, long id) 445static void prof_syscall_enter(struct pt_regs *regs, long id)
386{ 446{
387 struct syscall_trace_enter *rec;
388 struct syscall_metadata *sys_data; 447 struct syscall_metadata *sys_data;
448 struct syscall_trace_enter *rec;
449 unsigned long flags;
450 char *trace_buf;
451 char *raw_data;
389 int syscall_nr; 452 int syscall_nr;
453 int rctx;
390 int size; 454 int size;
455 int cpu;
391 456
392 syscall_nr = syscall_get_nr(current, regs); 457 syscall_nr = syscall_get_nr(current, regs);
393 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 458 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -402,30 +467,49 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
402 size = ALIGN(size + sizeof(u32), sizeof(u64)); 467 size = ALIGN(size + sizeof(u32), sizeof(u64));
403 size -= sizeof(u32); 468 size -= sizeof(u32);
404 469
405 do { 470 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
406 char raw_data[size]; 471 "profile buffer not large enough"))
472 return;
473
474 /* Protect the per cpu buffer, begin the rcu read side */
475 local_irq_save(flags);
476
477 rctx = perf_swevent_get_recursion_context();
478 if (rctx < 0)
479 goto end_recursion;
480
481 cpu = smp_processor_id();
482
483 trace_buf = rcu_dereference(perf_trace_buf);
484
485 if (!trace_buf)
486 goto end;
487
488 raw_data = per_cpu_ptr(trace_buf, cpu);
489
490 /* zero the dead bytes from align to not leak stack to user */
491 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
407 492
408 /* zero the dead bytes from align to not leak stack to user */ 493 rec = (struct syscall_trace_enter *) raw_data;
409 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 494 tracing_generic_entry_update(&rec->ent, 0, 0);
495 rec->ent.type = sys_data->enter_event->id;
496 rec->nr = syscall_nr;
497 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
498 (unsigned long *)&rec->args);
499 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
410 500
411 rec = (struct syscall_trace_enter *) raw_data; 501end:
412 tracing_generic_entry_update(&rec->ent, 0, 0); 502 perf_swevent_put_recursion_context(rctx);
413 rec->ent.type = sys_data->enter_id; 503end_recursion:
414 rec->nr = syscall_nr; 504 local_irq_restore(flags);
415 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
416 (unsigned long *)&rec->args);
417 perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
418 } while(0);
419} 505}
420 506
421int reg_prof_syscall_enter(char *name) 507int prof_sysenter_enable(struct ftrace_event_call *call)
422{ 508{
423 int ret = 0; 509 int ret = 0;
424 int num; 510 int num;
425 511
426 num = syscall_name_to_nr(name); 512 num = ((struct syscall_metadata *)call->data)->syscall_nr;
427 if (num < 0 || num >= NR_syscalls)
428 return -ENOSYS;
429 513
430 mutex_lock(&syscall_trace_lock); 514 mutex_lock(&syscall_trace_lock);
431 if (!sys_prof_refcount_enter) 515 if (!sys_prof_refcount_enter)
@@ -441,13 +525,11 @@ int reg_prof_syscall_enter(char *name)
441 return ret; 525 return ret;
442} 526}
443 527
444void unreg_prof_syscall_enter(char *name) 528void prof_sysenter_disable(struct ftrace_event_call *call)
445{ 529{
446 int num; 530 int num;
447 531
448 num = syscall_name_to_nr(name); 532 num = ((struct syscall_metadata *)call->data)->syscall_nr;
449 if (num < 0 || num >= NR_syscalls)
450 return;
451 533
452 mutex_lock(&syscall_trace_lock); 534 mutex_lock(&syscall_trace_lock);
453 sys_prof_refcount_enter--; 535 sys_prof_refcount_enter--;
@@ -460,8 +542,14 @@ void unreg_prof_syscall_enter(char *name)
460static void prof_syscall_exit(struct pt_regs *regs, long ret) 542static void prof_syscall_exit(struct pt_regs *regs, long ret)
461{ 543{
462 struct syscall_metadata *sys_data; 544 struct syscall_metadata *sys_data;
463 struct syscall_trace_exit rec; 545 struct syscall_trace_exit *rec;
546 unsigned long flags;
464 int syscall_nr; 547 int syscall_nr;
548 char *trace_buf;
549 char *raw_data;
550 int rctx;
551 int size;
552 int cpu;
465 553
466 syscall_nr = syscall_get_nr(current, regs); 554 syscall_nr = syscall_get_nr(current, regs);
467 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 555 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -471,22 +559,58 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
471 if (!sys_data) 559 if (!sys_data)
472 return; 560 return;
473 561
474 tracing_generic_entry_update(&rec.ent, 0, 0); 562 /* We can probably do that at build time */
475 rec.ent.type = sys_data->exit_id; 563 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
476 rec.nr = syscall_nr; 564 size -= sizeof(u32);
477 rec.ret = syscall_get_return_value(current, regs); 565
 566 /*
 567 * Impossible today, but be paranoid about the future:
 568 * how could this check be moved out of the runtime path?
 569 */
570 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
571 "exit event has grown above profile buffer size"))
572 return;
573
574 /* Protect the per cpu buffer, begin the rcu read side */
575 local_irq_save(flags);
576
577 rctx = perf_swevent_get_recursion_context();
578 if (rctx < 0)
579 goto end_recursion;
580
581 cpu = smp_processor_id();
582
583 trace_buf = rcu_dereference(perf_trace_buf);
584
585 if (!trace_buf)
586 goto end;
587
588 raw_data = per_cpu_ptr(trace_buf, cpu);
478 589
479 perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); 590 /* zero the dead bytes from align to not leak stack to user */
591 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
592
593 rec = (struct syscall_trace_exit *)raw_data;
594
595 tracing_generic_entry_update(&rec->ent, 0, 0);
596 rec->ent.type = sys_data->exit_event->id;
597 rec->nr = syscall_nr;
598 rec->ret = syscall_get_return_value(current, regs);
599
600 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
601
602end:
603 perf_swevent_put_recursion_context(rctx);
604end_recursion:
605 local_irq_restore(flags);
480} 606}
481 607
482int reg_prof_syscall_exit(char *name) 608int prof_sysexit_enable(struct ftrace_event_call *call)
483{ 609{
484 int ret = 0; 610 int ret = 0;
485 int num; 611 int num;
486 612
487 num = syscall_name_to_nr(name); 613 num = ((struct syscall_metadata *)call->data)->syscall_nr;
488 if (num < 0 || num >= NR_syscalls)
489 return -ENOSYS;
490 614
491 mutex_lock(&syscall_trace_lock); 615 mutex_lock(&syscall_trace_lock);
492 if (!sys_prof_refcount_exit) 616 if (!sys_prof_refcount_exit)
@@ -502,13 +626,11 @@ int reg_prof_syscall_exit(char *name)
502 return ret; 626 return ret;
503} 627}
504 628
505void unreg_prof_syscall_exit(char *name) 629void prof_sysexit_disable(struct ftrace_event_call *call)
506{ 630{
507 int num; 631 int num;
508 632
509 num = syscall_name_to_nr(name); 633 num = ((struct syscall_metadata *)call->data)->syscall_nr;
510 if (num < 0 || num >= NR_syscalls)
511 return;
512 634
513 mutex_lock(&syscall_trace_lock); 635 mutex_lock(&syscall_trace_lock);
514 sys_prof_refcount_exit--; 636 sys_prof_refcount_exit--;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 9489a0a9b1be..cc89be5bc0f8 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -48,7 +48,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
48 48
49/* 49/*
50 * Note about RCU : 50 * Note about RCU :
51 * It is used to to delay the free of multiple probes array until a quiescent 51 * It is used to delay the free of multiple probes array until a quiescent
52 * state is reached. 52 * state is reached.
53 * Tracepoint entries modifications are protected by the tracepoints_mutex. 53 * Tracepoint entries modifications are protected by the tracepoints_mutex.
54 */ 54 */
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 0314501688b9..419209893d87 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -4,7 +4,6 @@
4 */ 4 */
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/utsname.h>
8#include <linux/mman.h> 7#include <linux/mman.h>
9#include <linux/notifier.h> 8#include <linux/notifier.h>
10#include <linux/reboot.h> 9#include <linux/reboot.h>
diff --git a/kernel/user.c b/kernel/user.c
index 2c000e7132ac..46d0165ca70c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -330,9 +330,9 @@ done:
330 */ 330 */
331static void free_user(struct user_struct *up, unsigned long flags) 331static void free_user(struct user_struct *up, unsigned long flags)
332{ 332{
333 spin_unlock_irqrestore(&uidhash_lock, flags);
334 INIT_DELAYED_WORK(&up->work, cleanup_user_struct); 333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
335 schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); 334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336} 336}
337 337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ 338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 92359cc747a7..69eae358a726 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which)
  * Special case of dostring for the UTS structure. This has locks
  * to observe. Should this be in kernel/sys.c ????
  */
-static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
+static int proc_do_uts_string(ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table uts_table;
 	int r;
 	memcpy(&uts_table, table, sizeof(uts_table));
 	uts_table.data = get_uts(table, write);
-	r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos);
+	r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
 	put_uts(table, write, uts_table.data);
 	return r;
 }
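Both changes in this hunk follow the tree-wide removal of the unused struct file * argument from sysctl proc handlers: the handler prototype and the proc_dostring() call each drop filp. As a hedged illustration, a custom string handler in the new form looks roughly like the sketch below; demo_string and proc_do_demo_string are made-up names, not part of the patch.

#include <linux/sysctl.h>

static char demo_string[64] = "example";

/* Same shape as proc_do_uts_string above, minus the dropped filp argument. */
static int proc_do_demo_string(ctl_table *table, int write,
			       void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table tmp = *table;

	tmp.data = demo_string;
	tmp.maxlen = sizeof(demo_string);
	return proc_dostring(&tmp, write, buffer, lenp, ppos);
}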
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index addfe2df93b1..67e526b6ae81 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -640,6 +640,24 @@ int schedule_delayed_work(struct delayed_work *dwork,
 EXPORT_SYMBOL(schedule_delayed_work);
 
 /**
+ * flush_delayed_work - block until a dwork_struct's callback has terminated
+ * @dwork: the delayed work which is to be flushed
+ *
+ * Any timeout is cancelled, and any pending work is run immediately.
+ */
+void flush_delayed_work(struct delayed_work *dwork)
+{
+	if (del_timer_sync(&dwork->timer)) {
+		struct cpu_workqueue_struct *cwq;
+		cwq = wq_per_cpu(keventd_wq, get_cpu());
+		__queue_work(cwq, &dwork->work);
+		put_cpu();
+	}
+	flush_work(&dwork->work);
+}
+EXPORT_SYMBOL(flush_delayed_work);
+
+/**
  * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
  * @cpu: cpu to use
  * @dwork: job to be done
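flush_delayed_work(), added in the hunk above, cancels any pending timer, runs the queued work immediately and waits for its callback to finish, which makes it useful on teardown paths. A hedged usage sketch follows; struct demo_dev and the demo_* functions are hypothetical, while the workqueue and jiffies calls are the real interfaces.

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

/* Hypothetical device context for illustration. */
struct demo_dev {
	struct delayed_work refresh;
};

static void demo_refresh_fn(struct work_struct *work)
{
	struct demo_dev *dev =
		container_of(work, struct demo_dev, refresh.work);

	/* ... periodic housekeeping on dev ... */
	(void)dev;
}

static void demo_start(struct demo_dev *dev)
{
	INIT_DELAYED_WORK(&dev->refresh, demo_refresh_fn);
	schedule_delayed_work(&dev->refresh, msecs_to_jiffies(500));
}

static void demo_stop(struct demo_dev *dev)
{
	/* Fire any still-pending refresh now and wait for it to complete. */
	flush_delayed_work(&dev->refresh);
}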
@@ -667,6 +685,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
 int schedule_on_each_cpu(work_func_t func)
 {
 	int cpu;
+	int orig = -1;
 	struct work_struct *works;
 
 	works = alloc_percpu(struct work_struct);
@@ -674,14 +693,28 @@ int schedule_on_each_cpu(work_func_t func)
 		return -ENOMEM;
 
 	get_online_cpus();
+
+	/*
+	 * When running in keventd don't schedule a work item on
+	 * itself. Can just call directly because the work queue is
+	 * already bound. This also is faster.
+	 */
+	if (current_is_keventd())
+		orig = raw_smp_processor_id();
+
 	for_each_online_cpu(cpu) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
 		INIT_WORK(work, func);
-		schedule_work_on(cpu, work);
+		if (cpu != orig)
+			schedule_work_on(cpu, work);
 	}
+	if (orig >= 0)
+		func(per_cpu_ptr(works, orig));
+
 	for_each_online_cpu(cpu)
 		flush_work(per_cpu_ptr(works, cpu));
+
 	put_online_cpus();
 	free_percpu(works);
 	return 0;
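With the change above, schedule_on_each_cpu() invoked from keventd runs the current CPU's callback directly instead of queueing it onto the worker it is already running on, and it still returns only after the callback has completed on every online CPU. A hedged caller-side sketch; demo_percpu_fn and demo_run_everywhere are hypothetical, while schedule_on_each_cpu() and raw_smp_processor_id() appear in the patch itself.

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/smp.h>

/* Hypothetical callback: invoked once on every online CPU. */
static void demo_percpu_fn(struct work_struct *unused)
{
	pr_info("demo: ran on CPU %d\n", raw_smp_processor_id());
}

/* Returns only after demo_percpu_fn has completed on all online CPUs. */
static int demo_run_everywhere(void)
{
	return schedule_on_each_cpu(demo_percpu_fn);
}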