diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-03-15 21:31:30 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-03-15 21:31:30 -0400 |
commit | a926021cb1f8a99a275eaf6eb546102e9469dc59 (patch) | |
tree | c6d0300cd4b1a1fd658708476db4577b68b4de31 /kernel | |
parent | 0586bed3e8563c2eb89bc7256e30ce633ae06cfb (diff) | |
parent | 5e814dd597c42daeb8d2a276e64a6ec986ad0e2a (diff) |
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (184 commits)
perf probe: Clean up probe_point_lazy_walker() return value
tracing: Fix irqoff selftest expanding max buffer
tracing: Align 4 byte ints together in struct tracer
tracing: Export trace_set_clr_event()
tracing: Explain about unstable clock on resume with ring buffer warning
ftrace/graph: Trace function entry before updating index
ftrace: Add .ref.text as one of the safe areas to trace
tracing: Adjust conditional expression latency formatting.
tracing: Fix event alignment: skb:kfree_skb
tracing: Fix event alignment: mce:mce_record
tracing: Fix event alignment: kvm:kvm_hv_hypercall
tracing: Fix event alignment: module:module_request
tracing: Fix event alignment: ftrace:context_switch and ftrace:wakeup
tracing: Remove lock_depth from event entry
perf header: Stop using 'self'
perf session: Use evlist/evsel for managing perf.data attributes
perf top: Don't let events to eat up whole header line
perf top: Fix events overflow in top command
ring-buffer: Remove unused #include <linux/trace_irq.h>
tracing: Add an 'overwrite' trace_option.
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup.c | 54 | ||||
-rw-r--r-- | kernel/perf_event.c | 1004 | ||||
-rw-r--r-- | kernel/sched.c | 37 | ||||
-rw-r--r-- | kernel/sysctl.c | 2 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 52 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 24 | ||||
-rw-r--r-- | kernel/trace/trace.c | 38 | ||||
-rw-r--r-- | kernel/trace/trace.h | 41 | ||||
-rw-r--r-- | kernel/trace/trace_entries.h | 6 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 885 | ||||
-rw-r--r-- | kernel/trace/trace_kprobe.c | 111 | ||||
-rw-r--r-- | kernel/trace/trace_output.c | 36 | ||||
-rw-r--r-- | kernel/trace/trace_sched_switch.c | 48 | ||||
-rw-r--r-- | kernel/trace/trace_syscalls.c | 42 |
15 files changed, 1904 insertions, 478 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b24d7027b83c..95362d15128c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child) | |||
4230 | */ | 4230 | */ |
4231 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) | 4231 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) |
4232 | { | 4232 | { |
4233 | int i; | ||
4234 | struct css_set *cg; | 4233 | struct css_set *cg; |
4235 | 4234 | int i; | |
4236 | if (run_callbacks && need_forkexit_callback) { | ||
4237 | /* | ||
4238 | * modular subsystems can't use callbacks, so no need to lock | ||
4239 | * the subsys array | ||
4240 | */ | ||
4241 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
4242 | struct cgroup_subsys *ss = subsys[i]; | ||
4243 | if (ss->exit) | ||
4244 | ss->exit(ss, tsk); | ||
4245 | } | ||
4246 | } | ||
4247 | 4235 | ||
4248 | /* | 4236 | /* |
4249 | * Unlink from the css_set task list if necessary. | 4237 | * Unlink from the css_set task list if necessary. |
@@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
4261 | task_lock(tsk); | 4249 | task_lock(tsk); |
4262 | cg = tsk->cgroups; | 4250 | cg = tsk->cgroups; |
4263 | tsk->cgroups = &init_css_set; | 4251 | tsk->cgroups = &init_css_set; |
4252 | |||
4253 | if (run_callbacks && need_forkexit_callback) { | ||
4254 | /* | ||
4255 | * modular subsystems can't use callbacks, so no need to lock | ||
4256 | * the subsys array | ||
4257 | */ | ||
4258 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
4259 | struct cgroup_subsys *ss = subsys[i]; | ||
4260 | if (ss->exit) { | ||
4261 | struct cgroup *old_cgrp = | ||
4262 | rcu_dereference_raw(cg->subsys[i])->cgroup; | ||
4263 | struct cgroup *cgrp = task_cgroup(tsk, i); | ||
4264 | ss->exit(ss, cgrp, old_cgrp, tsk); | ||
4265 | } | ||
4266 | } | ||
4267 | } | ||
4264 | task_unlock(tsk); | 4268 | task_unlock(tsk); |
4269 | |||
4265 | if (cg) | 4270 | if (cg) |
4266 | put_css_set_taskexit(cg); | 4271 | put_css_set_taskexit(cg); |
4267 | } | 4272 | } |
@@ -4813,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id, | |||
4813 | return ret; | 4818 | return ret; |
4814 | } | 4819 | } |
4815 | 4820 | ||
4821 | /* | ||
4822 | * get corresponding css from file open on cgroupfs directory | ||
4823 | */ | ||
4824 | struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | ||
4825 | { | ||
4826 | struct cgroup *cgrp; | ||
4827 | struct inode *inode; | ||
4828 | struct cgroup_subsys_state *css; | ||
4829 | |||
4830 | inode = f->f_dentry->d_inode; | ||
4831 | /* check in cgroup filesystem dir */ | ||
4832 | if (inode->i_op != &cgroup_dir_inode_operations) | ||
4833 | return ERR_PTR(-EBADF); | ||
4834 | |||
4835 | if (id < 0 || id >= CGROUP_SUBSYS_COUNT) | ||
4836 | return ERR_PTR(-EINVAL); | ||
4837 | |||
4838 | /* get cgroup */ | ||
4839 | cgrp = __d_cgrp(f->f_dentry); | ||
4840 | css = cgrp->subsys[id]; | ||
4841 | return css ? css : ERR_PTR(-ENOENT); | ||
4842 | } | ||
4843 | |||
4816 | #ifdef CONFIG_CGROUP_DEBUG | 4844 | #ifdef CONFIG_CGROUP_DEBUG |
4817 | static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, | 4845 | static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, |
4818 | struct cgroup *cont) | 4846 | struct cgroup *cont) |
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 656222fcf767..ed253aa24ba4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
@@ -38,13 +38,96 @@ | |||
38 | 38 | ||
39 | #include <asm/irq_regs.h> | 39 | #include <asm/irq_regs.h> |
40 | 40 | ||
41 | struct remote_function_call { | ||
42 | struct task_struct *p; | ||
43 | int (*func)(void *info); | ||
44 | void *info; | ||
45 | int ret; | ||
46 | }; | ||
47 | |||
48 | static void remote_function(void *data) | ||
49 | { | ||
50 | struct remote_function_call *tfc = data; | ||
51 | struct task_struct *p = tfc->p; | ||
52 | |||
53 | if (p) { | ||
54 | tfc->ret = -EAGAIN; | ||
55 | if (task_cpu(p) != smp_processor_id() || !task_curr(p)) | ||
56 | return; | ||
57 | } | ||
58 | |||
59 | tfc->ret = tfc->func(tfc->info); | ||
60 | } | ||
61 | |||
62 | /** | ||
63 | * task_function_call - call a function on the cpu on which a task runs | ||
64 | * @p: the task to evaluate | ||
65 | * @func: the function to be called | ||
66 | * @info: the function call argument | ||
67 | * | ||
68 | * Calls the function @func when the task is currently running. This might | ||
69 | * be on the current CPU, which just calls the function directly | ||
70 | * | ||
71 | * returns: @func return value, or | ||
72 | * -ESRCH - when the process isn't running | ||
73 | * -EAGAIN - when the process moved away | ||
74 | */ | ||
75 | static int | ||
76 | task_function_call(struct task_struct *p, int (*func) (void *info), void *info) | ||
77 | { | ||
78 | struct remote_function_call data = { | ||
79 | .p = p, | ||
80 | .func = func, | ||
81 | .info = info, | ||
82 | .ret = -ESRCH, /* No such (running) process */ | ||
83 | }; | ||
84 | |||
85 | if (task_curr(p)) | ||
86 | smp_call_function_single(task_cpu(p), remote_function, &data, 1); | ||
87 | |||
88 | return data.ret; | ||
89 | } | ||
90 | |||
91 | /** | ||
92 | * cpu_function_call - call a function on the cpu | ||
93 | * @func: the function to be called | ||
94 | * @info: the function call argument | ||
95 | * | ||
96 | * Calls the function @func on the remote cpu. | ||
97 | * | ||
98 | * returns: @func return value or -ENXIO when the cpu is offline | ||
99 | */ | ||
100 | static int cpu_function_call(int cpu, int (*func) (void *info), void *info) | ||
101 | { | ||
102 | struct remote_function_call data = { | ||
103 | .p = NULL, | ||
104 | .func = func, | ||
105 | .info = info, | ||
106 | .ret = -ENXIO, /* No such CPU */ | ||
107 | }; | ||
108 | |||
109 | smp_call_function_single(cpu, remote_function, &data, 1); | ||
110 | |||
111 | return data.ret; | ||
112 | } | ||
113 | |||
114 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ | ||
115 | PERF_FLAG_FD_OUTPUT |\ | ||
116 | PERF_FLAG_PID_CGROUP) | ||
117 | |||
41 | enum event_type_t { | 118 | enum event_type_t { |
42 | EVENT_FLEXIBLE = 0x1, | 119 | EVENT_FLEXIBLE = 0x1, |
43 | EVENT_PINNED = 0x2, | 120 | EVENT_PINNED = 0x2, |
44 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, | 121 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, |
45 | }; | 122 | }; |
46 | 123 | ||
47 | atomic_t perf_task_events __read_mostly; | 124 | /* |
125 | * perf_sched_events : >0 events exist | ||
126 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu | ||
127 | */ | ||
128 | atomic_t perf_sched_events __read_mostly; | ||
129 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | ||
130 | |||
48 | static atomic_t nr_mmap_events __read_mostly; | 131 | static atomic_t nr_mmap_events __read_mostly; |
49 | static atomic_t nr_comm_events __read_mostly; | 132 | static atomic_t nr_comm_events __read_mostly; |
50 | static atomic_t nr_task_events __read_mostly; | 133 | static atomic_t nr_task_events __read_mostly; |
@@ -67,7 +150,24 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ | |||
67 | /* | 150 | /* |
68 | * max perf event sample rate | 151 | * max perf event sample rate |
69 | */ | 152 | */ |
70 | int sysctl_perf_event_sample_rate __read_mostly = 100000; | 153 | #define DEFAULT_MAX_SAMPLE_RATE 100000 |
154 | int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; | ||
155 | static int max_samples_per_tick __read_mostly = | ||
156 | DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); | ||
157 | |||
158 | int perf_proc_update_handler(struct ctl_table *table, int write, | ||
159 | void __user *buffer, size_t *lenp, | ||
160 | loff_t *ppos) | ||
161 | { | ||
162 | int ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
163 | |||
164 | if (ret || !write) | ||
165 | return ret; | ||
166 | |||
167 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); | ||
168 | |||
169 | return 0; | ||
170 | } | ||
71 | 171 | ||
72 | static atomic64_t perf_event_id; | 172 | static atomic64_t perf_event_id; |
73 | 173 | ||
@@ -75,7 +175,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | |||
75 | enum event_type_t event_type); | 175 | enum event_type_t event_type); |
76 | 176 | ||
77 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | 177 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, |
78 | enum event_type_t event_type); | 178 | enum event_type_t event_type, |
179 | struct task_struct *task); | ||
180 | |||
181 | static void update_context_time(struct perf_event_context *ctx); | ||
182 | static u64 perf_event_time(struct perf_event *event); | ||
79 | 183 | ||
80 | void __weak perf_event_print_debug(void) { } | 184 | void __weak perf_event_print_debug(void) { } |
81 | 185 | ||
@@ -89,6 +193,360 @@ static inline u64 perf_clock(void) | |||
89 | return local_clock(); | 193 | return local_clock(); |
90 | } | 194 | } |
91 | 195 | ||
196 | static inline struct perf_cpu_context * | ||
197 | __get_cpu_context(struct perf_event_context *ctx) | ||
198 | { | ||
199 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
200 | } | ||
201 | |||
202 | #ifdef CONFIG_CGROUP_PERF | ||
203 | |||
204 | /* | ||
205 | * Must ensure cgroup is pinned (css_get) before calling | ||
206 | * this function. In other words, we cannot call this function | ||
207 | * if there is no cgroup event for the current CPU context. | ||
208 | */ | ||
209 | static inline struct perf_cgroup * | ||
210 | perf_cgroup_from_task(struct task_struct *task) | ||
211 | { | ||
212 | return container_of(task_subsys_state(task, perf_subsys_id), | ||
213 | struct perf_cgroup, css); | ||
214 | } | ||
215 | |||
216 | static inline bool | ||
217 | perf_cgroup_match(struct perf_event *event) | ||
218 | { | ||
219 | struct perf_event_context *ctx = event->ctx; | ||
220 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
221 | |||
222 | return !event->cgrp || event->cgrp == cpuctx->cgrp; | ||
223 | } | ||
224 | |||
225 | static inline void perf_get_cgroup(struct perf_event *event) | ||
226 | { | ||
227 | css_get(&event->cgrp->css); | ||
228 | } | ||
229 | |||
230 | static inline void perf_put_cgroup(struct perf_event *event) | ||
231 | { | ||
232 | css_put(&event->cgrp->css); | ||
233 | } | ||
234 | |||
235 | static inline void perf_detach_cgroup(struct perf_event *event) | ||
236 | { | ||
237 | perf_put_cgroup(event); | ||
238 | event->cgrp = NULL; | ||
239 | } | ||
240 | |||
241 | static inline int is_cgroup_event(struct perf_event *event) | ||
242 | { | ||
243 | return event->cgrp != NULL; | ||
244 | } | ||
245 | |||
246 | static inline u64 perf_cgroup_event_time(struct perf_event *event) | ||
247 | { | ||
248 | struct perf_cgroup_info *t; | ||
249 | |||
250 | t = per_cpu_ptr(event->cgrp->info, event->cpu); | ||
251 | return t->time; | ||
252 | } | ||
253 | |||
254 | static inline void __update_cgrp_time(struct perf_cgroup *cgrp) | ||
255 | { | ||
256 | struct perf_cgroup_info *info; | ||
257 | u64 now; | ||
258 | |||
259 | now = perf_clock(); | ||
260 | |||
261 | info = this_cpu_ptr(cgrp->info); | ||
262 | |||
263 | info->time += now - info->timestamp; | ||
264 | info->timestamp = now; | ||
265 | } | ||
266 | |||
267 | static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) | ||
268 | { | ||
269 | struct perf_cgroup *cgrp_out = cpuctx->cgrp; | ||
270 | if (cgrp_out) | ||
271 | __update_cgrp_time(cgrp_out); | ||
272 | } | ||
273 | |||
274 | static inline void update_cgrp_time_from_event(struct perf_event *event) | ||
275 | { | ||
276 | struct perf_cgroup *cgrp; | ||
277 | |||
278 | /* | ||
279 | * ensure we access cgroup data only when needed and | ||
280 | * when we know the cgroup is pinned (css_get) | ||
281 | */ | ||
282 | if (!is_cgroup_event(event)) | ||
283 | return; | ||
284 | |||
285 | cgrp = perf_cgroup_from_task(current); | ||
286 | /* | ||
287 | * Do not update time when cgroup is not active | ||
288 | */ | ||
289 | if (cgrp == event->cgrp) | ||
290 | __update_cgrp_time(event->cgrp); | ||
291 | } | ||
292 | |||
293 | static inline void | ||
294 | perf_cgroup_set_timestamp(struct task_struct *task, | ||
295 | struct perf_event_context *ctx) | ||
296 | { | ||
297 | struct perf_cgroup *cgrp; | ||
298 | struct perf_cgroup_info *info; | ||
299 | |||
300 | /* | ||
301 | * ctx->lock held by caller | ||
302 | * ensure we do not access cgroup data | ||
303 | * unless we have the cgroup pinned (css_get) | ||
304 | */ | ||
305 | if (!task || !ctx->nr_cgroups) | ||
306 | return; | ||
307 | |||
308 | cgrp = perf_cgroup_from_task(task); | ||
309 | info = this_cpu_ptr(cgrp->info); | ||
310 | info->timestamp = ctx->timestamp; | ||
311 | } | ||
312 | |||
313 | #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ | ||
314 | #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ | ||
315 | |||
316 | /* | ||
317 | * reschedule events based on the cgroup constraint of task. | ||
318 | * | ||
319 | * mode SWOUT : schedule out everything | ||
320 | * mode SWIN : schedule in based on cgroup for next | ||
321 | */ | ||
322 | void perf_cgroup_switch(struct task_struct *task, int mode) | ||
323 | { | ||
324 | struct perf_cpu_context *cpuctx; | ||
325 | struct pmu *pmu; | ||
326 | unsigned long flags; | ||
327 | |||
328 | /* | ||
329 | * disable interrupts to avoid getting nr_cgroup | ||
330 | * changes via __perf_event_disable(). Also | ||
331 | * avoids preemption. | ||
332 | */ | ||
333 | local_irq_save(flags); | ||
334 | |||
335 | /* | ||
336 | * we reschedule only in the presence of cgroup | ||
337 | * constrained events. | ||
338 | */ | ||
339 | rcu_read_lock(); | ||
340 | |||
341 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
342 | |||
343 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
344 | |||
345 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
346 | |||
347 | /* | ||
348 | * perf_cgroup_events says at least one | ||
349 | * context on this CPU has cgroup events. | ||
350 | * | ||
351 | * ctx->nr_cgroups reports the number of cgroup | ||
352 | * events for a context. | ||
353 | */ | ||
354 | if (cpuctx->ctx.nr_cgroups > 0) { | ||
355 | |||
356 | if (mode & PERF_CGROUP_SWOUT) { | ||
357 | cpu_ctx_sched_out(cpuctx, EVENT_ALL); | ||
358 | /* | ||
359 | * must not be done before ctxswout due | ||
360 | * to event_filter_match() in event_sched_out() | ||
361 | */ | ||
362 | cpuctx->cgrp = NULL; | ||
363 | } | ||
364 | |||
365 | if (mode & PERF_CGROUP_SWIN) { | ||
366 | /* set cgrp before ctxsw in to | ||
367 | * allow event_filter_match() to not | ||
368 | * have to pass task around | ||
369 | */ | ||
370 | cpuctx->cgrp = perf_cgroup_from_task(task); | ||
371 | cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); | ||
372 | } | ||
373 | } | ||
374 | |||
375 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
376 | } | ||
377 | |||
378 | rcu_read_unlock(); | ||
379 | |||
380 | local_irq_restore(flags); | ||
381 | } | ||
382 | |||
383 | static inline void perf_cgroup_sched_out(struct task_struct *task) | ||
384 | { | ||
385 | perf_cgroup_switch(task, PERF_CGROUP_SWOUT); | ||
386 | } | ||
387 | |||
388 | static inline void perf_cgroup_sched_in(struct task_struct *task) | ||
389 | { | ||
390 | perf_cgroup_switch(task, PERF_CGROUP_SWIN); | ||
391 | } | ||
392 | |||
393 | static inline int perf_cgroup_connect(int fd, struct perf_event *event, | ||
394 | struct perf_event_attr *attr, | ||
395 | struct perf_event *group_leader) | ||
396 | { | ||
397 | struct perf_cgroup *cgrp; | ||
398 | struct cgroup_subsys_state *css; | ||
399 | struct file *file; | ||
400 | int ret = 0, fput_needed; | ||
401 | |||
402 | file = fget_light(fd, &fput_needed); | ||
403 | if (!file) | ||
404 | return -EBADF; | ||
405 | |||
406 | css = cgroup_css_from_dir(file, perf_subsys_id); | ||
407 | if (IS_ERR(css)) { | ||
408 | ret = PTR_ERR(css); | ||
409 | goto out; | ||
410 | } | ||
411 | |||
412 | cgrp = container_of(css, struct perf_cgroup, css); | ||
413 | event->cgrp = cgrp; | ||
414 | |||
415 | /* must be done before we fput() the file */ | ||
416 | perf_get_cgroup(event); | ||
417 | |||
418 | /* | ||
419 | * all events in a group must monitor | ||
420 | * the same cgroup because a task belongs | ||
421 | * to only one perf cgroup at a time | ||
422 | */ | ||
423 | if (group_leader && group_leader->cgrp != cgrp) { | ||
424 | perf_detach_cgroup(event); | ||
425 | ret = -EINVAL; | ||
426 | } | ||
427 | out: | ||
428 | fput_light(file, fput_needed); | ||
429 | return ret; | ||
430 | } | ||
431 | |||
432 | static inline void | ||
433 | perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) | ||
434 | { | ||
435 | struct perf_cgroup_info *t; | ||
436 | t = per_cpu_ptr(event->cgrp->info, event->cpu); | ||
437 | event->shadow_ctx_time = now - t->timestamp; | ||
438 | } | ||
439 | |||
440 | static inline void | ||
441 | perf_cgroup_defer_enabled(struct perf_event *event) | ||
442 | { | ||
443 | /* | ||
444 | * when the current task's perf cgroup does not match | ||
445 | * the event's, we need to remember to call the | ||
446 | * perf_mark_enable() function the first time a task with | ||
447 | * a matching perf cgroup is scheduled in. | ||
448 | */ | ||
449 | if (is_cgroup_event(event) && !perf_cgroup_match(event)) | ||
450 | event->cgrp_defer_enabled = 1; | ||
451 | } | ||
452 | |||
453 | static inline void | ||
454 | perf_cgroup_mark_enabled(struct perf_event *event, | ||
455 | struct perf_event_context *ctx) | ||
456 | { | ||
457 | struct perf_event *sub; | ||
458 | u64 tstamp = perf_event_time(event); | ||
459 | |||
460 | if (!event->cgrp_defer_enabled) | ||
461 | return; | ||
462 | |||
463 | event->cgrp_defer_enabled = 0; | ||
464 | |||
465 | event->tstamp_enabled = tstamp - event->total_time_enabled; | ||
466 | list_for_each_entry(sub, &event->sibling_list, group_entry) { | ||
467 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) { | ||
468 | sub->tstamp_enabled = tstamp - sub->total_time_enabled; | ||
469 | sub->cgrp_defer_enabled = 0; | ||
470 | } | ||
471 | } | ||
472 | } | ||
473 | #else /* !CONFIG_CGROUP_PERF */ | ||
474 | |||
475 | static inline bool | ||
476 | perf_cgroup_match(struct perf_event *event) | ||
477 | { | ||
478 | return true; | ||
479 | } | ||
480 | |||
481 | static inline void perf_detach_cgroup(struct perf_event *event) | ||
482 | {} | ||
483 | |||
484 | static inline int is_cgroup_event(struct perf_event *event) | ||
485 | { | ||
486 | return 0; | ||
487 | } | ||
488 | |||
489 | static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) | ||
490 | { | ||
491 | return 0; | ||
492 | } | ||
493 | |||
494 | static inline void update_cgrp_time_from_event(struct perf_event *event) | ||
495 | { | ||
496 | } | ||
497 | |||
498 | static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) | ||
499 | { | ||
500 | } | ||
501 | |||
502 | static inline void perf_cgroup_sched_out(struct task_struct *task) | ||
503 | { | ||
504 | } | ||
505 | |||
506 | static inline void perf_cgroup_sched_in(struct task_struct *task) | ||
507 | { | ||
508 | } | ||
509 | |||
510 | static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, | ||
511 | struct perf_event_attr *attr, | ||
512 | struct perf_event *group_leader) | ||
513 | { | ||
514 | return -EINVAL; | ||
515 | } | ||
516 | |||
517 | static inline void | ||
518 | perf_cgroup_set_timestamp(struct task_struct *task, | ||
519 | struct perf_event_context *ctx) | ||
520 | { | ||
521 | } | ||
522 | |||
523 | void | ||
524 | perf_cgroup_switch(struct task_struct *task, struct task_struct *next) | ||
525 | { | ||
526 | } | ||
527 | |||
528 | static inline void | ||
529 | perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) | ||
530 | { | ||
531 | } | ||
532 | |||
533 | static inline u64 perf_cgroup_event_time(struct perf_event *event) | ||
534 | { | ||
535 | return 0; | ||
536 | } | ||
537 | |||
538 | static inline void | ||
539 | perf_cgroup_defer_enabled(struct perf_event *event) | ||
540 | { | ||
541 | } | ||
542 | |||
543 | static inline void | ||
544 | perf_cgroup_mark_enabled(struct perf_event *event, | ||
545 | struct perf_event_context *ctx) | ||
546 | { | ||
547 | } | ||
548 | #endif | ||
549 | |||
92 | void perf_pmu_disable(struct pmu *pmu) | 550 | void perf_pmu_disable(struct pmu *pmu) |
93 | { | 551 | { |
94 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | 552 | int *count = this_cpu_ptr(pmu->pmu_disable_count); |
@@ -254,7 +712,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) | |||
254 | raw_spin_lock_irqsave(&ctx->lock, flags); | 712 | raw_spin_lock_irqsave(&ctx->lock, flags); |
255 | --ctx->pin_count; | 713 | --ctx->pin_count; |
256 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 714 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
257 | put_ctx(ctx); | ||
258 | } | 715 | } |
259 | 716 | ||
260 | /* | 717 | /* |
@@ -271,6 +728,10 @@ static void update_context_time(struct perf_event_context *ctx) | |||
271 | static u64 perf_event_time(struct perf_event *event) | 728 | static u64 perf_event_time(struct perf_event *event) |
272 | { | 729 | { |
273 | struct perf_event_context *ctx = event->ctx; | 730 | struct perf_event_context *ctx = event->ctx; |
731 | |||
732 | if (is_cgroup_event(event)) | ||
733 | return perf_cgroup_event_time(event); | ||
734 | |||
274 | return ctx ? ctx->time : 0; | 735 | return ctx ? ctx->time : 0; |
275 | } | 736 | } |
276 | 737 | ||
@@ -285,9 +746,20 @@ static void update_event_times(struct perf_event *event) | |||
285 | if (event->state < PERF_EVENT_STATE_INACTIVE || | 746 | if (event->state < PERF_EVENT_STATE_INACTIVE || |
286 | event->group_leader->state < PERF_EVENT_STATE_INACTIVE) | 747 | event->group_leader->state < PERF_EVENT_STATE_INACTIVE) |
287 | return; | 748 | return; |
288 | 749 | /* | |
289 | if (ctx->is_active) | 750 | * in cgroup mode, time_enabled represents |
751 | * the time the event was enabled AND active | ||
752 | * tasks were in the monitored cgroup. This is | ||
753 | * independent of the activity of the context as | ||
754 | * there may be a mix of cgroup and non-cgroup events. | ||
755 | * | ||
756 | * That is why we treat cgroup events differently | ||
757 | * here. | ||
758 | */ | ||
759 | if (is_cgroup_event(event)) | ||
290 | run_end = perf_event_time(event); | 760 | run_end = perf_event_time(event); |
761 | else if (ctx->is_active) | ||
762 | run_end = ctx->time; | ||
291 | else | 763 | else |
292 | run_end = event->tstamp_stopped; | 764 | run_end = event->tstamp_stopped; |
293 | 765 | ||
@@ -299,6 +771,7 @@ static void update_event_times(struct perf_event *event) | |||
299 | run_end = perf_event_time(event); | 771 | run_end = perf_event_time(event); |
300 | 772 | ||
301 | event->total_time_running = run_end - event->tstamp_running; | 773 | event->total_time_running = run_end - event->tstamp_running; |
774 | |||
302 | } | 775 | } |
303 | 776 | ||
304 | /* | 777 | /* |
@@ -347,6 +820,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
347 | list_add_tail(&event->group_entry, list); | 820 | list_add_tail(&event->group_entry, list); |
348 | } | 821 | } |
349 | 822 | ||
823 | if (is_cgroup_event(event)) | ||
824 | ctx->nr_cgroups++; | ||
825 | |||
350 | list_add_rcu(&event->event_entry, &ctx->event_list); | 826 | list_add_rcu(&event->event_entry, &ctx->event_list); |
351 | if (!ctx->nr_events) | 827 | if (!ctx->nr_events) |
352 | perf_pmu_rotate_start(ctx->pmu); | 828 | perf_pmu_rotate_start(ctx->pmu); |
@@ -473,6 +949,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
473 | 949 | ||
474 | event->attach_state &= ~PERF_ATTACH_CONTEXT; | 950 | event->attach_state &= ~PERF_ATTACH_CONTEXT; |
475 | 951 | ||
952 | if (is_cgroup_event(event)) | ||
953 | ctx->nr_cgroups--; | ||
954 | |||
476 | ctx->nr_events--; | 955 | ctx->nr_events--; |
477 | if (event->attr.inherit_stat) | 956 | if (event->attr.inherit_stat) |
478 | ctx->nr_stat--; | 957 | ctx->nr_stat--; |
@@ -544,7 +1023,8 @@ out: | |||
544 | static inline int | 1023 | static inline int |
545 | event_filter_match(struct perf_event *event) | 1024 | event_filter_match(struct perf_event *event) |
546 | { | 1025 | { |
547 | return event->cpu == -1 || event->cpu == smp_processor_id(); | 1026 | return (event->cpu == -1 || event->cpu == smp_processor_id()) |
1027 | && perf_cgroup_match(event); | ||
548 | } | 1028 | } |
549 | 1029 | ||
550 | static void | 1030 | static void |
@@ -562,7 +1042,7 @@ event_sched_out(struct perf_event *event, | |||
562 | */ | 1042 | */ |
563 | if (event->state == PERF_EVENT_STATE_INACTIVE | 1043 | if (event->state == PERF_EVENT_STATE_INACTIVE |
564 | && !event_filter_match(event)) { | 1044 | && !event_filter_match(event)) { |
565 | delta = ctx->time - event->tstamp_stopped; | 1045 | delta = tstamp - event->tstamp_stopped; |
566 | event->tstamp_running += delta; | 1046 | event->tstamp_running += delta; |
567 | event->tstamp_stopped = tstamp; | 1047 | event->tstamp_stopped = tstamp; |
568 | } | 1048 | } |
@@ -606,47 +1086,30 @@ group_sched_out(struct perf_event *group_event, | |||
606 | cpuctx->exclusive = 0; | 1086 | cpuctx->exclusive = 0; |
607 | } | 1087 | } |
608 | 1088 | ||
609 | static inline struct perf_cpu_context * | ||
610 | __get_cpu_context(struct perf_event_context *ctx) | ||
611 | { | ||
612 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
613 | } | ||
614 | |||
615 | /* | 1089 | /* |
616 | * Cross CPU call to remove a performance event | 1090 | * Cross CPU call to remove a performance event |
617 | * | 1091 | * |
618 | * We disable the event on the hardware level first. After that we | 1092 | * We disable the event on the hardware level first. After that we |
619 | * remove it from the context list. | 1093 | * remove it from the context list. |
620 | */ | 1094 | */ |
621 | static void __perf_event_remove_from_context(void *info) | 1095 | static int __perf_remove_from_context(void *info) |
622 | { | 1096 | { |
623 | struct perf_event *event = info; | 1097 | struct perf_event *event = info; |
624 | struct perf_event_context *ctx = event->ctx; | 1098 | struct perf_event_context *ctx = event->ctx; |
625 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 1099 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
626 | 1100 | ||
627 | /* | ||
628 | * If this is a task context, we need to check whether it is | ||
629 | * the current task context of this cpu. If not it has been | ||
630 | * scheduled out before the smp call arrived. | ||
631 | */ | ||
632 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
633 | return; | ||
634 | |||
635 | raw_spin_lock(&ctx->lock); | 1101 | raw_spin_lock(&ctx->lock); |
636 | |||
637 | event_sched_out(event, cpuctx, ctx); | 1102 | event_sched_out(event, cpuctx, ctx); |
638 | |||
639 | list_del_event(event, ctx); | 1103 | list_del_event(event, ctx); |
640 | |||
641 | raw_spin_unlock(&ctx->lock); | 1104 | raw_spin_unlock(&ctx->lock); |
1105 | |||
1106 | return 0; | ||
642 | } | 1107 | } |
643 | 1108 | ||
644 | 1109 | ||
645 | /* | 1110 | /* |
646 | * Remove the event from a task's (or a CPU's) list of events. | 1111 | * Remove the event from a task's (or a CPU's) list of events. |
647 | * | 1112 | * |
648 | * Must be called with ctx->mutex held. | ||
649 | * | ||
650 | * CPU events are removed with a smp call. For task events we only | 1113 | * CPU events are removed with a smp call. For task events we only |
651 | * call when the task is on a CPU. | 1114 | * call when the task is on a CPU. |
652 | * | 1115 | * |
@@ -657,49 +1120,48 @@ static void __perf_event_remove_from_context(void *info) | |||
657 | * When called from perf_event_exit_task, it's OK because the | 1120 | * When called from perf_event_exit_task, it's OK because the |
658 | * context has been detached from its task. | 1121 | * context has been detached from its task. |
659 | */ | 1122 | */ |
660 | static void perf_event_remove_from_context(struct perf_event *event) | 1123 | static void perf_remove_from_context(struct perf_event *event) |
661 | { | 1124 | { |
662 | struct perf_event_context *ctx = event->ctx; | 1125 | struct perf_event_context *ctx = event->ctx; |
663 | struct task_struct *task = ctx->task; | 1126 | struct task_struct *task = ctx->task; |
664 | 1127 | ||
1128 | lockdep_assert_held(&ctx->mutex); | ||
1129 | |||
665 | if (!task) { | 1130 | if (!task) { |
666 | /* | 1131 | /* |
667 | * Per cpu events are removed via an smp call and | 1132 | * Per cpu events are removed via an smp call and |
668 | * the removal is always successful. | 1133 | * the removal is always successful. |
669 | */ | 1134 | */ |
670 | smp_call_function_single(event->cpu, | 1135 | cpu_function_call(event->cpu, __perf_remove_from_context, event); |
671 | __perf_event_remove_from_context, | ||
672 | event, 1); | ||
673 | return; | 1136 | return; |
674 | } | 1137 | } |
675 | 1138 | ||
676 | retry: | 1139 | retry: |
677 | task_oncpu_function_call(task, __perf_event_remove_from_context, | 1140 | if (!task_function_call(task, __perf_remove_from_context, event)) |
678 | event); | 1141 | return; |
679 | 1142 | ||
680 | raw_spin_lock_irq(&ctx->lock); | 1143 | raw_spin_lock_irq(&ctx->lock); |
681 | /* | 1144 | /* |
682 | * If the context is active we need to retry the smp call. | 1145 | * If we failed to find a running task, but find the context active now |
1146 | * that we've acquired the ctx->lock, retry. | ||
683 | */ | 1147 | */ |
684 | if (ctx->nr_active && !list_empty(&event->group_entry)) { | 1148 | if (ctx->is_active) { |
685 | raw_spin_unlock_irq(&ctx->lock); | 1149 | raw_spin_unlock_irq(&ctx->lock); |
686 | goto retry; | 1150 | goto retry; |
687 | } | 1151 | } |
688 | 1152 | ||
689 | /* | 1153 | /* |
690 | * The lock prevents that this context is scheduled in so we | 1154 | * Since the task isn't running, it's safe to remove the event, us |
691 | * can remove the event safely, if the call above did not | 1155 | * holding the ctx->lock ensures the task won't get scheduled in. |
692 | * succeed. | ||
693 | */ | 1156 | */ |
694 | if (!list_empty(&event->group_entry)) | 1157 | list_del_event(event, ctx); |
695 | list_del_event(event, ctx); | ||
696 | raw_spin_unlock_irq(&ctx->lock); | 1158 | raw_spin_unlock_irq(&ctx->lock); |
697 | } | 1159 | } |
698 | 1160 | ||
699 | /* | 1161 | /* |
700 | * Cross CPU call to disable a performance event | 1162 | * Cross CPU call to disable a performance event |
701 | */ | 1163 | */ |
702 | static void __perf_event_disable(void *info) | 1164 | static int __perf_event_disable(void *info) |
703 | { | 1165 | { |
704 | struct perf_event *event = info; | 1166 | struct perf_event *event = info; |
705 | struct perf_event_context *ctx = event->ctx; | 1167 | struct perf_event_context *ctx = event->ctx; |
@@ -708,9 +1170,12 @@ static void __perf_event_disable(void *info) | |||
708 | /* | 1170 | /* |
709 | * If this is a per-task event, need to check whether this | 1171 | * If this is a per-task event, need to check whether this |
710 | * event's task is the current task on this cpu. | 1172 | * event's task is the current task on this cpu. |
1173 | * | ||
1174 | * Can trigger due to concurrent perf_event_context_sched_out() | ||
1175 | * flipping contexts around. | ||
711 | */ | 1176 | */ |
712 | if (ctx->task && cpuctx->task_ctx != ctx) | 1177 | if (ctx->task && cpuctx->task_ctx != ctx) |
713 | return; | 1178 | return -EINVAL; |
714 | 1179 | ||
715 | raw_spin_lock(&ctx->lock); | 1180 | raw_spin_lock(&ctx->lock); |
716 | 1181 | ||
@@ -720,6 +1185,7 @@ static void __perf_event_disable(void *info) | |||
720 | */ | 1185 | */ |
721 | if (event->state >= PERF_EVENT_STATE_INACTIVE) { | 1186 | if (event->state >= PERF_EVENT_STATE_INACTIVE) { |
722 | update_context_time(ctx); | 1187 | update_context_time(ctx); |
1188 | update_cgrp_time_from_event(event); | ||
723 | update_group_times(event); | 1189 | update_group_times(event); |
724 | if (event == event->group_leader) | 1190 | if (event == event->group_leader) |
725 | group_sched_out(event, cpuctx, ctx); | 1191 | group_sched_out(event, cpuctx, ctx); |
@@ -729,6 +1195,8 @@ static void __perf_event_disable(void *info) | |||
729 | } | 1195 | } |
730 | 1196 | ||
731 | raw_spin_unlock(&ctx->lock); | 1197 | raw_spin_unlock(&ctx->lock); |
1198 | |||
1199 | return 0; | ||
732 | } | 1200 | } |
733 | 1201 | ||
734 | /* | 1202 | /* |
@@ -753,13 +1221,13 @@ void perf_event_disable(struct perf_event *event) | |||
753 | /* | 1221 | /* |
754 | * Disable the event on the cpu that it's on | 1222 | * Disable the event on the cpu that it's on |
755 | */ | 1223 | */ |
756 | smp_call_function_single(event->cpu, __perf_event_disable, | 1224 | cpu_function_call(event->cpu, __perf_event_disable, event); |
757 | event, 1); | ||
758 | return; | 1225 | return; |
759 | } | 1226 | } |
760 | 1227 | ||
761 | retry: | 1228 | retry: |
762 | task_oncpu_function_call(task, __perf_event_disable, event); | 1229 | if (!task_function_call(task, __perf_event_disable, event)) |
1230 | return; | ||
763 | 1231 | ||
764 | raw_spin_lock_irq(&ctx->lock); | 1232 | raw_spin_lock_irq(&ctx->lock); |
765 | /* | 1233 | /* |
@@ -767,6 +1235,11 @@ retry: | |||
767 | */ | 1235 | */ |
768 | if (event->state == PERF_EVENT_STATE_ACTIVE) { | 1236 | if (event->state == PERF_EVENT_STATE_ACTIVE) { |
769 | raw_spin_unlock_irq(&ctx->lock); | 1237 | raw_spin_unlock_irq(&ctx->lock); |
1238 | /* | ||
1239 | * Reload the task pointer, it might have been changed by | ||
1240 | * a concurrent perf_event_context_sched_out(). | ||
1241 | */ | ||
1242 | task = ctx->task; | ||
770 | goto retry; | 1243 | goto retry; |
771 | } | 1244 | } |
772 | 1245 | ||
@@ -778,10 +1251,44 @@ retry: | |||
778 | update_group_times(event); | 1251 | update_group_times(event); |
779 | event->state = PERF_EVENT_STATE_OFF; | 1252 | event->state = PERF_EVENT_STATE_OFF; |
780 | } | 1253 | } |
781 | |||
782 | raw_spin_unlock_irq(&ctx->lock); | 1254 | raw_spin_unlock_irq(&ctx->lock); |
783 | } | 1255 | } |
784 | 1256 | ||
1257 | static void perf_set_shadow_time(struct perf_event *event, | ||
1258 | struct perf_event_context *ctx, | ||
1259 | u64 tstamp) | ||
1260 | { | ||
1261 | /* | ||
1262 | * use the correct time source for the time snapshot | ||
1263 | * | ||
1264 | * We could get by without this by leveraging the | ||
1265 | * fact that to get to this function, the caller | ||
1266 | * has most likely already called update_context_time() | ||
1267 | * and update_cgrp_time_xx() and thus both timestamps | ||
1268 | * are identical (or very close). Given that tstamp is | ||
1269 | * already adjusted for cgroup, we could say that: | ||
1270 | * tstamp - ctx->timestamp | ||
1271 | * is equivalent to | ||
1272 | * tstamp - cgrp->timestamp. | ||
1273 | * | ||
1274 | * Then, in perf_output_read(), the calculation would | ||
1275 | * work with no changes because: | ||
1276 | * - event is guaranteed scheduled in | ||
1277 | * - no schedule-out in between | ||
1278 | * - thus the timestamp would be the same | ||
1279 | * | ||
1280 | * But this is a bit hairy. | ||
1281 | * | ||
1282 | * So instead, we have an explicit cgroup call to remain | ||
1283 | * within the same time source all along. We believe it | ||
1284 | * is cleaner and simpler to understand. | ||
1285 | */ | ||
1286 | if (is_cgroup_event(event)) | ||
1287 | perf_cgroup_set_shadow_time(event, tstamp); | ||
1288 | else | ||
1289 | event->shadow_ctx_time = tstamp - ctx->timestamp; | ||
1290 | } | ||
1291 | |||
785 | #define MAX_INTERRUPTS (~0ULL) | 1292 | #define MAX_INTERRUPTS (~0ULL) |
786 | 1293 | ||
787 | static void perf_log_throttle(struct perf_event *event, int enable); | 1294 | static void perf_log_throttle(struct perf_event *event, int enable); |
@@ -822,7 +1329,7 @@ event_sched_in(struct perf_event *event, | |||
822 | 1329 | ||
823 | event->tstamp_running += tstamp - event->tstamp_stopped; | 1330 | event->tstamp_running += tstamp - event->tstamp_stopped; |
824 | 1331 | ||
825 | event->shadow_ctx_time = tstamp - ctx->timestamp; | 1332 | perf_set_shadow_time(event, ctx, tstamp); |
826 | 1333 | ||
827 | if (!is_software_event(event)) | 1334 | if (!is_software_event(event)) |
828 | cpuctx->active_oncpu++; | 1335 | cpuctx->active_oncpu++; |
@@ -943,12 +1450,15 @@ static void add_event_to_ctx(struct perf_event *event, | |||
943 | event->tstamp_stopped = tstamp; | 1450 | event->tstamp_stopped = tstamp; |
944 | } | 1451 | } |
945 | 1452 | ||
1453 | static void perf_event_context_sched_in(struct perf_event_context *ctx, | ||
1454 | struct task_struct *tsk); | ||
1455 | |||
946 | /* | 1456 | /* |
947 | * Cross CPU call to install and enable a performance event | 1457 | * Cross CPU call to install and enable a performance event |
948 | * | 1458 | * |
949 | * Must be called with ctx->mutex held | 1459 | * Must be called with ctx->mutex held |
950 | */ | 1460 | */ |
951 | static void __perf_install_in_context(void *info) | 1461 | static int __perf_install_in_context(void *info) |
952 | { | 1462 | { |
953 | struct perf_event *event = info; | 1463 | struct perf_event *event = info; |
954 | struct perf_event_context *ctx = event->ctx; | 1464 | struct perf_event_context *ctx = event->ctx; |
@@ -957,21 +1467,22 @@ static void __perf_install_in_context(void *info) | |||
957 | int err; | 1467 | int err; |
958 | 1468 | ||
959 | /* | 1469 | /* |
960 | * If this is a task context, we need to check whether it is | 1470 | * In case we're installing a new context to an already running task, |
961 | * the current task context of this cpu. If not it has been | 1471 | * could also happen before perf_event_task_sched_in() on architectures |
962 | * scheduled out before the smp call arrived. | 1472 | * which do context switches with IRQs enabled. |
963 | * Or possibly this is the right context but it isn't | ||
964 | * on this cpu because it had no events. | ||
965 | */ | 1473 | */ |
966 | if (ctx->task && cpuctx->task_ctx != ctx) { | 1474 | if (ctx->task && !cpuctx->task_ctx) |
967 | if (cpuctx->task_ctx || ctx->task != current) | 1475 | perf_event_context_sched_in(ctx, ctx->task); |
968 | return; | ||
969 | cpuctx->task_ctx = ctx; | ||
970 | } | ||
971 | 1476 | ||
972 | raw_spin_lock(&ctx->lock); | 1477 | raw_spin_lock(&ctx->lock); |
973 | ctx->is_active = 1; | 1478 | ctx->is_active = 1; |
974 | update_context_time(ctx); | 1479 | update_context_time(ctx); |
1480 | /* | ||
1481 | * update cgrp time only if current cgrp | ||
1482 | * matches event->cgrp. Must be done before | ||
1483 | * calling add_event_to_ctx() | ||
1484 | */ | ||
1485 | update_cgrp_time_from_event(event); | ||
975 | 1486 | ||
976 | add_event_to_ctx(event, ctx); | 1487 | add_event_to_ctx(event, ctx); |
977 | 1488 | ||
@@ -1012,6 +1523,8 @@ static void __perf_install_in_context(void *info) | |||
1012 | 1523 | ||
1013 | unlock: | 1524 | unlock: |
1014 | raw_spin_unlock(&ctx->lock); | 1525 | raw_spin_unlock(&ctx->lock); |
1526 | |||
1527 | return 0; | ||
1015 | } | 1528 | } |
1016 | 1529 | ||
1017 | /* | 1530 | /* |
@@ -1023,8 +1536,6 @@ unlock: | |||
1023 | * If the event is attached to a task which is on a CPU we use a smp | 1536 | * If the event is attached to a task which is on a CPU we use a smp |
1024 | * call to enable it in the task context. The task might have been | 1537 | * call to enable it in the task context. The task might have been |
1025 | * scheduled away, but we check this in the smp call again. | 1538 | * scheduled away, but we check this in the smp call again. |
1026 | * | ||
1027 | * Must be called with ctx->mutex held. | ||
1028 | */ | 1539 | */ |
1029 | static void | 1540 | static void |
1030 | perf_install_in_context(struct perf_event_context *ctx, | 1541 | perf_install_in_context(struct perf_event_context *ctx, |
@@ -1033,6 +1544,8 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
1033 | { | 1544 | { |
1034 | struct task_struct *task = ctx->task; | 1545 | struct task_struct *task = ctx->task; |
1035 | 1546 | ||
1547 | lockdep_assert_held(&ctx->mutex); | ||
1548 | |||
1036 | event->ctx = ctx; | 1549 | event->ctx = ctx; |
1037 | 1550 | ||
1038 | if (!task) { | 1551 | if (!task) { |
@@ -1040,31 +1553,29 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
1040 | * Per cpu events are installed via an smp call and | 1553 | * Per cpu events are installed via an smp call and |
1041 | * the install is always successful. | 1554 | * the install is always successful. |
1042 | */ | 1555 | */ |
1043 | smp_call_function_single(cpu, __perf_install_in_context, | 1556 | cpu_function_call(cpu, __perf_install_in_context, event); |
1044 | event, 1); | ||
1045 | return; | 1557 | return; |
1046 | } | 1558 | } |
1047 | 1559 | ||
1048 | retry: | 1560 | retry: |
1049 | task_oncpu_function_call(task, __perf_install_in_context, | 1561 | if (!task_function_call(task, __perf_install_in_context, event)) |
1050 | event); | 1562 | return; |
1051 | 1563 | ||
1052 | raw_spin_lock_irq(&ctx->lock); | 1564 | raw_spin_lock_irq(&ctx->lock); |
1053 | /* | 1565 | /* |
1054 | * we need to retry the smp call. | 1566 | * If we failed to find a running task, but find the context active now |
1567 | * that we've acquired the ctx->lock, retry. | ||
1055 | */ | 1568 | */ |
1056 | if (ctx->is_active && list_empty(&event->group_entry)) { | 1569 | if (ctx->is_active) { |
1057 | raw_spin_unlock_irq(&ctx->lock); | 1570 | raw_spin_unlock_irq(&ctx->lock); |
1058 | goto retry; | 1571 | goto retry; |
1059 | } | 1572 | } |
1060 | 1573 | ||
1061 | /* | 1574 | /* |
1062 | * The lock prevents that this context is scheduled in so we | 1575 | * Since the task isn't running, it's safe to add the event, us holding |
1063 | * can add the event safely, if it the call above did not | 1576 | * the ctx->lock ensures the task won't get scheduled in. |
1064 | * succeed. | ||
1065 | */ | 1577 | */ |
1066 | if (list_empty(&event->group_entry)) | 1578 | add_event_to_ctx(event, ctx); |
1067 | add_event_to_ctx(event, ctx); | ||
1068 | raw_spin_unlock_irq(&ctx->lock); | 1579 | raw_spin_unlock_irq(&ctx->lock); |
1069 | } | 1580 | } |
1070 | 1581 | ||
@@ -1093,7 +1604,7 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
1093 | /* | 1604 | /* |
1094 | * Cross CPU call to enable a performance event | 1605 | * Cross CPU call to enable a performance event |
1095 | */ | 1606 | */ |
1096 | static void __perf_event_enable(void *info) | 1607 | static int __perf_event_enable(void *info) |
1097 | { | 1608 | { |
1098 | struct perf_event *event = info; | 1609 | struct perf_event *event = info; |
1099 | struct perf_event_context *ctx = event->ctx; | 1610 | struct perf_event_context *ctx = event->ctx; |
@@ -1101,26 +1612,27 @@ static void __perf_event_enable(void *info) | |||
1101 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 1612 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
1102 | int err; | 1613 | int err; |
1103 | 1614 | ||
1104 | /* | 1615 | if (WARN_ON_ONCE(!ctx->is_active)) |
1105 | * If this is a per-task event, need to check whether this | 1616 | return -EINVAL; |
1106 | * event's task is the current task on this cpu. | ||
1107 | */ | ||
1108 | if (ctx->task && cpuctx->task_ctx != ctx) { | ||
1109 | if (cpuctx->task_ctx || ctx->task != current) | ||
1110 | return; | ||
1111 | cpuctx->task_ctx = ctx; | ||
1112 | } | ||
1113 | 1617 | ||
1114 | raw_spin_lock(&ctx->lock); | 1618 | raw_spin_lock(&ctx->lock); |
1115 | ctx->is_active = 1; | ||
1116 | update_context_time(ctx); | 1619 | update_context_time(ctx); |
1117 | 1620 | ||
1118 | if (event->state >= PERF_EVENT_STATE_INACTIVE) | 1621 | if (event->state >= PERF_EVENT_STATE_INACTIVE) |
1119 | goto unlock; | 1622 | goto unlock; |
1623 | |||
1624 | /* | ||
1625 | * set current task's cgroup time reference point | ||
1626 | */ | ||
1627 | perf_cgroup_set_timestamp(current, ctx); | ||
1628 | |||
1120 | __perf_event_mark_enabled(event, ctx); | 1629 | __perf_event_mark_enabled(event, ctx); |
1121 | 1630 | ||
1122 | if (!event_filter_match(event)) | 1631 | if (!event_filter_match(event)) { |
1632 | if (is_cgroup_event(event)) | ||
1633 | perf_cgroup_defer_enabled(event); | ||
1123 | goto unlock; | 1634 | goto unlock; |
1635 | } | ||
1124 | 1636 | ||
1125 | /* | 1637 | /* |
1126 | * If the event is in a group and isn't the group leader, | 1638 | * If the event is in a group and isn't the group leader, |
@@ -1153,6 +1665,8 @@ static void __perf_event_enable(void *info) | |||
1153 | 1665 | ||
1154 | unlock: | 1666 | unlock: |
1155 | raw_spin_unlock(&ctx->lock); | 1667 | raw_spin_unlock(&ctx->lock); |
1668 | |||
1669 | return 0; | ||
1156 | } | 1670 | } |
1157 | 1671 | ||
1158 | /* | 1672 | /* |
@@ -1173,8 +1687,7 @@ void perf_event_enable(struct perf_event *event) | |||
1173 | /* | 1687 | /* |
1174 | * Enable the event on the cpu that it's on | 1688 | * Enable the event on the cpu that it's on |
1175 | */ | 1689 | */ |
1176 | smp_call_function_single(event->cpu, __perf_event_enable, | 1690 | cpu_function_call(event->cpu, __perf_event_enable, event); |
1177 | event, 1); | ||
1178 | return; | 1691 | return; |
1179 | } | 1692 | } |
1180 | 1693 | ||
@@ -1193,8 +1706,15 @@ void perf_event_enable(struct perf_event *event) | |||
1193 | event->state = PERF_EVENT_STATE_OFF; | 1706 | event->state = PERF_EVENT_STATE_OFF; |
1194 | 1707 | ||
1195 | retry: | 1708 | retry: |
1709 | if (!ctx->is_active) { | ||
1710 | __perf_event_mark_enabled(event, ctx); | ||
1711 | goto out; | ||
1712 | } | ||
1713 | |||
1196 | raw_spin_unlock_irq(&ctx->lock); | 1714 | raw_spin_unlock_irq(&ctx->lock); |
1197 | task_oncpu_function_call(task, __perf_event_enable, event); | 1715 | |
1716 | if (!task_function_call(task, __perf_event_enable, event)) | ||
1717 | return; | ||
1198 | 1718 | ||
1199 | raw_spin_lock_irq(&ctx->lock); | 1719 | raw_spin_lock_irq(&ctx->lock); |
1200 | 1720 | ||
@@ -1202,15 +1722,14 @@ retry: | |||
1202 | * If the context is active and the event is still off, | 1722 | * If the context is active and the event is still off, |
1203 | * we need to retry the cross-call. | 1723 | * we need to retry the cross-call. |
1204 | */ | 1724 | */ |
1205 | if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) | 1725 | if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { |
1726 | /* | ||
1727 | * task could have been flipped by a concurrent | ||
1728 | * perf_event_context_sched_out() | ||
1729 | */ | ||
1730 | task = ctx->task; | ||
1206 | goto retry; | 1731 | goto retry; |
1207 | 1732 | } | |
1208 | /* | ||
1209 | * Since we have the lock this context can't be scheduled | ||
1210 | * in, so we can change the state safely. | ||
1211 | */ | ||
1212 | if (event->state == PERF_EVENT_STATE_OFF) | ||
1213 | __perf_event_mark_enabled(event, ctx); | ||
1214 | 1733 | ||
1215 | out: | 1734 | out: |
1216 | raw_spin_unlock_irq(&ctx->lock); | 1735 | raw_spin_unlock_irq(&ctx->lock); |
@@ -1242,6 +1761,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
1242 | if (likely(!ctx->nr_events)) | 1761 | if (likely(!ctx->nr_events)) |
1243 | goto out; | 1762 | goto out; |
1244 | update_context_time(ctx); | 1763 | update_context_time(ctx); |
1764 | update_cgrp_time_from_cpuctx(cpuctx); | ||
1245 | 1765 | ||
1246 | if (!ctx->nr_active) | 1766 | if (!ctx->nr_active) |
1247 | goto out; | 1767 | goto out; |
@@ -1354,8 +1874,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, | |||
1354 | } | 1874 | } |
1355 | } | 1875 | } |
1356 | 1876 | ||
1357 | void perf_event_context_sched_out(struct task_struct *task, int ctxn, | 1877 | static void perf_event_context_sched_out(struct task_struct *task, int ctxn, |
1358 | struct task_struct *next) | 1878 | struct task_struct *next) |
1359 | { | 1879 | { |
1360 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; | 1880 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; |
1361 | struct perf_event_context *next_ctx; | 1881 | struct perf_event_context *next_ctx; |
@@ -1431,6 +1951,14 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
1431 | 1951 | ||
1432 | for_each_task_context_nr(ctxn) | 1952 | for_each_task_context_nr(ctxn) |
1433 | perf_event_context_sched_out(task, ctxn, next); | 1953 | perf_event_context_sched_out(task, ctxn, next); |
1954 | |||
1955 | /* | ||
1956 | * if cgroup events exist on this CPU, then we need | ||
1957 | * to check if we have to switch out PMU state. | ||
1958 | * cgroup events are system-wide mode only | ||
1959 | */ | ||
1960 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | ||
1961 | perf_cgroup_sched_out(task); | ||
1434 | } | 1962 | } |
1435 | 1963 | ||
1436 | static void task_ctx_sched_out(struct perf_event_context *ctx, | 1964 | static void task_ctx_sched_out(struct perf_event_context *ctx, |
@@ -1469,6 +1997,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, | |||
1469 | if (!event_filter_match(event)) | 1997 | if (!event_filter_match(event)) |
1470 | continue; | 1998 | continue; |
1471 | 1999 | ||
2000 | /* may need to reset tstamp_enabled */ | ||
2001 | if (is_cgroup_event(event)) | ||
2002 | perf_cgroup_mark_enabled(event, ctx); | ||
2003 | |||
1472 | if (group_can_go_on(event, cpuctx, 1)) | 2004 | if (group_can_go_on(event, cpuctx, 1)) |
1473 | group_sched_in(event, cpuctx, ctx); | 2005 | group_sched_in(event, cpuctx, ctx); |
1474 | 2006 | ||
@@ -1501,6 +2033,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
1501 | if (!event_filter_match(event)) | 2033 | if (!event_filter_match(event)) |
1502 | continue; | 2034 | continue; |
1503 | 2035 | ||
2036 | /* may need to reset tstamp_enabled */ | ||
2037 | if (is_cgroup_event(event)) | ||
2038 | perf_cgroup_mark_enabled(event, ctx); | ||
2039 | |||
1504 | if (group_can_go_on(event, cpuctx, can_add_hw)) { | 2040 | if (group_can_go_on(event, cpuctx, can_add_hw)) { |
1505 | if (group_sched_in(event, cpuctx, ctx)) | 2041 | if (group_sched_in(event, cpuctx, ctx)) |
1506 | can_add_hw = 0; | 2042 | can_add_hw = 0; |
@@ -1511,15 +2047,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
1511 | static void | 2047 | static void |
1512 | ctx_sched_in(struct perf_event_context *ctx, | 2048 | ctx_sched_in(struct perf_event_context *ctx, |
1513 | struct perf_cpu_context *cpuctx, | 2049 | struct perf_cpu_context *cpuctx, |
1514 | enum event_type_t event_type) | 2050 | enum event_type_t event_type, |
2051 | struct task_struct *task) | ||
1515 | { | 2052 | { |
2053 | u64 now; | ||
2054 | |||
1516 | raw_spin_lock(&ctx->lock); | 2055 | raw_spin_lock(&ctx->lock); |
1517 | ctx->is_active = 1; | 2056 | ctx->is_active = 1; |
1518 | if (likely(!ctx->nr_events)) | 2057 | if (likely(!ctx->nr_events)) |
1519 | goto out; | 2058 | goto out; |
1520 | 2059 | ||
1521 | ctx->timestamp = perf_clock(); | 2060 | now = perf_clock(); |
1522 | 2061 | ctx->timestamp = now; | |
2062 | perf_cgroup_set_timestamp(task, ctx); | ||
1523 | /* | 2063 | /* |
1524 | * First go through the list and put on any pinned groups | 2064 | * First go through the list and put on any pinned groups |
1525 | * in order to give them the best chance of going on. | 2065 | * in order to give them the best chance of going on. |
@@ -1536,11 +2076,12 @@ out: | |||
1536 | } | 2076 | } |
1537 | 2077 | ||
1538 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | 2078 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, |
1539 | enum event_type_t event_type) | 2079 | enum event_type_t event_type, |
2080 | struct task_struct *task) | ||
1540 | { | 2081 | { |
1541 | struct perf_event_context *ctx = &cpuctx->ctx; | 2082 | struct perf_event_context *ctx = &cpuctx->ctx; |
1542 | 2083 | ||
1543 | ctx_sched_in(ctx, cpuctx, event_type); | 2084 | ctx_sched_in(ctx, cpuctx, event_type, task); |
1544 | } | 2085 | } |
1545 | 2086 | ||
1546 | static void task_ctx_sched_in(struct perf_event_context *ctx, | 2087 | static void task_ctx_sched_in(struct perf_event_context *ctx, |
@@ -1548,15 +2089,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, | |||
1548 | { | 2089 | { |
1549 | struct perf_cpu_context *cpuctx; | 2090 | struct perf_cpu_context *cpuctx; |
1550 | 2091 | ||
1551 | cpuctx = __get_cpu_context(ctx); | 2092 | cpuctx = __get_cpu_context(ctx); |
1552 | if (cpuctx->task_ctx == ctx) | 2093 | if (cpuctx->task_ctx == ctx) |
1553 | return; | 2094 | return; |
1554 | 2095 | ||
1555 | ctx_sched_in(ctx, cpuctx, event_type); | 2096 | ctx_sched_in(ctx, cpuctx, event_type, NULL); |
1556 | cpuctx->task_ctx = ctx; | 2097 | cpuctx->task_ctx = ctx; |
1557 | } | 2098 | } |
1558 | 2099 | ||
1559 | void perf_event_context_sched_in(struct perf_event_context *ctx) | 2100 | static void perf_event_context_sched_in(struct perf_event_context *ctx, |
2101 | struct task_struct *task) | ||
1560 | { | 2102 | { |
1561 | struct perf_cpu_context *cpuctx; | 2103 | struct perf_cpu_context *cpuctx; |
1562 | 2104 | ||
@@ -1572,9 +2114,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx) | |||
1572 | */ | 2114 | */ |
1573 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2115 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
1574 | 2116 | ||
1575 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED); | 2117 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); |
1576 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 2118 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); |
1577 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); | 2119 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); |
1578 | 2120 | ||
1579 | cpuctx->task_ctx = ctx; | 2121 | cpuctx->task_ctx = ctx; |
1580 | 2122 | ||
@@ -1607,8 +2149,15 @@ void __perf_event_task_sched_in(struct task_struct *task) | |||
1607 | if (likely(!ctx)) | 2149 | if (likely(!ctx)) |
1608 | continue; | 2150 | continue; |
1609 | 2151 | ||
1610 | perf_event_context_sched_in(ctx); | 2152 | perf_event_context_sched_in(ctx, task); |
1611 | } | 2153 | } |
2154 | /* | ||
2155 | * if cgroup events exist on this CPU, then we need | ||
2156 | * to check if we have to switch in PMU state. | ||
2157 | * cgroup events are system-wide mode only | ||
2158 | */ | ||
2159 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | ||
2160 | perf_cgroup_sched_in(task); | ||
1612 | } | 2161 | } |
1613 | 2162 | ||
1614 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 2163 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) |
@@ -1638,7 +2187,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | |||
1638 | * Reduce accuracy by one bit such that @a and @b converge | 2187 | * Reduce accuracy by one bit such that @a and @b converge |
1639 | * to a similar magnitude. | 2188 | * to a similar magnitude. |
1640 | */ | 2189 | */ |
1641 | #define REDUCE_FLS(a, b) \ | 2190 | #define REDUCE_FLS(a, b) \ |
1642 | do { \ | 2191 | do { \ |
1643 | if (a##_fls > b##_fls) { \ | 2192 | if (a##_fls > b##_fls) { \ |
1644 | a >>= 1; \ | 2193 | a >>= 1; \ |
@@ -1808,7 +2357,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
1808 | if (ctx) | 2357 | if (ctx) |
1809 | rotate_ctx(ctx); | 2358 | rotate_ctx(ctx); |
1810 | 2359 | ||
1811 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 2360 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); |
1812 | if (ctx) | 2361 | if (ctx) |
1813 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); | 2362 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); |
1814 | 2363 | ||
@@ -1887,7 +2436,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
1887 | 2436 | ||
1888 | raw_spin_unlock(&ctx->lock); | 2437 | raw_spin_unlock(&ctx->lock); |
1889 | 2438 | ||
1890 | perf_event_context_sched_in(ctx); | 2439 | perf_event_context_sched_in(ctx, ctx->task); |
1891 | out: | 2440 | out: |
1892 | local_irq_restore(flags); | 2441 | local_irq_restore(flags); |
1893 | } | 2442 | } |
@@ -1912,8 +2461,10 @@ static void __perf_event_read(void *info) | |||
1912 | return; | 2461 | return; |
1913 | 2462 | ||
1914 | raw_spin_lock(&ctx->lock); | 2463 | raw_spin_lock(&ctx->lock); |
1915 | if (ctx->is_active) | 2464 | if (ctx->is_active) { |
1916 | update_context_time(ctx); | 2465 | update_context_time(ctx); |
2466 | update_cgrp_time_from_event(event); | ||
2467 | } | ||
1917 | update_event_times(event); | 2468 | update_event_times(event); |
1918 | if (event->state == PERF_EVENT_STATE_ACTIVE) | 2469 | if (event->state == PERF_EVENT_STATE_ACTIVE) |
1919 | event->pmu->read(event); | 2470 | event->pmu->read(event); |
@@ -1944,8 +2495,10 @@ static u64 perf_event_read(struct perf_event *event) | |||
1944 | * (e.g., thread is blocked), in that case | 2495 | * (e.g., thread is blocked), in that case |
1945 | * we cannot update context time | 2496 | * we cannot update context time |
1946 | */ | 2497 | */ |
1947 | if (ctx->is_active) | 2498 | if (ctx->is_active) { |
1948 | update_context_time(ctx); | 2499 | update_context_time(ctx); |
2500 | update_cgrp_time_from_event(event); | ||
2501 | } | ||
1949 | update_event_times(event); | 2502 | update_event_times(event); |
1950 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2503 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1951 | } | 2504 | } |
@@ -2224,6 +2777,9 @@ errout: | |||
2224 | 2777 | ||
2225 | } | 2778 | } |
2226 | 2779 | ||
2780 | /* | ||
2781 | * Returns a matching context with refcount and pincount. | ||
2782 | */ | ||
2227 | static struct perf_event_context * | 2783 | static struct perf_event_context * |
2228 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | 2784 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) |
2229 | { | 2785 | { |
@@ -2248,6 +2804,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | |||
2248 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | 2804 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); |
2249 | ctx = &cpuctx->ctx; | 2805 | ctx = &cpuctx->ctx; |
2250 | get_ctx(ctx); | 2806 | get_ctx(ctx); |
2807 | ++ctx->pin_count; | ||
2251 | 2808 | ||
2252 | return ctx; | 2809 | return ctx; |
2253 | } | 2810 | } |
@@ -2261,6 +2818,7 @@ retry: | |||
2261 | ctx = perf_lock_task_context(task, ctxn, &flags); | 2818 | ctx = perf_lock_task_context(task, ctxn, &flags); |
2262 | if (ctx) { | 2819 | if (ctx) { |
2263 | unclone_ctx(ctx); | 2820 | unclone_ctx(ctx); |
2821 | ++ctx->pin_count; | ||
2264 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2822 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
2265 | } | 2823 | } |
2266 | 2824 | ||
@@ -2282,8 +2840,10 @@ retry: | |||
2282 | err = -ESRCH; | 2840 | err = -ESRCH; |
2283 | else if (task->perf_event_ctxp[ctxn]) | 2841 | else if (task->perf_event_ctxp[ctxn]) |
2284 | err = -EAGAIN; | 2842 | err = -EAGAIN; |
2285 | else | 2843 | else { |
2844 | ++ctx->pin_count; | ||
2286 | rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); | 2845 | rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); |
2846 | } | ||
2287 | mutex_unlock(&task->perf_event_mutex); | 2847 | mutex_unlock(&task->perf_event_mutex); |
2288 | 2848 | ||
2289 | if (unlikely(err)) { | 2849 | if (unlikely(err)) { |
@@ -2323,7 +2883,7 @@ static void free_event(struct perf_event *event) | |||
2323 | 2883 | ||
2324 | if (!event->parent) { | 2884 | if (!event->parent) { |
2325 | if (event->attach_state & PERF_ATTACH_TASK) | 2885 | if (event->attach_state & PERF_ATTACH_TASK) |
2326 | jump_label_dec(&perf_task_events); | 2886 | jump_label_dec(&perf_sched_events); |
2327 | if (event->attr.mmap || event->attr.mmap_data) | 2887 | if (event->attr.mmap || event->attr.mmap_data) |
2328 | atomic_dec(&nr_mmap_events); | 2888 | atomic_dec(&nr_mmap_events); |
2329 | if (event->attr.comm) | 2889 | if (event->attr.comm) |
@@ -2332,6 +2892,10 @@ static void free_event(struct perf_event *event) | |||
2332 | atomic_dec(&nr_task_events); | 2892 | atomic_dec(&nr_task_events); |
2333 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | 2893 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) |
2334 | put_callchain_buffers(); | 2894 | put_callchain_buffers(); |
2895 | if (is_cgroup_event(event)) { | ||
2896 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); | ||
2897 | jump_label_dec(&perf_sched_events); | ||
2898 | } | ||
2335 | } | 2899 | } |
2336 | 2900 | ||
2337 | if (event->buffer) { | 2901 | if (event->buffer) { |
@@ -2339,6 +2903,9 @@ static void free_event(struct perf_event *event) | |||
2339 | event->buffer = NULL; | 2903 | event->buffer = NULL; |
2340 | } | 2904 | } |
2341 | 2905 | ||
2906 | if (is_cgroup_event(event)) | ||
2907 | perf_detach_cgroup(event); | ||
2908 | |||
2342 | if (event->destroy) | 2909 | if (event->destroy) |
2343 | event->destroy(event); | 2910 | event->destroy(event); |
2344 | 2911 | ||
@@ -4406,26 +4973,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
4406 | if (unlikely(!is_sampling_event(event))) | 4973 | if (unlikely(!is_sampling_event(event))) |
4407 | return 0; | 4974 | return 0; |
4408 | 4975 | ||
4409 | if (!throttle) { | 4976 | if (unlikely(hwc->interrupts >= max_samples_per_tick)) { |
4410 | hwc->interrupts++; | 4977 | if (throttle) { |
4411 | } else { | 4978 | hwc->interrupts = MAX_INTERRUPTS; |
4412 | if (hwc->interrupts != MAX_INTERRUPTS) { | 4979 | perf_log_throttle(event, 0); |
4413 | hwc->interrupts++; | ||
4414 | if (HZ * hwc->interrupts > | ||
4415 | (u64)sysctl_perf_event_sample_rate) { | ||
4416 | hwc->interrupts = MAX_INTERRUPTS; | ||
4417 | perf_log_throttle(event, 0); | ||
4418 | ret = 1; | ||
4419 | } | ||
4420 | } else { | ||
4421 | /* | ||
4422 | * Keep re-disabling events even though on the previous | ||
4423 | * pass we disabled it - just in case we raced with a | ||
4424 | * sched-in and the event got enabled again: | ||
4425 | */ | ||
4426 | ret = 1; | 4980 | ret = 1; |
4427 | } | 4981 | } |
4428 | } | 4982 | } else |
4983 | hwc->interrupts++; | ||
4429 | 4984 | ||
4430 | if (event->attr.freq) { | 4985 | if (event->attr.freq) { |
4431 | u64 now = perf_clock(); | 4986 | u64 now = perf_clock(); |
@@ -5062,6 +5617,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
5062 | u64 period; | 5617 | u64 period; |
5063 | 5618 | ||
5064 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); | 5619 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); |
5620 | |||
5621 | if (event->state != PERF_EVENT_STATE_ACTIVE) | ||
5622 | return HRTIMER_NORESTART; | ||
5623 | |||
5065 | event->pmu->read(event); | 5624 | event->pmu->read(event); |
5066 | 5625 | ||
5067 | perf_sample_data_init(&data, 0); | 5626 | perf_sample_data_init(&data, 0); |
@@ -5088,9 +5647,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) | |||
5088 | if (!is_sampling_event(event)) | 5647 | if (!is_sampling_event(event)) |
5089 | return; | 5648 | return; |
5090 | 5649 | ||
5091 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
5092 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
5093 | |||
5094 | period = local64_read(&hwc->period_left); | 5650 | period = local64_read(&hwc->period_left); |
5095 | if (period) { | 5651 | if (period) { |
5096 | if (period < 0) | 5652 | if (period < 0) |
@@ -5117,6 +5673,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) | |||
5117 | } | 5673 | } |
5118 | } | 5674 | } |
5119 | 5675 | ||
5676 | static void perf_swevent_init_hrtimer(struct perf_event *event) | ||
5677 | { | ||
5678 | struct hw_perf_event *hwc = &event->hw; | ||
5679 | |||
5680 | if (!is_sampling_event(event)) | ||
5681 | return; | ||
5682 | |||
5683 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
5684 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
5685 | |||
5686 | /* | ||
5687 | * Since hrtimers have a fixed rate, we can do a static freq->period | ||
5688 | * mapping and avoid the whole period adjust feedback stuff. | ||
5689 | */ | ||
5690 | if (event->attr.freq) { | ||
5691 | long freq = event->attr.sample_freq; | ||
5692 | |||
5693 | event->attr.sample_period = NSEC_PER_SEC / freq; | ||
5694 | hwc->sample_period = event->attr.sample_period; | ||
5695 | local64_set(&hwc->period_left, hwc->sample_period); | ||
5696 | event->attr.freq = 0; | ||
5697 | } | ||
5698 | } | ||
5699 | |||
5120 | /* | 5700 | /* |
5121 | * Software event: cpu wall time clock | 5701 | * Software event: cpu wall time clock |
5122 | */ | 5702 | */ |
@@ -5169,6 +5749,8 @@ static int cpu_clock_event_init(struct perf_event *event) | |||
5169 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) | 5749 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) |
5170 | return -ENOENT; | 5750 | return -ENOENT; |
5171 | 5751 | ||
5752 | perf_swevent_init_hrtimer(event); | ||
5753 | |||
5172 | return 0; | 5754 | return 0; |
5173 | } | 5755 | } |
5174 | 5756 | ||
@@ -5224,16 +5806,9 @@ static void task_clock_event_del(struct perf_event *event, int flags) | |||
5224 | 5806 | ||
5225 | static void task_clock_event_read(struct perf_event *event) | 5807 | static void task_clock_event_read(struct perf_event *event) |
5226 | { | 5808 | { |
5227 | u64 time; | 5809 | u64 now = perf_clock(); |
5228 | 5810 | u64 delta = now - event->ctx->timestamp; | |
5229 | if (!in_nmi()) { | 5811 | u64 time = event->ctx->time + delta; |
5230 | update_context_time(event->ctx); | ||
5231 | time = event->ctx->time; | ||
5232 | } else { | ||
5233 | u64 now = perf_clock(); | ||
5234 | u64 delta = now - event->ctx->timestamp; | ||
5235 | time = event->ctx->time + delta; | ||
5236 | } | ||
5237 | 5812 | ||
5238 | task_clock_event_update(event, time); | 5813 | task_clock_event_update(event, time); |
5239 | } | 5814 | } |
@@ -5246,6 +5821,8 @@ static int task_clock_event_init(struct perf_event *event) | |||
5246 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) | 5821 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) |
5247 | return -ENOENT; | 5822 | return -ENOENT; |
5248 | 5823 | ||
5824 | perf_swevent_init_hrtimer(event); | ||
5825 | |||
5249 | return 0; | 5826 | return 0; |
5250 | } | 5827 | } |
5251 | 5828 | ||
@@ -5517,17 +6094,22 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
5517 | { | 6094 | { |
5518 | struct pmu *pmu = NULL; | 6095 | struct pmu *pmu = NULL; |
5519 | int idx; | 6096 | int idx; |
6097 | int ret; | ||
5520 | 6098 | ||
5521 | idx = srcu_read_lock(&pmus_srcu); | 6099 | idx = srcu_read_lock(&pmus_srcu); |
5522 | 6100 | ||
5523 | rcu_read_lock(); | 6101 | rcu_read_lock(); |
5524 | pmu = idr_find(&pmu_idr, event->attr.type); | 6102 | pmu = idr_find(&pmu_idr, event->attr.type); |
5525 | rcu_read_unlock(); | 6103 | rcu_read_unlock(); |
5526 | if (pmu) | 6104 | if (pmu) { |
6105 | ret = pmu->event_init(event); | ||
6106 | if (ret) | ||
6107 | pmu = ERR_PTR(ret); | ||
5527 | goto unlock; | 6108 | goto unlock; |
6109 | } | ||
5528 | 6110 | ||
5529 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 6111 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
5530 | int ret = pmu->event_init(event); | 6112 | ret = pmu->event_init(event); |
5531 | if (!ret) | 6113 | if (!ret) |
5532 | goto unlock; | 6114 | goto unlock; |
5533 | 6115 | ||
@@ -5653,7 +6235,7 @@ done: | |||
5653 | 6235 | ||
5654 | if (!event->parent) { | 6236 | if (!event->parent) { |
5655 | if (event->attach_state & PERF_ATTACH_TASK) | 6237 | if (event->attach_state & PERF_ATTACH_TASK) |
5656 | jump_label_inc(&perf_task_events); | 6238 | jump_label_inc(&perf_sched_events); |
5657 | if (event->attr.mmap || event->attr.mmap_data) | 6239 | if (event->attr.mmap || event->attr.mmap_data) |
5658 | atomic_inc(&nr_mmap_events); | 6240 | atomic_inc(&nr_mmap_events); |
5659 | if (event->attr.comm) | 6241 | if (event->attr.comm) |
@@ -5828,7 +6410,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5828 | int err; | 6410 | int err; |
5829 | 6411 | ||
5830 | /* for future expandability... */ | 6412 | /* for future expandability... */ |
5831 | if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) | 6413 | if (flags & ~PERF_FLAG_ALL) |
5832 | return -EINVAL; | 6414 | return -EINVAL; |
5833 | 6415 | ||
5834 | err = perf_copy_attr(attr_uptr, &attr); | 6416 | err = perf_copy_attr(attr_uptr, &attr); |
@@ -5845,6 +6427,15 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5845 | return -EINVAL; | 6427 | return -EINVAL; |
5846 | } | 6428 | } |
5847 | 6429 | ||
6430 | /* | ||
6431 | * In cgroup mode, the pid argument is used to pass the fd | ||
6432 | * opened to the cgroup directory in cgroupfs. The cpu argument | ||
6433 | * designates the cpu on which to monitor threads from that | ||
6434 | * cgroup. | ||
6435 | */ | ||
6436 | if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) | ||
6437 | return -EINVAL; | ||
6438 | |||
5848 | event_fd = get_unused_fd_flags(O_RDWR); | 6439 | event_fd = get_unused_fd_flags(O_RDWR); |
5849 | if (event_fd < 0) | 6440 | if (event_fd < 0) |
5850 | return event_fd; | 6441 | return event_fd; |
@@ -5862,7 +6453,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5862 | group_leader = NULL; | 6453 | group_leader = NULL; |
5863 | } | 6454 | } |
5864 | 6455 | ||
5865 | if (pid != -1) { | 6456 | if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { |
5866 | task = find_lively_task_by_vpid(pid); | 6457 | task = find_lively_task_by_vpid(pid); |
5867 | if (IS_ERR(task)) { | 6458 | if (IS_ERR(task)) { |
5868 | err = PTR_ERR(task); | 6459 | err = PTR_ERR(task); |
@@ -5876,6 +6467,19 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5876 | goto err_task; | 6467 | goto err_task; |
5877 | } | 6468 | } |
5878 | 6469 | ||
6470 | if (flags & PERF_FLAG_PID_CGROUP) { | ||
6471 | err = perf_cgroup_connect(pid, event, &attr, group_leader); | ||
6472 | if (err) | ||
6473 | goto err_alloc; | ||
6474 | /* | ||
6475 | * one more event: | ||
6476 | * - that has cgroup constraint on event->cpu | ||
6477 | * - that may need work on context switch | ||
6478 | */ | ||
6479 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); | ||
6480 | jump_label_inc(&perf_sched_events); | ||
6481 | } | ||
6482 | |||
5879 | /* | 6483 | /* |
5880 | * Special case software events and allow them to be part of | 6484 | * Special case software events and allow them to be part of |
5881 | * any hardware group. | 6485 | * any hardware group. |
@@ -5961,10 +6565,10 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5961 | struct perf_event_context *gctx = group_leader->ctx; | 6565 | struct perf_event_context *gctx = group_leader->ctx; |
5962 | 6566 | ||
5963 | mutex_lock(&gctx->mutex); | 6567 | mutex_lock(&gctx->mutex); |
5964 | perf_event_remove_from_context(group_leader); | 6568 | perf_remove_from_context(group_leader); |
5965 | list_for_each_entry(sibling, &group_leader->sibling_list, | 6569 | list_for_each_entry(sibling, &group_leader->sibling_list, |
5966 | group_entry) { | 6570 | group_entry) { |
5967 | perf_event_remove_from_context(sibling); | 6571 | perf_remove_from_context(sibling); |
5968 | put_ctx(gctx); | 6572 | put_ctx(gctx); |
5969 | } | 6573 | } |
5970 | mutex_unlock(&gctx->mutex); | 6574 | mutex_unlock(&gctx->mutex); |
@@ -5987,6 +6591,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5987 | 6591 | ||
5988 | perf_install_in_context(ctx, event, cpu); | 6592 | perf_install_in_context(ctx, event, cpu); |
5989 | ++ctx->generation; | 6593 | ++ctx->generation; |
6594 | perf_unpin_context(ctx); | ||
5990 | mutex_unlock(&ctx->mutex); | 6595 | mutex_unlock(&ctx->mutex); |
5991 | 6596 | ||
5992 | event->owner = current; | 6597 | event->owner = current; |
@@ -6012,6 +6617,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6012 | return event_fd; | 6617 | return event_fd; |
6013 | 6618 | ||
6014 | err_context: | 6619 | err_context: |
6620 | perf_unpin_context(ctx); | ||
6015 | put_ctx(ctx); | 6621 | put_ctx(ctx); |
6016 | err_alloc: | 6622 | err_alloc: |
6017 | free_event(event); | 6623 | free_event(event); |
@@ -6062,6 +6668,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
6062 | mutex_lock(&ctx->mutex); | 6668 | mutex_lock(&ctx->mutex); |
6063 | perf_install_in_context(ctx, event, cpu); | 6669 | perf_install_in_context(ctx, event, cpu); |
6064 | ++ctx->generation; | 6670 | ++ctx->generation; |
6671 | perf_unpin_context(ctx); | ||
6065 | mutex_unlock(&ctx->mutex); | 6672 | mutex_unlock(&ctx->mutex); |
6066 | 6673 | ||
6067 | return event; | 6674 | return event; |
@@ -6115,7 +6722,7 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
6115 | { | 6722 | { |
6116 | struct perf_event *parent_event; | 6723 | struct perf_event *parent_event; |
6117 | 6724 | ||
6118 | perf_event_remove_from_context(child_event); | 6725 | perf_remove_from_context(child_event); |
6119 | 6726 | ||
6120 | parent_event = child_event->parent; | 6727 | parent_event = child_event->parent; |
6121 | /* | 6728 | /* |
@@ -6422,7 +7029,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
6422 | return 0; | 7029 | return 0; |
6423 | } | 7030 | } |
6424 | 7031 | ||
6425 | child_ctx = child->perf_event_ctxp[ctxn]; | 7032 | child_ctx = child->perf_event_ctxp[ctxn]; |
6426 | if (!child_ctx) { | 7033 | if (!child_ctx) { |
6427 | /* | 7034 | /* |
6428 | * This is executed from the parent task context, so | 7035 | * This is executed from the parent task context, so |
@@ -6537,6 +7144,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
6537 | mutex_unlock(&parent_ctx->mutex); | 7144 | mutex_unlock(&parent_ctx->mutex); |
6538 | 7145 | ||
6539 | perf_unpin_context(parent_ctx); | 7146 | perf_unpin_context(parent_ctx); |
7147 | put_ctx(parent_ctx); | ||
6540 | 7148 | ||
6541 | return ret; | 7149 | return ret; |
6542 | } | 7150 | } |
@@ -6606,9 +7214,9 @@ static void __perf_event_exit_context(void *__info) | |||
6606 | perf_pmu_rotate_stop(ctx->pmu); | 7214 | perf_pmu_rotate_stop(ctx->pmu); |
6607 | 7215 | ||
6608 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 7216 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) |
6609 | __perf_event_remove_from_context(event); | 7217 | __perf_remove_from_context(event); |
6610 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) | 7218 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) |
6611 | __perf_event_remove_from_context(event); | 7219 | __perf_remove_from_context(event); |
6612 | } | 7220 | } |
6613 | 7221 | ||
6614 | static void perf_event_exit_cpu_context(int cpu) | 7222 | static void perf_event_exit_cpu_context(int cpu) |
@@ -6732,3 +7340,83 @@ unlock: | |||
6732 | return ret; | 7340 | return ret; |
6733 | } | 7341 | } |
6734 | device_initcall(perf_event_sysfs_init); | 7342 | device_initcall(perf_event_sysfs_init); |
7343 | |||
7344 | #ifdef CONFIG_CGROUP_PERF | ||
7345 | static struct cgroup_subsys_state *perf_cgroup_create( | ||
7346 | struct cgroup_subsys *ss, struct cgroup *cont) | ||
7347 | { | ||
7348 | struct perf_cgroup *jc; | ||
7349 | |||
7350 | jc = kzalloc(sizeof(*jc), GFP_KERNEL); | ||
7351 | if (!jc) | ||
7352 | return ERR_PTR(-ENOMEM); | ||
7353 | |||
7354 | jc->info = alloc_percpu(struct perf_cgroup_info); | ||
7355 | if (!jc->info) { | ||
7356 | kfree(jc); | ||
7357 | return ERR_PTR(-ENOMEM); | ||
7358 | } | ||
7359 | |||
7360 | return &jc->css; | ||
7361 | } | ||
7362 | |||
7363 | static void perf_cgroup_destroy(struct cgroup_subsys *ss, | ||
7364 | struct cgroup *cont) | ||
7365 | { | ||
7366 | struct perf_cgroup *jc; | ||
7367 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), | ||
7368 | struct perf_cgroup, css); | ||
7369 | free_percpu(jc->info); | ||
7370 | kfree(jc); | ||
7371 | } | ||
7372 | |||
7373 | static int __perf_cgroup_move(void *info) | ||
7374 | { | ||
7375 | struct task_struct *task = info; | ||
7376 | perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); | ||
7377 | return 0; | ||
7378 | } | ||
7379 | |||
7380 | static void perf_cgroup_move(struct task_struct *task) | ||
7381 | { | ||
7382 | task_function_call(task, __perf_cgroup_move, task); | ||
7383 | } | ||
7384 | |||
7385 | static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
7386 | struct cgroup *old_cgrp, struct task_struct *task, | ||
7387 | bool threadgroup) | ||
7388 | { | ||
7389 | perf_cgroup_move(task); | ||
7390 | if (threadgroup) { | ||
7391 | struct task_struct *c; | ||
7392 | rcu_read_lock(); | ||
7393 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { | ||
7394 | perf_cgroup_move(c); | ||
7395 | } | ||
7396 | rcu_read_unlock(); | ||
7397 | } | ||
7398 | } | ||
7399 | |||
7400 | static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
7401 | struct cgroup *old_cgrp, struct task_struct *task) | ||
7402 | { | ||
7403 | /* | ||
7404 | * cgroup_exit() is called in the copy_process() failure path. | ||
7405 | * Ignore this case since the task hasn't ran yet, this avoids | ||
7406 | * trying to poke a half freed task state from generic code. | ||
7407 | */ | ||
7408 | if (!(task->flags & PF_EXITING)) | ||
7409 | return; | ||
7410 | |||
7411 | perf_cgroup_move(task); | ||
7412 | } | ||
7413 | |||
7414 | struct cgroup_subsys perf_subsys = { | ||
7415 | .name = "perf_event", | ||
7416 | .subsys_id = perf_subsys_id, | ||
7417 | .create = perf_cgroup_create, | ||
7418 | .destroy = perf_cgroup_destroy, | ||
7419 | .exit = perf_cgroup_exit, | ||
7420 | .attach = perf_cgroup_attach, | ||
7421 | }; | ||
7422 | #endif /* CONFIG_CGROUP_PERF */ | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 42eab5a8437d..57a18e8d28c8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -606,9 +606,6 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
606 | struct task_group *tg; | 606 | struct task_group *tg; |
607 | struct cgroup_subsys_state *css; | 607 | struct cgroup_subsys_state *css; |
608 | 608 | ||
609 | if (p->flags & PF_EXITING) | ||
610 | return &root_task_group; | ||
611 | |||
612 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 609 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
613 | lockdep_is_held(&task_rq(p)->lock)); | 610 | lockdep_is_held(&task_rq(p)->lock)); |
614 | tg = container_of(css, struct task_group, css); | 611 | tg = container_of(css, struct task_group, css); |
@@ -2265,27 +2262,6 @@ void kick_process(struct task_struct *p) | |||
2265 | EXPORT_SYMBOL_GPL(kick_process); | 2262 | EXPORT_SYMBOL_GPL(kick_process); |
2266 | #endif /* CONFIG_SMP */ | 2263 | #endif /* CONFIG_SMP */ |
2267 | 2264 | ||
2268 | /** | ||
2269 | * task_oncpu_function_call - call a function on the cpu on which a task runs | ||
2270 | * @p: the task to evaluate | ||
2271 | * @func: the function to be called | ||
2272 | * @info: the function call argument | ||
2273 | * | ||
2274 | * Calls the function @func when the task is currently running. This might | ||
2275 | * be on the current CPU, which just calls the function directly | ||
2276 | */ | ||
2277 | void task_oncpu_function_call(struct task_struct *p, | ||
2278 | void (*func) (void *info), void *info) | ||
2279 | { | ||
2280 | int cpu; | ||
2281 | |||
2282 | preempt_disable(); | ||
2283 | cpu = task_cpu(p); | ||
2284 | if (task_curr(p)) | ||
2285 | smp_call_function_single(cpu, func, info, 1); | ||
2286 | preempt_enable(); | ||
2287 | } | ||
2288 | |||
2289 | #ifdef CONFIG_SMP | 2265 | #ifdef CONFIG_SMP |
2290 | /* | 2266 | /* |
2291 | * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. | 2267 | * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. |
@@ -2776,9 +2752,12 @@ static inline void | |||
2776 | prepare_task_switch(struct rq *rq, struct task_struct *prev, | 2752 | prepare_task_switch(struct rq *rq, struct task_struct *prev, |
2777 | struct task_struct *next) | 2753 | struct task_struct *next) |
2778 | { | 2754 | { |
2755 | sched_info_switch(prev, next); | ||
2756 | perf_event_task_sched_out(prev, next); | ||
2779 | fire_sched_out_preempt_notifiers(prev, next); | 2757 | fire_sched_out_preempt_notifiers(prev, next); |
2780 | prepare_lock_switch(rq, next); | 2758 | prepare_lock_switch(rq, next); |
2781 | prepare_arch_switch(next); | 2759 | prepare_arch_switch(next); |
2760 | trace_sched_switch(prev, next); | ||
2782 | } | 2761 | } |
2783 | 2762 | ||
2784 | /** | 2763 | /** |
@@ -2911,7 +2890,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2911 | struct mm_struct *mm, *oldmm; | 2890 | struct mm_struct *mm, *oldmm; |
2912 | 2891 | ||
2913 | prepare_task_switch(rq, prev, next); | 2892 | prepare_task_switch(rq, prev, next); |
2914 | trace_sched_switch(prev, next); | 2893 | |
2915 | mm = next->mm; | 2894 | mm = next->mm; |
2916 | oldmm = prev->active_mm; | 2895 | oldmm = prev->active_mm; |
2917 | /* | 2896 | /* |
@@ -3989,9 +3968,6 @@ need_resched_nonpreemptible: | |||
3989 | rq->skip_clock_update = 0; | 3968 | rq->skip_clock_update = 0; |
3990 | 3969 | ||
3991 | if (likely(prev != next)) { | 3970 | if (likely(prev != next)) { |
3992 | sched_info_switch(prev, next); | ||
3993 | perf_event_task_sched_out(prev, next); | ||
3994 | |||
3995 | rq->nr_switches++; | 3971 | rq->nr_switches++; |
3996 | rq->curr = next; | 3972 | rq->curr = next; |
3997 | ++*switch_count; | 3973 | ++*switch_count; |
@@ -5572,7 +5548,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5572 | * The idle tasks have their own, simple scheduling class: | 5548 | * The idle tasks have their own, simple scheduling class: |
5573 | */ | 5549 | */ |
5574 | idle->sched_class = &idle_sched_class; | 5550 | idle->sched_class = &idle_sched_class; |
5575 | ftrace_graph_init_task(idle); | 5551 | ftrace_graph_init_idle_task(idle, cpu); |
5576 | } | 5552 | } |
5577 | 5553 | ||
5578 | /* | 5554 | /* |
@@ -8885,7 +8861,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
8885 | } | 8861 | } |
8886 | 8862 | ||
8887 | static void | 8863 | static void |
8888 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) | 8864 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, |
8865 | struct cgroup *old_cgrp, struct task_struct *task) | ||
8889 | { | 8866 | { |
8890 | /* | 8867 | /* |
8891 | * cgroup_exit() is called in the copy_process() failure path. | 8868 | * cgroup_exit() is called in the copy_process() failure path. |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4eed0af5d144..19b9d85e06cc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -948,7 +948,7 @@ static struct ctl_table kern_table[] = { | |||
948 | .data = &sysctl_perf_event_sample_rate, | 948 | .data = &sysctl_perf_event_sample_rate, |
949 | .maxlen = sizeof(sysctl_perf_event_sample_rate), | 949 | .maxlen = sizeof(sysctl_perf_event_sample_rate), |
950 | .mode = 0644, | 950 | .mode = 0644, |
951 | .proc_handler = proc_dointvec, | 951 | .proc_handler = perf_proc_update_handler, |
952 | }, | 952 | }, |
953 | #endif | 953 | #endif |
954 | #ifdef CONFIG_KMEMCHECK | 954 | #ifdef CONFIG_KMEMCHECK |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3dadae83883..888b611897d3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -3328,7 +3328,7 @@ static int start_graph_tracing(void) | |||
3328 | /* The cpu_boot init_task->ret_stack will never be freed */ | 3328 | /* The cpu_boot init_task->ret_stack will never be freed */ |
3329 | for_each_online_cpu(cpu) { | 3329 | for_each_online_cpu(cpu) { |
3330 | if (!idle_task(cpu)->ret_stack) | 3330 | if (!idle_task(cpu)->ret_stack) |
3331 | ftrace_graph_init_task(idle_task(cpu)); | 3331 | ftrace_graph_init_idle_task(idle_task(cpu), cpu); |
3332 | } | 3332 | } |
3333 | 3333 | ||
3334 | do { | 3334 | do { |
@@ -3418,6 +3418,49 @@ void unregister_ftrace_graph(void) | |||
3418 | mutex_unlock(&ftrace_lock); | 3418 | mutex_unlock(&ftrace_lock); |
3419 | } | 3419 | } |
3420 | 3420 | ||
3421 | static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); | ||
3422 | |||
3423 | static void | ||
3424 | graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) | ||
3425 | { | ||
3426 | atomic_set(&t->tracing_graph_pause, 0); | ||
3427 | atomic_set(&t->trace_overrun, 0); | ||
3428 | t->ftrace_timestamp = 0; | ||
3429 | /* make curr_ret_stack visable before we add the ret_stack */ | ||
3430 | smp_wmb(); | ||
3431 | t->ret_stack = ret_stack; | ||
3432 | } | ||
3433 | |||
3434 | /* | ||
3435 | * Allocate a return stack for the idle task. May be the first | ||
3436 | * time through, or it may be done by CPU hotplug online. | ||
3437 | */ | ||
3438 | void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) | ||
3439 | { | ||
3440 | t->curr_ret_stack = -1; | ||
3441 | /* | ||
3442 | * The idle task has no parent, it either has its own | ||
3443 | * stack or no stack at all. | ||
3444 | */ | ||
3445 | if (t->ret_stack) | ||
3446 | WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); | ||
3447 | |||
3448 | if (ftrace_graph_active) { | ||
3449 | struct ftrace_ret_stack *ret_stack; | ||
3450 | |||
3451 | ret_stack = per_cpu(idle_ret_stack, cpu); | ||
3452 | if (!ret_stack) { | ||
3453 | ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH | ||
3454 | * sizeof(struct ftrace_ret_stack), | ||
3455 | GFP_KERNEL); | ||
3456 | if (!ret_stack) | ||
3457 | return; | ||
3458 | per_cpu(idle_ret_stack, cpu) = ret_stack; | ||
3459 | } | ||
3460 | graph_init_task(t, ret_stack); | ||
3461 | } | ||
3462 | } | ||
3463 | |||
3421 | /* Allocate a return stack for newly created task */ | 3464 | /* Allocate a return stack for newly created task */ |
3422 | void ftrace_graph_init_task(struct task_struct *t) | 3465 | void ftrace_graph_init_task(struct task_struct *t) |
3423 | { | 3466 | { |
@@ -3433,12 +3476,7 @@ void ftrace_graph_init_task(struct task_struct *t) | |||
3433 | GFP_KERNEL); | 3476 | GFP_KERNEL); |
3434 | if (!ret_stack) | 3477 | if (!ret_stack) |
3435 | return; | 3478 | return; |
3436 | atomic_set(&t->tracing_graph_pause, 0); | 3479 | graph_init_task(t, ret_stack); |
3437 | atomic_set(&t->trace_overrun, 0); | ||
3438 | t->ftrace_timestamp = 0; | ||
3439 | /* make curr_ret_stack visable before we add the ret_stack */ | ||
3440 | smp_wmb(); | ||
3441 | t->ret_stack = ret_stack; | ||
3442 | } | 3480 | } |
3443 | } | 3481 | } |
3444 | 3482 | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bd1c35a4fbcc..db7b439d23ee 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -5,7 +5,6 @@ | |||
5 | */ | 5 | */ |
6 | #include <linux/ring_buffer.h> | 6 | #include <linux/ring_buffer.h> |
7 | #include <linux/trace_clock.h> | 7 | #include <linux/trace_clock.h> |
8 | #include <linux/ftrace_irq.h> | ||
9 | #include <linux/spinlock.h> | 8 | #include <linux/spinlock.h> |
10 | #include <linux/debugfs.h> | 9 | #include <linux/debugfs.h> |
11 | #include <linux/uaccess.h> | 10 | #include <linux/uaccess.h> |
@@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | |||
1429 | } | 1428 | } |
1430 | EXPORT_SYMBOL_GPL(ring_buffer_resize); | 1429 | EXPORT_SYMBOL_GPL(ring_buffer_resize); |
1431 | 1430 | ||
1431 | void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) | ||
1432 | { | ||
1433 | mutex_lock(&buffer->mutex); | ||
1434 | if (val) | ||
1435 | buffer->flags |= RB_FL_OVERWRITE; | ||
1436 | else | ||
1437 | buffer->flags &= ~RB_FL_OVERWRITE; | ||
1438 | mutex_unlock(&buffer->mutex); | ||
1439 | } | ||
1440 | EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); | ||
1441 | |||
1432 | static inline void * | 1442 | static inline void * |
1433 | __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) | 1443 | __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) |
1434 | { | 1444 | { |
@@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2162 | if (likely(ts >= cpu_buffer->write_stamp)) { | 2172 | if (likely(ts >= cpu_buffer->write_stamp)) { |
2163 | delta = diff; | 2173 | delta = diff; |
2164 | if (unlikely(test_time_stamp(delta))) { | 2174 | if (unlikely(test_time_stamp(delta))) { |
2175 | int local_clock_stable = 1; | ||
2176 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | ||
2177 | local_clock_stable = sched_clock_stable; | ||
2178 | #endif | ||
2165 | WARN_ONCE(delta > (1ULL << 59), | 2179 | WARN_ONCE(delta > (1ULL << 59), |
2166 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", | 2180 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", |
2167 | (unsigned long long)delta, | 2181 | (unsigned long long)delta, |
2168 | (unsigned long long)ts, | 2182 | (unsigned long long)ts, |
2169 | (unsigned long long)cpu_buffer->write_stamp); | 2183 | (unsigned long long)cpu_buffer->write_stamp, |
2184 | local_clock_stable ? "" : | ||
2185 | "If you just came from a suspend/resume,\n" | ||
2186 | "please switch to the trace global clock:\n" | ||
2187 | " echo global > /sys/kernel/debug/tracing/trace_clock\n"); | ||
2170 | add_timestamp = 1; | 2188 | add_timestamp = 1; |
2171 | } | 2189 | } |
2172 | } | 2190 | } |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dc53ecb80589..9541c27c1cf2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -41,8 +41,6 @@ | |||
41 | #include "trace.h" | 41 | #include "trace.h" |
42 | #include "trace_output.h" | 42 | #include "trace_output.h" |
43 | 43 | ||
44 | #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) | ||
45 | |||
46 | /* | 44 | /* |
47 | * On boot up, the ring buffer is set to the minimum size, so that | 45 | * On boot up, the ring buffer is set to the minimum size, so that |
48 | * we do not waste memory on systems that are not using tracing. | 46 | * we do not waste memory on systems that are not using tracing. |
@@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | |||
340 | /* trace_flags holds trace_options default values */ | 338 | /* trace_flags holds trace_options default values */ |
341 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | 339 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | |
342 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | | 340 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | |
343 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; | 341 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; |
344 | 342 | ||
345 | static int trace_stop_count; | 343 | static int trace_stop_count; |
346 | static DEFINE_SPINLOCK(tracing_start_lock); | 344 | static DEFINE_SPINLOCK(tracing_start_lock); |
@@ -425,6 +423,7 @@ static const char *trace_options[] = { | |||
425 | "sleep-time", | 423 | "sleep-time", |
426 | "graph-time", | 424 | "graph-time", |
427 | "record-cmd", | 425 | "record-cmd", |
426 | "overwrite", | ||
428 | NULL | 427 | NULL |
429 | }; | 428 | }; |
430 | 429 | ||
@@ -780,6 +779,11 @@ __acquires(kernel_lock) | |||
780 | tracing_reset_online_cpus(tr); | 779 | tracing_reset_online_cpus(tr); |
781 | 780 | ||
782 | current_trace = type; | 781 | current_trace = type; |
782 | |||
783 | /* If we expanded the buffers, make sure the max is expanded too */ | ||
784 | if (ring_buffer_expanded && type->use_max_tr) | ||
785 | ring_buffer_resize(max_tr.buffer, trace_buf_size); | ||
786 | |||
783 | /* the test is responsible for initializing and enabling */ | 787 | /* the test is responsible for initializing and enabling */ |
784 | pr_info("Testing tracer %s: ", type->name); | 788 | pr_info("Testing tracer %s: ", type->name); |
785 | ret = type->selftest(type, tr); | 789 | ret = type->selftest(type, tr); |
@@ -792,6 +796,10 @@ __acquires(kernel_lock) | |||
792 | /* Only reset on passing, to avoid touching corrupted buffers */ | 796 | /* Only reset on passing, to avoid touching corrupted buffers */ |
793 | tracing_reset_online_cpus(tr); | 797 | tracing_reset_online_cpus(tr); |
794 | 798 | ||
799 | /* Shrink the max buffer again */ | ||
800 | if (ring_buffer_expanded && type->use_max_tr) | ||
801 | ring_buffer_resize(max_tr.buffer, 1); | ||
802 | |||
795 | printk(KERN_CONT "PASSED\n"); | 803 | printk(KERN_CONT "PASSED\n"); |
796 | } | 804 | } |
797 | #endif | 805 | #endif |
@@ -1102,7 +1110,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, | |||
1102 | 1110 | ||
1103 | entry->preempt_count = pc & 0xff; | 1111 | entry->preempt_count = pc & 0xff; |
1104 | entry->pid = (tsk) ? tsk->pid : 0; | 1112 | entry->pid = (tsk) ? tsk->pid : 0; |
1105 | entry->lock_depth = (tsk) ? tsk->lock_depth : 0; | ||
1106 | entry->flags = | 1113 | entry->flags = |
1107 | #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT | 1114 | #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT |
1108 | (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | | 1115 | (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | |
@@ -1749,10 +1756,9 @@ static void print_lat_help_header(struct seq_file *m) | |||
1749 | seq_puts(m, "# | / _----=> need-resched \n"); | 1756 | seq_puts(m, "# | / _----=> need-resched \n"); |
1750 | seq_puts(m, "# || / _---=> hardirq/softirq \n"); | 1757 | seq_puts(m, "# || / _---=> hardirq/softirq \n"); |
1751 | seq_puts(m, "# ||| / _--=> preempt-depth \n"); | 1758 | seq_puts(m, "# ||| / _--=> preempt-depth \n"); |
1752 | seq_puts(m, "# |||| /_--=> lock-depth \n"); | 1759 | seq_puts(m, "# |||| / delay \n"); |
1753 | seq_puts(m, "# |||||/ delay \n"); | 1760 | seq_puts(m, "# cmd pid ||||| time | caller \n"); |
1754 | seq_puts(m, "# cmd pid |||||| time | caller \n"); | 1761 | seq_puts(m, "# \\ / ||||| \\ | / \n"); |
1755 | seq_puts(m, "# \\ / |||||| \\ | / \n"); | ||
1756 | } | 1762 | } |
1757 | 1763 | ||
1758 | static void print_func_help_header(struct seq_file *m) | 1764 | static void print_func_help_header(struct seq_file *m) |
@@ -2529,6 +2535,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) | |||
2529 | 2535 | ||
2530 | if (mask == TRACE_ITER_RECORD_CMD) | 2536 | if (mask == TRACE_ITER_RECORD_CMD) |
2531 | trace_event_enable_cmd_record(enabled); | 2537 | trace_event_enable_cmd_record(enabled); |
2538 | |||
2539 | if (mask == TRACE_ITER_OVERWRITE) | ||
2540 | ring_buffer_change_overwrite(global_trace.buffer, enabled); | ||
2532 | } | 2541 | } |
2533 | 2542 | ||
2534 | static ssize_t | 2543 | static ssize_t |
@@ -2710,6 +2719,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, | |||
2710 | 2719 | ||
2711 | mutex_lock(&trace_types_lock); | 2720 | mutex_lock(&trace_types_lock); |
2712 | if (tracer_enabled ^ val) { | 2721 | if (tracer_enabled ^ val) { |
2722 | |||
2723 | /* Only need to warn if this is used to change the state */ | ||
2724 | WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); | ||
2725 | |||
2713 | if (val) { | 2726 | if (val) { |
2714 | tracer_enabled = 1; | 2727 | tracer_enabled = 1; |
2715 | if (current_trace->start) | 2728 | if (current_trace->start) |
@@ -4551,9 +4564,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) | |||
4551 | __init static int tracer_alloc_buffers(void) | 4564 | __init static int tracer_alloc_buffers(void) |
4552 | { | 4565 | { |
4553 | int ring_buf_size; | 4566 | int ring_buf_size; |
4567 | enum ring_buffer_flags rb_flags; | ||
4554 | int i; | 4568 | int i; |
4555 | int ret = -ENOMEM; | 4569 | int ret = -ENOMEM; |
4556 | 4570 | ||
4571 | |||
4557 | if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) | 4572 | if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) |
4558 | goto out; | 4573 | goto out; |
4559 | 4574 | ||
@@ -4566,12 +4581,13 @@ __init static int tracer_alloc_buffers(void) | |||
4566 | else | 4581 | else |
4567 | ring_buf_size = 1; | 4582 | ring_buf_size = 1; |
4568 | 4583 | ||
4584 | rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; | ||
4585 | |||
4569 | cpumask_copy(tracing_buffer_mask, cpu_possible_mask); | 4586 | cpumask_copy(tracing_buffer_mask, cpu_possible_mask); |
4570 | cpumask_copy(tracing_cpumask, cpu_all_mask); | 4587 | cpumask_copy(tracing_cpumask, cpu_all_mask); |
4571 | 4588 | ||
4572 | /* TODO: make the number of buffers hot pluggable with CPUS */ | 4589 | /* TODO: make the number of buffers hot pluggable with CPUS */ |
4573 | global_trace.buffer = ring_buffer_alloc(ring_buf_size, | 4590 | global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); |
4574 | TRACE_BUFFER_FLAGS); | ||
4575 | if (!global_trace.buffer) { | 4591 | if (!global_trace.buffer) { |
4576 | printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); | 4592 | printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); |
4577 | WARN_ON(1); | 4593 | WARN_ON(1); |
@@ -4581,7 +4597,7 @@ __init static int tracer_alloc_buffers(void) | |||
4581 | 4597 | ||
4582 | 4598 | ||
4583 | #ifdef CONFIG_TRACER_MAX_TRACE | 4599 | #ifdef CONFIG_TRACER_MAX_TRACE |
4584 | max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); | 4600 | max_tr.buffer = ring_buffer_alloc(1, rb_flags); |
4585 | if (!max_tr.buffer) { | 4601 | if (!max_tr.buffer) { |
4586 | printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); | 4602 | printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); |
4587 | WARN_ON(1); | 4603 | WARN_ON(1); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9021f8c0c0c3..5e9dfc6286dd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -272,8 +272,8 @@ struct tracer { | |||
272 | /* If you handled the flag setting, return 0 */ | 272 | /* If you handled the flag setting, return 0 */ |
273 | int (*set_flag)(u32 old_flags, u32 bit, int set); | 273 | int (*set_flag)(u32 old_flags, u32 bit, int set); |
274 | struct tracer *next; | 274 | struct tracer *next; |
275 | int print_max; | ||
276 | struct tracer_flags *flags; | 275 | struct tracer_flags *flags; |
276 | int print_max; | ||
277 | int use_max_tr; | 277 | int use_max_tr; |
278 | }; | 278 | }; |
279 | 279 | ||
@@ -606,6 +606,7 @@ enum trace_iterator_flags { | |||
606 | TRACE_ITER_SLEEP_TIME = 0x40000, | 606 | TRACE_ITER_SLEEP_TIME = 0x40000, |
607 | TRACE_ITER_GRAPH_TIME = 0x80000, | 607 | TRACE_ITER_GRAPH_TIME = 0x80000, |
608 | TRACE_ITER_RECORD_CMD = 0x100000, | 608 | TRACE_ITER_RECORD_CMD = 0x100000, |
609 | TRACE_ITER_OVERWRITE = 0x200000, | ||
609 | }; | 610 | }; |
610 | 611 | ||
611 | /* | 612 | /* |
@@ -661,8 +662,10 @@ struct ftrace_event_field { | |||
661 | }; | 662 | }; |
662 | 663 | ||
663 | struct event_filter { | 664 | struct event_filter { |
664 | int n_preds; | 665 | int n_preds; /* Number assigned */ |
665 | struct filter_pred **preds; | 666 | int a_preds; /* allocated */ |
667 | struct filter_pred *preds; | ||
668 | struct filter_pred *root; | ||
666 | char *filter_string; | 669 | char *filter_string; |
667 | }; | 670 | }; |
668 | 671 | ||
@@ -674,11 +677,23 @@ struct event_subsystem { | |||
674 | int nr_events; | 677 | int nr_events; |
675 | }; | 678 | }; |
676 | 679 | ||
680 | #define FILTER_PRED_INVALID ((unsigned short)-1) | ||
681 | #define FILTER_PRED_IS_RIGHT (1 << 15) | ||
682 | #define FILTER_PRED_FOLD (1 << 15) | ||
683 | |||
684 | /* | ||
685 | * The max preds is the size of unsigned short with | ||
686 | * two flags at the MSBs. One bit is used for both the IS_RIGHT | ||
687 | * and FOLD flags. The other is reserved. | ||
688 | * | ||
689 | * 2^14 preds is way more than enough. | ||
690 | */ | ||
691 | #define MAX_FILTER_PRED 16384 | ||
692 | |||
677 | struct filter_pred; | 693 | struct filter_pred; |
678 | struct regex; | 694 | struct regex; |
679 | 695 | ||
680 | typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, | 696 | typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); |
681 | int val1, int val2); | ||
682 | 697 | ||
683 | typedef int (*regex_match_func)(char *str, struct regex *r, int len); | 698 | typedef int (*regex_match_func)(char *str, struct regex *r, int len); |
684 | 699 | ||
@@ -700,11 +715,23 @@ struct filter_pred { | |||
700 | filter_pred_fn_t fn; | 715 | filter_pred_fn_t fn; |
701 | u64 val; | 716 | u64 val; |
702 | struct regex regex; | 717 | struct regex regex; |
703 | char *field_name; | 718 | /* |
719 | * Leaf nodes use field_name, ops is used by AND and OR | ||
720 | * nodes. The field_name is always freed when freeing a pred. | ||
721 | * We can overload field_name for ops and have it freed | ||
722 | * as well. | ||
723 | */ | ||
724 | union { | ||
725 | char *field_name; | ||
726 | unsigned short *ops; | ||
727 | }; | ||
704 | int offset; | 728 | int offset; |
705 | int not; | 729 | int not; |
706 | int op; | 730 | int op; |
707 | int pop_n; | 731 | unsigned short index; |
732 | unsigned short parent; | ||
733 | unsigned short left; | ||
734 | unsigned short right; | ||
708 | }; | 735 | }; |
709 | 736 | ||
710 | extern struct list_head ftrace_common_fields; | 737 | extern struct list_head ftrace_common_fields; |
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 6cf223764be8..1516cb3ec549 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, | |||
109 | */ | 109 | */ |
110 | #define FTRACE_CTX_FIELDS \ | 110 | #define FTRACE_CTX_FIELDS \ |
111 | __field( unsigned int, prev_pid ) \ | 111 | __field( unsigned int, prev_pid ) \ |
112 | __field( unsigned int, next_pid ) \ | ||
113 | __field( unsigned int, next_cpu ) \ | ||
112 | __field( unsigned char, prev_prio ) \ | 114 | __field( unsigned char, prev_prio ) \ |
113 | __field( unsigned char, prev_state ) \ | 115 | __field( unsigned char, prev_state ) \ |
114 | __field( unsigned int, next_pid ) \ | ||
115 | __field( unsigned char, next_prio ) \ | 116 | __field( unsigned char, next_prio ) \ |
116 | __field( unsigned char, next_state ) \ | 117 | __field( unsigned char, next_state ) |
117 | __field( unsigned int, next_cpu ) | ||
118 | 118 | ||
119 | FTRACE_ENTRY(context_switch, ctx_switch_entry, | 119 | FTRACE_ENTRY(context_switch, ctx_switch_entry, |
120 | 120 | ||
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5f499e0438a4..e88f74fe1d4c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void) | |||
116 | __common_field(unsigned char, flags); | 116 | __common_field(unsigned char, flags); |
117 | __common_field(unsigned char, preempt_count); | 117 | __common_field(unsigned char, preempt_count); |
118 | __common_field(int, pid); | 118 | __common_field(int, pid); |
119 | __common_field(int, lock_depth); | ||
120 | 119 | ||
121 | return ret; | 120 | return ret; |
122 | } | 121 | } |
@@ -326,6 +325,7 @@ int trace_set_clr_event(const char *system, const char *event, int set) | |||
326 | { | 325 | { |
327 | return __ftrace_set_clr_event(NULL, system, event, set); | 326 | return __ftrace_set_clr_event(NULL, system, event, set); |
328 | } | 327 | } |
328 | EXPORT_SYMBOL_GPL(trace_set_clr_event); | ||
329 | 329 | ||
330 | /* 128 should be much more than enough */ | 330 | /* 128 should be much more than enough */ |
331 | #define EVENT_BUF_SIZE 127 | 331 | #define EVENT_BUF_SIZE 127 |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 36d40104b17f..3249b4f77ef0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -123,9 +123,13 @@ struct filter_parse_state { | |||
123 | } operand; | 123 | } operand; |
124 | }; | 124 | }; |
125 | 125 | ||
126 | struct pred_stack { | ||
127 | struct filter_pred **preds; | ||
128 | int index; | ||
129 | }; | ||
130 | |||
126 | #define DEFINE_COMPARISON_PRED(type) \ | 131 | #define DEFINE_COMPARISON_PRED(type) \ |
127 | static int filter_pred_##type(struct filter_pred *pred, void *event, \ | 132 | static int filter_pred_##type(struct filter_pred *pred, void *event) \ |
128 | int val1, int val2) \ | ||
129 | { \ | 133 | { \ |
130 | type *addr = (type *)(event + pred->offset); \ | 134 | type *addr = (type *)(event + pred->offset); \ |
131 | type val = (type)pred->val; \ | 135 | type val = (type)pred->val; \ |
@@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \ | |||
152 | } | 156 | } |
153 | 157 | ||
154 | #define DEFINE_EQUALITY_PRED(size) \ | 158 | #define DEFINE_EQUALITY_PRED(size) \ |
155 | static int filter_pred_##size(struct filter_pred *pred, void *event, \ | 159 | static int filter_pred_##size(struct filter_pred *pred, void *event) \ |
156 | int val1, int val2) \ | ||
157 | { \ | 160 | { \ |
158 | u##size *addr = (u##size *)(event + pred->offset); \ | 161 | u##size *addr = (u##size *)(event + pred->offset); \ |
159 | u##size val = (u##size)pred->val; \ | 162 | u##size val = (u##size)pred->val; \ |
@@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32); | |||
178 | DEFINE_EQUALITY_PRED(16); | 181 | DEFINE_EQUALITY_PRED(16); |
179 | DEFINE_EQUALITY_PRED(8); | 182 | DEFINE_EQUALITY_PRED(8); |
180 | 183 | ||
181 | static int filter_pred_and(struct filter_pred *pred __attribute((unused)), | ||
182 | void *event __attribute((unused)), | ||
183 | int val1, int val2) | ||
184 | { | ||
185 | return val1 && val2; | ||
186 | } | ||
187 | |||
188 | static int filter_pred_or(struct filter_pred *pred __attribute((unused)), | ||
189 | void *event __attribute((unused)), | ||
190 | int val1, int val2) | ||
191 | { | ||
192 | return val1 || val2; | ||
193 | } | ||
194 | |||
195 | /* Filter predicate for fixed sized arrays of characters */ | 184 | /* Filter predicate for fixed sized arrays of characters */ |
196 | static int filter_pred_string(struct filter_pred *pred, void *event, | 185 | static int filter_pred_string(struct filter_pred *pred, void *event) |
197 | int val1, int val2) | ||
198 | { | 186 | { |
199 | char *addr = (char *)(event + pred->offset); | 187 | char *addr = (char *)(event + pred->offset); |
200 | int cmp, match; | 188 | int cmp, match; |
@@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event, | |||
207 | } | 195 | } |
208 | 196 | ||
209 | /* Filter predicate for char * pointers */ | 197 | /* Filter predicate for char * pointers */ |
210 | static int filter_pred_pchar(struct filter_pred *pred, void *event, | 198 | static int filter_pred_pchar(struct filter_pred *pred, void *event) |
211 | int val1, int val2) | ||
212 | { | 199 | { |
213 | char **addr = (char **)(event + pred->offset); | 200 | char **addr = (char **)(event + pred->offset); |
214 | int cmp, match; | 201 | int cmp, match; |
@@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, | |||
231 | * and add it to the address of the entry, and at last we have | 218 | * and add it to the address of the entry, and at last we have |
232 | * the address of the string. | 219 | * the address of the string. |
233 | */ | 220 | */ |
234 | static int filter_pred_strloc(struct filter_pred *pred, void *event, | 221 | static int filter_pred_strloc(struct filter_pred *pred, void *event) |
235 | int val1, int val2) | ||
236 | { | 222 | { |
237 | u32 str_item = *(u32 *)(event + pred->offset); | 223 | u32 str_item = *(u32 *)(event + pred->offset); |
238 | int str_loc = str_item & 0xffff; | 224 | int str_loc = str_item & 0xffff; |
@@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, | |||
247 | return match; | 233 | return match; |
248 | } | 234 | } |
249 | 235 | ||
250 | static int filter_pred_none(struct filter_pred *pred, void *event, | 236 | static int filter_pred_none(struct filter_pred *pred, void *event) |
251 | int val1, int val2) | ||
252 | { | 237 | { |
253 | return 0; | 238 | return 0; |
254 | } | 239 | } |
@@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred) | |||
377 | pred->not ^= not; | 362 | pred->not ^= not; |
378 | } | 363 | } |
379 | 364 | ||
365 | enum move_type { | ||
366 | MOVE_DOWN, | ||
367 | MOVE_UP_FROM_LEFT, | ||
368 | MOVE_UP_FROM_RIGHT | ||
369 | }; | ||
370 | |||
371 | static struct filter_pred * | ||
372 | get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, | ||
373 | int index, enum move_type *move) | ||
374 | { | ||
375 | if (pred->parent & FILTER_PRED_IS_RIGHT) | ||
376 | *move = MOVE_UP_FROM_RIGHT; | ||
377 | else | ||
378 | *move = MOVE_UP_FROM_LEFT; | ||
379 | pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; | ||
380 | |||
381 | return pred; | ||
382 | } | ||
383 | |||
384 | /* | ||
385 | * A series of AND or ORs were found together. Instead of | ||
386 | * climbing up and down the tree branches, an array of the | ||
387 | * ops were made in order of checks. We can just move across | ||
388 | * the array and short circuit if needed. | ||
389 | */ | ||
390 | static int process_ops(struct filter_pred *preds, | ||
391 | struct filter_pred *op, void *rec) | ||
392 | { | ||
393 | struct filter_pred *pred; | ||
394 | int type; | ||
395 | int match; | ||
396 | int i; | ||
397 | |||
398 | /* | ||
399 | * Micro-optimization: We set type to true if op | ||
400 | * is an OR and false otherwise (AND). Then we | ||
401 | * just need to test if the match is equal to | ||
402 | * the type, and if it is, we can short circuit the | ||
403 | * rest of the checks: | ||
404 | * | ||
405 | * if ((match && op->op == OP_OR) || | ||
406 | * (!match && op->op == OP_AND)) | ||
407 | * return match; | ||
408 | */ | ||
409 | type = op->op == OP_OR; | ||
410 | |||
411 | for (i = 0; i < op->val; i++) { | ||
412 | pred = &preds[op->ops[i]]; | ||
413 | match = pred->fn(pred, rec); | ||
414 | if (!!match == type) | ||
415 | return match; | ||
416 | } | ||
417 | return match; | ||
418 | } | ||
419 | |||
380 | /* return 1 if event matches, 0 otherwise (discard) */ | 420 | /* return 1 if event matches, 0 otherwise (discard) */ |
381 | int filter_match_preds(struct event_filter *filter, void *rec) | 421 | int filter_match_preds(struct event_filter *filter, void *rec) |
382 | { | 422 | { |
383 | int match, top = 0, val1 = 0, val2 = 0; | 423 | int match = -1; |
384 | int stack[MAX_FILTER_PRED]; | 424 | enum move_type move = MOVE_DOWN; |
425 | struct filter_pred *preds; | ||
385 | struct filter_pred *pred; | 426 | struct filter_pred *pred; |
386 | int i; | 427 | struct filter_pred *root; |
428 | int n_preds; | ||
429 | int done = 0; | ||
430 | |||
431 | /* no filter is considered a match */ | ||
432 | if (!filter) | ||
433 | return 1; | ||
434 | |||
435 | n_preds = filter->n_preds; | ||
436 | |||
437 | if (!n_preds) | ||
438 | return 1; | ||
439 | |||
440 | /* | ||
441 | * n_preds, root and filter->preds are protected with preemption disabled. | ||
442 | */ | ||
443 | preds = rcu_dereference_sched(filter->preds); | ||
444 | root = rcu_dereference_sched(filter->root); | ||
445 | if (!root) | ||
446 | return 1; | ||
447 | |||
448 | pred = root; | ||
387 | 449 | ||
388 | for (i = 0; i < filter->n_preds; i++) { | 450 | /* match is currently meaningless */ |
389 | pred = filter->preds[i]; | 451 | match = -1; |
390 | if (!pred->pop_n) { | 452 | |
391 | match = pred->fn(pred, rec, val1, val2); | 453 | do { |
392 | stack[top++] = match; | 454 | switch (move) { |
455 | case MOVE_DOWN: | ||
456 | /* only AND and OR have children */ | ||
457 | if (pred->left != FILTER_PRED_INVALID) { | ||
458 | /* If ops is set, then it was folded. */ | ||
459 | if (!pred->ops) { | ||
460 | /* keep going down the left side */ | ||
461 | pred = &preds[pred->left]; | ||
462 | continue; | ||
463 | } | ||
464 | /* We can treat folded ops as a leaf node */ | ||
465 | match = process_ops(preds, pred, rec); | ||
466 | } else | ||
467 | match = pred->fn(pred, rec); | ||
468 | /* If this pred is the only pred */ | ||
469 | if (pred == root) | ||
470 | break; | ||
471 | pred = get_pred_parent(pred, preds, | ||
472 | pred->parent, &move); | ||
473 | continue; | ||
474 | case MOVE_UP_FROM_LEFT: | ||
475 | /* | ||
476 | * Check for short circuits. | ||
477 | * | ||
478 | * Optimization: !!match == (pred->op == OP_OR) | ||
479 | * is the same as: | ||
480 | * if ((match && pred->op == OP_OR) || | ||
481 | * (!match && pred->op == OP_AND)) | ||
482 | */ | ||
483 | if (!!match == (pred->op == OP_OR)) { | ||
484 | if (pred == root) | ||
485 | break; | ||
486 | pred = get_pred_parent(pred, preds, | ||
487 | pred->parent, &move); | ||
488 | continue; | ||
489 | } | ||
490 | /* now go down the right side of the tree. */ | ||
491 | pred = &preds[pred->right]; | ||
492 | move = MOVE_DOWN; | ||
493 | continue; | ||
494 | case MOVE_UP_FROM_RIGHT: | ||
495 | /* We finished this equation. */ | ||
496 | if (pred == root) | ||
497 | break; | ||
498 | pred = get_pred_parent(pred, preds, | ||
499 | pred->parent, &move); | ||
393 | continue; | 500 | continue; |
394 | } | 501 | } |
395 | if (pred->pop_n > top) { | 502 | done = 1; |
396 | WARN_ON_ONCE(1); | 503 | } while (!done); |
397 | return 0; | ||
398 | } | ||
399 | val1 = stack[--top]; | ||
400 | val2 = stack[--top]; | ||
401 | match = pred->fn(pred, rec, val1, val2); | ||
402 | stack[top++] = match; | ||
403 | } | ||
404 | 504 | ||
405 | return stack[--top]; | 505 | return match; |
406 | } | 506 | } |
407 | EXPORT_SYMBOL_GPL(filter_match_preds); | 507 | EXPORT_SYMBOL_GPL(filter_match_preds); |
408 | 508 | ||
@@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos) | |||
414 | 514 | ||
415 | static void remove_filter_string(struct event_filter *filter) | 515 | static void remove_filter_string(struct event_filter *filter) |
416 | { | 516 | { |
517 | if (!filter) | ||
518 | return; | ||
519 | |||
417 | kfree(filter->filter_string); | 520 | kfree(filter->filter_string); |
418 | filter->filter_string = NULL; | 521 | filter->filter_string = NULL; |
419 | } | 522 | } |
@@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps, | |||
473 | 576 | ||
474 | void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) | 577 | void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) |
475 | { | 578 | { |
476 | struct event_filter *filter = call->filter; | 579 | struct event_filter *filter; |
477 | 580 | ||
478 | mutex_lock(&event_mutex); | 581 | mutex_lock(&event_mutex); |
582 | filter = call->filter; | ||
479 | if (filter && filter->filter_string) | 583 | if (filter && filter->filter_string) |
480 | trace_seq_printf(s, "%s\n", filter->filter_string); | 584 | trace_seq_printf(s, "%s\n", filter->filter_string); |
481 | else | 585 | else |
@@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) | |||
486 | void print_subsystem_event_filter(struct event_subsystem *system, | 590 | void print_subsystem_event_filter(struct event_subsystem *system, |
487 | struct trace_seq *s) | 591 | struct trace_seq *s) |
488 | { | 592 | { |
489 | struct event_filter *filter = system->filter; | 593 | struct event_filter *filter; |
490 | 594 | ||
491 | mutex_lock(&event_mutex); | 595 | mutex_lock(&event_mutex); |
596 | filter = system->filter; | ||
492 | if (filter && filter->filter_string) | 597 | if (filter && filter->filter_string) |
493 | trace_seq_printf(s, "%s\n", filter->filter_string); | 598 | trace_seq_printf(s, "%s\n", filter->filter_string); |
494 | else | 599 | else |
@@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred) | |||
539 | pred->regex.len = 0; | 644 | pred->regex.len = 0; |
540 | } | 645 | } |
541 | 646 | ||
542 | static int filter_set_pred(struct filter_pred *dest, | 647 | static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) |
648 | { | ||
649 | stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); | ||
650 | if (!stack->preds) | ||
651 | return -ENOMEM; | ||
652 | stack->index = n_preds; | ||
653 | return 0; | ||
654 | } | ||
655 | |||
656 | static void __free_pred_stack(struct pred_stack *stack) | ||
657 | { | ||
658 | kfree(stack->preds); | ||
659 | stack->index = 0; | ||
660 | } | ||
661 | |||
662 | static int __push_pred_stack(struct pred_stack *stack, | ||
663 | struct filter_pred *pred) | ||
664 | { | ||
665 | int index = stack->index; | ||
666 | |||
667 | if (WARN_ON(index == 0)) | ||
668 | return -ENOSPC; | ||
669 | |||
670 | stack->preds[--index] = pred; | ||
671 | stack->index = index; | ||
672 | return 0; | ||
673 | } | ||
674 | |||
675 | static struct filter_pred * | ||
676 | __pop_pred_stack(struct pred_stack *stack) | ||
677 | { | ||
678 | struct filter_pred *pred; | ||
679 | int index = stack->index; | ||
680 | |||
681 | pred = stack->preds[index++]; | ||
682 | if (!pred) | ||
683 | return NULL; | ||
684 | |||
685 | stack->index = index; | ||
686 | return pred; | ||
687 | } | ||
688 | |||
689 | static int filter_set_pred(struct event_filter *filter, | ||
690 | int idx, | ||
691 | struct pred_stack *stack, | ||
543 | struct filter_pred *src, | 692 | struct filter_pred *src, |
544 | filter_pred_fn_t fn) | 693 | filter_pred_fn_t fn) |
545 | { | 694 | { |
695 | struct filter_pred *dest = &filter->preds[idx]; | ||
696 | struct filter_pred *left; | ||
697 | struct filter_pred *right; | ||
698 | |||
546 | *dest = *src; | 699 | *dest = *src; |
547 | if (src->field_name) { | 700 | if (src->field_name) { |
548 | dest->field_name = kstrdup(src->field_name, GFP_KERNEL); | 701 | dest->field_name = kstrdup(src->field_name, GFP_KERNEL); |
@@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest, | |||
550 | return -ENOMEM; | 703 | return -ENOMEM; |
551 | } | 704 | } |
552 | dest->fn = fn; | 705 | dest->fn = fn; |
706 | dest->index = idx; | ||
553 | 707 | ||
554 | return 0; | 708 | if (dest->op == OP_OR || dest->op == OP_AND) { |
709 | right = __pop_pred_stack(stack); | ||
710 | left = __pop_pred_stack(stack); | ||
711 | if (!left || !right) | ||
712 | return -EINVAL; | ||
713 | /* | ||
714 | * If both children can be folded | ||
715 | * and they are the same op as this op or a leaf, | ||
716 | * then this op can be folded. | ||
717 | */ | ||
718 | if (left->index & FILTER_PRED_FOLD && | ||
719 | (left->op == dest->op || | ||
720 | left->left == FILTER_PRED_INVALID) && | ||
721 | right->index & FILTER_PRED_FOLD && | ||
722 | (right->op == dest->op || | ||
723 | right->left == FILTER_PRED_INVALID)) | ||
724 | dest->index |= FILTER_PRED_FOLD; | ||
725 | |||
726 | dest->left = left->index & ~FILTER_PRED_FOLD; | ||
727 | dest->right = right->index & ~FILTER_PRED_FOLD; | ||
728 | left->parent = dest->index & ~FILTER_PRED_FOLD; | ||
729 | right->parent = dest->index | FILTER_PRED_IS_RIGHT; | ||
730 | } else { | ||
731 | /* | ||
732 | * Make dest->left invalid to be used as a quick | ||
733 | * way to know this is a leaf node. | ||
734 | */ | ||
735 | dest->left = FILTER_PRED_INVALID; | ||
736 | |||
737 | /* All leafs allow folding the parent ops. */ | ||
738 | dest->index |= FILTER_PRED_FOLD; | ||
739 | } | ||
740 | |||
741 | return __push_pred_stack(stack, dest); | ||
555 | } | 742 | } |
556 | 743 | ||
557 | static void filter_disable_preds(struct ftrace_event_call *call) | 744 | static void __free_preds(struct event_filter *filter) |
558 | { | 745 | { |
559 | struct event_filter *filter = call->filter; | ||
560 | int i; | 746 | int i; |
561 | 747 | ||
562 | call->flags &= ~TRACE_EVENT_FL_FILTERED; | 748 | if (filter->preds) { |
749 | for (i = 0; i < filter->a_preds; i++) | ||
750 | kfree(filter->preds[i].field_name); | ||
751 | kfree(filter->preds); | ||
752 | filter->preds = NULL; | ||
753 | } | ||
754 | filter->a_preds = 0; | ||
563 | filter->n_preds = 0; | 755 | filter->n_preds = 0; |
564 | |||
565 | for (i = 0; i < MAX_FILTER_PRED; i++) | ||
566 | filter->preds[i]->fn = filter_pred_none; | ||
567 | } | 756 | } |
568 | 757 | ||
569 | static void __free_preds(struct event_filter *filter) | 758 | static void filter_disable(struct ftrace_event_call *call) |
570 | { | 759 | { |
571 | int i; | 760 | call->flags &= ~TRACE_EVENT_FL_FILTERED; |
761 | } | ||
572 | 762 | ||
763 | static void __free_filter(struct event_filter *filter) | ||
764 | { | ||
573 | if (!filter) | 765 | if (!filter) |
574 | return; | 766 | return; |
575 | 767 | ||
576 | for (i = 0; i < MAX_FILTER_PRED; i++) { | 768 | __free_preds(filter); |
577 | if (filter->preds[i]) | ||
578 | filter_free_pred(filter->preds[i]); | ||
579 | } | ||
580 | kfree(filter->preds); | ||
581 | kfree(filter->filter_string); | 769 | kfree(filter->filter_string); |
582 | kfree(filter); | 770 | kfree(filter); |
583 | } | 771 | } |
584 | 772 | ||
773 | /* | ||
774 | * Called when destroying the ftrace_event_call. | ||
775 | * The call is being freed, so we do not need to worry about | ||
776 | * the call being currently used. This is for module code removing | ||
777 | * the tracepoints from within it. | ||
778 | */ | ||
585 | void destroy_preds(struct ftrace_event_call *call) | 779 | void destroy_preds(struct ftrace_event_call *call) |
586 | { | 780 | { |
587 | __free_preds(call->filter); | 781 | __free_filter(call->filter); |
588 | call->filter = NULL; | 782 | call->filter = NULL; |
589 | call->flags &= ~TRACE_EVENT_FL_FILTERED; | ||
590 | } | 783 | } |
591 | 784 | ||
592 | static struct event_filter *__alloc_preds(void) | 785 | static struct event_filter *__alloc_filter(void) |
593 | { | 786 | { |
594 | struct event_filter *filter; | 787 | struct event_filter *filter; |
788 | |||
789 | filter = kzalloc(sizeof(*filter), GFP_KERNEL); | ||
790 | return filter; | ||
791 | } | ||
792 | |||
793 | static int __alloc_preds(struct event_filter *filter, int n_preds) | ||
794 | { | ||
595 | struct filter_pred *pred; | 795 | struct filter_pred *pred; |
596 | int i; | 796 | int i; |
597 | 797 | ||
598 | filter = kzalloc(sizeof(*filter), GFP_KERNEL); | 798 | if (filter->preds) |
599 | if (!filter) | 799 | __free_preds(filter); |
600 | return ERR_PTR(-ENOMEM); | ||
601 | 800 | ||
602 | filter->n_preds = 0; | 801 | filter->preds = |
802 | kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); | ||
603 | 803 | ||
604 | filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); | ||
605 | if (!filter->preds) | 804 | if (!filter->preds) |
606 | goto oom; | 805 | return -ENOMEM; |
607 | 806 | ||
608 | for (i = 0; i < MAX_FILTER_PRED; i++) { | 807 | filter->a_preds = n_preds; |
609 | pred = kzalloc(sizeof(*pred), GFP_KERNEL); | 808 | filter->n_preds = 0; |
610 | if (!pred) | 809 | |
611 | goto oom; | 810 | for (i = 0; i < n_preds; i++) { |
811 | pred = &filter->preds[i]; | ||
612 | pred->fn = filter_pred_none; | 812 | pred->fn = filter_pred_none; |
613 | filter->preds[i] = pred; | ||
614 | } | 813 | } |
615 | 814 | ||
616 | return filter; | ||
617 | |||
618 | oom: | ||
619 | __free_preds(filter); | ||
620 | return ERR_PTR(-ENOMEM); | ||
621 | } | ||
622 | |||
623 | static int init_preds(struct ftrace_event_call *call) | ||
624 | { | ||
625 | if (call->filter) | ||
626 | return 0; | ||
627 | |||
628 | call->flags &= ~TRACE_EVENT_FL_FILTERED; | ||
629 | call->filter = __alloc_preds(); | ||
630 | if (IS_ERR(call->filter)) | ||
631 | return PTR_ERR(call->filter); | ||
632 | |||
633 | return 0; | 815 | return 0; |
634 | } | 816 | } |
635 | 817 | ||
636 | static int init_subsystem_preds(struct event_subsystem *system) | 818 | static void filter_free_subsystem_preds(struct event_subsystem *system) |
637 | { | 819 | { |
638 | struct ftrace_event_call *call; | 820 | struct ftrace_event_call *call; |
639 | int err; | ||
640 | 821 | ||
641 | list_for_each_entry(call, &ftrace_events, list) { | 822 | list_for_each_entry(call, &ftrace_events, list) { |
642 | if (strcmp(call->class->system, system->name) != 0) | 823 | if (strcmp(call->class->system, system->name) != 0) |
643 | continue; | 824 | continue; |
644 | 825 | ||
645 | err = init_preds(call); | 826 | filter_disable(call); |
646 | if (err) | 827 | remove_filter_string(call->filter); |
647 | return err; | ||
648 | } | 828 | } |
649 | |||
650 | return 0; | ||
651 | } | 829 | } |
652 | 830 | ||
653 | static void filter_free_subsystem_preds(struct event_subsystem *system) | 831 | static void filter_free_subsystem_filters(struct event_subsystem *system) |
654 | { | 832 | { |
655 | struct ftrace_event_call *call; | 833 | struct ftrace_event_call *call; |
656 | 834 | ||
657 | list_for_each_entry(call, &ftrace_events, list) { | 835 | list_for_each_entry(call, &ftrace_events, list) { |
658 | if (strcmp(call->class->system, system->name) != 0) | 836 | if (strcmp(call->class->system, system->name) != 0) |
659 | continue; | 837 | continue; |
660 | 838 | __free_filter(call->filter); | |
661 | filter_disable_preds(call); | 839 | call->filter = NULL; |
662 | remove_filter_string(call->filter); | ||
663 | } | 840 | } |
664 | } | 841 | } |
665 | 842 | ||
@@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, | |||
667 | struct ftrace_event_call *call, | 844 | struct ftrace_event_call *call, |
668 | struct event_filter *filter, | 845 | struct event_filter *filter, |
669 | struct filter_pred *pred, | 846 | struct filter_pred *pred, |
847 | struct pred_stack *stack, | ||
670 | filter_pred_fn_t fn) | 848 | filter_pred_fn_t fn) |
671 | { | 849 | { |
672 | int idx, err; | 850 | int idx, err; |
673 | 851 | ||
674 | if (filter->n_preds == MAX_FILTER_PRED) { | 852 | if (WARN_ON(filter->n_preds == filter->a_preds)) { |
675 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | 853 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); |
676 | return -ENOSPC; | 854 | return -ENOSPC; |
677 | } | 855 | } |
678 | 856 | ||
679 | idx = filter->n_preds; | 857 | idx = filter->n_preds; |
680 | filter_clear_pred(filter->preds[idx]); | 858 | filter_clear_pred(&filter->preds[idx]); |
681 | err = filter_set_pred(filter->preds[idx], pred, fn); | 859 | err = filter_set_pred(filter, idx, stack, pred, fn); |
682 | if (err) | 860 | if (err) |
683 | return err; | 861 | return err; |
684 | 862 | ||
@@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps, | |||
763 | struct ftrace_event_call *call, | 941 | struct ftrace_event_call *call, |
764 | struct event_filter *filter, | 942 | struct event_filter *filter, |
765 | struct filter_pred *pred, | 943 | struct filter_pred *pred, |
944 | struct pred_stack *stack, | ||
766 | bool dry_run) | 945 | bool dry_run) |
767 | { | 946 | { |
768 | struct ftrace_event_field *field; | 947 | struct ftrace_event_field *field; |
@@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps, | |||
770 | unsigned long long val; | 949 | unsigned long long val; |
771 | int ret; | 950 | int ret; |
772 | 951 | ||
773 | pred->fn = filter_pred_none; | 952 | fn = pred->fn = filter_pred_none; |
774 | 953 | ||
775 | if (pred->op == OP_AND) { | 954 | if (pred->op == OP_AND) |
776 | pred->pop_n = 2; | ||
777 | fn = filter_pred_and; | ||
778 | goto add_pred_fn; | 955 | goto add_pred_fn; |
779 | } else if (pred->op == OP_OR) { | 956 | else if (pred->op == OP_OR) |
780 | pred->pop_n = 2; | ||
781 | fn = filter_pred_or; | ||
782 | goto add_pred_fn; | 957 | goto add_pred_fn; |
783 | } | ||
784 | 958 | ||
785 | field = find_event_field(call, pred->field_name); | 959 | field = find_event_field(call, pred->field_name); |
786 | if (!field) { | 960 | if (!field) { |
@@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps, | |||
829 | 1003 | ||
830 | add_pred_fn: | 1004 | add_pred_fn: |
831 | if (!dry_run) | 1005 | if (!dry_run) |
832 | return filter_add_pred_fn(ps, call, filter, pred, fn); | 1006 | return filter_add_pred_fn(ps, call, filter, pred, stack, fn); |
833 | return 0; | 1007 | return 0; |
834 | } | 1008 | } |
835 | 1009 | ||
@@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps) | |||
1187 | return 0; | 1361 | return 0; |
1188 | } | 1362 | } |
1189 | 1363 | ||
1364 | static int count_preds(struct filter_parse_state *ps) | ||
1365 | { | ||
1366 | struct postfix_elt *elt; | ||
1367 | int n_preds = 0; | ||
1368 | |||
1369 | list_for_each_entry(elt, &ps->postfix, list) { | ||
1370 | if (elt->op == OP_NONE) | ||
1371 | continue; | ||
1372 | n_preds++; | ||
1373 | } | ||
1374 | |||
1375 | return n_preds; | ||
1376 | } | ||
1377 | |||
1378 | /* | ||
1379 | * The tree is walked at filtering of an event. If the tree is not correctly | ||
1380 | * built, it may cause an infinite loop. Check here that the tree does | ||
1381 | * indeed terminate. | ||
1382 | */ | ||
1383 | static int check_pred_tree(struct event_filter *filter, | ||
1384 | struct filter_pred *root) | ||
1385 | { | ||
1386 | struct filter_pred *preds; | ||
1387 | struct filter_pred *pred; | ||
1388 | enum move_type move = MOVE_DOWN; | ||
1389 | int count = 0; | ||
1390 | int done = 0; | ||
1391 | int max; | ||
1392 | |||
1393 | /* | ||
1394 | * The max that we can hit a node is three times. | ||
1395 | * Once going down, once coming up from left, and | ||
1396 | * once coming up from right. This is more than enough | ||
1397 | * since leafs are only hit a single time. | ||
1398 | */ | ||
1399 | max = 3 * filter->n_preds; | ||
1400 | |||
1401 | preds = filter->preds; | ||
1402 | if (!preds) | ||
1403 | return -EINVAL; | ||
1404 | pred = root; | ||
1405 | |||
1406 | do { | ||
1407 | if (WARN_ON(count++ > max)) | ||
1408 | return -EINVAL; | ||
1409 | |||
1410 | switch (move) { | ||
1411 | case MOVE_DOWN: | ||
1412 | if (pred->left != FILTER_PRED_INVALID) { | ||
1413 | pred = &preds[pred->left]; | ||
1414 | continue; | ||
1415 | } | ||
1416 | /* A leaf at the root is just a leaf in the tree */ | ||
1417 | if (pred == root) | ||
1418 | break; | ||
1419 | pred = get_pred_parent(pred, preds, | ||
1420 | pred->parent, &move); | ||
1421 | continue; | ||
1422 | case MOVE_UP_FROM_LEFT: | ||
1423 | pred = &preds[pred->right]; | ||
1424 | move = MOVE_DOWN; | ||
1425 | continue; | ||
1426 | case MOVE_UP_FROM_RIGHT: | ||
1427 | if (pred == root) | ||
1428 | break; | ||
1429 | pred = get_pred_parent(pred, preds, | ||
1430 | pred->parent, &move); | ||
1431 | continue; | ||
1432 | } | ||
1433 | done = 1; | ||
1434 | } while (!done); | ||
1435 | |||
1436 | /* We are fine. */ | ||
1437 | return 0; | ||
1438 | } | ||
1439 | |||
1440 | static int count_leafs(struct filter_pred *preds, struct filter_pred *root) | ||
1441 | { | ||
1442 | struct filter_pred *pred; | ||
1443 | enum move_type move = MOVE_DOWN; | ||
1444 | int count = 0; | ||
1445 | int done = 0; | ||
1446 | |||
1447 | pred = root; | ||
1448 | |||
1449 | do { | ||
1450 | switch (move) { | ||
1451 | case MOVE_DOWN: | ||
1452 | if (pred->left != FILTER_PRED_INVALID) { | ||
1453 | pred = &preds[pred->left]; | ||
1454 | continue; | ||
1455 | } | ||
1456 | /* A leaf at the root is just a leaf in the tree */ | ||
1457 | if (pred == root) | ||
1458 | return 1; | ||
1459 | count++; | ||
1460 | pred = get_pred_parent(pred, preds, | ||
1461 | pred->parent, &move); | ||
1462 | continue; | ||
1463 | case MOVE_UP_FROM_LEFT: | ||
1464 | pred = &preds[pred->right]; | ||
1465 | move = MOVE_DOWN; | ||
1466 | continue; | ||
1467 | case MOVE_UP_FROM_RIGHT: | ||
1468 | if (pred == root) | ||
1469 | break; | ||
1470 | pred = get_pred_parent(pred, preds, | ||
1471 | pred->parent, &move); | ||
1472 | continue; | ||
1473 | } | ||
1474 | done = 1; | ||
1475 | } while (!done); | ||
1476 | |||
1477 | return count; | ||
1478 | } | ||
1479 | |||
1480 | static int fold_pred(struct filter_pred *preds, struct filter_pred *root) | ||
1481 | { | ||
1482 | struct filter_pred *pred; | ||
1483 | enum move_type move = MOVE_DOWN; | ||
1484 | int count = 0; | ||
1485 | int children; | ||
1486 | int done = 0; | ||
1487 | |||
1488 | /* No need to keep the fold flag */ | ||
1489 | root->index &= ~FILTER_PRED_FOLD; | ||
1490 | |||
1491 | /* If the root is a leaf then do nothing */ | ||
1492 | if (root->left == FILTER_PRED_INVALID) | ||
1493 | return 0; | ||
1494 | |||
1495 | /* count the children */ | ||
1496 | children = count_leafs(preds, &preds[root->left]); | ||
1497 | children += count_leafs(preds, &preds[root->right]); | ||
1498 | |||
1499 | root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); | ||
1500 | if (!root->ops) | ||
1501 | return -ENOMEM; | ||
1502 | |||
1503 | root->val = children; | ||
1504 | |||
1505 | pred = root; | ||
1506 | do { | ||
1507 | switch (move) { | ||
1508 | case MOVE_DOWN: | ||
1509 | if (pred->left != FILTER_PRED_INVALID) { | ||
1510 | pred = &preds[pred->left]; | ||
1511 | continue; | ||
1512 | } | ||
1513 | if (WARN_ON(count == children)) | ||
1514 | return -EINVAL; | ||
1515 | pred->index &= ~FILTER_PRED_FOLD; | ||
1516 | root->ops[count++] = pred->index; | ||
1517 | pred = get_pred_parent(pred, preds, | ||
1518 | pred->parent, &move); | ||
1519 | continue; | ||
1520 | case MOVE_UP_FROM_LEFT: | ||
1521 | pred = &preds[pred->right]; | ||
1522 | move = MOVE_DOWN; | ||
1523 | continue; | ||
1524 | case MOVE_UP_FROM_RIGHT: | ||
1525 | if (pred == root) | ||
1526 | break; | ||
1527 | pred = get_pred_parent(pred, preds, | ||
1528 | pred->parent, &move); | ||
1529 | continue; | ||
1530 | } | ||
1531 | done = 1; | ||
1532 | } while (!done); | ||
1533 | |||
1534 | return 0; | ||
1535 | } | ||
1536 | |||
1537 | /* | ||
1538 | * To optimize the processing of the ops, if we have several "ors" or | ||
1539 | * "ands" together, we can put them in an array and process them all | ||
1540 | * together speeding up the filter logic. | ||
1541 | */ | ||
1542 | static int fold_pred_tree(struct event_filter *filter, | ||
1543 | struct filter_pred *root) | ||
1544 | { | ||
1545 | struct filter_pred *preds; | ||
1546 | struct filter_pred *pred; | ||
1547 | enum move_type move = MOVE_DOWN; | ||
1548 | int done = 0; | ||
1549 | int err; | ||
1550 | |||
1551 | preds = filter->preds; | ||
1552 | if (!preds) | ||
1553 | return -EINVAL; | ||
1554 | pred = root; | ||
1555 | |||
1556 | do { | ||
1557 | switch (move) { | ||
1558 | case MOVE_DOWN: | ||
1559 | if (pred->index & FILTER_PRED_FOLD) { | ||
1560 | err = fold_pred(preds, pred); | ||
1561 | if (err) | ||
1562 | return err; | ||
1563 | /* Folded nodes are like leafs */ | ||
1564 | } else if (pred->left != FILTER_PRED_INVALID) { | ||
1565 | pred = &preds[pred->left]; | ||
1566 | continue; | ||
1567 | } | ||
1568 | |||
1569 | /* A leaf at the root is just a leaf in the tree */ | ||
1570 | if (pred == root) | ||
1571 | break; | ||
1572 | pred = get_pred_parent(pred, preds, | ||
1573 | pred->parent, &move); | ||
1574 | continue; | ||
1575 | case MOVE_UP_FROM_LEFT: | ||
1576 | pred = &preds[pred->right]; | ||
1577 | move = MOVE_DOWN; | ||
1578 | continue; | ||
1579 | case MOVE_UP_FROM_RIGHT: | ||
1580 | if (pred == root) | ||
1581 | break; | ||
1582 | pred = get_pred_parent(pred, preds, | ||
1583 | pred->parent, &move); | ||
1584 | continue; | ||
1585 | } | ||
1586 | done = 1; | ||
1587 | } while (!done); | ||
1588 | |||
1589 | return 0; | ||
1590 | } | ||
1591 | |||
1190 | static int replace_preds(struct ftrace_event_call *call, | 1592 | static int replace_preds(struct ftrace_event_call *call, |
1191 | struct event_filter *filter, | 1593 | struct event_filter *filter, |
1192 | struct filter_parse_state *ps, | 1594 | struct filter_parse_state *ps, |
@@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call, | |||
1195 | { | 1597 | { |
1196 | char *operand1 = NULL, *operand2 = NULL; | 1598 | char *operand1 = NULL, *operand2 = NULL; |
1197 | struct filter_pred *pred; | 1599 | struct filter_pred *pred; |
1600 | struct filter_pred *root; | ||
1198 | struct postfix_elt *elt; | 1601 | struct postfix_elt *elt; |
1602 | struct pred_stack stack = { }; /* init to NULL */ | ||
1199 | int err; | 1603 | int err; |
1200 | int n_preds = 0; | 1604 | int n_preds = 0; |
1201 | 1605 | ||
1606 | n_preds = count_preds(ps); | ||
1607 | if (n_preds >= MAX_FILTER_PRED) { | ||
1608 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | ||
1609 | return -ENOSPC; | ||
1610 | } | ||
1611 | |||
1202 | err = check_preds(ps); | 1612 | err = check_preds(ps); |
1203 | if (err) | 1613 | if (err) |
1204 | return err; | 1614 | return err; |
1205 | 1615 | ||
1616 | if (!dry_run) { | ||
1617 | err = __alloc_pred_stack(&stack, n_preds); | ||
1618 | if (err) | ||
1619 | return err; | ||
1620 | err = __alloc_preds(filter, n_preds); | ||
1621 | if (err) | ||
1622 | goto fail; | ||
1623 | } | ||
1624 | |||
1625 | n_preds = 0; | ||
1206 | list_for_each_entry(elt, &ps->postfix, list) { | 1626 | list_for_each_entry(elt, &ps->postfix, list) { |
1207 | if (elt->op == OP_NONE) { | 1627 | if (elt->op == OP_NONE) { |
1208 | if (!operand1) | 1628 | if (!operand1) |
@@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call, | |||
1211 | operand2 = elt->operand; | 1631 | operand2 = elt->operand; |
1212 | else { | 1632 | else { |
1213 | parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); | 1633 | parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); |
1214 | return -EINVAL; | 1634 | err = -EINVAL; |
1635 | goto fail; | ||
1215 | } | 1636 | } |
1216 | continue; | 1637 | continue; |
1217 | } | 1638 | } |
1218 | 1639 | ||
1219 | if (n_preds++ == MAX_FILTER_PRED) { | 1640 | if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { |
1220 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | 1641 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); |
1221 | return -ENOSPC; | 1642 | err = -ENOSPC; |
1643 | goto fail; | ||
1222 | } | 1644 | } |
1223 | 1645 | ||
1224 | if (elt->op == OP_AND || elt->op == OP_OR) { | 1646 | if (elt->op == OP_AND || elt->op == OP_OR) { |
@@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call, | |||
1228 | 1650 | ||
1229 | if (!operand1 || !operand2) { | 1651 | if (!operand1 || !operand2) { |
1230 | parse_error(ps, FILT_ERR_MISSING_FIELD, 0); | 1652 | parse_error(ps, FILT_ERR_MISSING_FIELD, 0); |
1231 | return -EINVAL; | 1653 | err = -EINVAL; |
1654 | goto fail; | ||
1232 | } | 1655 | } |
1233 | 1656 | ||
1234 | pred = create_pred(elt->op, operand1, operand2); | 1657 | pred = create_pred(elt->op, operand1, operand2); |
1235 | add_pred: | 1658 | add_pred: |
1236 | if (!pred) | 1659 | if (!pred) { |
1237 | return -ENOMEM; | 1660 | err = -ENOMEM; |
1238 | err = filter_add_pred(ps, call, filter, pred, dry_run); | 1661 | goto fail; |
1662 | } | ||
1663 | err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); | ||
1239 | filter_free_pred(pred); | 1664 | filter_free_pred(pred); |
1240 | if (err) | 1665 | if (err) |
1241 | return err; | 1666 | goto fail; |
1242 | 1667 | ||
1243 | operand1 = operand2 = NULL; | 1668 | operand1 = operand2 = NULL; |
1244 | } | 1669 | } |
1245 | 1670 | ||
1246 | return 0; | 1671 | if (!dry_run) { |
1672 | /* We should have one item left on the stack */ | ||
1673 | pred = __pop_pred_stack(&stack); | ||
1674 | if (!pred) | ||
1675 | return -EINVAL; | ||
1676 | /* This item is where we start from in matching */ | ||
1677 | root = pred; | ||
1678 | /* Make sure the stack is empty */ | ||
1679 | pred = __pop_pred_stack(&stack); | ||
1680 | if (WARN_ON(pred)) { | ||
1681 | err = -EINVAL; | ||
1682 | filter->root = NULL; | ||
1683 | goto fail; | ||
1684 | } | ||
1685 | err = check_pred_tree(filter, root); | ||
1686 | if (err) | ||
1687 | goto fail; | ||
1688 | |||
1689 | /* Optimize the tree */ | ||
1690 | err = fold_pred_tree(filter, root); | ||
1691 | if (err) | ||
1692 | goto fail; | ||
1693 | |||
1694 | /* We don't set root until we know it works */ | ||
1695 | barrier(); | ||
1696 | filter->root = root; | ||
1697 | } | ||
1698 | |||
1699 | err = 0; | ||
1700 | fail: | ||
1701 | __free_pred_stack(&stack); | ||
1702 | return err; | ||
1247 | } | 1703 | } |
1248 | 1704 | ||
1705 | struct filter_list { | ||
1706 | struct list_head list; | ||
1707 | struct event_filter *filter; | ||
1708 | }; | ||
1709 | |||
1249 | static int replace_system_preds(struct event_subsystem *system, | 1710 | static int replace_system_preds(struct event_subsystem *system, |
1250 | struct filter_parse_state *ps, | 1711 | struct filter_parse_state *ps, |
1251 | char *filter_string) | 1712 | char *filter_string) |
1252 | { | 1713 | { |
1253 | struct ftrace_event_call *call; | 1714 | struct ftrace_event_call *call; |
1715 | struct filter_list *filter_item; | ||
1716 | struct filter_list *tmp; | ||
1717 | LIST_HEAD(filter_list); | ||
1254 | bool fail = true; | 1718 | bool fail = true; |
1255 | int err; | 1719 | int err; |
1256 | 1720 | ||
1257 | list_for_each_entry(call, &ftrace_events, list) { | 1721 | list_for_each_entry(call, &ftrace_events, list) { |
1258 | struct event_filter *filter = call->filter; | ||
1259 | 1722 | ||
1260 | if (strcmp(call->class->system, system->name) != 0) | 1723 | if (strcmp(call->class->system, system->name) != 0) |
1261 | continue; | 1724 | continue; |
1262 | 1725 | ||
1263 | /* try to see if the filter can be applied */ | 1726 | /* |
1264 | err = replace_preds(call, filter, ps, filter_string, true); | 1727 | * Try to see if the filter can be applied |
1728 | * (filter arg is ignored on dry_run) | ||
1729 | */ | ||
1730 | err = replace_preds(call, NULL, ps, filter_string, true); | ||
1265 | if (err) | 1731 | if (err) |
1732 | goto fail; | ||
1733 | } | ||
1734 | |||
1735 | list_for_each_entry(call, &ftrace_events, list) { | ||
1736 | struct event_filter *filter; | ||
1737 | |||
1738 | if (strcmp(call->class->system, system->name) != 0) | ||
1266 | continue; | 1739 | continue; |
1267 | 1740 | ||
1268 | /* really apply the filter */ | 1741 | filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); |
1269 | filter_disable_preds(call); | 1742 | if (!filter_item) |
1270 | err = replace_preds(call, filter, ps, filter_string, false); | 1743 | goto fail_mem; |
1744 | |||
1745 | list_add_tail(&filter_item->list, &filter_list); | ||
1746 | |||
1747 | filter_item->filter = __alloc_filter(); | ||
1748 | if (!filter_item->filter) | ||
1749 | goto fail_mem; | ||
1750 | filter = filter_item->filter; | ||
1751 | |||
1752 | /* Can only fail on no memory */ | ||
1753 | err = replace_filter_string(filter, filter_string); | ||
1271 | if (err) | 1754 | if (err) |
1272 | filter_disable_preds(call); | 1755 | goto fail_mem; |
1273 | else { | 1756 | |
1757 | err = replace_preds(call, filter, ps, filter_string, false); | ||
1758 | if (err) { | ||
1759 | filter_disable(call); | ||
1760 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); | ||
1761 | append_filter_err(ps, filter); | ||
1762 | } else | ||
1274 | call->flags |= TRACE_EVENT_FL_FILTERED; | 1763 | call->flags |= TRACE_EVENT_FL_FILTERED; |
1275 | replace_filter_string(filter, filter_string); | 1764 | /* |
1276 | } | 1765 | * Regardless of if this returned an error, we still |
1766 | * replace the filter for the call. | ||
1767 | */ | ||
1768 | filter = call->filter; | ||
1769 | call->filter = filter_item->filter; | ||
1770 | filter_item->filter = filter; | ||
1771 | |||
1277 | fail = false; | 1772 | fail = false; |
1278 | } | 1773 | } |
1279 | 1774 | ||
1280 | if (fail) { | 1775 | if (fail) |
1281 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); | 1776 | goto fail; |
1282 | return -EINVAL; | 1777 | |
1778 | /* | ||
1779 | * The calls can still be using the old filters. | ||
1780 | * Do a synchronize_sched() to ensure all calls are | ||
1781 | * done with them before we free them. | ||
1782 | */ | ||
1783 | synchronize_sched(); | ||
1784 | list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { | ||
1785 | __free_filter(filter_item->filter); | ||
1786 | list_del(&filter_item->list); | ||
1787 | kfree(filter_item); | ||
1283 | } | 1788 | } |
1284 | return 0; | 1789 | return 0; |
1790 | fail: | ||
1791 | /* No call succeeded */ | ||
1792 | list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { | ||
1793 | list_del(&filter_item->list); | ||
1794 | kfree(filter_item); | ||
1795 | } | ||
1796 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); | ||
1797 | return -EINVAL; | ||
1798 | fail_mem: | ||
1799 | /* If any call succeeded, we still need to sync */ | ||
1800 | if (!fail) | ||
1801 | synchronize_sched(); | ||
1802 | list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { | ||
1803 | __free_filter(filter_item->filter); | ||
1804 | list_del(&filter_item->list); | ||
1805 | kfree(filter_item); | ||
1806 | } | ||
1807 | return -ENOMEM; | ||
1285 | } | 1808 | } |
1286 | 1809 | ||
1287 | int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | 1810 | int apply_event_filter(struct ftrace_event_call *call, char *filter_string) |
1288 | { | 1811 | { |
1289 | int err; | ||
1290 | struct filter_parse_state *ps; | 1812 | struct filter_parse_state *ps; |
1813 | struct event_filter *filter; | ||
1814 | struct event_filter *tmp; | ||
1815 | int err = 0; | ||
1291 | 1816 | ||
1292 | mutex_lock(&event_mutex); | 1817 | mutex_lock(&event_mutex); |
1293 | 1818 | ||
1294 | err = init_preds(call); | ||
1295 | if (err) | ||
1296 | goto out_unlock; | ||
1297 | |||
1298 | if (!strcmp(strstrip(filter_string), "0")) { | 1819 | if (!strcmp(strstrip(filter_string), "0")) { |
1299 | filter_disable_preds(call); | 1820 | filter_disable(call); |
1300 | remove_filter_string(call->filter); | 1821 | filter = call->filter; |
1822 | if (!filter) | ||
1823 | goto out_unlock; | ||
1824 | call->filter = NULL; | ||
1825 | /* Make sure the filter is not being used */ | ||
1826 | synchronize_sched(); | ||
1827 | __free_filter(filter); | ||
1301 | goto out_unlock; | 1828 | goto out_unlock; |
1302 | } | 1829 | } |
1303 | 1830 | ||
@@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | |||
1306 | if (!ps) | 1833 | if (!ps) |
1307 | goto out_unlock; | 1834 | goto out_unlock; |
1308 | 1835 | ||
1309 | filter_disable_preds(call); | 1836 | filter = __alloc_filter(); |
1310 | replace_filter_string(call->filter, filter_string); | 1837 | if (!filter) { |
1838 | kfree(ps); | ||
1839 | goto out_unlock; | ||
1840 | } | ||
1841 | |||
1842 | replace_filter_string(filter, filter_string); | ||
1311 | 1843 | ||
1312 | parse_init(ps, filter_ops, filter_string); | 1844 | parse_init(ps, filter_ops, filter_string); |
1313 | err = filter_parse(ps); | 1845 | err = filter_parse(ps); |
1314 | if (err) { | 1846 | if (err) { |
1315 | append_filter_err(ps, call->filter); | 1847 | append_filter_err(ps, filter); |
1316 | goto out; | 1848 | goto out; |
1317 | } | 1849 | } |
1318 | 1850 | ||
1319 | err = replace_preds(call, call->filter, ps, filter_string, false); | 1851 | err = replace_preds(call, filter, ps, filter_string, false); |
1320 | if (err) | 1852 | if (err) { |
1321 | append_filter_err(ps, call->filter); | 1853 | filter_disable(call); |
1322 | else | 1854 | append_filter_err(ps, filter); |
1855 | } else | ||
1323 | call->flags |= TRACE_EVENT_FL_FILTERED; | 1856 | call->flags |= TRACE_EVENT_FL_FILTERED; |
1324 | out: | 1857 | out: |
1858 | /* | ||
1859 | * Always swap the call filter with the new filter | ||
1860 | * even if there was an error. If there was an error | ||
1861 | * in the filter, we disable the filter and show the error | ||
1862 | * string | ||
1863 | */ | ||
1864 | tmp = call->filter; | ||
1865 | call->filter = filter; | ||
1866 | if (tmp) { | ||
1867 | /* Make sure the call is done with the filter */ | ||
1868 | synchronize_sched(); | ||
1869 | __free_filter(tmp); | ||
1870 | } | ||
1325 | filter_opstack_clear(ps); | 1871 | filter_opstack_clear(ps); |
1326 | postfix_clear(ps); | 1872 | postfix_clear(ps); |
1327 | kfree(ps); | 1873 | kfree(ps); |
@@ -1334,18 +1880,21 @@ out_unlock: | |||
1334 | int apply_subsystem_event_filter(struct event_subsystem *system, | 1880 | int apply_subsystem_event_filter(struct event_subsystem *system, |
1335 | char *filter_string) | 1881 | char *filter_string) |
1336 | { | 1882 | { |
1337 | int err; | ||
1338 | struct filter_parse_state *ps; | 1883 | struct filter_parse_state *ps; |
1884 | struct event_filter *filter; | ||
1885 | int err = 0; | ||
1339 | 1886 | ||
1340 | mutex_lock(&event_mutex); | 1887 | mutex_lock(&event_mutex); |
1341 | 1888 | ||
1342 | err = init_subsystem_preds(system); | ||
1343 | if (err) | ||
1344 | goto out_unlock; | ||
1345 | |||
1346 | if (!strcmp(strstrip(filter_string), "0")) { | 1889 | if (!strcmp(strstrip(filter_string), "0")) { |
1347 | filter_free_subsystem_preds(system); | 1890 | filter_free_subsystem_preds(system); |
1348 | remove_filter_string(system->filter); | 1891 | remove_filter_string(system->filter); |
1892 | filter = system->filter; | ||
1893 | system->filter = NULL; | ||
1894 | /* Ensure all filters are no longer used */ | ||
1895 | synchronize_sched(); | ||
1896 | filter_free_subsystem_filters(system); | ||
1897 | __free_filter(filter); | ||
1349 | goto out_unlock; | 1898 | goto out_unlock; |
1350 | } | 1899 | } |
1351 | 1900 | ||
@@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system, | |||
1354 | if (!ps) | 1903 | if (!ps) |
1355 | goto out_unlock; | 1904 | goto out_unlock; |
1356 | 1905 | ||
1357 | replace_filter_string(system->filter, filter_string); | 1906 | filter = __alloc_filter(); |
1907 | if (!filter) | ||
1908 | goto out; | ||
1909 | |||
1910 | replace_filter_string(filter, filter_string); | ||
1911 | /* | ||
1912 | * No event actually uses the system filter | ||
1913 | * we can free it without synchronize_sched(). | ||
1914 | */ | ||
1915 | __free_filter(system->filter); | ||
1916 | system->filter = filter; | ||
1358 | 1917 | ||
1359 | parse_init(ps, filter_ops, filter_string); | 1918 | parse_init(ps, filter_ops, filter_string); |
1360 | err = filter_parse(ps); | 1919 | err = filter_parse(ps); |
@@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event) | |||
1384 | struct event_filter *filter = event->filter; | 1943 | struct event_filter *filter = event->filter; |
1385 | 1944 | ||
1386 | event->filter = NULL; | 1945 | event->filter = NULL; |
1387 | __free_preds(filter); | 1946 | __free_filter(filter); |
1388 | } | 1947 | } |
1389 | 1948 | ||
1390 | int ftrace_profile_set_filter(struct perf_event *event, int event_id, | 1949 | int ftrace_profile_set_filter(struct perf_event *event, int event_id, |
@@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, | |||
1410 | if (event->filter) | 1969 | if (event->filter) |
1411 | goto out_unlock; | 1970 | goto out_unlock; |
1412 | 1971 | ||
1413 | filter = __alloc_preds(); | 1972 | filter = __alloc_filter(); |
1414 | if (IS_ERR(filter)) { | 1973 | if (!filter) { |
1415 | err = PTR_ERR(filter); | 1974 | err = PTR_ERR(filter); |
1416 | goto out_unlock; | 1975 | goto out_unlock; |
1417 | } | 1976 | } |
@@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, | |||
1419 | err = -ENOMEM; | 1978 | err = -ENOMEM; |
1420 | ps = kzalloc(sizeof(*ps), GFP_KERNEL); | 1979 | ps = kzalloc(sizeof(*ps), GFP_KERNEL); |
1421 | if (!ps) | 1980 | if (!ps) |
1422 | goto free_preds; | 1981 | goto free_filter; |
1423 | 1982 | ||
1424 | parse_init(ps, filter_ops, filter_str); | 1983 | parse_init(ps, filter_ops, filter_str); |
1425 | err = filter_parse(ps); | 1984 | err = filter_parse(ps); |
@@ -1435,9 +1994,9 @@ free_ps: | |||
1435 | postfix_clear(ps); | 1994 | postfix_clear(ps); |
1436 | kfree(ps); | 1995 | kfree(ps); |
1437 | 1996 | ||
1438 | free_preds: | 1997 | free_filter: |
1439 | if (err) | 1998 | if (err) |
1440 | __free_preds(filter); | 1999 | __free_filter(filter); |
1441 | 2000 | ||
1442 | out_unlock: | 2001 | out_unlock: |
1443 | mutex_unlock(&event_mutex); | 2002 | mutex_unlock(&event_mutex); |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2dec9bcde8b4..8435b43b1782 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | |||
353 | kfree(data); | 353 | kfree(data); |
354 | } | 354 | } |
355 | 355 | ||
356 | /* Bitfield fetch function */ | ||
357 | struct bitfield_fetch_param { | ||
358 | struct fetch_param orig; | ||
359 | unsigned char hi_shift; | ||
360 | unsigned char low_shift; | ||
361 | }; | ||
362 | |||
363 | #define DEFINE_FETCH_bitfield(type) \ | ||
364 | static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ | ||
365 | void *data, void *dest) \ | ||
366 | { \ | ||
367 | struct bitfield_fetch_param *bprm = data; \ | ||
368 | type buf = 0; \ | ||
369 | call_fetch(&bprm->orig, regs, &buf); \ | ||
370 | if (buf) { \ | ||
371 | buf <<= bprm->hi_shift; \ | ||
372 | buf >>= bprm->low_shift; \ | ||
373 | } \ | ||
374 | *(type *)dest = buf; \ | ||
375 | } | ||
376 | DEFINE_BASIC_FETCH_FUNCS(bitfield) | ||
377 | #define fetch_bitfield_string NULL | ||
378 | #define fetch_bitfield_string_size NULL | ||
379 | |||
380 | static __kprobes void | ||
381 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
382 | { | ||
383 | /* | ||
384 | * Don't check the bitfield itself, because this must be the | ||
385 | * last fetch function. | ||
386 | */ | ||
387 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
388 | free_deref_fetch_param(data->orig.data); | ||
389 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
390 | free_symbol_cache(data->orig.data); | ||
391 | kfree(data); | ||
392 | } | ||
356 | /* Default (unsigned long) fetch type */ | 393 | /* Default (unsigned long) fetch type */ |
357 | #define __DEFAULT_FETCH_TYPE(t) u##t | 394 | #define __DEFAULT_FETCH_TYPE(t) u##t |
358 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | 395 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) |
@@ -367,6 +404,7 @@ enum { | |||
367 | FETCH_MTD_memory, | 404 | FETCH_MTD_memory, |
368 | FETCH_MTD_symbol, | 405 | FETCH_MTD_symbol, |
369 | FETCH_MTD_deref, | 406 | FETCH_MTD_deref, |
407 | FETCH_MTD_bitfield, | ||
370 | FETCH_MTD_END, | 408 | FETCH_MTD_END, |
371 | }; | 409 | }; |
372 | 410 | ||
@@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \ | |||
387 | ASSIGN_FETCH_FUNC(memory, ftype), \ | 425 | ASSIGN_FETCH_FUNC(memory, ftype), \ |
388 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | 426 | ASSIGN_FETCH_FUNC(symbol, ftype), \ |
389 | ASSIGN_FETCH_FUNC(deref, ftype), \ | 427 | ASSIGN_FETCH_FUNC(deref, ftype), \ |
428 | ASSIGN_FETCH_FUNC(bitfield, ftype), \ | ||
390 | } \ | 429 | } \ |
391 | } | 430 | } |
392 | 431 | ||
@@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type) | |||
430 | if (!type) | 469 | if (!type) |
431 | type = DEFAULT_FETCH_TYPE_STR; | 470 | type = DEFAULT_FETCH_TYPE_STR; |
432 | 471 | ||
472 | /* Special case: bitfield */ | ||
473 | if (*type == 'b') { | ||
474 | unsigned long bs; | ||
475 | type = strchr(type, '/'); | ||
476 | if (!type) | ||
477 | goto fail; | ||
478 | type++; | ||
479 | if (strict_strtoul(type, 0, &bs)) | ||
480 | goto fail; | ||
481 | switch (bs) { | ||
482 | case 8: | ||
483 | return find_fetch_type("u8"); | ||
484 | case 16: | ||
485 | return find_fetch_type("u16"); | ||
486 | case 32: | ||
487 | return find_fetch_type("u32"); | ||
488 | case 64: | ||
489 | return find_fetch_type("u64"); | ||
490 | default: | ||
491 | goto fail; | ||
492 | } | ||
493 | } | ||
494 | |||
433 | for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) | 495 | for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) |
434 | if (strcmp(type, fetch_type_table[i].name) == 0) | 496 | if (strcmp(type, fetch_type_table[i].name) == 0) |
435 | return &fetch_type_table[i]; | 497 | return &fetch_type_table[i]; |
498 | fail: | ||
436 | return NULL; | 499 | return NULL; |
437 | } | 500 | } |
438 | 501 | ||
@@ -586,7 +649,9 @@ error: | |||
586 | 649 | ||
587 | static void free_probe_arg(struct probe_arg *arg) | 650 | static void free_probe_arg(struct probe_arg *arg) |
588 | { | 651 | { |
589 | if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | 652 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) |
653 | free_bitfield_fetch_param(arg->fetch.data); | ||
654 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
590 | free_deref_fetch_param(arg->fetch.data); | 655 | free_deref_fetch_param(arg->fetch.data); |
591 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | 656 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) |
592 | free_symbol_cache(arg->fetch.data); | 657 | free_symbol_cache(arg->fetch.data); |
@@ -767,16 +832,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
767 | } | 832 | } |
768 | break; | 833 | break; |
769 | case '+': /* deref memory */ | 834 | case '+': /* deref memory */ |
835 | arg++; /* Skip '+', because strict_strtol() rejects it. */ | ||
770 | case '-': | 836 | case '-': |
771 | tmp = strchr(arg, '('); | 837 | tmp = strchr(arg, '('); |
772 | if (!tmp) | 838 | if (!tmp) |
773 | break; | 839 | break; |
774 | *tmp = '\0'; | 840 | *tmp = '\0'; |
775 | ret = strict_strtol(arg + 1, 0, &offset); | 841 | ret = strict_strtol(arg, 0, &offset); |
776 | if (ret) | 842 | if (ret) |
777 | break; | 843 | break; |
778 | if (arg[0] == '-') | ||
779 | offset = -offset; | ||
780 | arg = tmp + 1; | 844 | arg = tmp + 1; |
781 | tmp = strrchr(arg, ')'); | 845 | tmp = strrchr(arg, ')'); |
782 | if (tmp) { | 846 | if (tmp) { |
@@ -807,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
807 | return ret; | 871 | return ret; |
808 | } | 872 | } |
809 | 873 | ||
874 | #define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) | ||
875 | |||
876 | /* Bitfield type needs to be parsed into a fetch function */ | ||
877 | static int __parse_bitfield_probe_arg(const char *bf, | ||
878 | const struct fetch_type *t, | ||
879 | struct fetch_param *f) | ||
880 | { | ||
881 | struct bitfield_fetch_param *bprm; | ||
882 | unsigned long bw, bo; | ||
883 | char *tail; | ||
884 | |||
885 | if (*bf != 'b') | ||
886 | return 0; | ||
887 | |||
888 | bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); | ||
889 | if (!bprm) | ||
890 | return -ENOMEM; | ||
891 | bprm->orig = *f; | ||
892 | f->fn = t->fetch[FETCH_MTD_bitfield]; | ||
893 | f->data = (void *)bprm; | ||
894 | |||
895 | bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ | ||
896 | if (bw == 0 || *tail != '@') | ||
897 | return -EINVAL; | ||
898 | |||
899 | bf = tail + 1; | ||
900 | bo = simple_strtoul(bf, &tail, 0); | ||
901 | if (tail == bf || *tail != '/') | ||
902 | return -EINVAL; | ||
903 | |||
904 | bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); | ||
905 | bprm->low_shift = bprm->hi_shift + bo; | ||
906 | return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; | ||
907 | } | ||
908 | |||
810 | /* String length checking wrapper */ | 909 | /* String length checking wrapper */ |
811 | static int parse_probe_arg(char *arg, struct trace_probe *tp, | 910 | static int parse_probe_arg(char *arg, struct trace_probe *tp, |
812 | struct probe_arg *parg, int is_return) | 911 | struct probe_arg *parg, int is_return) |
@@ -836,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, | |||
836 | parg->offset = tp->size; | 935 | parg->offset = tp->size; |
837 | tp->size += parg->type->size; | 936 | tp->size += parg->type->size; |
838 | ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); | 937 | ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); |
938 | if (ret >= 0 && t != NULL) | ||
939 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | ||
839 | if (ret >= 0) { | 940 | if (ret >= 0) { |
840 | parg->fetch_size.fn = get_fetch_size_function(parg->type, | 941 | parg->fetch_size.fn = get_fetch_size_function(parg->type, |
841 | parg->fetch.fn); | 942 | parg->fetch.fn); |
@@ -1130,7 +1231,7 @@ static int command_trace_probe(const char *buf) | |||
1130 | return ret; | 1231 | return ret; |
1131 | } | 1232 | } |
1132 | 1233 | ||
1133 | #define WRITE_BUFSIZE 128 | 1234 | #define WRITE_BUFSIZE 4096 |
1134 | 1235 | ||
1135 | static ssize_t probes_write(struct file *file, const char __user *buffer, | 1236 | static ssize_t probes_write(struct file *file, const char __user *buffer, |
1136 | size_t count, loff_t *ppos) | 1237 | size_t count, loff_t *ppos) |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 02272baa2206..456be9063c2d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -529,24 +529,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) | |||
529 | * @entry: The trace entry field from the ring buffer | 529 | * @entry: The trace entry field from the ring buffer |
530 | * | 530 | * |
531 | * Prints the generic fields of irqs off, in hard or softirq, preempt | 531 | * Prints the generic fields of irqs off, in hard or softirq, preempt |
532 | * count and lock depth. | 532 | * count. |
533 | */ | 533 | */ |
534 | int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | 534 | int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) |
535 | { | 535 | { |
536 | int hardirq, softirq; | 536 | char hardsoft_irq; |
537 | char need_resched; | ||
538 | char irqs_off; | ||
539 | int hardirq; | ||
540 | int softirq; | ||
537 | int ret; | 541 | int ret; |
538 | 542 | ||
539 | hardirq = entry->flags & TRACE_FLAG_HARDIRQ; | 543 | hardirq = entry->flags & TRACE_FLAG_HARDIRQ; |
540 | softirq = entry->flags & TRACE_FLAG_SOFTIRQ; | 544 | softirq = entry->flags & TRACE_FLAG_SOFTIRQ; |
541 | 545 | ||
546 | irqs_off = | ||
547 | (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : | ||
548 | (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : | ||
549 | '.'; | ||
550 | need_resched = | ||
551 | (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; | ||
552 | hardsoft_irq = | ||
553 | (hardirq && softirq) ? 'H' : | ||
554 | hardirq ? 'h' : | ||
555 | softirq ? 's' : | ||
556 | '.'; | ||
557 | |||
542 | if (!trace_seq_printf(s, "%c%c%c", | 558 | if (!trace_seq_printf(s, "%c%c%c", |
543 | (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : | 559 | irqs_off, need_resched, hardsoft_irq)) |
544 | (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? | ||
545 | 'X' : '.', | ||
546 | (entry->flags & TRACE_FLAG_NEED_RESCHED) ? | ||
547 | 'N' : '.', | ||
548 | (hardirq && softirq) ? 'H' : | ||
549 | hardirq ? 'h' : softirq ? 's' : '.')) | ||
550 | return 0; | 560 | return 0; |
551 | 561 | ||
552 | if (entry->preempt_count) | 562 | if (entry->preempt_count) |
@@ -554,13 +564,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | |||
554 | else | 564 | else |
555 | ret = trace_seq_putc(s, '.'); | 565 | ret = trace_seq_putc(s, '.'); |
556 | 566 | ||
557 | if (!ret) | 567 | return ret; |
558 | return 0; | ||
559 | |||
560 | if (entry->lock_depth < 0) | ||
561 | return trace_seq_putc(s, '.'); | ||
562 | |||
563 | return trace_seq_printf(s, "%d", entry->lock_depth); | ||
564 | } | 568 | } |
565 | 569 | ||
566 | static int | 570 | static int |
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8f758d070c43..7e62c0a18456 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr) | |||
247 | ctx_trace = tr; | 247 | ctx_trace = tr; |
248 | } | 248 | } |
249 | 249 | ||
250 | static void stop_sched_trace(struct trace_array *tr) | ||
251 | { | ||
252 | tracing_stop_sched_switch_record(); | ||
253 | } | ||
254 | |||
255 | static int sched_switch_trace_init(struct trace_array *tr) | ||
256 | { | ||
257 | ctx_trace = tr; | ||
258 | tracing_reset_online_cpus(tr); | ||
259 | tracing_start_sched_switch_record(); | ||
260 | return 0; | ||
261 | } | ||
262 | |||
263 | static void sched_switch_trace_reset(struct trace_array *tr) | ||
264 | { | ||
265 | if (sched_ref) | ||
266 | stop_sched_trace(tr); | ||
267 | } | ||
268 | |||
269 | static void sched_switch_trace_start(struct trace_array *tr) | ||
270 | { | ||
271 | sched_stopped = 0; | ||
272 | } | ||
273 | |||
274 | static void sched_switch_trace_stop(struct trace_array *tr) | ||
275 | { | ||
276 | sched_stopped = 1; | ||
277 | } | ||
278 | |||
279 | static struct tracer sched_switch_trace __read_mostly = | ||
280 | { | ||
281 | .name = "sched_switch", | ||
282 | .init = sched_switch_trace_init, | ||
283 | .reset = sched_switch_trace_reset, | ||
284 | .start = sched_switch_trace_start, | ||
285 | .stop = sched_switch_trace_stop, | ||
286 | .wait_pipe = poll_wait_pipe, | ||
287 | #ifdef CONFIG_FTRACE_SELFTEST | ||
288 | .selftest = trace_selftest_startup_sched_switch, | ||
289 | #endif | ||
290 | }; | ||
291 | |||
292 | __init static int init_sched_switch_trace(void) | ||
293 | { | ||
294 | return register_tracer(&sched_switch_trace); | ||
295 | } | ||
296 | device_initcall(init_sched_switch_trace); | ||
297 | |||
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5c9fe08d2093..ee7b5a0bb9f8 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[]; | |||
60 | 60 | ||
61 | static struct syscall_metadata **syscalls_metadata; | 61 | static struct syscall_metadata **syscalls_metadata; |
62 | 62 | ||
63 | #ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME | ||
64 | static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) | ||
65 | { | ||
66 | /* | ||
67 | * Only compare after the "sys" prefix. Archs that use | ||
68 | * syscall wrappers may have syscall symbol aliases prefixed | ||
69 | * with "SyS" instead of "sys", leading to an unwanted | ||
70 | * mismatch. | ||
71 | */ | ||
72 | return !strcmp(sym + 3, name + 3); | ||
73 | } | ||
74 | #endif | ||
75 | |||
63 | static __init struct syscall_metadata * | 76 | static __init struct syscall_metadata * |
64 | find_syscall_meta(unsigned long syscall) | 77 | find_syscall_meta(unsigned long syscall) |
65 | { | 78 | { |
@@ -72,14 +85,11 @@ find_syscall_meta(unsigned long syscall) | |||
72 | stop = __stop_syscalls_metadata; | 85 | stop = __stop_syscalls_metadata; |
73 | kallsyms_lookup(syscall, NULL, NULL, NULL, str); | 86 | kallsyms_lookup(syscall, NULL, NULL, NULL, str); |
74 | 87 | ||
88 | if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) | ||
89 | return NULL; | ||
90 | |||
75 | for ( ; start < stop; start++) { | 91 | for ( ; start < stop; start++) { |
76 | /* | 92 | if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) |
77 | * Only compare after the "sys" prefix. Archs that use | ||
78 | * syscall wrappers may have syscall symbol aliases prefixed | ||
79 | * with "SyS" instead of "sys", leading to an unwanted | ||
80 | * mismatch. | ||
81 | */ | ||
82 | if ((*start)->name && !strcmp((*start)->name + 3, str + 3)) | ||
83 | return *start; | 93 | return *start; |
84 | } | 94 | } |
85 | return NULL; | 95 | return NULL; |
@@ -359,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) | |||
359 | int num; | 369 | int num; |
360 | 370 | ||
361 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 371 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
362 | if (num < 0 || num >= NR_syscalls) | 372 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
363 | return -ENOSYS; | 373 | return -ENOSYS; |
364 | mutex_lock(&syscall_trace_lock); | 374 | mutex_lock(&syscall_trace_lock); |
365 | if (!sys_refcount_enter) | 375 | if (!sys_refcount_enter) |
@@ -377,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) | |||
377 | int num; | 387 | int num; |
378 | 388 | ||
379 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 389 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
380 | if (num < 0 || num >= NR_syscalls) | 390 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
381 | return; | 391 | return; |
382 | mutex_lock(&syscall_trace_lock); | 392 | mutex_lock(&syscall_trace_lock); |
383 | sys_refcount_enter--; | 393 | sys_refcount_enter--; |
@@ -393,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) | |||
393 | int num; | 403 | int num; |
394 | 404 | ||
395 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 405 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
396 | if (num < 0 || num >= NR_syscalls) | 406 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
397 | return -ENOSYS; | 407 | return -ENOSYS; |
398 | mutex_lock(&syscall_trace_lock); | 408 | mutex_lock(&syscall_trace_lock); |
399 | if (!sys_refcount_exit) | 409 | if (!sys_refcount_exit) |
@@ -411,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) | |||
411 | int num; | 421 | int num; |
412 | 422 | ||
413 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 423 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
414 | if (num < 0 || num >= NR_syscalls) | 424 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
415 | return; | 425 | return; |
416 | mutex_lock(&syscall_trace_lock); | 426 | mutex_lock(&syscall_trace_lock); |
417 | sys_refcount_exit--; | 427 | sys_refcount_exit--; |
@@ -424,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) | |||
424 | int init_syscall_trace(struct ftrace_event_call *call) | 434 | int init_syscall_trace(struct ftrace_event_call *call) |
425 | { | 435 | { |
426 | int id; | 436 | int id; |
437 | int num; | ||
438 | |||
439 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | ||
440 | if (num < 0 || num >= NR_syscalls) { | ||
441 | pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", | ||
442 | ((struct syscall_metadata *)call->data)->name); | ||
443 | return -ENOSYS; | ||
444 | } | ||
427 | 445 | ||
428 | if (set_syscall_print_fmt(call) < 0) | 446 | if (set_syscall_print_fmt(call) < 0) |
429 | return -ENOMEM; | 447 | return -ENOMEM; |
@@ -438,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call) | |||
438 | return id; | 456 | return id; |
439 | } | 457 | } |
440 | 458 | ||
441 | unsigned long __init arch_syscall_addr(int nr) | 459 | unsigned long __init __weak arch_syscall_addr(int nr) |
442 | { | 460 | { |
443 | return (unsigned long)sys_call_table[nr]; | 461 | return (unsigned long)sys_call_table[nr]; |
444 | } | 462 | } |