about summary refs log tree commit diff stats
path: root/kernel
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2011-03-15 21:31:30 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2011-03-15 21:31:30 -0400
commit: a926021cb1f8a99a275eaf6eb546102e9469dc59 (patch)
tree: c6d0300cd4b1a1fd658708476db4577b68b4de31 /kernel
parent: 0586bed3e8563c2eb89bc7256e30ce633ae06cfb (diff)
parent: 5e814dd597c42daeb8d2a276e64a6ec986ad0e2a (diff)
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (184 commits)
  perf probe: Clean up probe_point_lazy_walker() return value
  tracing: Fix irqoff selftest expanding max buffer
  tracing: Align 4 byte ints together in struct tracer
  tracing: Export trace_set_clr_event()
  tracing: Explain about unstable clock on resume with ring buffer warning
  ftrace/graph: Trace function entry before updating index
  ftrace: Add .ref.text as one of the safe areas to trace
  tracing: Adjust conditional expression latency formatting.
  tracing: Fix event alignment: skb:kfree_skb
  tracing: Fix event alignment: mce:mce_record
  tracing: Fix event alignment: kvm:kvm_hv_hypercall
  tracing: Fix event alignment: module:module_request
  tracing: Fix event alignment: ftrace:context_switch and ftrace:wakeup
  tracing: Remove lock_depth from event entry
  perf header: Stop using 'self'
  perf session: Use evlist/evsel for managing perf.data attributes
  perf top: Don't let events to eat up whole header line
  perf top: Fix events overflow in top command
  ring-buffer: Remove unused #include <linux/trace_irq.h>
  tracing: Add an 'overwrite' trace_option.
  ...
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/cgroup.c 54
-rw-r--r-- kernel/perf_event.c 1004
-rw-r--r-- kernel/sched.c 37
-rw-r--r-- kernel/sysctl.c 2
-rw-r--r-- kernel/trace/ftrace.c 52
-rw-r--r-- kernel/trace/ring_buffer.c 24
-rw-r--r-- kernel/trace/trace.c 38
-rw-r--r-- kernel/trace/trace.h 41
-rw-r--r-- kernel/trace/trace_entries.h 6
-rw-r--r-- kernel/trace/trace_events.c 2
-rw-r--r-- kernel/trace/trace_events_filter.c 885
-rw-r--r-- kernel/trace/trace_kprobe.c 111
-rw-r--r-- kernel/trace/trace_output.c 36
-rw-r--r-- kernel/trace/trace_sched_switch.c 48
-rw-r--r-- kernel/trace/trace_syscalls.c 42
15 files changed, 1904 insertions, 478 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b24d7027b83c..95362d15128c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child)
4230 */ 4230 */
4231void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4231void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4232{ 4232{
4233 int i;
4234 struct css_set *cg; 4233 struct css_set *cg;
4235 4234 int i;
4236 if (run_callbacks && need_forkexit_callback) {
4237 /*
4238 * modular subsystems can't use callbacks, so no need to lock
4239 * the subsys array
4240 */
4241 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4242 struct cgroup_subsys *ss = subsys[i];
4243 if (ss->exit)
4244 ss->exit(ss, tsk);
4245 }
4246 }
4247 4235
4248 /* 4236 /*
4249 * Unlink from the css_set task list if necessary. 4237 * Unlink from the css_set task list if necessary.
@@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4261 task_lock(tsk); 4249 task_lock(tsk);
4262 cg = tsk->cgroups; 4250 cg = tsk->cgroups;
4263 tsk->cgroups = &init_css_set; 4251 tsk->cgroups = &init_css_set;
4252
4253 if (run_callbacks && need_forkexit_callback) {
4254 /*
4255 * modular subsystems can't use callbacks, so no need to lock
4256 * the subsys array
4257 */
4258 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4259 struct cgroup_subsys *ss = subsys[i];
4260 if (ss->exit) {
4261 struct cgroup *old_cgrp =
4262 rcu_dereference_raw(cg->subsys[i])->cgroup;
4263 struct cgroup *cgrp = task_cgroup(tsk, i);
4264 ss->exit(ss, cgrp, old_cgrp, tsk);
4265 }
4266 }
4267 }
4264 task_unlock(tsk); 4268 task_unlock(tsk);
4269
4265 if (cg) 4270 if (cg)
4266 put_css_set_taskexit(cg); 4271 put_css_set_taskexit(cg);
4267} 4272}
@@ -4813,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
4813 return ret; 4818 return ret;
4814} 4819}
4815 4820
4821/*
4822 * get corresponding css from file open on cgroupfs directory
4823 */
4824struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
4825{
4826 struct cgroup *cgrp;
4827 struct inode *inode;
4828 struct cgroup_subsys_state *css;
4829
4830 inode = f->f_dentry->d_inode;
4831 /* check in cgroup filesystem dir */
4832 if (inode->i_op != &cgroup_dir_inode_operations)
4833 return ERR_PTR(-EBADF);
4834
4835 if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
4836 return ERR_PTR(-EINVAL);
4837
4838 /* get cgroup */
4839 cgrp = __d_cgrp(f->f_dentry);
4840 css = cgrp->subsys[id];
4841 return css ? css : ERR_PTR(-ENOENT);
4842}
4843
4816#ifdef CONFIG_CGROUP_DEBUG 4844#ifdef CONFIG_CGROUP_DEBUG
4817static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 4845static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4818 struct cgroup *cont) 4846 struct cgroup *cont)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 656222fcf767..ed253aa24ba4 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,13 +38,96 @@
38 38
39#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
40 40
41struct remote_function_call {
42 struct task_struct *p;
43 int (*func)(void *info);
44 void *info;
45 int ret;
46};
47
48static void remote_function(void *data)
49{
50 struct remote_function_call *tfc = data;
51 struct task_struct *p = tfc->p;
52
53 if (p) {
54 tfc->ret = -EAGAIN;
55 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
56 return;
57 }
58
59 tfc->ret = tfc->func(tfc->info);
60}
61
62/**
63 * task_function_call - call a function on the cpu on which a task runs
64 * @p: the task to evaluate
65 * @func: the function to be called
66 * @info: the function call argument
67 *
68 * Calls the function @func when the task is currently running. This might
69 * be on the current CPU, which just calls the function directly
70 *
71 * returns: @func return value, or
72 * -ESRCH - when the process isn't running
73 * -EAGAIN - when the process moved away
74 */
75static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{
78 struct remote_function_call data = {
79 .p = p,
80 .func = func,
81 .info = info,
82 .ret = -ESRCH, /* No such (running) process */
83 };
84
85 if (task_curr(p))
86 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
87
88 return data.ret;
89}
90
91/**
92 * cpu_function_call - call a function on the cpu
93 * @func: the function to be called
94 * @info: the function call argument
95 *
96 * Calls the function @func on the remote cpu.
97 *
98 * returns: @func return value or -ENXIO when the cpu is offline
99 */
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{
102 struct remote_function_call data = {
103 .p = NULL,
104 .func = func,
105 .info = info,
106 .ret = -ENXIO, /* No such CPU */
107 };
108
109 smp_call_function_single(cpu, remote_function, &data, 1);
110
111 return data.ret;
112}
113
114#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
115 PERF_FLAG_FD_OUTPUT |\
116 PERF_FLAG_PID_CGROUP)
117
41enum event_type_t { 118enum event_type_t {
42 EVENT_FLEXIBLE = 0x1, 119 EVENT_FLEXIBLE = 0x1,
43 EVENT_PINNED = 0x2, 120 EVENT_PINNED = 0x2,
44 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 121 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
45}; 122};
46 123
47atomic_t perf_task_events __read_mostly; 124/*
125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */
128atomic_t perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130
48static atomic_t nr_mmap_events __read_mostly; 131static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 132static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 133static atomic_t nr_task_events __read_mostly;
@@ -67,7 +150,24 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
67/* 150/*
68 * max perf event sample rate 151 * max perf event sample rate
69 */ 152 */
70int sysctl_perf_event_sample_rate __read_mostly = 100000; 153#define DEFAULT_MAX_SAMPLE_RATE 100000
154int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
155static int max_samples_per_tick __read_mostly =
156 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
157
158int perf_proc_update_handler(struct ctl_table *table, int write,
159 void __user *buffer, size_t *lenp,
160 loff_t *ppos)
161{
162 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
163
164 if (ret || !write)
165 return ret;
166
167 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
168
169 return 0;
170}
71 171
72static atomic64_t perf_event_id; 172static atomic64_t perf_event_id;
73 173
@@ -75,7 +175,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
75 enum event_type_t event_type); 175 enum event_type_t event_type);
76 176
77static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 177static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
78 enum event_type_t event_type); 178 enum event_type_t event_type,
179 struct task_struct *task);
180
181static void update_context_time(struct perf_event_context *ctx);
182static u64 perf_event_time(struct perf_event *event);
79 183
80void __weak perf_event_print_debug(void) { } 184void __weak perf_event_print_debug(void) { }
81 185
@@ -89,6 +193,360 @@ static inline u64 perf_clock(void)
89 return local_clock(); 193 return local_clock();
90} 194}
91 195
196static inline struct perf_cpu_context *
197__get_cpu_context(struct perf_event_context *ctx)
198{
199 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
200}
201
202#ifdef CONFIG_CGROUP_PERF
203
204/*
205 * Must ensure cgroup is pinned (css_get) before calling
206 * this function. In other words, we cannot call this function
207 * if there is no cgroup event for the current CPU context.
208 */
209static inline struct perf_cgroup *
210perf_cgroup_from_task(struct task_struct *task)
211{
212 return container_of(task_subsys_state(task, perf_subsys_id),
213 struct perf_cgroup, css);
214}
215
216static inline bool
217perf_cgroup_match(struct perf_event *event)
218{
219 struct perf_event_context *ctx = event->ctx;
220 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
221
222 return !event->cgrp || event->cgrp == cpuctx->cgrp;
223}
224
225static inline void perf_get_cgroup(struct perf_event *event)
226{
227 css_get(&event->cgrp->css);
228}
229
230static inline void perf_put_cgroup(struct perf_event *event)
231{
232 css_put(&event->cgrp->css);
233}
234
235static inline void perf_detach_cgroup(struct perf_event *event)
236{
237 perf_put_cgroup(event);
238 event->cgrp = NULL;
239}
240
241static inline int is_cgroup_event(struct perf_event *event)
242{
243 return event->cgrp != NULL;
244}
245
246static inline u64 perf_cgroup_event_time(struct perf_event *event)
247{
248 struct perf_cgroup_info *t;
249
250 t = per_cpu_ptr(event->cgrp->info, event->cpu);
251 return t->time;
252}
253
254static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
255{
256 struct perf_cgroup_info *info;
257 u64 now;
258
259 now = perf_clock();
260
261 info = this_cpu_ptr(cgrp->info);
262
263 info->time += now - info->timestamp;
264 info->timestamp = now;
265}
266
267static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
268{
269 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
270 if (cgrp_out)
271 __update_cgrp_time(cgrp_out);
272}
273
274static inline void update_cgrp_time_from_event(struct perf_event *event)
275{
276 struct perf_cgroup *cgrp;
277
278 /*
279 * ensure we access cgroup data only when needed and
280 * when we know the cgroup is pinned (css_get)
281 */
282 if (!is_cgroup_event(event))
283 return;
284
285 cgrp = perf_cgroup_from_task(current);
286 /*
287 * Do not update time when cgroup is not active
288 */
289 if (cgrp == event->cgrp)
290 __update_cgrp_time(event->cgrp);
291}
292
293static inline void
294perf_cgroup_set_timestamp(struct task_struct *task,
295 struct perf_event_context *ctx)
296{
297 struct perf_cgroup *cgrp;
298 struct perf_cgroup_info *info;
299
300 /*
301 * ctx->lock held by caller
302 * ensure we do not access cgroup data
303 * unless we have the cgroup pinned (css_get)
304 */
305 if (!task || !ctx->nr_cgroups)
306 return;
307
308 cgrp = perf_cgroup_from_task(task);
309 info = this_cpu_ptr(cgrp->info);
310 info->timestamp = ctx->timestamp;
311}
312
313#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
314#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
315
316/*
317 * reschedule events based on the cgroup constraint of task.
318 *
319 * mode SWOUT : schedule out everything
320 * mode SWIN : schedule in based on cgroup for next
321 */
322void perf_cgroup_switch(struct task_struct *task, int mode)
323{
324 struct perf_cpu_context *cpuctx;
325 struct pmu *pmu;
326 unsigned long flags;
327
328 /*
329 * disable interrupts to avoid getting nr_cgroup
330 * changes via __perf_event_disable(). Also
331 * avoids preemption.
332 */
333 local_irq_save(flags);
334
335 /*
336 * we reschedule only in the presence of cgroup
337 * constrained events.
338 */
339 rcu_read_lock();
340
341 list_for_each_entry_rcu(pmu, &pmus, entry) {
342
343 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
344
345 perf_pmu_disable(cpuctx->ctx.pmu);
346
347 /*
348 * perf_cgroup_events says at least one
349 * context on this CPU has cgroup events.
350 *
351 * ctx->nr_cgroups reports the number of cgroup
352 * events for a context.
353 */
354 if (cpuctx->ctx.nr_cgroups > 0) {
355
356 if (mode & PERF_CGROUP_SWOUT) {
357 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
358 /*
359 * must not be done before ctxswout due
360 * to event_filter_match() in event_sched_out()
361 */
362 cpuctx->cgrp = NULL;
363 }
364
365 if (mode & PERF_CGROUP_SWIN) {
366 /* set cgrp before ctxsw in to
367 * allow event_filter_match() to not
368 * have to pass task around
369 */
370 cpuctx->cgrp = perf_cgroup_from_task(task);
371 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
372 }
373 }
374
375 perf_pmu_enable(cpuctx->ctx.pmu);
376 }
377
378 rcu_read_unlock();
379
380 local_irq_restore(flags);
381}
382
383static inline void perf_cgroup_sched_out(struct task_struct *task)
384{
385 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
386}
387
388static inline void perf_cgroup_sched_in(struct task_struct *task)
389{
390 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
391}
392
393static inline int perf_cgroup_connect(int fd, struct perf_event *event,
394 struct perf_event_attr *attr,
395 struct perf_event *group_leader)
396{
397 struct perf_cgroup *cgrp;
398 struct cgroup_subsys_state *css;
399 struct file *file;
400 int ret = 0, fput_needed;
401
402 file = fget_light(fd, &fput_needed);
403 if (!file)
404 return -EBADF;
405
406 css = cgroup_css_from_dir(file, perf_subsys_id);
407 if (IS_ERR(css)) {
408 ret = PTR_ERR(css);
409 goto out;
410 }
411
412 cgrp = container_of(css, struct perf_cgroup, css);
413 event->cgrp = cgrp;
414
415 /* must be done before we fput() the file */
416 perf_get_cgroup(event);
417
418 /*
419 * all events in a group must monitor
420 * the same cgroup because a task belongs
421 * to only one perf cgroup at a time
422 */
423 if (group_leader && group_leader->cgrp != cgrp) {
424 perf_detach_cgroup(event);
425 ret = -EINVAL;
426 }
427out:
428 fput_light(file, fput_needed);
429 return ret;
430}
431
432static inline void
433perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
434{
435 struct perf_cgroup_info *t;
436 t = per_cpu_ptr(event->cgrp->info, event->cpu);
437 event->shadow_ctx_time = now - t->timestamp;
438}
439
440static inline void
441perf_cgroup_defer_enabled(struct perf_event *event)
442{
443 /*
444 * when the current task's perf cgroup does not match
445 * the event's, we need to remember to call the
446 * perf_cgroup_mark_enabled() function the first time a task with
447 * a matching perf cgroup is scheduled in.
448 */
449 if (is_cgroup_event(event) && !perf_cgroup_match(event))
450 event->cgrp_defer_enabled = 1;
451}
452
453static inline void
454perf_cgroup_mark_enabled(struct perf_event *event,
455 struct perf_event_context *ctx)
456{
457 struct perf_event *sub;
458 u64 tstamp = perf_event_time(event);
459
460 if (!event->cgrp_defer_enabled)
461 return;
462
463 event->cgrp_defer_enabled = 0;
464
465 event->tstamp_enabled = tstamp - event->total_time_enabled;
466 list_for_each_entry(sub, &event->sibling_list, group_entry) {
467 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
468 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
469 sub->cgrp_defer_enabled = 0;
470 }
471 }
472}
473#else /* !CONFIG_CGROUP_PERF */
474
475static inline bool
476perf_cgroup_match(struct perf_event *event)
477{
478 return true;
479}
480
481static inline void perf_detach_cgroup(struct perf_event *event)
482{}
483
484static inline int is_cgroup_event(struct perf_event *event)
485{
486 return 0;
487}
488
489static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
490{
491 return 0;
492}
493
494static inline void update_cgrp_time_from_event(struct perf_event *event)
495{
496}
497
498static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
499{
500}
501
502static inline void perf_cgroup_sched_out(struct task_struct *task)
503{
504}
505
506static inline void perf_cgroup_sched_in(struct task_struct *task)
507{
508}
509
510static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
511 struct perf_event_attr *attr,
512 struct perf_event *group_leader)
513{
514 return -EINVAL;
515}
516
517static inline void
518perf_cgroup_set_timestamp(struct task_struct *task,
519 struct perf_event_context *ctx)
520{
521}
522
523void
524perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
525{
526}
527
528static inline void
529perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
530{
531}
532
533static inline u64 perf_cgroup_event_time(struct perf_event *event)
534{
535 return 0;
536}
537
538static inline void
539perf_cgroup_defer_enabled(struct perf_event *event)
540{
541}
542
543static inline void
544perf_cgroup_mark_enabled(struct perf_event *event,
545 struct perf_event_context *ctx)
546{
547}
548#endif
549
92void perf_pmu_disable(struct pmu *pmu) 550void perf_pmu_disable(struct pmu *pmu)
93{ 551{
94 int *count = this_cpu_ptr(pmu->pmu_disable_count); 552 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -254,7 +712,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
254 raw_spin_lock_irqsave(&ctx->lock, flags); 712 raw_spin_lock_irqsave(&ctx->lock, flags);
255 --ctx->pin_count; 713 --ctx->pin_count;
256 raw_spin_unlock_irqrestore(&ctx->lock, flags); 714 raw_spin_unlock_irqrestore(&ctx->lock, flags);
257 put_ctx(ctx);
258} 715}
259 716
260/* 717/*
@@ -271,6 +728,10 @@ static void update_context_time(struct perf_event_context *ctx)
271static u64 perf_event_time(struct perf_event *event) 728static u64 perf_event_time(struct perf_event *event)
272{ 729{
273 struct perf_event_context *ctx = event->ctx; 730 struct perf_event_context *ctx = event->ctx;
731
732 if (is_cgroup_event(event))
733 return perf_cgroup_event_time(event);
734
274 return ctx ? ctx->time : 0; 735 return ctx ? ctx->time : 0;
275} 736}
276 737
@@ -285,9 +746,20 @@ static void update_event_times(struct perf_event *event)
285 if (event->state < PERF_EVENT_STATE_INACTIVE || 746 if (event->state < PERF_EVENT_STATE_INACTIVE ||
286 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 747 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
287 return; 748 return;
288 749 /*
289 if (ctx->is_active) 750 * in cgroup mode, time_enabled represents
751 * the time the event was enabled AND active
752 * tasks were in the monitored cgroup. This is
753 * independent of the activity of the context as
754 * there may be a mix of cgroup and non-cgroup events.
755 *
756 * That is why we treat cgroup events differently
757 * here.
758 */
759 if (is_cgroup_event(event))
290 run_end = perf_event_time(event); 760 run_end = perf_event_time(event);
761 else if (ctx->is_active)
762 run_end = ctx->time;
291 else 763 else
292 run_end = event->tstamp_stopped; 764 run_end = event->tstamp_stopped;
293 765
@@ -299,6 +771,7 @@ static void update_event_times(struct perf_event *event)
299 run_end = perf_event_time(event); 771 run_end = perf_event_time(event);
300 772
301 event->total_time_running = run_end - event->tstamp_running; 773 event->total_time_running = run_end - event->tstamp_running;
774
302} 775}
303 776
304/* 777/*
@@ -347,6 +820,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
347 list_add_tail(&event->group_entry, list); 820 list_add_tail(&event->group_entry, list);
348 } 821 }
349 822
823 if (is_cgroup_event(event))
824 ctx->nr_cgroups++;
825
350 list_add_rcu(&event->event_entry, &ctx->event_list); 826 list_add_rcu(&event->event_entry, &ctx->event_list);
351 if (!ctx->nr_events) 827 if (!ctx->nr_events)
352 perf_pmu_rotate_start(ctx->pmu); 828 perf_pmu_rotate_start(ctx->pmu);
@@ -473,6 +949,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
473 949
474 event->attach_state &= ~PERF_ATTACH_CONTEXT; 950 event->attach_state &= ~PERF_ATTACH_CONTEXT;
475 951
952 if (is_cgroup_event(event))
953 ctx->nr_cgroups--;
954
476 ctx->nr_events--; 955 ctx->nr_events--;
477 if (event->attr.inherit_stat) 956 if (event->attr.inherit_stat)
478 ctx->nr_stat--; 957 ctx->nr_stat--;
@@ -544,7 +1023,8 @@ out:
544static inline int 1023static inline int
545event_filter_match(struct perf_event *event) 1024event_filter_match(struct perf_event *event)
546{ 1025{
547 return event->cpu == -1 || event->cpu == smp_processor_id(); 1026 return (event->cpu == -1 || event->cpu == smp_processor_id())
1027 && perf_cgroup_match(event);
548} 1028}
549 1029
550static void 1030static void
@@ -562,7 +1042,7 @@ event_sched_out(struct perf_event *event,
562 */ 1042 */
563 if (event->state == PERF_EVENT_STATE_INACTIVE 1043 if (event->state == PERF_EVENT_STATE_INACTIVE
564 && !event_filter_match(event)) { 1044 && !event_filter_match(event)) {
565 delta = ctx->time - event->tstamp_stopped; 1045 delta = tstamp - event->tstamp_stopped;
566 event->tstamp_running += delta; 1046 event->tstamp_running += delta;
567 event->tstamp_stopped = tstamp; 1047 event->tstamp_stopped = tstamp;
568 } 1048 }
@@ -606,47 +1086,30 @@ group_sched_out(struct perf_event *group_event,
606 cpuctx->exclusive = 0; 1086 cpuctx->exclusive = 0;
607} 1087}
608 1088
609static inline struct perf_cpu_context *
610__get_cpu_context(struct perf_event_context *ctx)
611{
612 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
613}
614
615/* 1089/*
616 * Cross CPU call to remove a performance event 1090 * Cross CPU call to remove a performance event
617 * 1091 *
618 * We disable the event on the hardware level first. After that we 1092 * We disable the event on the hardware level first. After that we
619 * remove it from the context list. 1093 * remove it from the context list.
620 */ 1094 */
621static void __perf_event_remove_from_context(void *info) 1095static int __perf_remove_from_context(void *info)
622{ 1096{
623 struct perf_event *event = info; 1097 struct perf_event *event = info;
624 struct perf_event_context *ctx = event->ctx; 1098 struct perf_event_context *ctx = event->ctx;
625 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1099 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
626 1100
627 /*
628 * If this is a task context, we need to check whether it is
629 * the current task context of this cpu. If not it has been
630 * scheduled out before the smp call arrived.
631 */
632 if (ctx->task && cpuctx->task_ctx != ctx)
633 return;
634
635 raw_spin_lock(&ctx->lock); 1101 raw_spin_lock(&ctx->lock);
636
637 event_sched_out(event, cpuctx, ctx); 1102 event_sched_out(event, cpuctx, ctx);
638
639 list_del_event(event, ctx); 1103 list_del_event(event, ctx);
640
641 raw_spin_unlock(&ctx->lock); 1104 raw_spin_unlock(&ctx->lock);
1105
1106 return 0;
642} 1107}
643 1108
644 1109
645/* 1110/*
646 * Remove the event from a task's (or a CPU's) list of events. 1111 * Remove the event from a task's (or a CPU's) list of events.
647 * 1112 *
648 * Must be called with ctx->mutex held.
649 *
650 * CPU events are removed with a smp call. For task events we only 1113 * CPU events are removed with a smp call. For task events we only
651 * call when the task is on a CPU. 1114 * call when the task is on a CPU.
652 * 1115 *
@@ -657,49 +1120,48 @@ static void __perf_event_remove_from_context(void *info)
657 * When called from perf_event_exit_task, it's OK because the 1120 * When called from perf_event_exit_task, it's OK because the
658 * context has been detached from its task. 1121 * context has been detached from its task.
659 */ 1122 */
660static void perf_event_remove_from_context(struct perf_event *event) 1123static void perf_remove_from_context(struct perf_event *event)
661{ 1124{
662 struct perf_event_context *ctx = event->ctx; 1125 struct perf_event_context *ctx = event->ctx;
663 struct task_struct *task = ctx->task; 1126 struct task_struct *task = ctx->task;
664 1127
1128 lockdep_assert_held(&ctx->mutex);
1129
665 if (!task) { 1130 if (!task) {
666 /* 1131 /*
667 * Per cpu events are removed via an smp call and 1132 * Per cpu events are removed via an smp call and
668 * the removal is always successful. 1133 * the removal is always successful.
669 */ 1134 */
670 smp_call_function_single(event->cpu, 1135 cpu_function_call(event->cpu, __perf_remove_from_context, event);
671 __perf_event_remove_from_context,
672 event, 1);
673 return; 1136 return;
674 } 1137 }
675 1138
676retry: 1139retry:
677 task_oncpu_function_call(task, __perf_event_remove_from_context, 1140 if (!task_function_call(task, __perf_remove_from_context, event))
678 event); 1141 return;
679 1142
680 raw_spin_lock_irq(&ctx->lock); 1143 raw_spin_lock_irq(&ctx->lock);
681 /* 1144 /*
682 * If the context is active we need to retry the smp call. 1145 * If we failed to find a running task, but find the context active now
1146 * that we've acquired the ctx->lock, retry.
683 */ 1147 */
684 if (ctx->nr_active && !list_empty(&event->group_entry)) { 1148 if (ctx->is_active) {
685 raw_spin_unlock_irq(&ctx->lock); 1149 raw_spin_unlock_irq(&ctx->lock);
686 goto retry; 1150 goto retry;
687 } 1151 }
688 1152
689 /* 1153 /*
690 * The lock prevents that this context is scheduled in so we 1154 * Since the task isn't running, it's safe to remove the event, us
691 * can remove the event safely, if the call above did not 1155 * holding the ctx->lock ensures the task won't get scheduled in.
692 * succeed.
693 */ 1156 */
694 if (!list_empty(&event->group_entry)) 1157 list_del_event(event, ctx);
695 list_del_event(event, ctx);
696 raw_spin_unlock_irq(&ctx->lock); 1158 raw_spin_unlock_irq(&ctx->lock);
697} 1159}
698 1160
699/* 1161/*
700 * Cross CPU call to disable a performance event 1162 * Cross CPU call to disable a performance event
701 */ 1163 */
702static void __perf_event_disable(void *info) 1164static int __perf_event_disable(void *info)
703{ 1165{
704 struct perf_event *event = info; 1166 struct perf_event *event = info;
705 struct perf_event_context *ctx = event->ctx; 1167 struct perf_event_context *ctx = event->ctx;
@@ -708,9 +1170,12 @@ static void __perf_event_disable(void *info)
708 /* 1170 /*
709 * If this is a per-task event, need to check whether this 1171 * If this is a per-task event, need to check whether this
710 * event's task is the current task on this cpu. 1172 * event's task is the current task on this cpu.
1173 *
1174 * Can trigger due to concurrent perf_event_context_sched_out()
1175 * flipping contexts around.
711 */ 1176 */
712 if (ctx->task && cpuctx->task_ctx != ctx) 1177 if (ctx->task && cpuctx->task_ctx != ctx)
713 return; 1178 return -EINVAL;
714 1179
715 raw_spin_lock(&ctx->lock); 1180 raw_spin_lock(&ctx->lock);
716 1181
@@ -720,6 +1185,7 @@ static void __perf_event_disable(void *info)
720 */ 1185 */
721 if (event->state >= PERF_EVENT_STATE_INACTIVE) { 1186 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
722 update_context_time(ctx); 1187 update_context_time(ctx);
1188 update_cgrp_time_from_event(event);
723 update_group_times(event); 1189 update_group_times(event);
724 if (event == event->group_leader) 1190 if (event == event->group_leader)
725 group_sched_out(event, cpuctx, ctx); 1191 group_sched_out(event, cpuctx, ctx);
@@ -729,6 +1195,8 @@ static void __perf_event_disable(void *info)
729 } 1195 }
730 1196
731 raw_spin_unlock(&ctx->lock); 1197 raw_spin_unlock(&ctx->lock);
1198
1199 return 0;
732} 1200}
733 1201
734/* 1202/*
@@ -753,13 +1221,13 @@ void perf_event_disable(struct perf_event *event)
753 /* 1221 /*
754 * Disable the event on the cpu that it's on 1222 * Disable the event on the cpu that it's on
755 */ 1223 */
756 smp_call_function_single(event->cpu, __perf_event_disable, 1224 cpu_function_call(event->cpu, __perf_event_disable, event);
757 event, 1);
758 return; 1225 return;
759 } 1226 }
760 1227
761retry: 1228retry:
762 task_oncpu_function_call(task, __perf_event_disable, event); 1229 if (!task_function_call(task, __perf_event_disable, event))
1230 return;
763 1231
764 raw_spin_lock_irq(&ctx->lock); 1232 raw_spin_lock_irq(&ctx->lock);
765 /* 1233 /*
@@ -767,6 +1235,11 @@ retry:
767 */ 1235 */
768 if (event->state == PERF_EVENT_STATE_ACTIVE) { 1236 if (event->state == PERF_EVENT_STATE_ACTIVE) {
769 raw_spin_unlock_irq(&ctx->lock); 1237 raw_spin_unlock_irq(&ctx->lock);
1238 /*
1239 * Reload the task pointer, it might have been changed by
1240 * a concurrent perf_event_context_sched_out().
1241 */
1242 task = ctx->task;
770 goto retry; 1243 goto retry;
771 } 1244 }
772 1245
@@ -778,10 +1251,44 @@ retry:
778 update_group_times(event); 1251 update_group_times(event);
779 event->state = PERF_EVENT_STATE_OFF; 1252 event->state = PERF_EVENT_STATE_OFF;
780 } 1253 }
781
782 raw_spin_unlock_irq(&ctx->lock); 1254 raw_spin_unlock_irq(&ctx->lock);
783} 1255}
784 1256
1257static void perf_set_shadow_time(struct perf_event *event,
1258 struct perf_event_context *ctx,
1259 u64 tstamp)
1260{
1261 /*
1262 * use the correct time source for the time snapshot
1263 *
1264 * We could get by without this by leveraging the
1265 * fact that to get to this function, the caller
1266 * has most likely already called update_context_time()
1267 * and update_cgrp_time_xx() and thus both timestamp
1268 * are identical (or very close). Given that tstamp is,
1269 * already adjusted for cgroup, we could say that:
1270 * tstamp - ctx->timestamp
1271 * is equivalent to
1272 * tstamp - cgrp->timestamp.
1273 *
1274 * Then, in perf_output_read(), the calculation would
1275 * work with no changes because:
1276 * - event is guaranteed scheduled in
1277 * - no scheduled out in between
1278 * - thus the timestamp would be the same
1279 *
1280 * But this is a bit hairy.
1281 *
1282 * So instead, we have an explicit cgroup call to remain
1283 * within the same time source all along. We believe it
1284 * is cleaner and simpler to understand.
1285 */
1286 if (is_cgroup_event(event))
1287 perf_cgroup_set_shadow_time(event, tstamp);
1288 else
1289 event->shadow_ctx_time = tstamp - ctx->timestamp;
1290}
1291
785#define MAX_INTERRUPTS (~0ULL) 1292#define MAX_INTERRUPTS (~0ULL)
786 1293
787static void perf_log_throttle(struct perf_event *event, int enable); 1294static void perf_log_throttle(struct perf_event *event, int enable);
@@ -822,7 +1329,7 @@ event_sched_in(struct perf_event *event,
822 1329
823 event->tstamp_running += tstamp - event->tstamp_stopped; 1330 event->tstamp_running += tstamp - event->tstamp_stopped;
824 1331
825 event->shadow_ctx_time = tstamp - ctx->timestamp; 1332 perf_set_shadow_time(event, ctx, tstamp);
826 1333
827 if (!is_software_event(event)) 1334 if (!is_software_event(event))
828 cpuctx->active_oncpu++; 1335 cpuctx->active_oncpu++;
@@ -943,12 +1450,15 @@ static void add_event_to_ctx(struct perf_event *event,
943 event->tstamp_stopped = tstamp; 1450 event->tstamp_stopped = tstamp;
944} 1451}
945 1452
1453static void perf_event_context_sched_in(struct perf_event_context *ctx,
1454 struct task_struct *tsk);
1455
946/* 1456/*
947 * Cross CPU call to install and enable a performance event 1457 * Cross CPU call to install and enable a performance event
948 * 1458 *
949 * Must be called with ctx->mutex held 1459 * Must be called with ctx->mutex held
950 */ 1460 */
951static void __perf_install_in_context(void *info) 1461static int __perf_install_in_context(void *info)
952{ 1462{
953 struct perf_event *event = info; 1463 struct perf_event *event = info;
954 struct perf_event_context *ctx = event->ctx; 1464 struct perf_event_context *ctx = event->ctx;
@@ -957,21 +1467,22 @@ static void __perf_install_in_context(void *info)
957 int err; 1467 int err;
958 1468
959 /* 1469 /*
960 * If this is a task context, we need to check whether it is 1470 * In case we're installing a new context to an already running task,
961 * the current task context of this cpu. If not it has been 1471 * could also happen before perf_event_task_sched_in() on architectures
962 * scheduled out before the smp call arrived. 1472 * which do context switches with IRQs enabled.
963 * Or possibly this is the right context but it isn't
964 * on this cpu because it had no events.
965 */ 1473 */
966 if (ctx->task && cpuctx->task_ctx != ctx) { 1474 if (ctx->task && !cpuctx->task_ctx)
967 if (cpuctx->task_ctx || ctx->task != current) 1475 perf_event_context_sched_in(ctx, ctx->task);
968 return;
969 cpuctx->task_ctx = ctx;
970 }
971 1476
972 raw_spin_lock(&ctx->lock); 1477 raw_spin_lock(&ctx->lock);
973 ctx->is_active = 1; 1478 ctx->is_active = 1;
974 update_context_time(ctx); 1479 update_context_time(ctx);
1480 /*
1481 * update cgrp time only if current cgrp
1482 * matches event->cgrp. Must be done before
1483 * calling add_event_to_ctx()
1484 */
1485 update_cgrp_time_from_event(event);
975 1486
976 add_event_to_ctx(event, ctx); 1487 add_event_to_ctx(event, ctx);
977 1488
@@ -1012,6 +1523,8 @@ static void __perf_install_in_context(void *info)
1012 1523
1013unlock: 1524unlock:
1014 raw_spin_unlock(&ctx->lock); 1525 raw_spin_unlock(&ctx->lock);
1526
1527 return 0;
1015} 1528}
1016 1529
1017/* 1530/*
@@ -1023,8 +1536,6 @@ unlock:
1023 * If the event is attached to a task which is on a CPU we use a smp 1536 * If the event is attached to a task which is on a CPU we use a smp
1024 * call to enable it in the task context. The task might have been 1537 * call to enable it in the task context. The task might have been
1025 * scheduled away, but we check this in the smp call again. 1538 * scheduled away, but we check this in the smp call again.
1026 *
1027 * Must be called with ctx->mutex held.
1028 */ 1539 */
1029static void 1540static void
1030perf_install_in_context(struct perf_event_context *ctx, 1541perf_install_in_context(struct perf_event_context *ctx,
@@ -1033,6 +1544,8 @@ perf_install_in_context(struct perf_event_context *ctx,
1033{ 1544{
1034 struct task_struct *task = ctx->task; 1545 struct task_struct *task = ctx->task;
1035 1546
1547 lockdep_assert_held(&ctx->mutex);
1548
1036 event->ctx = ctx; 1549 event->ctx = ctx;
1037 1550
1038 if (!task) { 1551 if (!task) {
@@ -1040,31 +1553,29 @@ perf_install_in_context(struct perf_event_context *ctx,
1040 * Per cpu events are installed via an smp call and 1553 * Per cpu events are installed via an smp call and
1041 * the install is always successful. 1554 * the install is always successful.
1042 */ 1555 */
1043 smp_call_function_single(cpu, __perf_install_in_context, 1556 cpu_function_call(cpu, __perf_install_in_context, event);
1044 event, 1);
1045 return; 1557 return;
1046 } 1558 }
1047 1559
1048retry: 1560retry:
1049 task_oncpu_function_call(task, __perf_install_in_context, 1561 if (!task_function_call(task, __perf_install_in_context, event))
1050 event); 1562 return;
1051 1563
1052 raw_spin_lock_irq(&ctx->lock); 1564 raw_spin_lock_irq(&ctx->lock);
1053 /* 1565 /*
1054 * we need to retry the smp call. 1566 * If we failed to find a running task, but find the context active now
1567 * that we've acquired the ctx->lock, retry.
1055 */ 1568 */
1056 if (ctx->is_active && list_empty(&event->group_entry)) { 1569 if (ctx->is_active) {
1057 raw_spin_unlock_irq(&ctx->lock); 1570 raw_spin_unlock_irq(&ctx->lock);
1058 goto retry; 1571 goto retry;
1059 } 1572 }
1060 1573
1061 /* 1574 /*
1062 * The lock prevents that this context is scheduled in so we 1575 * Since the task isn't running, it's safe to add the event, us holding
1063 * can add the event safely, if it the call above did not 1576 * the ctx->lock ensures the task won't get scheduled in.
1064 * succeed.
1065 */ 1577 */
1066 if (list_empty(&event->group_entry)) 1578 add_event_to_ctx(event, ctx);
1067 add_event_to_ctx(event, ctx);
1068 raw_spin_unlock_irq(&ctx->lock); 1579 raw_spin_unlock_irq(&ctx->lock);
1069} 1580}
1070 1581
@@ -1093,7 +1604,7 @@ static void __perf_event_mark_enabled(struct perf_event *event,
1093/* 1604/*
1094 * Cross CPU call to enable a performance event 1605 * Cross CPU call to enable a performance event
1095 */ 1606 */
1096static void __perf_event_enable(void *info) 1607static int __perf_event_enable(void *info)
1097{ 1608{
1098 struct perf_event *event = info; 1609 struct perf_event *event = info;
1099 struct perf_event_context *ctx = event->ctx; 1610 struct perf_event_context *ctx = event->ctx;
@@ -1101,26 +1612,27 @@ static void __perf_event_enable(void *info)
1101 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1612 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1102 int err; 1613 int err;
1103 1614
1104 /* 1615 if (WARN_ON_ONCE(!ctx->is_active))
1105 * If this is a per-task event, need to check whether this 1616 return -EINVAL;
1106 * event's task is the current task on this cpu.
1107 */
1108 if (ctx->task && cpuctx->task_ctx != ctx) {
1109 if (cpuctx->task_ctx || ctx->task != current)
1110 return;
1111 cpuctx->task_ctx = ctx;
1112 }
1113 1617
1114 raw_spin_lock(&ctx->lock); 1618 raw_spin_lock(&ctx->lock);
1115 ctx->is_active = 1;
1116 update_context_time(ctx); 1619 update_context_time(ctx);
1117 1620
1118 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1621 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1119 goto unlock; 1622 goto unlock;
1623
1624 /*
1625 * set current task's cgroup time reference point
1626 */
1627 perf_cgroup_set_timestamp(current, ctx);
1628
1120 __perf_event_mark_enabled(event, ctx); 1629 __perf_event_mark_enabled(event, ctx);
1121 1630
1122 if (!event_filter_match(event)) 1631 if (!event_filter_match(event)) {
1632 if (is_cgroup_event(event))
1633 perf_cgroup_defer_enabled(event);
1123 goto unlock; 1634 goto unlock;
1635 }
1124 1636
1125 /* 1637 /*
1126 * If the event is in a group and isn't the group leader, 1638 * If the event is in a group and isn't the group leader,
@@ -1153,6 +1665,8 @@ static void __perf_event_enable(void *info)
1153 1665
1154unlock: 1666unlock:
1155 raw_spin_unlock(&ctx->lock); 1667 raw_spin_unlock(&ctx->lock);
1668
1669 return 0;
1156} 1670}
1157 1671
1158/* 1672/*
@@ -1173,8 +1687,7 @@ void perf_event_enable(struct perf_event *event)
1173 /* 1687 /*
1174 * Enable the event on the cpu that it's on 1688 * Enable the event on the cpu that it's on
1175 */ 1689 */
1176 smp_call_function_single(event->cpu, __perf_event_enable, 1690 cpu_function_call(event->cpu, __perf_event_enable, event);
1177 event, 1);
1178 return; 1691 return;
1179 } 1692 }
1180 1693
@@ -1193,8 +1706,15 @@ void perf_event_enable(struct perf_event *event)
1193 event->state = PERF_EVENT_STATE_OFF; 1706 event->state = PERF_EVENT_STATE_OFF;
1194 1707
1195retry: 1708retry:
1709 if (!ctx->is_active) {
1710 __perf_event_mark_enabled(event, ctx);
1711 goto out;
1712 }
1713
1196 raw_spin_unlock_irq(&ctx->lock); 1714 raw_spin_unlock_irq(&ctx->lock);
1197 task_oncpu_function_call(task, __perf_event_enable, event); 1715
1716 if (!task_function_call(task, __perf_event_enable, event))
1717 return;
1198 1718
1199 raw_spin_lock_irq(&ctx->lock); 1719 raw_spin_lock_irq(&ctx->lock);
1200 1720
@@ -1202,15 +1722,14 @@ retry:
1202 * If the context is active and the event is still off, 1722 * If the context is active and the event is still off,
1203 * we need to retry the cross-call. 1723 * we need to retry the cross-call.
1204 */ 1724 */
1205 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) 1725 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1726 /*
1727 * task could have been flipped by a concurrent
1728 * perf_event_context_sched_out()
1729 */
1730 task = ctx->task;
1206 goto retry; 1731 goto retry;
1207 1732 }
1208 /*
1209 * Since we have the lock this context can't be scheduled
1210 * in, so we can change the state safely.
1211 */
1212 if (event->state == PERF_EVENT_STATE_OFF)
1213 __perf_event_mark_enabled(event, ctx);
1214 1733
1215out: 1734out:
1216 raw_spin_unlock_irq(&ctx->lock); 1735 raw_spin_unlock_irq(&ctx->lock);
@@ -1242,6 +1761,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1242 if (likely(!ctx->nr_events)) 1761 if (likely(!ctx->nr_events))
1243 goto out; 1762 goto out;
1244 update_context_time(ctx); 1763 update_context_time(ctx);
1764 update_cgrp_time_from_cpuctx(cpuctx);
1245 1765
1246 if (!ctx->nr_active) 1766 if (!ctx->nr_active)
1247 goto out; 1767 goto out;
@@ -1354,8 +1874,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1354 } 1874 }
1355} 1875}
1356 1876
1357void perf_event_context_sched_out(struct task_struct *task, int ctxn, 1877static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1358 struct task_struct *next) 1878 struct task_struct *next)
1359{ 1879{
1360 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 1880 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1361 struct perf_event_context *next_ctx; 1881 struct perf_event_context *next_ctx;
@@ -1431,6 +1951,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
1431 1951
1432 for_each_task_context_nr(ctxn) 1952 for_each_task_context_nr(ctxn)
1433 perf_event_context_sched_out(task, ctxn, next); 1953 perf_event_context_sched_out(task, ctxn, next);
1954
1955 /*
1956 * if cgroup events exist on this CPU, then we need
1957 * to check if we have to switch out PMU state.
1958 * cgroup events are system-wide mode only
1959 */
1960 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1961 perf_cgroup_sched_out(task);
1434} 1962}
1435 1963
1436static void task_ctx_sched_out(struct perf_event_context *ctx, 1964static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1469,6 +1997,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1469 if (!event_filter_match(event)) 1997 if (!event_filter_match(event))
1470 continue; 1998 continue;
1471 1999
2000 /* may need to reset tstamp_enabled */
2001 if (is_cgroup_event(event))
2002 perf_cgroup_mark_enabled(event, ctx);
2003
1472 if (group_can_go_on(event, cpuctx, 1)) 2004 if (group_can_go_on(event, cpuctx, 1))
1473 group_sched_in(event, cpuctx, ctx); 2005 group_sched_in(event, cpuctx, ctx);
1474 2006
@@ -1501,6 +2033,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1501 if (!event_filter_match(event)) 2033 if (!event_filter_match(event))
1502 continue; 2034 continue;
1503 2035
2036 /* may need to reset tstamp_enabled */
2037 if (is_cgroup_event(event))
2038 perf_cgroup_mark_enabled(event, ctx);
2039
1504 if (group_can_go_on(event, cpuctx, can_add_hw)) { 2040 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1505 if (group_sched_in(event, cpuctx, ctx)) 2041 if (group_sched_in(event, cpuctx, ctx))
1506 can_add_hw = 0; 2042 can_add_hw = 0;
@@ -1511,15 +2047,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1511static void 2047static void
1512ctx_sched_in(struct perf_event_context *ctx, 2048ctx_sched_in(struct perf_event_context *ctx,
1513 struct perf_cpu_context *cpuctx, 2049 struct perf_cpu_context *cpuctx,
1514 enum event_type_t event_type) 2050 enum event_type_t event_type,
2051 struct task_struct *task)
1515{ 2052{
2053 u64 now;
2054
1516 raw_spin_lock(&ctx->lock); 2055 raw_spin_lock(&ctx->lock);
1517 ctx->is_active = 1; 2056 ctx->is_active = 1;
1518 if (likely(!ctx->nr_events)) 2057 if (likely(!ctx->nr_events))
1519 goto out; 2058 goto out;
1520 2059
1521 ctx->timestamp = perf_clock(); 2060 now = perf_clock();
1522 2061 ctx->timestamp = now;
2062 perf_cgroup_set_timestamp(task, ctx);
1523 /* 2063 /*
1524 * First go through the list and put on any pinned groups 2064 * First go through the list and put on any pinned groups
1525 * in order to give them the best chance of going on. 2065 * in order to give them the best chance of going on.
@@ -1536,11 +2076,12 @@ out:
1536} 2076}
1537 2077
1538static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2078static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1539 enum event_type_t event_type) 2079 enum event_type_t event_type,
2080 struct task_struct *task)
1540{ 2081{
1541 struct perf_event_context *ctx = &cpuctx->ctx; 2082 struct perf_event_context *ctx = &cpuctx->ctx;
1542 2083
1543 ctx_sched_in(ctx, cpuctx, event_type); 2084 ctx_sched_in(ctx, cpuctx, event_type, task);
1544} 2085}
1545 2086
1546static void task_ctx_sched_in(struct perf_event_context *ctx, 2087static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1548,15 +2089,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
1548{ 2089{
1549 struct perf_cpu_context *cpuctx; 2090 struct perf_cpu_context *cpuctx;
1550 2091
1551 cpuctx = __get_cpu_context(ctx); 2092 cpuctx = __get_cpu_context(ctx);
1552 if (cpuctx->task_ctx == ctx) 2093 if (cpuctx->task_ctx == ctx)
1553 return; 2094 return;
1554 2095
1555 ctx_sched_in(ctx, cpuctx, event_type); 2096 ctx_sched_in(ctx, cpuctx, event_type, NULL);
1556 cpuctx->task_ctx = ctx; 2097 cpuctx->task_ctx = ctx;
1557} 2098}
1558 2099
1559void perf_event_context_sched_in(struct perf_event_context *ctx) 2100static void perf_event_context_sched_in(struct perf_event_context *ctx,
2101 struct task_struct *task)
1560{ 2102{
1561 struct perf_cpu_context *cpuctx; 2103 struct perf_cpu_context *cpuctx;
1562 2104
@@ -1572,9 +2114,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
1572 */ 2114 */
1573 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2115 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1574 2116
1575 ctx_sched_in(ctx, cpuctx, EVENT_PINNED); 2117 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1576 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2118 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1577 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 2119 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1578 2120
1579 cpuctx->task_ctx = ctx; 2121 cpuctx->task_ctx = ctx;
1580 2122
@@ -1607,8 +2149,15 @@ void __perf_event_task_sched_in(struct task_struct *task)
1607 if (likely(!ctx)) 2149 if (likely(!ctx))
1608 continue; 2150 continue;
1609 2151
1610 perf_event_context_sched_in(ctx); 2152 perf_event_context_sched_in(ctx, task);
1611 } 2153 }
2154 /*
2155 * if cgroup events exist on this CPU, then we need
2156 * to check if we have to switch in PMU state.
2157 * cgroup event are system-wide mode only
2158 */
2159 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2160 perf_cgroup_sched_in(task);
1612} 2161}
1613 2162
1614static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2163static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -1638,7 +2187,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1638 * Reduce accuracy by one bit such that @a and @b converge 2187 * Reduce accuracy by one bit such that @a and @b converge
1639 * to a similar magnitude. 2188 * to a similar magnitude.
1640 */ 2189 */
1641#define REDUCE_FLS(a, b) \ 2190#define REDUCE_FLS(a, b) \
1642do { \ 2191do { \
1643 if (a##_fls > b##_fls) { \ 2192 if (a##_fls > b##_fls) { \
1644 a >>= 1; \ 2193 a >>= 1; \
@@ -1808,7 +2357,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1808 if (ctx) 2357 if (ctx)
1809 rotate_ctx(ctx); 2358 rotate_ctx(ctx);
1810 2359
1811 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2360 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
1812 if (ctx) 2361 if (ctx)
1813 task_ctx_sched_in(ctx, EVENT_FLEXIBLE); 2362 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1814 2363
@@ -1887,7 +2436,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1887 2436
1888 raw_spin_unlock(&ctx->lock); 2437 raw_spin_unlock(&ctx->lock);
1889 2438
1890 perf_event_context_sched_in(ctx); 2439 perf_event_context_sched_in(ctx, ctx->task);
1891out: 2440out:
1892 local_irq_restore(flags); 2441 local_irq_restore(flags);
1893} 2442}
@@ -1912,8 +2461,10 @@ static void __perf_event_read(void *info)
1912 return; 2461 return;
1913 2462
1914 raw_spin_lock(&ctx->lock); 2463 raw_spin_lock(&ctx->lock);
1915 if (ctx->is_active) 2464 if (ctx->is_active) {
1916 update_context_time(ctx); 2465 update_context_time(ctx);
2466 update_cgrp_time_from_event(event);
2467 }
1917 update_event_times(event); 2468 update_event_times(event);
1918 if (event->state == PERF_EVENT_STATE_ACTIVE) 2469 if (event->state == PERF_EVENT_STATE_ACTIVE)
1919 event->pmu->read(event); 2470 event->pmu->read(event);
@@ -1944,8 +2495,10 @@ static u64 perf_event_read(struct perf_event *event)
1944 * (e.g., thread is blocked), in that case 2495 * (e.g., thread is blocked), in that case
1945 * we cannot update context time 2496 * we cannot update context time
1946 */ 2497 */
1947 if (ctx->is_active) 2498 if (ctx->is_active) {
1948 update_context_time(ctx); 2499 update_context_time(ctx);
2500 update_cgrp_time_from_event(event);
2501 }
1949 update_event_times(event); 2502 update_event_times(event);
1950 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2503 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1951 } 2504 }
@@ -2224,6 +2777,9 @@ errout:
2224 2777
2225} 2778}
2226 2779
2780/*
2781 * Returns a matching context with refcount and pincount.
2782 */
2227static struct perf_event_context * 2783static struct perf_event_context *
2228find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 2784find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2229{ 2785{
@@ -2248,6 +2804,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2248 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 2804 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2249 ctx = &cpuctx->ctx; 2805 ctx = &cpuctx->ctx;
2250 get_ctx(ctx); 2806 get_ctx(ctx);
2807 ++ctx->pin_count;
2251 2808
2252 return ctx; 2809 return ctx;
2253 } 2810 }
@@ -2261,6 +2818,7 @@ retry:
2261 ctx = perf_lock_task_context(task, ctxn, &flags); 2818 ctx = perf_lock_task_context(task, ctxn, &flags);
2262 if (ctx) { 2819 if (ctx) {
2263 unclone_ctx(ctx); 2820 unclone_ctx(ctx);
2821 ++ctx->pin_count;
2264 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2822 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2265 } 2823 }
2266 2824
@@ -2282,8 +2840,10 @@ retry:
2282 err = -ESRCH; 2840 err = -ESRCH;
2283 else if (task->perf_event_ctxp[ctxn]) 2841 else if (task->perf_event_ctxp[ctxn])
2284 err = -EAGAIN; 2842 err = -EAGAIN;
2285 else 2843 else {
2844 ++ctx->pin_count;
2286 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 2845 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2846 }
2287 mutex_unlock(&task->perf_event_mutex); 2847 mutex_unlock(&task->perf_event_mutex);
2288 2848
2289 if (unlikely(err)) { 2849 if (unlikely(err)) {
@@ -2323,7 +2883,7 @@ static void free_event(struct perf_event *event)
2323 2883
2324 if (!event->parent) { 2884 if (!event->parent) {
2325 if (event->attach_state & PERF_ATTACH_TASK) 2885 if (event->attach_state & PERF_ATTACH_TASK)
2326 jump_label_dec(&perf_task_events); 2886 jump_label_dec(&perf_sched_events);
2327 if (event->attr.mmap || event->attr.mmap_data) 2887 if (event->attr.mmap || event->attr.mmap_data)
2328 atomic_dec(&nr_mmap_events); 2888 atomic_dec(&nr_mmap_events);
2329 if (event->attr.comm) 2889 if (event->attr.comm)
@@ -2332,6 +2892,10 @@ static void free_event(struct perf_event *event)
2332 atomic_dec(&nr_task_events); 2892 atomic_dec(&nr_task_events);
2333 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 2893 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2334 put_callchain_buffers(); 2894 put_callchain_buffers();
2895 if (is_cgroup_event(event)) {
2896 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2897 jump_label_dec(&perf_sched_events);
2898 }
2335 } 2899 }
2336 2900
2337 if (event->buffer) { 2901 if (event->buffer) {
@@ -2339,6 +2903,9 @@ static void free_event(struct perf_event *event)
2339 event->buffer = NULL; 2903 event->buffer = NULL;
2340 } 2904 }
2341 2905
2906 if (is_cgroup_event(event))
2907 perf_detach_cgroup(event);
2908
2342 if (event->destroy) 2909 if (event->destroy)
2343 event->destroy(event); 2910 event->destroy(event);
2344 2911
@@ -4406,26 +4973,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4406 if (unlikely(!is_sampling_event(event))) 4973 if (unlikely(!is_sampling_event(event)))
4407 return 0; 4974 return 0;
4408 4975
4409 if (!throttle) { 4976 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
4410 hwc->interrupts++; 4977 if (throttle) {
4411 } else { 4978 hwc->interrupts = MAX_INTERRUPTS;
4412 if (hwc->interrupts != MAX_INTERRUPTS) { 4979 perf_log_throttle(event, 0);
4413 hwc->interrupts++;
4414 if (HZ * hwc->interrupts >
4415 (u64)sysctl_perf_event_sample_rate) {
4416 hwc->interrupts = MAX_INTERRUPTS;
4417 perf_log_throttle(event, 0);
4418 ret = 1;
4419 }
4420 } else {
4421 /*
4422 * Keep re-disabling events even though on the previous
4423 * pass we disabled it - just in case we raced with a
4424 * sched-in and the event got enabled again:
4425 */
4426 ret = 1; 4980 ret = 1;
4427 } 4981 }
4428 } 4982 } else
4983 hwc->interrupts++;
4429 4984
4430 if (event->attr.freq) { 4985 if (event->attr.freq) {
4431 u64 now = perf_clock(); 4986 u64 now = perf_clock();
@@ -5062,6 +5617,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5062 u64 period; 5617 u64 period;
5063 5618
5064 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 5619 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
5620
5621 if (event->state != PERF_EVENT_STATE_ACTIVE)
5622 return HRTIMER_NORESTART;
5623
5065 event->pmu->read(event); 5624 event->pmu->read(event);
5066 5625
5067 perf_sample_data_init(&data, 0); 5626 perf_sample_data_init(&data, 0);
@@ -5088,9 +5647,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
5088 if (!is_sampling_event(event)) 5647 if (!is_sampling_event(event))
5089 return; 5648 return;
5090 5649
5091 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5092 hwc->hrtimer.function = perf_swevent_hrtimer;
5093
5094 period = local64_read(&hwc->period_left); 5650 period = local64_read(&hwc->period_left);
5095 if (period) { 5651 if (period) {
5096 if (period < 0) 5652 if (period < 0)
@@ -5117,6 +5673,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5117 } 5673 }
5118} 5674}
5119 5675
5676static void perf_swevent_init_hrtimer(struct perf_event *event)
5677{
5678 struct hw_perf_event *hwc = &event->hw;
5679
5680 if (!is_sampling_event(event))
5681 return;
5682
5683 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5684 hwc->hrtimer.function = perf_swevent_hrtimer;
5685
5686 /*
5687 * Since hrtimers have a fixed rate, we can do a static freq->period
5688 * mapping and avoid the whole period adjust feedback stuff.
5689 */
5690 if (event->attr.freq) {
5691 long freq = event->attr.sample_freq;
5692
5693 event->attr.sample_period = NSEC_PER_SEC / freq;
5694 hwc->sample_period = event->attr.sample_period;
5695 local64_set(&hwc->period_left, hwc->sample_period);
5696 event->attr.freq = 0;
5697 }
5698}
5699
5120/* 5700/*
5121 * Software event: cpu wall time clock 5701 * Software event: cpu wall time clock
5122 */ 5702 */
@@ -5169,6 +5749,8 @@ static int cpu_clock_event_init(struct perf_event *event)
5169 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 5749 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5170 return -ENOENT; 5750 return -ENOENT;
5171 5751
5752 perf_swevent_init_hrtimer(event);
5753
5172 return 0; 5754 return 0;
5173} 5755}
5174 5756
@@ -5224,16 +5806,9 @@ static void task_clock_event_del(struct perf_event *event, int flags)
5224 5806
5225static void task_clock_event_read(struct perf_event *event) 5807static void task_clock_event_read(struct perf_event *event)
5226{ 5808{
5227 u64 time; 5809 u64 now = perf_clock();
5228 5810 u64 delta = now - event->ctx->timestamp;
5229 if (!in_nmi()) { 5811 u64 time = event->ctx->time + delta;
5230 update_context_time(event->ctx);
5231 time = event->ctx->time;
5232 } else {
5233 u64 now = perf_clock();
5234 u64 delta = now - event->ctx->timestamp;
5235 time = event->ctx->time + delta;
5236 }
5237 5812
5238 task_clock_event_update(event, time); 5813 task_clock_event_update(event, time);
5239} 5814}
@@ -5246,6 +5821,8 @@ static int task_clock_event_init(struct perf_event *event)
5246 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 5821 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5247 return -ENOENT; 5822 return -ENOENT;
5248 5823
5824 perf_swevent_init_hrtimer(event);
5825
5249 return 0; 5826 return 0;
5250} 5827}
5251 5828
@@ -5517,17 +6094,22 @@ struct pmu *perf_init_event(struct perf_event *event)
5517{ 6094{
5518 struct pmu *pmu = NULL; 6095 struct pmu *pmu = NULL;
5519 int idx; 6096 int idx;
6097 int ret;
5520 6098
5521 idx = srcu_read_lock(&pmus_srcu); 6099 idx = srcu_read_lock(&pmus_srcu);
5522 6100
5523 rcu_read_lock(); 6101 rcu_read_lock();
5524 pmu = idr_find(&pmu_idr, event->attr.type); 6102 pmu = idr_find(&pmu_idr, event->attr.type);
5525 rcu_read_unlock(); 6103 rcu_read_unlock();
5526 if (pmu) 6104 if (pmu) {
6105 ret = pmu->event_init(event);
6106 if (ret)
6107 pmu = ERR_PTR(ret);
5527 goto unlock; 6108 goto unlock;
6109 }
5528 6110
5529 list_for_each_entry_rcu(pmu, &pmus, entry) { 6111 list_for_each_entry_rcu(pmu, &pmus, entry) {
5530 int ret = pmu->event_init(event); 6112 ret = pmu->event_init(event);
5531 if (!ret) 6113 if (!ret)
5532 goto unlock; 6114 goto unlock;
5533 6115
@@ -5653,7 +6235,7 @@ done:
5653 6235
5654 if (!event->parent) { 6236 if (!event->parent) {
5655 if (event->attach_state & PERF_ATTACH_TASK) 6237 if (event->attach_state & PERF_ATTACH_TASK)
5656 jump_label_inc(&perf_task_events); 6238 jump_label_inc(&perf_sched_events);
5657 if (event->attr.mmap || event->attr.mmap_data) 6239 if (event->attr.mmap || event->attr.mmap_data)
5658 atomic_inc(&nr_mmap_events); 6240 atomic_inc(&nr_mmap_events);
5659 if (event->attr.comm) 6241 if (event->attr.comm)
@@ -5828,7 +6410,7 @@ SYSCALL_DEFINE5(perf_event_open,
5828 int err; 6410 int err;
5829 6411
5830 /* for future expandability... */ 6412 /* for future expandability... */
5831 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) 6413 if (flags & ~PERF_FLAG_ALL)
5832 return -EINVAL; 6414 return -EINVAL;
5833 6415
5834 err = perf_copy_attr(attr_uptr, &attr); 6416 err = perf_copy_attr(attr_uptr, &attr);
@@ -5845,6 +6427,15 @@ SYSCALL_DEFINE5(perf_event_open,
5845 return -EINVAL; 6427 return -EINVAL;
5846 } 6428 }
5847 6429
6430 /*
6431 * In cgroup mode, the pid argument is used to pass the fd
6432 * opened to the cgroup directory in cgroupfs. The cpu argument
6433 * designates the cpu on which to monitor threads from that
6434 * cgroup.
6435 */
6436 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6437 return -EINVAL;
6438
5848 event_fd = get_unused_fd_flags(O_RDWR); 6439 event_fd = get_unused_fd_flags(O_RDWR);
5849 if (event_fd < 0) 6440 if (event_fd < 0)
5850 return event_fd; 6441 return event_fd;
@@ -5862,7 +6453,7 @@ SYSCALL_DEFINE5(perf_event_open,
5862 group_leader = NULL; 6453 group_leader = NULL;
5863 } 6454 }
5864 6455
5865 if (pid != -1) { 6456 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
5866 task = find_lively_task_by_vpid(pid); 6457 task = find_lively_task_by_vpid(pid);
5867 if (IS_ERR(task)) { 6458 if (IS_ERR(task)) {
5868 err = PTR_ERR(task); 6459 err = PTR_ERR(task);
@@ -5876,6 +6467,19 @@ SYSCALL_DEFINE5(perf_event_open,
5876 goto err_task; 6467 goto err_task;
5877 } 6468 }
5878 6469
6470 if (flags & PERF_FLAG_PID_CGROUP) {
6471 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6472 if (err)
6473 goto err_alloc;
6474 /*
6475 * one more event:
6476 * - that has cgroup constraint on event->cpu
6477 * - that may need work on context switch
6478 */
6479 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6480 jump_label_inc(&perf_sched_events);
6481 }
6482
5879 /* 6483 /*
5880 * Special case software events and allow them to be part of 6484 * Special case software events and allow them to be part of
5881 * any hardware group. 6485 * any hardware group.
@@ -5961,10 +6565,10 @@ SYSCALL_DEFINE5(perf_event_open,
5961 struct perf_event_context *gctx = group_leader->ctx; 6565 struct perf_event_context *gctx = group_leader->ctx;
5962 6566
5963 mutex_lock(&gctx->mutex); 6567 mutex_lock(&gctx->mutex);
5964 perf_event_remove_from_context(group_leader); 6568 perf_remove_from_context(group_leader);
5965 list_for_each_entry(sibling, &group_leader->sibling_list, 6569 list_for_each_entry(sibling, &group_leader->sibling_list,
5966 group_entry) { 6570 group_entry) {
5967 perf_event_remove_from_context(sibling); 6571 perf_remove_from_context(sibling);
5968 put_ctx(gctx); 6572 put_ctx(gctx);
5969 } 6573 }
5970 mutex_unlock(&gctx->mutex); 6574 mutex_unlock(&gctx->mutex);
@@ -5987,6 +6591,7 @@ SYSCALL_DEFINE5(perf_event_open,
5987 6591
5988 perf_install_in_context(ctx, event, cpu); 6592 perf_install_in_context(ctx, event, cpu);
5989 ++ctx->generation; 6593 ++ctx->generation;
6594 perf_unpin_context(ctx);
5990 mutex_unlock(&ctx->mutex); 6595 mutex_unlock(&ctx->mutex);
5991 6596
5992 event->owner = current; 6597 event->owner = current;
@@ -6012,6 +6617,7 @@ SYSCALL_DEFINE5(perf_event_open,
6012 return event_fd; 6617 return event_fd;
6013 6618
6014err_context: 6619err_context:
6620 perf_unpin_context(ctx);
6015 put_ctx(ctx); 6621 put_ctx(ctx);
6016err_alloc: 6622err_alloc:
6017 free_event(event); 6623 free_event(event);
@@ -6062,6 +6668,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6062 mutex_lock(&ctx->mutex); 6668 mutex_lock(&ctx->mutex);
6063 perf_install_in_context(ctx, event, cpu); 6669 perf_install_in_context(ctx, event, cpu);
6064 ++ctx->generation; 6670 ++ctx->generation;
6671 perf_unpin_context(ctx);
6065 mutex_unlock(&ctx->mutex); 6672 mutex_unlock(&ctx->mutex);
6066 6673
6067 return event; 6674 return event;
@@ -6115,7 +6722,7 @@ __perf_event_exit_task(struct perf_event *child_event,
6115{ 6722{
6116 struct perf_event *parent_event; 6723 struct perf_event *parent_event;
6117 6724
6118 perf_event_remove_from_context(child_event); 6725 perf_remove_from_context(child_event);
6119 6726
6120 parent_event = child_event->parent; 6727 parent_event = child_event->parent;
6121 /* 6728 /*
@@ -6422,7 +7029,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
6422 return 0; 7029 return 0;
6423 } 7030 }
6424 7031
6425 child_ctx = child->perf_event_ctxp[ctxn]; 7032 child_ctx = child->perf_event_ctxp[ctxn];
6426 if (!child_ctx) { 7033 if (!child_ctx) {
6427 /* 7034 /*
6428 * This is executed from the parent task context, so 7035 * This is executed from the parent task context, so
@@ -6537,6 +7144,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6537 mutex_unlock(&parent_ctx->mutex); 7144 mutex_unlock(&parent_ctx->mutex);
6538 7145
6539 perf_unpin_context(parent_ctx); 7146 perf_unpin_context(parent_ctx);
7147 put_ctx(parent_ctx);
6540 7148
6541 return ret; 7149 return ret;
6542} 7150}
@@ -6606,9 +7214,9 @@ static void __perf_event_exit_context(void *__info)
6606 perf_pmu_rotate_stop(ctx->pmu); 7214 perf_pmu_rotate_stop(ctx->pmu);
6607 7215
6608 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 7216 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
6609 __perf_event_remove_from_context(event); 7217 __perf_remove_from_context(event);
6610 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 7218 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
6611 __perf_event_remove_from_context(event); 7219 __perf_remove_from_context(event);
6612} 7220}
6613 7221
6614static void perf_event_exit_cpu_context(int cpu) 7222static void perf_event_exit_cpu_context(int cpu)
@@ -6732,3 +7340,83 @@ unlock:
6732 return ret; 7340 return ret;
6733} 7341}
6734device_initcall(perf_event_sysfs_init); 7342device_initcall(perf_event_sysfs_init);
7343
7344#ifdef CONFIG_CGROUP_PERF
7345static struct cgroup_subsys_state *perf_cgroup_create(
7346 struct cgroup_subsys *ss, struct cgroup *cont)
7347{
7348 struct perf_cgroup *jc;
7349
7350 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
7351 if (!jc)
7352 return ERR_PTR(-ENOMEM);
7353
7354 jc->info = alloc_percpu(struct perf_cgroup_info);
7355 if (!jc->info) {
7356 kfree(jc);
7357 return ERR_PTR(-ENOMEM);
7358 }
7359
7360 return &jc->css;
7361}
7362
7363static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7364 struct cgroup *cont)
7365{
7366 struct perf_cgroup *jc;
7367 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7368 struct perf_cgroup, css);
7369 free_percpu(jc->info);
7370 kfree(jc);
7371}
7372
7373static int __perf_cgroup_move(void *info)
7374{
7375 struct task_struct *task = info;
7376 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7377 return 0;
7378}
7379
7380static void perf_cgroup_move(struct task_struct *task)
7381{
7382 task_function_call(task, __perf_cgroup_move, task);
7383}
7384
7385static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7386 struct cgroup *old_cgrp, struct task_struct *task,
7387 bool threadgroup)
7388{
7389 perf_cgroup_move(task);
7390 if (threadgroup) {
7391 struct task_struct *c;
7392 rcu_read_lock();
7393 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7394 perf_cgroup_move(c);
7395 }
7396 rcu_read_unlock();
7397 }
7398}
7399
7400static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7401 struct cgroup *old_cgrp, struct task_struct *task)
7402{
7403 /*
7404 * cgroup_exit() is called in the copy_process() failure path.
7405 * Ignore this case since the task hasn't ran yet, this avoids
7406 * trying to poke a half freed task state from generic code.
7407 */
7408 if (!(task->flags & PF_EXITING))
7409 return;
7410
7411 perf_cgroup_move(task);
7412}
7413
7414struct cgroup_subsys perf_subsys = {
7415 .name = "perf_event",
7416 .subsys_id = perf_subsys_id,
7417 .create = perf_cgroup_create,
7418 .destroy = perf_cgroup_destroy,
7419 .exit = perf_cgroup_exit,
7420 .attach = perf_cgroup_attach,
7421};
7422#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/sched.c b/kernel/sched.c
index 42eab5a8437d..57a18e8d28c8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -606,9 +606,6 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct task_group *tg; 606 struct task_group *tg;
607 struct cgroup_subsys_state *css; 607 struct cgroup_subsys_state *css;
608 608
609 if (p->flags & PF_EXITING)
610 return &root_task_group;
611
612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 609 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
613 lockdep_is_held(&task_rq(p)->lock)); 610 lockdep_is_held(&task_rq(p)->lock));
614 tg = container_of(css, struct task_group, css); 611 tg = container_of(css, struct task_group, css);
@@ -2265,27 +2262,6 @@ void kick_process(struct task_struct *p)
2265EXPORT_SYMBOL_GPL(kick_process); 2262EXPORT_SYMBOL_GPL(kick_process);
2266#endif /* CONFIG_SMP */ 2263#endif /* CONFIG_SMP */
2267 2264
2268/**
2269 * task_oncpu_function_call - call a function on the cpu on which a task runs
2270 * @p: the task to evaluate
2271 * @func: the function to be called
2272 * @info: the function call argument
2273 *
2274 * Calls the function @func when the task is currently running. This might
2275 * be on the current CPU, which just calls the function directly
2276 */
2277void task_oncpu_function_call(struct task_struct *p,
2278 void (*func) (void *info), void *info)
2279{
2280 int cpu;
2281
2282 preempt_disable();
2283 cpu = task_cpu(p);
2284 if (task_curr(p))
2285 smp_call_function_single(cpu, func, info, 1);
2286 preempt_enable();
2287}
2288
2289#ifdef CONFIG_SMP 2265#ifdef CONFIG_SMP
2290/* 2266/*
2291 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2267 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
@@ -2776,9 +2752,12 @@ static inline void
2776prepare_task_switch(struct rq *rq, struct task_struct *prev, 2752prepare_task_switch(struct rq *rq, struct task_struct *prev,
2777 struct task_struct *next) 2753 struct task_struct *next)
2778{ 2754{
2755 sched_info_switch(prev, next);
2756 perf_event_task_sched_out(prev, next);
2779 fire_sched_out_preempt_notifiers(prev, next); 2757 fire_sched_out_preempt_notifiers(prev, next);
2780 prepare_lock_switch(rq, next); 2758 prepare_lock_switch(rq, next);
2781 prepare_arch_switch(next); 2759 prepare_arch_switch(next);
2760 trace_sched_switch(prev, next);
2782} 2761}
2783 2762
2784/** 2763/**
@@ -2911,7 +2890,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2911 struct mm_struct *mm, *oldmm; 2890 struct mm_struct *mm, *oldmm;
2912 2891
2913 prepare_task_switch(rq, prev, next); 2892 prepare_task_switch(rq, prev, next);
2914 trace_sched_switch(prev, next); 2893
2915 mm = next->mm; 2894 mm = next->mm;
2916 oldmm = prev->active_mm; 2895 oldmm = prev->active_mm;
2917 /* 2896 /*
@@ -3989,9 +3968,6 @@ need_resched_nonpreemptible:
3989 rq->skip_clock_update = 0; 3968 rq->skip_clock_update = 0;
3990 3969
3991 if (likely(prev != next)) { 3970 if (likely(prev != next)) {
3992 sched_info_switch(prev, next);
3993 perf_event_task_sched_out(prev, next);
3994
3995 rq->nr_switches++; 3971 rq->nr_switches++;
3996 rq->curr = next; 3972 rq->curr = next;
3997 ++*switch_count; 3973 ++*switch_count;
@@ -5572,7 +5548,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5572 * The idle tasks have their own, simple scheduling class: 5548 * The idle tasks have their own, simple scheduling class:
5573 */ 5549 */
5574 idle->sched_class = &idle_sched_class; 5550 idle->sched_class = &idle_sched_class;
5575 ftrace_graph_init_task(idle); 5551 ftrace_graph_init_idle_task(idle, cpu);
5576} 5552}
5577 5553
5578/* 5554/*
@@ -8885,7 +8861,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8885} 8861}
8886 8862
8887static void 8863static void
8888cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) 8864cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
8865 struct cgroup *old_cgrp, struct task_struct *task)
8889{ 8866{
8890 /* 8867 /*
8891 * cgroup_exit() is called in the copy_process() failure path. 8868 * cgroup_exit() is called in the copy_process() failure path.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4eed0af5d144..19b9d85e06cc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -948,7 +948,7 @@ static struct ctl_table kern_table[] = {
948 .data = &sysctl_perf_event_sample_rate, 948 .data = &sysctl_perf_event_sample_rate,
949 .maxlen = sizeof(sysctl_perf_event_sample_rate), 949 .maxlen = sizeof(sysctl_perf_event_sample_rate),
950 .mode = 0644, 950 .mode = 0644,
951 .proc_handler = proc_dointvec, 951 .proc_handler = perf_proc_update_handler,
952 }, 952 },
953#endif 953#endif
954#ifdef CONFIG_KMEMCHECK 954#ifdef CONFIG_KMEMCHECK
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f3dadae83883..888b611897d3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3328,7 +3328,7 @@ static int start_graph_tracing(void)
3328 /* The cpu_boot init_task->ret_stack will never be freed */ 3328 /* The cpu_boot init_task->ret_stack will never be freed */
3329 for_each_online_cpu(cpu) { 3329 for_each_online_cpu(cpu) {
3330 if (!idle_task(cpu)->ret_stack) 3330 if (!idle_task(cpu)->ret_stack)
3331 ftrace_graph_init_task(idle_task(cpu)); 3331 ftrace_graph_init_idle_task(idle_task(cpu), cpu);
3332 } 3332 }
3333 3333
3334 do { 3334 do {
@@ -3418,6 +3418,49 @@ void unregister_ftrace_graph(void)
3418 mutex_unlock(&ftrace_lock); 3418 mutex_unlock(&ftrace_lock);
3419} 3419}
3420 3420
3421static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
3422
3423static void
3424graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
3425{
3426 atomic_set(&t->tracing_graph_pause, 0);
3427 atomic_set(&t->trace_overrun, 0);
3428 t->ftrace_timestamp = 0;
3429 /* make curr_ret_stack visible before we add the ret_stack */
3430 smp_wmb();
3431 t->ret_stack = ret_stack;
3432}
3433
3434/*
3435 * Allocate a return stack for the idle task. May be the first
3436 * time through, or it may be done by CPU hotplug online.
3437 */
3438void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
3439{
3440 t->curr_ret_stack = -1;
3441 /*
3442 * The idle task has no parent, it either has its own
3443 * stack or no stack at all.
3444 */
3445 if (t->ret_stack)
3446 WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
3447
3448 if (ftrace_graph_active) {
3449 struct ftrace_ret_stack *ret_stack;
3450
3451 ret_stack = per_cpu(idle_ret_stack, cpu);
3452 if (!ret_stack) {
3453 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
3454 * sizeof(struct ftrace_ret_stack),
3455 GFP_KERNEL);
3456 if (!ret_stack)
3457 return;
3458 per_cpu(idle_ret_stack, cpu) = ret_stack;
3459 }
3460 graph_init_task(t, ret_stack);
3461 }
3462}
3463
3421/* Allocate a return stack for newly created task */ 3464/* Allocate a return stack for newly created task */
3422void ftrace_graph_init_task(struct task_struct *t) 3465void ftrace_graph_init_task(struct task_struct *t)
3423{ 3466{
@@ -3433,12 +3476,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3433 GFP_KERNEL); 3476 GFP_KERNEL);
3434 if (!ret_stack) 3477 if (!ret_stack)
3435 return; 3478 return;
3436 atomic_set(&t->tracing_graph_pause, 0); 3479 graph_init_task(t, ret_stack);
3437 atomic_set(&t->trace_overrun, 0);
3438 t->ftrace_timestamp = 0;
3439 /* make curr_ret_stack visible before we add the ret_stack */
3440 smp_wmb();
3441 t->ret_stack = ret_stack;
3442 } 3480 }
3443} 3481}
3444 3482
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bd1c35a4fbcc..db7b439d23ee 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5,7 +5,6 @@
5 */ 5 */
6#include <linux/ring_buffer.h> 6#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 7#include <linux/trace_clock.h>
8#include <linux/ftrace_irq.h>
9#include <linux/spinlock.h> 8#include <linux/spinlock.h>
10#include <linux/debugfs.h> 9#include <linux/debugfs.h>
11#include <linux/uaccess.h> 10#include <linux/uaccess.h>
@@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1429} 1428}
1430EXPORT_SYMBOL_GPL(ring_buffer_resize); 1429EXPORT_SYMBOL_GPL(ring_buffer_resize);
1431 1430
1431void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
1432{
1433 mutex_lock(&buffer->mutex);
1434 if (val)
1435 buffer->flags |= RB_FL_OVERWRITE;
1436 else
1437 buffer->flags &= ~RB_FL_OVERWRITE;
1438 mutex_unlock(&buffer->mutex);
1439}
1440EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
1441
1432static inline void * 1442static inline void *
1433__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) 1443__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
1434{ 1444{
@@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2162 if (likely(ts >= cpu_buffer->write_stamp)) { 2172 if (likely(ts >= cpu_buffer->write_stamp)) {
2163 delta = diff; 2173 delta = diff;
2164 if (unlikely(test_time_stamp(delta))) { 2174 if (unlikely(test_time_stamp(delta))) {
2175 int local_clock_stable = 1;
2176#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2177 local_clock_stable = sched_clock_stable;
2178#endif
2165 WARN_ONCE(delta > (1ULL << 59), 2179 WARN_ONCE(delta > (1ULL << 59),
2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", 2180 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
2167 (unsigned long long)delta, 2181 (unsigned long long)delta,
2168 (unsigned long long)ts, 2182 (unsigned long long)ts,
2169 (unsigned long long)cpu_buffer->write_stamp); 2183 (unsigned long long)cpu_buffer->write_stamp,
2184 local_clock_stable ? "" :
2185 "If you just came from a suspend/resume,\n"
2186 "please switch to the trace global clock:\n"
2187 " echo global > /sys/kernel/debug/tracing/trace_clock\n");
2170 add_timestamp = 1; 2188 add_timestamp = 1;
2171 } 2189 }
2172 } 2190 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dc53ecb80589..9541c27c1cf2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,8 +41,6 @@
41#include "trace.h" 41#include "trace.h"
42#include "trace_output.h" 42#include "trace_output.h"
43 43
44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
45
46/* 44/*
47 * On boot up, the ring buffer is set to the minimum size, so that 45 * On boot up, the ring buffer is set to the minimum size, so that
48 * we do not waste memory on systems that are not using tracing. 46 * we do not waste memory on systems that are not using tracing.
@@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
340/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
341unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
342 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
343 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
344 342
345static int trace_stop_count; 343static int trace_stop_count;
346static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
@@ -425,6 +423,7 @@ static const char *trace_options[] = {
425 "sleep-time", 423 "sleep-time",
426 "graph-time", 424 "graph-time",
427 "record-cmd", 425 "record-cmd",
426 "overwrite",
428 NULL 427 NULL
429}; 428};
430 429
@@ -780,6 +779,11 @@ __acquires(kernel_lock)
780 tracing_reset_online_cpus(tr); 779 tracing_reset_online_cpus(tr);
781 780
782 current_trace = type; 781 current_trace = type;
782
783 /* If we expanded the buffers, make sure the max is expanded too */
784 if (ring_buffer_expanded && type->use_max_tr)
785 ring_buffer_resize(max_tr.buffer, trace_buf_size);
786
783 /* the test is responsible for initializing and enabling */ 787 /* the test is responsible for initializing and enabling */
784 pr_info("Testing tracer %s: ", type->name); 788 pr_info("Testing tracer %s: ", type->name);
785 ret = type->selftest(type, tr); 789 ret = type->selftest(type, tr);
@@ -792,6 +796,10 @@ __acquires(kernel_lock)
792 /* Only reset on passing, to avoid touching corrupted buffers */ 796 /* Only reset on passing, to avoid touching corrupted buffers */
793 tracing_reset_online_cpus(tr); 797 tracing_reset_online_cpus(tr);
794 798
799 /* Shrink the max buffer again */
800 if (ring_buffer_expanded && type->use_max_tr)
801 ring_buffer_resize(max_tr.buffer, 1);
802
795 printk(KERN_CONT "PASSED\n"); 803 printk(KERN_CONT "PASSED\n");
796 } 804 }
797#endif 805#endif
@@ -1102,7 +1110,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1102 1110
1103 entry->preempt_count = pc & 0xff; 1111 entry->preempt_count = pc & 0xff;
1104 entry->pid = (tsk) ? tsk->pid : 0; 1112 entry->pid = (tsk) ? tsk->pid : 0;
1105 entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
1106 entry->flags = 1113 entry->flags =
1107#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1114#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1108 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1115 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1749,10 +1756,9 @@ static void print_lat_help_header(struct seq_file *m)
1749 seq_puts(m, "# | / _----=> need-resched \n"); 1756 seq_puts(m, "# | / _----=> need-resched \n");
1750 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1757 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1751 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1758 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1752 seq_puts(m, "# |||| /_--=> lock-depth \n"); 1759 seq_puts(m, "# |||| / delay \n");
1753 seq_puts(m, "# |||||/ delay \n"); 1760 seq_puts(m, "# cmd pid ||||| time | caller \n");
1754 seq_puts(m, "# cmd pid |||||| time | caller \n"); 1761 seq_puts(m, "# \\ / ||||| \\ | / \n");
1755 seq_puts(m, "# \\ / |||||| \\ | / \n");
1756} 1762}
1757 1763
1758static void print_func_help_header(struct seq_file *m) 1764static void print_func_help_header(struct seq_file *m)
@@ -2529,6 +2535,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2529 2535
2530 if (mask == TRACE_ITER_RECORD_CMD) 2536 if (mask == TRACE_ITER_RECORD_CMD)
2531 trace_event_enable_cmd_record(enabled); 2537 trace_event_enable_cmd_record(enabled);
2538
2539 if (mask == TRACE_ITER_OVERWRITE)
2540 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2532} 2541}
2533 2542
2534static ssize_t 2543static ssize_t
@@ -2710,6 +2719,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2710 2719
2711 mutex_lock(&trace_types_lock); 2720 mutex_lock(&trace_types_lock);
2712 if (tracer_enabled ^ val) { 2721 if (tracer_enabled ^ val) {
2722
2723 /* Only need to warn if this is used to change the state */
2724 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2725
2713 if (val) { 2726 if (val) {
2714 tracer_enabled = 1; 2727 tracer_enabled = 1;
2715 if (current_trace->start) 2728 if (current_trace->start)
@@ -4551,9 +4564,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4551__init static int tracer_alloc_buffers(void) 4564__init static int tracer_alloc_buffers(void)
4552{ 4565{
4553 int ring_buf_size; 4566 int ring_buf_size;
4567 enum ring_buffer_flags rb_flags;
4554 int i; 4568 int i;
4555 int ret = -ENOMEM; 4569 int ret = -ENOMEM;
4556 4570
4571
4557 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 4572 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
4558 goto out; 4573 goto out;
4559 4574
@@ -4566,12 +4581,13 @@ __init static int tracer_alloc_buffers(void)
4566 else 4581 else
4567 ring_buf_size = 1; 4582 ring_buf_size = 1;
4568 4583
4584 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
4585
4569 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4586 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4570 cpumask_copy(tracing_cpumask, cpu_all_mask); 4587 cpumask_copy(tracing_cpumask, cpu_all_mask);
4571 4588
4572 /* TODO: make the number of buffers hot pluggable with CPUS */ 4589 /* TODO: make the number of buffers hot pluggable with CPUS */
4573 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4590 global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
4574 TRACE_BUFFER_FLAGS);
4575 if (!global_trace.buffer) { 4591 if (!global_trace.buffer) {
4576 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 4592 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
4577 WARN_ON(1); 4593 WARN_ON(1);
@@ -4581,7 +4597,7 @@ __init static int tracer_alloc_buffers(void)
4581 4597
4582 4598
4583#ifdef CONFIG_TRACER_MAX_TRACE 4599#ifdef CONFIG_TRACER_MAX_TRACE
4584 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); 4600 max_tr.buffer = ring_buffer_alloc(1, rb_flags);
4585 if (!max_tr.buffer) { 4601 if (!max_tr.buffer) {
4586 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4602 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4587 WARN_ON(1); 4603 WARN_ON(1);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9021f8c0c0c3..5e9dfc6286dd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,8 +272,8 @@ struct tracer {
272 /* If you handled the flag setting, return 0 */ 272 /* If you handled the flag setting, return 0 */
273 int (*set_flag)(u32 old_flags, u32 bit, int set); 273 int (*set_flag)(u32 old_flags, u32 bit, int set);
274 struct tracer *next; 274 struct tracer *next;
275 int print_max;
276 struct tracer_flags *flags; 275 struct tracer_flags *flags;
276 int print_max;
277 int use_max_tr; 277 int use_max_tr;
278}; 278};
279 279
@@ -606,6 +606,7 @@ enum trace_iterator_flags {
606 TRACE_ITER_SLEEP_TIME = 0x40000, 606 TRACE_ITER_SLEEP_TIME = 0x40000,
607 TRACE_ITER_GRAPH_TIME = 0x80000, 607 TRACE_ITER_GRAPH_TIME = 0x80000,
608 TRACE_ITER_RECORD_CMD = 0x100000, 608 TRACE_ITER_RECORD_CMD = 0x100000,
609 TRACE_ITER_OVERWRITE = 0x200000,
609}; 610};
610 611
611/* 612/*
@@ -661,8 +662,10 @@ struct ftrace_event_field {
661}; 662};
662 663
663struct event_filter { 664struct event_filter {
664 int n_preds; 665 int n_preds; /* Number assigned */
665 struct filter_pred **preds; 666 int a_preds; /* allocated */
667 struct filter_pred *preds;
668 struct filter_pred *root;
666 char *filter_string; 669 char *filter_string;
667}; 670};
668 671
@@ -674,11 +677,23 @@ struct event_subsystem {
674 int nr_events; 677 int nr_events;
675}; 678};
676 679
680#define FILTER_PRED_INVALID ((unsigned short)-1)
681#define FILTER_PRED_IS_RIGHT (1 << 15)
682#define FILTER_PRED_FOLD (1 << 15)
683
684/*
685 * The max preds is the size of unsigned short with
686 * two flags at the MSBs. One bit is used for both the IS_RIGHT
687 * and FOLD flags. The other is reserved.
688 *
689 * 2^14 preds is way more than enough.
690 */
691#define MAX_FILTER_PRED 16384
692
677struct filter_pred; 693struct filter_pred;
678struct regex; 694struct regex;
679 695
680typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 696typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
681 int val1, int val2);
682 697
683typedef int (*regex_match_func)(char *str, struct regex *r, int len); 698typedef int (*regex_match_func)(char *str, struct regex *r, int len);
684 699
@@ -700,11 +715,23 @@ struct filter_pred {
700 filter_pred_fn_t fn; 715 filter_pred_fn_t fn;
701 u64 val; 716 u64 val;
702 struct regex regex; 717 struct regex regex;
703 char *field_name; 718 /*
719 * Leaf nodes use field_name, ops is used by AND and OR
720 * nodes. The field_name is always freed when freeing a pred.
721 * We can overload field_name for ops and have it freed
722 * as well.
723 */
724 union {
725 char *field_name;
726 unsigned short *ops;
727 };
704 int offset; 728 int offset;
705 int not; 729 int not;
706 int op; 730 int op;
707 int pop_n; 731 unsigned short index;
732 unsigned short parent;
733 unsigned short left;
734 unsigned short right;
708}; 735};
709 736
710extern struct list_head ftrace_common_fields; 737extern struct list_head ftrace_common_fields;
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 6cf223764be8..1516cb3ec549 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
109 */ 109 */
110#define FTRACE_CTX_FIELDS \ 110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \ 111 __field( unsigned int, prev_pid ) \
112 __field( unsigned int, next_pid ) \
113 __field( unsigned int, next_cpu ) \
112 __field( unsigned char, prev_prio ) \ 114 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \ 115 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \ 116 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \ 117 __field( unsigned char, next_state )
117 __field( unsigned int, next_cpu )
118 118
119FTRACE_ENTRY(context_switch, ctx_switch_entry, 119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120 120
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5f499e0438a4..e88f74fe1d4c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, lock_depth);
120 119
121 return ret; 120 return ret;
122} 121}
@@ -326,6 +325,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
326{ 325{
327 return __ftrace_set_clr_event(NULL, system, event, set); 326 return __ftrace_set_clr_event(NULL, system, event, set);
328} 327}
328EXPORT_SYMBOL_GPL(trace_set_clr_event);
329 329
330/* 128 should be much more than enough */ 330/* 128 should be much more than enough */
331#define EVENT_BUF_SIZE 127 331#define EVENT_BUF_SIZE 127
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 36d40104b17f..3249b4f77ef0 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -123,9 +123,13 @@ struct filter_parse_state {
123 } operand; 123 } operand;
124}; 124};
125 125
126struct pred_stack {
127 struct filter_pred **preds;
128 int index;
129};
130
126#define DEFINE_COMPARISON_PRED(type) \ 131#define DEFINE_COMPARISON_PRED(type) \
127static int filter_pred_##type(struct filter_pred *pred, void *event, \ 132static int filter_pred_##type(struct filter_pred *pred, void *event) \
128 int val1, int val2) \
129{ \ 133{ \
130 type *addr = (type *)(event + pred->offset); \ 134 type *addr = (type *)(event + pred->offset); \
131 type val = (type)pred->val; \ 135 type val = (type)pred->val; \
@@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \
152} 156}
153 157
154#define DEFINE_EQUALITY_PRED(size) \ 158#define DEFINE_EQUALITY_PRED(size) \
155static int filter_pred_##size(struct filter_pred *pred, void *event, \ 159static int filter_pred_##size(struct filter_pred *pred, void *event) \
156 int val1, int val2) \
157{ \ 160{ \
158 u##size *addr = (u##size *)(event + pred->offset); \ 161 u##size *addr = (u##size *)(event + pred->offset); \
159 u##size val = (u##size)pred->val; \ 162 u##size val = (u##size)pred->val; \
@@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32);
178DEFINE_EQUALITY_PRED(16); 181DEFINE_EQUALITY_PRED(16);
179DEFINE_EQUALITY_PRED(8); 182DEFINE_EQUALITY_PRED(8);
180 183
181static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
182 void *event __attribute((unused)),
183 int val1, int val2)
184{
185 return val1 && val2;
186}
187
188static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
189 void *event __attribute((unused)),
190 int val1, int val2)
191{
192 return val1 || val2;
193}
194
195/* Filter predicate for fixed sized arrays of characters */ 184/* Filter predicate for fixed sized arrays of characters */
196static int filter_pred_string(struct filter_pred *pred, void *event, 185static int filter_pred_string(struct filter_pred *pred, void *event)
197 int val1, int val2)
198{ 186{
199 char *addr = (char *)(event + pred->offset); 187 char *addr = (char *)(event + pred->offset);
200 int cmp, match; 188 int cmp, match;
@@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
207} 195}
208 196
209/* Filter predicate for char * pointers */ 197/* Filter predicate for char * pointers */
210static int filter_pred_pchar(struct filter_pred *pred, void *event, 198static int filter_pred_pchar(struct filter_pred *pred, void *event)
211 int val1, int val2)
212{ 199{
213 char **addr = (char **)(event + pred->offset); 200 char **addr = (char **)(event + pred->offset);
214 int cmp, match; 201 int cmp, match;
@@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
231 * and add it to the address of the entry, and at last we have 218 * and add it to the address of the entry, and at last we have
232 * the address of the string. 219 * the address of the string.
233 */ 220 */
234static int filter_pred_strloc(struct filter_pred *pred, void *event, 221static int filter_pred_strloc(struct filter_pred *pred, void *event)
235 int val1, int val2)
236{ 222{
237 u32 str_item = *(u32 *)(event + pred->offset); 223 u32 str_item = *(u32 *)(event + pred->offset);
238 int str_loc = str_item & 0xffff; 224 int str_loc = str_item & 0xffff;
@@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
247 return match; 233 return match;
248} 234}
249 235
250static int filter_pred_none(struct filter_pred *pred, void *event, 236static int filter_pred_none(struct filter_pred *pred, void *event)
251 int val1, int val2)
252{ 237{
253 return 0; 238 return 0;
254} 239}
@@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred)
377 pred->not ^= not; 362 pred->not ^= not;
378} 363}
379 364
365enum move_type {
366 MOVE_DOWN,
367 MOVE_UP_FROM_LEFT,
368 MOVE_UP_FROM_RIGHT
369};
370
371static struct filter_pred *
372get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
373 int index, enum move_type *move)
374{
375 if (pred->parent & FILTER_PRED_IS_RIGHT)
376 *move = MOVE_UP_FROM_RIGHT;
377 else
378 *move = MOVE_UP_FROM_LEFT;
379 pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
380
381 return pred;
382}
383
384/*
385 * A series of AND or ORs where found together. Instead of
386 * climbing up and down the tree branches, an array of the
387 * ops were made in order of checks. We can just move across
388 * the array and short circuit if needed.
389 */
390static int process_ops(struct filter_pred *preds,
391 struct filter_pred *op, void *rec)
392{
393 struct filter_pred *pred;
394 int type;
395 int match;
396 int i;
397
398 /*
399 * Micro-optimization: We set type to true if op
400 * is an OR and false otherwise (AND). Then we
401 * just need to test if the match is equal to
402 * the type, and if it is, we can short circuit the
403 * rest of the checks:
404 *
405 * if ((match && op->op == OP_OR) ||
406 * (!match && op->op == OP_AND))
407 * return match;
408 */
409 type = op->op == OP_OR;
410
411 for (i = 0; i < op->val; i++) {
412 pred = &preds[op->ops[i]];
413 match = pred->fn(pred, rec);
414 if (!!match == type)
415 return match;
416 }
417 return match;
418}
419
380/* return 1 if event matches, 0 otherwise (discard) */ 420/* return 1 if event matches, 0 otherwise (discard) */
381int filter_match_preds(struct event_filter *filter, void *rec) 421int filter_match_preds(struct event_filter *filter, void *rec)
382{ 422{
383 int match, top = 0, val1 = 0, val2 = 0; 423 int match = -1;
384 int stack[MAX_FILTER_PRED]; 424 enum move_type move = MOVE_DOWN;
425 struct filter_pred *preds;
385 struct filter_pred *pred; 426 struct filter_pred *pred;
386 int i; 427 struct filter_pred *root;
428 int n_preds;
429 int done = 0;
430
431 /* no filter is considered a match */
432 if (!filter)
433 return 1;
434
435 n_preds = filter->n_preds;
436
437 if (!n_preds)
438 return 1;
439
440 /*
441 * n_preds, root and filter->preds are protect with preemption disabled.
442 */
443 preds = rcu_dereference_sched(filter->preds);
444 root = rcu_dereference_sched(filter->root);
445 if (!root)
446 return 1;
447
448 pred = root;
387 449
388 for (i = 0; i < filter->n_preds; i++) { 450 /* match is currently meaningless */
389 pred = filter->preds[i]; 451 match = -1;
390 if (!pred->pop_n) { 452
391 match = pred->fn(pred, rec, val1, val2); 453 do {
392 stack[top++] = match; 454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460 /* keep going to down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
393 continue; 500 continue;
394 } 501 }
395 if (pred->pop_n > top) { 502 done = 1;
396 WARN_ON_ONCE(1); 503 } while (!done);
397 return 0;
398 }
399 val1 = stack[--top];
400 val2 = stack[--top];
401 match = pred->fn(pred, rec, val1, val2);
402 stack[top++] = match;
403 }
404 504
405 return stack[--top]; 505 return match;
406} 506}
407EXPORT_SYMBOL_GPL(filter_match_preds); 507EXPORT_SYMBOL_GPL(filter_match_preds);
408 508
@@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos)
414 514
415static void remove_filter_string(struct event_filter *filter) 515static void remove_filter_string(struct event_filter *filter)
416{ 516{
517 if (!filter)
518 return;
519
417 kfree(filter->filter_string); 520 kfree(filter->filter_string);
418 filter->filter_string = NULL; 521 filter->filter_string = NULL;
419} 522}
@@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps,
473 576
474void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 577void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
475{ 578{
476 struct event_filter *filter = call->filter; 579 struct event_filter *filter;
477 580
478 mutex_lock(&event_mutex); 581 mutex_lock(&event_mutex);
582 filter = call->filter;
479 if (filter && filter->filter_string) 583 if (filter && filter->filter_string)
480 trace_seq_printf(s, "%s\n", filter->filter_string); 584 trace_seq_printf(s, "%s\n", filter->filter_string);
481 else 585 else
@@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
486void print_subsystem_event_filter(struct event_subsystem *system, 590void print_subsystem_event_filter(struct event_subsystem *system,
487 struct trace_seq *s) 591 struct trace_seq *s)
488{ 592{
489 struct event_filter *filter = system->filter; 593 struct event_filter *filter;
490 594
491 mutex_lock(&event_mutex); 595 mutex_lock(&event_mutex);
596 filter = system->filter;
492 if (filter && filter->filter_string) 597 if (filter && filter->filter_string)
493 trace_seq_printf(s, "%s\n", filter->filter_string); 598 trace_seq_printf(s, "%s\n", filter->filter_string);
494 else 599 else
@@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred)
539 pred->regex.len = 0; 644 pred->regex.len = 0;
540} 645}
541 646
542static int filter_set_pred(struct filter_pred *dest, 647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
648{
649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
650 if (!stack->preds)
651 return -ENOMEM;
652 stack->index = n_preds;
653 return 0;
654}
655
656static void __free_pred_stack(struct pred_stack *stack)
657{
658 kfree(stack->preds);
659 stack->index = 0;
660}
661
662static int __push_pred_stack(struct pred_stack *stack,
663 struct filter_pred *pred)
664{
665 int index = stack->index;
666
667 if (WARN_ON(index == 0))
668 return -ENOSPC;
669
670 stack->preds[--index] = pred;
671 stack->index = index;
672 return 0;
673}
674
675static struct filter_pred *
676__pop_pred_stack(struct pred_stack *stack)
677{
678 struct filter_pred *pred;
679 int index = stack->index;
680
681 pred = stack->preds[index++];
682 if (!pred)
683 return NULL;
684
685 stack->index = index;
686 return pred;
687}
688
689static int filter_set_pred(struct event_filter *filter,
690 int idx,
691 struct pred_stack *stack,
543 struct filter_pred *src, 692 struct filter_pred *src,
544 filter_pred_fn_t fn) 693 filter_pred_fn_t fn)
545{ 694{
695 struct filter_pred *dest = &filter->preds[idx];
696 struct filter_pred *left;
697 struct filter_pred *right;
698
546 *dest = *src; 699 *dest = *src;
547 if (src->field_name) { 700 if (src->field_name) {
548 dest->field_name = kstrdup(src->field_name, GFP_KERNEL); 701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
@@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest,
550 return -ENOMEM; 703 return -ENOMEM;
551 } 704 }
552 dest->fn = fn; 705 dest->fn = fn;
706 dest->index = idx;
553 707
554 return 0; 708 if (dest->op == OP_OR || dest->op == OP_AND) {
709 right = __pop_pred_stack(stack);
710 left = __pop_pred_stack(stack);
711 if (!left || !right)
712 return -EINVAL;
713 /*
714 * If both children can be folded
715 * and they are the same op as this op or a leaf,
716 * then this op can be folded.
717 */
718 if (left->index & FILTER_PRED_FOLD &&
719 (left->op == dest->op ||
720 left->left == FILTER_PRED_INVALID) &&
721 right->index & FILTER_PRED_FOLD &&
722 (right->op == dest->op ||
723 right->left == FILTER_PRED_INVALID))
724 dest->index |= FILTER_PRED_FOLD;
725
726 dest->left = left->index & ~FILTER_PRED_FOLD;
727 dest->right = right->index & ~FILTER_PRED_FOLD;
728 left->parent = dest->index & ~FILTER_PRED_FOLD;
729 right->parent = dest->index | FILTER_PRED_IS_RIGHT;
730 } else {
731 /*
732 * Make dest->left invalid to be used as a quick
733 * way to know this is a leaf node.
734 */
735 dest->left = FILTER_PRED_INVALID;
736
737 /* All leafs allow folding the parent ops. */
738 dest->index |= FILTER_PRED_FOLD;
739 }
740
741 return __push_pred_stack(stack, dest);
555} 742}
556 743
557static void filter_disable_preds(struct ftrace_event_call *call) 744static void __free_preds(struct event_filter *filter)
558{ 745{
559 struct event_filter *filter = call->filter;
560 int i; 746 int i;
561 747
562 call->flags &= ~TRACE_EVENT_FL_FILTERED; 748 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
751 kfree(filter->preds);
752 filter->preds = NULL;
753 }
754 filter->a_preds = 0;
563 filter->n_preds = 0; 755 filter->n_preds = 0;
564
565 for (i = 0; i < MAX_FILTER_PRED; i++)
566 filter->preds[i]->fn = filter_pred_none;
567} 756}
568 757
569static void __free_preds(struct event_filter *filter) 758static void filter_disable(struct ftrace_event_call *call)
570{ 759{
571 int i; 760 call->flags &= ~TRACE_EVENT_FL_FILTERED;
761}
572 762
763static void __free_filter(struct event_filter *filter)
764{
573 if (!filter) 765 if (!filter)
574 return; 766 return;
575 767
576 for (i = 0; i < MAX_FILTER_PRED; i++) { 768 __free_preds(filter);
577 if (filter->preds[i])
578 filter_free_pred(filter->preds[i]);
579 }
580 kfree(filter->preds);
581 kfree(filter->filter_string); 769 kfree(filter->filter_string);
582 kfree(filter); 770 kfree(filter);
583} 771}
584 772
773/*
774 * Called when destroying the ftrace_event_call.
775 * The call is being freed, so we do not need to worry about
776 * the call being currently used. This is for module code removing
777 * the tracepoints from within it.
778 */
585void destroy_preds(struct ftrace_event_call *call) 779void destroy_preds(struct ftrace_event_call *call)
586{ 780{
587 __free_preds(call->filter); 781 __free_filter(call->filter);
588 call->filter = NULL; 782 call->filter = NULL;
589 call->flags &= ~TRACE_EVENT_FL_FILTERED;
590} 783}
591 784
592static struct event_filter *__alloc_preds(void) 785static struct event_filter *__alloc_filter(void)
593{ 786{
594 struct event_filter *filter; 787 struct event_filter *filter;
788
789 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
790 return filter;
791}
792
793static int __alloc_preds(struct event_filter *filter, int n_preds)
794{
595 struct filter_pred *pred; 795 struct filter_pred *pred;
596 int i; 796 int i;
597 797
598 filter = kzalloc(sizeof(*filter), GFP_KERNEL); 798 if (filter->preds)
599 if (!filter) 799 __free_preds(filter);
600 return ERR_PTR(-ENOMEM);
601 800
602 filter->n_preds = 0; 801 filter->preds =
802 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
603 803
604 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
605 if (!filter->preds) 804 if (!filter->preds)
606 goto oom; 805 return -ENOMEM;
607 806
608 for (i = 0; i < MAX_FILTER_PRED; i++) { 807 filter->a_preds = n_preds;
609 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 808 filter->n_preds = 0;
610 if (!pred) 809
611 goto oom; 810 for (i = 0; i < n_preds; i++) {
811 pred = &filter->preds[i];
612 pred->fn = filter_pred_none; 812 pred->fn = filter_pred_none;
613 filter->preds[i] = pred;
614 } 813 }
615 814
616 return filter;
617
618oom:
619 __free_preds(filter);
620 return ERR_PTR(-ENOMEM);
621}
622
623static int init_preds(struct ftrace_event_call *call)
624{
625 if (call->filter)
626 return 0;
627
628 call->flags &= ~TRACE_EVENT_FL_FILTERED;
629 call->filter = __alloc_preds();
630 if (IS_ERR(call->filter))
631 return PTR_ERR(call->filter);
632
633 return 0; 815 return 0;
634} 816}
635 817
636static int init_subsystem_preds(struct event_subsystem *system) 818static void filter_free_subsystem_preds(struct event_subsystem *system)
637{ 819{
638 struct ftrace_event_call *call; 820 struct ftrace_event_call *call;
639 int err;
640 821
641 list_for_each_entry(call, &ftrace_events, list) { 822 list_for_each_entry(call, &ftrace_events, list) {
642 if (strcmp(call->class->system, system->name) != 0) 823 if (strcmp(call->class->system, system->name) != 0)
643 continue; 824 continue;
644 825
645 err = init_preds(call); 826 filter_disable(call);
646 if (err) 827 remove_filter_string(call->filter);
647 return err;
648 } 828 }
649
650 return 0;
651} 829}
652 830
653static void filter_free_subsystem_preds(struct event_subsystem *system) 831static void filter_free_subsystem_filters(struct event_subsystem *system)
654{ 832{
655 struct ftrace_event_call *call; 833 struct ftrace_event_call *call;
656 834
657 list_for_each_entry(call, &ftrace_events, list) { 835 list_for_each_entry(call, &ftrace_events, list) {
658 if (strcmp(call->class->system, system->name) != 0) 836 if (strcmp(call->class->system, system->name) != 0)
659 continue; 837 continue;
660 838 __free_filter(call->filter);
661 filter_disable_preds(call); 839 call->filter = NULL;
662 remove_filter_string(call->filter);
663 } 840 }
664} 841}
665 842
@@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
667 struct ftrace_event_call *call, 844 struct ftrace_event_call *call,
668 struct event_filter *filter, 845 struct event_filter *filter,
669 struct filter_pred *pred, 846 struct filter_pred *pred,
847 struct pred_stack *stack,
670 filter_pred_fn_t fn) 848 filter_pred_fn_t fn)
671{ 849{
672 int idx, err; 850 int idx, err;
673 851
674 if (filter->n_preds == MAX_FILTER_PRED) { 852 if (WARN_ON(filter->n_preds == filter->a_preds)) {
675 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
676 return -ENOSPC; 854 return -ENOSPC;
677 } 855 }
678 856
679 idx = filter->n_preds; 857 idx = filter->n_preds;
680 filter_clear_pred(filter->preds[idx]); 858 filter_clear_pred(&filter->preds[idx]);
681 err = filter_set_pred(filter->preds[idx], pred, fn); 859 err = filter_set_pred(filter, idx, stack, pred, fn);
682 if (err) 860 if (err)
683 return err; 861 return err;
684 862
@@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
763 struct ftrace_event_call *call, 941 struct ftrace_event_call *call,
764 struct event_filter *filter, 942 struct event_filter *filter,
765 struct filter_pred *pred, 943 struct filter_pred *pred,
944 struct pred_stack *stack,
766 bool dry_run) 945 bool dry_run)
767{ 946{
768 struct ftrace_event_field *field; 947 struct ftrace_event_field *field;
@@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
770 unsigned long long val; 949 unsigned long long val;
771 int ret; 950 int ret;
772 951
773 pred->fn = filter_pred_none; 952 fn = pred->fn = filter_pred_none;
774 953
775 if (pred->op == OP_AND) { 954 if (pred->op == OP_AND)
776 pred->pop_n = 2;
777 fn = filter_pred_and;
778 goto add_pred_fn; 955 goto add_pred_fn;
779 } else if (pred->op == OP_OR) { 956 else if (pred->op == OP_OR)
780 pred->pop_n = 2;
781 fn = filter_pred_or;
782 goto add_pred_fn; 957 goto add_pred_fn;
783 }
784 958
785 field = find_event_field(call, pred->field_name); 959 field = find_event_field(call, pred->field_name);
786 if (!field) { 960 if (!field) {
@@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
829 1003
830add_pred_fn: 1004add_pred_fn:
831 if (!dry_run) 1005 if (!dry_run)
832 return filter_add_pred_fn(ps, call, filter, pred, fn); 1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
833 return 0; 1007 return 0;
834} 1008}
835 1009
@@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps)
1187 return 0; 1361 return 0;
1188} 1362}
1189 1363
1364static int count_preds(struct filter_parse_state *ps)
1365{
1366 struct postfix_elt *elt;
1367 int n_preds = 0;
1368
1369 list_for_each_entry(elt, &ps->postfix, list) {
1370 if (elt->op == OP_NONE)
1371 continue;
1372 n_preds++;
1373 }
1374
1375 return n_preds;
1376}
1377
1378/*
1379 * The tree is walked at filtering of an event. If the tree is not correctly
1380 * built, it may cause an infinite loop. Check here that the tree does
1381 * indeed terminate.
1382 */
1383static int check_pred_tree(struct event_filter *filter,
1384 struct filter_pred *root)
1385{
1386 struct filter_pred *preds;
1387 struct filter_pred *pred;
1388 enum move_type move = MOVE_DOWN;
1389 int count = 0;
1390 int done = 0;
1391 int max;
1392
1393 /*
1394 * The max that we can hit a node is three times.
1395 * Once going down, once coming up from left, and
1396 * once coming up from right. This is more than enough
1397 * since leafs are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1400
1401 preds = filter->preds;
1402 if (!preds)
1403 return -EINVAL;
1404 pred = root;
1405
1406 do {
1407 if (WARN_ON(count++ > max))
1408 return -EINVAL;
1409
1410 switch (move) {
1411 case MOVE_DOWN:
1412 if (pred->left != FILTER_PRED_INVALID) {
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1435
1436 /* We are fine. */
1437 return 0;
1438}
1439
1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1441{
1442 struct filter_pred *pred;
1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1446
1447 pred = root;
1448
1449 do {
1450 switch (move) {
1451 case MOVE_DOWN:
1452 if (pred->left != FILTER_PRED_INVALID) {
1453 pred = &preds[pred->left];
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1476
1477 return count;
1478}
1479
1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1481{
1482 struct filter_pred *pred;
1483 enum move_type move = MOVE_DOWN;
1484 int count = 0;
1485 int children;
1486 int done = 0;
1487
1488 /* No need to keep the fold flag */
1489 root->index &= ~FILTER_PRED_FOLD;
1490
1491 /* If the root is a leaf then do nothing */
1492 if (root->left == FILTER_PRED_INVALID)
1493 return 0;
1494
1495 /* count the children */
1496 children = count_leafs(preds, &preds[root->left]);
1497 children += count_leafs(preds, &preds[root->right]);
1498
1499 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
1500 if (!root->ops)
1501 return -ENOMEM;
1502
1503 root->val = children;
1504
1505 pred = root;
1506 do {
1507 switch (move) {
1508 case MOVE_DOWN:
1509 if (pred->left != FILTER_PRED_INVALID) {
1510 pred = &preds[pred->left];
1511 continue;
1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1533
1534 return 0;
1535}
1536
1537/*
1538 * To optimize the processing of the ops, if we have several "ors" or
1539 * "ands" together, we can put them in an array and process them all
1540 * together speeding up the filter logic.
1541 */
1542static int fold_pred_tree(struct event_filter *filter,
1543 struct filter_pred *root)
1544{
1545 struct filter_pred *preds;
1546 struct filter_pred *pred;
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
1563 /* Folded nodes are like leafs */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1590}
1591
1190static int replace_preds(struct ftrace_event_call *call, 1592static int replace_preds(struct ftrace_event_call *call,
1191 struct event_filter *filter, 1593 struct event_filter *filter,
1192 struct filter_parse_state *ps, 1594 struct filter_parse_state *ps,
@@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call,
1195{ 1597{
1196 char *operand1 = NULL, *operand2 = NULL; 1598 char *operand1 = NULL, *operand2 = NULL;
1197 struct filter_pred *pred; 1599 struct filter_pred *pred;
1600 struct filter_pred *root;
1198 struct postfix_elt *elt; 1601 struct postfix_elt *elt;
1602 struct pred_stack stack = { }; /* init to NULL */
1199 int err; 1603 int err;
1200 int n_preds = 0; 1604 int n_preds = 0;
1201 1605
1606 n_preds = count_preds(ps);
1607 if (n_preds >= MAX_FILTER_PRED) {
1608 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1609 return -ENOSPC;
1610 }
1611
1202 err = check_preds(ps); 1612 err = check_preds(ps);
1203 if (err) 1613 if (err)
1204 return err; 1614 return err;
1205 1615
1616 if (!dry_run) {
1617 err = __alloc_pred_stack(&stack, n_preds);
1618 if (err)
1619 return err;
1620 err = __alloc_preds(filter, n_preds);
1621 if (err)
1622 goto fail;
1623 }
1624
1625 n_preds = 0;
1206 list_for_each_entry(elt, &ps->postfix, list) { 1626 list_for_each_entry(elt, &ps->postfix, list) {
1207 if (elt->op == OP_NONE) { 1627 if (elt->op == OP_NONE) {
1208 if (!operand1) 1628 if (!operand1)
@@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call,
1211 operand2 = elt->operand; 1631 operand2 = elt->operand;
1212 else { 1632 else {
1213 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); 1633 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
1214 return -EINVAL; 1634 err = -EINVAL;
1635 goto fail;
1215 } 1636 }
1216 continue; 1637 continue;
1217 } 1638 }
1218 1639
1219 if (n_preds++ == MAX_FILTER_PRED) { 1640 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1220 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 1641 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1221 return -ENOSPC; 1642 err = -ENOSPC;
1643 goto fail;
1222 } 1644 }
1223 1645
1224 if (elt->op == OP_AND || elt->op == OP_OR) { 1646 if (elt->op == OP_AND || elt->op == OP_OR) {
@@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call,
1228 1650
1229 if (!operand1 || !operand2) { 1651 if (!operand1 || !operand2) {
1230 parse_error(ps, FILT_ERR_MISSING_FIELD, 0); 1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1231 return -EINVAL; 1653 err = -EINVAL;
1654 goto fail;
1232 } 1655 }
1233 1656
1234 pred = create_pred(elt->op, operand1, operand2); 1657 pred = create_pred(elt->op, operand1, operand2);
1235add_pred: 1658add_pred:
1236 if (!pred) 1659 if (!pred) {
1237 return -ENOMEM; 1660 err = -ENOMEM;
1238 err = filter_add_pred(ps, call, filter, pred, dry_run); 1661 goto fail;
1662 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1239 filter_free_pred(pred); 1664 filter_free_pred(pred);
1240 if (err) 1665 if (err)
1241 return err; 1666 goto fail;
1242 1667
1243 operand1 = operand2 = NULL; 1668 operand1 = operand2 = NULL;
1244 } 1669 }
1245 1670
1246 return 0; 1671 if (!dry_run) {
1672 /* We should have one item left on the stack */
1673 pred = __pop_pred_stack(&stack);
1674 if (!pred)
1675 return -EINVAL;
1676 /* This item is where we start from in matching */
1677 root = pred;
1678 /* Make sure the stack is empty */
1679 pred = __pop_pred_stack(&stack);
1680 if (WARN_ON(pred)) {
1681 err = -EINVAL;
1682 filter->root = NULL;
1683 goto fail;
1684 }
1685 err = check_pred_tree(filter, root);
1686 if (err)
1687 goto fail;
1688
1689 /* Optimize the tree */
1690 err = fold_pred_tree(filter, root);
1691 if (err)
1692 goto fail;
1693
1694 /* We don't set root until we know it works */
1695 barrier();
1696 filter->root = root;
1697 }
1698
1699 err = 0;
1700fail:
1701 __free_pred_stack(&stack);
1702 return err;
1247} 1703}
1248 1704
1705struct filter_list {
1706 struct list_head list;
1707 struct event_filter *filter;
1708};
1709
1249static int replace_system_preds(struct event_subsystem *system, 1710static int replace_system_preds(struct event_subsystem *system,
1250 struct filter_parse_state *ps, 1711 struct filter_parse_state *ps,
1251 char *filter_string) 1712 char *filter_string)
1252{ 1713{
1253 struct ftrace_event_call *call; 1714 struct ftrace_event_call *call;
1715 struct filter_list *filter_item;
1716 struct filter_list *tmp;
1717 LIST_HEAD(filter_list);
1254 bool fail = true; 1718 bool fail = true;
1255 int err; 1719 int err;
1256 1720
1257 list_for_each_entry(call, &ftrace_events, list) { 1721 list_for_each_entry(call, &ftrace_events, list) {
1258 struct event_filter *filter = call->filter;
1259 1722
1260 if (strcmp(call->class->system, system->name) != 0) 1723 if (strcmp(call->class->system, system->name) != 0)
1261 continue; 1724 continue;
1262 1725
1263 /* try to see if the filter can be applied */ 1726 /*
1264 err = replace_preds(call, filter, ps, filter_string, true); 1727 * Try to see if the filter can be applied
1728 * (filter arg is ignored on dry_run)
1729 */
1730 err = replace_preds(call, NULL, ps, filter_string, true);
1265 if (err) 1731 if (err)
1732 goto fail;
1733 }
1734
1735 list_for_each_entry(call, &ftrace_events, list) {
1736 struct event_filter *filter;
1737
1738 if (strcmp(call->class->system, system->name) != 0)
1266 continue; 1739 continue;
1267 1740
1268 /* really apply the filter */ 1741 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1269 filter_disable_preds(call); 1742 if (!filter_item)
1270 err = replace_preds(call, filter, ps, filter_string, false); 1743 goto fail_mem;
1744
1745 list_add_tail(&filter_item->list, &filter_list);
1746
1747 filter_item->filter = __alloc_filter();
1748 if (!filter_item->filter)
1749 goto fail_mem;
1750 filter = filter_item->filter;
1751
1752 /* Can only fail on no memory */
1753 err = replace_filter_string(filter, filter_string);
1271 if (err) 1754 if (err)
1272 filter_disable_preds(call); 1755 goto fail_mem;
1273 else { 1756
1757 err = replace_preds(call, filter, ps, filter_string, false);
1758 if (err) {
1759 filter_disable(call);
1760 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1761 append_filter_err(ps, filter);
1762 } else
1274 call->flags |= TRACE_EVENT_FL_FILTERED; 1763 call->flags |= TRACE_EVENT_FL_FILTERED;
1275 replace_filter_string(filter, filter_string); 1764 /*
1276 } 1765 * Regardless of if this returned an error, we still
1766 * replace the filter for the call.
1767 */
1768 filter = call->filter;
1769 call->filter = filter_item->filter;
1770 filter_item->filter = filter;
1771
1277 fail = false; 1772 fail = false;
1278 } 1773 }
1279 1774
1280 if (fail) { 1775 if (fail)
1281 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1776 goto fail;
1282 return -EINVAL; 1777
1778 /*
1779 * The calls can still be using the old filters.
1780 * Do a synchronize_sched() to ensure all calls are
1781 * done with them before we free them.
1782 */
1783 synchronize_sched();
1784 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1785 __free_filter(filter_item->filter);
1786 list_del(&filter_item->list);
1787 kfree(filter_item);
1283 } 1788 }
1284 return 0; 1789 return 0;
1790 fail:
1791 /* No call succeeded */
1792 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1793 list_del(&filter_item->list);
1794 kfree(filter_item);
1795 }
1796 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1797 return -EINVAL;
1798 fail_mem:
1799 /* If any call succeeded, we still need to sync */
1800 if (!fail)
1801 synchronize_sched();
1802 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1803 __free_filter(filter_item->filter);
1804 list_del(&filter_item->list);
1805 kfree(filter_item);
1806 }
1807 return -ENOMEM;
1285} 1808}
1286 1809
1287int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1810int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1288{ 1811{
1289 int err;
1290 struct filter_parse_state *ps; 1812 struct filter_parse_state *ps;
1813 struct event_filter *filter;
1814 struct event_filter *tmp;
1815 int err = 0;
1291 1816
1292 mutex_lock(&event_mutex); 1817 mutex_lock(&event_mutex);
1293 1818
1294 err = init_preds(call);
1295 if (err)
1296 goto out_unlock;
1297
1298 if (!strcmp(strstrip(filter_string), "0")) { 1819 if (!strcmp(strstrip(filter_string), "0")) {
1299 filter_disable_preds(call); 1820 filter_disable(call);
1300 remove_filter_string(call->filter); 1821 filter = call->filter;
1822 if (!filter)
1823 goto out_unlock;
1824 call->filter = NULL;
1825 /* Make sure the filter is not being used */
1826 synchronize_sched();
1827 __free_filter(filter);
1301 goto out_unlock; 1828 goto out_unlock;
1302 } 1829 }
1303 1830
@@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1306 if (!ps) 1833 if (!ps)
1307 goto out_unlock; 1834 goto out_unlock;
1308 1835
1309 filter_disable_preds(call); 1836 filter = __alloc_filter();
1310 replace_filter_string(call->filter, filter_string); 1837 if (!filter) {
1838 kfree(ps);
1839 goto out_unlock;
1840 }
1841
1842 replace_filter_string(filter, filter_string);
1311 1843
1312 parse_init(ps, filter_ops, filter_string); 1844 parse_init(ps, filter_ops, filter_string);
1313 err = filter_parse(ps); 1845 err = filter_parse(ps);
1314 if (err) { 1846 if (err) {
1315 append_filter_err(ps, call->filter); 1847 append_filter_err(ps, filter);
1316 goto out; 1848 goto out;
1317 } 1849 }
1318 1850
1319 err = replace_preds(call, call->filter, ps, filter_string, false); 1851 err = replace_preds(call, filter, ps, filter_string, false);
1320 if (err) 1852 if (err) {
1321 append_filter_err(ps, call->filter); 1853 filter_disable(call);
1322 else 1854 append_filter_err(ps, filter);
1855 } else
1323 call->flags |= TRACE_EVENT_FL_FILTERED; 1856 call->flags |= TRACE_EVENT_FL_FILTERED;
1324out: 1857out:
1858 /*
1859 * Always swap the call filter with the new filter
1860 * even if there was an error. If there was an error
1861 * in the filter, we disable the filter and show the error
1862 * string
1863 */
1864 tmp = call->filter;
1865 call->filter = filter;
1866 if (tmp) {
1867 /* Make sure the call is done with the filter */
1868 synchronize_sched();
1869 __free_filter(tmp);
1870 }
1325 filter_opstack_clear(ps); 1871 filter_opstack_clear(ps);
1326 postfix_clear(ps); 1872 postfix_clear(ps);
1327 kfree(ps); 1873 kfree(ps);
@@ -1334,18 +1880,21 @@ out_unlock:
1334int apply_subsystem_event_filter(struct event_subsystem *system, 1880int apply_subsystem_event_filter(struct event_subsystem *system,
1335 char *filter_string) 1881 char *filter_string)
1336{ 1882{
1337 int err;
1338 struct filter_parse_state *ps; 1883 struct filter_parse_state *ps;
1884 struct event_filter *filter;
1885 int err = 0;
1339 1886
1340 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1341 1888
1342 err = init_subsystem_preds(system);
1343 if (err)
1344 goto out_unlock;
1345
1346 if (!strcmp(strstrip(filter_string), "0")) { 1889 if (!strcmp(strstrip(filter_string), "0")) {
1347 filter_free_subsystem_preds(system); 1890 filter_free_subsystem_preds(system);
1348 remove_filter_string(system->filter); 1891 remove_filter_string(system->filter);
1892 filter = system->filter;
1893 system->filter = NULL;
1894 /* Ensure all filters are no longer used */
1895 synchronize_sched();
1896 filter_free_subsystem_filters(system);
1897 __free_filter(filter);
1349 goto out_unlock; 1898 goto out_unlock;
1350 } 1899 }
1351 1900
@@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1354 if (!ps) 1903 if (!ps)
1355 goto out_unlock; 1904 goto out_unlock;
1356 1905
1357 replace_filter_string(system->filter, filter_string); 1906 filter = __alloc_filter();
1907 if (!filter)
1908 goto out;
1909
1910 replace_filter_string(filter, filter_string);
1911 /*
1912 * No event actually uses the system filter
1913 * we can free it without synchronize_sched().
1914 */
1915 __free_filter(system->filter);
1916 system->filter = filter;
1358 1917
1359 parse_init(ps, filter_ops, filter_string); 1918 parse_init(ps, filter_ops, filter_string);
1360 err = filter_parse(ps); 1919 err = filter_parse(ps);
@@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event)
1384 struct event_filter *filter = event->filter; 1943 struct event_filter *filter = event->filter;
1385 1944
1386 event->filter = NULL; 1945 event->filter = NULL;
1387 __free_preds(filter); 1946 __free_filter(filter);
1388} 1947}
1389 1948
1390int ftrace_profile_set_filter(struct perf_event *event, int event_id, 1949int ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1410 if (event->filter) 1969 if (event->filter)
1411 goto out_unlock; 1970 goto out_unlock;
1412 1971
1413 filter = __alloc_preds(); 1972 filter = __alloc_filter();
1414 if (IS_ERR(filter)) { 1973 if (!filter) {
1415 err = PTR_ERR(filter); 1974 err = PTR_ERR(filter);
1416 goto out_unlock; 1975 goto out_unlock;
1417 } 1976 }
@@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1419 err = -ENOMEM; 1978 err = -ENOMEM;
1420 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 1979 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1421 if (!ps) 1980 if (!ps)
1422 goto free_preds; 1981 goto free_filter;
1423 1982
1424 parse_init(ps, filter_ops, filter_str); 1983 parse_init(ps, filter_ops, filter_str);
1425 err = filter_parse(ps); 1984 err = filter_parse(ps);
@@ -1435,9 +1994,9 @@ free_ps:
1435 postfix_clear(ps); 1994 postfix_clear(ps);
1436 kfree(ps); 1995 kfree(ps);
1437 1996
1438free_preds: 1997free_filter:
1439 if (err) 1998 if (err)
1440 __free_preds(filter); 1999 __free_filter(filter);
1441 2000
1442out_unlock: 2001out_unlock:
1443 mutex_unlock(&event_mutex); 2002 mutex_unlock(&event_mutex);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2dec9bcde8b4..8435b43b1782 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
353 kfree(data); 353 kfree(data);
354} 354}
355 355
356/* Bitfield fetch function */
357struct bitfield_fetch_param {
358 struct fetch_param orig;
359 unsigned char hi_shift;
360 unsigned char low_shift;
361};
362
363#define DEFINE_FETCH_bitfield(type) \
364static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
365 void *data, void *dest) \
366{ \
367 struct bitfield_fetch_param *bprm = data; \
368 type buf = 0; \
369 call_fetch(&bprm->orig, regs, &buf); \
370 if (buf) { \
371 buf <<= bprm->hi_shift; \
372 buf >>= bprm->low_shift; \
373 } \
374 *(type *)dest = buf; \
375}
376DEFINE_BASIC_FETCH_FUNCS(bitfield)
377#define fetch_bitfield_string NULL
378#define fetch_bitfield_string_size NULL
379
380static __kprobes void
381free_bitfield_fetch_param(struct bitfield_fetch_param *data)
382{
383 /*
384 * Don't check the bitfield itself, because this must be the
385 * last fetch function.
386 */
387 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
388 free_deref_fetch_param(data->orig.data);
389 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
390 free_symbol_cache(data->orig.data);
391 kfree(data);
392}
356/* Default (unsigned long) fetch type */ 393/* Default (unsigned long) fetch type */
357#define __DEFAULT_FETCH_TYPE(t) u##t 394#define __DEFAULT_FETCH_TYPE(t) u##t
358#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 395#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -367,6 +404,7 @@ enum {
367 FETCH_MTD_memory, 404 FETCH_MTD_memory,
368 FETCH_MTD_symbol, 405 FETCH_MTD_symbol,
369 FETCH_MTD_deref, 406 FETCH_MTD_deref,
407 FETCH_MTD_bitfield,
370 FETCH_MTD_END, 408 FETCH_MTD_END,
371}; 409};
372 410
@@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \
387ASSIGN_FETCH_FUNC(memory, ftype), \ 425ASSIGN_FETCH_FUNC(memory, ftype), \
388ASSIGN_FETCH_FUNC(symbol, ftype), \ 426ASSIGN_FETCH_FUNC(symbol, ftype), \
389ASSIGN_FETCH_FUNC(deref, ftype), \ 427ASSIGN_FETCH_FUNC(deref, ftype), \
428ASSIGN_FETCH_FUNC(bitfield, ftype), \
390 } \ 429 } \
391 } 430 }
392 431
@@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type)
430 if (!type) 469 if (!type)
431 type = DEFAULT_FETCH_TYPE_STR; 470 type = DEFAULT_FETCH_TYPE_STR;
432 471
472 /* Special case: bitfield */
473 if (*type == 'b') {
474 unsigned long bs;
475 type = strchr(type, '/');
476 if (!type)
477 goto fail;
478 type++;
479 if (strict_strtoul(type, 0, &bs))
480 goto fail;
481 switch (bs) {
482 case 8:
483 return find_fetch_type("u8");
484 case 16:
485 return find_fetch_type("u16");
486 case 32:
487 return find_fetch_type("u32");
488 case 64:
489 return find_fetch_type("u64");
490 default:
491 goto fail;
492 }
493 }
494
433 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) 495 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
434 if (strcmp(type, fetch_type_table[i].name) == 0) 496 if (strcmp(type, fetch_type_table[i].name) == 0)
435 return &fetch_type_table[i]; 497 return &fetch_type_table[i];
498fail:
436 return NULL; 499 return NULL;
437} 500}
438 501
@@ -586,7 +649,9 @@ error:
586 649
587static void free_probe_arg(struct probe_arg *arg) 650static void free_probe_arg(struct probe_arg *arg)
588{ 651{
589 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) 652 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
653 free_bitfield_fetch_param(arg->fetch.data);
654 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
590 free_deref_fetch_param(arg->fetch.data); 655 free_deref_fetch_param(arg->fetch.data);
591 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) 656 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
592 free_symbol_cache(arg->fetch.data); 657 free_symbol_cache(arg->fetch.data);
@@ -767,16 +832,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
767 } 832 }
768 break; 833 break;
769 case '+': /* deref memory */ 834 case '+': /* deref memory */
835 arg++; /* Skip '+', because strict_strtol() rejects it. */
770 case '-': 836 case '-':
771 tmp = strchr(arg, '('); 837 tmp = strchr(arg, '(');
772 if (!tmp) 838 if (!tmp)
773 break; 839 break;
774 *tmp = '\0'; 840 *tmp = '\0';
775 ret = strict_strtol(arg + 1, 0, &offset); 841 ret = strict_strtol(arg, 0, &offset);
776 if (ret) 842 if (ret)
777 break; 843 break;
778 if (arg[0] == '-')
779 offset = -offset;
780 arg = tmp + 1; 844 arg = tmp + 1;
781 tmp = strrchr(arg, ')'); 845 tmp = strrchr(arg, ')');
782 if (tmp) { 846 if (tmp) {
@@ -807,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
807 return ret; 871 return ret;
808} 872}
809 873
874#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
875
876/* Bitfield type needs to be parsed into a fetch function */
877static int __parse_bitfield_probe_arg(const char *bf,
878 const struct fetch_type *t,
879 struct fetch_param *f)
880{
881 struct bitfield_fetch_param *bprm;
882 unsigned long bw, bo;
883 char *tail;
884
885 if (*bf != 'b')
886 return 0;
887
888 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
889 if (!bprm)
890 return -ENOMEM;
891 bprm->orig = *f;
892 f->fn = t->fetch[FETCH_MTD_bitfield];
893 f->data = (void *)bprm;
894
895 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
896 if (bw == 0 || *tail != '@')
897 return -EINVAL;
898
899 bf = tail + 1;
900 bo = simple_strtoul(bf, &tail, 0);
901 if (tail == bf || *tail != '/')
902 return -EINVAL;
903
904 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
905 bprm->low_shift = bprm->hi_shift + bo;
906 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
907}
908
810/* String length checking wrapper */ 909/* String length checking wrapper */
811static int parse_probe_arg(char *arg, struct trace_probe *tp, 910static int parse_probe_arg(char *arg, struct trace_probe *tp,
812 struct probe_arg *parg, int is_return) 911 struct probe_arg *parg, int is_return)
@@ -836,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
836 parg->offset = tp->size; 935 parg->offset = tp->size;
837 tp->size += parg->type->size; 936 tp->size += parg->type->size;
838 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 937 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
938 if (ret >= 0 && t != NULL)
939 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
839 if (ret >= 0) { 940 if (ret >= 0) {
840 parg->fetch_size.fn = get_fetch_size_function(parg->type, 941 parg->fetch_size.fn = get_fetch_size_function(parg->type,
841 parg->fetch.fn); 942 parg->fetch.fn);
@@ -1130,7 +1231,7 @@ static int command_trace_probe(const char *buf)
1130 return ret; 1231 return ret;
1131} 1232}
1132 1233
1133#define WRITE_BUFSIZE 128 1234#define WRITE_BUFSIZE 4096
1134 1235
1135static ssize_t probes_write(struct file *file, const char __user *buffer, 1236static ssize_t probes_write(struct file *file, const char __user *buffer,
1136 size_t count, loff_t *ppos) 1237 size_t count, loff_t *ppos)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02272baa2206..456be9063c2d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -529,24 +529,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
529 * @entry: The trace entry field from the ring buffer 529 * @entry: The trace entry field from the ring buffer
530 * 530 *
531 * Prints the generic fields of irqs off, in hard or softirq, preempt 531 * Prints the generic fields of irqs off, in hard or softirq, preempt
532 * count and lock depth. 532 * count.
533 */ 533 */
534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) 534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
535{ 535{
536 int hardirq, softirq; 536 char hardsoft_irq;
537 char need_resched;
538 char irqs_off;
539 int hardirq;
540 int softirq;
537 int ret; 541 int ret;
538 542
539 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 543 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
540 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 544 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
541 545
546 irqs_off =
547 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
548 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
549 '.';
550 need_resched =
551 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
552 hardsoft_irq =
553 (hardirq && softirq) ? 'H' :
554 hardirq ? 'h' :
555 softirq ? 's' :
556 '.';
557
542 if (!trace_seq_printf(s, "%c%c%c", 558 if (!trace_seq_printf(s, "%c%c%c",
543 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 559 irqs_off, need_resched, hardsoft_irq))
544 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
545 'X' : '.',
546 (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
547 'N' : '.',
548 (hardirq && softirq) ? 'H' :
549 hardirq ? 'h' : softirq ? 's' : '.'))
550 return 0; 560 return 0;
551 561
552 if (entry->preempt_count) 562 if (entry->preempt_count)
@@ -554,13 +564,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
554 else 564 else
555 ret = trace_seq_putc(s, '.'); 565 ret = trace_seq_putc(s, '.');
556 566
557 if (!ret) 567 return ret;
558 return 0;
559
560 if (entry->lock_depth < 0)
561 return trace_seq_putc(s, '.');
562
563 return trace_seq_printf(s, "%d", entry->lock_depth);
564} 568}
565 569
566static int 570static int
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8f758d070c43..7e62c0a18456 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
247 ctx_trace = tr; 247 ctx_trace = tr;
248} 248}
249 249
250static void stop_sched_trace(struct trace_array *tr)
251{
252 tracing_stop_sched_switch_record();
253}
254
255static int sched_switch_trace_init(struct trace_array *tr)
256{
257 ctx_trace = tr;
258 tracing_reset_online_cpus(tr);
259 tracing_start_sched_switch_record();
260 return 0;
261}
262
263static void sched_switch_trace_reset(struct trace_array *tr)
264{
265 if (sched_ref)
266 stop_sched_trace(tr);
267}
268
269static void sched_switch_trace_start(struct trace_array *tr)
270{
271 sched_stopped = 0;
272}
273
274static void sched_switch_trace_stop(struct trace_array *tr)
275{
276 sched_stopped = 1;
277}
278
279static struct tracer sched_switch_trace __read_mostly =
280{
281 .name = "sched_switch",
282 .init = sched_switch_trace_init,
283 .reset = sched_switch_trace_reset,
284 .start = sched_switch_trace_start,
285 .stop = sched_switch_trace_stop,
286 .wait_pipe = poll_wait_pipe,
287#ifdef CONFIG_FTRACE_SELFTEST
288 .selftest = trace_selftest_startup_sched_switch,
289#endif
290};
291
292__init static int init_sched_switch_trace(void)
293{
294 return register_tracer(&sched_switch_trace);
295}
296device_initcall(init_sched_switch_trace);
297
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5c9fe08d2093..ee7b5a0bb9f8 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[];
60 60
61static struct syscall_metadata **syscalls_metadata; 61static struct syscall_metadata **syscalls_metadata;
62 62
63#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
64static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
65{
66 /*
67 * Only compare after the "sys" prefix. Archs that use
68 * syscall wrappers may have syscalls symbols aliases prefixed
69 * with "SyS" instead of "sys", leading to an unwanted
70 * mismatch.
71 */
72 return !strcmp(sym + 3, name + 3);
73}
74#endif
75
63static __init struct syscall_metadata * 76static __init struct syscall_metadata *
64find_syscall_meta(unsigned long syscall) 77find_syscall_meta(unsigned long syscall)
65{ 78{
@@ -72,14 +85,11 @@ find_syscall_meta(unsigned long syscall)
72 stop = __stop_syscalls_metadata; 85 stop = __stop_syscalls_metadata;
73 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 86 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
74 87
88 if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
89 return NULL;
90
75 for ( ; start < stop; start++) { 91 for ( ; start < stop; start++) {
76 /* 92 if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
77 * Only compare after the "sys" prefix. Archs that use
78 * syscall wrappers may have syscalls symbols aliases prefixed
79 * with "SyS" instead of "sys", leading to an unwanted
80 * mismatch.
81 */
82 if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
83 return *start; 93 return *start;
84 } 94 }
85 return NULL; 95 return NULL;
@@ -359,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
359 int num; 369 int num;
360 370
361 num = ((struct syscall_metadata *)call->data)->syscall_nr; 371 num = ((struct syscall_metadata *)call->data)->syscall_nr;
362 if (num < 0 || num >= NR_syscalls) 372 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
363 return -ENOSYS; 373 return -ENOSYS;
364 mutex_lock(&syscall_trace_lock); 374 mutex_lock(&syscall_trace_lock);
365 if (!sys_refcount_enter) 375 if (!sys_refcount_enter)
@@ -377,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
377 int num; 387 int num;
378 388
379 num = ((struct syscall_metadata *)call->data)->syscall_nr; 389 num = ((struct syscall_metadata *)call->data)->syscall_nr;
380 if (num < 0 || num >= NR_syscalls) 390 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
381 return; 391 return;
382 mutex_lock(&syscall_trace_lock); 392 mutex_lock(&syscall_trace_lock);
383 sys_refcount_enter--; 393 sys_refcount_enter--;
@@ -393,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
393 int num; 403 int num;
394 404
395 num = ((struct syscall_metadata *)call->data)->syscall_nr; 405 num = ((struct syscall_metadata *)call->data)->syscall_nr;
396 if (num < 0 || num >= NR_syscalls) 406 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
397 return -ENOSYS; 407 return -ENOSYS;
398 mutex_lock(&syscall_trace_lock); 408 mutex_lock(&syscall_trace_lock);
399 if (!sys_refcount_exit) 409 if (!sys_refcount_exit)
@@ -411,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
411 int num; 421 int num;
412 422
413 num = ((struct syscall_metadata *)call->data)->syscall_nr; 423 num = ((struct syscall_metadata *)call->data)->syscall_nr;
414 if (num < 0 || num >= NR_syscalls) 424 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
415 return; 425 return;
416 mutex_lock(&syscall_trace_lock); 426 mutex_lock(&syscall_trace_lock);
417 sys_refcount_exit--; 427 sys_refcount_exit--;
@@ -424,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
424int init_syscall_trace(struct ftrace_event_call *call) 434int init_syscall_trace(struct ftrace_event_call *call)
425{ 435{
426 int id; 436 int id;
437 int num;
438
439 num = ((struct syscall_metadata *)call->data)->syscall_nr;
440 if (num < 0 || num >= NR_syscalls) {
441 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
442 ((struct syscall_metadata *)call->data)->name);
443 return -ENOSYS;
444 }
427 445
428 if (set_syscall_print_fmt(call) < 0) 446 if (set_syscall_print_fmt(call) < 0)
429 return -ENOMEM; 447 return -ENOMEM;
@@ -438,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call)
438 return id; 456 return id;
439} 457}
440 458
441unsigned long __init arch_syscall_addr(int nr) 459unsigned long __init __weak arch_syscall_addr(int nr)
442{ 460{
443 return (unsigned long)sys_call_table[nr]; 461 return (unsigned long)sys_call_table[nr];
444} 462}