Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/syscall.c          |   7
-rw-r--r--  kernel/events/core.c          | 752
-rw-r--r--  kernel/events/hw_breakpoint.c |   8
-rw-r--r--  kernel/events/internal.h      |  33
-rw-r--r--  kernel/events/ring_buffer.c   | 327
-rw-r--r--  kernel/trace/Kconfig          |   8
-rw-r--r--  kernel/trace/Makefile         |   1
-rw-r--r--  kernel/trace/bpf_trace.c      | 222
-rw-r--r--  kernel/trace/trace_kprobe.c   |  10
-rw-r--r--  kernel/trace/trace_uprobe.c   |  10
-rw-r--r--  kernel/watchdog.c             |  28
11 files changed, 1226 insertions(+), 180 deletions(-)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 536edc2be307..504c10b990ef 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/license.h>
 #include <linux/filter.h>
+#include <linux/version.h>
 
 static LIST_HEAD(bpf_map_types);
 
@@ -467,7 +468,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD log_buf
+#define BPF_PROG_LOAD_LAST_FIELD kern_version
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -492,6 +493,10 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (attr->insn_cnt >= BPF_MAXINSNS)
 		return -EINVAL;
 
+	if (type == BPF_PROG_TYPE_KPROBE &&
+	    attr->kern_version != LINUX_VERSION_CODE)
+		return -EINVAL;
+
 	/* plain bpf_prog allocation */
 	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
 	if (!prog)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2fabc0627165..06917d537302 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -34,14 +34,16 @@
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/anon_inodes.h> 35#include <linux/anon_inodes.h>
36#include <linux/kernel_stat.h> 36#include <linux/kernel_stat.h>
37#include <linux/cgroup.h>
37#include <linux/perf_event.h> 38#include <linux/perf_event.h>
38#include <linux/ftrace_event.h> 39#include <linux/ftrace_event.h>
39#include <linux/hw_breakpoint.h> 40#include <linux/hw_breakpoint.h>
40#include <linux/mm_types.h> 41#include <linux/mm_types.h>
41#include <linux/cgroup.h>
42#include <linux/module.h> 42#include <linux/module.h>
43#include <linux/mman.h> 43#include <linux/mman.h>
44#include <linux/compat.h> 44#include <linux/compat.h>
45#include <linux/bpf.h>
46#include <linux/filter.h>
45 47
46#include "internal.h" 48#include "internal.h"
47 49
@@ -153,7 +155,7 @@ enum event_type_t {
153 */ 155 */
154struct static_key_deferred perf_sched_events __read_mostly; 156struct static_key_deferred perf_sched_events __read_mostly;
155static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 157static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
156static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); 158static DEFINE_PER_CPU(int, perf_sched_cb_usages);
157 159
158static atomic_t nr_mmap_events __read_mostly; 160static atomic_t nr_mmap_events __read_mostly;
159static atomic_t nr_comm_events __read_mostly; 161static atomic_t nr_comm_events __read_mostly;
@@ -327,6 +329,11 @@ static inline u64 perf_clock(void)
327 return local_clock(); 329 return local_clock();
328} 330}
329 331
332static inline u64 perf_event_clock(struct perf_event *event)
333{
334 return event->clock();
335}
336
330static inline struct perf_cpu_context * 337static inline struct perf_cpu_context *
331__get_cpu_context(struct perf_event_context *ctx) 338__get_cpu_context(struct perf_event_context *ctx)
332{ 339{
@@ -351,32 +358,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
351 358
352#ifdef CONFIG_CGROUP_PERF 359#ifdef CONFIG_CGROUP_PERF
353 360
354/*
355 * perf_cgroup_info keeps track of time_enabled for a cgroup.
356 * This is a per-cpu dynamically allocated data structure.
357 */
358struct perf_cgroup_info {
359 u64 time;
360 u64 timestamp;
361};
362
363struct perf_cgroup {
364 struct cgroup_subsys_state css;
365 struct perf_cgroup_info __percpu *info;
366};
367
368/*
369 * Must ensure cgroup is pinned (css_get) before calling
370 * this function. In other words, we cannot call this function
371 * if there is no cgroup event for the current CPU context.
372 */
373static inline struct perf_cgroup *
374perf_cgroup_from_task(struct task_struct *task)
375{
376 return container_of(task_css(task, perf_event_cgrp_id),
377 struct perf_cgroup, css);
378}
379
380static inline bool 361static inline bool
381perf_cgroup_match(struct perf_event *event) 362perf_cgroup_match(struct perf_event *event)
382{ 363{
@@ -905,6 +886,15 @@ static void get_ctx(struct perf_event_context *ctx)
905 WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); 886 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
906} 887}
907 888
889static void free_ctx(struct rcu_head *head)
890{
891 struct perf_event_context *ctx;
892
893 ctx = container_of(head, struct perf_event_context, rcu_head);
894 kfree(ctx->task_ctx_data);
895 kfree(ctx);
896}
897
908static void put_ctx(struct perf_event_context *ctx) 898static void put_ctx(struct perf_event_context *ctx)
909{ 899{
910 if (atomic_dec_and_test(&ctx->refcount)) { 900 if (atomic_dec_and_test(&ctx->refcount)) {
@@ -912,7 +902,7 @@ static void put_ctx(struct perf_event_context *ctx)
912 put_ctx(ctx->parent_ctx); 902 put_ctx(ctx->parent_ctx);
913 if (ctx->task) 903 if (ctx->task)
914 put_task_struct(ctx->task); 904 put_task_struct(ctx->task);
915 kfree_rcu(ctx, rcu_head); 905 call_rcu(&ctx->rcu_head, free_ctx);
916 } 906 }
917} 907}
918 908
@@ -1239,9 +1229,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1239 if (is_cgroup_event(event)) 1229 if (is_cgroup_event(event))
1240 ctx->nr_cgroups++; 1230 ctx->nr_cgroups++;
1241 1231
1242 if (has_branch_stack(event))
1243 ctx->nr_branch_stack++;
1244
1245 list_add_rcu(&event->event_entry, &ctx->event_list); 1232 list_add_rcu(&event->event_entry, &ctx->event_list);
1246 ctx->nr_events++; 1233 ctx->nr_events++;
1247 if (event->attr.inherit_stat) 1234 if (event->attr.inherit_stat)
@@ -1408,9 +1395,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1408 cpuctx->cgrp = NULL; 1395 cpuctx->cgrp = NULL;
1409 } 1396 }
1410 1397
1411 if (has_branch_stack(event))
1412 ctx->nr_branch_stack--;
1413
1414 ctx->nr_events--; 1398 ctx->nr_events--;
1415 if (event->attr.inherit_stat) 1399 if (event->attr.inherit_stat)
1416 ctx->nr_stat--; 1400 ctx->nr_stat--;
@@ -1847,6 +1831,7 @@ static void perf_set_shadow_time(struct perf_event *event,
1847#define MAX_INTERRUPTS (~0ULL) 1831#define MAX_INTERRUPTS (~0ULL)
1848 1832
1849static void perf_log_throttle(struct perf_event *event, int enable); 1833static void perf_log_throttle(struct perf_event *event, int enable);
1834static void perf_log_itrace_start(struct perf_event *event);
1850 1835
1851static int 1836static int
1852event_sched_in(struct perf_event *event, 1837event_sched_in(struct perf_event *event,
@@ -1881,6 +1866,12 @@ event_sched_in(struct perf_event *event,
1881 1866
1882 perf_pmu_disable(event->pmu); 1867 perf_pmu_disable(event->pmu);
1883 1868
1869 event->tstamp_running += tstamp - event->tstamp_stopped;
1870
1871 perf_set_shadow_time(event, ctx, tstamp);
1872
1873 perf_log_itrace_start(event);
1874
1884 if (event->pmu->add(event, PERF_EF_START)) { 1875 if (event->pmu->add(event, PERF_EF_START)) {
1885 event->state = PERF_EVENT_STATE_INACTIVE; 1876 event->state = PERF_EVENT_STATE_INACTIVE;
1886 event->oncpu = -1; 1877 event->oncpu = -1;
@@ -1888,10 +1879,6 @@ event_sched_in(struct perf_event *event,
1888 goto out; 1879 goto out;
1889 } 1880 }
1890 1881
1891 event->tstamp_running += tstamp - event->tstamp_stopped;
1892
1893 perf_set_shadow_time(event, ctx, tstamp);
1894
1895 if (!is_software_event(event)) 1882 if (!is_software_event(event))
1896 cpuctx->active_oncpu++; 1883 cpuctx->active_oncpu++;
1897 if (!ctx->nr_active++) 1884 if (!ctx->nr_active++)
@@ -2559,6 +2546,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2559 next->perf_event_ctxp[ctxn] = ctx; 2546 next->perf_event_ctxp[ctxn] = ctx;
2560 ctx->task = next; 2547 ctx->task = next;
2561 next_ctx->task = task; 2548 next_ctx->task = task;
2549
2550 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2551
2562 do_switch = 0; 2552 do_switch = 0;
2563 2553
2564 perf_event_sync_stat(ctx, next_ctx); 2554 perf_event_sync_stat(ctx, next_ctx);
@@ -2577,6 +2567,56 @@ unlock:
2577 } 2567 }
2578} 2568}
2579 2569
2570void perf_sched_cb_dec(struct pmu *pmu)
2571{
2572 this_cpu_dec(perf_sched_cb_usages);
2573}
2574
2575void perf_sched_cb_inc(struct pmu *pmu)
2576{
2577 this_cpu_inc(perf_sched_cb_usages);
2578}
2579
2580/*
2581 * This function provides the context switch callback to the lower code
2582 * layer. It is invoked ONLY when the context switch callback is enabled.
2583 */
2584static void perf_pmu_sched_task(struct task_struct *prev,
2585 struct task_struct *next,
2586 bool sched_in)
2587{
2588 struct perf_cpu_context *cpuctx;
2589 struct pmu *pmu;
2590 unsigned long flags;
2591
2592 if (prev == next)
2593 return;
2594
2595 local_irq_save(flags);
2596
2597 rcu_read_lock();
2598
2599 list_for_each_entry_rcu(pmu, &pmus, entry) {
2600 if (pmu->sched_task) {
2601 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2602
2603 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2604
2605 perf_pmu_disable(pmu);
2606
2607 pmu->sched_task(cpuctx->task_ctx, sched_in);
2608
2609 perf_pmu_enable(pmu);
2610
2611 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2612 }
2613 }
2614
2615 rcu_read_unlock();
2616
2617 local_irq_restore(flags);
2618}
2619
2580#define for_each_task_context_nr(ctxn) \ 2620#define for_each_task_context_nr(ctxn) \
2581 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) 2621 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2582 2622
@@ -2596,6 +2636,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
2596{ 2636{
2597 int ctxn; 2637 int ctxn;
2598 2638
2639 if (__this_cpu_read(perf_sched_cb_usages))
2640 perf_pmu_sched_task(task, next, false);
2641
2599 for_each_task_context_nr(ctxn) 2642 for_each_task_context_nr(ctxn)
2600 perf_event_context_sched_out(task, ctxn, next); 2643 perf_event_context_sched_out(task, ctxn, next);
2601 2644
@@ -2755,64 +2798,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2755} 2798}
2756 2799
2757/* 2800/*
2758 * When sampling the branck stack in system-wide, it may be necessary
2759 * to flush the stack on context switch. This happens when the branch
2760 * stack does not tag its entries with the pid of the current task.
2761 * Otherwise it becomes impossible to associate a branch entry with a
2762 * task. This ambiguity is more likely to appear when the branch stack
2763 * supports priv level filtering and the user sets it to monitor only
2764 * at the user level (which could be a useful measurement in system-wide
2765 * mode). In that case, the risk is high of having a branch stack with
2766 * branch from multiple tasks. Flushing may mean dropping the existing
2767 * entries or stashing them somewhere in the PMU specific code layer.
2768 *
2769 * This function provides the context switch callback to the lower code
2770 * layer. It is invoked ONLY when there is at least one system-wide context
2771 * with at least one active event using taken branch sampling.
2772 */
2773static void perf_branch_stack_sched_in(struct task_struct *prev,
2774 struct task_struct *task)
2775{
2776 struct perf_cpu_context *cpuctx;
2777 struct pmu *pmu;
2778 unsigned long flags;
2779
2780 /* no need to flush branch stack if not changing task */
2781 if (prev == task)
2782 return;
2783
2784 local_irq_save(flags);
2785
2786 rcu_read_lock();
2787
2788 list_for_each_entry_rcu(pmu, &pmus, entry) {
2789 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2790
2791 /*
2792 * check if the context has at least one
2793 * event using PERF_SAMPLE_BRANCH_STACK
2794 */
2795 if (cpuctx->ctx.nr_branch_stack > 0
2796 && pmu->flush_branch_stack) {
2797
2798 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2799
2800 perf_pmu_disable(pmu);
2801
2802 pmu->flush_branch_stack();
2803
2804 perf_pmu_enable(pmu);
2805
2806 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2807 }
2808 }
2809
2810 rcu_read_unlock();
2811
2812 local_irq_restore(flags);
2813}
2814
2815/*
2816 * Called from scheduler to add the events of the current task 2801 * Called from scheduler to add the events of the current task
2817 * with interrupts disabled. 2802 * with interrupts disabled.
2818 * 2803 *
@@ -2844,9 +2829,8 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2844 if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) 2829 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2845 perf_cgroup_sched_in(prev, task); 2830 perf_cgroup_sched_in(prev, task);
2846 2831
2847 /* check for system-wide branch_stack events */ 2832 if (__this_cpu_read(perf_sched_cb_usages))
2848 if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) 2833 perf_pmu_sched_task(prev, task, true);
2849 perf_branch_stack_sched_in(prev, task);
2850} 2834}
2851 2835
2852static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2836static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -3220,7 +3204,10 @@ static void __perf_event_read(void *info)
3220 3204
3221static inline u64 perf_event_count(struct perf_event *event) 3205static inline u64 perf_event_count(struct perf_event *event)
3222{ 3206{
3223 return local64_read(&event->count) + atomic64_read(&event->child_count); 3207 if (event->pmu->count)
3208 return event->pmu->count(event);
3209
3210 return __perf_event_count(event);
3224} 3211}
3225 3212
3226static u64 perf_event_read(struct perf_event *event) 3213static u64 perf_event_read(struct perf_event *event)
@@ -3321,12 +3308,15 @@ errout:
3321 * Returns a matching context with refcount and pincount. 3308 * Returns a matching context with refcount and pincount.
3322 */ 3309 */
3323static struct perf_event_context * 3310static struct perf_event_context *
3324find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 3311find_get_context(struct pmu *pmu, struct task_struct *task,
3312 struct perf_event *event)
3325{ 3313{
3326 struct perf_event_context *ctx, *clone_ctx = NULL; 3314 struct perf_event_context *ctx, *clone_ctx = NULL;
3327 struct perf_cpu_context *cpuctx; 3315 struct perf_cpu_context *cpuctx;
3316 void *task_ctx_data = NULL;
3328 unsigned long flags; 3317 unsigned long flags;
3329 int ctxn, err; 3318 int ctxn, err;
3319 int cpu = event->cpu;
3330 3320
3331 if (!task) { 3321 if (!task) {
3332 /* Must be root to operate on a CPU event: */ 3322 /* Must be root to operate on a CPU event: */
@@ -3354,11 +3344,24 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
3354 if (ctxn < 0) 3344 if (ctxn < 0)
3355 goto errout; 3345 goto errout;
3356 3346
3347 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3348 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3349 if (!task_ctx_data) {
3350 err = -ENOMEM;
3351 goto errout;
3352 }
3353 }
3354
3357retry: 3355retry:
3358 ctx = perf_lock_task_context(task, ctxn, &flags); 3356 ctx = perf_lock_task_context(task, ctxn, &flags);
3359 if (ctx) { 3357 if (ctx) {
3360 clone_ctx = unclone_ctx(ctx); 3358 clone_ctx = unclone_ctx(ctx);
3361 ++ctx->pin_count; 3359 ++ctx->pin_count;
3360
3361 if (task_ctx_data && !ctx->task_ctx_data) {
3362 ctx->task_ctx_data = task_ctx_data;
3363 task_ctx_data = NULL;
3364 }
3362 raw_spin_unlock_irqrestore(&ctx->lock, flags); 3365 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3363 3366
3364 if (clone_ctx) 3367 if (clone_ctx)
@@ -3369,6 +3372,11 @@ retry:
3369 if (!ctx) 3372 if (!ctx)
3370 goto errout; 3373 goto errout;
3371 3374
3375 if (task_ctx_data) {
3376 ctx->task_ctx_data = task_ctx_data;
3377 task_ctx_data = NULL;
3378 }
3379
3372 err = 0; 3380 err = 0;
3373 mutex_lock(&task->perf_event_mutex); 3381 mutex_lock(&task->perf_event_mutex);
3374 /* 3382 /*
@@ -3395,13 +3403,16 @@ retry:
3395 } 3403 }
3396 } 3404 }
3397 3405
3406 kfree(task_ctx_data);
3398 return ctx; 3407 return ctx;
3399 3408
3400errout: 3409errout:
3410 kfree(task_ctx_data);
3401 return ERR_PTR(err); 3411 return ERR_PTR(err);
3402} 3412}
3403 3413
3404static void perf_event_free_filter(struct perf_event *event); 3414static void perf_event_free_filter(struct perf_event *event);
3415static void perf_event_free_bpf_prog(struct perf_event *event);
3405 3416
3406static void free_event_rcu(struct rcu_head *head) 3417static void free_event_rcu(struct rcu_head *head)
3407{ 3418{
@@ -3411,10 +3422,10 @@ static void free_event_rcu(struct rcu_head *head)
3411 if (event->ns) 3422 if (event->ns)
3412 put_pid_ns(event->ns); 3423 put_pid_ns(event->ns);
3413 perf_event_free_filter(event); 3424 perf_event_free_filter(event);
3425 perf_event_free_bpf_prog(event);
3414 kfree(event); 3426 kfree(event);
3415} 3427}
3416 3428
3417static void ring_buffer_put(struct ring_buffer *rb);
3418static void ring_buffer_attach(struct perf_event *event, 3429static void ring_buffer_attach(struct perf_event *event,
3419 struct ring_buffer *rb); 3430 struct ring_buffer *rb);
3420 3431
@@ -3423,10 +3434,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
3423 if (event->parent) 3434 if (event->parent)
3424 return; 3435 return;
3425 3436
3426 if (has_branch_stack(event)) {
3427 if (!(event->attach_state & PERF_ATTACH_TASK))
3428 atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
3429 }
3430 if (is_cgroup_event(event)) 3437 if (is_cgroup_event(event))
3431 atomic_dec(&per_cpu(perf_cgroup_events, cpu)); 3438 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3432} 3439}
@@ -3454,6 +3461,91 @@ static void unaccount_event(struct perf_event *event)
3454 unaccount_event_cpu(event, event->cpu); 3461 unaccount_event_cpu(event, event->cpu);
3455} 3462}
3456 3463
3464/*
3465 * The following implement mutual exclusion of events on "exclusive" pmus
3466 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
3467 * at a time, so we disallow creating events that might conflict, namely:
3468 *
3469 * 1) cpu-wide events in the presence of per-task events,
3470 * 2) per-task events in the presence of cpu-wide events,
3471 * 3) two matching events on the same context.
3472 *
3473 * The former two cases are handled in the allocation path (perf_event_alloc(),
3474 * __free_event()), the latter -- before the first perf_install_in_context().
3475 */
3476static int exclusive_event_init(struct perf_event *event)
3477{
3478 struct pmu *pmu = event->pmu;
3479
3480 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3481 return 0;
3482
3483 /*
3484 * Prevent co-existence of per-task and cpu-wide events on the
3485 * same exclusive pmu.
3486 *
3487 * Negative pmu::exclusive_cnt means there are cpu-wide
3488 * events on this "exclusive" pmu, positive means there are
3489 * per-task events.
3490 *
3491 * Since this is called in perf_event_alloc() path, event::ctx
3492 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
3493 * to mean "per-task event", because unlike other attach states it
3494 * never gets cleared.
3495 */
3496 if (event->attach_state & PERF_ATTACH_TASK) {
3497 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3498 return -EBUSY;
3499 } else {
3500 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3501 return -EBUSY;
3502 }
3503
3504 return 0;
3505}
3506
3507static void exclusive_event_destroy(struct perf_event *event)
3508{
3509 struct pmu *pmu = event->pmu;
3510
3511 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3512 return;
3513
3514 /* see comment in exclusive_event_init() */
3515 if (event->attach_state & PERF_ATTACH_TASK)
3516 atomic_dec(&pmu->exclusive_cnt);
3517 else
3518 atomic_inc(&pmu->exclusive_cnt);
3519}
3520
3521static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3522{
3523 if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
3524 (e1->cpu == e2->cpu ||
3525 e1->cpu == -1 ||
3526 e2->cpu == -1))
3527 return true;
3528 return false;
3529}
3530
3531/* Called under the same ctx::mutex as perf_install_in_context() */
3532static bool exclusive_event_installable(struct perf_event *event,
3533 struct perf_event_context *ctx)
3534{
3535 struct perf_event *iter_event;
3536 struct pmu *pmu = event->pmu;
3537
3538 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3539 return true;
3540
3541 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
3542 if (exclusive_event_match(iter_event, event))
3543 return false;
3544 }
3545
3546 return true;
3547}
3548
3457static void __free_event(struct perf_event *event) 3549static void __free_event(struct perf_event *event)
3458{ 3550{
3459 if (!event->parent) { 3551 if (!event->parent) {
@@ -3467,8 +3559,10 @@ static void __free_event(struct perf_event *event)
3467 if (event->ctx) 3559 if (event->ctx)
3468 put_ctx(event->ctx); 3560 put_ctx(event->ctx);
3469 3561
3470 if (event->pmu) 3562 if (event->pmu) {
3563 exclusive_event_destroy(event);
3471 module_put(event->pmu->module); 3564 module_put(event->pmu->module);
3565 }
3472 3566
3473 call_rcu(&event->rcu_head, free_event_rcu); 3567 call_rcu(&event->rcu_head, free_event_rcu);
3474} 3568}
@@ -3927,6 +4021,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
3927static int perf_event_set_output(struct perf_event *event, 4021static int perf_event_set_output(struct perf_event *event,
3928 struct perf_event *output_event); 4022 struct perf_event *output_event);
3929static int perf_event_set_filter(struct perf_event *event, void __user *arg); 4023static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4024static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
3930 4025
3931static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) 4026static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
3932{ 4027{
@@ -3980,6 +4075,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
3980 case PERF_EVENT_IOC_SET_FILTER: 4075 case PERF_EVENT_IOC_SET_FILTER:
3981 return perf_event_set_filter(event, (void __user *)arg); 4076 return perf_event_set_filter(event, (void __user *)arg);
3982 4077
4078 case PERF_EVENT_IOC_SET_BPF:
4079 return perf_event_set_bpf_prog(event, arg);
4080
3983 default: 4081 default:
3984 return -ENOTTY; 4082 return -ENOTTY;
3985 } 4083 }
@@ -4096,6 +4194,8 @@ static void perf_event_init_userpage(struct perf_event *event)
4096 /* Allow new userspace to detect that bit 0 is deprecated */ 4194 /* Allow new userspace to detect that bit 0 is deprecated */
4097 userpg->cap_bit0_is_deprecated = 1; 4195 userpg->cap_bit0_is_deprecated = 1;
4098 userpg->size = offsetof(struct perf_event_mmap_page, __reserved); 4196 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4197 userpg->data_offset = PAGE_SIZE;
4198 userpg->data_size = perf_data_size(rb);
4099 4199
4100unlock: 4200unlock:
4101 rcu_read_unlock(); 4201 rcu_read_unlock();
@@ -4263,7 +4363,7 @@ static void rb_free_rcu(struct rcu_head *rcu_head)
4263 rb_free(rb); 4363 rb_free(rb);
4264} 4364}
4265 4365
4266static struct ring_buffer *ring_buffer_get(struct perf_event *event) 4366struct ring_buffer *ring_buffer_get(struct perf_event *event)
4267{ 4367{
4268 struct ring_buffer *rb; 4368 struct ring_buffer *rb;
4269 4369
@@ -4278,7 +4378,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
4278 return rb; 4378 return rb;
4279} 4379}
4280 4380
4281static void ring_buffer_put(struct ring_buffer *rb) 4381void ring_buffer_put(struct ring_buffer *rb)
4282{ 4382{
4283 if (!atomic_dec_and_test(&rb->refcount)) 4383 if (!atomic_dec_and_test(&rb->refcount))
4284 return; 4384 return;
@@ -4295,6 +4395,9 @@ static void perf_mmap_open(struct vm_area_struct *vma)
4295 atomic_inc(&event->mmap_count); 4395 atomic_inc(&event->mmap_count);
4296 atomic_inc(&event->rb->mmap_count); 4396 atomic_inc(&event->rb->mmap_count);
4297 4397
4398 if (vma->vm_pgoff)
4399 atomic_inc(&event->rb->aux_mmap_count);
4400
4298 if (event->pmu->event_mapped) 4401 if (event->pmu->event_mapped)
4299 event->pmu->event_mapped(event); 4402 event->pmu->event_mapped(event);
4300} 4403}
@@ -4319,6 +4422,20 @@ static void perf_mmap_close(struct vm_area_struct *vma)
4319 if (event->pmu->event_unmapped) 4422 if (event->pmu->event_unmapped)
4320 event->pmu->event_unmapped(event); 4423 event->pmu->event_unmapped(event);
4321 4424
4425 /*
4426 * rb->aux_mmap_count will always drop before rb->mmap_count and
4427 * event->mmap_count, so it is ok to use event->mmap_mutex to
4428 * serialize with perf_mmap here.
4429 */
4430 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
4431 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
4432 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
4433 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
4434
4435 rb_free_aux(rb);
4436 mutex_unlock(&event->mmap_mutex);
4437 }
4438
4322 atomic_dec(&rb->mmap_count); 4439 atomic_dec(&rb->mmap_count);
4323 4440
4324 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 4441 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4392,7 +4509,7 @@ out_put:
4392 4509
4393static const struct vm_operations_struct perf_mmap_vmops = { 4510static const struct vm_operations_struct perf_mmap_vmops = {
4394 .open = perf_mmap_open, 4511 .open = perf_mmap_open,
4395 .close = perf_mmap_close, 4512 .close = perf_mmap_close, /* non mergable */
4396 .fault = perf_mmap_fault, 4513 .fault = perf_mmap_fault,
4397 .page_mkwrite = perf_mmap_fault, 4514 .page_mkwrite = perf_mmap_fault,
4398}; 4515};
@@ -4403,10 +4520,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4403 unsigned long user_locked, user_lock_limit; 4520 unsigned long user_locked, user_lock_limit;
4404 struct user_struct *user = current_user(); 4521 struct user_struct *user = current_user();
4405 unsigned long locked, lock_limit; 4522 unsigned long locked, lock_limit;
4406 struct ring_buffer *rb; 4523 struct ring_buffer *rb = NULL;
4407 unsigned long vma_size; 4524 unsigned long vma_size;
4408 unsigned long nr_pages; 4525 unsigned long nr_pages;
4409 long user_extra, extra; 4526 long user_extra = 0, extra = 0;
4410 int ret = 0, flags = 0; 4527 int ret = 0, flags = 0;
4411 4528
4412 /* 4529 /*
@@ -4421,7 +4538,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4421 return -EINVAL; 4538 return -EINVAL;
4422 4539
4423 vma_size = vma->vm_end - vma->vm_start; 4540 vma_size = vma->vm_end - vma->vm_start;
4424 nr_pages = (vma_size / PAGE_SIZE) - 1; 4541
4542 if (vma->vm_pgoff == 0) {
4543 nr_pages = (vma_size / PAGE_SIZE) - 1;
4544 } else {
4545 /*
4546 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
4547 * mapped, all subsequent mappings should have the same size
4548 * and offset. Must be above the normal perf buffer.
4549 */
4550 u64 aux_offset, aux_size;
4551
4552 if (!event->rb)
4553 return -EINVAL;
4554
4555 nr_pages = vma_size / PAGE_SIZE;
4556
4557 mutex_lock(&event->mmap_mutex);
4558 ret = -EINVAL;
4559
4560 rb = event->rb;
4561 if (!rb)
4562 goto aux_unlock;
4563
4564 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
4565 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
4566
4567 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
4568 goto aux_unlock;
4569
4570 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
4571 goto aux_unlock;
4572
4573 /* already mapped with a different offset */
4574 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
4575 goto aux_unlock;
4576
4577 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
4578 goto aux_unlock;
4579
4580 /* already mapped with a different size */
4581 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
4582 goto aux_unlock;
4583
4584 if (!is_power_of_2(nr_pages))
4585 goto aux_unlock;
4586
4587 if (!atomic_inc_not_zero(&rb->mmap_count))
4588 goto aux_unlock;
4589
4590 if (rb_has_aux(rb)) {
4591 atomic_inc(&rb->aux_mmap_count);
4592 ret = 0;
4593 goto unlock;
4594 }
4595
4596 atomic_set(&rb->aux_mmap_count, 1);
4597 user_extra = nr_pages;
4598
4599 goto accounting;
4600 }
4425 4601
4426 /* 4602 /*
4427 * If we have rb pages ensure they're a power-of-two number, so we 4603 * If we have rb pages ensure they're a power-of-two number, so we
@@ -4433,9 +4609,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4433 if (vma_size != PAGE_SIZE * (1 + nr_pages)) 4609 if (vma_size != PAGE_SIZE * (1 + nr_pages))
4434 return -EINVAL; 4610 return -EINVAL;
4435 4611
4436 if (vma->vm_pgoff != 0)
4437 return -EINVAL;
4438
4439 WARN_ON_ONCE(event->ctx->parent_ctx); 4612 WARN_ON_ONCE(event->ctx->parent_ctx);
4440again: 4613again:
4441 mutex_lock(&event->mmap_mutex); 4614 mutex_lock(&event->mmap_mutex);
@@ -4459,6 +4632,8 @@ again:
4459 } 4632 }
4460 4633
4461 user_extra = nr_pages + 1; 4634 user_extra = nr_pages + 1;
4635
4636accounting:
4462 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); 4637 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
4463 4638
4464 /* 4639 /*
@@ -4468,7 +4643,6 @@ again:
4468 4643
4469 user_locked = atomic_long_read(&user->locked_vm) + user_extra; 4644 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
4470 4645
4471 extra = 0;
4472 if (user_locked > user_lock_limit) 4646 if (user_locked > user_lock_limit)
4473 extra = user_locked - user_lock_limit; 4647 extra = user_locked - user_lock_limit;
4474 4648
@@ -4482,35 +4656,46 @@ again:
4482 goto unlock; 4656 goto unlock;
4483 } 4657 }
4484 4658
4485 WARN_ON(event->rb); 4659 WARN_ON(!rb && event->rb);
4486 4660
4487 if (vma->vm_flags & VM_WRITE) 4661 if (vma->vm_flags & VM_WRITE)
4488 flags |= RING_BUFFER_WRITABLE; 4662 flags |= RING_BUFFER_WRITABLE;
4489 4663
4490 rb = rb_alloc(nr_pages,
4491 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4492 event->cpu, flags);
4493
4494 if (!rb) { 4664 if (!rb) {
4495 ret = -ENOMEM; 4665 rb = rb_alloc(nr_pages,
4496 goto unlock; 4666 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4497 } 4667 event->cpu, flags);
4498 4668
4499 atomic_set(&rb->mmap_count, 1); 4669 if (!rb) {
4500 rb->mmap_locked = extra; 4670 ret = -ENOMEM;
4501 rb->mmap_user = get_current_user(); 4671 goto unlock;
4672 }
4502 4673
4503 atomic_long_add(user_extra, &user->locked_vm); 4674 atomic_set(&rb->mmap_count, 1);
4504 vma->vm_mm->pinned_vm += extra; 4675 rb->mmap_user = get_current_user();
4676 rb->mmap_locked = extra;
4505 4677
4506 ring_buffer_attach(event, rb); 4678 ring_buffer_attach(event, rb);
4507 4679
4508 perf_event_init_userpage(event); 4680 perf_event_init_userpage(event);
4509 perf_event_update_userpage(event); 4681 perf_event_update_userpage(event);
4682 } else {
4683 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
4684 event->attr.aux_watermark, flags);
4685 if (!ret)
4686 rb->aux_mmap_locked = extra;
4687 }
4510 4688
4511unlock: 4689unlock:
4512 if (!ret) 4690 if (!ret) {
4691 atomic_long_add(user_extra, &user->locked_vm);
4692 vma->vm_mm->pinned_vm += extra;
4693
4513 atomic_inc(&event->mmap_count); 4694 atomic_inc(&event->mmap_count);
4695 } else if (rb) {
4696 atomic_dec(&rb->mmap_count);
4697 }
4698aux_unlock:
4514 mutex_unlock(&event->mmap_mutex); 4699 mutex_unlock(&event->mmap_mutex);
4515 4700
4516 /* 4701 /*
@@ -4766,7 +4951,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
4766 } 4951 }
4767 4952
4768 if (sample_type & PERF_SAMPLE_TIME) 4953 if (sample_type & PERF_SAMPLE_TIME)
4769 data->time = perf_clock(); 4954 data->time = perf_event_clock(event);
4770 4955
4771 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) 4956 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
4772 data->id = primary_event_id(event); 4957 data->id = primary_event_id(event);
@@ -5344,6 +5529,8 @@ static void perf_event_task_output(struct perf_event *event,
5344 task_event->event_id.tid = perf_event_tid(event, task); 5529 task_event->event_id.tid = perf_event_tid(event, task);
5345 task_event->event_id.ptid = perf_event_tid(event, current); 5530 task_event->event_id.ptid = perf_event_tid(event, current);
5346 5531
5532 task_event->event_id.time = perf_event_clock(event);
5533
5347 perf_output_put(&handle, task_event->event_id); 5534 perf_output_put(&handle, task_event->event_id);
5348 5535
5349 perf_event__output_id_sample(event, &handle, &sample); 5536 perf_event__output_id_sample(event, &handle, &sample);
@@ -5377,7 +5564,7 @@ static void perf_event_task(struct task_struct *task,
5377 /* .ppid */ 5564 /* .ppid */
5378 /* .tid */ 5565 /* .tid */
5379 /* .ptid */ 5566 /* .ptid */
5380 .time = perf_clock(), 5567 /* .time */
5381 }, 5568 },
5382 }; 5569 };
5383 5570
@@ -5732,6 +5919,40 @@ void perf_event_mmap(struct vm_area_struct *vma)
5732 perf_event_mmap_event(&mmap_event); 5919 perf_event_mmap_event(&mmap_event);
5733} 5920}
5734 5921
5922void perf_event_aux_event(struct perf_event *event, unsigned long head,
5923 unsigned long size, u64 flags)
5924{
5925 struct perf_output_handle handle;
5926 struct perf_sample_data sample;
5927 struct perf_aux_event {
5928 struct perf_event_header header;
5929 u64 offset;
5930 u64 size;
5931 u64 flags;
5932 } rec = {
5933 .header = {
5934 .type = PERF_RECORD_AUX,
5935 .misc = 0,
5936 .size = sizeof(rec),
5937 },
5938 .offset = head,
5939 .size = size,
5940 .flags = flags,
5941 };
5942 int ret;
5943
5944 perf_event_header__init_id(&rec.header, &sample, event);
5945 ret = perf_output_begin(&handle, event, rec.header.size);
5946
5947 if (ret)
5948 return;
5949
5950 perf_output_put(&handle, rec);
5951 perf_event__output_id_sample(event, &handle, &sample);
5952
5953 perf_output_end(&handle);
5954}
5955
5735/* 5956/*
5736 * IRQ throttle logging 5957 * IRQ throttle logging
5737 */ 5958 */
@@ -5753,7 +5974,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
5753 .misc = 0, 5974 .misc = 0,
5754 .size = sizeof(throttle_event), 5975 .size = sizeof(throttle_event),
5755 }, 5976 },
5756 .time = perf_clock(), 5977 .time = perf_event_clock(event),
5757 .id = primary_event_id(event), 5978 .id = primary_event_id(event),
5758 .stream_id = event->id, 5979 .stream_id = event->id,
5759 }; 5980 };
@@ -5773,6 +5994,44 @@ static void perf_log_throttle(struct perf_event *event, int enable)
5773 perf_output_end(&handle); 5994 perf_output_end(&handle);
5774} 5995}
5775 5996
5997static void perf_log_itrace_start(struct perf_event *event)
5998{
5999 struct perf_output_handle handle;
6000 struct perf_sample_data sample;
6001 struct perf_aux_event {
6002 struct perf_event_header header;
6003 u32 pid;
6004 u32 tid;
6005 } rec;
6006 int ret;
6007
6008 if (event->parent)
6009 event = event->parent;
6010
6011 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
6012 event->hw.itrace_started)
6013 return;
6014
6015 event->hw.itrace_started = 1;
6016
6017 rec.header.type = PERF_RECORD_ITRACE_START;
6018 rec.header.misc = 0;
6019 rec.header.size = sizeof(rec);
6020 rec.pid = perf_event_pid(event, current);
6021 rec.tid = perf_event_tid(event, current);
6022
6023 perf_event_header__init_id(&rec.header, &sample, event);
6024 ret = perf_output_begin(&handle, event, rec.header.size);
6025
6026 if (ret)
6027 return;
6028
6029 perf_output_put(&handle, rec);
6030 perf_event__output_id_sample(event, &handle, &sample);
6031
6032 perf_output_end(&handle);
6033}
6034
5776/* 6035/*
5777 * Generic event overflow handling, sampling. 6036 * Generic event overflow handling, sampling.
5778 */ 6037 */
@@ -6133,6 +6392,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
6133 } 6392 }
6134 6393
6135 hlist_add_head_rcu(&event->hlist_entry, head); 6394 hlist_add_head_rcu(&event->hlist_entry, head);
6395 perf_event_update_userpage(event);
6136 6396
6137 return 0; 6397 return 0;
6138} 6398}
@@ -6296,6 +6556,8 @@ static int perf_swevent_init(struct perf_event *event)
6296static struct pmu perf_swevent = { 6556static struct pmu perf_swevent = {
6297 .task_ctx_nr = perf_sw_context, 6557 .task_ctx_nr = perf_sw_context,
6298 6558
6559 .capabilities = PERF_PMU_CAP_NO_NMI,
6560
6299 .event_init = perf_swevent_init, 6561 .event_init = perf_swevent_init,
6300 .add = perf_swevent_add, 6562 .add = perf_swevent_add,
6301 .del = perf_swevent_del, 6563 .del = perf_swevent_del,
@@ -6449,6 +6711,49 @@ static void perf_event_free_filter(struct perf_event *event)
6449 ftrace_profile_free_filter(event); 6711 ftrace_profile_free_filter(event);
6450} 6712}
6451 6713
6714static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6715{
6716 struct bpf_prog *prog;
6717
6718 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6719 return -EINVAL;
6720
6721 if (event->tp_event->prog)
6722 return -EEXIST;
6723
6724 if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
6725 /* bpf programs can only be attached to kprobes */
6726 return -EINVAL;
6727
6728 prog = bpf_prog_get(prog_fd);
6729 if (IS_ERR(prog))
6730 return PTR_ERR(prog);
6731
6732 if (prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) {
6733 /* valid fd, but invalid bpf program type */
6734 bpf_prog_put(prog);
6735 return -EINVAL;
6736 }
6737
6738 event->tp_event->prog = prog;
6739
6740 return 0;
6741}
6742
6743static void perf_event_free_bpf_prog(struct perf_event *event)
6744{
6745 struct bpf_prog *prog;
6746
6747 if (!event->tp_event)
6748 return;
6749
6750 prog = event->tp_event->prog;
6751 if (prog) {
6752 event->tp_event->prog = NULL;
6753 bpf_prog_put(prog);
6754 }
6755}
6756
6452#else 6757#else
6453 6758
6454static inline void perf_tp_register(void) 6759static inline void perf_tp_register(void)
@@ -6464,6 +6769,14 @@ static void perf_event_free_filter(struct perf_event *event)
6464{ 6769{
6465} 6770}
6466 6771
6772static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6773{
6774 return -ENOENT;
6775}
6776
6777static void perf_event_free_bpf_prog(struct perf_event *event)
6778{
6779}
6467#endif /* CONFIG_EVENT_TRACING */ 6780#endif /* CONFIG_EVENT_TRACING */
6468 6781
6469#ifdef CONFIG_HAVE_HW_BREAKPOINT 6782#ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -6602,6 +6915,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags)
6602{ 6915{
6603 if (flags & PERF_EF_START) 6916 if (flags & PERF_EF_START)
6604 cpu_clock_event_start(event, flags); 6917 cpu_clock_event_start(event, flags);
6918 perf_event_update_userpage(event);
6605 6919
6606 return 0; 6920 return 0;
6607} 6921}
@@ -6638,6 +6952,8 @@ static int cpu_clock_event_init(struct perf_event *event)
6638static struct pmu perf_cpu_clock = { 6952static struct pmu perf_cpu_clock = {
6639 .task_ctx_nr = perf_sw_context, 6953 .task_ctx_nr = perf_sw_context,
6640 6954
6955 .capabilities = PERF_PMU_CAP_NO_NMI,
6956
6641 .event_init = cpu_clock_event_init, 6957 .event_init = cpu_clock_event_init,
6642 .add = cpu_clock_event_add, 6958 .add = cpu_clock_event_add,
6643 .del = cpu_clock_event_del, 6959 .del = cpu_clock_event_del,
@@ -6676,6 +6992,7 @@ static int task_clock_event_add(struct perf_event *event, int flags)
6676{ 6992{
6677 if (flags & PERF_EF_START) 6993 if (flags & PERF_EF_START)
6678 task_clock_event_start(event, flags); 6994 task_clock_event_start(event, flags);
6995 perf_event_update_userpage(event);
6679 6996
6680 return 0; 6997 return 0;
6681} 6998}
@@ -6716,6 +7033,8 @@ static int task_clock_event_init(struct perf_event *event)
6716static struct pmu perf_task_clock = { 7033static struct pmu perf_task_clock = {
6717 .task_ctx_nr = perf_sw_context, 7034 .task_ctx_nr = perf_sw_context,
6718 7035
7036 .capabilities = PERF_PMU_CAP_NO_NMI,
7037
6719 .event_init = task_clock_event_init, 7038 .event_init = task_clock_event_init,
6720 .add = task_clock_event_add, 7039 .add = task_clock_event_add,
6721 .del = task_clock_event_del, 7040 .del = task_clock_event_del,
@@ -6993,6 +7312,7 @@ got_cpu_context:
6993 pmu->event_idx = perf_event_idx_default; 7312 pmu->event_idx = perf_event_idx_default;
6994 7313
6995 list_add_rcu(&pmu->entry, &pmus); 7314 list_add_rcu(&pmu->entry, &pmus);
7315 atomic_set(&pmu->exclusive_cnt, 0);
6996 ret = 0; 7316 ret = 0;
6997unlock: 7317unlock:
6998 mutex_unlock(&pmus_lock); 7318 mutex_unlock(&pmus_lock);
@@ -7037,12 +7357,23 @@ EXPORT_SYMBOL_GPL(perf_pmu_unregister);
7037 7357
7038static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) 7358static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7039{ 7359{
7360 struct perf_event_context *ctx = NULL;
7040 int ret; 7361 int ret;
7041 7362
7042 if (!try_module_get(pmu->module)) 7363 if (!try_module_get(pmu->module))
7043 return -ENODEV; 7364 return -ENODEV;
7365
7366 if (event->group_leader != event) {
7367 ctx = perf_event_ctx_lock(event->group_leader);
7368 BUG_ON(!ctx);
7369 }
7370
7044 event->pmu = pmu; 7371 event->pmu = pmu;
7045 ret = pmu->event_init(event); 7372 ret = pmu->event_init(event);
7373
7374 if (ctx)
7375 perf_event_ctx_unlock(event->group_leader, ctx);
7376
7046 if (ret) 7377 if (ret)
7047 module_put(pmu->module); 7378 module_put(pmu->module);
7048 7379
@@ -7089,10 +7420,6 @@ static void account_event_cpu(struct perf_event *event, int cpu)
7089 if (event->parent) 7420 if (event->parent)
7090 return; 7421 return;
7091 7422
7092 if (has_branch_stack(event)) {
7093 if (!(event->attach_state & PERF_ATTACH_TASK))
7094 atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
7095 }
7096 if (is_cgroup_event(event)) 7423 if (is_cgroup_event(event))
7097 atomic_inc(&per_cpu(perf_cgroup_events, cpu)); 7424 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
7098} 7425}
@@ -7131,7 +7458,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
7131 struct perf_event *group_leader, 7458 struct perf_event *group_leader,
7132 struct perf_event *parent_event, 7459 struct perf_event *parent_event,
7133 perf_overflow_handler_t overflow_handler, 7460 perf_overflow_handler_t overflow_handler,
7134 void *context) 7461 void *context, int cgroup_fd)
7135{ 7462{
7136 struct pmu *pmu; 7463 struct pmu *pmu;
7137 struct perf_event *event; 7464 struct perf_event *event;
@@ -7186,18 +7513,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
7186 7513
7187 if (task) { 7514 if (task) {
7188 event->attach_state = PERF_ATTACH_TASK; 7515 event->attach_state = PERF_ATTACH_TASK;
7189
7190 if (attr->type == PERF_TYPE_TRACEPOINT)
7191 event->hw.tp_target = task;
7192#ifdef CONFIG_HAVE_HW_BREAKPOINT
7193 /* 7516 /*
7194 * hw_breakpoint is a bit difficult here.. 7517 * XXX pmu::event_init needs to know what task to account to
7518 * and we cannot use the ctx information because we need the
7519 * pmu before we get a ctx.
7195 */ 7520 */
7196 else if (attr->type == PERF_TYPE_BREAKPOINT) 7521 event->hw.target = task;
7197 event->hw.bp_target = task;
7198#endif
7199 } 7522 }
7200 7523
7524 event->clock = &local_clock;
7525 if (parent_event)
7526 event->clock = parent_event->clock;
7527
7201 if (!overflow_handler && parent_event) { 7528 if (!overflow_handler && parent_event) {
7202 overflow_handler = parent_event->overflow_handler; 7529 overflow_handler = parent_event->overflow_handler;
7203 context = parent_event->overflow_handler_context; 7530 context = parent_event->overflow_handler_context;
@@ -7224,6 +7551,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
7224 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 7551 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
7225 goto err_ns; 7552 goto err_ns;
7226 7553
7554 if (!has_branch_stack(event))
7555 event->attr.branch_sample_type = 0;
7556
7557 if (cgroup_fd != -1) {
7558 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
7559 if (err)
7560 goto err_ns;
7561 }
7562
7227 pmu = perf_init_event(event); 7563 pmu = perf_init_event(event);
7228 if (!pmu) 7564 if (!pmu)
7229 goto err_ns; 7565 goto err_ns;
@@ -7232,21 +7568,30 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
7232 goto err_ns; 7568 goto err_ns;
7233 } 7569 }
7234 7570
7571 err = exclusive_event_init(event);
7572 if (err)
7573 goto err_pmu;
7574
7235 if (!event->parent) { 7575 if (!event->parent) {
7236 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { 7576 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
7237 err = get_callchain_buffers(); 7577 err = get_callchain_buffers();
7238 if (err) 7578 if (err)
7239 goto err_pmu; 7579 goto err_per_task;
7240 } 7580 }
7241 } 7581 }
7242 7582
7243 return event; 7583 return event;
7244 7584
7585err_per_task:
7586 exclusive_event_destroy(event);
7587
7245err_pmu: 7588err_pmu:
7246 if (event->destroy) 7589 if (event->destroy)
7247 event->destroy(event); 7590 event->destroy(event);
7248 module_put(pmu->module); 7591 module_put(pmu->module);
7249err_ns: 7592err_ns:
7593 if (is_cgroup_event(event))
7594 perf_detach_cgroup(event);
7250 if (event->ns) 7595 if (event->ns)
7251 put_pid_ns(event->ns); 7596 put_pid_ns(event->ns);
7252 kfree(event); 7597 kfree(event);
@@ -7409,6 +7754,19 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
7409 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 7754 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
7410 goto out; 7755 goto out;
7411 7756
7757 /*
7758 * Mixing clocks in the same buffer is trouble you don't need.
7759 */
7760 if (output_event->clock != event->clock)
7761 goto out;
7762
7763 /*
7764 * If both events generate aux data, they must be on the same PMU
7765 */
7766 if (has_aux(event) && has_aux(output_event) &&
7767 event->pmu != output_event->pmu)
7768 goto out;
7769
7412set: 7770set:
7413 mutex_lock(&event->mmap_mutex); 7771 mutex_lock(&event->mmap_mutex);
7414 /* Can't redirect output if we've got an active mmap() */ 7772 /* Can't redirect output if we've got an active mmap() */
@@ -7441,6 +7799,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
7441 mutex_lock_nested(b, SINGLE_DEPTH_NESTING); 7799 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
7442} 7800}
7443 7801
7802static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
7803{
7804 bool nmi_safe = false;
7805
7806 switch (clk_id) {
7807 case CLOCK_MONOTONIC:
7808 event->clock = &ktime_get_mono_fast_ns;
7809 nmi_safe = true;
7810 break;
7811
7812 case CLOCK_MONOTONIC_RAW:
7813 event->clock = &ktime_get_raw_fast_ns;
7814 nmi_safe = true;
7815 break;
7816
7817 case CLOCK_REALTIME:
7818 event->clock = &ktime_get_real_ns;
7819 break;
7820
7821 case CLOCK_BOOTTIME:
7822 event->clock = &ktime_get_boot_ns;
7823 break;
7824
7825 case CLOCK_TAI:
7826 event->clock = &ktime_get_tai_ns;
7827 break;
7828
7829 default:
7830 return -EINVAL;
7831 }
7832
7833 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
7834 return -EINVAL;
7835
7836 return 0;
7837}
7838
7444/** 7839/**
7445 * sys_perf_event_open - open a performance event, associate it to a task/cpu 7840 * sys_perf_event_open - open a performance event, associate it to a task/cpu
7446 * 7841 *
@@ -7465,6 +7860,7 @@ SYSCALL_DEFINE5(perf_event_open,
7465 int move_group = 0; 7860 int move_group = 0;
7466 int err; 7861 int err;
7467 int f_flags = O_RDWR; 7862 int f_flags = O_RDWR;
7863 int cgroup_fd = -1;
7468 7864
7469 /* for future expandability... */ 7865 /* for future expandability... */
7470 if (flags & ~PERF_FLAG_ALL) 7866 if (flags & ~PERF_FLAG_ALL)
@@ -7530,21 +7926,16 @@ SYSCALL_DEFINE5(perf_event_open,
7530 7926
7531 get_online_cpus(); 7927 get_online_cpus();
7532 7928
7929 if (flags & PERF_FLAG_PID_CGROUP)
7930 cgroup_fd = pid;
7931
7533 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 7932 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
7534 NULL, NULL); 7933 NULL, NULL, cgroup_fd);
7535 if (IS_ERR(event)) { 7934 if (IS_ERR(event)) {
7536 err = PTR_ERR(event); 7935 err = PTR_ERR(event);
7537 goto err_cpus; 7936 goto err_cpus;
7538 } 7937 }
7539 7938
7540 if (flags & PERF_FLAG_PID_CGROUP) {
7541 err = perf_cgroup_connect(pid, event, &attr, group_leader);
7542 if (err) {
7543 __free_event(event);
7544 goto err_cpus;
7545 }
7546 }
7547
7548 if (is_sampling_event(event)) { 7939 if (is_sampling_event(event)) {
7549 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { 7940 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
7550 err = -ENOTSUPP; 7941 err = -ENOTSUPP;
@@ -7560,6 +7951,12 @@ SYSCALL_DEFINE5(perf_event_open,
7560 */ 7951 */
7561 pmu = event->pmu; 7952 pmu = event->pmu;
7562 7953
7954 if (attr.use_clockid) {
7955 err = perf_event_set_clock(event, attr.clockid);
7956 if (err)
7957 goto err_alloc;
7958 }
7959
7563 if (group_leader && 7960 if (group_leader &&
7564 (is_software_event(event) != is_software_event(group_leader))) { 7961 (is_software_event(event) != is_software_event(group_leader))) {
7565 if (is_software_event(event)) { 7962 if (is_software_event(event)) {
@@ -7586,12 +7983,17 @@ SYSCALL_DEFINE5(perf_event_open,
7586 /* 7983 /*
7587 * Get the target context (task or percpu): 7984 * Get the target context (task or percpu):
7588 */ 7985 */
7589 ctx = find_get_context(pmu, task, event->cpu); 7986 ctx = find_get_context(pmu, task, event);
7590 if (IS_ERR(ctx)) { 7987 if (IS_ERR(ctx)) {
7591 err = PTR_ERR(ctx); 7988 err = PTR_ERR(ctx);
7592 goto err_alloc; 7989 goto err_alloc;
7593 } 7990 }
7594 7991
7992 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
7993 err = -EBUSY;
7994 goto err_context;
7995 }
7996
7595 if (task) { 7997 if (task) {
7596 put_task_struct(task); 7998 put_task_struct(task);
7597 task = NULL; 7999 task = NULL;
@@ -7609,6 +8011,11 @@ SYSCALL_DEFINE5(perf_event_open,
7609 */ 8011 */
7610 if (group_leader->group_leader != group_leader) 8012 if (group_leader->group_leader != group_leader)
7611 goto err_context; 8013 goto err_context;
8014
8015 /* All events in a group should have the same clock */
8016 if (group_leader->clock != event->clock)
8017 goto err_context;
8018
7612 /* 8019 /*
7613 * Do not allow to attach to a group in a different 8020 * Do not allow to attach to a group in a different
7614 * task or CPU context: 8021 * task or CPU context:
@@ -7709,6 +8116,13 @@ SYSCALL_DEFINE5(perf_event_open,
7709 get_ctx(ctx); 8116 get_ctx(ctx);
7710 } 8117 }
7711 8118
8119 if (!exclusive_event_installable(event, ctx)) {
8120 err = -EBUSY;
8121 mutex_unlock(&ctx->mutex);
8122 fput(event_file);
8123 goto err_context;
8124 }
8125
7712 perf_install_in_context(ctx, event, event->cpu); 8126 perf_install_in_context(ctx, event, event->cpu);
7713 perf_unpin_context(ctx); 8127 perf_unpin_context(ctx);
7714 8128
@@ -7781,7 +8195,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7781 */ 8195 */
7782 8196
7783 event = perf_event_alloc(attr, cpu, task, NULL, NULL, 8197 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
7784 overflow_handler, context); 8198 overflow_handler, context, -1);
7785 if (IS_ERR(event)) { 8199 if (IS_ERR(event)) {
7786 err = PTR_ERR(event); 8200 err = PTR_ERR(event);
7787 goto err; 8201 goto err;
@@ -7792,7 +8206,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7792 8206
7793 account_event(event); 8207 account_event(event);
7794 8208
7795 ctx = find_get_context(event->pmu, task, cpu); 8209 ctx = find_get_context(event->pmu, task, event);
7796 if (IS_ERR(ctx)) { 8210 if (IS_ERR(ctx)) {
7797 err = PTR_ERR(ctx); 8211 err = PTR_ERR(ctx);
7798 goto err_free; 8212 goto err_free;
@@ -7800,6 +8214,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7800 8214
7801 WARN_ON_ONCE(ctx->parent_ctx); 8215 WARN_ON_ONCE(ctx->parent_ctx);
7802 mutex_lock(&ctx->mutex); 8216 mutex_lock(&ctx->mutex);
8217 if (!exclusive_event_installable(event, ctx)) {
8218 mutex_unlock(&ctx->mutex);
8219 perf_unpin_context(ctx);
8220 put_ctx(ctx);
8221 err = -EBUSY;
8222 goto err_free;
8223 }
8224
7803 perf_install_in_context(ctx, event, cpu); 8225 perf_install_in_context(ctx, event, cpu);
7804 perf_unpin_context(ctx); 8226 perf_unpin_context(ctx);
7805 mutex_unlock(&ctx->mutex); 8227 mutex_unlock(&ctx->mutex);
@@ -8142,7 +8564,7 @@ inherit_event(struct perf_event *parent_event,
8142 parent_event->cpu, 8564 parent_event->cpu,
8143 child, 8565 child,
8144 group_leader, parent_event, 8566 group_leader, parent_event,
8145 NULL, NULL); 8567 NULL, NULL, -1);
8146 if (IS_ERR(child_event)) 8568 if (IS_ERR(child_event))
8147 return child_event; 8569 return child_event;
8148 8570
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 9803a6600d49..92ce5f4ccc26 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
  */
 static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
 {
-	struct task_struct *tsk = bp->hw.bp_target;
+	struct task_struct *tsk = bp->hw.target;
 	struct perf_event *iter;
 	int count = 0;
 
 	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
-		if (iter->hw.bp_target == tsk &&
+		if (iter->hw.target == tsk &&
 		    find_slot_idx(iter) == type &&
 		    (iter->cpu < 0 || cpu == iter->cpu))
 			count += hw_breakpoint_weight(iter);
@@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 	int nr;
 
 	nr = info->cpu_pinned;
-	if (!bp->hw.bp_target)
+	if (!bp->hw.target)
 		nr += max_task_bp_pinned(cpu, type);
 	else
 		nr += task_bp_pinned(cpu, bp, type);
@@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 		weight = -weight;
 
 	/* Pinned counter cpu profiling */
-	if (!bp->hw.bp_target) {
+	if (!bp->hw.target) {
 		get_bp_info(bp->cpu, type)->cpu_pinned += weight;
 		return;
 	}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 569b218782ad..9f6ce9ba4a04 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -27,6 +27,7 @@ struct ring_buffer {
 	local_t lost;			/* nr records lost */
 
 	long watermark;			/* wakeup watermark */
+	long aux_watermark;
 	/* poll crap */
 	spinlock_t event_lock;
 	struct list_head event_list;
@@ -35,6 +36,20 @@ struct ring_buffer {
 	unsigned long mmap_locked;
 	struct user_struct *mmap_user;
 
+	/* AUX area */
+	local_t aux_head;
+	local_t aux_nest;
+	local_t aux_wakeup;
+	unsigned long aux_pgoff;
+	int aux_nr_pages;
+	int aux_overwrite;
+	atomic_t aux_mmap_count;
+	unsigned long aux_mmap_locked;
+	void (*free_aux)(void *);
+	atomic_t aux_refcount;
+	void **aux_pages;
+	void *aux_priv;
+
 	struct perf_event_mmap_page *user_page;
 	void *data_pages[0];
 };
@@ -43,6 +58,19 @@ extern void rb_free(struct ring_buffer *rb);
 extern struct ring_buffer *
 rb_alloc(int nr_pages, long watermark, int cpu, int flags);
 extern void perf_event_wakeup(struct perf_event *event);
+extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+			pgoff_t pgoff, int nr_pages, long watermark, int flags);
+extern void rb_free_aux(struct ring_buffer *rb);
+extern struct ring_buffer *ring_buffer_get(struct perf_event *event);
+extern void ring_buffer_put(struct ring_buffer *rb);
+
+static inline bool rb_has_aux(struct ring_buffer *rb)
+{
+	return !!rb->aux_nr_pages;
+}
+
+void perf_event_aux_event(struct perf_event *event, unsigned long head,
+			  unsigned long size, u64 flags);
 
 extern void
 perf_event_header__init_id(struct perf_event_header *header,
@@ -81,6 +109,11 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
 	return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
 }
 
+static inline unsigned long perf_aux_size(struct ring_buffer *rb)
+{
+	return rb->aux_nr_pages << PAGE_SHIFT;
+}
+
 #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
 static inline unsigned long \
 func_name(struct perf_output_handle *handle, \
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index eadb95ce7aac..232f00f273cb 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -243,14 +243,317 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
243 spin_lock_init(&rb->event_lock); 243 spin_lock_init(&rb->event_lock);
244} 244}
245 245
246/*
247 * This is called before hardware starts writing to the AUX area to
248 * obtain an output handle and make sure there's room in the buffer.
249 * When the capture completes, call perf_aux_output_end() to commit
250 * the recorded data to the buffer.
251 *
252 * The ordering is similar to that of perf_output_{begin,end}, with
253 * the exception of (B), which should be taken care of by the pmu
254 * driver, since ordering rules will differ depending on hardware.
255 */
256void *perf_aux_output_begin(struct perf_output_handle *handle,
257 struct perf_event *event)
258{
259 struct perf_event *output_event = event;
260 unsigned long aux_head, aux_tail;
261 struct ring_buffer *rb;
262
263 if (output_event->parent)
264 output_event = output_event->parent;
265
266 /*
267 * Since this will typically be open across pmu::add/pmu::del, we
268 * grab ring_buffer's refcount instead of holding rcu read lock
269 * to make sure it doesn't disappear under us.
270 */
271 rb = ring_buffer_get(output_event);
272 if (!rb)
273 return NULL;
274
275 if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
276 goto err;
277
278 /*
279 * Nesting is not supported for the AUX area; make sure nested
280 * writers are caught early.
281 */
282 if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
283 goto err_put;
284
285 aux_head = local_read(&rb->aux_head);
286
287 handle->rb = rb;
288 handle->event = event;
289 handle->head = aux_head;
290 handle->size = 0;
291
292 /*
293 * In overwrite mode, AUX data stores do not depend on aux_tail,
294 * therefore the (A) control dependency barrier does not exist. The
295 * (B) <-> (C) ordering is still observed by the pmu driver.
296 */
297 if (!rb->aux_overwrite) {
298 aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
299 handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark;
300 if (aux_head - aux_tail < perf_aux_size(rb))
301 handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
302
303 /*
304 * The handle->size computation depends on the aux_tail load; this forms a
305 * control dependency barrier separating the aux_tail load from the aux data
306 * stores that are enabled on successful return.
307 */
308 if (!handle->size) { /* A, matches D */
309 event->pending_disable = 1;
310 perf_output_wakeup(handle);
311 local_set(&rb->aux_nest, 0);
312 goto err_put;
313 }
314 }
315
316 return handle->rb->aux_priv;
317
318err_put:
319 rb_free_aux(rb);
320
321err:
322 ring_buffer_put(rb);
323 handle->event = NULL;
324
325 return NULL;
326}
327
328/*
329 * Commit the data written by hardware into the ring buffer by adjusting
330 * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
331 * pmu driver's responsibility to observe ordering rules of the hardware,
332 * so that all the data is externally visible before this is called.
333 */
334void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
335 bool truncated)
336{
337 struct ring_buffer *rb = handle->rb;
338 unsigned long aux_head;
339 u64 flags = 0;
340
341 if (truncated)
342 flags |= PERF_AUX_FLAG_TRUNCATED;
343
344 /* in overwrite mode, driver provides aux_head via handle */
345 if (rb->aux_overwrite) {
346 flags |= PERF_AUX_FLAG_OVERWRITE;
347
348 aux_head = handle->head;
349 local_set(&rb->aux_head, aux_head);
350 } else {
351 aux_head = local_read(&rb->aux_head);
352 local_add(size, &rb->aux_head);
353 }
354
355 if (size || flags) {
356 /*
357 * Only send RECORD_AUX if we have something useful to communicate
358 */
359
360 perf_event_aux_event(handle->event, aux_head, size, flags);
361 }
362
363 aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
364
365 if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
366 perf_output_wakeup(handle);
367 local_add(rb->aux_watermark, &rb->aux_wakeup);
368 }
369 handle->event = NULL;
370
371 local_set(&rb->aux_nest, 0);
372 rb_free_aux(rb);
373 ring_buffer_put(rb);
374}
375
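The comment above perf_aux_output_begin() spells out the driver-side protocol: take a handle before the hardware starts writing, then commit the captured bytes with perf_aux_output_end(). A minimal sketch of how a hypothetical PMU driver might drive this pair from its start/stop path follows; the mydrv_* names, the single static handle and the hardware accessors are assumptions for illustration, not part of this patch.

#include <linux/perf_event.h>

/* Hypothetical hardware hooks -- assumptions, a real driver provides these. */
extern void mydrv_hw_enable(void *aux_priv, unsigned long head, unsigned long size);
extern unsigned long mydrv_hw_disable(void);

/* A real driver would keep one handle per cpu; one static copy keeps the sketch short. */
static struct perf_output_handle mydrv_handle;

static void mydrv_start(struct perf_event *event)
{
	void *aux_priv;

	/* NULL means no AUX buffer, a nested writer, or no space left (A). */
	aux_priv = perf_aux_output_begin(&mydrv_handle, event);
	if (!aux_priv)
		return;

	/*
	 * Point the hardware at the buffer described by aux_priv and let it
	 * write at most mydrv_handle.size bytes starting at mydrv_handle.head.
	 */
	mydrv_hw_enable(aux_priv, mydrv_handle.head, mydrv_handle.size);
}

static void mydrv_stop(struct perf_event *event)
{
	unsigned long written = mydrv_hw_disable();

	/*
	 * Commit the data; truncated would be true if the capture ran out of
	 * usable space. perf_aux_output_skip() could be called first to
	 * account for hardware alignment padding.
	 */
	perf_aux_output_end(&mydrv_handle, written, false);
}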
376/*
377 * Skip over a given number of bytes in the AUX buffer, due to, for example,
378 * hardware's alignment constraints.
379 */
380int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
381{
382 struct ring_buffer *rb = handle->rb;
383 unsigned long aux_head;
384
385 if (size > handle->size)
386 return -ENOSPC;
387
388 local_add(size, &rb->aux_head);
389
390 aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
391 if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
392 perf_output_wakeup(handle);
393 local_add(rb->aux_watermark, &rb->aux_wakeup);
394 handle->wakeup = local_read(&rb->aux_wakeup) +
395 rb->aux_watermark;
396 }
397
398 handle->head = aux_head;
399 handle->size -= size;
400
401 return 0;
402}
403
404void *perf_get_aux(struct perf_output_handle *handle)
405{
406 /* this is only valid between perf_aux_output_begin and *_end */
407 if (!handle->event)
408 return NULL;
409
410 return handle->rb->aux_priv;
411}
412
413#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
414
415static struct page *rb_alloc_aux_page(int node, int order)
416{
417 struct page *page;
418
419 if (order > MAX_ORDER)
420 order = MAX_ORDER;
421
422 do {
423 page = alloc_pages_node(node, PERF_AUX_GFP, order);
424 } while (!page && order--);
425
426 if (page && order) {
427 /*
428 * Communicate the allocation size to the driver
429 */
430 split_page(page, order);
431 SetPagePrivate(page);
432 set_page_private(page, order);
433 }
434
435 return page;
436}
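rb_alloc_aux_page() records the order of each high-order allocation in page_private() so the PMU driver can tell how large each contiguous chunk is. A sketch (an assumption, not code from this patch) of how a driver's setup_aux() callback could walk the page array it is given and recover those chunk sizes:

#include <linux/mm.h>

/* Assumed helper: size of the contiguous chunk starting at pages[idx]. */
static size_t mydrv_chunk_size(void **pages, int idx)
{
	struct page *page = virt_to_page(pages[idx]);

	/* Order-0 pages were not annotated by rb_alloc_aux_page(). */
	if (!PagePrivate(page))
		return PAGE_SIZE;

	return PAGE_SIZE << page_private(page);
}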
437
438static void rb_free_aux_page(struct ring_buffer *rb, int idx)
439{
440 struct page *page = virt_to_page(rb->aux_pages[idx]);
441
442 ClearPagePrivate(page);
443 page->mapping = NULL;
444 __free_page(page);
445}
446
447int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
448 pgoff_t pgoff, int nr_pages, long watermark, int flags)
449{
450 bool overwrite = !(flags & RING_BUFFER_WRITABLE);
451 int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
452 int ret = -ENOMEM, max_order = 0;
453
454 if (!has_aux(event))
455 return -ENOTSUPP;
456
457 if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
458 /*
459 * We need to start with the max_order that fits in nr_pages,
460 * not the other way around, hence ilog2() and not get_order.
461 */
462 max_order = ilog2(nr_pages);
463
464 /*
465 * PMU requests more than one contiguous chunk of memory
466 * for SW double buffering
467 */
468 if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
469 !overwrite) {
470 if (!max_order)
471 return -EINVAL;
472
473 max_order--;
474 }
475 }
476
477 rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node);
478 if (!rb->aux_pages)
479 return -ENOMEM;
480
481 rb->free_aux = event->pmu->free_aux;
482 for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
483 struct page *page;
484 int last, order;
485
486 order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
487 page = rb_alloc_aux_page(node, order);
488 if (!page)
489 goto out;
490
491 for (last = rb->aux_nr_pages + (1 << page_private(page));
492 last > rb->aux_nr_pages; rb->aux_nr_pages++)
493 rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
494 }
495
496 rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
497 overwrite);
498 if (!rb->aux_priv)
499 goto out;
500
501 ret = 0;
502
503 /*
504 * aux_pages (and the pmu driver's private data, aux_priv) will be
505 * referenced in both the producer's and the consumer's contexts, thus
506 * we keep a refcount here to make sure either of the two can
507 * reference them safely.
508 */
509 atomic_set(&rb->aux_refcount, 1);
510
511 rb->aux_overwrite = overwrite;
512 rb->aux_watermark = watermark;
513
514 if (!rb->aux_watermark && !rb->aux_overwrite)
515 rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);
516
517out:
518 if (!ret)
519 rb->aux_pgoff = pgoff;
520 else
521 rb_free_aux(rb);
522
523 return ret;
524}
525
526static void __rb_free_aux(struct ring_buffer *rb)
527{
528 int pg;
529
530 if (rb->aux_priv) {
531 rb->free_aux(rb->aux_priv);
532 rb->free_aux = NULL;
533 rb->aux_priv = NULL;
534 }
535
536 for (pg = 0; pg < rb->aux_nr_pages; pg++)
537 rb_free_aux_page(rb, pg);
538
539 kfree(rb->aux_pages);
540 rb->aux_nr_pages = 0;
541}
542
543void rb_free_aux(struct ring_buffer *rb)
544{
545 if (atomic_dec_and_test(&rb->aux_refcount))
546 __rb_free_aux(rb);
547}
548
246#ifndef CONFIG_PERF_USE_VMALLOC 549#ifndef CONFIG_PERF_USE_VMALLOC
247 550
248/* 551/*
249 * Back perf_mmap() with regular GFP_KERNEL-0 pages. 552 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
250 */ 553 */
251 554
252struct page * 555static struct page *
253perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) 556__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
254{ 557{
255 if (pgoff > rb->nr_pages) 558 if (pgoff > rb->nr_pages)
256 return NULL; 559 return NULL;
@@ -340,8 +643,8 @@ static int data_page_nr(struct ring_buffer *rb)
340 return rb->nr_pages << page_order(rb); 643 return rb->nr_pages << page_order(rb);
341} 644}
342 645
343struct page * 646static struct page *
344perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) 647__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
345{ 648{
346 /* The '>' counts in the user page. */ 649 /* The '>' counts in the user page. */
347 if (pgoff > data_page_nr(rb)) 650 if (pgoff > data_page_nr(rb))
@@ -416,3 +719,19 @@ fail:
416} 719}
417 720
418#endif 721#endif
722
723struct page *
724perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
725{
726 if (rb->aux_nr_pages) {
727 /* above AUX space */
728 if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
729 return NULL;
730
731 /* AUX space */
732 if (pgoff >= rb->aux_pgoff)
733 return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]);
734 }
735
736 return __perf_mmap_to_page(rb, pgoff);
737}
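perf_mmap_to_page() routes page offsets at or above aux_pgoff to the AUX pages, which is what lets user space map the AUX area as a second region on the same perf event fd. A rough user-space sketch, assuming the uapi side of this series that adds aux_offset/aux_size (and aux_head/aux_tail) to struct perf_event_mmap_page:

/* User-space sketch; the aux_offset/aux_size fields come from the uapi
 * half of this series, which is not part of this kernel/ diff. */
#include <linux/perf_event.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stddef.h>

static void *map_aux_area(int perf_fd, size_t data_pages, size_t aux_size)
{
	long psz = sysconf(_SC_PAGESIZE);
	struct perf_event_mmap_page *pc;
	void *aux;

	/* Map the user page plus the regular data pages first. */
	pc = mmap(NULL, (data_pages + 1) * psz, PROT_READ | PROT_WRITE,
		  MAP_SHARED, perf_fd, 0);
	if (pc == MAP_FAILED)
		return NULL;

	/* Tell the kernel where the AUX area starts and how big it is. */
	pc->aux_offset = (data_pages + 1) * psz;
	pc->aux_size   = aux_size;

	/* This offset is what perf_mmap_to_page() resolves to the AUX pages. */
	aux = mmap(NULL, aux_size, PROT_READ | PROT_WRITE, MAP_SHARED,
		   perf_fd, pc->aux_offset);
	return aux == MAP_FAILED ? NULL : aux;
}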
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fedbdd7d5d1e..3b9a48ae153a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -432,6 +432,14 @@ config UPROBE_EVENT
432 This option is required if you plan to use perf-probe subcommand 432 This option is required if you plan to use perf-probe subcommand
433 of perf tools on user space applications. 433 of perf tools on user space applications.
434 434
435config BPF_EVENTS
436 depends on BPF_SYSCALL
437 depends on KPROBE_EVENT
438 bool
439 default y
440 help
441 This allows the user to attach BPF programs to kprobe events.
442
435config PROBE_EVENTS 443config PROBE_EVENTS
436 def_bool n 444 def_bool n
437 445
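The BPF_EVENTS option above is exercised from user space by loading a BPF_PROG_TYPE_KPROBE program with kern_version filled in (the check added to bpf_prog_load()) and attaching it to a kprobe perf event. A hedged sketch of those two steps; the PERF_EVENT_IOC_SET_BPF ioctl and the raw bpf(2) call belong to the uapi half of this series, and the kprobe perf event fd is assumed to exist already (e.g. created through kprobe_events plus perf_event_open()):

/* Sketch only: syscall numbers, uapi structs and the SET_BPF ioctl are
 * assumed to come from the matching uapi headers. */
#include <linux/bpf.h>
#include <linux/perf_event.h>
#include <linux/version.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int load_kprobe_prog(const struct bpf_insn *insns, unsigned int insn_cnt)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type    = BPF_PROG_TYPE_KPROBE;
	attr.insns        = (__u64)(unsigned long) insns;
	attr.insn_cnt     = insn_cnt;
	attr.license      = (__u64)(unsigned long) "GPL";
	/* Must match the running kernel; see the check in bpf_prog_load(). */
	attr.kern_version = LINUX_VERSION_CODE;

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}

static int attach_to_kprobe_event(int perf_event_fd, int prog_fd)
{
	return ioctl(perf_event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
}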
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 98f26588255e..9b1044e936a6 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
53endif 53endif
54obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 54obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
56obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 57obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_TRACEPOINTS) += power-traces.o 58obj-$(CONFIG_TRACEPOINTS) += power-traces.o
58ifeq ($(CONFIG_PM),y) 59ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..2d56ce501632
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,222 @@
1/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/kernel.h>
8#include <linux/types.h>
9#include <linux/slab.h>
10#include <linux/bpf.h>
11#include <linux/filter.h>
12#include <linux/uaccess.h>
13#include <linux/ctype.h>
14#include "trace.h"
15
16static DEFINE_PER_CPU(int, bpf_prog_active);
17
18/**
19 * trace_call_bpf - invoke BPF program
20 * @prog: BPF program
21 * @ctx: opaque context pointer
22 *
23 * kprobe handlers execute BPF programs via this helper.
24 * It can also be used from static tracepoints in the future.
25 *
26 * Return: BPF programs always return an integer which is interpreted by
27 * the kprobe handler as:
28 * 0 - return from kprobe (event is filtered out)
29 * 1 - store the kprobe event into the ring buffer
30 * Other values are reserved and currently alias to 1
31 */
32unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
33{
34 unsigned int ret;
35
36 if (in_nmi()) /* not supported yet */
37 return 1;
38
39 preempt_disable();
40
41 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
42 /*
43 * since a bpf program is already running on this cpu,
44 * don't call into another bpf program (same or different),
45 * don't send a kprobe event into the ring buffer,
46 * and return zero here
47 */
48 ret = 0;
49 goto out;
50 }
51
52 rcu_read_lock();
53 ret = BPF_PROG_RUN(prog, ctx);
54 rcu_read_unlock();
55
56 out:
57 __this_cpu_dec(bpf_prog_active);
58 preempt_enable();
59
60 return ret;
61}
62EXPORT_SYMBOL_GPL(trace_call_bpf);
63
64static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
65{
66 void *dst = (void *) (long) r1;
67 int size = (int) r2;
68 void *unsafe_ptr = (void *) (long) r3;
69
70 return probe_kernel_read(dst, unsafe_ptr, size);
71}
72
73static const struct bpf_func_proto bpf_probe_read_proto = {
74 .func = bpf_probe_read,
75 .gpl_only = true,
76 .ret_type = RET_INTEGER,
77 .arg1_type = ARG_PTR_TO_STACK,
78 .arg2_type = ARG_CONST_STACK_SIZE,
79 .arg3_type = ARG_ANYTHING,
80};
81
82static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
83{
84 /* NMI safe access to clock monotonic */
85 return ktime_get_mono_fast_ns();
86}
87
88static const struct bpf_func_proto bpf_ktime_get_ns_proto = {
89 .func = bpf_ktime_get_ns,
90 .gpl_only = true,
91 .ret_type = RET_INTEGER,
92};
93
94/*
95 * limited trace_printk()
96 * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
97 */
98static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
99{
100 char *fmt = (char *) (long) r1;
101 int mod[3] = {};
102 int fmt_cnt = 0;
103 int i;
104
105 /*
106 * bpf_check()->check_func_arg()->check_stack_boundary()
107 * guarantees that fmt points to the bpf program stack, that
108 * fmt_size bytes of it were initialized, and that fmt_size > 0
109 */
110 if (fmt[--fmt_size] != 0)
111 return -EINVAL;
112
113 /* check format string for allowed specifiers */
114 for (i = 0; i < fmt_size; i++) {
115 if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
116 return -EINVAL;
117
118 if (fmt[i] != '%')
119 continue;
120
121 if (fmt_cnt >= 3)
122 return -EINVAL;
123
124 /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
125 i++;
126 if (fmt[i] == 'l') {
127 mod[fmt_cnt]++;
128 i++;
129 } else if (fmt[i] == 'p') {
130 mod[fmt_cnt]++;
131 i++;
132 if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
133 return -EINVAL;
134 fmt_cnt++;
135 continue;
136 }
137
138 if (fmt[i] == 'l') {
139 mod[fmt_cnt]++;
140 i++;
141 }
142
143 if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
144 return -EINVAL;
145 fmt_cnt++;
146 }
147
148 return __trace_printk(1/* fake ip will not be printed */, fmt,
149 mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3,
150 mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4,
151 mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5);
152}
153
154static const struct bpf_func_proto bpf_trace_printk_proto = {
155 .func = bpf_trace_printk,
156 .gpl_only = true,
157 .ret_type = RET_INTEGER,
158 .arg1_type = ARG_PTR_TO_STACK,
159 .arg2_type = ARG_CONST_STACK_SIZE,
160};
161
162static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
163{
164 switch (func_id) {
165 case BPF_FUNC_map_lookup_elem:
166 return &bpf_map_lookup_elem_proto;
167 case BPF_FUNC_map_update_elem:
168 return &bpf_map_update_elem_proto;
169 case BPF_FUNC_map_delete_elem:
170 return &bpf_map_delete_elem_proto;
171 case BPF_FUNC_probe_read:
172 return &bpf_probe_read_proto;
173 case BPF_FUNC_ktime_get_ns:
174 return &bpf_ktime_get_ns_proto;
175
176 case BPF_FUNC_trace_printk:
177 /*
178 * this program might be calling bpf_trace_printk,
179 * so allocate per-cpu printk buffers
180 */
181 trace_printk_init_buffers();
182
183 return &bpf_trace_printk_proto;
184 default:
185 return NULL;
186 }
187}
188
189/* bpf+kprobe programs can access fields of 'struct pt_regs' */
190static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
191{
192 /* check bounds */
193 if (off < 0 || off >= sizeof(struct pt_regs))
194 return false;
195
196 /* only read is allowed */
197 if (type != BPF_READ)
198 return false;
199
200 /* disallow misaligned access */
201 if (off % size != 0)
202 return false;
203
204 return true;
205}
206
207static struct bpf_verifier_ops kprobe_prog_ops = {
208 .get_func_proto = kprobe_prog_func_proto,
209 .is_valid_access = kprobe_prog_is_valid_access,
210};
211
212static struct bpf_prog_type_list kprobe_tl = {
213 .ops = &kprobe_prog_ops,
214 .type = BPF_PROG_TYPE_KPROBE,
215};
216
217static int __init register_kprobe_prog_ops(void)
218{
219 bpf_register_prog_type(&kprobe_tl);
220 return 0;
221}
222late_initcall(register_kprobe_prog_ops);
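Everything a kprobe program can do is defined by kprobe_prog_func_proto() and the 0/1 contract documented on trace_call_bpf(): return 0 to filter the event out, 1 to let the kprobe event reach the ring buffer. A sketch of such a program in restricted C follows; the clang -target bpf build, the helper-pointer idiom and the x86_64 register layout (ctx->di as the first argument) are assumptions, not part of this file.

/* Restricted-C sketch of a kprobe BPF program; helper declaration and
 * register access are assumptions for illustration. */
#include <uapi/linux/bpf.h>
#include <linux/ptrace.h>

static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
	(void *) BPF_FUNC_trace_printk;

int bpf_prog1(struct pt_regs *ctx)
{
	char fmt[] = "write() called, fd %ld\n";
	long fd = ctx->di;

	/* Only the specifiers whitelisted in bpf_trace_printk() may appear. */
	bpf_trace_printk(fmt, sizeof(fmt), fd);

	/* 0 = filter the event out, 1 = store the kprobe event. */
	return fd == 1 ? 1 : 0;
}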
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9ba3f43f580e..d0ce590f06e1 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1135,11 +1135,15 @@ static void
1135kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) 1135kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1136{ 1136{
1137 struct ftrace_event_call *call = &tk->tp.call; 1137 struct ftrace_event_call *call = &tk->tp.call;
1138 struct bpf_prog *prog = call->prog;
1138 struct kprobe_trace_entry_head *entry; 1139 struct kprobe_trace_entry_head *entry;
1139 struct hlist_head *head; 1140 struct hlist_head *head;
1140 int size, __size, dsize; 1141 int size, __size, dsize;
1141 int rctx; 1142 int rctx;
1142 1143
1144 if (prog && !trace_call_bpf(prog, regs))
1145 return;
1146
1143 head = this_cpu_ptr(call->perf_events); 1147 head = this_cpu_ptr(call->perf_events);
1144 if (hlist_empty(head)) 1148 if (hlist_empty(head))
1145 return; 1149 return;
@@ -1166,11 +1170,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1166 struct pt_regs *regs) 1170 struct pt_regs *regs)
1167{ 1171{
1168 struct ftrace_event_call *call = &tk->tp.call; 1172 struct ftrace_event_call *call = &tk->tp.call;
1173 struct bpf_prog *prog = call->prog;
1169 struct kretprobe_trace_entry_head *entry; 1174 struct kretprobe_trace_entry_head *entry;
1170 struct hlist_head *head; 1175 struct hlist_head *head;
1171 int size, __size, dsize; 1176 int size, __size, dsize;
1172 int rctx; 1177 int rctx;
1173 1178
1179 if (prog && !trace_call_bpf(prog, regs))
1180 return;
1181
1174 head = this_cpu_ptr(call->perf_events); 1182 head = this_cpu_ptr(call->perf_events);
1175 if (hlist_empty(head)) 1183 if (hlist_empty(head))
1176 return; 1184 return;
@@ -1287,7 +1295,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
1287 kfree(call->print_fmt); 1295 kfree(call->print_fmt);
1288 return -ENODEV; 1296 return -ENODEV;
1289 } 1297 }
1290 call->flags = 0; 1298 call->flags = TRACE_EVENT_FL_KPROBE;
1291 call->class->reg = kprobe_register; 1299 call->class->reg = kprobe_register;
1292 call->data = tk; 1300 call->data = tk;
1293 ret = trace_add_event_call(call); 1301 ret = trace_add_event_call(call);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 74865465e0b7..d60fe62ec4fa 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1006,7 +1006,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
1006 return true; 1006 return true;
1007 1007
1008 list_for_each_entry(event, &filter->perf_events, hw.tp_list) { 1008 list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
1009 if (event->hw.tp_target->mm == mm) 1009 if (event->hw.target->mm == mm)
1010 return true; 1010 return true;
1011 } 1011 }
1012 1012
@@ -1016,7 +1016,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
1016static inline bool 1016static inline bool
1017uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) 1017uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
1018{ 1018{
1019 return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); 1019 return __uprobe_perf_filter(&tu->filter, event->hw.target->mm);
1020} 1020}
1021 1021
1022static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) 1022static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
@@ -1024,10 +1024,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
1024 bool done; 1024 bool done;
1025 1025
1026 write_lock(&tu->filter.rwlock); 1026 write_lock(&tu->filter.rwlock);
1027 if (event->hw.tp_target) { 1027 if (event->hw.target) {
1028 list_del(&event->hw.tp_list); 1028 list_del(&event->hw.tp_list);
1029 done = tu->filter.nr_systemwide || 1029 done = tu->filter.nr_systemwide ||
1030 (event->hw.tp_target->flags & PF_EXITING) || 1030 (event->hw.target->flags & PF_EXITING) ||
1031 uprobe_filter_event(tu, event); 1031 uprobe_filter_event(tu, event);
1032 } else { 1032 } else {
1033 tu->filter.nr_systemwide--; 1033 tu->filter.nr_systemwide--;
@@ -1047,7 +1047,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
1047 int err; 1047 int err;
1048 1048
1049 write_lock(&tu->filter.rwlock); 1049 write_lock(&tu->filter.rwlock);
1050 if (event->hw.tp_target) { 1050 if (event->hw.target) {
1051 /* 1051 /*
1052 * event->parent != NULL means copy_process(), we can avoid 1052 * event->parent != NULL means copy_process(), we can avoid
1053 * uprobe_apply(). current->mm must be probed and we can rely 1053 * uprobe_apply(). current->mm must be probed and we can rely
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3174bf8e3538..9a056f5bc02c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -567,9 +567,37 @@ static void watchdog_nmi_disable(unsigned int cpu)
567 cpu0_err = 0; 567 cpu0_err = 0;
568 } 568 }
569} 569}
570
571void watchdog_nmi_enable_all(void)
572{
573 int cpu;
574
575 if (!watchdog_user_enabled)
576 return;
577
578 get_online_cpus();
579 for_each_online_cpu(cpu)
580 watchdog_nmi_enable(cpu);
581 put_online_cpus();
582}
583
584void watchdog_nmi_disable_all(void)
585{
586 int cpu;
587
588 if (!watchdog_running)
589 return;
590
591 get_online_cpus();
592 for_each_online_cpu(cpu)
593 watchdog_nmi_disable(cpu);
594 put_online_cpus();
595}
570#else 596#else
571static int watchdog_nmi_enable(unsigned int cpu) { return 0; } 597static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
572static void watchdog_nmi_disable(unsigned int cpu) { return; } 598static void watchdog_nmi_disable(unsigned int cpu) { return; }
599void watchdog_nmi_enable_all(void) {}
600void watchdog_nmi_disable_all(void) {}
573#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 601#endif /* CONFIG_HARDLOCKUP_DETECTOR */
574 602
575static struct smp_hotplug_thread watchdog_threads = { 603static struct smp_hotplug_thread watchdog_threads = {