Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/syscall.c          |   7
-rw-r--r--  kernel/events/core.c          | 752
-rw-r--r--  kernel/events/hw_breakpoint.c |   8
-rw-r--r--  kernel/events/internal.h      |  33
-rw-r--r--  kernel/events/ring_buffer.c   | 327
-rw-r--r--  kernel/trace/Kconfig          |   8
-rw-r--r--  kernel/trace/Makefile         |   1
-rw-r--r--  kernel/trace/bpf_trace.c      | 222
-rw-r--r--  kernel/trace/trace_kprobe.c   |  10
-rw-r--r--  kernel/trace/trace_uprobe.c   |  10
-rw-r--r--  kernel/watchdog.c             |  28
11 files changed, 1226 insertions(+), 180 deletions(-)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 536edc2be307..504c10b990ef 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/license.h>
 #include <linux/filter.h>
+#include <linux/version.h>
 
 static LIST_HEAD(bpf_map_types);
 
@@ -467,7 +468,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD log_buf
+#define BPF_PROG_LOAD_LAST_FIELD kern_version
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -492,6 +493,10 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (attr->insn_cnt >= BPF_MAXINSNS)
 		return -EINVAL;
 
+	if (type == BPF_PROG_TYPE_KPROBE &&
+	    attr->kern_version != LINUX_VERSION_CODE)
+		return -EINVAL;
+
 	/* plain bpf_prog allocation */
 	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
 	if (!prog)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2fabc0627165..06917d537302 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -34,14 +34,16 @@
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/anon_inodes.h> 35#include <linux/anon_inodes.h>
36#include <linux/kernel_stat.h> 36#include <linux/kernel_stat.h>
37#include <linux/cgroup.h>
37#include <linux/perf_event.h> 38#include <linux/perf_event.h>
38#include <linux/ftrace_event.h> 39#include <linux/ftrace_event.h>
39#include <linux/hw_breakpoint.h> 40#include <linux/hw_breakpoint.h>
40#include <linux/mm_types.h> 41#include <linux/mm_types.h>
41#include <linux/cgroup.h>
42#include <linux/module.h> 42#include <linux/module.h>
43#include <linux/mman.h> 43#include <linux/mman.h>
44#include <linux/compat.h> 44#include <linux/compat.h>
45#include <linux/bpf.h>
46#include <linux/filter.h>
45 47
46#include "internal.h" 48#include "internal.h"
47 49
@@ -153,7 +155,7 @@ enum event_type_t {
153 */ 155 */
154struct static_key_deferred perf_sched_events __read_mostly; 156struct static_key_deferred perf_sched_events __read_mostly;
155static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 157static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
156static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); 158static DEFINE_PER_CPU(int, perf_sched_cb_usages);
157 159
158static atomic_t nr_mmap_events __read_mostly; 160static atomic_t nr_mmap_events __read_mostly;
159static atomic_t nr_comm_events __read_mostly; 161static atomic_t nr_comm_events __read_mostly;
@@ -327,6 +329,11 @@ static inline u64 perf_clock(void)
327 return local_clock(); 329 return local_clock();
328} 330}
329 331
332static inline u64 perf_event_clock(struct perf_event *event)
333{
334 return event->clock();
335}
336
330static inline struct perf_cpu_context * 337static inline struct perf_cpu_context *
331__get_cpu_context(struct perf_event_context *ctx) 338__get_cpu_context(struct perf_event_context *ctx)
332{ 339{
@@ -351,32 +358,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
351 358
352#ifdef CONFIG_CGROUP_PERF 359#ifdef CONFIG_CGROUP_PERF
353 360
354/*
355 * perf_cgroup_info keeps track of time_enabled for a cgroup.
356 * This is a per-cpu dynamically allocated data structure.
357 */
358struct perf_cgroup_info {
359 u64 time;
360 u64 timestamp;
361};
362
363struct perf_cgroup {
364 struct cgroup_subsys_state css;
365 struct perf_cgroup_info __percpu *info;
366};
367
368/*
369 * Must ensure cgroup is pinned (css_get) before calling
370 * this function. In other words, we cannot call this function
371 * if there is no cgroup event for the current CPU context.
372 */
373static inline struct perf_cgroup *
374perf_cgroup_from_task(struct task_struct *task)
375{
376 return container_of(task_css(task, perf_event_cgrp_id),
377 struct perf_cgroup, css);
378}
379
380static inline bool 361static inline bool
381perf_cgroup_match(struct perf_event *event) 362perf_cgroup_match(struct perf_event *event)
382{ 363{
@@ -905,6 +886,15 @@ static void get_ctx(struct perf_event_context *ctx)
905 WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); 886 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
906} 887}
907 888
889static void free_ctx(struct rcu_head *head)
890{
891 struct perf_event_context *ctx;
892
893 ctx = container_of(head, struct perf_event_context, rcu_head);
894 kfree(ctx->task_ctx_data);
895 kfree(ctx);
896}
897
908static void put_ctx(struct perf_event_context *ctx) 898static void put_ctx(struct perf_event_context *ctx)
909{ 899{
910 if (atomic_dec_and_test(&ctx->refcount)) { 900 if (atomic_dec_and_test(&ctx->refcount)) {
@@ -912,7 +902,7 @@ static void put_ctx(struct perf_event_context *ctx)
912 put_ctx(ctx->parent_ctx); 902 put_ctx(ctx->parent_ctx);
913 if (ctx->task) 903 if (ctx->task)
914 put_task_struct(ctx->task); 904 put_task_struct(ctx->task);
915 kfree_rcu(ctx, rcu_head); 905 call_rcu(&ctx->rcu_head, free_ctx);
916 } 906 }
917} 907}
918 908
@@ -1239,9 +1229,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1239 if (is_cgroup_event(event)) 1229 if (is_cgroup_event(event))
1240 ctx->nr_cgroups++; 1230 ctx->nr_cgroups++;
1241 1231
1242 if (has_branch_stack(event))
1243 ctx->nr_branch_stack++;
1244
1245 list_add_rcu(&event->event_entry, &ctx->event_list); 1232 list_add_rcu(&event->event_entry, &ctx->event_list);
1246 ctx->nr_events++; 1233 ctx->nr_events++;
1247 if (event->attr.inherit_stat) 1234 if (event->attr.inherit_stat)
@@ -1408,9 +1395,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1408 cpuctx->cgrp = NULL; 1395 cpuctx->cgrp = NULL;
1409 } 1396 }
1410 1397
1411 if (has_branch_stack(event))
1412 ctx->nr_branch_stack--;
1413
1414 ctx->nr_events--; 1398 ctx->nr_events--;
1415 if (event->attr.inherit_stat) 1399 if (event->attr.inherit_stat)
1416 ctx->nr_stat--; 1400 ctx->nr_stat--;
@@ -1847,6 +1831,7 @@ static void perf_set_shadow_time(struct perf_event *event,
1847#define MAX_INTERRUPTS (~0ULL) 1831#define MAX_INTERRUPTS (~0ULL)
1848 1832
1849static void perf_log_throttle(struct perf_event *event, int enable); 1833static void perf_log_throttle(struct perf_event *event, int enable);
1834static void perf_log_itrace_start(struct perf_event *event);
1850 1835
1851static int 1836static int
1852event_sched_in(struct perf_event *event, 1837event_sched_in(struct perf_event *event,
@@ -1881,6 +1866,12 @@ event_sched_in(struct perf_event *event,
1881 1866
1882 perf_pmu_disable(event->pmu); 1867 perf_pmu_disable(event->pmu);
1883 1868
1869 event->tstamp_running += tstamp - event->tstamp_stopped;
1870
1871 perf_set_shadow_time(event, ctx, tstamp);
1872
1873 perf_log_itrace_start(event);
1874
1884 if (event->pmu->add(event, PERF_EF_START)) { 1875 if (event->pmu->add(event, PERF_EF_START)) {
1885 event->state = PERF_EVENT_STATE_INACTIVE; 1876 event->state = PERF_EVENT_STATE_INACTIVE;
1886 event->oncpu = -1; 1877 event->oncpu = -1;
@@ -1888,10 +1879,6 @@ event_sched_in(struct perf_event *event,
1888 goto out; 1879 goto out;
1889 } 1880 }
1890 1881
1891 event->tstamp_running += tstamp - event->tstamp_stopped;
1892
1893 perf_set_shadow_time(event, ctx, tstamp);
1894
1895 if (!is_software_event(event)) 1882 if (!is_software_event(event))
1896 cpuctx->active_oncpu++; 1883 cpuctx->active_oncpu++;
1897 if (!ctx->nr_active++) 1884 if (!ctx->nr_active++)
@@ -2559,6 +2546,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2559 next->perf_event_ctxp[ctxn] = ctx; 2546 next->perf_event_ctxp[ctxn] = ctx;
2560 ctx->task = next; 2547 ctx->task = next;
2561 next_ctx->task = task; 2548 next_ctx->task = task;
2549
2550 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2551
2562 do_switch = 0; 2552 do_switch = 0;
2563 2553
2564 perf_event_sync_stat(ctx, next_ctx); 2554 perf_event_sync_stat(ctx, next_ctx);
@@ -2577,6 +2567,56 @@ unlock:
2577 } 2567 }
2578} 2568}
2579 2569
2570void perf_sched_cb_dec(struct pmu *pmu)
2571{
2572 this_cpu_dec(perf_sched_cb_usages);
2573}
2574
2575void perf_sched_cb_inc(struct pmu *pmu)
2576{
2577 this_cpu_inc(perf_sched_cb_usages);
2578}
2579
2580/*
2581 * This function provides the context switch callback to the lower code
2582 * layer. It is invoked ONLY when the context switch callback is enabled.
2583 */
2584static void perf_pmu_sched_task(struct task_struct *prev,
2585 struct task_struct *next,
2586 bool sched_in)
2587{
2588 struct perf_cpu_context *cpuctx;
2589 struct pmu *pmu;
2590 unsigned long flags;
2591
2592 if (prev == next)
2593 return;
2594
2595 local_irq_save(flags);
2596
2597 rcu_read_lock();
2598
2599 list_for_each_entry_rcu(pmu, &pmus, entry) {
2600 if (pmu->sched_task) {
2601 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2602
2603 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2604
2605 perf_pmu_disable(pmu);
2606
2607 pmu->sched_task(cpuctx->task_ctx, sched_in);
2608
2609 perf_pmu_enable(pmu);
2610
2611 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2612 }
2613 }
2614
2615 rcu_read_unlock();
2616
2617 local_irq_restore(flags);
2618}
2619
2580#define for_each_task_context_nr(ctxn) \ 2620#define for_each_task_context_nr(ctxn) \
2581 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) 2621 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2582 2622
@@ -2596,6 +2636,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
2596{ 2636{
2597 int ctxn; 2637 int ctxn;
2598 2638
2639 if (__this_cpu_read(perf_sched_cb_usages))
2640 perf_pmu_sched_task(task, next, false);
2641
2599 for_each_task_context_nr(ctxn) 2642 for_each_task_context_nr(ctxn)
2600 perf_event_context_sched_out(task, ctxn, next); 2643 perf_event_context_sched_out(task, ctxn, next);
2601 2644
@@ -2755,64 +2798,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2755} 2798}
2756 2799
2757/* 2800/*
2758 * When sampling the branck stack in system-wide, it may be necessary
2759 * to flush the stack on context switch. This happens when the branch
2760 * stack does not tag its entries with the pid of the current task.
2761 * Otherwise it becomes impossible to associate a branch entry with a
2762 * task. This ambiguity is more likely to appear when the branch stack
2763 * supports priv level filtering and the user sets it to monitor only
2764 * at the user level (which could be a useful measurement in system-wide
2765 * mode). In that case, the risk is high of having a branch stack with
2766 * branch from multiple tasks. Flushing may mean dropping the existing
2767 * entries or stashing them somewhere in the PMU specific code layer.
2768 *
2769 * This function provides the context switch callback to the lower code
2770 * layer. It is invoked ONLY when there is at least one system-wide context
2771 * with at least one active event using taken branch sampling.
2772 */
2773static void perf_branch_stack_sched_in(struct task_struct *prev,
2774 struct task_struct *task)
2775{
2776 struct perf_cpu_context *cpuctx;
2777 struct pmu *pmu;
2778 unsigned long flags;
2779
2780 /* no need to flush branch stack if not changing task */
2781 if (prev == task)
2782 return;
2783
2784 local_irq_save(flags);
2785
2786 rcu_read_lock();
2787
2788 list_for_each_entry_rcu(pmu, &pmus, entry) {
2789 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2790
2791 /*
2792 * check if the context has at least one
2793 * event using PERF_SAMPLE_BRANCH_STACK
2794 */
2795 if (cpuctx->ctx.nr_branch_stack > 0
2796 && pmu->flush_branch_stack) {
2797
2798 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2799
2800 perf_pmu_disable(pmu);
2801
2802 pmu->flush_branch_stack();
2803
2804 perf_pmu_enable(pmu);
2805
2806 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2807 }
2808 }
2809
2810 rcu_read_unlock();
2811
2812 local_irq_restore(flags);
2813}
2814
2815/*
2816 * Called from scheduler to add the events of the current task 2801 * Called from scheduler to add the events of the current task
2817 * with interrupts disabled. 2802 * with interrupts disabled.
2818 * 2803 *
@@ -2844,9 +2829,8 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2844 if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) 2829 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2845 perf_cgroup_sched_in(prev, task); 2830 perf_cgroup_sched_in(prev, task);
2846 2831
2847 /* check for system-wide branch_stack events */ 2832 if (__this_cpu_read(perf_sched_cb_usages))
2848 if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) 2833 perf_pmu_sched_task(prev, task, true);
2849 perf_branch_stack_sched_in(prev, task);
2850} 2834}
2851 2835
2852static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2836static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -3220,7 +3204,10 @@ static void __perf_event_read(void *info)
3220 3204
3221static inline u64 perf_event_count(struct perf_event *event) 3205static inline u64 perf_event_count(struct perf_event *event)
3222{ 3206{
3223 return local64_read(&event->count) + atomic64_read(&event->child_count); 3207 if (event->pmu->count)
3208 return event->pmu->count(event);
3209
3210 return __perf_event_count(event);
3224} 3211}
3225 3212
3226static u64 perf_event_read(struct perf_event *event) 3213static u64 perf_event_read(struct perf_event *event)
@@ -3321,12 +3308,15 @@ errout:
3321 * Returns a matching context with refcount and pincount. 3308 * Returns a matching context with refcount and pincount.
3322 */ 3309 */
3323static struct perf_event_context * 3310static struct perf_event_context *
3324find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 3311find_get_context(struct pmu *pmu, struct task_struct *task,
3312 struct perf_event *event)
3325{ 3313{
3326 struct perf_event_context *ctx, *clone_ctx = NULL; 3314 struct perf_event_context *ctx, *clone_ctx = NULL;
3327 struct perf_cpu_context *cpuctx; 3315 struct perf_cpu_context *cpuctx;
3316 void *task_ctx_data = NULL;
3328 unsigned long flags; 3317 unsigned long flags;
3329 int ctxn, err; 3318 int ctxn, err;
3319 int cpu = event->cpu;
3330 3320
3331 if (!task) { 3321 if (!task) {
3332 /* Must be root to operate on a CPU event: */ 3322 /* Must be root to operate on a CPU event: */
@@ -3354,11 +3344,24 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
3354 if (ctxn < 0) 3344 if (ctxn < 0)
3355 goto errout; 3345 goto errout;
3356 3346
3347 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3348 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3349 if (!task_ctx_data) {
3350 err = -ENOMEM;
3351 goto errout;
3352 }
3353 }
3354
3357retry: 3355retry:
3358 ctx = perf_lock_task_context(task, ctxn, &flags); 3356 ctx = perf_lock_task_context(task, ctxn, &flags);
3359 if (ctx) { 3357 if (ctx) {
3360 clone_ctx = unclone_ctx(ctx); 3358 clone_ctx = unclone_ctx(ctx);
3361 ++ctx->pin_count; 3359 ++ctx->pin_count;
3360
3361 if (task_ctx_data && !ctx->task_ctx_data) {
3362 ctx->task_ctx_data = task_ctx_data;
3363 task_ctx_data = NULL;
3364 }
3362 raw_spin_unlock_irqrestore(&ctx->lock, flags); 3365 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3363 3366
3364 if (clone_ctx) 3367 if (clone_ctx)
@@ -3369,6 +3372,11 @@ retry:
3369 if (!ctx) 3372 if (!ctx)
3370 goto errout; 3373 goto errout;
3371 3374
3375 if (task_ctx_data) {
3376 ctx->task_ctx_data = task_ctx_data;
3377 task_ctx_data = NULL;
3378 }
3379
3372 err = 0; 3380 err = 0;
3373 mutex_lock(&task->perf_event_mutex); 3381 mutex_lock(&task->perf_event_mutex);
3374 /* 3382 /*
@@ -3395,13 +3403,16 @@ retry:
3395 } 3403 }
3396 } 3404 }
3397 3405
3406 kfree(task_ctx_data);
3398 return ctx; 3407 return ctx;
3399 3408
3400errout: 3409errout:
3410 kfree(task_ctx_data);
3401 return ERR_PTR(err); 3411 return ERR_PTR(err);
3402} 3412}
3403 3413
3404static void perf_event_free_filter(struct perf_event *event); 3414static void perf_event_free_filter(struct perf_event *event);
3415static void perf_event_free_bpf_prog(struct perf_event *event);
3405 3416
3406static void free_event_rcu(struct rcu_head *head) 3417static void free_event_rcu(struct rcu_head *head)
3407{ 3418{
@@ -3411,10 +3422,10 @@ static void free_event_rcu(struct rcu_head *head)
3411 if (event->ns) 3422 if (event->ns)
3412 put_pid_ns(event->ns); 3423 put_pid_ns(event->ns);
3413 perf_event_free_filter(event); 3424 perf_event_free_filter(event);
3425 perf_event_free_bpf_prog(event);
3414 kfree(event); 3426 kfree(event);
3415} 3427}
3416 3428
3417static void ring_buffer_put(struct ring_buffer *rb);
3418static void ring_buffer_attach(struct perf_event *event, 3429static void ring_buffer_attach(struct perf_event *event,
3419 struct ring_buffer *rb); 3430 struct ring_buffer *rb);
3420 3431
@@ -3423,10 +3434,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
3423 if (event->parent) 3434 if (event->parent)
3424 return; 3435 return;
3425 3436
3426 if (has_branch_stack(event)) {
3427 if (!(event->attach_state & PERF_ATTACH_TASK))
3428 atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
3429 }
3430 if (is_cgroup_event(event)) 3437 if (is_cgroup_event(event))
3431 atomic_dec(&per_cpu(perf_cgroup_events, cpu)); 3438 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3432} 3439}
@@ -3454,6 +3461,91 @@ static void unaccount_event(struct perf_event *event)
3454 unaccount_event_cpu(event, event->cpu); 3461 unaccount_event_cpu(event, event->cpu);
3455} 3462}
3456 3463
3464/*
3465 * The following implement mutual exclusion of events on "exclusive" pmus
3466 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
3467 * at a time, so we disallow creating events that might conflict, namely:
3468 *
3469 * 1) cpu-wide events in the presence of per-task events,
3470 * 2) per-task events in the presence of cpu-wide events,
3471 * 3) two matching events on the same context.
3472 *
3473 * The former two cases are handled in the allocation path (perf_event_alloc(),
3474 * __free_event()), the latter -- before the first perf_install_in_context().
3475 */
3476static int exclusive_event_init(struct perf_event *event)
3477{
3478 struct pmu *pmu = event->pmu;
3479
3480 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3481 return 0;
3482
3483 /*
3484 * Prevent co-existence of per-task and cpu-wide events on the
3485 * same exclusive pmu.
3486 *
3487 * Negative pmu::exclusive_cnt means there are cpu-wide
3488 * events on this "exclusive" pmu, positive means there are
3489 * per-task events.
3490 *
3491 * Since this is called in perf_event_alloc() path, event::ctx
3492 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
3493 * to mean "per-task event", because unlike other attach states it
3494 * never gets cleared.
3495 */
3496 if (event->attach_state & PERF_ATTACH_TASK) {
3497 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3498 return -EBUSY;
3499 } else {
3500 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3501 return -EBUSY;
3502 }
3503
3504 return 0;
3505}
3506
3507static void exclusive_event_destroy(struct perf_event *event)
3508{
3509 struct pmu *pmu = event->pmu;
3510
3511 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3512 return;
3513
3514 /* see comment in exclusive_event_init() */
3515 if (event->attach_state & PERF_ATTACH_TASK)
3516 atomic_dec(&pmu->exclusive_cnt);
3517 else
3518 atomic_inc(&pmu->exclusive_cnt);
3519}
3520
3521static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3522{
3523 if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
3524 (e1->cpu == e2->cpu ||
3525 e1->cpu == -1 ||
3526 e2->cpu == -1))
3527 return true;
3528 return false;
3529}
3530
3531/* Called under the same ctx::mutex as perf_install_in_context() */
3532static bool exclusive_event_installable(struct perf_event *event,
3533 struct perf_event_context *ctx)
3534{
3535 struct perf_event *iter_event;
3536 struct pmu *pmu = event->pmu;
3537
3538 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3539 return true;
3540
3541 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
3542 if (exclusive_event_match(iter_event, event))
3543 return false;
3544 }
3545
3546 return true;
3547}
3548
3457static void __free_event(struct perf_event *event) 3549static void __free_event(struct perf_event *event)
3458{ 3550{
3459 if (!event->parent) { 3551 if (!event->parent) {
@@ -3467,8 +3559,10 @@ static void __free_event(struct perf_event *event)
3467 if (event->ctx) 3559 if (event->ctx)
3468 put_ctx(event->ctx); 3560 put_ctx(event->ctx);
3469 3561
3470 if (event->pmu) 3562 if (event->pmu) {
3563 exclusive_event_destroy(event);
3471 module_put(event->pmu->module); 3564 module_put(event->pmu->module);
3565 }
3472 3566
3473 call_rcu(&event->rcu_head, free_event_rcu); 3567 call_rcu(&event->rcu_head, free_event_rcu);
3474} 3568}
@@ -3927,6 +4021,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
3927static int perf_event_set_output(struct perf_event *event, 4021static int perf_event_set_output(struct perf_event *event,
3928 struct perf_event *output_event); 4022 struct perf_event *output_event);
3929static int perf_event_set_filter(struct perf_event *event, void __user *arg); 4023static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4024static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
3930 4025
3931static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) 4026static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
3932{ 4027{
@@ -3980,6 +4075,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
3980 case PERF_EVENT_IOC_SET_FILTER: 4075 case PERF_EVENT_IOC_SET_FILTER:
3981 return perf_event_set_filter(event, (void __user *)arg); 4076 return perf_event_set_filter(event, (void __user *)arg);
3982 4077
4078 case PERF_EVENT_IOC_SET_BPF:
4079 return perf_event_set_bpf_prog(event, arg);
4080
3983 default: 4081 default:
3984 return -ENOTTY; 4082 return -ENOTTY;
3985 } 4083 }
@@ -4096,6 +4194,8 @@ static void perf_event_init_userpage(struct perf_event *event)
4096 /* Allow new userspace to detect that bit 0 is deprecated */ 4194 /* Allow new userspace to detect that bit 0 is deprecated */
4097 userpg->cap_bit0_is_deprecated = 1; 4195 userpg->cap_bit0_is_deprecated = 1;
4098 userpg->size = offsetof(struct perf_event_mmap_page, __reserved); 4196 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4197 userpg->data_offset = PAGE_SIZE;
4198 userpg->data_size = perf_data_size(rb);
4099 4199
4100unlock: 4200unlock:
4101 rcu_read_unlock(); 4201 rcu_read_unlock();
@@ -4263,7 +4363,7 @@ static void rb_free_rcu(struct rcu_head *rcu_head)
4263 rb_free(rb); 4363 rb_free(rb);
4264} 4364}
4265 4365
4266static struct ring_buffer *ring_buffer_get(struct perf_event *event) 4366struct ring_buffer *ring_buffer_get(struct perf_event *event)
4267{ 4367{
4268 struct ring_buffer *rb; 4368 struct ring_buffer *rb;
4269 4369
@@ -4278,7 +4378,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
4278 return rb; 4378 return rb;
4279} 4379}
4280 4380
4281static void ring_buffer_put(struct ring_buffer *rb) 4381void ring_buffer_put(struct ring_buffer *rb)
4282{ 4382{
4283 if (!atomic_dec_and_test(&rb->refcount)) 4383 if (!atomic_dec_and_test(&rb->refcount))
4284 return; 4384 return;
@@ -4295,6 +4395,9 @@ static void perf_mmap_open(struct vm_area_struct *vma)
4295 atomic_inc(&event->mmap_count); 4395 atomic_inc(&event->mmap_count);
4296 atomic_inc(&event->rb->mmap_count); 4396 atomic_inc(&event->rb->mmap_count);
4297 4397
4398 if (vma->vm_pgoff)
4399 atomic_inc(&event->rb->aux_mmap_count);
4400
4298 if (event->pmu->event_mapped) 4401 if (event->pmu->event_mapped)
4299 event->pmu->event_mapped(event); 4402 event->pmu->event_mapped(event);
4300} 4403}
@@ -4319,6 +4422,20 @@ static void perf_mmap_close(struct vm_area_struct *vma)
4319 if (event->pmu->event_unmapped) 4422 if (event->pmu->event_unmapped)
4320 event->pmu->event_unmapped(event); 4423 event->pmu->event_unmapped(event);
4321 4424
4425 /*
4426 * rb->aux_mmap_count will always drop before rb->mmap_count and
4427 * event->mmap_count, so it is ok to use event->mmap_mutex to
4428 * serialize with perf_mmap here.
4429 */
4430 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
4431 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
4432 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
4433 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
4434
4435 rb_free_aux(rb);
4436 mutex_unlock(&event->mmap_mutex);
4437 }
4438
4322 atomic_dec(&rb->mmap_count); 4439 atomic_dec(&rb->mmap_count);
4323 4440
4324 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 4441 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4392,7 +4509,7 @@ out_put:
4392 4509
4393static const struct vm_operations_struct perf_mmap_vmops = { 4510static const struct vm_operations_struct perf_mmap_vmops = {
4394 .open = perf_mmap_open, 4511 .open = perf_mmap_open,
4395 .close = perf_mmap_close, 4512 .close = perf_mmap_close, /* non mergable */
4396 .fault = perf_mmap_fault, 4513 .fault = perf_mmap_fault,
4397 .page_mkwrite = perf_mmap_fault, 4514 .page_mkwrite = perf_mmap_fault,
4398}; 4515};
@@ -4403,10 +4520,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4403 unsigned long user_locked, user_lock_limit; 4520 unsigned long user_locked, user_lock_limit;
4404 struct user_struct *user = current_user(); 4521 struct user_struct *user = current_user();
4405 unsigned long locked, lock_limit; 4522 unsigned long locked, lock_limit;
4406 struct ring_buffer *rb; 4523 struct ring_buffer *rb = NULL;
4407 unsigned long vma_size; 4524 unsigned long vma_size;
4408 unsigned long nr_pages; 4525 unsigned long nr_pages;
4409 long user_extra, extra; 4526 long user_extra = 0, extra = 0;
4410 int ret = 0, flags = 0; 4527 int ret = 0, flags = 0;
4411 4528
4412 /* 4529 /*
@@ -4421,7 +4538,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4421 return -EINVAL; 4538 return -EINVAL;
4422 4539
4423 vma_size = vma->vm_end - vma->vm_start; 4540 vma_size = vma->vm_end - vma->vm_start;
4424 nr_pages = (vma_size / PAGE_SIZE) - 1; 4541
4542 if (vma->vm_pgoff == 0) {
4543 nr_pages = (vma_size / PAGE_SIZE) - 1;
4544 } else {
4545 /*
4546 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
4547 * mapped, all subsequent mappings should have the same size
4548 * and offset. Must be above the normal perf buffer.
4549 */
4550 u64 aux_offset, aux_size;
4551
4552 if (!event->rb)
4553 return -EINVAL;
4554
4555 nr_pages = vma_size / PAGE_SIZE;
4556
4557 mutex_lock(&event->mmap_mutex);
4558 ret = -EINVAL;
4559
4560 rb = event->rb;
4561 if (!rb)
4562 goto aux_unlock;
4563
4564 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
4565 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
4566
4567 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
4568 goto aux_unlock;
4569
4570 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
4571 goto aux_unlock;
4572
4573 /* already mapped with a different offset */
4574 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
4575 goto aux_unlock;
4576
4577 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
4578 goto aux_unlock;
4579
4580 /* already mapped with a different size */
4581 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
4582 goto aux_unlock;
4583
4584 if (!is_power_of_2(nr_pages))
4585 goto aux_unlock;
4586
4587 if (!atomic_inc_not_zero(&rb->mmap_count))
4588 goto aux_unlock;
4589
4590 if (rb_has_aux(rb)) {
4591 atomic_inc(&rb->aux_mmap_count);
4592 ret = 0;
4593 goto unlock;
4594 }
4595
4596 atomic_set(&rb->aux_mmap_count, 1);
4597 user_extra = nr_pages;
4598
4599 goto accounting;
4600 }
4425 4601
4426 /* 4602 /*
4427 * If we have rb pages ensure they're a power-of-two number, so we 4603 * If we have rb pages ensure they're a power-of-two number, so we
@@ -4433,9 +4609,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4433 if (vma_size != PAGE_SIZE * (1 + nr_pages)) 4609 if (vma_size != PAGE_SIZE * (1 + nr_pages))
4434 return -EINVAL; 4610 return -EINVAL;
4435 4611
4436 if (vma->vm_pgoff != 0)
4437 return -EINVAL;
4438
4439 WARN_ON_ONCE(event->ctx->parent_ctx); 4612 WARN_ON_ONCE(event->ctx->parent_ctx);
4440again: 4613again:
4441 mutex_lock(&event->mmap_mutex); 4614 mutex_lock(&event->mmap_mutex);
@@ -4459,6 +4632,8 @@ again:
4459 } 4632 }
4460 4633
4461 user_extra = nr_pages + 1; 4634 user_extra = nr_pages + 1;
4635
4636accounting:
4462 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); 4637 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
4463 4638
4464 /* 4639 /*
@@ -4468,7 +4643,6 @@ again:
4468 4643
4469 user_locked = atomic_long_read(&user->locked_vm) + user_extra; 4644 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
4470 4645
4471 extra = 0;
4472 if (user_locked > user_lock_limit) 4646 if (user_locked > user_lock_limit)
4473 extra = user_locked - user_lock_limit; 4647 extra = user_locked - user_lock_limit;
4474 4648
@@ -4482,35 +4656,46 @@ again:
4482 goto unlock; 4656 goto unlock;
4483 } 4657 }
4484 4658
4485 WARN_ON(event->rb); 4659 WARN_ON(!rb && event->rb);
4486 4660
4487 if (vma->vm_flags & VM_WRITE) 4661 if (vma->vm_flags & VM_WRITE)
4488 flags |= RING_BUFFER_WRITABLE; 4662 flags |= RING_BUFFER_WRITABLE;
4489 4663
4490 rb = rb_alloc(nr_pages,
4491 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4492 event->cpu, flags);
4493
4494 if (!rb) { 4664 if (!rb) {
4495 ret = -ENOMEM; 4665 rb = rb_alloc(nr_pages,
4496 goto unlock; 4666 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4497 } 4667 event->cpu, flags);
4498 4668
4499 atomic_set(&rb->mmap_count, 1); 4669 if (!rb) {
4500 rb->mmap_locked = extra; 4670 ret = -ENOMEM;
4501 rb->mmap_user = get_current_user(); 4671 goto unlock;
4672 }
4502 4673
4503 atomic_long_add(user_extra, &user->locked_vm); 4674 atomic_set(&rb->mmap_count, 1);
4504 vma->vm_mm->pinned_vm += extra; 4675 rb->mmap_user = get_current_user();
4676 rb->mmap_locked = extra;
4505 4677
4506 ring_buffer_attach(event, rb); 4678 ring_buffer_attach(event, rb);
4507 4679
4508 perf_event_init_userpage(event); 4680 perf_event_init_userpage(event);
4509 perf_event_update_userpage(event); 4681 perf_event_update_userpage(event);
4682 } else {
4683 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
4684 event->attr.aux_watermark, flags);
4685 if (!ret)
4686 rb->aux_mmap_locked = extra;
4687 }
4510 4688
4511unlock: 4689unlock:
4512 if (!ret) 4690 if (!ret) {
4691 atomic_long_add(user_extra, &user->locked_vm);
4692 vma->vm_mm->pinned_vm += extra;
4693
4513 atomic_inc(&event->mmap_count); 4694 atomic_inc(&event->mmap_count);
4695 } else if (rb) {
4696 atomic_dec(&rb->mmap_count);
4697 }
4698aux_unlock:
4514 mutex_unlock(&event->mmap_mutex); 4699 mutex_unlock(&event->mmap_mutex);
4515 4700
4516 /* 4701 /*
@@ -4766,7 +4951,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
4766 } 4951 }
4767 4952
4768 if (sample_type & PERF_SAMPLE_TIME) 4953 if (sample_type & PERF_SAMPLE_TIME)
4769 data->time = perf_clock(); 4954 data->time = perf_event_clock(event);
4770 4955
4771 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) 4956 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
4772 data->id = primary_event_id(event); 4957 data->id = primary_event_id(event);
@@ -5344,6 +5529,8 @@ static void perf_event_task_output(struct perf_event *event,
5344 task_event->event_id.tid = perf_event_tid(event, task); 5529 task_event->event_id.tid = perf_event_tid(event, task);
5345 task_event->event_id.ptid = perf_event_tid(event, current); 5530 task_event->event_id.ptid = perf_event_tid(event, current);
5346 5531
5532 task_event->event_id.time = perf_event_clock(event);
5533
5347 perf_output_put(&handle, task_event->event_id); 5534 perf_output_put(&handle, task_event->event_id);
5348 5535
5349 perf_event__output_id_sample(event, &handle, &sample); 5536 perf_event__output_id_sample(event, &handle, &sample);
@@ -5377,7 +5564,7 @@ static void perf_event_task(struct task_struct *task,
5377 /* .ppid */ 5564 /* .ppid */
5378 /* .tid */ 5565 /* .tid */
5379 /* .ptid */ 5566 /* .ptid */
5380 .time = perf_clock(), 5567 /* .time */
5381 }, 5568 },
5382 }; 5569 };
5383 5570
@@ -5732,6 +5919,40 @@ void perf_event_mmap(struct vm_area_struct *vma)
5732 perf_event_mmap_event(&mmap_event); 5919 perf_event_mmap_event(&mmap_event);
5733} 5920}
5734 5921
5922void perf_event_aux_event(struct perf_event *event, unsigned long head,
5923 unsigned long size, u64 flags)
5924{
5925 struct perf_output_handle handle;
5926 struct perf_sample_data sample;
5927 struct perf_aux_event {
5928 struct perf_event_header header;
5929 u64 offset;
5930 u64 size;
5931 u64 flags;
5932 } rec = {
5933 .header = {
5934 .type = PERF_RECORD_AUX,
5935 .misc = 0,
5936 .size = sizeof(rec),
5937 },
5938 .offset = head,
5939 .size = size,
5940 .flags = flags,
5941 };
5942 int ret;
5943
5944 perf_event_header__init_id(&rec.header, &sample, event);
5945 ret = perf_output_begin(&handle, event, rec.header.size);
5946
5947 if (ret)
5948 return;
5949
5950 perf_output_put(&handle, rec);
5951 perf_event__output_id_sample(event, &handle, &sample);
5952
5953 perf_output_end(&handle);
5954}
5955
5735/* 5956/*
5736 * IRQ throttle logging 5957 * IRQ throttle logging
5737 */ 5958 */
@@ -5753,7 +5974,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
5753 .misc = 0, 5974 .misc = 0,
5754 .size = sizeof(throttle_event), 5975 .size = sizeof(throttle_event),
5755 }, 5976 },
5756 .time = perf_clock(), 5977 .time = perf_event_clock(event),
5757 .id = primary_event_id(event), 5978 .id = primary_event_id(event),
5758 .stream_id = event->id, 5979 .stream_id = event->id,
5759 }; 5980 };
@@ -5773,6 +5994,44 @@ static void perf_log_throttle(struct perf_event *event, int enable)
5773 perf_output_end(&handle); 5994 perf_output_end(&handle);
5774} 5995}
5775 5996
5997static void perf_log_itrace_start(struct perf_event *event)
5998{
5999 struct perf_output_handle handle;
6000 struct perf_sample_data sample;
6001 struct perf_aux_event {
6002 struct perf_event_header header;
6003 u32 pid;
6004 u32 tid;
6005 } rec;
6006 int ret;
6007
6008 if (event->parent)
6009 event = event->parent;
6010
6011 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
6012 event->hw.itrace_started)
6013 return;
6014
6015 event->hw.itrace_started = 1;
6016
6017 rec.header.type = PERF_RECORD_ITRACE_START;
6018 rec.header.misc = 0;
6019 rec.header.size = sizeof(rec);
6020 rec.pid = perf_event_pid(event, current);
6021 rec.tid = perf_event_tid(event, current);
6022
6023 perf_event_header__init_id(&rec.header, &sample, event);
6024 ret = perf_output_begin(&handle, event, rec.header.size);
6025
6026 if (ret)
6027 return;
6028
6029 perf_output_put(&handle, rec);
6030 perf_event__output_id_sample(event, &handle, &sample);
6031
6032 perf_output_end(&handle);
6033}
6034
5776/* 6035/*
5777 * Generic event overflow handling, sampling. 6036 * Generic event overflow handling, sampling.
5778 */ 6037 */
@@ -6133,6 +6392,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
6133 } 6392 }
6134 6393
6135 hlist_add_head_rcu(&event->hlist_entry, head); 6394 hlist_add_head_rcu(&event->hlist_entry, head);
6395 perf_event_update_userpage(event);
6136 6396
6137 return 0; 6397 return 0;
6138} 6398}
@@ -6296,6 +6556,8 @@ static int perf_swevent_init(struct perf_event *event)
6296static struct pmu perf_swevent = { 6556static struct pmu perf_swevent = {
6297 .task_ctx_nr = perf_sw_context, 6557 .task_ctx_nr = perf_sw_context,
6298 6558
6559 .capabilities = PERF_PMU_CAP_NO_NMI,
6560
6299 .event_init = perf_swevent_init, 6561 .event_init = perf_swevent_init,
6300 .add = perf_swevent_add, 6562 .add = perf_swevent_add,
6301 .del = perf_swevent_del, 6563 .del = perf_swevent_del,
@@ -6449,6 +6711,49 @@ static void perf_event_free_filter(struct perf_event *event)
6449 ftrace_profile_free_filter(event); 6711 ftrace_profile_free_filter(event);
6450} 6712}
6451 6713
6714static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6715{
6716 struct bpf_prog *prog;
6717
6718 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6719 return -EINVAL;
6720
6721 if (event->tp_event->prog)
6722 return -EEXIST;
6723
6724 if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
6725 /* bpf programs can only be attached to kprobes */
6726 return -EINVAL;
6727
6728 prog = bpf_prog_get(prog_fd);
6729 if (IS_ERR(prog))
6730 return PTR_ERR(prog);
6731
6732 if (prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) {
6733 /* valid fd, but invalid bpf program type */
6734 bpf_prog_put(prog);
6735 return -EINVAL;
6736 }
6737
6738 event->tp_event->prog = prog;
6739
6740 return 0;
6741}
6742
6743static void perf_event_free_bpf_prog(struct perf_event *event)
6744{
6745 struct bpf_prog *prog;
6746
6747 if (!event->tp_event)
6748 return;
6749
6750 prog = event->tp_event->prog;
6751 if (prog) {
6752 event->tp_event->prog = NULL;
6753 bpf_prog_put(prog);
6754 }
6755}
6756
6452#else 6757#else
6453 6758
6454static inline void perf_tp_register(void) 6759static inline void perf_tp_register(void)
@@ -6464,6 +6769,14 @@ static void perf_event_free_filter(struct perf_event *event)
6464{ 6769{
6465} 6770}
6466 6771
6772static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6773{
6774 return -ENOENT;
6775}
6776
6777static void perf_event_free_bpf_prog(struct perf_event *event)
6778{
6779}
6467#endif /* CONFIG_EVENT_TRACING */ 6780#endif /* CONFIG_EVENT_TRACING */
6468 6781
6469#ifdef CONFIG_HAVE_HW_BREAKPOINT 6782#ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -6602,6 +6915,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags)
6602{ 6915{
6603 if (flags & PERF_EF_START) 6916 if (flags & PERF_EF_START)
6604 cpu_clock_event_start(event, flags); 6917 cpu_clock_event_start(event, flags);
6918 perf_event_update_userpage(event);
6605 6919
6606 return 0; 6920 return 0;
6607} 6921}
@@ -6638,6 +6952,8 @@ static int cpu_clock_event_init(struct perf_event *event)
6638static struct pmu perf_cpu_clock = { 6952static struct pmu perf_cpu_clock = {
6639 .task_ctx_nr = perf_sw_context, 6953 .task_ctx_nr = perf_sw_context,
6640 6954
6955 .capabilities = PERF_PMU_CAP_NO_NMI,
6956
6641 .event_init = cpu_clock_event_init, 6957 .event_init = cpu_clock_event_init,
6642 .add = cpu_clock_event_add, 6958 .add = cpu_clock_event_add,
6643 .del = cpu_clock_event_del, 6959 .del = cpu_clock_event_del,
@@ -6676,6 +6992,7 @@ static int task_clock_event_add(struct perf_event *event, int flags)
6676{ 6992{
6677 if (flags & PERF_EF_START) 6993 if (flags & PERF_EF_START)
6678 task_clock_event_start(event, flags); 6994 task_clock_event_start(event, flags);
6995 perf_event_update_userpage(event);
6679 6996
6680 return 0; 6997 return 0;
6681} 6998}
@@ -6716,6 +7033,8 @@ static int task_clock_event_init(struct perf_event *event)
6716static struct pmu perf_task_clock = { 7033static struct pmu perf_task_clock = {
6717 .task_ctx_nr = perf_sw_context, 7034 .task_ctx_nr = perf_sw_context,
6718 7035
7036 .capabilities = PERF_PMU_CAP_NO_NMI,
7037
6719 .event_init = task_clock_event_init, 7038 .event_init = task_clock_event_init,
6720 .add = task_clock_event_add, 7039 .add = task_clock_event_add,
6721 .del = task_clock_event_del, 7040 .del = task_clock_event_del,
@@ -6993,6 +7312,7 @@ got_cpu_context:
6993 pmu->event_idx = perf_event_idx_default; 7312 pmu->event_idx = perf_event_idx_default;
6994 7313
6995 list_add_rcu(&pmu->entry, &pmus); 7314 list_add_rcu(&pmu->entry, &pmus);
7315 atomic_set(&pmu->exclusive_cnt, 0);
6996 ret = 0; 7316 ret = 0;
6997unlock: 7317unlock:
6998 mutex_unlock(&pmus_lock); 7318 mutex_unlock(&pmus_lock);
@@ -7037,12 +7357,23 @@ EXPORT_SYMBOL_GPL(perf_pmu_unregister);
7037 7357
7038static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) 7358static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7039{ 7359{
7360 struct perf_event_context *ctx = NULL;
7040 int ret; 7361 int ret;
7041 7362
7042 if (!try_module_get(pmu->module)) 7363 if (!try_module_get(pmu->module))
7043 return -ENODEV; 7364 return -ENODEV;
7365
7366 if (event->group_leader != event) {
7367 ctx = perf_event_ctx_lock(event->group_leader);
7368 BUG_ON(!ctx);
7369 }
7370
7044 event->pmu = pmu; 7371 event->pmu = pmu;
7045 ret = pmu->event_init(event); 7372 ret = pmu->event_init(event);
7373
7374 if (ctx)
7375 perf_event_ctx_unlock(event->group_leader, ctx);
7376
7046 if (ret) 7377 if (ret)
7047 module_put(pmu->module); 7378 module_put(pmu->module);
7048 7379
@@ -7089,10 +7420,6 @@ static void account_event_cpu(struct perf_event *event, int cpu)
7089 if (event->parent) 7420 if (event->parent)
7090 return; 7421 return;
7091 7422
7092 if (has_branch_stack(event)) {
7093 if (!(event->attach_state & PERF_ATTACH_TASK))
7094 atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
7095 }
7096 if (is_cgroup_event(event)) 7423 if (is_cgroup_event(event))
7097 atomic_inc(&per_cpu(perf_cgroup_events, cpu)); 7424 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
7098} 7425}
@@ -7131,7 +7458,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
7131 struct perf_event *group_leader, 7458 struct perf_event *group_leader,
7132 struct perf_event *parent_event, 7459 struct perf_event *parent_event,
7133 perf_overflow_handler_t overflow_handler, 7460 perf_overflow_handler_t overflow_handler,
7134 void *context) 7461 void *context, int cgroup_fd)
7135{ 7462{
7136 struct pmu *pmu; 7463 struct pmu *pmu;
7137 struct perf_event *event; 7464 struct perf_event *event;
@@ -7186,18 +7513,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
7186 7513
7187 if (task) { 7514 if (task) {
7188 event->attach_state = PERF_ATTACH_TASK; 7515 event->attach_state = PERF_ATTACH_TASK;
7189
7190 if (attr->type == PERF_TYPE_TRACEPOINT)
7191 event->hw.tp_target = task;
7192#ifdef CONFIG_HAVE_HW_BREAKPOINT
7193 /* 7516 /*
7194 * hw_breakpoint is a bit difficult here.. 7517 * XXX pmu::event_init needs to know what task to account to
7518 * and we cannot use the ctx information because we need the
7519 * pmu before we get a ctx.
7195 */ 7520 */
7196 else if (attr->type == PERF_TYPE_BREAKPOINT) 7521 event->hw.target = task;
7197 event->hw.bp_target = task;
7198#endif
7199 } 7522 }
7200 7523
7524 event->clock = &local_clock;
7525 if (parent_event)
7526 event->clock = parent_event->clock;
7527
7201 if (!overflow_handler && parent_event) { 7528 if (!overflow_handler && parent_event) {
7202 overflow_handler = parent_event->overflow_handler; 7529 overflow_handler = parent_event->overflow_handler;
7203 context = parent_event->overflow_handler_context; 7530 context = parent_event->overflow_handler_context;
@@ -7224,6 +7551,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
7224 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 7551 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
7225 goto err_ns; 7552 goto err_ns;
7226 7553
7554 if (!has_branch_stack(event))
7555 event->attr.branch_sample_type = 0;
7556
7557 if (cgroup_fd != -1) {
7558 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
7559 if (err)
7560 goto err_ns;
7561 }
7562
7227 pmu = perf_init_event(event); 7563 pmu = perf_init_event(event);
7228 if (!pmu) 7564 if (!pmu)
7229 goto err_ns; 7565 goto err_ns;
@@ -7232,21 +7568,30 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
7232 goto err_ns; 7568 goto err_ns;
7233 } 7569 }
7234 7570
7571 err = exclusive_event_init(event);
7572 if (err)
7573 goto err_pmu;
7574
7235 if (!event->parent) { 7575 if (!event->parent) {
7236 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { 7576 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
7237 err = get_callchain_buffers(); 7577 err = get_callchain_buffers();
7238 if (err) 7578 if (err)
7239 goto err_pmu; 7579 goto err_per_task;
7240 } 7580 }
7241 } 7581 }
7242 7582
7243 return event; 7583 return event;
7244 7584
7585err_per_task:
7586 exclusive_event_destroy(event);
7587
7245err_pmu: 7588err_pmu:
7246 if (event->destroy) 7589 if (event->destroy)
7247 event->destroy(event); 7590 event->destroy(event);
7248 module_put(pmu->module); 7591 module_put(pmu->module);
7249err_ns: 7592err_ns:
7593 if (is_cgroup_event(event))
7594 perf_detach_cgroup(event);
7250 if (event->ns) 7595 if (event->ns)
7251 put_pid_ns(event->ns); 7596 put_pid_ns(event->ns);
7252 kfree(event); 7597 kfree(event);
@@ -7409,6 +7754,19 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
7409 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 7754 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
7410 goto out; 7755 goto out;
7411 7756
7757 /*
7758 * Mixing clocks in the same buffer is trouble you don't need.
7759 */
7760 if (output_event->clock != event->clock)
7761 goto out;
7762
7763 /*
7764 * If both events generate aux data, they must be on the same PMU
7765 */
7766 if (has_aux(event) && has_aux(output_event) &&
7767 event->pmu != output_event->pmu)
7768 goto out;
7769
7412set: 7770set:
7413 mutex_lock(&event->mmap_mutex); 7771 mutex_lock(&event->mmap_mutex);
7414 /* Can't redirect output if we've got an active mmap() */ 7772 /* Can't redirect output if we've got an active mmap() */
@@ -7441,6 +7799,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
7441 mutex_lock_nested(b, SINGLE_DEPTH_NESTING); 7799 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
7442} 7800}
7443 7801
7802static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
7803{
7804 bool nmi_safe = false;
7805
7806 switch (clk_id) {
7807 case CLOCK_MONOTONIC:
7808 event->clock = &ktime_get_mono_fast_ns;
7809 nmi_safe = true;
7810 break;
7811
7812 case CLOCK_MONOTONIC_RAW:
7813 event->clock = &ktime_get_raw_fast_ns;
7814 nmi_safe = true;
7815 break;
7816
7817 case CLOCK_REALTIME:
7818 event->clock = &ktime_get_real_ns;
7819 break;
7820
7821 case CLOCK_BOOTTIME:
7822 event->clock = &ktime_get_boot_ns;
7823 break;
7824
7825 case CLOCK_TAI:
7826 event->clock = &ktime_get_tai_ns;
7827 break;
7828
7829 default:
7830 return -EINVAL;
7831 }
7832
7833 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
7834 return -EINVAL;
7835
7836 return 0;
7837}
7838
7444/** 7839/**
7445 * sys_perf_event_open - open a performance event, associate it to a task/cpu 7840 * sys_perf_event_open - open a performance event, associate it to a task/cpu
7446 * 7841 *
@@ -7465,6 +7860,7 @@ SYSCALL_DEFINE5(perf_event_open,
7465 int move_group = 0; 7860 int move_group = 0;
7466 int err; 7861 int err;
7467 int f_flags = O_RDWR; 7862 int f_flags = O_RDWR;
7863 int cgroup_fd = -1;
7468 7864
7469 /* for future expandability... */ 7865 /* for future expandability... */
7470 if (flags & ~PERF_FLAG_ALL) 7866 if (flags & ~PERF_FLAG_ALL)
@@ -7530,21 +7926,16 @@ SYSCALL_DEFINE5(perf_event_open,
7530 7926
7531 get_online_cpus(); 7927 get_online_cpus();
7532 7928
7929 if (flags & PERF_FLAG_PID_CGROUP)
7930 cgroup_fd = pid;
7931
7533 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 7932 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
7534 NULL, NULL); 7933 NULL, NULL, cgroup_fd);
7535 if (IS_ERR(event)) { 7934 if (IS_ERR(event)) {
7536 err = PTR_ERR(event); 7935 err = PTR_ERR(event);
7537 goto err_cpus; 7936 goto err_cpus;
7538 } 7937 }
7539 7938
7540 if (flags & PERF_FLAG_PID_CGROUP) {
7541 err = perf_cgroup_connect(pid, event, &attr, group_leader);
7542 if (err) {
7543 __free_event(event);
7544 goto err_cpus;
7545 }
7546 }
7547
7548 if (is_sampling_event(event)) { 7939 if (is_sampling_event(event)) {
7549 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { 7940 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
7550 err = -ENOTSUPP; 7941 err = -ENOTSUPP;
@@ -7560,6 +7951,12 @@ SYSCALL_DEFINE5(perf_event_open,
7560 */ 7951 */
7561 pmu = event->pmu; 7952 pmu = event->pmu;
7562 7953
7954 if (attr.use_clockid) {
7955 err = perf_event_set_clock(event, attr.clockid);
7956 if (err)
7957 goto err_alloc;
7958 }
7959
7563 if (group_leader && 7960 if (group_leader &&
7564 (is_software_event(event) != is_software_event(group_leader))) { 7961 (is_software_event(event) != is_software_event(group_leader))) {
7565 if (is_software_event(event)) { 7962 if (is_software_event(event)) {
@@ -7586,12 +7983,17 @@ SYSCALL_DEFINE5(perf_event_open,
7586 /* 7983 /*
7587 * Get the target context (task or percpu): 7984 * Get the target context (task or percpu):
7588 */ 7985 */
7589 ctx = find_get_context(pmu, task, event->cpu); 7986 ctx = find_get_context(pmu, task, event);
7590 if (IS_ERR(ctx)) { 7987 if (IS_ERR(ctx)) {
7591 err = PTR_ERR(ctx); 7988 err = PTR_ERR(ctx);
7592 goto err_alloc; 7989 goto err_alloc;
7593 } 7990 }
7594 7991
7992 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
7993 err = -EBUSY;
7994 goto err_context;
7995 }
7996
7595 if (task) { 7997 if (task) {
7596 put_task_struct(task); 7998 put_task_struct(task);
7597 task = NULL; 7999 task = NULL;
@@ -7609,6 +8011,11 @@ SYSCALL_DEFINE5(perf_event_open,
7609 */ 8011 */
7610 if (group_leader->group_leader != group_leader) 8012 if (group_leader->group_leader != group_leader)
7611 goto err_context; 8013 goto err_context;
8014
8015 /* All events in a group should have the same clock */
8016 if (group_leader->clock != event->clock)
8017 goto err_context;
8018
7612 /* 8019 /*
7613 * Do not allow to attach to a group in a different 8020 * Do not allow to attach to a group in a different
7614 * task or CPU context: 8021 * task or CPU context:
@@ -7709,6 +8116,13 @@ SYSCALL_DEFINE5(perf_event_open,
7709 get_ctx(ctx); 8116 get_ctx(ctx);
7710 } 8117 }
7711 8118
8119 if (!exclusive_event_installable(event, ctx)) {
8120 err = -EBUSY;
8121 mutex_unlock(&ctx->mutex);
8122 fput(event_file);
8123 goto err_context;
8124 }
8125
7712 perf_install_in_context(ctx, event, event->cpu); 8126 perf_install_in_context(ctx, event, event->cpu);
7713 perf_unpin_context(ctx); 8127 perf_unpin_context(ctx);
7714 8128
@@ -7781,7 +8195,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7781 */ 8195 */
7782 8196
7783 event = perf_event_alloc(attr, cpu, task, NULL, NULL, 8197 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
7784 overflow_handler, context); 8198 overflow_handler, context, -1);
7785 if (IS_ERR(event)) { 8199 if (IS_ERR(event)) {
7786 err = PTR_ERR(event); 8200 err = PTR_ERR(event);
7787 goto err; 8201 goto err;
@@ -7792,7 +8206,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7792 8206
7793 account_event(event); 8207 account_event(event);
7794 8208
7795 ctx = find_get_context(event->pmu, task, cpu); 8209 ctx = find_get_context(event->pmu, task, event);
7796 if (IS_ERR(ctx)) { 8210 if (IS_ERR(ctx)) {
7797 err = PTR_ERR(ctx); 8211 err = PTR_ERR(ctx);
7798 goto err_free; 8212 goto err_free;
@@ -7800,6 +8214,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7800 8214
7801 WARN_ON_ONCE(ctx->parent_ctx); 8215 WARN_ON_ONCE(ctx->parent_ctx);
7802 mutex_lock(&ctx->mutex); 8216 mutex_lock(&ctx->mutex);
8217 if (!exclusive_event_installable(event, ctx)) {
8218 mutex_unlock(&ctx->mutex);
8219 perf_unpin_context(ctx);
8220 put_ctx(ctx);
8221 err = -EBUSY;
8222 goto err_free;
8223 }
8224
7803 perf_install_in_context(ctx, event, cpu); 8225 perf_install_in_context(ctx, event, cpu);
7804 perf_unpin_context(ctx); 8226 perf_unpin_context(ctx);
7805 mutex_unlock(&ctx->mutex); 8227 mutex_unlock(&ctx->mutex);
@@ -8142,7 +8564,7 @@ inherit_event(struct perf_event *parent_event,
8142 parent_event->cpu, 8564 parent_event->cpu,
8143 child, 8565 child,
8144 group_leader, parent_event, 8566 group_leader, parent_event,
8145 NULL, NULL); 8567 NULL, NULL, -1);
8146 if (IS_ERR(child_event)) 8568 if (IS_ERR(child_event))
8147 return child_event; 8569 return child_event;
8148 8570
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 9803a6600d49..92ce5f4ccc26 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
  */
 static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
 {
-	struct task_struct *tsk = bp->hw.bp_target;
+	struct task_struct *tsk = bp->hw.target;
 	struct perf_event *iter;
 	int count = 0;
 
 	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
-		if (iter->hw.bp_target == tsk &&
+		if (iter->hw.target == tsk &&
 		    find_slot_idx(iter) == type &&
 		    (iter->cpu < 0 || cpu == iter->cpu))
 			count += hw_breakpoint_weight(iter);
@@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 	int nr;
 
 	nr = info->cpu_pinned;
-	if (!bp->hw.bp_target)
+	if (!bp->hw.target)
 		nr += max_task_bp_pinned(cpu, type);
 	else
 		nr += task_bp_pinned(cpu, bp, type);
@@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 		weight = -weight;
 
 	/* Pinned counter cpu profiling */
-	if (!bp->hw.bp_target) {
+	if (!bp->hw.target) {
 		get_bp_info(bp->cpu, type)->cpu_pinned += weight;
 		return;
 	}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 569b218782ad..9f6ce9ba4a04 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -27,6 +27,7 @@ struct ring_buffer {
 	local_t lost;			/* nr records lost */
 
 	long watermark;			/* wakeup watermark */
+	long aux_watermark;
 	/* poll crap */
 	spinlock_t event_lock;
 	struct list_head event_list;
@@ -35,6 +36,20 @@ struct ring_buffer {
 	unsigned long mmap_locked;
 	struct user_struct *mmap_user;
 
+	/* AUX area */
+	local_t aux_head;
+	local_t aux_nest;
+	local_t aux_wakeup;
+	unsigned long aux_pgoff;
+	int aux_nr_pages;
+	int aux_overwrite;
+	atomic_t aux_mmap_count;
+	unsigned long aux_mmap_locked;
+	void (*free_aux)(void *);
+	atomic_t aux_refcount;
+	void **aux_pages;
+	void *aux_priv;
+
 	struct perf_event_mmap_page *user_page;
 	void *data_pages[0];
 };
@@ -43,6 +58,19 @@ extern void rb_free(struct ring_buffer *rb);
 extern struct ring_buffer *
 rb_alloc(int nr_pages, long watermark, int cpu, int flags);
 extern void perf_event_wakeup(struct perf_event *event);
+extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+			pgoff_t pgoff, int nr_pages, long watermark, int flags);
+extern void rb_free_aux(struct ring_buffer *rb);
+extern struct ring_buffer *ring_buffer_get(struct perf_event *event);
+extern void ring_buffer_put(struct ring_buffer *rb);
+
+static inline bool rb_has_aux(struct ring_buffer *rb)
+{
+	return !!rb->aux_nr_pages;
+}
+
+void perf_event_aux_event(struct perf_event *event, unsigned long head,
+			  unsigned long size, u64 flags);
 
 extern void
 perf_event_header__init_id(struct perf_event_header *header,
@@ -81,6 +109,11 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
 	return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
 }
 
+static inline unsigned long perf_aux_size(struct ring_buffer *rb)
+{
+	return rb->aux_nr_pages << PAGE_SHIFT;
+}
+
 #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
 static inline unsigned long \
 func_name(struct perf_output_handle *handle, \
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index eadb95ce7aac..232f00f273cb 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -243,14 +243,317 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
243 spin_lock_init(&rb->event_lock); 243 spin_lock_init(&rb->event_lock);
244} 244}
245 245
246/*
247 * This is called before hardware starts writing to the AUX area to
248 * obtain an output handle and make sure there's room in the buffer.
249 * When the capture completes, call perf_aux_output_end() to commit
250 * the recorded data to the buffer.
251 *
252 * The ordering is similar to that of perf_output_{begin,end}, with
253 * the exception of (B), which should be taken care of by the pmu
254 * driver, since ordering rules will differ depending on hardware.
255 */
256void *perf_aux_output_begin(struct perf_output_handle *handle,
257 struct perf_event *event)
258{
259 struct perf_event *output_event = event;
260 unsigned long aux_head, aux_tail;
261 struct ring_buffer *rb;
262
263 if (output_event->parent)
264 output_event = output_event->parent;
265
266 /*
267 * Since this will typically be open across pmu::add/pmu::del, we
268 * grab ring_buffer's refcount instead of holding rcu read lock
269 * to make sure it doesn't disappear under us.
270 */
271 rb = ring_buffer_get(output_event);
272 if (!rb)
273 return NULL;
274
275 if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
276 goto err;
277
278 /*
279 * Nesting is not supported for the AUX area; make sure nested
280 * writers are caught early.
281 */
282 if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
283 goto err_put;
284
285 aux_head = local_read(&rb->aux_head);
286
287 handle->rb = rb;
288 handle->event = event;
289 handle->head = aux_head;
290 handle->size = 0;
291
292 /*
293 * In overwrite mode, AUX data stores do not depend on aux_tail,
294 * therefore the (A) control dependency barrier does not exist. The
295 * (B) <-> (C) ordering is still observed by the pmu driver.
296 */
297 if (!rb->aux_overwrite) {
298 aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
299 handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark;
300 if (aux_head - aux_tail < perf_aux_size(rb))
301 handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
302
303 /*
304 * The handle->size computation depends on the aux_tail load; this forms a
305 * control dependency barrier separating the aux_tail load from the aux data
306 * stores that are enabled on successful return.
307 */
308 if (!handle->size) { /* A, matches D */
309 event->pending_disable = 1;
310 perf_output_wakeup(handle);
311 local_set(&rb->aux_nest, 0);
312 goto err_put;
313 }
314 }
315
316 return handle->rb->aux_priv;
317
318err_put:
319 rb_free_aux(rb);
320
321err:
322 ring_buffer_put(rb);
323 handle->event = NULL;
324
325 return NULL;
326}
327
328/*
329 * Commit the data written by hardware into the ring buffer by adjusting
330 * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
331 * pmu driver's responsibility to observe ordering rules of the hardware,
332 * so that all the data is externally visible before this is called.
333 */
334void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
335 bool truncated)
336{
337 struct ring_buffer *rb = handle->rb;
338 unsigned long aux_head;
339 u64 flags = 0;
340
341 if (truncated)
342 flags |= PERF_AUX_FLAG_TRUNCATED;
343
344 /* in overwrite mode, driver provides aux_head via handle */
345 if (rb->aux_overwrite) {
346 flags |= PERF_AUX_FLAG_OVERWRITE;
347
348 aux_head = handle->head;
349 local_set(&rb->aux_head, aux_head);
350 } else {
351 aux_head = local_read(&rb->aux_head);
352 local_add(size, &rb->aux_head);
353 }
354
355 if (size || flags) {
356 /*
357 * Only send RECORD_AUX if we have something useful to communicate
358 */
359
360 perf_event_aux_event(handle->event, aux_head, size, flags);
361 }
362
363 aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
364
365 if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
366 perf_output_wakeup(handle);
367 local_add(rb->aux_watermark, &rb->aux_wakeup);
368 }
369 handle->event = NULL;
370
371 local_set(&rb->aux_nest, 0);
372 rb_free_aux(rb);
373 ring_buffer_put(rb);
374}
375
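The comment above perf_aux_output_begin() spells out the driver-side protocol: take a handle before the hardware starts writing, then commit the captured bytes with perf_aux_output_end(). A minimal sketch of how a hypothetical PMU driver might drive this pair from its start/stop path follows; the mydrv_* names, the single static handle and the hardware accessors are assumptions for illustration, not part of this patch.

#include <linux/perf_event.h>

/* Hypothetical hardware hooks -- assumptions, a real driver provides these. */
extern void mydrv_hw_enable(void *aux_priv, unsigned long head, unsigned long size);
extern unsigned long mydrv_hw_disable(void);

/* A real driver would keep one handle per cpu; one static copy keeps the sketch short. */
static struct perf_output_handle mydrv_handle;

static void mydrv_start(struct perf_event *event)
{
	void *aux_priv;

	/* NULL means no AUX buffer, a nested writer, or no space left (A). */
	aux_priv = perf_aux_output_begin(&mydrv_handle, event);
	if (!aux_priv)
		return;

	/*
	 * Point the hardware at the buffer described by aux_priv and let it
	 * write at most mydrv_handle.size bytes starting at mydrv_handle.head.
	 */
	mydrv_hw_enable(aux_priv, mydrv_handle.head, mydrv_handle.size);
}

static void mydrv_stop(struct perf_event *event)
{
	unsigned long written = mydrv_hw_disable();

	/*
	 * Commit the data; truncated would be true if the capture ran out of
	 * usable space. perf_aux_output_skip() could be called first to
	 * account for hardware alignment padding.
	 */
	perf_aux_output_end(&mydrv_handle, written, false);
}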
376/*
377 * Skip over a given number of bytes in the AUX buffer, due to, for example,
378 * hardware's alignment constraints.
379 */
380int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
381{
382 struct ring_buffer *rb = handle->rb;
383 unsigned long aux_head;
384
385 if (size > handle->size)
386 return -ENOSPC;
387
388 local_add(size, &rb->aux_head);
389
390 aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
391 if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
392 perf_output_wakeup(handle);
393 local_add(rb->aux_watermark, &rb->aux_wakeup);
394 handle->wakeup = local_read(&rb->aux_wakeup) +
395 rb->aux_watermark;
396 }
397
398 handle->head = aux_head;
399 handle->size -= size;
400
401 return 0;
402}
403
404void *perf_get_aux(struct perf_output_handle *handle)
405{
406 /* this is only valid between perf_aux_output_begin and *_end */
407 if (!handle->event)
408 return NULL;
409
410 return handle->rb->aux_priv;
411}
412
413#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
414
415static struct page *rb_alloc_aux_page(int node, int order)
416{
417 struct page *page;
418
419 if (order > MAX_ORDER)
420 order = MAX_ORDER;
421
422 do {
423 page = alloc_pages_node(node, PERF_AUX_GFP, order);
424 } while (!page && order--);
425
426 if (page && order) {
427 /*
428 * Communicate the allocation size to the driver
429 */
430 split_page(page, order);
431 SetPagePrivate(page);
432 set_page_private(page, order);
433 }
434
435 return page;
436}
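rb_alloc_aux_page() records the order of each high-order allocation in page_private() so the PMU driver can tell how large each contiguous chunk is. A sketch (an assumption, not code from this patch) of how a driver's setup_aux() callback could walk the page array it is given and recover those chunk sizes:

#include <linux/mm.h>

/* Assumed helper: size of the contiguous chunk starting at pages[idx]. */
static size_t mydrv_chunk_size(void **pages, int idx)
{
	struct page *page = virt_to_page(pages[idx]);

	/* Order-0 pages were not annotated by rb_alloc_aux_page(). */
	if (!PagePrivate(page))
		return PAGE_SIZE;

	return PAGE_SIZE << page_private(page);
}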
437
438static void rb_free_aux_page(struct ring_buffer *rb, int idx)
439{
440 struct page *page = virt_to_page(rb->aux_pages[idx]);
441
442 ClearPagePrivate(page);
443 page->mapping = NULL;
444 __free_page(page);
445}
446
447int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
448 pgoff_t pgoff, int nr_pages, long watermark, int flags)
449{
450 bool overwrite = !(flags & RING_BUFFER_WRITABLE);
451 int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
452 int ret = -ENOMEM, max_order = 0;
453
454 if (!has_aux(event))
455 return -ENOTSUPP;
456
457 if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
458 /*
459 * We need to start with the max_order that fits in nr_pages,
460 * not the other way around, hence ilog2() and not get_order.
461 */
462 max_order = ilog2(nr_pages);
463
464 /*
465 * PMU requests more than one contiguous chunk of memory
466 * for SW double buffering
467 */
468 if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
469 !overwrite) {
470 if (!max_order)
471 return -EINVAL;
472
473 max_order--;
474 }
475 }
476
477 rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node);
478 if (!rb->aux_pages)
479 return -ENOMEM;
480
481 rb->free_aux = event->pmu->free_aux;
482 for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
483 struct page *page;
484 int last, order;
485
486 order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
487 page = rb_alloc_aux_page(node, order);
488 if (!page)
489 goto out;
490
491 for (last = rb->aux_nr_pages + (1 << page_private(page));
492 last > rb->aux_nr_pages; rb->aux_nr_pages++)
493 rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
494 }
495
496 rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
497 overwrite);
498 if (!rb->aux_priv)
499 goto out;
500
501 ret = 0;
502
503 /*
504 * aux_pages (and the pmu driver's private data, aux_priv) will be
505 * referenced in both the producer's and the consumer's contexts, thus
506 * we keep a refcount here to make sure either of the two can
507 * reference them safely.
508 */
509 atomic_set(&rb->aux_refcount, 1);
510
511 rb->aux_overwrite = overwrite;
512 rb->aux_watermark = watermark;
513
514 if (!rb->aux_watermark && !rb->aux_overwrite)
515 rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);
516
517out:
518 if (!ret)
519 rb->aux_pgoff = pgoff;
520 else
521 rb_free_aux(rb);
522
523 return ret;
524}
525
526static void __rb_free_aux(struct ring_buffer *rb)
527{
528 int pg;
529
530 if (rb->aux_priv) {
531 rb->free_aux(rb->aux_priv);
532 rb->free_aux = NULL;
533 rb->aux_priv = NULL;
534 }
535
536 for (pg = 0; pg < rb->aux_nr_pages; pg++)
537 rb_free_aux_page(rb, pg);
538
539 kfree(rb->aux_pages);
540 rb->aux_nr_pages = 0;
541}
542
543void rb_free_aux(struct ring_buffer *rb)
544{
545 if (atomic_dec_and_test(&rb->aux_refcount))
546 __rb_free_aux(rb);
547}
548
246#ifndef CONFIG_PERF_USE_VMALLOC 549#ifndef CONFIG_PERF_USE_VMALLOC
247 550
248/* 551/*
249 * Back perf_mmap() with regular GFP_KERNEL-0 pages. 552 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
250 */ 553 */
251 554
252struct page * 555static struct page *
253perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) 556__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
254{ 557{
255 if (pgoff > rb->nr_pages) 558 if (pgoff > rb->nr_pages)
256 return NULL; 559 return NULL;
@@ -340,8 +643,8 @@ static int data_page_nr(struct ring_buffer *rb)
340 return rb->nr_pages << page_order(rb); 643 return rb->nr_pages << page_order(rb);
341} 644}
342 645
343struct page * 646static struct page *
344perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) 647__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
345{ 648{
346 /* The '>' counts in the user page. */ 649 /* The '>' counts in the user page. */
347 if (pgoff > data_page_nr(rb)) 650 if (pgoff > data_page_nr(rb))
@@ -416,3 +719,19 @@ fail:
416} 719}
417 720
418#endif 721#endif
722
723struct page *
724perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
725{
726 if (rb->aux_nr_pages) {
727 /* above AUX space */
728 if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
729 return NULL;
730
731 /* AUX space */
732 if (pgoff >= rb->aux_pgoff)
733 return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]);
734 }
735
736 return __perf_mmap_to_page(rb, pgoff);
737}
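perf_mmap_to_page() routes page offsets at or above aux_pgoff to the AUX pages, which is what lets user space map the AUX area as a second region on the same perf event fd. A rough user-space sketch, assuming the uapi side of this series that adds aux_offset/aux_size (and aux_head/aux_tail) to struct perf_event_mmap_page:

/* User-space sketch; the aux_offset/aux_size fields come from the uapi
 * half of this series, which is not part of this kernel/ diff. */
#include <linux/perf_event.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stddef.h>

static void *map_aux_area(int perf_fd, size_t data_pages, size_t aux_size)
{
	long psz = sysconf(_SC_PAGESIZE);
	struct perf_event_mmap_page *pc;
	void *aux;

	/* Map the user page plus the regular data pages first. */
	pc = mmap(NULL, (data_pages + 1) * psz, PROT_READ | PROT_WRITE,
		  MAP_SHARED, perf_fd, 0);
	if (pc == MAP_FAILED)
		return NULL;

	/* Tell the kernel where the AUX area starts and how big it is. */
	pc->aux_offset = (data_pages + 1) * psz;
	pc->aux_size   = aux_size;

	/* This offset is what perf_mmap_to_page() resolves to the AUX pages. */
	aux = mmap(NULL, aux_size, PROT_READ | PROT_WRITE, MAP_SHARED,
		   perf_fd, pc->aux_offset);
	return aux == MAP_FAILED ? NULL : aux;
}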
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fedbdd7d5d1e..3b9a48ae153a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -432,6 +432,14 @@ config UPROBE_EVENT
432 This option is required if you plan to use perf-probe subcommand 432 This option is required if you plan to use perf-probe subcommand
433 of perf tools on user space applications. 433 of perf tools on user space applications.
434 434
435config BPF_EVENTS
436 depends on BPF_SYSCALL
437 depends on KPROBE_EVENT
438 bool
439 default y
440 help
441 This allows the user to attach BPF programs to kprobe events.
442
435config PROBE_EVENTS 443config PROBE_EVENTS
436 def_bool n 444 def_bool n
437 445
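The BPF_EVENTS option above is exercised from user space by loading a BPF_PROG_TYPE_KPROBE program with kern_version filled in (the check added to bpf_prog_load()) and attaching it to a kprobe perf event. A hedged sketch of those two steps; the PERF_EVENT_IOC_SET_BPF ioctl and the raw bpf(2) call belong to the uapi half of this series, and the kprobe perf event fd is assumed to exist already (e.g. created through kprobe_events plus perf_event_open()):

/* Sketch only: syscall numbers, uapi structs and the SET_BPF ioctl are
 * assumed to come from the matching uapi headers. */
#include <linux/bpf.h>
#include <linux/perf_event.h>
#include <linux/version.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int load_kprobe_prog(const struct bpf_insn *insns, unsigned int insn_cnt)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type    = BPF_PROG_TYPE_KPROBE;
	attr.insns        = (__u64)(unsigned long) insns;
	attr.insn_cnt     = insn_cnt;
	attr.license      = (__u64)(unsigned long) "GPL";
	/* Must match the running kernel; see the check in bpf_prog_load(). */
	attr.kern_version = LINUX_VERSION_CODE;

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}

static int attach_to_kprobe_event(int perf_event_fd, int prog_fd)
{
	return ioctl(perf_event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
}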
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 98f26588255e..9b1044e936a6 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
53endif 53endif
54obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 54obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
56obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 57obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_TRACEPOINTS) += power-traces.o 58obj-$(CONFIG_TRACEPOINTS) += power-traces.o
58ifeq ($(CONFIG_PM),y) 59ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..2d56ce501632
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,222 @@
1/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/kernel.h>
8#include <linux/types.h>
9#include <linux/slab.h>
10#include <linux/bpf.h>
11#include <linux/filter.h>
12#include <linux/uaccess.h>
13#include <linux/ctype.h>
14#include "trace.h"
15
16static DEFINE_PER_CPU(int, bpf_prog_active);
17
18/**
19 * trace_call_bpf - invoke BPF program
20 * @prog: BPF program
21 * @ctx: opaque context pointer
22 *
23 * kprobe handlers execute BPF programs via this helper.
24 * It can also be used from static tracepoints in the future.
25 *
26 * Return: BPF programs always return an integer which is interpreted by
27 * the kprobe handler as:
28 * 0 - return from kprobe (event is filtered out)
29 * 1 - store the kprobe event into the ring buffer
30 * Other values are reserved and currently alias to 1
31 */
32unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
33{
34 unsigned int ret;
35
36 if (in_nmi()) /* not supported yet */
37 return 1;
38
39 preempt_disable();
40
41 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
42 /*
43 * since a bpf program is already running on this cpu,
44 * don't call into another bpf program (same or different),
45 * don't send a kprobe event into the ring buffer,
46 * and return zero here
47 */
48 ret = 0;
49 goto out;
50 }
51
52 rcu_read_lock();
53 ret = BPF_PROG_RUN(prog, ctx);
54 rcu_read_unlock();
55
56 out:
57 __this_cpu_dec(bpf_prog_active);
58 preempt_enable();
59
60 return ret;
61}
62EXPORT_SYMBOL_GPL(trace_call_bpf);
63
64static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
65{
66 void *dst = (void *) (long) r1;
67 int size = (int) r2;
68 void *unsafe_ptr = (void *) (long) r3;
69
70 return probe_kernel_read(dst, unsafe_ptr, size);
71}
72
73static const struct bpf_func_proto bpf_probe_read_proto = {
74 .func = bpf_probe_read,
75 .gpl_only = true,
76 .ret_type = RET_INTEGER,
77 .arg1_type = ARG_PTR_TO_STACK,
78 .arg2_type = ARG_CONST_STACK_SIZE,
79 .arg3_type = ARG_ANYTHING,
80};
81
82static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
83{
84 /* NMI safe access to clock monotonic */
85 return ktime_get_mono_fast_ns();
86}
87
88static const struct bpf_func_proto bpf_ktime_get_ns_proto = {
89 .func = bpf_ktime_get_ns,
90 .gpl_only = true,
91 .ret_type = RET_INTEGER,
92};
93
94/*
95 * limited trace_printk()
96 * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
97 */
98static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
99{
100 char *fmt = (char *) (long) r1;
101 int mod[3] = {};
102 int fmt_cnt = 0;
103 int i;
104
105 /*
106 * bpf_check()->check_func_arg()->check_stack_boundary()
107 * guarantees that fmt points to the bpf program stack, that
108 * fmt_size bytes of it were initialized, and that fmt_size > 0
109 */
110 if (fmt[--fmt_size] != 0)
111 return -EINVAL;
112
113 /* check format string for allowed specifiers */
114 for (i = 0; i < fmt_size; i++) {
115 if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
116 return -EINVAL;
117
118 if (fmt[i] != '%')
119 continue;
120
121 if (fmt_cnt >= 3)
122 return -EINVAL;
123
124 /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
125 i++;
126 if (fmt[i] == 'l') {
127 mod[fmt_cnt]++;
128 i++;
129 } else if (fmt[i] == 'p') {
130 mod[fmt_cnt]++;
131 i++;
132 if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
133 return -EINVAL;
134 fmt_cnt++;
135 continue;
136 }
137
138 if (fmt[i] == 'l') {
139 mod[fmt_cnt]++;
140 i++;
141 }
142
143 if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
144 return -EINVAL;
145 fmt_cnt++;
146 }
147
148 return __trace_printk(1/* fake ip will not be printed */, fmt,
149 mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3,
150 mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4,
151 mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5);
152}
153
154static const struct bpf_func_proto bpf_trace_printk_proto = {
155 .func = bpf_trace_printk,
156 .gpl_only = true,
157 .ret_type = RET_INTEGER,
158 .arg1_type = ARG_PTR_TO_STACK,
159 .arg2_type = ARG_CONST_STACK_SIZE,
160};
161
162static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
163{
164 switch (func_id) {
165 case BPF_FUNC_map_lookup_elem:
166 return &bpf_map_lookup_elem_proto;
167 case BPF_FUNC_map_update_elem:
168 return &bpf_map_update_elem_proto;
169 case BPF_FUNC_map_delete_elem:
170 return &bpf_map_delete_elem_proto;
171 case BPF_FUNC_probe_read:
172 return &bpf_probe_read_proto;
173 case BPF_FUNC_ktime_get_ns:
174 return &bpf_ktime_get_ns_proto;
175
176 case BPF_FUNC_trace_printk:
177 /*
178 * this program might be calling bpf_trace_printk,
179 * so allocate per-cpu printk buffers
180 */
181 trace_printk_init_buffers();
182
183 return &bpf_trace_printk_proto;
184 default:
185 return NULL;
186 }
187}
188
189/* bpf+kprobe programs can access fields of 'struct pt_regs' */
190static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
191{
192 /* check bounds */
193 if (off < 0 || off >= sizeof(struct pt_regs))
194 return false;
195
196 /* only read is allowed */
197 if (type != BPF_READ)
198 return false;
199
200 /* disallow misaligned access */
201 if (off % size != 0)
202 return false;
203
204 return true;
205}
206
207static struct bpf_verifier_ops kprobe_prog_ops = {
208 .get_func_proto = kprobe_prog_func_proto,
209 .is_valid_access = kprobe_prog_is_valid_access,
210};
211
212static struct bpf_prog_type_list kprobe_tl = {
213 .ops = &kprobe_prog_ops,
214 .type = BPF_PROG_TYPE_KPROBE,
215};
216
217static int __init register_kprobe_prog_ops(void)
218{
219 bpf_register_prog_type(&kprobe_tl);
220 return 0;
221}
222late_initcall(register_kprobe_prog_ops);
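Everything a kprobe program can do is defined by kprobe_prog_func_proto() and the 0/1 contract documented on trace_call_bpf(): return 0 to filter the event out, 1 to let the kprobe event reach the ring buffer. A sketch of such a program in restricted C follows; the clang -target bpf build, the helper-pointer idiom and the x86_64 register layout (ctx->di as the first argument) are assumptions, not part of this file.

/* Restricted-C sketch of a kprobe BPF program; helper declaration and
 * register access are assumptions for illustration. */
#include <uapi/linux/bpf.h>
#include <linux/ptrace.h>

static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
	(void *) BPF_FUNC_trace_printk;

int bpf_prog1(struct pt_regs *ctx)
{
	char fmt[] = "write() called, fd %ld\n";
	long fd = ctx->di;

	/* Only the specifiers whitelisted in bpf_trace_printk() may appear. */
	bpf_trace_printk(fmt, sizeof(fmt), fd);

	/* 0 = filter the event out, 1 = store the kprobe event. */
	return fd == 1 ? 1 : 0;
}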
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9ba3f43f580e..d0ce590f06e1 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1135,11 +1135,15 @@ static void
1135kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) 1135kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1136{ 1136{
1137 struct ftrace_event_call *call = &tk->tp.call; 1137 struct ftrace_event_call *call = &tk->tp.call;
1138 struct bpf_prog *prog = call->prog;
1138 struct kprobe_trace_entry_head *entry; 1139 struct kprobe_trace_entry_head *entry;
1139 struct hlist_head *head; 1140 struct hlist_head *head;
1140 int size, __size, dsize; 1141 int size, __size, dsize;
1141 int rctx; 1142 int rctx;
1142 1143
1144 if (prog && !trace_call_bpf(prog, regs))
1145 return;
1146
1143 head = this_cpu_ptr(call->perf_events); 1147 head = this_cpu_ptr(call->perf_events);
1144 if (hlist_empty(head)) 1148 if (hlist_empty(head))
1145 return; 1149 return;
@@ -1166,11 +1170,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1166 struct pt_regs *regs) 1170 struct pt_regs *regs)
1167{ 1171{
1168 struct ftrace_event_call *call = &tk->tp.call; 1172 struct ftrace_event_call *call = &tk->tp.call;
1173 struct bpf_prog *prog = call->prog;
1169 struct kretprobe_trace_entry_head *entry; 1174 struct kretprobe_trace_entry_head *entry;
1170 struct hlist_head *head; 1175 struct hlist_head *head;
1171 int size, __size, dsize; 1176 int size, __size, dsize;
1172 int rctx; 1177 int rctx;
1173 1178
1179 if (prog && !trace_call_bpf(prog, regs))
1180 return;
1181
1174 head = this_cpu_ptr(call->perf_events); 1182 head = this_cpu_ptr(call->perf_events);
1175 if (hlist_empty(head)) 1183 if (hlist_empty(head))
1176 return; 1184 return;
@@ -1287,7 +1295,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
1287 kfree(call->print_fmt); 1295 kfree(call->print_fmt);
1288 return -ENODEV; 1296 return -ENODEV;
1289 } 1297 }
1290 call->flags = 0; 1298 call->flags = TRACE_EVENT_FL_KPROBE;
1291 call->class->reg = kprobe_register; 1299 call->class->reg = kprobe_register;
1292 call->data = tk; 1300 call->data = tk;
1293 ret = trace_add_event_call(call); 1301 ret = trace_add_event_call(call);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 74865465e0b7..d60fe62ec4fa 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1006,7 +1006,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
1006 return true; 1006 return true;
1007 1007
1008 list_for_each_entry(event, &filter->perf_events, hw.tp_list) { 1008 list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
1009 if (event->hw.tp_target->mm == mm) 1009 if (event->hw.target->mm == mm)
1010 return true; 1010 return true;
1011 } 1011 }
1012 1012
@@ -1016,7 +1016,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
1016static inline bool 1016static inline bool
1017uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) 1017uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
1018{ 1018{
1019 return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); 1019 return __uprobe_perf_filter(&tu->filter, event->hw.target->mm);
1020} 1020}
1021 1021
1022static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) 1022static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
@@ -1024,10 +1024,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
1024 bool done; 1024 bool done;
1025 1025
1026 write_lock(&tu->filter.rwlock); 1026 write_lock(&tu->filter.rwlock);
1027 if (event->hw.tp_target) { 1027 if (event->hw.target) {
1028 list_del(&event->hw.tp_list); 1028 list_del(&event->hw.tp_list);
1029 done = tu->filter.nr_systemwide || 1029 done = tu->filter.nr_systemwide ||
1030 (event->hw.tp_target->flags & PF_EXITING) || 1030 (event->hw.target->flags & PF_EXITING) ||
1031 uprobe_filter_event(tu, event); 1031 uprobe_filter_event(tu, event);
1032 } else { 1032 } else {
1033 tu->filter.nr_systemwide--; 1033 tu->filter.nr_systemwide--;
@@ -1047,7 +1047,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
1047 int err; 1047 int err;
1048 1048
1049 write_lock(&tu->filter.rwlock); 1049 write_lock(&tu->filter.rwlock);
1050 if (event->hw.tp_target) { 1050 if (event->hw.target) {
1051 /* 1051 /*
1052 * event->parent != NULL means copy_process(), we can avoid 1052 * event->parent != NULL means copy_process(), we can avoid
1053 * uprobe_apply(). current->mm must be probed and we can rely 1053 * uprobe_apply(). current->mm must be probed and we can rely
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3174bf8e3538..9a056f5bc02c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -567,9 +567,37 @@ static void watchdog_nmi_disable(unsigned int cpu)
567 cpu0_err = 0; 567 cpu0_err = 0;
568 } 568 }
569} 569}
570
571void watchdog_nmi_enable_all(void)
572{
573 int cpu;
574
575 if (!watchdog_user_enabled)
576 return;
577
578 get_online_cpus();
579 for_each_online_cpu(cpu)
580 watchdog_nmi_enable(cpu);
581 put_online_cpus();
582}
583
584void watchdog_nmi_disable_all(void)
585{
586 int cpu;
587
588 if (!watchdog_running)
589 return;
590
591 get_online_cpus();
592 for_each_online_cpu(cpu)
593 watchdog_nmi_disable(cpu);
594 put_online_cpus();
595}
570#else 596#else
571static int watchdog_nmi_enable(unsigned int cpu) { return 0; } 597static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
572static void watchdog_nmi_disable(unsigned int cpu) { return; } 598static void watchdog_nmi_disable(unsigned int cpu) { return; }
599void watchdog_nmi_enable_all(void) {}
600void watchdog_nmi_disable_all(void) {}
573#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 601#endif /* CONFIG_HARDLOCKUP_DETECTOR */
574 602
575static struct smp_hotplug_thread watchdog_threads = { 603static struct smp_hotplug_thread watchdog_threads = {