Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--  kernel/perf_event.c  389
1 files changed, 217 insertions, 172 deletions

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a4fa381db3c2..e099650cd249 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2297,11 +2297,6 @@ unlock:
 	rcu_read_unlock();
 }
 
-static unsigned long perf_data_size(struct perf_mmap_data *data)
-{
-	return data->nr_pages << (PAGE_SHIFT + data->data_order);
-}
-
 #ifndef CONFIG_PERF_USE_VMALLOC
 
 /*
@@ -2320,6 +2315,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
 	return virt_to_page(data->data_pages[pgoff - 1]);
 }
 
+static void *perf_mmap_alloc_page(int cpu)
+{
+	struct page *page;
+	int node;
+
+	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+	if (!page)
+		return NULL;
+
+	return page_address(page);
+}
+
 static struct perf_mmap_data *
 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 {
@@ -2336,17 +2344,16 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 	if (!data)
 		goto fail;
 
-	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+	data->user_page = perf_mmap_alloc_page(event->cpu);
 	if (!data->user_page)
 		goto fail_user_page;
 
 	for (i = 0; i < nr_pages; i++) {
-		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+		data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
 		if (!data->data_pages[i])
 			goto fail_data_pages;
 	}
 
-	data->data_order = 0;
 	data->nr_pages = nr_pages;
 
 	return data;
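
Note on the hunks above: the new perf_mmap_alloc_page() helper replaces get_zeroed_page() so that buffer pages follow event->cpu onto the matching NUMA node, while __GFP_ZERO keeps the pages zero-filled as before (both the user page and the data pages get mapped into userspace). A minimal standalone sketch of the same node-local allocation pattern (illustrative only, not part of the patch):

    /* Illustrative: one zeroed page local to "cpu"; -1 means "no preference". */
    static void *alloc_local_zeroed_page(int cpu)
    {
            int node = (cpu == -1) ? -1 : cpu_to_node(cpu);
            struct page *page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);

            return page ? page_address(page) : NULL;
    }
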
@@ -2382,6 +2389,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
 	kfree(data);
 }
 
+static inline int page_order(struct perf_mmap_data *data)
+{
+	return 0;
+}
+
 #else
 
 /*
@@ -2390,10 +2402,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
  * Required for architectures that have d-cache aliasing issues.
  */
 
+static inline int page_order(struct perf_mmap_data *data)
+{
+	return data->page_order;
+}
+
 static struct page *
 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
 {
-	if (pgoff > (1UL << data->data_order))
+	if (pgoff > (1UL << page_order(data)))
 		return NULL;
 
 	return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
@@ -2413,7 +2430,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
 	int i, nr;
 
 	data = container_of(work, struct perf_mmap_data, work);
-	nr = 1 << data->data_order;
+	nr = 1 << page_order(data);
 
 	base = data->user_page;
 	for (i = 0; i < nr + 1; i++)
@@ -2452,7 +2469,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 
 	data->user_page = all_buf;
 	data->data_pages[0] = all_buf + PAGE_SIZE;
-	data->data_order = ilog2(nr_pages);
+	data->page_order = ilog2(nr_pages);
 	data->nr_pages = 1;
 
 	return data;
@@ -2466,6 +2483,11 @@ fail:
 
 #endif
 
+static unsigned long perf_data_size(struct perf_mmap_data *data)
+{
+	return data->nr_pages << (PAGE_SHIFT + page_order(data));
+}
+
 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct perf_event *event = vma->vm_file->private_data;
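
A quick worked example of the relocated perf_data_size(), which now sits below both #ifdef branches and uses the new page_order() helper (illustrative, assuming PAGE_SHIFT == 12, i.e. 4 KiB pages):

    /* page-backed build:   nr_pages = 8, page_order() = 0      -> 8 << (12 + 0) = 32768 bytes */
    /* vmalloc-based build: the same 32 KiB buffer is one area,
       nr_pages = 1, page_order = ilog2(8) = 3                  -> 1 << (12 + 3) = 32768 bytes */
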
@@ -2506,8 +2528,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
 {
 	long max_size = perf_data_size(data);
 
-	atomic_set(&data->lock, -1);
-
 	if (event->attr.watermark) {
 		data->watermark = min_t(long, max_size,
 					event->attr.wakeup_watermark);
@@ -2580,6 +2600,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	long user_extra, extra;
 	int ret = 0;
 
+	/*
+	 * Don't allow mmap() of inherited per-task counters. This would
+	 * create a performance issue due to all children writing to the
+	 * same buffer.
+	 */
+	if (event->cpu == -1 && event->attr.inherit)
+		return -EINVAL;
+
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
@@ -2885,120 +2913,80 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 }
 
 /*
- * Curious locking construct.
- *
  * We need to ensure a later event_id doesn't publish a head when a former
- * event_id isn't done writing. However since we need to deal with NMIs we
+ * event isn't done writing. However since we need to deal with NMIs we
  * cannot fully serialize things.
  *
- * What we do is serialize between CPUs so we only have to deal with NMI
- * nesting on a single CPU.
- *
  * We only publish the head (and generate a wakeup) when the outer-most
- * event_id completes.
+ * event completes.
  */
-static void perf_output_lock(struct perf_output_handle *handle)
+static void perf_output_get_handle(struct perf_output_handle *handle)
 {
 	struct perf_mmap_data *data = handle->data;
-	int cur, cpu = get_cpu();
-
-	handle->locked = 0;
-
-	for (;;) {
-		cur = atomic_cmpxchg(&data->lock, -1, cpu);
-		if (cur == -1) {
-			handle->locked = 1;
-			break;
-		}
-		if (cur == cpu)
-			break;
 
-		cpu_relax();
-	}
+	preempt_disable();
+	local_inc(&data->nest);
+	handle->wakeup = local_read(&data->wakeup);
 }
 
-static void perf_output_unlock(struct perf_output_handle *handle)
+static void perf_output_put_handle(struct perf_output_handle *handle)
 {
 	struct perf_mmap_data *data = handle->data;
 	unsigned long head;
-	int cpu;
-
-	data->done_head = data->head;
-
-	if (!handle->locked)
-		goto out;
 
 again:
-	/*
-	 * The xchg implies a full barrier that ensures all writes are done
-	 * before we publish the new head, matched by a rmb() in userspace when
-	 * reading this position.
-	 */
-	while ((head = atomic_long_xchg(&data->done_head, 0)))
-		data->user_page->data_head = head;
+	head = local_read(&data->head);
 
 	/*
-	 * NMI can happen here, which means we can miss a done_head update.
+	 * IRQ/NMI can happen here, which means we can miss a head update.
	 */
 
-	cpu = atomic_xchg(&data->lock, -1);
-	WARN_ON_ONCE(cpu != smp_processor_id());
+	if (!local_dec_and_test(&data->nest))
+		goto out;
 
 	/*
-	 * Therefore we have to validate we did not indeed do so.
+	 * Publish the known good head. Rely on the full barrier implied
+	 * by atomic_dec_and_test() order the data->head read and this
+	 * write.
 	 */
-	if (unlikely(atomic_long_read(&data->done_head))) {
-		/*
-		 * Since we had it locked, we can lock it again.
-		 */
-		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
-			cpu_relax();
+	data->user_page->data_head = head;
 
+	/*
+	 * Now check if we missed an update, rely on the (compiler)
+	 * barrier in atomic_dec_and_test() to re-read data->head.
+	 */
+	if (unlikely(head != local_read(&data->head))) {
+		local_inc(&data->nest);
 		goto again;
 	}
 
-	if (atomic_xchg(&data->wakeup, 0))
+	if (handle->wakeup != local_read(&data->wakeup))
 		perf_output_wakeup(handle);
-out:
-	put_cpu();
+
+out:
+	preempt_enable();
 }
 
-void perf_output_copy(struct perf_output_handle *handle,
+__always_inline void perf_output_copy(struct perf_output_handle *handle,
 		      const void *buf, unsigned int len)
 {
-	unsigned int pages_mask;
-	unsigned long offset;
-	unsigned int size;
-	void **pages;
-
-	offset = handle->offset;
-	pages_mask = handle->data->nr_pages - 1;
-	pages = handle->data->data_pages;
-
 	do {
-		unsigned long page_offset;
-		unsigned long page_size;
-		int nr;
+		unsigned long size = min_t(unsigned long, handle->size, len);
 
-		nr = (offset >> PAGE_SHIFT) & pages_mask;
-		page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
-		page_offset = offset & (page_size - 1);
-		size = min_t(unsigned int, page_size - page_offset, len);
+		memcpy(handle->addr, buf, size);
 
-		memcpy(pages[nr] + page_offset, buf, size);
+		len -= size;
+		handle->addr += size;
+		handle->size -= size;
+		if (!handle->size) {
+			struct perf_mmap_data *data = handle->data;
 
-		len -= size;
-		buf += size;
-		offset += size;
+			handle->page++;
+			handle->page &= data->nr_pages - 1;
+			handle->addr = data->data_pages[handle->page];
+			handle->size = PAGE_SIZE << page_order(data);
+		}
 	} while (len);
-
-	handle->offset = offset;
-
-	/*
-	 * Check we didn't copy past our reservation window, taking the
-	 * possible unsigned int wrap into account.
-	 */
-	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
 }
 
 int perf_output_begin(struct perf_output_handle *handle,
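
The hunk above replaces the old cross-CPU perf_output_lock()/perf_output_unlock() spin on data->lock with a per-buffer nesting counter: writers run with preemption disabled, nested IRQ/NMI writers only bump data->nest, and the outermost writer alone publishes data_head, re-checking afterwards in case a nested writer advanced the head in between. A simplified single-structure model of that publication scheme (illustrative only; plain integers stand in for local_t and no real concurrency is shown):

    /* Illustrative model, not kernel code: nested writers on one CPU. */
    struct ring {
            unsigned long head;      /* next free byte, advanced by writers */
            unsigned long user_head; /* what userspace may read (data_head) */
            int nest;                /* writer nesting depth on this CPU    */
    };

    static void put_handle(struct ring *r)
    {
            unsigned long head;
    again:
            head = r->head;
            if (--r->nest)           /* nested writer: leave publishing to  */
                    return;          /* the outermost writer                */
            r->user_head = head;     /* outermost writer publishes          */
            if (head != r->head) {   /* a nested writer slipped in after we */
                    r->nest++;       /* read head: publish the newer value  */
                    goto again;
            }
    }
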
@@ -3036,13 +3024,13 @@ int perf_output_begin(struct perf_output_handle *handle,
 	handle->sample = sample;
 
 	if (!data->nr_pages)
-		goto fail;
+		goto out;
 
-	have_lost = atomic_read(&data->lost);
+	have_lost = local_read(&data->lost);
 	if (have_lost)
 		size += sizeof(lost_event);
 
-	perf_output_lock(handle);
+	perf_output_get_handle(handle);
 
 	do {
 		/*
@@ -3052,24 +3040,28 @@
 		 */
 		tail = ACCESS_ONCE(data->user_page->data_tail);
 		smp_rmb();
-		offset = head = atomic_long_read(&data->head);
+		offset = head = local_read(&data->head);
 		head += size;
 		if (unlikely(!perf_output_space(data, tail, offset, head)))
 			goto fail;
-	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+	} while (local_cmpxchg(&data->head, offset, head) != offset);
 
-	handle->offset = offset;
-	handle->head = head;
+	if (head - local_read(&data->wakeup) > data->watermark)
+		local_add(data->watermark, &data->wakeup);
 
-	if (head - tail > data->watermark)
-		atomic_set(&data->wakeup, 1);
+	handle->page = offset >> (PAGE_SHIFT + page_order(data));
+	handle->page &= data->nr_pages - 1;
+	handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1);
+	handle->addr = data->data_pages[handle->page];
+	handle->addr += handle->size;
+	handle->size = (PAGE_SIZE << page_order(data)) - handle->size;
 
 	if (have_lost) {
 		lost_event.header.type = PERF_RECORD_LOST;
 		lost_event.header.misc = 0;
 		lost_event.header.size = sizeof(lost_event);
 		lost_event.id = event->id;
-		lost_event.lost = atomic_xchg(&data->lost, 0);
+		lost_event.lost = local_xchg(&data->lost, 0);
 
 		perf_output_put(handle, lost_event);
 	}
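
A worked example of the new handle setup above (illustrative, assuming 4 KiB pages, page_order() == 0 and nr_pages == 8): if the reserved record starts at offset 0x2150,

    handle->page = 0x2150 >> 12     = 2       (then masked with nr_pages - 1 = 7)
    handle->size = 0x2150 & 0xfff   = 0x150   (offset into that page, temporarily)
    handle->addr = data_pages[2] + 0x150
    handle->size = 0x1000 - 0x150   = 0xeb0   (bytes left in data_pages[2])

so perf_output_copy() writes up to 0xeb0 bytes into data_pages[2] and then wraps to data_pages[3] via its !handle->size branch shown earlier.
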
@@ -3077,8 +3069,8 @@ int perf_output_begin(struct perf_output_handle *handle,
 	return 0;
 
 fail:
-	atomic_inc(&data->lost);
-	perf_output_unlock(handle);
+	local_inc(&data->lost);
+	perf_output_put_handle(handle);
 out:
 	rcu_read_unlock();
 
@@ -3093,14 +3085,14 @@ void perf_output_end(struct perf_output_handle *handle)
 	int wakeup_events = event->attr.wakeup_events;
 
 	if (handle->sample && wakeup_events) {
-		int events = atomic_inc_return(&data->events);
+		int events = local_inc_return(&data->events);
 		if (events >= wakeup_events) {
-			atomic_sub(wakeup_events, &data->events);
-			atomic_set(&data->wakeup, 1);
+			local_sub(wakeup_events, &data->events);
+			local_inc(&data->wakeup);
 		}
 	}
 
-	perf_output_unlock(handle);
+	perf_output_put_handle(handle);
 	rcu_read_unlock();
 }
 
@@ -3436,22 +3428,13 @@ static void perf_event_task_output(struct perf_event *event,
 {
 	struct perf_output_handle handle;
 	struct task_struct *task = task_event->task;
-	unsigned long flags;
 	int size, ret;
 
-	/*
-	 * If this CPU attempts to acquire an rq lock held by a CPU spinning
-	 * in perf_output_lock() from interrupt context, it's game over.
-	 */
-	local_irq_save(flags);
-
 	size = task_event->event_id.header.size;
 	ret = perf_output_begin(&handle, event, size, 0, 0);
 
-	if (ret) {
-		local_irq_restore(flags);
+	if (ret)
 		return;
-	}
 
 	task_event->event_id.pid = perf_event_pid(event, task);
 	task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3462,7 +3445,6 @@ static void perf_event_task_output(struct perf_event *event,
 	perf_output_put(&handle, task_event->event_id);
 
 	perf_output_end(&handle);
-	local_irq_restore(flags);
 }
 
 static int perf_event_task_match(struct perf_event *event)
@@ -4020,9 +4002,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 		perf_swevent_overflow(event, 0, nmi, data, regs);
 }
 
-static int perf_tp_event_match(struct perf_event *event,
-				struct perf_sample_data *data);
-
 static int perf_exclude_event(struct perf_event *event,
 			      struct pt_regs *regs)
 {
@@ -4052,10 +4031,6 @@ static int perf_swevent_match(struct perf_event *event,
 	if (perf_exclude_event(event, regs))
 		return 0;
 
-	if (event->attr.type == PERF_TYPE_TRACEPOINT &&
-	    !perf_tp_event_match(event, data))
-		return 0;
-
 	return 1;
 }
 
@@ -4066,19 +4041,46 @@ static inline u64 swevent_hash(u64 type, u32 event_id)
 	return hash_64(val, SWEVENT_HLIST_BITS);
 }
 
-static struct hlist_head *
-find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+static inline struct hlist_head *
+__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
 {
-	u64 hash;
-	struct swevent_hlist *hlist;
+	u64 hash = swevent_hash(type, event_id);
+
+	return &hlist->heads[hash];
+}
 
-	hash = swevent_hash(type, event_id);
+/* For the read side: events when they trigger */
+static inline struct hlist_head *
+find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+{
+	struct swevent_hlist *hlist;
 
 	hlist = rcu_dereference(ctx->swevent_hlist);
 	if (!hlist)
 		return NULL;
 
-	return &hlist->heads[hash];
+	return __find_swevent_head(hlist, type, event_id);
+}
+
+/* For the event head insertion and removal in the hlist */
+static inline struct hlist_head *
+find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
+{
+	struct swevent_hlist *hlist;
+	u32 event_id = event->attr.config;
+	u64 type = event->attr.type;
+
+	/*
+	 * Event scheduling is always serialized against hlist allocation
+	 * and release. Which makes the protected version suitable here.
+	 * The context lock guarantees that.
+	 */
+	hlist = rcu_dereference_protected(ctx->swevent_hlist,
+					  lockdep_is_held(&event->ctx->lock));
+	if (!hlist)
+		return NULL;
+
+	return __find_swevent_head(hlist, type, event_id);
 }
 
 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
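
The hunk above splits the old find_swevent_head() in two: the event-delivery path keeps using rcu_dereference() under rcu_read_lock(), while the scheduling path, already serialized against hlist allocation/release by the context lock, uses rcu_dereference_protected() and tells lockdep why that is safe. The general shape of that read-side/update-side split (an illustrative sketch with made-up names, not the patch itself):

    /* Illustrative pattern only. */
    struct table;                       /* some RCU-managed structure     */
    static struct table __rcu *tbl;
    static DEFINE_MUTEX(tbl_mutex);

    /* read side: callers hold rcu_read_lock() */
    static struct table *tbl_get_rcu(void)
    {
            return rcu_dereference(tbl);
    }

    /* update side: callers hold tbl_mutex, no RCU read lock required */
    static struct table *tbl_get_locked(void)
    {
            return rcu_dereference_protected(tbl, lockdep_is_held(&tbl_mutex));
    }
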
@@ -4095,7 +4097,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 
 	rcu_read_lock();
 
-	head = find_swevent_head(cpuctx, type, event_id);
+	head = find_swevent_head_rcu(cpuctx, type, event_id);
 
 	if (!head)
 		goto end;
@@ -4110,7 +4112,7 @@ end:
 
 int perf_swevent_get_recursion_context(void)
 {
-	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	int rctx;
 
 	if (in_nmi())
@@ -4122,10 +4124,8 @@ int perf_swevent_get_recursion_context(void)
 	else
 		rctx = 0;
 
-	if (cpuctx->recursion[rctx]) {
-		put_cpu_var(perf_cpu_context);
+	if (cpuctx->recursion[rctx])
 		return -1;
-	}
 
 	cpuctx->recursion[rctx]++;
 	barrier();
@@ -4139,7 +4139,6 @@ void perf_swevent_put_recursion_context(int rctx)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	barrier();
 	cpuctx->recursion[rctx]--;
-	put_cpu_var(perf_cpu_context);
 }
 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
@@ -4150,6 +4149,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 	struct perf_sample_data data;
 	int rctx;
 
+	preempt_disable_notrace();
 	rctx = perf_swevent_get_recursion_context();
 	if (rctx < 0)
 		return;
@@ -4159,6 +4159,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
 
 	perf_swevent_put_recursion_context(rctx);
+	preempt_enable_notrace();
 }
 
 static void perf_swevent_read(struct perf_event *event)
@@ -4178,7 +4179,7 @@ static int perf_swevent_enable(struct perf_event *event)
 		perf_swevent_set_period(event);
 	}
 
-	head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+	head = find_swevent_head(cpuctx, event);
 	if (WARN_ON_ONCE(!head))
 		return -EINVAL;
 
@@ -4366,6 +4367,14 @@ static const struct pmu perf_ops_task_clock = {
 	.read		= task_clock_perf_event_read,
 };
 
+/* Deref the hlist from the update side */
+static inline struct swevent_hlist *
+swevent_hlist_deref(struct perf_cpu_context *cpuctx)
+{
+	return rcu_dereference_protected(cpuctx->swevent_hlist,
+					 lockdep_is_held(&cpuctx->hlist_mutex));
+}
+
 static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
 {
 	struct swevent_hlist *hlist;
@@ -4376,12 +4385,11 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
 
 static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
 {
-	struct swevent_hlist *hlist;
+	struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
 
-	if (!cpuctx->swevent_hlist)
+	if (!hlist)
 		return;
 
-	hlist = cpuctx->swevent_hlist;
 	rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
 	call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
 }
@@ -4418,7 +4426,7 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
 
 	mutex_lock(&cpuctx->hlist_mutex);
 
-	if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+	if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
 		struct swevent_hlist *hlist;
 
 		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4467,10 +4475,46 @@ static int swevent_hlist_get(struct perf_event *event)
 
 #ifdef CONFIG_EVENT_TRACING
 
-void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
-		   int entry_size, struct pt_regs *regs)
+static const struct pmu perf_ops_tracepoint = {
+	.enable		= perf_trace_enable,
+	.disable	= perf_trace_disable,
+	.read		= perf_swevent_read,
+	.unthrottle	= perf_swevent_unthrottle,
+};
+
+static int perf_tp_filter_match(struct perf_event *event,
+				struct perf_sample_data *data)
+{
+	void *record = data->raw->data;
+
+	if (likely(!event->filter) || filter_match_preds(event->filter, record))
+		return 1;
+	return 0;
+}
+
+static int perf_tp_event_match(struct perf_event *event,
+				struct perf_sample_data *data,
+				struct pt_regs *regs)
+{
+	/*
+	 * All tracepoints are from kernel-space.
+	 */
+	if (event->attr.exclude_kernel)
+		return 0;
+
+	if (!perf_tp_filter_match(event, data))
+		return 0;
+
+	return 1;
+}
+
+void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+		   struct pt_regs *regs, struct hlist_head *head)
 {
 	struct perf_sample_data data;
+	struct perf_event *event;
+	struct hlist_node *node;
+
 	struct perf_raw_record raw = {
 		.size = entry_size,
 		.data = record,
@@ -4479,26 +4523,18 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 	perf_sample_data_init(&data, addr);
 	data.raw = &raw;
 
-	/* Trace events already protected against recursion */
-	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
-			&data, regs);
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+		if (perf_tp_event_match(event, &data, regs))
+			perf_swevent_add(event, count, 1, &data, regs);
+	}
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
-static int perf_tp_event_match(struct perf_event *event,
-				struct perf_sample_data *data)
-{
-	void *record = data->raw->data;
-
-	if (likely(!event->filter) || filter_match_preds(event->filter, record))
-		return 1;
-	return 0;
-}
-
 static void tp_perf_event_destroy(struct perf_event *event)
 {
-	perf_trace_disable(event->attr.config);
-	swevent_hlist_put(event);
+	perf_trace_destroy(event);
 }
 
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4514,17 +4550,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 	    !capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	if (perf_trace_enable(event->attr.config))
+	err = perf_trace_init(event);
+	if (err)
 		return NULL;
 
 	event->destroy = tp_perf_event_destroy;
-	err = swevent_hlist_get(event);
-	if (err) {
-		perf_trace_disable(event->attr.config);
-		return ERR_PTR(err);
-	}
 
-	return &perf_ops_generic;
+	return &perf_ops_tracepoint;
 }
 
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4552,12 +4584,6 @@ static void perf_event_free_filter(struct perf_event *event)
 
 #else
 
-static int perf_tp_event_match(struct perf_event *event,
-				struct perf_sample_data *data)
-{
-	return 1;
-}
-
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
 	return NULL;
@@ -4894,6 +4920,13 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)
 	int fput_needed = 0;
 	int ret = -EINVAL;
 
+	/*
+	 * Don't allow output of inherited per-task events. This would
+	 * create performance issues due to cross cpu access.
+	 */
+	if (event->cpu == -1 && event->attr.inherit)
+		return -EINVAL;
+
 	if (!output_fd)
 		goto set;
 
@@ -4914,6 +4947,18 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)
 	if (event->data)
 		goto out;
 
+	/*
+	 * Don't allow cross-cpu buffers
+	 */
+	if (output_event->cpu != event->cpu)
+		goto out;
+
+	/*
+	 * If its not a per-cpu buffer, it must be the same task.
+	 */
+	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+		goto out;
+
 	atomic_long_inc(&output_file->f_count);
 
 set: