Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--	kernel/perf_event.c	379
1 files changed, 275 insertions, 104 deletions
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2f3fbf84215a..a4fa381db3c2 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 #include <linux/sysfs.h>
 #include <linux/dcache.h>
 #include <linux/percpu.h>
@@ -82,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
 void __weak hw_perf_disable(void)		{ barrier(); }
 void __weak hw_perf_enable(void)		{ barrier(); }
 
-int __weak
-hw_perf_group_sched_in(struct perf_event *group_leader,
-	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx)
-{
-	return 0;
-}
-
 void __weak perf_event_print_debug(void)	{ }
 
 static DEFINE_PER_CPU(int, perf_disable_count);
@@ -262,6 +255,18 @@ static void update_event_times(struct perf_event *event)
 	event->total_time_running = run_end - event->tstamp_running;
 }
 
+/*
+ * Update total_time_enabled and total_time_running for all events in a group.
+ */
+static void update_group_times(struct perf_event *leader)
+{
+	struct perf_event *event;
+
+	update_event_times(leader);
+	list_for_each_entry(event, &leader->sibling_list, group_entry)
+		update_event_times(event);
+}
+
 static struct list_head *
 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 {
@@ -315,8 +320,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 static void
 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-	struct perf_event *sibling, *tmp;
-
 	if (list_empty(&event->group_entry))
 		return;
 	ctx->nr_events--;
@@ -329,7 +332,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (event->group_leader != event)
 		event->group_leader->nr_siblings--;
 
-	update_event_times(event);
+	update_group_times(event);
 
 	/*
 	 * If event was in error state, then keep it
@@ -340,6 +343,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	 */
 	if (event->state > PERF_EVENT_STATE_OFF)
 		event->state = PERF_EVENT_STATE_OFF;
+}
+
+static void
+perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx)
+{
+	struct perf_event *sibling, *tmp;
 
 	/*
 	 * If this was a group event with sibling events then
@@ -505,18 +514,6 @@ retry:
 }
 
 /*
- * Update total_time_enabled and total_time_running for all events in a group.
- */
-static void update_group_times(struct perf_event *leader)
-{
-	struct perf_event *event;
-
-	update_event_times(leader);
-	list_for_each_entry(event, &leader->sibling_list, group_entry)
-		update_event_times(event);
-}
-
-/*
  * Cross CPU call to disable a performance event
  */
 static void __perf_event_disable(void *info)
@@ -640,15 +637,20 @@ group_sched_in(struct perf_event *group_event,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_event_context *ctx)
 {
-	struct perf_event *event, *partial_group;
+	struct perf_event *event, *partial_group = NULL;
+	const struct pmu *pmu = group_event->pmu;
+	bool txn = false;
 	int ret;
 
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
 
-	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
-	if (ret)
-		return ret < 0 ? ret : 0;
+	/* Check if group transaction availabe */
+	if (pmu->start_txn)
+		txn = true;
+
+	if (txn)
+		pmu->start_txn(pmu);
 
 	if (event_sched_in(group_event, cpuctx, ctx))
 		return -EAGAIN;
@@ -663,9 +665,19 @@ group_sched_in(struct perf_event *group_event,
 		}
 	}
 
-	return 0;
+	if (!txn)
+		return 0;
+
+	ret = pmu->commit_txn(pmu);
+	if (!ret) {
+		pmu->cancel_txn(pmu);
+		return 0;
+	}
 
 group_error:
+	if (txn)
+		pmu->cancel_txn(pmu);
+
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
 	 * partial group before returning:
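
The two hunks above replace the weak hw_perf_group_sched_in() hook with a three-step transaction: start_txn() marks the beginning of a group, each event goes through the usual ->enable() path, and commit_txn() asks the hardware driver whether the whole group actually fits on the counters, with cancel_txn() unwinding when it does not. A minimal sketch of what a PMU backend might supply is shown below; only the start_txn/commit_txn/cancel_txn hooks come from this patch, while the demo_* names and the scheduling helper are hypothetical.

static bool demo_txn_in_progress;	/* hypothetical per-PMU state */

static void demo_pmu_start_txn(const struct pmu *pmu)
{
	/* Defer the hardware constraint check until commit_txn(). */
	demo_txn_in_progress = true;
}

static void demo_pmu_cancel_txn(const struct pmu *pmu)
{
	/* The group did not fit (or was aborted); drop the pending state. */
	demo_txn_in_progress = false;
}

static int demo_pmu_commit_txn(const struct pmu *pmu)
{
	/*
	 * Validate every event queued since start_txn() in one pass;
	 * demo_pmu_schedule_events() stands in for a driver's own
	 * counter-scheduling logic and returns 0 on success.
	 */
	demo_txn_in_progress = false;
	return demo_pmu_schedule_events();
}

Note that group_sched_in() only uses the transaction when ->start_txn is non-NULL, so PMUs that do not provide the hooks keep the old per-event behaviour.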
@@ -1367,6 +1379,8 @@ void perf_event_task_sched_in(struct task_struct *task)
 	if (cpuctx->task_ctx == ctx)
 		return;
 
+	perf_disable();
+
 	/*
 	 * We want to keep the following priority order:
 	 * cpu pinned (that don't need to move), task pinned,
@@ -1379,6 +1393,8 @@ void perf_event_task_sched_in(struct task_struct *task)
 	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
 
 	cpuctx->task_ctx = ctx;
+
+	perf_enable();
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -1856,9 +1872,30 @@ int perf_event_release_kernel(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 
+	/*
+	 * Remove from the PMU, can't get re-enabled since we got
+	 * here because the last ref went.
+	 */
+	perf_event_disable(event);
+
 	WARN_ON_ONCE(ctx->parent_ctx);
-	mutex_lock(&ctx->mutex);
-	perf_event_remove_from_context(event);
+	/*
+	 * There are two ways this annotation is useful:
+	 *
+	 *  1) there is a lock recursion from perf_event_exit_task
+	 *     see the comment there.
+	 *
+	 *  2) there is a lock-inversion with mmap_sem through
+	 *     perf_event_read_group(), which takes faults while
+	 *     holding ctx->mutex, however this is called after
+	 *     the last filedesc died, so there is no possibility
+	 *     to trigger the AB-BA case.
+	 */
+	mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
+	raw_spin_lock_irq(&ctx->lock);
+	list_del_event(event, ctx);
+	perf_destroy_group(event, ctx);
+	raw_spin_unlock_irq(&ctx->lock);
 	mutex_unlock(&ctx->mutex);
 
 	mutex_lock(&event->owner->perf_event_mutex);
@@ -2642,6 +2679,7 @@ static int perf_fasync(int fd, struct file *filp, int on)
 }
 
 static const struct file_operations perf_fops = {
+	.llseek			= no_llseek,
 	.release		= perf_release,
 	.read			= perf_read,
 	.poll			= perf_poll,
@@ -2792,6 +2830,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 
 
 /*
+ * We assume there is only KVM supporting the callbacks.
+ * Later on, we might change it to a list if there is
+ * another virtualization implementation supporting the callbacks.
+ */
+struct perf_guest_info_callbacks *perf_guest_cbs;
+
+int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+	perf_guest_cbs = cbs;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
+
+int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+	perf_guest_cbs = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+
+/*
  * Output
  */
 static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
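
The two exported helpers above give a hypervisor a single slot for its guest-info callbacks (the comment notes that only KVM is expected to use it for now). A hedged usage sketch follows; the demo_* callback implementations are hypothetical placeholders, and the field names are illustrative of how the callback structure is consumed elsewhere in the series.

/*
 * Illustrative only: the demo_* callbacks are placeholders; the register
 * and unregister calls are what this patch provides.
 */
static struct perf_guest_info_callbacks demo_guest_cbs = {
	.is_in_guest	= demo_is_in_guest,		/* nonzero while a guest is running */
	.is_user_mode	= demo_is_guest_user_mode,	/* guest CPL check for sample misc bits */
	.get_guest_ip	= demo_get_guest_ip,		/* guest instruction pointer for samples */
};

static int __init demo_virt_init(void)
{
	return perf_register_guest_info_callbacks(&demo_guest_cbs);
}

static void __exit demo_virt_exit(void)
{
	perf_unregister_guest_info_callbacks(&demo_guest_cbs);
}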
@@ -3743,7 +3802,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
 		.event_id  = {
 			.header = {
 				.type = PERF_RECORD_MMAP,
-				.misc = 0,
+				.misc = PERF_RECORD_MISC_USER,
 				/* .size */
 			},
 			/* .pid */
@@ -3961,36 +4020,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 		perf_swevent_overflow(event, 0, nmi, data, regs);
 }
 
-static int perf_swevent_is_counting(struct perf_event *event)
-{
-	/*
-	 * The event is active, we're good!
-	 */
-	if (event->state == PERF_EVENT_STATE_ACTIVE)
-		return 1;
-
-	/*
-	 * The event is off/error, not counting.
-	 */
-	if (event->state != PERF_EVENT_STATE_INACTIVE)
-		return 0;
-
-	/*
-	 * The event is inactive, if the context is active
-	 * we're part of a group that didn't make it on the 'pmu',
-	 * not counting.
-	 */
-	if (event->ctx->is_active)
-		return 0;
-
-	/*
-	 * We're inactive and the context is too, this means the
-	 * task is scheduled out, we're counting events that happen
-	 * to us, like migration events.
-	 */
-	return 1;
-}
-
 static int perf_tp_event_match(struct perf_event *event,
 				struct perf_sample_data *data);
 
@@ -4014,12 +4043,6 @@ static int perf_swevent_match(struct perf_event *event,
 				struct perf_sample_data *data,
 				struct pt_regs *regs)
 {
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
-		return 0;
-
-	if (!perf_swevent_is_counting(event))
-		return 0;
-
 	if (event->attr.type != type)
 		return 0;
 
@@ -4036,18 +4059,53 @@ static int perf_swevent_match(struct perf_event *event,
 	return 1;
 }
 
-static void perf_swevent_ctx_event(struct perf_event_context *ctx,
-				    enum perf_type_id type,
-				    u32 event_id, u64 nr, int nmi,
-				    struct perf_sample_data *data,
-				    struct pt_regs *regs)
+static inline u64 swevent_hash(u64 type, u32 event_id)
+{
+	u64 val = event_id | (type << 32);
+
+	return hash_64(val, SWEVENT_HLIST_BITS);
+}
+
+static struct hlist_head *
+find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+{
+	u64 hash;
+	struct swevent_hlist *hlist;
+
+	hash = swevent_hash(type, event_id);
+
+	hlist = rcu_dereference(ctx->swevent_hlist);
+	if (!hlist)
+		return NULL;
+
+	return &hlist->heads[hash];
+}
+
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+			    u64 nr, int nmi,
+			    struct perf_sample_data *data,
+			    struct pt_regs *regs)
 {
+	struct perf_cpu_context *cpuctx;
 	struct perf_event *event;
+	struct hlist_node *node;
+	struct hlist_head *head;
 
-	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+	cpuctx = &__get_cpu_var(perf_cpu_context);
+
+	rcu_read_lock();
+
+	head = find_swevent_head(cpuctx, type, event_id);
+
+	if (!head)
+		goto end;
+
+	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
 		if (perf_swevent_match(event, type, event_id, data, regs))
 			perf_swevent_add(event, nr, nmi, data, regs);
 	}
+end:
+	rcu_read_unlock();
 }
 
 int perf_swevent_get_recursion_context(void)
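
The hashing scheme above is plain arithmetic: the 32-bit event id and the event type are packed into one 64-bit key (event_id | (type << 32)), and hash_64() folds that key down to SWEVENT_HLIST_BITS bits to pick a bucket in the per-CPU swevent hash table. A stand-alone illustration of the bucket arithmetic in user-space C follows; the multiplier only mimics hash_64(), and 8 bits is an assumed stand-in for SWEVENT_HLIST_BITS.

#include <stdint.h>
#include <stdio.h>

#define DEMO_HLIST_BITS 8	/* assumed stand-in for SWEVENT_HLIST_BITS */

/*
 * Multiplicative hash in the spirit of the kernel's hash_64(): multiply by a
 * large odd constant and keep the top 'bits' bits.
 */
static uint64_t demo_hash_64(uint64_t val, unsigned int bits)
{
	return (val * 0x9e3779b97f4a7c15ULL) >> (64 - bits);
}

int main(void)
{
	uint64_t type = 1;	/* PERF_TYPE_SOFTWARE */
	uint32_t event_id = 2;	/* PERF_COUNT_SW_PAGE_FAULTS */
	uint64_t val = event_id | (type << 32);

	printf("bucket = %llu of %u\n",
	       (unsigned long long)demo_hash_64(val, DEMO_HLIST_BITS),
	       1u << DEMO_HLIST_BITS);
	return 0;
}

Compared with the removed walk over every event in the context, a software event now only visits the events that hashed into its own bucket of the per-CPU table.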
@@ -4085,27 +4143,6 @@ void perf_swevent_put_recursion_context(int rctx)
 }
 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
-static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
-			    u64 nr, int nmi,
-			    struct perf_sample_data *data,
-			    struct pt_regs *regs)
-{
-	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx;
-
-	cpuctx = &__get_cpu_var(perf_cpu_context);
-	rcu_read_lock();
-	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
-				 nr, nmi, data, regs);
-	/*
-	 * doesn't really matter which of the child contexts the
-	 * events ends up in.
-	 */
-	ctx = rcu_dereference(current->perf_event_ctxp);
-	if (ctx)
-		perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
-	rcu_read_unlock();
-}
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 		     struct pt_regs *regs, u64 addr)
@@ -4131,16 +4168,28 @@ static void perf_swevent_read(struct perf_event *event)
 static int perf_swevent_enable(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
+	struct perf_cpu_context *cpuctx;
+	struct hlist_head *head;
+
+	cpuctx = &__get_cpu_var(perf_cpu_context);
 
 	if (hwc->sample_period) {
 		hwc->last_period = hwc->sample_period;
 		perf_swevent_set_period(event);
 	}
+
+	head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+	if (WARN_ON_ONCE(!head))
+		return -EINVAL;
+
+	hlist_add_head_rcu(&event->hlist_entry, head);
+
 	return 0;
 }
 
 static void perf_swevent_disable(struct perf_event *event)
 {
+	hlist_del_rcu(&event->hlist_entry);
 }
 
 static const struct pmu perf_ops_generic = {
@@ -4168,15 +4217,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
 	regs = get_irq_regs();
-	/*
-	 * In case we exclude kernel IPs or are somehow not in interrupt
-	 * context, provide the next best thing, the user IP.
-	 */
-	if ((event->attr.exclude_kernel || !regs) &&
-			!event->attr.exclude_user)
-		regs = task_pt_regs(current);
 
-	if (regs) {
+	if (regs && !perf_exclude_event(event, regs)) {
 		if (!(event->attr.exclude_idle && current->pid == 0))
 			if (perf_event_overflow(event, 0, &data, regs))
 				ret = HRTIMER_NORESTART;
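
perf_exclude_event() is not shown in this hunk; elsewhere in the same series it replaces the open-coded exclude_kernel/exclude_user handling deleted above. A rough sketch of the check it performs is given below as an approximation, not the verbatim helper.

/*
 * Approximation of the filtering done by perf_exclude_event(); the real
 * helper lives elsewhere in kernel/perf_event.c and may differ in detail.
 */
static int demo_exclude_event(struct perf_event *event, struct pt_regs *regs)
{
	if (event->attr.exclude_user && user_mode(regs))
		return 1;	/* sample came from user mode, but user is excluded */

	if (event->attr.exclude_kernel && !user_mode(regs))
		return 1;	/* sample came from kernel mode, but kernel is excluded */

	return 0;		/* keep the sample */
}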
@@ -4324,6 +4366,105 @@ static const struct pmu perf_ops_task_clock = {
 	.read		= task_clock_perf_event_read,
 };
 
+static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
+{
+	struct swevent_hlist *hlist;
+
+	hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
+	kfree(hlist);
+}
+
+static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
+{
+	struct swevent_hlist *hlist;
+
+	if (!cpuctx->swevent_hlist)
+		return;
+
+	hlist = cpuctx->swevent_hlist;
+	rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
+	call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
+}
+
+static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+
+	mutex_lock(&cpuctx->hlist_mutex);
+
+	if (!--cpuctx->hlist_refcount)
+		swevent_hlist_release(cpuctx);
+
+	mutex_unlock(&cpuctx->hlist_mutex);
+}
+
+static void swevent_hlist_put(struct perf_event *event)
+{
+	int cpu;
+
+	if (event->cpu != -1) {
+		swevent_hlist_put_cpu(event, event->cpu);
+		return;
+	}
+
+	for_each_possible_cpu(cpu)
+		swevent_hlist_put_cpu(event, cpu);
+}
+
+static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	int err = 0;
+
+	mutex_lock(&cpuctx->hlist_mutex);
+
+	if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+		struct swevent_hlist *hlist;
+
+		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+		if (!hlist) {
+			err = -ENOMEM;
+			goto exit;
+		}
+		rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+	}
+	cpuctx->hlist_refcount++;
+exit:
+	mutex_unlock(&cpuctx->hlist_mutex);
+
+	return err;
+}
+
+static int swevent_hlist_get(struct perf_event *event)
+{
+	int err;
+	int cpu, failed_cpu;
+
+	if (event->cpu != -1)
+		return swevent_hlist_get_cpu(event, event->cpu);
+
+	get_online_cpus();
+	for_each_possible_cpu(cpu) {
+		err = swevent_hlist_get_cpu(event, cpu);
+		if (err) {
+			failed_cpu = cpu;
+			goto fail;
+		}
+	}
+	put_online_cpus();
+
+	return 0;
+fail:
+	for_each_possible_cpu(cpu) {
+		if (cpu == failed_cpu)
+			break;
+		swevent_hlist_put_cpu(event, cpu);
+	}
+
+	put_online_cpus();
+	return err;
+}
+
 #ifdef CONFIG_EVENT_TRACING
 
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
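
The swevent_hlist_get()/swevent_hlist_put() pair above refcounts the per-CPU hash tables so they exist exactly as long as some software or tracepoint event needs them; the hunks that follow wire this into the event init and destroy paths. Condensed, the pattern looks like the sketch below, where the demo_* names are hypothetical stand-ins for the real callbacks.

/* Condensed view of the get-at-init / put-at-destroy pairing used below. */
static void demo_event_destroy(struct perf_event *event)
{
	/* ... type-specific teardown ... */
	swevent_hlist_put(event);		/* drop the hash table reference(s) */
}

static const struct pmu *demo_event_init(struct perf_event *event)
{
	int err = swevent_hlist_get(event);	/* allocate or reference the table(s) */

	if (err)
		return ERR_PTR(err);

	event->destroy = demo_event_destroy;
	return &perf_ops_generic;
}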
@@ -4357,10 +4498,13 @@ static int perf_tp_event_match(struct perf_event *event,
 static void tp_perf_event_destroy(struct perf_event *event)
 {
 	perf_trace_disable(event->attr.config);
+	swevent_hlist_put(event);
 }
 
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
+	int err;
+
 	/*
 	 * Raw tracepoint data is a severe data leak, only allow root to
 	 * have these.
@@ -4374,6 +4518,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 		return NULL;
 
 	event->destroy = tp_perf_event_destroy;
+	err = swevent_hlist_get(event);
+	if (err) {
+		perf_trace_disable(event->attr.config);
+		return ERR_PTR(err);
+	}
 
 	return &perf_ops_generic;
 }
@@ -4474,6 +4623,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
 	WARN_ON(event->parent);
 
 	atomic_dec(&perf_swevent_enabled[event_id]);
+	swevent_hlist_put(event);
 }
 
 static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4512,6 +4662,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
 	case PERF_COUNT_SW_ALIGNMENT_FAULTS:
 	case PERF_COUNT_SW_EMULATION_FAULTS:
 		if (!event->parent) {
+			int err;
+
+			err = swevent_hlist_get(event);
+			if (err)
+				return ERR_PTR(err);
+
 			atomic_inc(&perf_swevent_enabled[event_id]);
 			event->destroy = sw_perf_event_destroy;
 		}
@@ -4897,7 +5053,7 @@ err_fput_free_put_context:
 
 err_free_put_context:
 	if (err < 0)
-		kfree(event);
+		free_event(event);
 
 err_put_context:
 	if (err < 0)
@@ -5176,7 +5332,7 @@ void perf_event_exit_task(struct task_struct *child)
 	 *
 	 * But since its the parent context it won't be the same instance.
 	 */
-	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
+	mutex_lock(&child_ctx->mutex);
 
 again:
 	list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
@@ -5384,6 +5540,7 @@ static void __init perf_event_init_all_cpus(void)
 
 	for_each_possible_cpu(cpu) {
 		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		mutex_init(&cpuctx->hlist_mutex);
 		__perf_event_init_context(&cpuctx->ctx, NULL);
 	}
 }
@@ -5397,6 +5554,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)
 	spin_lock(&perf_resource_lock);
 	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
 	spin_unlock(&perf_resource_lock);
+
+	mutex_lock(&cpuctx->hlist_mutex);
+	if (cpuctx->hlist_refcount > 0) {
+		struct swevent_hlist *hlist;
+
+		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+		WARN_ON_ONCE(!hlist);
+		rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+	}
+	mutex_unlock(&cpuctx->hlist_mutex);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -5416,6 +5583,10 @@ static void perf_event_exit_cpu(int cpu)
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
+	mutex_lock(&cpuctx->hlist_mutex);
+	swevent_hlist_release(cpuctx);
+	mutex_unlock(&cpuctx->hlist_mutex);
+
 	mutex_lock(&ctx->mutex);
 	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
 	mutex_unlock(&ctx->mutex);