Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--  kernel/perf_event.c | 283
1 file changed, 211 insertions(+), 72 deletions(-)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2f3fbf84215a..9dbe8cdaf145 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 #include <linux/sysfs.h>
 #include <linux/dcache.h>
 #include <linux/percpu.h>
@@ -1367,6 +1368,8 @@ void perf_event_task_sched_in(struct task_struct *task)
 	if (cpuctx->task_ctx == ctx)
 		return;
 
+	perf_disable();
+
 	/*
 	 * We want to keep the following priority order:
 	 * cpu pinned (that don't need to move), task pinned,
@@ -1379,6 +1382,8 @@ void perf_event_task_sched_in(struct task_struct *task)
 	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
 
 	cpuctx->task_ctx = ctx;
+
+	perf_enable();
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -2642,6 +2647,7 @@ static int perf_fasync(int fd, struct file *filp, int on)
 }
 
 static const struct file_operations perf_fops = {
+	.llseek			= no_llseek,
 	.release		= perf_release,
 	.read			= perf_read,
 	.poll			= perf_poll,
@@ -2792,6 +2798,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 
 
 /*
+ * We assume there is only KVM supporting the callbacks.
+ * Later on, we might change it to a list if there is
+ * another virtualization implementation supporting the callbacks.
+ */
+struct perf_guest_info_callbacks *perf_guest_cbs;
+
+int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+	perf_guest_cbs = cbs;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
+
+int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+	perf_guest_cbs = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+
+/*
  * Output
  */
 static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
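The registration API added above is deliberately minimal: one global callback pointer, on the assumption that KVM is the only user for now. As a rough, hypothetical sketch (not part of this patch), a hypervisor module would wire itself up roughly as follows; the callback struct's members are declared in the header side of this series and are therefore left unfilled here:

#include <linux/module.h>
#include <linux/perf_event.h>

/* Hypothetical example module; only the register/unregister calls are taken from the patch. */
static struct perf_guest_info_callbacks example_guest_cbs = {
	/* .is_in_guest / .get_guest_ip style hooks would be filled in here */
};

static int __init example_guest_init(void)
{
	/* Tell perf where to ask about guest state; the call always returns 0 in this patch. */
	return perf_register_guest_info_callbacks(&example_guest_cbs);
}

static void __exit example_guest_exit(void)
{
	perf_unregister_guest_info_callbacks(&example_guest_cbs);
}

module_init(example_guest_init);
module_exit(example_guest_exit);
MODULE_LICENSE("GPL");

If a second hypervisor ever needs these hooks, the comment in the patch anticipates turning the single pointer into a list.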
@@ -3743,7 +3770,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
 		.event_id  = {
 			.header = {
 				.type = PERF_RECORD_MMAP,
-				.misc = 0,
+				.misc = PERF_RECORD_MISC_USER,
 				/* .size */
 			},
 			/* .pid */
@@ -3961,36 +3988,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 		perf_swevent_overflow(event, 0, nmi, data, regs);
 }
 
-static int perf_swevent_is_counting(struct perf_event *event)
-{
-	/*
-	 * The event is active, we're good!
-	 */
-	if (event->state == PERF_EVENT_STATE_ACTIVE)
-		return 1;
-
-	/*
-	 * The event is off/error, not counting.
-	 */
-	if (event->state != PERF_EVENT_STATE_INACTIVE)
-		return 0;
-
-	/*
-	 * The event is inactive, if the context is active
-	 * we're part of a group that didn't make it on the 'pmu',
-	 * not counting.
-	 */
-	if (event->ctx->is_active)
-		return 0;
-
-	/*
-	 * We're inactive and the context is too, this means the
-	 * task is scheduled out, we're counting events that happen
-	 * to us, like migration events.
-	 */
-	return 1;
-}
-
 static int perf_tp_event_match(struct perf_event *event,
 				struct perf_sample_data *data);
 
@@ -4014,12 +4011,6 @@ static int perf_swevent_match(struct perf_event *event,
 				struct perf_sample_data *data,
 				struct pt_regs *regs)
 {
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
-		return 0;
-
-	if (!perf_swevent_is_counting(event))
-		return 0;
-
 	if (event->attr.type != type)
 		return 0;
 
@@ -4036,18 +4027,53 @@ static int perf_swevent_match(struct perf_event *event,
 	return 1;
 }
 
-static void perf_swevent_ctx_event(struct perf_event_context *ctx,
-				enum perf_type_id type,
-				u32 event_id, u64 nr, int nmi,
-				struct perf_sample_data *data,
-				struct pt_regs *regs)
+static inline u64 swevent_hash(u64 type, u32 event_id)
+{
+	u64 val = event_id | (type << 32);
+
+	return hash_64(val, SWEVENT_HLIST_BITS);
+}
+
+static struct hlist_head *
+find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
 {
+	u64 hash;
+	struct swevent_hlist *hlist;
+
+	hash = swevent_hash(type, event_id);
+
+	hlist = rcu_dereference(ctx->swevent_hlist);
+	if (!hlist)
+		return NULL;
+
+	return &hlist->heads[hash];
+}
+
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+				u64 nr, int nmi,
+				struct perf_sample_data *data,
+				struct pt_regs *regs)
+{
+	struct perf_cpu_context *cpuctx;
 	struct perf_event *event;
+	struct hlist_node *node;
+	struct hlist_head *head;
 
-	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+	cpuctx = &__get_cpu_var(perf_cpu_context);
+
+	rcu_read_lock();
+
+	head = find_swevent_head(cpuctx, type, event_id);
+
+	if (!head)
+		goto end;
+
+	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
 		if (perf_swevent_match(event, type, event_id, data, regs))
 			perf_swevent_add(event, nr, nmi, data, regs);
 	}
+end:
+	rcu_read_unlock();
 }
 
 int perf_swevent_get_recursion_context(void)
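The functions above replace the old walk of every event in the context with a per-CPU hash table keyed on (type, event_id): swevent_hash() folds the pair into one 64-bit key and hash_64() picks a bucket, so do_perf_sw_event() only scans events that could actually match. A stand-alone user-space illustration of the bucket selection, using an assumed table size and a multiplicative stand-in for the kernel's hash_64():

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_HLIST_BITS 8	/* assumed; stands in for SWEVENT_HLIST_BITS, defined in the header part of this series */

/* Multiplicative hash in the spirit of the kernel's hash_64(); the constant is illustrative. */
static uint64_t example_hash_64(uint64_t val, unsigned int bits)
{
	return (val * 0x61C8864680B583EBULL) >> (64 - bits);
}

/* Mirrors swevent_hash(): fold type and event_id into one key, hash it to a bucket index. */
static unsigned int example_swevent_bucket(uint64_t type, uint32_t event_id)
{
	uint64_t val = event_id | (type << 32);

	return (unsigned int)example_hash_64(val, EXAMPLE_HLIST_BITS);
}

int main(void)
{
	/* e.g. PERF_TYPE_SOFTWARE (1) / PERF_COUNT_SW_PAGE_FAULTS (2) */
	printf("bucket %u of %u\n", example_swevent_bucket(1, 2), 1u << EXAMPLE_HLIST_BITS);
	return 0;
}

Events enabled on a CPU end up in heads[bucket] of that CPU's swevent_hlist, which is what find_swevent_head() returns under RCU.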
@@ -4085,27 +4111,6 @@ void perf_swevent_put_recursion_context(int rctx)
 }
 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
-static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
-				u64 nr, int nmi,
-				struct perf_sample_data *data,
-				struct pt_regs *regs)
-{
-	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx;
-
-	cpuctx = &__get_cpu_var(perf_cpu_context);
-	rcu_read_lock();
-	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
-				nr, nmi, data, regs);
-	/*
-	 * doesn't really matter which of the child contexts the
-	 * events ends up in.
-	 */
-	ctx = rcu_dereference(current->perf_event_ctxp);
-	if (ctx)
-		perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
-	rcu_read_unlock();
-}
-
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 			    struct pt_regs *regs, u64 addr)
@@ -4131,16 +4136,28 @@ static void perf_swevent_read(struct perf_event *event)
 static int perf_swevent_enable(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
+	struct perf_cpu_context *cpuctx;
+	struct hlist_head *head;
+
+	cpuctx = &__get_cpu_var(perf_cpu_context);
 
 	if (hwc->sample_period) {
 		hwc->last_period = hwc->sample_period;
 		perf_swevent_set_period(event);
 	}
+
+	head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+	if (WARN_ON_ONCE(!head))
+		return -EINVAL;
+
+	hlist_add_head_rcu(&event->hlist_entry, head);
+
 	return 0;
 }
 
 static void perf_swevent_disable(struct perf_event *event)
 {
+	hlist_del_rcu(&event->hlist_entry);
 }
 
 static const struct pmu perf_ops_generic = {
@@ -4168,15 +4185,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
 	regs = get_irq_regs();
-	/*
-	 * In case we exclude kernel IPs or are somehow not in interrupt
-	 * context, provide the next best thing, the user IP.
-	 */
-	if ((event->attr.exclude_kernel || !regs) &&
-			!event->attr.exclude_user)
-		regs = task_pt_regs(current);
 
-	if (regs) {
+	if (regs && !perf_exclude_event(event, regs)) {
 		if (!(event->attr.exclude_idle && current->pid == 0))
 			if (perf_event_overflow(event, 0, &data, regs))
 				ret = HRTIMER_NORESTART;
@@ -4324,6 +4334,105 @@ static const struct pmu perf_ops_task_clock = {
 	.read		= task_clock_perf_event_read,
 };
 
+static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
+{
+	struct swevent_hlist *hlist;
+
+	hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
+	kfree(hlist);
+}
+
+static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
+{
+	struct swevent_hlist *hlist;
+
+	if (!cpuctx->swevent_hlist)
+		return;
+
+	hlist = cpuctx->swevent_hlist;
+	rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
+	call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
+}
+
+static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+
+	mutex_lock(&cpuctx->hlist_mutex);
+
+	if (!--cpuctx->hlist_refcount)
+		swevent_hlist_release(cpuctx);
+
+	mutex_unlock(&cpuctx->hlist_mutex);
+}
+
+static void swevent_hlist_put(struct perf_event *event)
+{
+	int cpu;
+
+	if (event->cpu != -1) {
+		swevent_hlist_put_cpu(event, event->cpu);
+		return;
+	}
+
+	for_each_possible_cpu(cpu)
+		swevent_hlist_put_cpu(event, cpu);
+}
+
+static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	int err = 0;
+
+	mutex_lock(&cpuctx->hlist_mutex);
+
+	if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+		struct swevent_hlist *hlist;
+
+		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+		if (!hlist) {
+			err = -ENOMEM;
+			goto exit;
+		}
+		rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+	}
+	cpuctx->hlist_refcount++;
+exit:
+	mutex_unlock(&cpuctx->hlist_mutex);
+
+	return err;
+}
+
+static int swevent_hlist_get(struct perf_event *event)
+{
+	int err;
+	int cpu, failed_cpu;
+
+	if (event->cpu != -1)
+		return swevent_hlist_get_cpu(event, event->cpu);
+
+	get_online_cpus();
+	for_each_possible_cpu(cpu) {
+		err = swevent_hlist_get_cpu(event, cpu);
+		if (err) {
+			failed_cpu = cpu;
+			goto fail;
+		}
+	}
+	put_online_cpus();
+
+	return 0;
+fail:
+	for_each_possible_cpu(cpu) {
+		if (cpu == failed_cpu)
+			break;
+		swevent_hlist_put_cpu(event, cpu);
+	}
+
+	put_online_cpus();
+	return err;
+}
+
 #ifdef CONFIG_EVENT_TRACING
 
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
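swevent_hlist_get() and swevent_hlist_put() implement a per-CPU, refcounted, lazily allocated table: the first user allocates the hlist under hlist_mutex, later users only bump hlist_refcount, and the last put hands the table to call_rcu() so lockless readers in do_perf_sw_event() stay safe. Stripped of the per-CPU, RCU and CPU-hotplug machinery, and with invented names, the same get/put pattern looks roughly like this in user-space C:

#include <pthread.h>
#include <stdlib.h>

/* Invented illustration of the get/put pattern above, not kernel code. */
struct lazy_table {
	pthread_mutex_t lock;	/* plays the role of hlist_mutex */
	int refcount;		/* plays the role of hlist_refcount */
	void *table;		/* plays the role of swevent_hlist */
};

static int lazy_table_get(struct lazy_table *t, size_t size)
{
	int err = 0;

	pthread_mutex_lock(&t->lock);
	if (!t->table) {
		t->table = calloc(1, size);	/* first user allocates */
		if (!t->table) {
			err = -1;		/* -ENOMEM in the kernel version */
			goto out;
		}
	}
	t->refcount++;
out:
	pthread_mutex_unlock(&t->lock);
	return err;
}

static void lazy_table_put(struct lazy_table *t)
{
	pthread_mutex_lock(&t->lock);
	if (!--t->refcount) {
		free(t->table);			/* the kernel defers this via call_rcu() */
		t->table = NULL;
	}
	pthread_mutex_unlock(&t->lock);
}

int main(void)
{
	struct lazy_table t = { PTHREAD_MUTEX_INITIALIZER, 0, NULL };

	if (lazy_table_get(&t, 256) == 0) {	/* first get allocates */
		lazy_table_get(&t, 256);	/* second get only bumps the count */
		lazy_table_put(&t);
		lazy_table_put(&t);		/* last put frees */
	}
	return 0;
}

The kernel version has the extra twist that a task-bound event (event->cpu == -1) needs a table on every possible CPU, which is why swevent_hlist_get() walks them all under get_online_cpus() and unwinds the already-taken references if any allocation fails.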
@@ -4357,10 +4466,13 @@ static int perf_tp_event_match(struct perf_event *event,
 static void tp_perf_event_destroy(struct perf_event *event)
 {
 	perf_trace_disable(event->attr.config);
+	swevent_hlist_put(event);
 }
 
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
+	int err;
+
 	/*
 	 * Raw tracepoint data is a severe data leak, only allow root to
 	 * have these.
@@ -4374,6 +4486,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 		return NULL;
 
 	event->destroy = tp_perf_event_destroy;
+	err = swevent_hlist_get(event);
+	if (err) {
+		perf_trace_disable(event->attr.config);
+		return ERR_PTR(err);
+	}
 
 	return &perf_ops_generic;
 }
@@ -4474,6 +4591,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
 	WARN_ON(event->parent);
 
 	atomic_dec(&perf_swevent_enabled[event_id]);
+	swevent_hlist_put(event);
 }
 
 static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4512,6 +4630,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
 	case PERF_COUNT_SW_ALIGNMENT_FAULTS:
 	case PERF_COUNT_SW_EMULATION_FAULTS:
 		if (!event->parent) {
+			int err;
+
+			err = swevent_hlist_get(event);
+			if (err)
+				return ERR_PTR(err);
+
 			atomic_inc(&perf_swevent_enabled[event_id]);
 			event->destroy = sw_perf_event_destroy;
 		}
@@ -5384,6 +5508,7 @@ static void __init perf_event_init_all_cpus(void)
 
 	for_each_possible_cpu(cpu) {
 		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		mutex_init(&cpuctx->hlist_mutex);
 		__perf_event_init_context(&cpuctx->ctx, NULL);
 	}
 }
@@ -5397,6 +5522,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)
 	spin_lock(&perf_resource_lock);
 	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
 	spin_unlock(&perf_resource_lock);
+
+	mutex_lock(&cpuctx->hlist_mutex);
+	if (cpuctx->hlist_refcount > 0) {
+		struct swevent_hlist *hlist;
+
+		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+		WARN_ON_ONCE(!hlist);
+		rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+	}
+	mutex_unlock(&cpuctx->hlist_mutex);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -5416,6 +5551,10 @@ static void perf_event_exit_cpu(int cpu)
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
+	mutex_lock(&cpuctx->hlist_mutex);
+	swevent_hlist_release(cpuctx);
+	mutex_unlock(&cpuctx->hlist_mutex);
+
 	mutex_lock(&ctx->mutex);
 	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
 	mutex_unlock(&ctx->mutex);