 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 195
 include/linux/perf_event.h                 |   1
 include/uapi/linux/perf_event.h            |   1
 kernel/events/core.c                       |   2
 4 files changed, 178 insertions(+), 21 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index b5d9d746dbc0..8003d87afd89 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -182,23 +182,124 @@ fail:
 
 /*
  * Determine if @a and @b measure the same set of tasks.
+ *
+ * If @a and @b measure the same set of tasks then we want to share a
+ * single RMID.
  */
 static bool __match_event(struct perf_event *a, struct perf_event *b)
 {
+	/* Per-cpu and task events don't mix */
 	if ((a->attach_state & PERF_ATTACH_TASK) !=
 	    (b->attach_state & PERF_ATTACH_TASK))
 		return false;
 
-	/* not task */
+#ifdef CONFIG_CGROUP_PERF
+	if (a->cgrp != b->cgrp)
+		return false;
+#endif
+
+	/* If not task event, we're machine wide */
+	if (!(b->attach_state & PERF_ATTACH_TASK))
+		return true;
+
+	/*
+	 * Events that target same task are placed into the same cache group.
+	 */
+	if (a->hw.cqm_target == b->hw.cqm_target)
+		return true;
+
+	/*
+	 * Are we an inherited event?
+	 */
+	if (b->parent == a)
+		return true;
+
+	return false;
+}
+
+#ifdef CONFIG_CGROUP_PERF
+static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return perf_cgroup_from_task(event->hw.cqm_target);
 
-	return true; /* if not task, we're machine wide */
+	return event->cgrp;
 }
+#endif
 
 /*
  * Determine if @a's tasks intersect with @b's tasks
+ *
+ * There are combinations of events that we explicitly prohibit,
+ *
+ *                 PROHIBITS
+ * system-wide  ->  cgroup and task
+ * cgroup       ->  system-wide
+ *              ->  task in cgroup
+ * task         ->  system-wide
+ *              ->  task in cgroup
+ *
+ * Call this function before allocating an RMID.
  */
 static bool __conflict_event(struct perf_event *a, struct perf_event *b)
 {
+#ifdef CONFIG_CGROUP_PERF
+	/*
+	 * We can have any number of cgroups but only one system-wide
+	 * event at a time.
+	 */
+	if (a->cgrp && b->cgrp) {
+		struct perf_cgroup *ac = a->cgrp;
+		struct perf_cgroup *bc = b->cgrp;
+
+		/*
+		 * This condition should have been caught in
+		 * __match_event() and we should be sharing an RMID.
+		 */
+		WARN_ON_ONCE(ac == bc);
+
+		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+			return true;
+
+		return false;
+	}
+
+	if (a->cgrp || b->cgrp) {
+		struct perf_cgroup *ac, *bc;
+
+		/*
+		 * cgroup and system-wide events are mutually exclusive
+		 */
+		if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
+		    (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
+			return true;
+
+		/*
+		 * Ensure neither event is part of the other's cgroup
+		 */
+		ac = event_to_cgroup(a);
+		bc = event_to_cgroup(b);
+		if (ac == bc)
+			return true;
+
+		/*
+		 * Must have cgroup and non-intersecting task events.
+		 */
+		if (!ac || !bc)
+			return false;
+
+		/*
+		 * We have cgroup and task events, and the task belongs
+		 * to a cgroup. Check for for overlap.
+		 */
+		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+			return true;
+
+		return false;
+	}
+#endif
 	/*
 	 * If one of them is not a task, same story as above with cgroups.
 	 */
@@ -245,9 +346,16 @@ static int intel_cqm_setup_event(struct perf_event *event,
 
 static void intel_cqm_event_read(struct perf_event *event)
 {
-	unsigned long rmid = event->hw.cqm_rmid;
+	unsigned long rmid;
 	u64 val;
 
+	/*
+	 * Task events are handled by intel_cqm_event_count().
+	 */
+	if (event->cpu == -1)
+		return;
+
+	rmid = event->hw.cqm_rmid;
 	val = __rmid_read(rmid);
 
 	/*
@@ -259,6 +367,63 @@ static void intel_cqm_event_read(struct perf_event *event)
 	local64_set(&event->count, val);
 }
 
+struct rmid_read {
+	unsigned int rmid;
+	atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info)
+{
+	struct rmid_read *rr = info;
+	u64 val;
+
+	val = __rmid_read(rr->rmid);
+
+	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return;
+
+	atomic64_add(val, &rr->value);
+}
+
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+	return !list_empty(&event->hw.cqm_groups_entry);
+}
+
+static u64 intel_cqm_event_count(struct perf_event *event)
+{
+	struct rmid_read rr = {
+		.rmid = event->hw.cqm_rmid,
+		.value = ATOMIC64_INIT(0),
+	};
+
+	/*
+	 * We only need to worry about task events. System-wide events
+	 * are handled like usual, i.e. entirely with
+	 * intel_cqm_event_read().
+	 */
+	if (event->cpu != -1)
+		return __perf_event_count(event);
+
+	/*
+	 * Only the group leader gets to report values. This stops us
+	 * reporting duplicate values to userspace, and gives us a clear
+	 * rule for which task gets to report the values.
+	 *
+	 * Note that it is impossible to attribute these values to
+	 * specific packages - we forfeit that ability when we create
+	 * task events.
+	 */
+	if (!cqm_group_leader(event))
+		return 0;
+
+	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+
+	local64_set(&event->count, atomic64_read(&rr.value));
+
+	return __perf_event_count(event);
+}
+
 static void intel_cqm_event_start(struct perf_event *event, int mode)
 {
 	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
@@ -344,7 +509,7 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 	/*
 	 * And we're the group leader..
 	 */
-	if (!list_empty(&event->hw.cqm_groups_entry)) {
+	if (cqm_group_leader(event)) {
 		/*
 		 * If there was a group_other, make that leader, otherwise
 		 * destroy the group and return the RMID.
@@ -365,17 +530,6 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 
 static struct pmu intel_cqm_pmu;
 
-/*
- * XXX there's a bit of a problem in that we cannot simply do the one
- * event per node as one would want, since that one event would one get
- * scheduled on the one cpu. But we want to 'schedule' the RMID on all
- * CPUs.
- *
- * This means we want events for each CPU, however, that generates a lot
- * of duplicate values out to userspace -- this is not to be helped
- * unless we want to change the core code in some way. Fore more info,
- * see intel_cqm_event_read().
- */
 static int intel_cqm_event_init(struct perf_event *event)
 {
 	struct perf_event *group = NULL;
@@ -387,9 +541,6 @@ static int intel_cqm_event_init(struct perf_event *event)
 	if (event->attr.config & ~QOS_EVENT_MASK)
 		return -EINVAL;
 
-	if (event->cpu == -1)
-		return -EINVAL;
-
 	/* unsupported modes and filters */
 	if (event->attr.exclude_user ||
 	    event->attr.exclude_kernel ||
@@ -407,7 +558,8 @@ static int intel_cqm_event_init(struct perf_event *event)
 
 	mutex_lock(&cache_mutex);
 
-	err = intel_cqm_setup_event(event, &group); /* will also set rmid */
+	/* Will also set rmid */
+	err = intel_cqm_setup_event(event, &group);
 	if (err)
 		goto out;
 
@@ -470,6 +622,7 @@ static struct pmu intel_cqm_pmu = {
 	.start = intel_cqm_event_start,
 	.stop = intel_cqm_event_stop,
 	.read = intel_cqm_event_read,
+	.count = intel_cqm_event_count,
 };
 
 static inline void cqm_pick_event_reader(int cpu)
@@ -599,8 +752,8 @@ static int __init intel_cqm_init(void)
 
 	__perf_cpu_notifier(intel_cqm_cpu_notifier);
 
-	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
-
+	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm",
+				PERF_TYPE_INTEL_CQM);
 	if (ret)
 		pr_err("Intel CQM perf registration failed: %d\n", ret);
 	else
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ca5504c48f4f..dac4c2831d82 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -129,6 +129,7 @@ struct hw_perf_event {
 			struct list_head cqm_events_entry;
 			struct list_head cqm_groups_entry;
 			struct list_head cqm_group_entry;
+			struct task_struct *cqm_target;
 		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1e3cd07cf76e..3c8b45de57ec 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -32,6 +32,7 @@ enum perf_type_id {
 	PERF_TYPE_HW_CACHE = 3,
 	PERF_TYPE_RAW = 4,
 	PERF_TYPE_BREAKPOINT = 5,
+	PERF_TYPE_INTEL_CQM = 6,
 
 	PERF_TYPE_MAX, /* non-ABI */
 };
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1fc3bae5904a..71109a045450 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7181,6 +7181,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		else if (attr->type == PERF_TYPE_BREAKPOINT)
 			event->hw.bp_target = task;
 #endif
+		else if (attr->type == PERF_TYPE_INTEL_CQM)
+			event->hw.cqm_target = task;
 	}
 
 	if (!overflow_handler && parent_event) {
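
For context only (this is not part of the commit): a minimal userspace sketch of what the patch enables, namely opening a CQM llc_occupancy event against a task (pid) rather than a CPU, which intel_cqm_event_init() previously rejected. The type value 6 mirrors the PERF_TYPE_INTEL_CQM constant added above; the config value 1 for llc_occupancy is an assumption for illustration, and a robust tool would instead read the "type" and "events/llc_occupancy" files under /sys/bus/event_source/devices/intel_cqm/ and apply any scaling advertised there.

/*
 * Illustrative sketch only -- not part of the patch. Opens an Intel CQM
 * LLC occupancy event for a single task and prints the raw value.
 * The config value (1) for llc_occupancy is an assumption; read the
 * event id and type from sysfs rather than hard-coding them.
 */
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(int argc, char **argv)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = 6;		/* PERF_TYPE_INTEL_CQM, as added above */
	attr.config = 1;	/* assumed id of the llc_occupancy event */

	/* pid > 0, cpu == -1: a task event, which this patch now permits */
	fd = perf_event_open(&attr, (pid_t)atoi(argv[1]), -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("llc_occupancy (raw): %llu\n", (unsigned long long)count);

	close(fd);
	return 0;
}

Reading the fd goes through the new intel_cqm_event_count() path: the group leader IPIs one CPU per package (cqm_cpumask) to read the RMID and sums the results, which is why the value cannot be broken down per package.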