author		Matt Fleming <matt.fleming@intel.com>	2015-01-23 13:45:46 -0500
committer	Ingo Molnar <mingo@kernel.org>	2015-02-25 07:53:34 -0500
commit		bfe1fcd2688f557a6b6a88f59ea7619228728bd7 (patch)
tree		49990bc8e692c283fbc01751100d2952f114f6f6
parent		35298e554c74b7849875e3676ba8eaf833c7b917 (diff)
perf/x86/intel: Support task events with Intel CQM
Add support for task events as well as system-wide events. This change
has a big impact on the way that we gather LLC occupancy values in
intel_cqm_event_read().

Currently, for system-wide (per-cpu) events we defer processing to
userspace which knows how to discard all but one cpu result per package.

Things aren't so simple for task events because we need to do the value
aggregation ourselves. To do this, we defer updating the LLC occupancy
value in event->count from intel_cqm_event_read() and do an SMP
cross-call to read values for all packages in intel_cqm_event_count().
We need to ensure that we only do this for one task event per cache
group, otherwise we'll report duplicate values.

If we're a system-wide event we want to fall back to the default
perf_event_count() implementation. Refactor this into a common function
so that we don't duplicate the code.

Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an
event's task (if the event isn't per-cpu) inside of the Intel CQM PMU
driver. This task information is only available in the upper layers of
the perf infrastructure.

Other perf backends stash the target task in event->hw.*target so we
need to do something similar. The task is used to determine whether
events should share a cache group and an RMID.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Cc: linux-api@vger.kernel.org
Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
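For orientation (not part of the patch): a minimal userspace sketch of the task-event case this change enables, opening a per-task LLC occupancy event via perf_event_open() with pid >= 0 and cpu == -1. The type value 6 is the PERF_TYPE_INTEL_CQM added below; the config value 1 for llc_occupancy is an assumption (the driver advertises the exact event id via sysfs), and the value read back is the raw occupancy count that intel_cqm_event_count() aggregates across packages.

/* Hedged sketch, not part of this commit. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <linux/perf_event.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
				int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(int argc, char **argv)
{
	struct perf_event_attr attr;
	unsigned long long llc_occupancy;
	pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : getpid();
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size   = sizeof(attr);
	attr.type   = 6;	/* PERF_TYPE_INTEL_CQM, added by this patch */
	attr.config = 1;	/* assumed id for llc_occupancy; see the driver's sysfs event list */

	/* pid >= 0 and cpu == -1: a task event, the case this patch enables */
	fd = (int)sys_perf_event_open(&attr, pid, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	sleep(1);

	/* The kernel aggregates occupancy across packages in intel_cqm_event_count() */
	if (read(fd, &llc_occupancy, sizeof(llc_occupancy)) == (ssize_t)sizeof(llc_occupancy))
		printf("llc_occupancy (raw count): %llu\n", llc_occupancy);

	close(fd);
	return 0;
}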
-rw-r--r--	arch/x86/kernel/cpu/perf_event_intel_cqm.c	195
-rw-r--r--	include/linux/perf_event.h	1
-rw-r--r--	include/uapi/linux/perf_event.h	1
-rw-r--r--	kernel/events/core.c	2
4 files changed, 178 insertions(+), 21 deletions(-)
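The fallback to the default perf_event_count() mentioned above relies on the PMU ->count() callback that this series introduces and that the diff below wires up as .count = intel_cqm_event_count. Roughly, the core-side dispatch is assumed to look like the following sketch (not part of this diff):

/*
 * Assumed shape of the core dispatch: if a PMU provides ->count(),
 * perf uses it instead of the generic counter sum, which is what lets
 * intel_cqm_event_count() run the cross-package aggregation for task
 * events while system-wide events keep the default behaviour.
 */
static u64 perf_event_count(struct perf_event *event)
{
	if (event->pmu->count)
		return event->pmu->count(event);

	return __perf_event_count(event);
}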
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index b5d9d746dbc0..8003d87afd89 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -182,23 +182,124 @@ fail:
 
 /*
  * Determine if @a and @b measure the same set of tasks.
+ *
+ * If @a and @b measure the same set of tasks then we want to share a
+ * single RMID.
  */
 static bool __match_event(struct perf_event *a, struct perf_event *b)
 {
+	/* Per-cpu and task events don't mix */
 	if ((a->attach_state & PERF_ATTACH_TASK) !=
 	    (b->attach_state & PERF_ATTACH_TASK))
 		return false;
 
-	/* not task */
+#ifdef CONFIG_CGROUP_PERF
+	if (a->cgrp != b->cgrp)
+		return false;
+#endif
+
+	/* If not task event, we're machine wide */
+	if (!(b->attach_state & PERF_ATTACH_TASK))
+		return true;
+
+	/*
+	 * Events that target same task are placed into the same cache group.
+	 */
+	if (a->hw.cqm_target == b->hw.cqm_target)
+		return true;
+
+	/*
+	 * Are we an inherited event?
+	 */
+	if (b->parent == a)
+		return true;
+
+	return false;
+}
+
+#ifdef CONFIG_CGROUP_PERF
+static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return perf_cgroup_from_task(event->hw.cqm_target);
 
-	return true; /* if not task, we're machine wide */
+	return event->cgrp;
 }
+#endif
 
 /*
  * Determine if @a's tasks intersect with @b's tasks
+ *
+ * There are combinations of events that we explicitly prohibit,
+ *
+ * PROHIBITS
+ *   system-wide  ->  cgroup and task
+ *   cgroup       ->  system-wide
+ *                ->  task in cgroup
+ *   task         ->  system-wide
+ *                ->  task in cgroup
+ *
+ * Call this function before allocating an RMID.
  */
 static bool __conflict_event(struct perf_event *a, struct perf_event *b)
 {
+#ifdef CONFIG_CGROUP_PERF
+	/*
+	 * We can have any number of cgroups but only one system-wide
+	 * event at a time.
+	 */
+	if (a->cgrp && b->cgrp) {
+		struct perf_cgroup *ac = a->cgrp;
+		struct perf_cgroup *bc = b->cgrp;
+
+		/*
+		 * This condition should have been caught in
+		 * __match_event() and we should be sharing an RMID.
+		 */
+		WARN_ON_ONCE(ac == bc);
+
+		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+			return true;
+
+		return false;
+	}
+
+	if (a->cgrp || b->cgrp) {
+		struct perf_cgroup *ac, *bc;
+
+		/*
+		 * cgroup and system-wide events are mutually exclusive
+		 */
+		if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
+		    (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
+			return true;
+
+		/*
+		 * Ensure neither event is part of the other's cgroup
+		 */
+		ac = event_to_cgroup(a);
+		bc = event_to_cgroup(b);
+		if (ac == bc)
+			return true;
+
+		/*
+		 * Must have cgroup and non-intersecting task events.
+		 */
+		if (!ac || !bc)
+			return false;
+
+		/*
+		 * We have cgroup and task events, and the task belongs
+		 * to a cgroup. Check for for overlap.
+		 */
+		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+			return true;
+
+		return false;
+	}
+#endif
 	/*
 	 * If one of them is not a task, same story as above with cgroups.
 	 */
@@ -245,9 +346,16 @@ static int intel_cqm_setup_event(struct perf_event *event,
 
 static void intel_cqm_event_read(struct perf_event *event)
 {
-	unsigned long rmid = event->hw.cqm_rmid;
+	unsigned long rmid;
 	u64 val;
 
+	/*
+	 * Task events are handled by intel_cqm_event_count().
+	 */
+	if (event->cpu == -1)
+		return;
+
+	rmid = event->hw.cqm_rmid;
 	val = __rmid_read(rmid);
 
 	/*
@@ -259,6 +367,63 @@ static void intel_cqm_event_read(struct perf_event *event)
 	local64_set(&event->count, val);
 }
 
+struct rmid_read {
+	unsigned int rmid;
+	atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info)
+{
+	struct rmid_read *rr = info;
+	u64 val;
+
+	val = __rmid_read(rr->rmid);
+
+	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return;
+
+	atomic64_add(val, &rr->value);
+}
+
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+	return !list_empty(&event->hw.cqm_groups_entry);
+}
+
+static u64 intel_cqm_event_count(struct perf_event *event)
+{
+	struct rmid_read rr = {
+		.rmid = event->hw.cqm_rmid,
+		.value = ATOMIC64_INIT(0),
+	};
+
+	/*
+	 * We only need to worry about task events. System-wide events
+	 * are handled like usual, i.e. entirely with
+	 * intel_cqm_event_read().
+	 */
+	if (event->cpu != -1)
+		return __perf_event_count(event);
+
+	/*
+	 * Only the group leader gets to report values. This stops us
+	 * reporting duplicate values to userspace, and gives us a clear
+	 * rule for which task gets to report the values.
+	 *
+	 * Note that it is impossible to attribute these values to
+	 * specific packages - we forfeit that ability when we create
+	 * task events.
+	 */
+	if (!cqm_group_leader(event))
+		return 0;
+
+	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+
+	local64_set(&event->count, atomic64_read(&rr.value));
+
+	return __perf_event_count(event);
+}
+
 static void intel_cqm_event_start(struct perf_event *event, int mode)
 {
 	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
@@ -344,7 +509,7 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 	/*
 	 * And we're the group leader..
 	 */
-	if (!list_empty(&event->hw.cqm_groups_entry)) {
+	if (cqm_group_leader(event)) {
 		/*
 		 * If there was a group_other, make that leader, otherwise
 		 * destroy the group and return the RMID.
@@ -365,17 +530,6 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 
 static struct pmu intel_cqm_pmu;
 
-/*
- * XXX there's a bit of a problem in that we cannot simply do the one
- * event per node as one would want, since that one event would one get
- * scheduled on the one cpu. But we want to 'schedule' the RMID on all
- * CPUs.
- *
- * This means we want events for each CPU, however, that generates a lot
- * of duplicate values out to userspace -- this is not to be helped
- * unless we want to change the core code in some way. Fore more info,
- * see intel_cqm_event_read().
- */
 static int intel_cqm_event_init(struct perf_event *event)
 {
 	struct perf_event *group = NULL;
@@ -387,9 +541,6 @@ static int intel_cqm_event_init(struct perf_event *event)
 	if (event->attr.config & ~QOS_EVENT_MASK)
 		return -EINVAL;
 
-	if (event->cpu == -1)
-		return -EINVAL;
-
 	/* unsupported modes and filters */
 	if (event->attr.exclude_user ||
 	    event->attr.exclude_kernel ||
@@ -407,7 +558,8 @@ static int intel_cqm_event_init(struct perf_event *event)
 
 	mutex_lock(&cache_mutex);
 
-	err = intel_cqm_setup_event(event, &group); /* will also set rmid */
+	/* Will also set rmid */
+	err = intel_cqm_setup_event(event, &group);
 	if (err)
 		goto out;
 
@@ -470,6 +622,7 @@ static struct pmu intel_cqm_pmu = {
 	.start		= intel_cqm_event_start,
 	.stop		= intel_cqm_event_stop,
 	.read		= intel_cqm_event_read,
+	.count		= intel_cqm_event_count,
 };
 
 static inline void cqm_pick_event_reader(int cpu)
@@ -599,8 +752,8 @@ static int __init intel_cqm_init(void)
 
 	__perf_cpu_notifier(intel_cqm_cpu_notifier);
 
-	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
-
+	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm",
+				PERF_TYPE_INTEL_CQM);
 	if (ret)
 		pr_err("Intel CQM perf registration failed: %d\n", ret);
 	else
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ca5504c48f4f..dac4c2831d82 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -129,6 +129,7 @@ struct hw_perf_event {
 			struct list_head	cqm_events_entry;
 			struct list_head	cqm_groups_entry;
 			struct list_head	cqm_group_entry;
+			struct task_struct	*cqm_target;
 		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1e3cd07cf76e..3c8b45de57ec 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -32,6 +32,7 @@ enum perf_type_id {
 	PERF_TYPE_HW_CACHE		= 3,
 	PERF_TYPE_RAW			= 4,
 	PERF_TYPE_BREAKPOINT		= 5,
+	PERF_TYPE_INTEL_CQM		= 6,
 
 	PERF_TYPE_MAX,			/* non-ABI */
 };
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1fc3bae5904a..71109a045450 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7181,6 +7181,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		else if (attr->type == PERF_TYPE_BREAKPOINT)
 			event->hw.bp_target = task;
 #endif
+		else if (attr->type == PERF_TYPE_INTEL_CQM)
+			event->hw.cqm_target = task;
 	}
 
 	if (!overflow_handler && parent_event) {