path: root/arch/x86/kernel/cpu/perf_event_intel_cqm.c
author     Matt Fleming <matt.fleming@intel.com>    2015-01-23 13:45:47 -0500
committer  Ingo Molnar <mingo@kernel.org>           2015-02-25 07:53:35 -0500
commit     bff671dba7981195a644a5dc210d65de8ae2d251 (patch)
tree       879bcaeaca6121d89e3bc8f978d169a78751f387 /arch/x86/kernel/cpu/perf_event_intel_cqm.c
parent     bfe1fcd2688f557a6b6a88f59ea7619228728bd7 (diff)
perf/x86/intel: Perform rotation on Intel CQM RMIDs
There are many use cases where people will want to monitor more tasks
than there exist RMIDs in the hardware, meaning that we have to perform
some kind of multiplexing.

We do this by "rotating" the RMIDs in a workqueue, and assigning an RMID
to a waiting event when the RMID becomes unused.

This scheme reserves one RMID at all times for rotation. When we need to
schedule a new event we give it the reserved RMID, pick a victim event
from the front of the global CQM list and wait for the victim's RMID to
drop to zero occupancy, before it becomes the new reserved RMID.

We put the victim's RMID onto the limbo list, where it resides for a
"minimum queue time", which is intended to save ourselves an expensive
SMP IPI when the RMID is unlikely to have an occupancy value below
__intel_cqm_threshold.

If we fail to recycle an RMID, even after waiting the minimum queue
time, then we need to increment __intel_cqm_threshold. There is an upper
bound on this threshold, __intel_cqm_max_threshold, which is
programmable from userland as /sys/devices/intel_cqm/max_recycling_threshold.

The comments above __intel_cqm_rmid_rotate() have more details.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-9-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
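
The recycling described above amounts to a small per-RMID state machine. The following stand-alone C sketch is illustrative only: it models one stabilization pass over the limbo list, uses a plain array instead of the kernel's list_head machinery, and a fake occupancy field stands in for the MSR read done by __rmid_read(); it mirrors the patch's naming but is not the kernel code.

#include <stdio.h>

#define NR_LIMBO 3

enum rmid_state { RMID_YOUNG, RMID_AVAILABLE, RMID_DIRTY, RMID_FREE };

struct rmid_entry {
	unsigned int rmid;
	enum rmid_state state;
	unsigned long occupancy;	/* stand-in for __rmid_read() */
};

static unsigned long threshold;		/* plays the role of __intel_cqm_threshold */

/*
 * One stabilization pass: RMIDs that have waited out the minimum queue
 * time (RMID_AVAILABLE) and whose occupancy is at or below the threshold
 * become reusable; the rest stay dirty. If nothing could be freed, the
 * real driver eventually bumps the threshold, which we mimic here.
 */
static unsigned int stabilize(struct rmid_entry *e, unsigned int n)
{
	unsigned int freed = 0;

	for (unsigned int i = 0; i < n; i++) {
		if (e[i].state != RMID_AVAILABLE)
			continue;
		if (e[i].occupancy > threshold) {
			e[i].state = RMID_DIRTY;
		} else {
			e[i].state = RMID_FREE;
			freed++;
		}
	}

	if (!freed)
		threshold++;

	return freed;
}

int main(void)
{
	struct rmid_entry limbo[NR_LIMBO] = {
		{ .rmid = 1, .state = RMID_AVAILABLE, .occupancy = 0 },
		{ .rmid = 2, .state = RMID_AVAILABLE, .occupancy = 4096 },
		{ .rmid = 3, .state = RMID_YOUNG,     .occupancy = 128 },
	};

	printf("freed %u RMIDs, threshold now %lu\n",
	       stabilize(limbo, NR_LIMBO), threshold);
	return 0;
}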
Diffstat (limited to 'arch/x86/kernel/cpu/perf_event_intel_cqm.c')
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_cqm.c  671
1 file changed, 623 insertions, 48 deletions
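
Because the occupancy data is exposed through the regular perf interface, a user-space consumer can read it with perf_event_open(). The sketch below is a hedged example, not part of this patch: it assumes the PMU registers as "intel_cqm" under /sys/bus/event_source/devices and that the llc_occupancy event uses config 1 (QOS_L3_OCCUP_EVENT_ID); a robust tool would parse the events/llc_occupancy and events/llc_occupancy.scale attributes instead of hardcoding them.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	unsigned int type;
	uint64_t count;
	FILE *f;
	int fd;

	/* The PMU is assigned a dynamic type id; read it from sysfs. */
	f = fopen("/sys/bus/event_source/devices/intel_cqm/type", "r");
	if (!f || fscanf(f, "%u", &type) != 1) {
		fprintf(stderr, "intel_cqm PMU not available\n");
		return 1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.type = type;
	attr.size = sizeof(attr);
	attr.config = 1;	/* assumed: QOS_L3_OCCUP_EVENT_ID */

	/* Monitor the calling task on any CPU. */
	fd = perf_event_open(&attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	sleep(1);	/* let the task touch some cache */

	/* Raw occupancy count; multiply by events/llc_occupancy.scale for bytes. */
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("llc_occupancy (raw): %llu\n", (unsigned long long)count);

	close(fd);
	return 0;
}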
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 8003d87afd89..e31f5086f2b5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -25,9 +25,13 @@ struct intel_cqm_state {
25static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state); 25static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
26 26
27/* 27/*
28 * Protects cache_cgroups and cqm_rmid_lru. 28 * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
29 * Also protects event->hw.cqm_rmid
30 *
31 * Hold either for stability, both for modification of ->hw.cqm_rmid.
29 */ 32 */
30static DEFINE_MUTEX(cache_mutex); 33static DEFINE_MUTEX(cache_mutex);
34static DEFINE_RAW_SPINLOCK(cache_lock);
31 35
32/* 36/*
33 * Groups of events that have the same target(s), one RMID per group. 37 * Groups of events that have the same target(s), one RMID per group.
@@ -46,7 +50,34 @@ static cpumask_t cqm_cpumask;
46 50
47#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID 51#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
48 52
49static u64 __rmid_read(unsigned long rmid) 53/*
54 * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
55 *
56 * This rmid is always free and is guaranteed to have an associated
57 * near-zero occupancy value, i.e. no cachelines are tagged with this
58 * RMID, once __intel_cqm_rmid_rotate() returns.
59 */
60static unsigned int intel_cqm_rotation_rmid;
61
62#define INVALID_RMID (-1)
63
64/*
65 * Is @rmid valid for programming the hardware?
66 *
67 * rmid 0 is reserved by the hardware for all non-monitored tasks, which
68 * means that we should never come across an rmid with that value.
69 * Likewise, an rmid value of -1 is used to indicate "no rmid currently
70 * assigned" and is used as part of the rotation code.
71 */
72static inline bool __rmid_valid(unsigned int rmid)
73{
74 if (!rmid || rmid == INVALID_RMID)
75 return false;
76
77 return true;
78}
79
80static u64 __rmid_read(unsigned int rmid)
50{ 81{
51 u64 val; 82 u64 val;
52 83
@@ -64,13 +95,21 @@ static u64 __rmid_read(unsigned long rmid)
64 return val; 95 return val;
65} 96}
66 97
98enum rmid_recycle_state {
99 RMID_YOUNG = 0,
100 RMID_AVAILABLE,
101 RMID_DIRTY,
102};
103
67struct cqm_rmid_entry { 104struct cqm_rmid_entry {
68 u64 rmid; 105 unsigned int rmid;
106 enum rmid_recycle_state state;
69 struct list_head list; 107 struct list_head list;
108 unsigned long queue_time;
70}; 109};
71 110
72/* 111/*
73 * A least recently used list of RMIDs. 112 * cqm_rmid_free_lru - A least recently used list of RMIDs.
74 * 113 *
75 * Oldest entry at the head, newest (most recently used) entry at the 114 * Oldest entry at the head, newest (most recently used) entry at the
76 * tail. This list is never traversed, it's only used to keep track of 115 * tail. This list is never traversed, it's only used to keep track of
@@ -81,9 +120,18 @@ struct cqm_rmid_entry {
81 * in use. To mark an RMID as in use, remove its entry from the lru 120 * in use. To mark an RMID as in use, remove its entry from the lru
82 * list. 121 * list.
83 * 122 *
84 * This list is protected by cache_mutex. 123 *
124 * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
125 *
126 * This list contains RMIDs that no one is currently using but that
127 * may have a non-zero occupancy value associated with them. The
128 * rotation worker moves RMIDs from the limbo list to the free list once
129 * the occupancy value drops below __intel_cqm_threshold.
130 *
131 * Both lists are protected by cache_mutex.
85 */ 132 */
86static LIST_HEAD(cqm_rmid_lru); 133static LIST_HEAD(cqm_rmid_free_lru);
134static LIST_HEAD(cqm_rmid_limbo_lru);
87 135
88/* 136/*
89 * We use a simple array of pointers so that we can lookup a struct 137 * We use a simple array of pointers so that we can lookup a struct
@@ -120,37 +168,43 @@ static int __get_rmid(void)
120 168
121 lockdep_assert_held(&cache_mutex); 169 lockdep_assert_held(&cache_mutex);
122 170
123 if (list_empty(&cqm_rmid_lru)) 171 if (list_empty(&cqm_rmid_free_lru))
124 return -EAGAIN; 172 return INVALID_RMID;
125 173
126 entry = list_first_entry(&cqm_rmid_lru, struct cqm_rmid_entry, list); 174 entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
127 list_del(&entry->list); 175 list_del(&entry->list);
128 176
129 return entry->rmid; 177 return entry->rmid;
130} 178}
131 179
132static void __put_rmid(int rmid) 180static void __put_rmid(unsigned int rmid)
133{ 181{
134 struct cqm_rmid_entry *entry; 182 struct cqm_rmid_entry *entry;
135 183
136 lockdep_assert_held(&cache_mutex); 184 lockdep_assert_held(&cache_mutex);
137 185
186 WARN_ON(!__rmid_valid(rmid));
138 entry = __rmid_entry(rmid); 187 entry = __rmid_entry(rmid);
139 188
140 list_add_tail(&entry->list, &cqm_rmid_lru); 189 entry->queue_time = jiffies;
190 entry->state = RMID_YOUNG;
191
192 list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
141} 193}
142 194
143static int intel_cqm_setup_rmid_cache(void) 195static int intel_cqm_setup_rmid_cache(void)
144{ 196{
145 struct cqm_rmid_entry *entry; 197 struct cqm_rmid_entry *entry;
146 int r; 198 unsigned int nr_rmids;
199 int r = 0;
147 200
201 nr_rmids = cqm_max_rmid + 1;
148 cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) * 202 cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
149 (cqm_max_rmid + 1), GFP_KERNEL); 203 nr_rmids, GFP_KERNEL);
150 if (!cqm_rmid_ptrs) 204 if (!cqm_rmid_ptrs)
151 return -ENOMEM; 205 return -ENOMEM;
152 206
153 for (r = 0; r <= cqm_max_rmid; r++) { 207 for (; r <= cqm_max_rmid; r++) {
154 struct cqm_rmid_entry *entry; 208 struct cqm_rmid_entry *entry;
155 209
156 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 210 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -161,7 +215,7 @@ static int intel_cqm_setup_rmid_cache(void)
161 entry->rmid = r; 215 entry->rmid = r;
162 cqm_rmid_ptrs[r] = entry; 216 cqm_rmid_ptrs[r] = entry;
163 217
164 list_add_tail(&entry->list, &cqm_rmid_lru); 218 list_add_tail(&entry->list, &cqm_rmid_free_lru);
165 } 219 }
166 220
167 /* 221 /*
@@ -171,6 +225,10 @@ static int intel_cqm_setup_rmid_cache(void)
171 entry = __rmid_entry(0); 225 entry = __rmid_entry(0);
172 list_del(&entry->list); 226 list_del(&entry->list);
173 227
228 mutex_lock(&cache_mutex);
229 intel_cqm_rotation_rmid = __get_rmid();
230 mutex_unlock(&cache_mutex);
231
174 return 0; 232 return 0;
175fail: 233fail:
176 while (r--) 234 while (r--)
@@ -313,6 +371,424 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
313 return false; 371 return false;
314} 372}
315 373
374struct rmid_read {
375 unsigned int rmid;
376 atomic64_t value;
377};
378
379static void __intel_cqm_event_count(void *info);
380
381/*
382 * Exchange the RMID of a group of events.
383 */
384static unsigned int
385intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
386{
387 struct perf_event *event;
388 unsigned int old_rmid = group->hw.cqm_rmid;
389 struct list_head *head = &group->hw.cqm_group_entry;
390
391 lockdep_assert_held(&cache_mutex);
392
393 /*
394 * If our RMID is being deallocated, perform a read now.
395 */
396 if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
397 struct rmid_read rr = {
398 .value = ATOMIC64_INIT(0),
399 .rmid = old_rmid,
400 };
401
402 on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
403 &rr, 1);
404 local64_set(&group->count, atomic64_read(&rr.value));
405 }
406
407 raw_spin_lock_irq(&cache_lock);
408
409 group->hw.cqm_rmid = rmid;
410 list_for_each_entry(event, head, hw.cqm_group_entry)
411 event->hw.cqm_rmid = rmid;
412
413 raw_spin_unlock_irq(&cache_lock);
414
415 return old_rmid;
416}
417
418/*
419 * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
420 * cachelines are still tagged with RMIDs in limbo, we progressively
421 * increment the threshold until we find an RMID in limbo with <=
422 * __intel_cqm_threshold lines tagged. This is designed to mitigate the
423 * problem where cachelines tagged with an RMID are not steadily being
424 * evicted.
425 *
426 * On successful rotations we decrease the threshold back towards zero.
427 *
428 * __intel_cqm_max_threshold provides an upper bound on the threshold,
429 * and is measured in bytes because it's exposed to userland.
430 */
431static unsigned int __intel_cqm_threshold;
432static unsigned int __intel_cqm_max_threshold;
433
434/*
435 * Test whether an RMID has a zero occupancy value on this cpu.
436 */
437static void intel_cqm_stable(void *arg)
438{
439 struct cqm_rmid_entry *entry;
440
441 list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
442 if (entry->state != RMID_AVAILABLE)
443 break;
444
445 if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
446 entry->state = RMID_DIRTY;
447 }
448}
449
450/*
451 * If we have group events waiting for an RMID that don't conflict with
452 * events already running, assign @rmid.
453 */
454static bool intel_cqm_sched_in_event(unsigned int rmid)
455{
456 struct perf_event *leader, *event;
457
458 lockdep_assert_held(&cache_mutex);
459
460 leader = list_first_entry(&cache_groups, struct perf_event,
461 hw.cqm_groups_entry);
462 event = leader;
463
464 list_for_each_entry_continue(event, &cache_groups,
465 hw.cqm_groups_entry) {
466 if (__rmid_valid(event->hw.cqm_rmid))
467 continue;
468
469 if (__conflict_event(event, leader))
470 continue;
471
472 intel_cqm_xchg_rmid(event, rmid);
473 return true;
474 }
475
476 return false;
477}
478
479/*
480 * Initially use this constant for both the limbo queue time and the
481 * rotation timer interval, pmu::hrtimer_interval_ms.
482 *
483 * They don't need to be the same, but the two are related since if you
484 * rotate faster than you recycle RMIDs, you may run out of available
485 * RMIDs.
486 */
487#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */
488
489static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
490
491/*
492 * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
493 * @nr_available: number of freeable RMIDs on the limbo list
494 *
495 * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
496 * cachelines are tagged with those RMIDs. After this we can reuse them
497 * and know that the current set of active RMIDs is stable.
498 *
499 * Return %true or %false depending on whether stabilization needs to be
500 * reattempted.
501 *
502 * If we return %true then @nr_available is updated to indicate the
503 * number of RMIDs on the limbo list that have been queued for the
504 * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
505 * are above __intel_cqm_threshold.
506 */
507static bool intel_cqm_rmid_stabilize(unsigned int *available)
508{
509 struct cqm_rmid_entry *entry, *tmp;
510 struct perf_event *event;
511
512 lockdep_assert_held(&cache_mutex);
513
514 *available = 0;
515 list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
516 unsigned long min_queue_time;
517 unsigned long now = jiffies;
518
519 /*
520 * We hold RMIDs placed into limbo for a minimum queue
521 * time. Before the minimum queue time has elapsed we do
522 * not recycle RMIDs.
523 *
524 * The reasoning is that until a sufficient time has
525 * passed since we stopped using an RMID, any RMID
526 * placed onto the limbo list will likely still have
527 * data tagged in the cache, which means we'll probably
528 * fail to recycle it anyway.
529 *
530 * We can save ourselves an expensive IPI by skipping
531 * any RMIDs that have not been queued for the minimum
532 * time.
533 */
534 min_queue_time = entry->queue_time +
535 msecs_to_jiffies(__rmid_queue_time_ms);
536
537 if (time_after(min_queue_time, now))
538 break;
539
540 entry->state = RMID_AVAILABLE;
541 (*available)++;
542 }
543
544 /*
545 * Fast return if none of the RMIDs on the limbo list have been
546 * sitting on the queue for the minimum queue time.
547 */
548 if (!*available)
549 return false;
550
551 /*
552 * Test whether an RMID is free for each package.
553 */
554 on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
555
556 list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
557 /*
558 * Exhausted all RMIDs that have waited min queue time.
559 */
560 if (entry->state == RMID_YOUNG)
561 break;
562
563 if (entry->state == RMID_DIRTY)
564 continue;
565
566 list_del(&entry->list); /* remove from limbo */
567
568 /*
569 * The rotation RMID gets priority if it's
570 * currently invalid, in which case we skip adding
571 * the RMID to the free lru.
572 */
573 if (!__rmid_valid(intel_cqm_rotation_rmid)) {
574 intel_cqm_rotation_rmid = entry->rmid;
575 continue;
576 }
577
578 /*
579 * If we have groups waiting for RMIDs, hand
580 * them one now.
581 */
582 list_for_each_entry(event, &cache_groups,
583 hw.cqm_groups_entry) {
584 if (__rmid_valid(event->hw.cqm_rmid))
585 continue;
586
587 intel_cqm_xchg_rmid(event, entry->rmid);
588 entry = NULL;
589 break;
590 }
591
592 if (!entry)
593 continue;
594
595 /*
596 * Otherwise place it onto the free list.
597 */
598 list_add_tail(&entry->list, &cqm_rmid_free_lru);
599 }
600
601
602 return __rmid_valid(intel_cqm_rotation_rmid);
603}
604
605/*
606 * Pick a victim group and move it to the tail of the group list.
607 */
608static struct perf_event *
609__intel_cqm_pick_and_rotate(void)
610{
611 struct perf_event *rotor;
612
613 lockdep_assert_held(&cache_mutex);
614 lockdep_assert_held(&cache_lock);
615
616 rotor = list_first_entry(&cache_groups, struct perf_event,
617 hw.cqm_groups_entry);
618 list_rotate_left(&cache_groups);
619
620 return rotor;
621}
622
623/*
624 * Attempt to rotate the groups and assign new RMIDs.
625 *
626 * Rotating RMIDs is complicated because the hardware doesn't give us
627 * any clues.
628 *
629 * There are problems with the hardware interface; when you change the
630 * task:RMID map, cachelines retain their 'old' tags, giving a skewed
631 * picture. In order to work around this, we must always keep one free
632 * RMID - intel_cqm_rotation_rmid.
633 *
634 * Rotation works by taking away an RMID from a group (the old RMID),
635 * and assigning the free RMID to another group (the new RMID). We must
636 * then wait for the old RMID to not be used (no cachelines tagged).
637 * This ensures that all cachelines are tagged with 'active' RMIDs. At
638 * this point we can start reading values for the new RMID and treat the
639 * old RMID as the free RMID for the next rotation.
640 *
641 * Return %true or %false depending on whether we did any rotating.
642 */
643static bool __intel_cqm_rmid_rotate(void)
644{
645 struct perf_event *group, *rotor, *start = NULL;
646 unsigned int threshold_limit;
647 unsigned int nr_needed = 0;
648 unsigned int nr_available;
649 unsigned int rmid;
650 bool rotated = false;
651
652 mutex_lock(&cache_mutex);
653
654again:
655 /*
656 * Fast path through this function if there are no groups and no
657 * RMIDs that need cleaning.
658 */
659 if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
660 goto out;
661
662 list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
663 if (!__rmid_valid(group->hw.cqm_rmid)) {
664 if (!start)
665 start = group;
666 nr_needed++;
667 }
668 }
669
670 /*
671 * We have some event groups, but they all have RMIDs assigned
672 * and no RMIDs need cleaning.
673 */
674 if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
675 goto out;
676
677 if (!nr_needed)
678 goto stabilize;
679
680 /*
681 * We have more event groups without RMIDs than available RMIDs.
682 *
683 * We force deallocate the rmid of the group at the head of
684 * cache_groups. The first event group without an RMID then gets
685 * assigned intel_cqm_rotation_rmid. This ensures we always make
686 * forward progress.
687 *
688 * Rotate the cache_groups list so the previous head is now the
689 * tail.
690 */
691 rotor = __intel_cqm_pick_and_rotate();
692 rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
693
694 /*
695 * The group at the front of the list should always have a valid
696 * RMID. If it doesn't then no groups have RMIDs assigned.
697 */
698 if (!__rmid_valid(rmid))
699 goto stabilize;
700
701 /*
702 * If the rotation is going to succeed, reduce the threshold so
703 * that we don't needlessly reuse dirty RMIDs.
704 */
705 if (__rmid_valid(intel_cqm_rotation_rmid)) {
706 intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
707 intel_cqm_rotation_rmid = INVALID_RMID;
708
709 if (__intel_cqm_threshold)
710 __intel_cqm_threshold--;
711 }
712
713 __put_rmid(rmid);
714
715 rotated = true;
716
717stabilize:
718 /*
719 * We now need to stabilize the RMID we freed above (if any) to
720 * ensure that the next time we rotate we have an RMID with zero
721 * occupancy value.
722 *
723 * Alternatively, if we didn't need to perform any rotation,
724 * we'll have a bunch of RMIDs in limbo that need stabilizing.
725 */
726 threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
727
728 while (intel_cqm_rmid_stabilize(&nr_available) &&
729 __intel_cqm_threshold < threshold_limit) {
730 unsigned int steal_limit;
731
732 /*
733 * Don't spin if nobody is actively waiting for an RMID,
734 * the rotation worker will be kicked as soon as an
735 * event needs an RMID anyway.
736 */
737 if (!nr_needed)
738 break;
739
740 /* Allow max 25% of RMIDs to be in limbo. */
741 steal_limit = (cqm_max_rmid + 1) / 4;
742
743 /*
744 * We failed to stabilize any RMIDs so our rotation
745 * logic is now stuck. In order to make forward progress
746 * we have a few options:
747 *
748 * 1. rotate ("steal") another RMID
749 * 2. increase the threshold
750 * 3. do nothing
751 *
752 * We do both of 1. and 2. until we hit the steal limit.
753 *
754 * The steal limit prevents all RMIDs ending up on the
755 * limbo list. This can happen if every RMID has a
756 * non-zero occupancy above threshold_limit, and the
757 * occupancy values aren't dropping fast enough.
758 *
759 * Note that there is prioritisation at work here - we'd
760 * rather increase the number of RMIDs on the limbo list
761 * than increase the threshold, because increasing the
762 * threshold skews the event data (because we reuse
763 * dirty RMIDs) - threshold bumps are a last resort.
764 */
765 if (nr_available < steal_limit)
766 goto again;
767
768 __intel_cqm_threshold++;
769 }
770
771out:
772 mutex_unlock(&cache_mutex);
773 return rotated;
774}
775
776static void intel_cqm_rmid_rotate(struct work_struct *work);
777
778static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
779
780static struct pmu intel_cqm_pmu;
781
782static void intel_cqm_rmid_rotate(struct work_struct *work)
783{
784 unsigned long delay;
785
786 __intel_cqm_rmid_rotate();
787
788 delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
789 schedule_delayed_work(&intel_cqm_rmid_work, delay);
790}
791
316/* 792/*
317 * Find a group and setup RMID. 793 * Find a group and setup RMID.
318 * 794 *
@@ -322,7 +798,6 @@ static int intel_cqm_setup_event(struct perf_event *event,
322 struct perf_event **group) 798 struct perf_event **group)
323{ 799{
324 struct perf_event *iter; 800 struct perf_event *iter;
325 int rmid;
326 801
327 list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { 802 list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
328 if (__match_event(iter, event)) { 803 if (__match_event(iter, event)) {
@@ -336,17 +811,14 @@ static int intel_cqm_setup_event(struct perf_event *event,
336 return -EBUSY; 811 return -EBUSY;
337 } 812 }
338 813
339 rmid = __get_rmid(); 814 event->hw.cqm_rmid = __get_rmid();
340 if (rmid < 0)
341 return rmid;
342
343 event->hw.cqm_rmid = rmid;
344 return 0; 815 return 0;
345} 816}
346 817
347static void intel_cqm_event_read(struct perf_event *event) 818static void intel_cqm_event_read(struct perf_event *event)
348{ 819{
349 unsigned long rmid; 820 unsigned long flags;
821 unsigned int rmid;
350 u64 val; 822 u64 val;
351 823
352 /* 824 /*
@@ -355,23 +827,25 @@ static void intel_cqm_event_read(struct perf_event *event)
355 if (event->cpu == -1) 827 if (event->cpu == -1)
356 return; 828 return;
357 829
830 raw_spin_lock_irqsave(&cache_lock, flags);
358 rmid = event->hw.cqm_rmid; 831 rmid = event->hw.cqm_rmid;
832
833 if (!__rmid_valid(rmid))
834 goto out;
835
359 val = __rmid_read(rmid); 836 val = __rmid_read(rmid);
360 837
361 /* 838 /*
362 * Ignore this reading on error states and do not update the value. 839 * Ignore this reading on error states and do not update the value.
363 */ 840 */
364 if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) 841 if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
365 return; 842 goto out;
366 843
367 local64_set(&event->count, val); 844 local64_set(&event->count, val);
845out:
846 raw_spin_unlock_irqrestore(&cache_lock, flags);
368} 847}
369 848
370struct rmid_read {
371 unsigned int rmid;
372 atomic64_t value;
373};
374
375static void __intel_cqm_event_count(void *info) 849static void __intel_cqm_event_count(void *info)
376{ 850{
377 struct rmid_read *rr = info; 851 struct rmid_read *rr = info;
@@ -392,8 +866,8 @@ static inline bool cqm_group_leader(struct perf_event *event)
392 866
393static u64 intel_cqm_event_count(struct perf_event *event) 867static u64 intel_cqm_event_count(struct perf_event *event)
394{ 868{
869 unsigned long flags;
395 struct rmid_read rr = { 870 struct rmid_read rr = {
396 .rmid = event->hw.cqm_rmid,
397 .value = ATOMIC64_INIT(0), 871 .value = ATOMIC64_INIT(0),
398 }; 872 };
399 873
@@ -417,17 +891,36 @@ static u64 intel_cqm_event_count(struct perf_event *event)
417 if (!cqm_group_leader(event)) 891 if (!cqm_group_leader(event))
418 return 0; 892 return 0;
419 893
420 on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); 894 /*
895 * Notice that we don't perform the reading of an RMID
896 * atomically, because we can't hold a spin lock across the
897 * IPIs.
898 *
899 * Speculatively perform the read, since @event might be
900 * assigned a different (possibly invalid) RMID while we're
901 * busy performing the IPI calls. It's therefore necessary to
902 * check @event's RMID afterwards, and if it has changed,
903 * discard the result of the read.
904 */
905 rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
421 906
422 local64_set(&event->count, atomic64_read(&rr.value)); 907 if (!__rmid_valid(rr.rmid))
908 goto out;
909
910 on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
423 911
912 raw_spin_lock_irqsave(&cache_lock, flags);
913 if (event->hw.cqm_rmid == rr.rmid)
914 local64_set(&event->count, atomic64_read(&rr.value));
915 raw_spin_unlock_irqrestore(&cache_lock, flags);
916out:
424 return __perf_event_count(event); 917 return __perf_event_count(event);
425} 918}
426 919
427static void intel_cqm_event_start(struct perf_event *event, int mode) 920static void intel_cqm_event_start(struct perf_event *event, int mode)
428{ 921{
429 struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); 922 struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
430 unsigned long rmid = event->hw.cqm_rmid; 923 unsigned int rmid = event->hw.cqm_rmid;
431 unsigned long flags; 924 unsigned long flags;
432 925
433 if (!(event->hw.cqm_state & PERF_HES_STOPPED)) 926 if (!(event->hw.cqm_state & PERF_HES_STOPPED))
@@ -473,15 +966,19 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode)
473 966
474static int intel_cqm_event_add(struct perf_event *event, int mode) 967static int intel_cqm_event_add(struct perf_event *event, int mode)
475{ 968{
476 int rmid; 969 unsigned long flags;
970 unsigned int rmid;
971
972 raw_spin_lock_irqsave(&cache_lock, flags);
477 973
478 event->hw.cqm_state = PERF_HES_STOPPED; 974 event->hw.cqm_state = PERF_HES_STOPPED;
479 rmid = event->hw.cqm_rmid; 975 rmid = event->hw.cqm_rmid;
480 WARN_ON_ONCE(!rmid);
481 976
482 if (mode & PERF_EF_START) 977 if (__rmid_valid(rmid) && (mode & PERF_EF_START))
483 intel_cqm_event_start(event, mode); 978 intel_cqm_event_start(event, mode);
484 979
980 raw_spin_unlock_irqrestore(&cache_lock, flags);
981
485 return 0; 982 return 0;
486} 983}
487 984
@@ -518,9 +1015,10 @@ static void intel_cqm_event_destroy(struct perf_event *event)
518 list_replace(&event->hw.cqm_groups_entry, 1015 list_replace(&event->hw.cqm_groups_entry,
519 &group_other->hw.cqm_groups_entry); 1016 &group_other->hw.cqm_groups_entry);
520 } else { 1017 } else {
521 int rmid = event->hw.cqm_rmid; 1018 unsigned int rmid = event->hw.cqm_rmid;
522 1019
523 __put_rmid(rmid); 1020 if (__rmid_valid(rmid))
1021 __put_rmid(rmid);
524 list_del(&event->hw.cqm_groups_entry); 1022 list_del(&event->hw.cqm_groups_entry);
525 } 1023 }
526 } 1024 }
@@ -528,11 +1026,10 @@ static void intel_cqm_event_destroy(struct perf_event *event)
528 mutex_unlock(&cache_mutex); 1026 mutex_unlock(&cache_mutex);
529} 1027}
530 1028
531static struct pmu intel_cqm_pmu;
532
533static int intel_cqm_event_init(struct perf_event *event) 1029static int intel_cqm_event_init(struct perf_event *event)
534{ 1030{
535 struct perf_event *group = NULL; 1031 struct perf_event *group = NULL;
1032 bool rotate = false;
536 int err; 1033 int err;
537 1034
538 if (event->attr.type != intel_cqm_pmu.type) 1035 if (event->attr.type != intel_cqm_pmu.type)
@@ -569,10 +1066,24 @@ static int intel_cqm_event_init(struct perf_event *event)
569 } else { 1066 } else {
570 list_add_tail(&event->hw.cqm_groups_entry, 1067 list_add_tail(&event->hw.cqm_groups_entry,
571 &cache_groups); 1068 &cache_groups);
1069
1070 /*
1071 * All RMIDs are either in use or have recently been
1072 * used. Kick the rotation worker to clean/free some.
1073 *
1074 * We only do this for the group leader, rather than for
1075 * every event in a group to save on needless work.
1076 */
1077 if (!__rmid_valid(event->hw.cqm_rmid))
1078 rotate = true;
572 } 1079 }
573 1080
574out: 1081out:
575 mutex_unlock(&cache_mutex); 1082 mutex_unlock(&cache_mutex);
1083
1084 if (rotate)
1085 schedule_delayed_work(&intel_cqm_rmid_work, 0);
1086
576 return err; 1087 return err;
577} 1088}
578 1089
@@ -607,22 +1118,76 @@ static struct attribute_group intel_cqm_format_group = {
607 .attrs = intel_cqm_formats_attr, 1118 .attrs = intel_cqm_formats_attr,
608}; 1119};
609 1120
1121static ssize_t
1122max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
1123 char *page)
1124{
1125 ssize_t rv;
1126
1127 mutex_lock(&cache_mutex);
1128 rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
1129 mutex_unlock(&cache_mutex);
1130
1131 return rv;
1132}
1133
1134static ssize_t
1135max_recycle_threshold_store(struct device *dev,
1136 struct device_attribute *attr,
1137 const char *buf, size_t count)
1138{
1139 unsigned int bytes, cachelines;
1140 int ret;
1141
1142 ret = kstrtouint(buf, 0, &bytes);
1143 if (ret)
1144 return ret;
1145
1146 mutex_lock(&cache_mutex);
1147
1148 __intel_cqm_max_threshold = bytes;
1149 cachelines = bytes / cqm_l3_scale;
1150
1151 /*
1152 * The new maximum takes effect immediately.
1153 */
1154 if (__intel_cqm_threshold > cachelines)
1155 __intel_cqm_threshold = cachelines;
1156
1157 mutex_unlock(&cache_mutex);
1158
1159 return count;
1160}
1161
1162static DEVICE_ATTR_RW(max_recycle_threshold);
1163
1164static struct attribute *intel_cqm_attrs[] = {
1165 &dev_attr_max_recycle_threshold.attr,
1166 NULL,
1167};
1168
1169static const struct attribute_group intel_cqm_group = {
1170 .attrs = intel_cqm_attrs,
1171};
1172
610static const struct attribute_group *intel_cqm_attr_groups[] = { 1173static const struct attribute_group *intel_cqm_attr_groups[] = {
611 &intel_cqm_events_group, 1174 &intel_cqm_events_group,
612 &intel_cqm_format_group, 1175 &intel_cqm_format_group,
1176 &intel_cqm_group,
613 NULL, 1177 NULL,
614}; 1178};
615 1179
616static struct pmu intel_cqm_pmu = { 1180static struct pmu intel_cqm_pmu = {
617 .attr_groups = intel_cqm_attr_groups, 1181 .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
618 .task_ctx_nr = perf_sw_context, 1182 .attr_groups = intel_cqm_attr_groups,
619 .event_init = intel_cqm_event_init, 1183 .task_ctx_nr = perf_sw_context,
620 .add = intel_cqm_event_add, 1184 .event_init = intel_cqm_event_init,
621 .del = intel_cqm_event_del, 1185 .add = intel_cqm_event_add,
622 .start = intel_cqm_event_start, 1186 .del = intel_cqm_event_del,
623 .stop = intel_cqm_event_stop, 1187 .start = intel_cqm_event_start,
624 .read = intel_cqm_event_read, 1188 .stop = intel_cqm_event_stop,
625 .count = intel_cqm_event_count, 1189 .read = intel_cqm_event_read,
1190 .count = intel_cqm_event_count,
626}; 1191};
627 1192
628static inline void cqm_pick_event_reader(int cpu) 1193static inline void cqm_pick_event_reader(int cpu)
@@ -732,6 +1297,16 @@ static int __init intel_cqm_init(void)
732 } 1297 }
733 } 1298 }
734 1299
1300 /*
1301 * A reasonable upper limit on the max threshold is the number
1302 * of lines tagged per RMID if all RMIDs have the same number of
1303 * lines tagged in the LLC.
1304 *
1305 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
1306 */
1307 __intel_cqm_max_threshold =
1308 boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
1309
735 snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); 1310 snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
736 str = kstrdup(scale, GFP_KERNEL); 1311 str = kstrdup(scale, GFP_KERNEL);
737 if (!str) { 1312 if (!str) {