 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 671
 1 file changed, 623 insertions(+), 48 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 8003d87afd89..e31f5086f2b5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -25,9 +25,13 @@ struct intel_cqm_state {
 static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
 
 /*
- * Protects cache_cgroups and cqm_rmid_lru.
+ * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
+ * Also protects event->hw.cqm_rmid
+ *
+ * Hold either for stability, both for modification of ->hw.cqm_rmid.
  */
 static DEFINE_MUTEX(cache_mutex);
+static DEFINE_RAW_SPINLOCK(cache_lock);
 
 /*
  * Groups of events that have the same target(s), one RMID per group.
@@ -46,7 +50,34 @@ static cpumask_t cqm_cpumask;
 
 #define QOS_EVENT_MASK	QOS_L3_OCCUP_EVENT_ID
 
-static u64 __rmid_read(unsigned long rmid)
+/*
+ * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
+ *
+ * This rmid is always free and is guaranteed to have an associated
+ * near-zero occupancy value, i.e. no cachelines are tagged with this
+ * RMID, once __intel_cqm_rmid_rotate() returns.
+ */
+static unsigned int intel_cqm_rotation_rmid;
+
+#define INVALID_RMID (-1)
+
+/*
+ * Is @rmid valid for programming the hardware?
+ *
+ * rmid 0 is reserved by the hardware for all non-monitored tasks, which
+ * means that we should never come across an rmid with that value.
+ * Likewise, an rmid value of -1 is used to indicate "no rmid currently
+ * assigned" and is used as part of the rotation code.
+ */
+static inline bool __rmid_valid(unsigned int rmid)
+{
+	if (!rmid || rmid == INVALID_RMID)
+		return false;
+
+	return true;
+}
+
+static u64 __rmid_read(unsigned int rmid)
 {
 	u64 val;
 
@@ -64,13 +95,21 @@ static u64 __rmid_read(unsigned long rmid)
 	return val;
 }
 
+enum rmid_recycle_state {
+	RMID_YOUNG = 0,
+	RMID_AVAILABLE,
+	RMID_DIRTY,
+};
+
 struct cqm_rmid_entry {
-	u64 rmid;
+	unsigned int rmid;
+	enum rmid_recycle_state state;
 	struct list_head list;
+	unsigned long queue_time;
 };
 
 /*
- * A least recently used list of RMIDs.
+ * cqm_rmid_free_lru - A least recently used list of RMIDs.
  *
  * Oldest entry at the head, newest (most recently used) entry at the
  * tail. This list is never traversed, it's only used to keep track of
@@ -81,9 +120,18 @@ struct cqm_rmid_entry {
  * in use. To mark an RMID as in use, remove its entry from the lru
  * list.
  *
- * This list is protected by cache_mutex.
+ *
+ * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
+ *
+ * This list contains RMIDs that no one is currently using but that
+ * may have a non-zero occupancy value associated with them. The
+ * rotation worker moves RMIDs from the limbo list to the free list once
+ * the occupancy value drops below __intel_cqm_threshold.
+ *
+ * Both lists are protected by cache_mutex.
  */
-static LIST_HEAD(cqm_rmid_lru);
+static LIST_HEAD(cqm_rmid_free_lru);
+static LIST_HEAD(cqm_rmid_limbo_lru);
 
 /*
  * We use a simple array of pointers so that we can lookup a struct
@@ -120,37 +168,43 @@ static int __get_rmid(void)
 
 	lockdep_assert_held(&cache_mutex);
 
-	if (list_empty(&cqm_rmid_lru))
-		return -EAGAIN;
+	if (list_empty(&cqm_rmid_free_lru))
+		return INVALID_RMID;
 
-	entry = list_first_entry(&cqm_rmid_lru, struct cqm_rmid_entry, list);
+	entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
 	list_del(&entry->list);
 
 	return entry->rmid;
 }
 
-static void __put_rmid(int rmid)
+static void __put_rmid(unsigned int rmid)
 {
 	struct cqm_rmid_entry *entry;
 
 	lockdep_assert_held(&cache_mutex);
 
+	WARN_ON(!__rmid_valid(rmid));
 	entry = __rmid_entry(rmid);
 
-	list_add_tail(&entry->list, &cqm_rmid_lru);
+	entry->queue_time = jiffies;
+	entry->state = RMID_YOUNG;
+
+	list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
 }
 
 static int intel_cqm_setup_rmid_cache(void)
 {
 	struct cqm_rmid_entry *entry;
-	int r;
+	unsigned int nr_rmids;
+	int r = 0;
 
+	nr_rmids = cqm_max_rmid + 1;
 	cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
-				(cqm_max_rmid + 1), GFP_KERNEL);
+				nr_rmids, GFP_KERNEL);
 	if (!cqm_rmid_ptrs)
 		return -ENOMEM;
 
-	for (r = 0; r <= cqm_max_rmid; r++) {
+	for (; r <= cqm_max_rmid; r++) {
 		struct cqm_rmid_entry *entry;
 
 		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -161,7 +215,7 @@ static int intel_cqm_setup_rmid_cache(void)
 		entry->rmid = r;
 		cqm_rmid_ptrs[r] = entry;
 
-		list_add_tail(&entry->list, &cqm_rmid_lru);
+		list_add_tail(&entry->list, &cqm_rmid_free_lru);
 	}
 
 	/*
@@ -171,6 +225,10 @@ static int intel_cqm_setup_rmid_cache(void)
 	entry = __rmid_entry(0);
 	list_del(&entry->list);
 
+	mutex_lock(&cache_mutex);
+	intel_cqm_rotation_rmid = __get_rmid();
+	mutex_unlock(&cache_mutex);
+
 	return 0;
 fail:
 	while (r--)
@@ -313,6 +371,424 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
 	return false;
 }
 
+struct rmid_read {
+	unsigned int rmid;
+	atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info);
+
+/*
+ * Exchange the RMID of a group of events.
+ */
+static unsigned int
+intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
+{
+	struct perf_event *event;
+	unsigned int old_rmid = group->hw.cqm_rmid;
+	struct list_head *head = &group->hw.cqm_group_entry;
+
+	lockdep_assert_held(&cache_mutex);
+
+	/*
+	 * If our RMID is being deallocated, perform a read now.
+	 */
+	if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
+		struct rmid_read rr = {
+			.value = ATOMIC64_INIT(0),
+			.rmid = old_rmid,
+		};
+
+		on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
+				 &rr, 1);
+		local64_set(&group->count, atomic64_read(&rr.value));
+	}
+
+	raw_spin_lock_irq(&cache_lock);
+
+	group->hw.cqm_rmid = rmid;
+	list_for_each_entry(event, head, hw.cqm_group_entry)
+		event->hw.cqm_rmid = rmid;
+
+	raw_spin_unlock_irq(&cache_lock);
+
+	return old_rmid;
+}
+
+/*
+ * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
+ * cachelines are still tagged with RMIDs in limbo, we progressively
+ * increment the threshold until we find an RMID in limbo with <=
+ * __intel_cqm_threshold lines tagged. This is designed to mitigate the
+ * problem where cachelines tagged with an RMID are not steadily being
+ * evicted.
+ *
+ * On successful rotations we decrease the threshold back towards zero.
+ *
+ * __intel_cqm_max_threshold provides an upper bound on the threshold,
+ * and is measured in bytes because it's exposed to userland.
+ */
+static unsigned int __intel_cqm_threshold;
+static unsigned int __intel_cqm_max_threshold;
+
+/*
+ * Test whether an RMID has a zero occupancy value on this cpu.
+ */
+static void intel_cqm_stable(void *arg)
+{
+	struct cqm_rmid_entry *entry;
+
+	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+		if (entry->state != RMID_AVAILABLE)
+			break;
+
+		if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
+			entry->state = RMID_DIRTY;
+	}
+}
+
+/*
+ * If we have group events waiting for an RMID that don't conflict with
+ * events already running, assign @rmid.
+ */
+static bool intel_cqm_sched_in_event(unsigned int rmid)
+{
+	struct perf_event *leader, *event;
+
+	lockdep_assert_held(&cache_mutex);
+
+	leader = list_first_entry(&cache_groups, struct perf_event,
+				  hw.cqm_groups_entry);
+	event = leader;
+
+	list_for_each_entry_continue(event, &cache_groups,
+				     hw.cqm_groups_entry) {
+		if (__rmid_valid(event->hw.cqm_rmid))
+			continue;
+
+		if (__conflict_event(event, leader))
+			continue;
+
+		intel_cqm_xchg_rmid(event, rmid);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Initially use this constant for both the limbo queue time and the
+ * rotation timer interval, pmu::hrtimer_interval_ms.
+ *
+ * They don't need to be the same, but the two are related since if you
+ * rotate faster than you recycle RMIDs, you may run out of available
+ * RMIDs.
+ */
+#define RMID_DEFAULT_QUEUE_TIME 250	/* ms */
+
+static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
+
+/*
+ * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
+ * @nr_available: number of freeable RMIDs on the limbo list
+ *
+ * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
+ * cachelines are tagged with those RMIDs. After this we can reuse them
+ * and know that the current set of active RMIDs is stable.
+ *
+ * Return %true or %false depending on whether stabilization needs to be
+ * reattempted.
+ *
+ * If we return %true then @nr_available is updated to indicate the
+ * number of RMIDs on the limbo list that have been queued for the
+ * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
+ * are above __intel_cqm_threshold.
+ */
+static bool intel_cqm_rmid_stabilize(unsigned int *available)
+{
+	struct cqm_rmid_entry *entry, *tmp;
+	struct perf_event *event;
+
+	lockdep_assert_held(&cache_mutex);
+
+	*available = 0;
+	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+		unsigned long min_queue_time;
+		unsigned long now = jiffies;
+
+		/*
+		 * We hold RMIDs placed into limbo for a minimum queue
+		 * time. Before the minimum queue time has elapsed we do
+		 * not recycle RMIDs.
+		 *
+		 * The reasoning is that until a sufficient time has
+		 * passed since we stopped using an RMID, any RMID
+		 * placed onto the limbo list will likely still have
+		 * data tagged in the cache, which means we'll probably
+		 * fail to recycle it anyway.
+		 *
+		 * We can save ourselves an expensive IPI by skipping
+		 * any RMIDs that have not been queued for the minimum
+		 * time.
+		 */
+		min_queue_time = entry->queue_time +
+			msecs_to_jiffies(__rmid_queue_time_ms);
+
+		if (time_after(min_queue_time, now))
+			break;
+
+		entry->state = RMID_AVAILABLE;
+		(*available)++;
+	}
+
+	/*
+	 * Fast return if none of the RMIDs on the limbo list have been
+	 * sitting on the queue for the minimum queue time.
+	 */
+	if (!*available)
+		return false;
+
+	/*
+	 * Test whether an RMID is free for each package.
+	 */
+	on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
+
+	list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
+		/*
+		 * Exhausted all RMIDs that have waited min queue time.
+		 */
+		if (entry->state == RMID_YOUNG)
+			break;
+
+		if (entry->state == RMID_DIRTY)
+			continue;
+
+		list_del(&entry->list);	/* remove from limbo */
+
+		/*
+		 * The rotation RMID gets priority if it's
+		 * currently invalid. In which case, skip adding
+		 * the RMID to the free lru.
+		 */
+		if (!__rmid_valid(intel_cqm_rotation_rmid)) {
+			intel_cqm_rotation_rmid = entry->rmid;
+			continue;
+		}
+
+		/*
+		 * If we have groups waiting for RMIDs, hand
+		 * them one now.
+		 */
+		list_for_each_entry(event, &cache_groups,
+				    hw.cqm_groups_entry) {
+			if (__rmid_valid(event->hw.cqm_rmid))
+				continue;
+
+			intel_cqm_xchg_rmid(event, entry->rmid);
+			entry = NULL;
+			break;
+		}
+
+		if (!entry)
+			continue;
+
+		/*
+		 * Otherwise place it onto the free list.
+		 */
+		list_add_tail(&entry->list, &cqm_rmid_free_lru);
+	}
+
+
+	return __rmid_valid(intel_cqm_rotation_rmid);
+}
+
+/*
+ * Pick a victim group and move it to the tail of the group list.
+ */
+static struct perf_event *
+__intel_cqm_pick_and_rotate(void)
+{
+	struct perf_event *rotor;
+
+	lockdep_assert_held(&cache_mutex);
+	lockdep_assert_held(&cache_lock);
+
+	rotor = list_first_entry(&cache_groups, struct perf_event,
+				 hw.cqm_groups_entry);
+	list_rotate_left(&cache_groups);
+
+	return rotor;
+}
+
+/*
+ * Attempt to rotate the groups and assign new RMIDs.
+ *
+ * Rotating RMIDs is complicated because the hardware doesn't give us
+ * any clues.
+ *
+ * There are problems with the hardware interface; when you change the
+ * task:RMID map cachelines retain their 'old' tags, giving a skewed
+ * picture. In order to work around this, we must always keep one free
+ * RMID - intel_cqm_rotation_rmid.
+ *
+ * Rotation works by taking away an RMID from a group (the old RMID),
+ * and assigning the free RMID to another group (the new RMID). We must
+ * then wait for the old RMID to not be used (no cachelines tagged).
+ * This ensures that all cachelines are tagged with 'active' RMIDs. At
+ * this point we can start reading values for the new RMID and treat the
+ * old RMID as the free RMID for the next rotation.
+ *
+ * Return %true or %false depending on whether we did any rotating.
+ */
+static bool __intel_cqm_rmid_rotate(void)
+{
+	struct perf_event *group, *rotor, *start = NULL;
+	unsigned int threshold_limit;
+	unsigned int nr_needed = 0;
+	unsigned int nr_available;
+	unsigned int rmid;
+	bool rotated = false;
+
+	mutex_lock(&cache_mutex);
+
+again:
+	/*
+	 * Fast path through this function if there are no groups and no
+	 * RMIDs that need cleaning.
+	 */
+	if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
+		goto out;
+
+	list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
+		if (!__rmid_valid(group->hw.cqm_rmid)) {
+			if (!start)
+				start = group;
+			nr_needed++;
+		}
+	}
+
+	/*
+	 * We have some event groups, but they all have RMIDs assigned
+	 * and no RMIDs need cleaning.
+	 */
+	if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
+		goto out;
+
+	if (!nr_needed)
+		goto stabilize;
+
+	/*
+	 * We have more event groups without RMIDs than available RMIDs.
+	 *
+	 * We force deallocate the rmid of the group at the head of
+	 * cache_groups. The first event group without an RMID then gets
+	 * assigned intel_cqm_rotation_rmid. This ensures we always make
+	 * forward progress.
+	 *
+	 * Rotate the cache_groups list so the previous head is now the
+	 * tail.
+	 */
+	rotor = __intel_cqm_pick_and_rotate();
+	rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
+
+	/*
+	 * The group at the front of the list should always have a valid
+	 * RMID. If it doesn't then no groups have RMIDs assigned.
+	 */
+	if (!__rmid_valid(rmid))
+		goto stabilize;
+
+	/*
+	 * If the rotation is going to succeed, reduce the threshold so
+	 * that we don't needlessly reuse dirty RMIDs.
+	 */
+	if (__rmid_valid(intel_cqm_rotation_rmid)) {
+		intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
+		intel_cqm_rotation_rmid = INVALID_RMID;
+
+		if (__intel_cqm_threshold)
+			__intel_cqm_threshold--;
+	}
+
+	__put_rmid(rmid);
+
+	rotated = true;
+
+stabilize:
+	/*
+	 * We now need to stabilize the RMID we freed above (if any) to
+	 * ensure that the next time we rotate we have an RMID with zero
+	 * occupancy value.
+	 *
+	 * Alternatively, if we didn't need to perform any rotation,
+	 * we'll have a bunch of RMIDs in limbo that need stabilizing.
+	 */
+	threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
+
+	while (intel_cqm_rmid_stabilize(&nr_available) &&
+	       __intel_cqm_threshold < threshold_limit) {
+		unsigned int steal_limit;
+
+		/*
+		 * Don't spin if nobody is actively waiting for an RMID,
+		 * the rotation worker will be kicked as soon as an
+		 * event needs an RMID anyway.
+		 */
+		if (!nr_needed)
+			break;
+
+		/* Allow max 25% of RMIDs to be in limbo. */
+		steal_limit = (cqm_max_rmid + 1) / 4;
+
+		/*
+		 * We failed to stabilize any RMIDs so our rotation
+		 * logic is now stuck. In order to make forward progress
+		 * we have a few options:
+		 *
+		 * 1. rotate ("steal") another RMID
+		 * 2. increase the threshold
+		 * 3. do nothing
+		 *
+		 * We do both of 1. and 2. until we hit the steal limit.
+		 *
+		 * The steal limit prevents all RMIDs ending up on the
+		 * limbo list. This can happen if every RMID has a
+		 * non-zero occupancy above threshold_limit, and the
+		 * occupancy values aren't dropping fast enough.
+		 *
+		 * Note that there is prioritisation at work here - we'd
+		 * rather increase the number of RMIDs on the limbo list
+		 * than increase the threshold, because increasing the
+		 * threshold skews the event data (because we reuse
+		 * dirty RMIDs) - threshold bumps are a last resort.
+		 */
+		if (nr_available < steal_limit)
+			goto again;
+
+		__intel_cqm_threshold++;
+	}
+
+out:
+	mutex_unlock(&cache_mutex);
+	return rotated;
+}
+
+static void intel_cqm_rmid_rotate(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
+
+static struct pmu intel_cqm_pmu;
+
+static void intel_cqm_rmid_rotate(struct work_struct *work)
+{
+	unsigned long delay;
+
+	__intel_cqm_rmid_rotate();
+
+	delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
+	schedule_delayed_work(&intel_cqm_rmid_work, delay);
+}
+
 /*
  * Find a group and setup RMID.
  *
@@ -322,7 +798,6 @@ static int intel_cqm_setup_event(struct perf_event *event,
 				  struct perf_event **group)
 {
 	struct perf_event *iter;
-	int rmid;
 
 	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
 		if (__match_event(iter, event)) {
@@ -336,17 +811,14 @@ static int intel_cqm_setup_event(struct perf_event *event,
 		return -EBUSY;
 	}
 
-	rmid = __get_rmid();
-	if (rmid < 0)
-		return rmid;
-
-	event->hw.cqm_rmid = rmid;
+	event->hw.cqm_rmid = __get_rmid();
 	return 0;
 }
 
 static void intel_cqm_event_read(struct perf_event *event)
 {
-	unsigned long rmid;
+	unsigned long flags;
+	unsigned int rmid;
 	u64 val;
 
 	/*
@@ -355,23 +827,25 @@ static void intel_cqm_event_read(struct perf_event *event)
 	if (event->cpu == -1)
 		return;
 
+	raw_spin_lock_irqsave(&cache_lock, flags);
 	rmid = event->hw.cqm_rmid;
+
+	if (!__rmid_valid(rmid))
+		goto out;
+
 	val = __rmid_read(rmid);
 
 	/*
 	 * Ignore this reading on error states and do not update the value.
 	 */
 	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
-		return;
+		goto out;
 
 	local64_set(&event->count, val);
+out:
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
 }
 
-struct rmid_read {
-	unsigned int rmid;
-	atomic64_t value;
-};
-
 static void __intel_cqm_event_count(void *info)
 {
 	struct rmid_read *rr = info;
@@ -392,8 +866,8 @@ static inline bool cqm_group_leader(struct perf_event *event)
 
 static u64 intel_cqm_event_count(struct perf_event *event)
 {
+	unsigned long flags;
 	struct rmid_read rr = {
-		.rmid = event->hw.cqm_rmid,
 		.value = ATOMIC64_INIT(0),
 	};
 
@@ -417,17 +891,36 @@ static u64 intel_cqm_event_count(struct perf_event *event)
 	if (!cqm_group_leader(event))
 		return 0;
 
-	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+	/*
+	 * Notice that we don't perform the reading of an RMID
+	 * atomically, because we can't hold a spin lock across the
+	 * IPIs.
+	 *
+	 * Speculatively perform the read, since @event might be
+	 * assigned a different (possibly invalid) RMID while we're
+	 * busy performing the IPI calls. It's therefore necessary to
+	 * check @event's RMID afterwards, and if it has changed,
+	 * discard the result of the read.
+	 */
+	rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
 
-	local64_set(&event->count, atomic64_read(&rr.value));
+	if (!__rmid_valid(rr.rmid))
+		goto out;
+
+	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
 
+	raw_spin_lock_irqsave(&cache_lock, flags);
+	if (event->hw.cqm_rmid == rr.rmid)
+		local64_set(&event->count, atomic64_read(&rr.value));
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+out:
 	return __perf_event_count(event);
 }
 
 static void intel_cqm_event_start(struct perf_event *event, int mode)
 {
 	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
-	unsigned long rmid = event->hw.cqm_rmid;
+	unsigned int rmid = event->hw.cqm_rmid;
 	unsigned long flags;
 
 	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
@@ -473,15 +966,19 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode)
 
 static int intel_cqm_event_add(struct perf_event *event, int mode)
 {
-	int rmid;
+	unsigned long flags;
+	unsigned int rmid;
+
+	raw_spin_lock_irqsave(&cache_lock, flags);
 
 	event->hw.cqm_state = PERF_HES_STOPPED;
 	rmid = event->hw.cqm_rmid;
-	WARN_ON_ONCE(!rmid);
 
-	if (mode & PERF_EF_START)
+	if (__rmid_valid(rmid) && (mode & PERF_EF_START))
 		intel_cqm_event_start(event, mode);
 
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+
 	return 0;
 }
 
@@ -518,9 +1015,10 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 		list_replace(&event->hw.cqm_groups_entry,
 			     &group_other->hw.cqm_groups_entry);
 	} else {
-		int rmid = event->hw.cqm_rmid;
+		unsigned int rmid = event->hw.cqm_rmid;
 
-		__put_rmid(rmid);
+		if (__rmid_valid(rmid))
+			__put_rmid(rmid);
 		list_del(&event->hw.cqm_groups_entry);
 	}
 }
@@ -528,11 +1026,10 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 	mutex_unlock(&cache_mutex);
 }
 
-static struct pmu intel_cqm_pmu;
-
 static int intel_cqm_event_init(struct perf_event *event)
 {
 	struct perf_event *group = NULL;
+	bool rotate = false;
 	int err;
 
 	if (event->attr.type != intel_cqm_pmu.type)
@@ -569,10 +1066,24 @@ static int intel_cqm_event_init(struct perf_event *event)
 	} else {
 		list_add_tail(&event->hw.cqm_groups_entry,
 			      &cache_groups);
+
+		/*
+		 * All RMIDs are either in use or have recently been
+		 * used. Kick the rotation worker to clean/free some.
+		 *
+		 * We only do this for the group leader, rather than for
+		 * every event in a group to save on needless work.
+		 */
+		if (!__rmid_valid(event->hw.cqm_rmid))
+			rotate = true;
 	}
 
 out:
 	mutex_unlock(&cache_mutex);
+
+	if (rotate)
+		schedule_delayed_work(&intel_cqm_rmid_work, 0);
+
 	return err;
 }
 
@@ -607,22 +1118,76 @@ static struct attribute_group intel_cqm_format_group = {
 	.attrs = intel_cqm_formats_attr,
 };
 
+static ssize_t
+max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
+			   char *page)
+{
+	ssize_t rv;
+
+	mutex_lock(&cache_mutex);
+	rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
+	mutex_unlock(&cache_mutex);
+
+	return rv;
+}
+
+static ssize_t
+max_recycle_threshold_store(struct device *dev,
+			    struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	unsigned int bytes, cachelines;
+	int ret;
+
+	ret = kstrtouint(buf, 0, &bytes);
+	if (ret)
+		return ret;
+
+	mutex_lock(&cache_mutex);
+
+	__intel_cqm_max_threshold = bytes;
+	cachelines = bytes / cqm_l3_scale;
+
+	/*
+	 * The new maximum takes effect immediately.
+	 */
+	if (__intel_cqm_threshold > cachelines)
+		__intel_cqm_threshold = cachelines;
+
+	mutex_unlock(&cache_mutex);
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(max_recycle_threshold);
+
+static struct attribute *intel_cqm_attrs[] = {
+	&dev_attr_max_recycle_threshold.attr,
+	NULL,
+};
+
+static const struct attribute_group intel_cqm_group = {
+	.attrs = intel_cqm_attrs,
+};
+
 static const struct attribute_group *intel_cqm_attr_groups[] = {
 	&intel_cqm_events_group,
 	&intel_cqm_format_group,
+	&intel_cqm_group,
 	NULL,
 };
 
 static struct pmu intel_cqm_pmu = {
-	.attr_groups	= intel_cqm_attr_groups,
-	.task_ctx_nr	= perf_sw_context,
-	.event_init	= intel_cqm_event_init,
-	.add		= intel_cqm_event_add,
-	.del		= intel_cqm_event_del,
-	.start		= intel_cqm_event_start,
-	.stop		= intel_cqm_event_stop,
-	.read		= intel_cqm_event_read,
-	.count		= intel_cqm_event_count,
+	.hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
+	.attr_groups	     = intel_cqm_attr_groups,
+	.task_ctx_nr	     = perf_sw_context,
+	.event_init	     = intel_cqm_event_init,
+	.add		     = intel_cqm_event_add,
+	.del		     = intel_cqm_event_del,
+	.start		     = intel_cqm_event_start,
+	.stop		     = intel_cqm_event_stop,
+	.read		     = intel_cqm_event_read,
+	.count		     = intel_cqm_event_count,
 };
 
 static inline void cqm_pick_event_reader(int cpu)
@@ -732,6 +1297,16 @@ static int __init intel_cqm_init(void)
 		}
 	}
 
+	/*
+	 * A reasonable upper limit on the max threshold is the number
+	 * of lines tagged per RMID if all RMIDs have the same number of
+	 * lines tagged in the LLC.
+	 *
+	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+	 */
+	__intel_cqm_max_threshold =
+		boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
+
 	snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
 	str = kstrdup(scale, GFP_KERNEL);
 	if (!str) {