Diffstat (limited to 'arch/x86/kernel/cpu/resctrl/pseudo_lock.c')
-rw-r--r-- | arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 1599 |
1 file changed, 1599 insertions, 0 deletions
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
new file mode 100644
index 000000000000..14bed6af8377
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -0,0 +1,1599 @@
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Resource Director Technology (RDT) | ||
4 | * | ||
5 | * Pseudo-locking support built on top of Cache Allocation Technology (CAT) | ||
6 | * | ||
7 | * Copyright (C) 2018 Intel Corporation | ||
8 | * | ||
9 | * Author: Reinette Chatre <reinette.chatre@intel.com> | ||
10 | */ | ||
11 | |||
12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
13 | |||
14 | #include <linux/cacheinfo.h> | ||
15 | #include <linux/cpu.h> | ||
16 | #include <linux/cpumask.h> | ||
17 | #include <linux/debugfs.h> | ||
18 | #include <linux/kthread.h> | ||
19 | #include <linux/mman.h> | ||
20 | #include <linux/perf_event.h> | ||
21 | #include <linux/pm_qos.h> | ||
22 | #include <linux/slab.h> | ||
23 | #include <linux/uaccess.h> | ||
24 | |||
25 | #include <asm/cacheflush.h> | ||
26 | #include <asm/intel-family.h> | ||
27 | #include <asm/resctrl_sched.h> | ||
28 | #include <asm/perf_event.h> | ||
29 | |||
30 | #include "../../events/perf_event.h" /* For X86_CONFIG() */ | ||
31 | #include "internal.h" | ||
32 | |||
33 | #define CREATE_TRACE_POINTS | ||
34 | #include "pseudo_lock_event.h" | ||
35 | |||
36 | /* | ||
37 | * MSR_MISC_FEATURE_CONTROL register enables the modification of hardware | ||
38 | * prefetcher state. Details about this register can be found in the MSR | ||
39 | * tables for specific platforms found in Intel's SDM. | ||
40 | */ | ||
41 | #define MSR_MISC_FEATURE_CONTROL 0x000001a4 | ||
42 | |||
43 | /* | ||
44 | * The bits needed to disable hardware prefetching vary based on the | ||
45 | * platform. During initialization we will discover which bits to use. | ||
46 | */ | ||
47 | static u64 prefetch_disable_bits; | ||
48 | |||
49 | /* | ||
50 | * Major number assigned to and shared by all devices exposing | ||
51 | * pseudo-locked regions. | ||
52 | */ | ||
53 | static unsigned int pseudo_lock_major; | ||
54 | static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); | ||
55 | static struct class *pseudo_lock_class; | ||
56 | |||
57 | /** | ||
58 | * get_prefetch_disable_bits - prefetch disable bits of supported platforms | ||
59 | * | ||
60 | * Capture the list of platforms that have been validated to support | ||
61 | * pseudo-locking. This includes testing to ensure pseudo-locked regions | ||
62 | * with low cache miss rates can be created under a variety of load | ||
63 | * conditions, and that these pseudo-locked regions can maintain their low | ||
64 | * cache miss rates under such loads for significant lengths of time. | ||
65 | * | ||
66 | * After a platform has been validated to support pseudo-locking its | ||
67 | * hardware prefetch disable bits are included here as they are documented | ||
68 | * in the SDM. | ||
69 | * | ||
70 | * When adding a platform here also add support for its cache events to | ||
71 | * measure_cycles_perf_fn() | ||
72 | * | ||
73 | * Return: | ||
74 | * If the platform is supported, the bits to disable hardware prefetchers; | ||
75 | * 0 if the platform is not supported. | ||
76 | */ | ||
77 | static u64 get_prefetch_disable_bits(void) | ||
78 | { | ||
79 | if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || | ||
80 | boot_cpu_data.x86 != 6) | ||
81 | return 0; | ||
82 | |||
83 | switch (boot_cpu_data.x86_model) { | ||
84 | case INTEL_FAM6_BROADWELL_X: | ||
85 | /* | ||
86 | * SDM defines bits of MSR_MISC_FEATURE_CONTROL register | ||
87 | * as: | ||
88 | * 0 L2 Hardware Prefetcher Disable (R/W) | ||
89 | * 1 L2 Adjacent Cache Line Prefetcher Disable (R/W) | ||
90 | * 2 DCU Hardware Prefetcher Disable (R/W) | ||
91 | * 3 DCU IP Prefetcher Disable (R/W) | ||
92 | * 63:4 Reserved | ||
93 | */ | ||
94 | return 0xF; | ||
95 | case INTEL_FAM6_ATOM_GOLDMONT: | ||
96 | case INTEL_FAM6_ATOM_GOLDMONT_PLUS: | ||
97 | /* | ||
98 | * SDM defines bits of MSR_MISC_FEATURE_CONTROL register | ||
99 | * as: | ||
100 | * 0 L2 Hardware Prefetcher Disable (R/W) | ||
101 | * 1 Reserved | ||
102 | * 2 DCU Hardware Prefetcher Disable (R/W) | ||
103 | * 63:3 Reserved | ||
104 | */ | ||
105 | return 0x5; | ||
106 | } | ||
107 | |||
108 | return 0; | ||
109 | } | ||
110 | |||
111 | /** | ||
112 | * pseudo_lock_minor_get - Obtain available minor number | ||
113 | * @minor: Pointer to where new minor number will be stored | ||
114 | * | ||
115 | * A bitmask is used to track available minor numbers. Here the next free | ||
116 | * minor number is marked as unavailable and returned. | ||
117 | * | ||
118 | * Return: 0 on success, <0 on failure. | ||
119 | */ | ||
120 | static int pseudo_lock_minor_get(unsigned int *minor) | ||
121 | { | ||
122 | unsigned long first_bit; | ||
123 | |||
124 | first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); | ||
125 | |||
126 | if (first_bit == MINORBITS) | ||
127 | return -ENOSPC; | ||
128 | |||
129 | __clear_bit(first_bit, &pseudo_lock_minor_avail); | ||
130 | *minor = first_bit; | ||
131 | |||
132 | return 0; | ||
133 | } | ||
134 | |||
135 | /** | ||
136 | * pseudo_lock_minor_release - Return minor number to available | ||
137 | * @minor: The minor number made available | ||
138 | */ | ||
139 | static void pseudo_lock_minor_release(unsigned int minor) | ||
140 | { | ||
141 | __set_bit(minor, &pseudo_lock_minor_avail); | ||
142 | } | ||
143 | |||
144 | /** | ||
145 | * region_find_by_minor - Locate a pseudo-lock region by inode minor number | ||
146 | * @minor: The minor number of the device representing pseudo-locked region | ||
147 | * | ||
148 | * When the character device is accessed we need to determine which | ||
149 | * pseudo-locked region it belongs to. This is done by matching the minor | ||
150 | * number of the device to the pseudo-locked region to which it belongs. | ||
151 | * | ||
152 | * Minor numbers are assigned at the time a pseudo-locked region is associated | ||
153 | * with a cache instance. | ||
154 | * | ||
155 | * Return: On success return pointer to resource group owning the pseudo-locked | ||
156 | * region, NULL on failure. | ||
157 | */ | ||
158 | static struct rdtgroup *region_find_by_minor(unsigned int minor) | ||
159 | { | ||
160 | struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; | ||
161 | |||
162 | list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { | ||
163 | if (rdtgrp->plr && rdtgrp->plr->minor == minor) { | ||
164 | rdtgrp_match = rdtgrp; | ||
165 | break; | ||
166 | } | ||
167 | } | ||
168 | return rdtgrp_match; | ||
169 | } | ||
170 | |||
171 | /** | ||
172 | * pseudo_lock_pm_req - A power management QoS request list entry | ||
173 | * @list: Entry within the @pm_reqs list for a pseudo-locked region | ||
174 | * @req: PM QoS request | ||
175 | */ | ||
176 | struct pseudo_lock_pm_req { | ||
177 | struct list_head list; | ||
178 | struct dev_pm_qos_request req; | ||
179 | }; | ||
180 | |||
181 | static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) | ||
182 | { | ||
183 | struct pseudo_lock_pm_req *pm_req, *next; | ||
184 | |||
185 | list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { | ||
186 | dev_pm_qos_remove_request(&pm_req->req); | ||
187 | list_del(&pm_req->list); | ||
188 | kfree(pm_req); | ||
189 | } | ||
190 | } | ||
191 | |||
192 | /** | ||
193 | * pseudo_lock_cstates_constrain - Restrict cores from entering C6 | ||
194 | * | ||
195 | * To prevent the cache from being affected by power management, entering | ||
196 | * C6 has to be avoided. This is accomplished by requesting a latency | ||
197 | * requirement lower than the lowest C6 exit latency of all supported | ||
198 | * platforms as found in the cpuidle state tables in the intel_idle driver. | ||
199 | * At this time it is possible to do so with a single latency requirement | ||
200 | * for all supported platforms. | ||
201 | * | ||
202 | * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, | ||
203 | * the ACPI latencies need to be considered while keeping in mind that C2 | ||
204 | * may be set to map to deeper sleep states. In this case the latency | ||
205 | * requirement needs to prevent entering C2 also. | ||
206 | */ | ||
207 | static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) | ||
208 | { | ||
209 | struct pseudo_lock_pm_req *pm_req; | ||
210 | int cpu; | ||
211 | int ret; | ||
212 | |||
213 | for_each_cpu(cpu, &plr->d->cpu_mask) { | ||
214 | pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); | ||
215 | if (!pm_req) { | ||
216 | rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); | ||
217 | ret = -ENOMEM; | ||
218 | goto out_err; | ||
219 | } | ||
220 | ret = dev_pm_qos_add_request(get_cpu_device(cpu), | ||
221 | &pm_req->req, | ||
222 | DEV_PM_QOS_RESUME_LATENCY, | ||
223 | 30); | ||
224 | if (ret < 0) { | ||
225 | rdt_last_cmd_printf("Failed to add latency req CPU%d\n", | ||
226 | cpu); | ||
227 | kfree(pm_req); | ||
228 | ret = -1; | ||
229 | goto out_err; | ||
230 | } | ||
231 | list_add(&pm_req->list, &plr->pm_reqs); | ||
232 | } | ||
233 | |||
234 | return 0; | ||
235 | |||
236 | out_err: | ||
237 | pseudo_lock_cstates_relax(plr); | ||
238 | return ret; | ||
239 | } | ||
240 | |||
241 | /** | ||
242 | * pseudo_lock_region_clear - Reset pseudo-lock region data | ||
243 | * @plr: pseudo-lock region | ||
244 | * | ||
245 | * All content of the pseudo-locked region is reset - any memory allocated | ||
246 | * is freed. | ||
247 | * | ||
248 | * Return: void | ||
249 | */ | ||
250 | static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) | ||
251 | { | ||
252 | plr->size = 0; | ||
253 | plr->line_size = 0; | ||
254 | kfree(plr->kmem); | ||
255 | plr->kmem = NULL; | ||
256 | plr->r = NULL; | ||
257 | if (plr->d) | ||
258 | plr->d->plr = NULL; | ||
259 | plr->d = NULL; | ||
260 | plr->cbm = 0; | ||
261 | plr->debugfs_dir = NULL; | ||
262 | } | ||
263 | |||
264 | /** | ||
265 | * pseudo_lock_region_init - Initialize pseudo-lock region information | ||
266 | * @plr: pseudo-lock region | ||
267 | * | ||
268 | * Called after the user provided a schemata to be pseudo-locked. On entry | ||
269 | * the &struct pseudo_lock_region has already been initialized from the | ||
270 | * schemata with the resource, domain, and capacity bitmask. Here the | ||
271 | * information required for pseudo-locking is deduced from this data and the | ||
272 | * &struct pseudo_lock_region is initialized further. This information includes: | ||
273 | * - size in bytes of the region to be pseudo-locked | ||
274 | * - cache line size to know the stride with which data needs to be accessed | ||
275 | * to be pseudo-locked | ||
276 | * - a cpu associated with the cache instance on which the pseudo-locking | ||
277 | * flow can be executed | ||
278 | * | ||
279 | * Return: 0 on success, <0 on failure. Descriptive error will be written | ||
280 | * to last_cmd_status buffer. | ||
281 | */ | ||
282 | static int pseudo_lock_region_init(struct pseudo_lock_region *plr) | ||
283 | { | ||
284 | struct cpu_cacheinfo *ci; | ||
285 | int ret; | ||
286 | int i; | ||
287 | |||
288 | /* Pick the first cpu we find that is associated with the cache. */ | ||
289 | plr->cpu = cpumask_first(&plr->d->cpu_mask); | ||
290 | |||
291 | if (!cpu_online(plr->cpu)) { | ||
292 | rdt_last_cmd_printf("CPU %u associated with cache not online\n", | ||
293 | plr->cpu); | ||
294 | ret = -ENODEV; | ||
295 | goto out_region; | ||
296 | } | ||
297 | |||
298 | ci = get_cpu_cacheinfo(plr->cpu); | ||
299 | |||
300 | plr->size = rdtgroup_cbm_to_size(plr->r, plr->d, plr->cbm); | ||
301 | |||
302 | for (i = 0; i < ci->num_leaves; i++) { | ||
303 | if (ci->info_list[i].level == plr->r->cache_level) { | ||
304 | plr->line_size = ci->info_list[i].coherency_line_size; | ||
305 | return 0; | ||
306 | } | ||
307 | } | ||
308 | |||
309 | ret = -1; | ||
310 | rdt_last_cmd_puts("Unable to determine cache line size\n"); | ||
311 | out_region: | ||
312 | pseudo_lock_region_clear(plr); | ||
313 | return ret; | ||
314 | } | ||
315 | |||
316 | /** | ||
317 | * pseudo_lock_init - Initialize a pseudo-lock region | ||
318 | * @rdtgrp: resource group to which new pseudo-locked region will belong | ||
319 | * | ||
320 | * A pseudo-locked region is associated with a resource group. When this | ||
321 | * association is created the pseudo-locked region is initialized. The | ||
322 | * details of the pseudo-locked region are not known at this time so only | ||
323 | * allocation is done and association established. | ||
324 | * | ||
325 | * Return: 0 on success, <0 on failure | ||
326 | */ | ||
327 | static int pseudo_lock_init(struct rdtgroup *rdtgrp) | ||
328 | { | ||
329 | struct pseudo_lock_region *plr; | ||
330 | |||
331 | plr = kzalloc(sizeof(*plr), GFP_KERNEL); | ||
332 | if (!plr) | ||
333 | return -ENOMEM; | ||
334 | |||
335 | init_waitqueue_head(&plr->lock_thread_wq); | ||
336 | INIT_LIST_HEAD(&plr->pm_reqs); | ||
337 | rdtgrp->plr = plr; | ||
338 | return 0; | ||
339 | } | ||
340 | |||
341 | /** | ||
342 | * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked | ||
343 | * @plr: pseudo-lock region | ||
344 | * | ||
345 | * Initialize the details required to set up the pseudo-locked region and | ||
346 | * allocate the contiguous memory that will be pseudo-locked to the cache. | ||
347 | * | ||
348 | * Return: 0 on success, <0 on failure. Descriptive error will be written | ||
349 | * to last_cmd_status buffer. | ||
350 | */ | ||
351 | static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) | ||
352 | { | ||
353 | int ret; | ||
354 | |||
355 | ret = pseudo_lock_region_init(plr); | ||
356 | if (ret < 0) | ||
357 | return ret; | ||
358 | |||
359 | /* | ||
360 | * We do not yet support contiguous regions larger than | ||
361 | * KMALLOC_MAX_SIZE. | ||
362 | */ | ||
363 | if (plr->size > KMALLOC_MAX_SIZE) { | ||
364 | rdt_last_cmd_puts("Requested region exceeds maximum size\n"); | ||
365 | ret = -E2BIG; | ||
366 | goto out_region; | ||
367 | } | ||
368 | |||
369 | plr->kmem = kzalloc(plr->size, GFP_KERNEL); | ||
370 | if (!plr->kmem) { | ||
371 | rdt_last_cmd_puts("Unable to allocate memory\n"); | ||
372 | ret = -ENOMEM; | ||
373 | goto out_region; | ||
374 | } | ||
375 | |||
376 | ret = 0; | ||
377 | goto out; | ||
378 | out_region: | ||
379 | pseudo_lock_region_clear(plr); | ||
380 | out: | ||
381 | return ret; | ||
382 | } | ||
383 | |||
384 | /** | ||
385 | * pseudo_lock_free - Free a pseudo-locked region | ||
386 | * @rdtgrp: resource group to which pseudo-locked region belonged | ||
387 | * | ||
388 | * The pseudo-locked region's resources have already been released, or not | ||
389 | * yet created at this point. Now it can be freed and disassociated from the | ||
390 | * resource group. | ||
391 | * | ||
392 | * Return: void | ||
393 | */ | ||
394 | static void pseudo_lock_free(struct rdtgroup *rdtgrp) | ||
395 | { | ||
396 | pseudo_lock_region_clear(rdtgrp->plr); | ||
397 | kfree(rdtgrp->plr); | ||
398 | rdtgrp->plr = NULL; | ||
399 | } | ||
400 | |||
401 | /** | ||
402 | * pseudo_lock_fn - Load kernel memory into cache | ||
403 | * @_rdtgrp: resource group to which pseudo-lock region belongs | ||
404 | * | ||
405 | * This is the core pseudo-locking flow. | ||
406 | * | ||
407 | * First we ensure that the kernel memory cannot be found in the cache. | ||
408 | * Then, while taking care that there will be as little interference as | ||
409 | * possible, the memory to be loaded is accessed while the core is running | ||
410 | * with class of service set to the bitmask of the pseudo-locked region. | ||
411 | * After this is complete no future CAT allocations will be allowed to | ||
412 | * overlap with this bitmask. | ||
413 | * | ||
414 | * Local register variables are utilized to ensure that the memory region | ||
415 | * to be locked is the only memory access made during the critical locking | ||
416 | * loop. | ||
417 | * | ||
418 | * Return: 0. Waiter on waitqueue will be woken on completion. | ||
419 | */ | ||
420 | static int pseudo_lock_fn(void *_rdtgrp) | ||
421 | { | ||
422 | struct rdtgroup *rdtgrp = _rdtgrp; | ||
423 | struct pseudo_lock_region *plr = rdtgrp->plr; | ||
424 | u32 rmid_p, closid_p; | ||
425 | unsigned long i; | ||
426 | #ifdef CONFIG_KASAN | ||
427 | /* | ||
428 | * The registers used for local register variables are also used | ||
429 | * when KASAN is active. When KASAN is active we use a regular | ||
430 | * variable to ensure we always use a valid pointer, but the cost | ||
431 | * is that this variable will enter the cache through evicting the | ||
432 | * memory we are trying to lock into the cache. Thus expect lower | ||
433 | * pseudo-locking success rate when KASAN is active. | ||
434 | */ | ||
435 | unsigned int line_size; | ||
436 | unsigned int size; | ||
437 | void *mem_r; | ||
438 | #else | ||
439 | register unsigned int line_size asm("esi"); | ||
440 | register unsigned int size asm("edi"); | ||
441 | #ifdef CONFIG_X86_64 | ||
442 | register void *mem_r asm("rbx"); | ||
443 | #else | ||
444 | register void *mem_r asm("ebx"); | ||
445 | #endif /* CONFIG_X86_64 */ | ||
446 | #endif /* CONFIG_KASAN */ | ||
447 | |||
448 | /* | ||
449 | * Make sure none of the allocated memory is cached. If it is we | ||
450 | * will get a cache hit in the loop below from outside of the | ||
451 | * pseudo-locked region. | ||
452 | * wbinvd (as opposed to clflush/clflushopt) is required to | ||
453 | * increase the likelihood that the allocated cache portion will be | ||
454 | * filled with the associated memory. | ||
455 | */ | ||
456 | native_wbinvd(); | ||
457 | |||
458 | /* | ||
459 | * Always called with interrupts enabled. By disabling interrupts | ||
460 | * we ensure that we will not be preempted during this critical section. | ||
461 | */ | ||
462 | local_irq_disable(); | ||
463 | |||
464 | /* | ||
465 | * Call wrmsr and rdmsr as directly as possible to avoid tracing | ||
466 | * clobbering local register variables or affecting cache accesses. | ||
467 | * | ||
468 | * Disable the hardware prefetcher so that when the end of the memory | ||
469 | * being pseudo-locked is reached the hardware will not read beyond | ||
470 | * the buffer and evict pseudo-locked memory read earlier from the | ||
471 | * cache. | ||
472 | */ | ||
473 | __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); | ||
474 | closid_p = this_cpu_read(pqr_state.cur_closid); | ||
475 | rmid_p = this_cpu_read(pqr_state.cur_rmid); | ||
476 | mem_r = plr->kmem; | ||
477 | size = plr->size; | ||
478 | line_size = plr->line_size; | ||
479 | /* | ||
480 | * Critical section begin: start by writing the closid associated | ||
481 | * with the capacity bitmask of the cache region being | ||
482 | * pseudo-locked followed by reading of kernel memory to load it | ||
483 | * into the cache. | ||
484 | */ | ||
485 | __wrmsr(IA32_PQR_ASSOC, rmid_p, rdtgrp->closid); | ||
486 | /* | ||
487 | * Cache was flushed earlier. Now access kernel memory to read it | ||
488 | * into the cache region associated with the just activated rdtgrp->closid. | ||
489 | * Loop over data twice: | ||
490 | * - In the first loop the cache region is shared with the page walker | ||
491 | * as it populates the paging structure caches (including TLB). | ||
492 | * - In the second loop the paging structure caches are used and | ||
493 | * cache region is populated with the memory being referenced. | ||
494 | */ | ||
495 | for (i = 0; i < size; i += PAGE_SIZE) { | ||
496 | /* | ||
497 | * Add a barrier to prevent speculative execution of this | ||
498 | * loop reading beyond the end of the buffer. | ||
499 | */ | ||
500 | rmb(); | ||
501 | asm volatile("mov (%0,%1,1), %%eax\n\t" | ||
502 | : | ||
503 | : "r" (mem_r), "r" (i) | ||
504 | : "%eax", "memory"); | ||
505 | } | ||
506 | for (i = 0; i < size; i += line_size) { | ||
507 | /* | ||
508 | * Add a barrier to prevent speculative execution of this | ||
509 | * loop reading beyond the end of the buffer. | ||
510 | */ | ||
511 | rmb(); | ||
512 | asm volatile("mov (%0,%1,1), %%eax\n\t" | ||
513 | : | ||
514 | : "r" (mem_r), "r" (i) | ||
515 | : "%eax", "memory"); | ||
516 | } | ||
517 | /* | ||
518 | * Critical section end: restore closid with capacity bitmask that | ||
519 | * does not overlap with pseudo-locked region. | ||
520 | */ | ||
521 | __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p); | ||
522 | |||
523 | /* Re-enable the hardware prefetcher(s) */ | ||
524 | wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); | ||
525 | local_irq_enable(); | ||
526 | |||
527 | plr->thread_done = 1; | ||
528 | wake_up_interruptible(&plr->lock_thread_wq); | ||
529 | return 0; | ||
530 | } | ||
531 | |||
532 | /** | ||
533 | * rdtgroup_monitor_in_progress - Test if monitoring in progress | ||
534 | * @rdtgrp: resource group being queried | ||
535 | * | ||
536 | * Return: 1 if monitor groups have been created for this resource | ||
537 | * group, 0 otherwise. | ||
538 | */ | ||
539 | static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) | ||
540 | { | ||
541 | return !list_empty(&rdtgrp->mon.crdtgrp_list); | ||
542 | } | ||
543 | |||
544 | /** | ||
545 | * rdtgroup_locksetup_user_restrict - Restrict user access to group | ||
546 | * @rdtgrp: resource group needing access restricted | ||
547 | * | ||
548 | * A resource group used for cache pseudo-locking cannot have cpus or tasks | ||
549 | * assigned to it. This is communicated to the user by restricting access | ||
550 | * to all the files that can be used to make such changes. | ||
551 | * | ||
552 | * Permissions restored with rdtgroup_locksetup_user_restore() | ||
553 | * | ||
554 | * Return: 0 on success, <0 on failure. If a failure occurs during the | ||
555 | * restriction of access an attempt will be made to restore permissions, | ||
556 | * but in that case the resulting mode of these files will be | ||
557 | * uncertain. | ||
558 | */ | ||
559 | static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) | ||
560 | { | ||
561 | int ret; | ||
562 | |||
563 | ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); | ||
564 | if (ret) | ||
565 | return ret; | ||
566 | |||
567 | ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); | ||
568 | if (ret) | ||
569 | goto err_tasks; | ||
570 | |||
571 | ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); | ||
572 | if (ret) | ||
573 | goto err_cpus; | ||
574 | |||
575 | if (rdt_mon_capable) { | ||
576 | ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); | ||
577 | if (ret) | ||
578 | goto err_cpus_list; | ||
579 | } | ||
580 | |||
581 | ret = 0; | ||
582 | goto out; | ||
583 | |||
584 | err_cpus_list: | ||
585 | rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); | ||
586 | err_cpus: | ||
587 | rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); | ||
588 | err_tasks: | ||
589 | rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); | ||
590 | out: | ||
591 | return ret; | ||
592 | } | ||
593 | |||
594 | /** | ||
595 | * rdtgroup_locksetup_user_restore - Restore user access to group | ||
596 | * @rdtgrp: resource group needing access restored | ||
597 | * | ||
598 | * Restore all file access previously removed using | ||
599 | * rdtgroup_locksetup_user_restrict() | ||
600 | * | ||
601 | * Return: 0 on success, <0 on failure. If a failure occurs during the | ||
602 | * restoration of access an attempt will be made to restrict permissions | ||
603 | * again, but in that case the resulting mode of these files will be | ||
604 | * uncertain. | ||
605 | */ | ||
606 | static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) | ||
607 | { | ||
608 | int ret; | ||
609 | |||
610 | ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); | ||
611 | if (ret) | ||
612 | return ret; | ||
613 | |||
614 | ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); | ||
615 | if (ret) | ||
616 | goto err_tasks; | ||
617 | |||
618 | ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); | ||
619 | if (ret) | ||
620 | goto err_cpus; | ||
621 | |||
622 | if (rdt_mon_capable) { | ||
623 | ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); | ||
624 | if (ret) | ||
625 | goto err_cpus_list; | ||
626 | } | ||
627 | |||
628 | ret = 0; | ||
629 | goto out; | ||
630 | |||
631 | err_cpus_list: | ||
632 | rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); | ||
633 | err_cpus: | ||
634 | rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); | ||
635 | err_tasks: | ||
636 | rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); | ||
637 | out: | ||
638 | return ret; | ||
639 | } | ||
640 | |||
641 | /** | ||
642 | * rdtgroup_locksetup_enter - Resource group enters locksetup mode | ||
643 | * @rdtgrp: resource group requested to enter locksetup mode | ||
644 | * | ||
645 | * A resource group enters locksetup mode to reflect that it would be used | ||
646 | * to represent a pseudo-locked region and is in the process of being set | ||
647 | * up to do so. A resource group used for a pseudo-locked region would | ||
648 | * lose the closid associated with it so we cannot allow it to have any | ||
649 | * tasks or cpus assigned nor permit tasks or cpus to be assigned in the | ||
650 | * future. Monitoring of a pseudo-locked region is not allowed either. | ||
651 | * | ||
652 | * The above and more restrictions on a pseudo-locked region are checked | ||
653 | * for and enforced before the resource group enters the locksetup mode. | ||
654 | * | ||
655 | * Returns: 0 if the resource group successfully entered locksetup mode, <0 | ||
656 | * on failure. On failure the last_cmd_status buffer is updated with text to | ||
657 | * communicate details of failure to the user. | ||
658 | */ | ||
659 | int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) | ||
660 | { | ||
661 | int ret; | ||
662 | |||
663 | /* | ||
664 | * The default resource group can neither be removed nor lose the | ||
665 | * default closid associated with it. | ||
666 | */ | ||
667 | if (rdtgrp == &rdtgroup_default) { | ||
668 | rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); | ||
669 | return -EINVAL; | ||
670 | } | ||
671 | |||
672 | /* | ||
673 | * Cache Pseudo-locking not supported when CDP is enabled. | ||
674 | * | ||
675 | * Some things to consider if you would like to enable this | ||
676 | * support (using L3 CDP as example): | ||
677 | * - When CDP is enabled two separate resources are exposed, | ||
678 | * L3DATA and L3CODE, but they are actually on the same cache. | ||
679 | * The implication for pseudo-locking is that if a | ||
680 | * pseudo-locked region is created on a domain of one | ||
681 | * resource (eg. L3CODE), then a pseudo-locked region cannot | ||
682 | * be created on that same domain of the other resource | ||
683 | * (eg. L3DATA). This is because the creation of a | ||
684 | * pseudo-locked region involves a call to wbinvd that will | ||
685 | * affect all cache allocations on the particular domain. | ||
686 | * - Considering the previous, it may be possible to only | ||
687 | * expose one of the CDP resources to pseudo-locking and | ||
688 | * hide the other. For example, we could consider only | ||
689 | * exposing L3DATA and, since the L3 cache is unified, it is | ||
690 | * still possible to place instructions there and execute them. | ||
691 | * - If only one region is exposed to pseudo-locking we should | ||
692 | * still keep in mind that availability of a portion of cache | ||
693 | * for pseudo-locking should take into account both resources. | ||
694 | * Similarly, if a pseudo-locked region is created in one | ||
695 | * resource, the portion of cache used by it should be made | ||
696 | * unavailable to all future allocations from both resources. | ||
697 | */ | ||
698 | if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled || | ||
699 | rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) { | ||
700 | rdt_last_cmd_puts("CDP enabled\n"); | ||
701 | return -EINVAL; | ||
702 | } | ||
703 | |||
704 | /* | ||
705 | * Not knowing the bits to disable prefetching implies that this | ||
706 | * platform does not support Cache Pseudo-Locking. | ||
707 | */ | ||
708 | prefetch_disable_bits = get_prefetch_disable_bits(); | ||
709 | if (prefetch_disable_bits == 0) { | ||
710 | rdt_last_cmd_puts("Pseudo-locking not supported\n"); | ||
711 | return -EINVAL; | ||
712 | } | ||
713 | |||
714 | if (rdtgroup_monitor_in_progress(rdtgrp)) { | ||
715 | rdt_last_cmd_puts("Monitoring in progress\n"); | ||
716 | return -EINVAL; | ||
717 | } | ||
718 | |||
719 | if (rdtgroup_tasks_assigned(rdtgrp)) { | ||
720 | rdt_last_cmd_puts("Tasks assigned to resource group\n"); | ||
721 | return -EINVAL; | ||
722 | } | ||
723 | |||
724 | if (!cpumask_empty(&rdtgrp->cpu_mask)) { | ||
725 | rdt_last_cmd_puts("CPUs assigned to resource group\n"); | ||
726 | return -EINVAL; | ||
727 | } | ||
728 | |||
729 | if (rdtgroup_locksetup_user_restrict(rdtgrp)) { | ||
730 | rdt_last_cmd_puts("Unable to modify resctrl permissions\n"); | ||
731 | return -EIO; | ||
732 | } | ||
733 | |||
734 | ret = pseudo_lock_init(rdtgrp); | ||
735 | if (ret) { | ||
736 | rdt_last_cmd_puts("Unable to init pseudo-lock region\n"); | ||
737 | goto out_release; | ||
738 | } | ||
739 | |||
740 | /* | ||
741 | * If this system is capable of monitoring, an RMID would have been | ||
742 | * allocated when the control group was created. It is not needed | ||
743 | * anymore once this group is used for pseudo-locking. free_rmid() | ||
744 | * is safe to call on platforms not capable of monitoring. | ||
745 | */ | ||
746 | free_rmid(rdtgrp->mon.rmid); | ||
747 | |||
748 | ret = 0; | ||
749 | goto out; | ||
750 | |||
751 | out_release: | ||
752 | rdtgroup_locksetup_user_restore(rdtgrp); | ||
753 | out: | ||
754 | return ret; | ||
755 | } | ||
756 | |||
757 | /** | ||
758 | * rdtgroup_locksetup_exit - Resource group exits locksetup mode | ||
759 | * @rdtgrp: resource group | ||
760 | * | ||
761 | * When a resource group exits locksetup mode the earlier restrictions are | ||
762 | * lifted. | ||
763 | * | ||
764 | * Return: 0 on success, <0 on failure | ||
765 | */ | ||
766 | int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) | ||
767 | { | ||
768 | int ret; | ||
769 | |||
770 | if (rdt_mon_capable) { | ||
771 | ret = alloc_rmid(); | ||
772 | if (ret < 0) { | ||
773 | rdt_last_cmd_puts("Out of RMIDs\n"); | ||
774 | return ret; | ||
775 | } | ||
776 | rdtgrp->mon.rmid = ret; | ||
777 | } | ||
778 | |||
779 | ret = rdtgroup_locksetup_user_restore(rdtgrp); | ||
780 | if (ret) { | ||
781 | free_rmid(rdtgrp->mon.rmid); | ||
782 | return ret; | ||
783 | } | ||
784 | |||
785 | pseudo_lock_free(rdtgrp); | ||
786 | return 0; | ||
787 | } | ||
788 | |||
789 | /** | ||
790 | * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked | ||
791 | * @d: RDT domain | ||
792 | * @cbm: CBM to test | ||
793 | * | ||
794 | * @d represents a cache instance and @cbm a capacity bitmask that is | ||
795 | * considered for it. Determine if @cbm overlaps with any existing | ||
796 | * pseudo-locked region on @d. | ||
797 | * | ||
798 | * @cbm is unsigned long, even if only 32 bits are used, to make the | ||
799 | * bitmap functions work correctly. | ||
800 | * | ||
801 | * Return: true if @cbm overlaps with pseudo-locked region on @d, false | ||
802 | * otherwise. | ||
803 | */ | ||
804 | bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm) | ||
805 | { | ||
806 | unsigned int cbm_len; | ||
807 | unsigned long cbm_b; | ||
808 | |||
809 | if (d->plr) { | ||
810 | cbm_len = d->plr->r->cache.cbm_len; | ||
811 | cbm_b = d->plr->cbm; | ||
812 | if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) | ||
813 | return true; | ||
814 | } | ||
815 | return false; | ||
816 | } | ||
817 | |||
818 | /** | ||
819 | * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy | ||
820 | * @d: RDT domain under test | ||
821 | * | ||
822 | * The setup of a pseudo-locked region affects all cache instances within | ||
823 | * the hierarchy of the region. It is thus essential to know if any | ||
824 | * pseudo-locked regions exist within a cache hierarchy to prevent any | ||
825 | * attempts to create new pseudo-locked regions in the same hierarchy. | ||
826 | * | ||
827 | * Return: true if a pseudo-locked region exists in the hierarchy of @d or | ||
828 | * if it is not possible to test due to a memory allocation issue, | ||
829 | * false otherwise. | ||
830 | */ | ||
831 | bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) | ||
832 | { | ||
833 | cpumask_var_t cpu_with_psl; | ||
834 | struct rdt_resource *r; | ||
835 | struct rdt_domain *d_i; | ||
836 | bool ret = false; | ||
837 | |||
838 | if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) | ||
839 | return true; | ||
840 | |||
841 | /* | ||
842 | * First determine which cpus have pseudo-locked regions | ||
843 | * associated with them. | ||
844 | */ | ||
845 | for_each_alloc_enabled_rdt_resource(r) { | ||
846 | list_for_each_entry(d_i, &r->domains, list) { | ||
847 | if (d_i->plr) | ||
848 | cpumask_or(cpu_with_psl, cpu_with_psl, | ||
849 | &d_i->cpu_mask); | ||
850 | } | ||
851 | } | ||
852 | |||
853 | /* | ||
854 | * Next test if new pseudo-locked region would intersect with | ||
855 | * existing region. | ||
856 | */ | ||
857 | if (cpumask_intersects(&d->cpu_mask, cpu_with_psl)) | ||
858 | ret = true; | ||
859 | |||
860 | free_cpumask_var(cpu_with_psl); | ||
861 | return ret; | ||
862 | } | ||
863 | |||
864 | /** | ||
865 | * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory | ||
866 | * @_plr: pseudo-lock region to measure | ||
867 | * | ||
868 | * There is no deterministic way to test if a memory region is cached. One | ||
869 | * way is to measure how long it takes to read the memory; the speed of | ||
870 | * access is a good way to learn how close to the cpu the data was. Even | ||
871 | * more, if the prefetcher is disabled and the memory is read at a stride | ||
872 | * of half the cache line, then a cache miss will be easy to spot since the | ||
873 | * read of the first half would be significantly slower than the read of | ||
874 | * the second half. | ||
875 | * | ||
876 | * Return: 0. Waiter on waitqueue will be woken on completion. | ||
877 | */ | ||
878 | static int measure_cycles_lat_fn(void *_plr) | ||
879 | { | ||
880 | struct pseudo_lock_region *plr = _plr; | ||
881 | unsigned long i; | ||
882 | u64 start, end; | ||
883 | void *mem_r; | ||
884 | |||
885 | local_irq_disable(); | ||
886 | /* | ||
887 | * Disable hardware prefetchers. | ||
888 | */ | ||
889 | wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); | ||
890 | mem_r = READ_ONCE(plr->kmem); | ||
891 | /* | ||
892 | * Dummy execute of the time measurement to load the needed | ||
893 | * instructions into the L1 instruction cache. | ||
894 | */ | ||
895 | start = rdtsc_ordered(); | ||
896 | for (i = 0; i < plr->size; i += 32) { | ||
897 | start = rdtsc_ordered(); | ||
898 | asm volatile("mov (%0,%1,1), %%eax\n\t" | ||
899 | : | ||
900 | : "r" (mem_r), "r" (i) | ||
901 | : "%eax", "memory"); | ||
902 | end = rdtsc_ordered(); | ||
903 | trace_pseudo_lock_mem_latency((u32)(end - start)); | ||
904 | } | ||
905 | wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); | ||
906 | local_irq_enable(); | ||
907 | plr->thread_done = 1; | ||
908 | wake_up_interruptible(&plr->lock_thread_wq); | ||
909 | return 0; | ||
910 | } | ||
911 | |||
912 | /* | ||
913 | * Create a perf_event_attr for the hit and miss perf events that will | ||
914 | * be used during the performance measurement. A perf_event maintains | ||
915 | * a pointer to its perf_event_attr so a unique attribute structure is | ||
916 | * created for each perf_event. | ||
917 | * | ||
918 | * The actual configuration of the event is set right before use in order | ||
919 | * to use the X86_CONFIG macro. | ||
920 | */ | ||
921 | static struct perf_event_attr perf_miss_attr = { | ||
922 | .type = PERF_TYPE_RAW, | ||
923 | .size = sizeof(struct perf_event_attr), | ||
924 | .pinned = 1, | ||
925 | .disabled = 0, | ||
926 | .exclude_user = 1, | ||
927 | }; | ||
928 | |||
929 | static struct perf_event_attr perf_hit_attr = { | ||
930 | .type = PERF_TYPE_RAW, | ||
931 | .size = sizeof(struct perf_event_attr), | ||
932 | .pinned = 1, | ||
933 | .disabled = 0, | ||
934 | .exclude_user = 1, | ||
935 | }; | ||
936 | |||
937 | struct residency_counts { | ||
938 | u64 miss_before, hits_before; | ||
939 | u64 miss_after, hits_after; | ||
940 | }; | ||
941 | |||
942 | static int measure_residency_fn(struct perf_event_attr *miss_attr, | ||
943 | struct perf_event_attr *hit_attr, | ||
944 | struct pseudo_lock_region *plr, | ||
945 | struct residency_counts *counts) | ||
946 | { | ||
947 | u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0; | ||
948 | struct perf_event *miss_event, *hit_event; | ||
949 | int hit_pmcnum, miss_pmcnum; | ||
950 | unsigned int line_size; | ||
951 | unsigned int size; | ||
952 | unsigned long i; | ||
953 | void *mem_r; | ||
954 | u64 tmp; | ||
955 | |||
956 | miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu, | ||
957 | NULL, NULL, NULL); | ||
958 | if (IS_ERR(miss_event)) | ||
959 | goto out; | ||
960 | |||
961 | hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu, | ||
962 | NULL, NULL, NULL); | ||
963 | if (IS_ERR(hit_event)) | ||
964 | goto out_miss; | ||
965 | |||
966 | local_irq_disable(); | ||
967 | /* | ||
968 | * Check for any possible error state of the events used by | ||
969 | * performing one local read. | ||
970 | */ | ||
971 | if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) { | ||
972 | local_irq_enable(); | ||
973 | goto out_hit; | ||
974 | } | ||
975 | if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) { | ||
976 | local_irq_enable(); | ||
977 | goto out_hit; | ||
978 | } | ||
979 | |||
980 | /* | ||
981 | * Disable hardware prefetchers. | ||
982 | */ | ||
983 | wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); | ||
984 | |||
985 | /* Initialize rest of local variables */ | ||
986 | /* | ||
987 | * Performance event has been validated right before this with | ||
988 | * interrupts disabled - it is thus safe to read the counter index. | ||
989 | */ | ||
990 | miss_pmcnum = x86_perf_rdpmc_index(miss_event); | ||
991 | hit_pmcnum = x86_perf_rdpmc_index(hit_event); | ||
992 | line_size = READ_ONCE(plr->line_size); | ||
993 | mem_r = READ_ONCE(plr->kmem); | ||
994 | size = READ_ONCE(plr->size); | ||
995 | |||
996 | /* | ||
997 | * Read counter variables twice - first to load the instructions | ||
998 | * used in L1 cache, second to capture an accurate value that does not | ||
999 | * include cache misses incurred because of instruction loads. | ||
1000 | */ | ||
1001 | rdpmcl(hit_pmcnum, hits_before); | ||
1002 | rdpmcl(miss_pmcnum, miss_before); | ||
1003 | /* | ||
1004 | * From SDM: Performing back-to-back fast reads is not guaranteed | ||
1005 | * to be monotonic. | ||
1006 | * Use LFENCE to ensure all previous instructions are retired | ||
1007 | * before proceeding. | ||
1008 | */ | ||
1009 | rmb(); | ||
1010 | rdpmcl(hit_pmcnum, hits_before); | ||
1011 | rdpmcl(miss_pmcnum, miss_before); | ||
1012 | /* | ||
1013 | * Use LFENCE to ensure all previous instructions are retired | ||
1014 | * before proceeding. | ||
1015 | */ | ||
1016 | rmb(); | ||
1017 | for (i = 0; i < size; i += line_size) { | ||
1018 | /* | ||
1019 | * Add a barrier to prevent speculative execution of this | ||
1020 | * loop reading beyond the end of the buffer. | ||
1021 | */ | ||
1022 | rmb(); | ||
1023 | asm volatile("mov (%0,%1,1), %%eax\n\t" | ||
1024 | : | ||
1025 | : "r" (mem_r), "r" (i) | ||
1026 | : "%eax", "memory"); | ||
1027 | } | ||
1028 | /* | ||
1029 | * Use LFENCE to ensure all previous instructions are retired | ||
1030 | * before proceeding. | ||
1031 | */ | ||
1032 | rmb(); | ||
1033 | rdpmcl(hit_pmcnum, hits_after); | ||
1034 | rdpmcl(miss_pmcnum, miss_after); | ||
1035 | /* | ||
1036 | * Use LFENCE to ensure all previous instructions are retired | ||
1037 | * before proceeding. | ||
1038 | */ | ||
1039 | rmb(); | ||
1040 | /* Re-enable hardware prefetchers */ | ||
1041 | wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); | ||
1042 | local_irq_enable(); | ||
1043 | out_hit: | ||
1044 | perf_event_release_kernel(hit_event); | ||
1045 | out_miss: | ||
1046 | perf_event_release_kernel(miss_event); | ||
1047 | out: | ||
1048 | /* | ||
1049 | * All counts will be zero on failure. | ||
1050 | */ | ||
1051 | counts->miss_before = miss_before; | ||
1052 | counts->hits_before = hits_before; | ||
1053 | counts->miss_after = miss_after; | ||
1054 | counts->hits_after = hits_after; | ||
1055 | return 0; | ||
1056 | } | ||
1057 | |||
1058 | static int measure_l2_residency(void *_plr) | ||
1059 | { | ||
1060 | struct pseudo_lock_region *plr = _plr; | ||
1061 | struct residency_counts counts = {0}; | ||
1062 | |||
1063 | /* | ||
1064 | * Non-architectural event for the Goldmont Microarchitecture | ||
1065 | * from Intel x86 Architecture Software Developer Manual (SDM): | ||
1066 | * MEM_LOAD_UOPS_RETIRED D1H (event number) | ||
1067 | * Umask values: | ||
1068 | * L2_HIT 02H | ||
1069 | * L2_MISS 10H | ||
1070 | */ | ||
1071 | switch (boot_cpu_data.x86_model) { | ||
1072 | case INTEL_FAM6_ATOM_GOLDMONT: | ||
1073 | case INTEL_FAM6_ATOM_GOLDMONT_PLUS: | ||
1074 | perf_miss_attr.config = X86_CONFIG(.event = 0xd1, | ||
1075 | .umask = 0x10); | ||
1076 | perf_hit_attr.config = X86_CONFIG(.event = 0xd1, | ||
1077 | .umask = 0x2); | ||
1078 | break; | ||
1079 | default: | ||
1080 | goto out; | ||
1081 | } | ||
1082 | |||
1083 | measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts); | ||
1084 | /* | ||
1085 | * If a failure prevented the measurements from succeeding, | ||
1086 | * tracepoints will still be written and all counts will be zero. | ||
1087 | */ | ||
1088 | trace_pseudo_lock_l2(counts.hits_after - counts.hits_before, | ||
1089 | counts.miss_after - counts.miss_before); | ||
1090 | out: | ||
1091 | plr->thread_done = 1; | ||
1092 | wake_up_interruptible(&plr->lock_thread_wq); | ||
1093 | return 0; | ||
1094 | } | ||
1095 | |||
1096 | static int measure_l3_residency(void *_plr) | ||
1097 | { | ||
1098 | struct pseudo_lock_region *plr = _plr; | ||
1099 | struct residency_counts counts = {0}; | ||
1100 | |||
1101 | /* | ||
1102 | * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event | ||
1103 | * has two "no fix" errata associated with it: BDM35 and BDM100. On | ||
1104 | * this platform the following events are used instead: | ||
1105 | * LONGEST_LAT_CACHE 2EH (Documented in SDM) | ||
1106 | * REFERENCE 4FH | ||
1107 | * MISS 41H | ||
1108 | */ | ||
1109 | |||
1110 | switch (boot_cpu_data.x86_model) { | ||
1111 | case INTEL_FAM6_BROADWELL_X: | ||
1112 | /* On BDW the hit event counts references, not hits */ | ||
1113 | perf_hit_attr.config = X86_CONFIG(.event = 0x2e, | ||
1114 | .umask = 0x4f); | ||
1115 | perf_miss_attr.config = X86_CONFIG(.event = 0x2e, | ||
1116 | .umask = 0x41); | ||
1117 | break; | ||
1118 | default: | ||
1119 | goto out; | ||
1120 | } | ||
1121 | |||
1122 | measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts); | ||
1123 | /* | ||
1124 | * If a failure prevented the measurements from succeeding, | ||
1125 | * tracepoints will still be written and all counts will be zero. | ||
1126 | */ | ||
1127 | |||
1128 | counts.miss_after -= counts.miss_before; | ||
1129 | if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) { | ||
1130 | /* | ||
1131 | * On BDW references and misses are counted, need to adjust. | ||
1132 | * Sometimes the "hits" counter is a bit more than the | ||
1133 | * references, for example, x references but x + 1 hits. | ||
1134 | * To not report invalid hit values in this case we treat | ||
1135 | * that as misses equal to references. | ||
1136 | */ | ||
1137 | /* First compute the number of cache references measured */ | ||
1138 | counts.hits_after -= counts.hits_before; | ||
1139 | /* Next convert references to cache hits */ | ||
1140 | counts.hits_after -= min(counts.miss_after, counts.hits_after); | ||
1141 | } else { | ||
1142 | counts.hits_after -= counts.hits_before; | ||
1143 | } | ||
1144 | |||
1145 | trace_pseudo_lock_l3(counts.hits_after, counts.miss_after); | ||
1146 | out: | ||
1147 | plr->thread_done = 1; | ||
1148 | wake_up_interruptible(&plr->lock_thread_wq); | ||
1149 | return 0; | ||
1150 | } | ||
1151 | |||
1152 | /** | ||
1153 | * pseudo_lock_measure_cycles - Trigger latency measurement of a pseudo-locked region | ||
1154 | * | ||
1155 | * The measurement of latency to access a pseudo-locked region should be | ||
1156 | * done from a cpu that is associated with that pseudo-locked region. | ||
1157 | * Determine which cpu is associated with this region and start a thread on | ||
1158 | * that cpu to perform the measurement, then wait for that thread to complete. | ||
1159 | * | ||
1160 | * Return: 0 on success, <0 on failure | ||
1161 | */ | ||
1162 | static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) | ||
1163 | { | ||
1164 | struct pseudo_lock_region *plr = rdtgrp->plr; | ||
1165 | struct task_struct *thread; | ||
1166 | unsigned int cpu; | ||
1167 | int ret = -1; | ||
1168 | |||
1169 | cpus_read_lock(); | ||
1170 | mutex_lock(&rdtgroup_mutex); | ||
1171 | |||
1172 | if (rdtgrp->flags & RDT_DELETED) { | ||
1173 | ret = -ENODEV; | ||
1174 | goto out; | ||
1175 | } | ||
1176 | |||
1177 | if (!plr->d) { | ||
1178 | ret = -ENODEV; | ||
1179 | goto out; | ||
1180 | } | ||
1181 | |||
1182 | plr->thread_done = 0; | ||
1183 | cpu = cpumask_first(&plr->d->cpu_mask); | ||
1184 | if (!cpu_online(cpu)) { | ||
1185 | ret = -ENODEV; | ||
1186 | goto out; | ||
1187 | } | ||
1188 | |||
1189 | plr->cpu = cpu; | ||
1190 | |||
1191 | if (sel == 1) | ||
1192 | thread = kthread_create_on_node(measure_cycles_lat_fn, plr, | ||
1193 | cpu_to_node(cpu), | ||
1194 | "pseudo_lock_measure/%u", | ||
1195 | cpu); | ||
1196 | else if (sel == 2) | ||
1197 | thread = kthread_create_on_node(measure_l2_residency, plr, | ||
1198 | cpu_to_node(cpu), | ||
1199 | "pseudo_lock_measure/%u", | ||
1200 | cpu); | ||
1201 | else if (sel == 3) | ||
1202 | thread = kthread_create_on_node(measure_l3_residency, plr, | ||
1203 | cpu_to_node(cpu), | ||
1204 | "pseudo_lock_measure/%u", | ||
1205 | cpu); | ||
1206 | else | ||
1207 | goto out; | ||
1208 | |||
1209 | if (IS_ERR(thread)) { | ||
1210 | ret = PTR_ERR(thread); | ||
1211 | goto out; | ||
1212 | } | ||
1213 | kthread_bind(thread, cpu); | ||
1214 | wake_up_process(thread); | ||
1215 | |||
1216 | ret = wait_event_interruptible(plr->lock_thread_wq, | ||
1217 | plr->thread_done == 1); | ||
1218 | if (ret < 0) | ||
1219 | goto out; | ||
1220 | |||
1221 | ret = 0; | ||
1222 | |||
1223 | out: | ||
1224 | mutex_unlock(&rdtgroup_mutex); | ||
1225 | cpus_read_unlock(); | ||
1226 | return ret; | ||
1227 | } | ||
1228 | |||
1229 | static ssize_t pseudo_lock_measure_trigger(struct file *file, | ||
1230 | const char __user *user_buf, | ||
1231 | size_t count, loff_t *ppos) | ||
1232 | { | ||
1233 | struct rdtgroup *rdtgrp = file->private_data; | ||
1234 | size_t buf_size; | ||
1235 | char buf[32]; | ||
1236 | int ret; | ||
1237 | int sel; | ||
1238 | |||
1239 | buf_size = min(count, (sizeof(buf) - 1)); | ||
1240 | if (copy_from_user(buf, user_buf, buf_size)) | ||
1241 | return -EFAULT; | ||
1242 | |||
1243 | buf[buf_size] = '\0'; | ||
1244 | ret = kstrtoint(buf, 10, &sel); | ||
1245 | if (ret == 0) { | ||
1246 | if (sel != 1 && sel != 2 && sel != 3) | ||
1247 | return -EINVAL; | ||
1248 | ret = debugfs_file_get(file->f_path.dentry); | ||
1249 | if (ret) | ||
1250 | return ret; | ||
1251 | ret = pseudo_lock_measure_cycles(rdtgrp, sel); | ||
1252 | if (ret == 0) | ||
1253 | ret = count; | ||
1254 | debugfs_file_put(file->f_path.dentry); | ||
1255 | } | ||
1256 | |||
1257 | return ret; | ||
1258 | } | ||
1259 | |||
1260 | static const struct file_operations pseudo_measure_fops = { | ||
1261 | .write = pseudo_lock_measure_trigger, | ||
1262 | .open = simple_open, | ||
1263 | .llseek = default_llseek, | ||
1264 | }; | ||
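
The measurements above are driven entirely through this debugfs file: writing "1" triggers measure_cycles_lat_fn(), "2" triggers measure_l2_residency() and "3" triggers measure_l3_residency(), with results reported via the pseudo_lock_mem_latency, pseudo_lock_l2 and pseudo_lock_l3 tracepoints. A minimal userspace sketch of triggering the latency measurement follows; the debugfs mount point, the "resctrl" debugfs directory name and the resource group name "p0" are assumptions and will differ per system.

/*
 * Hypothetical example: trigger the cycle latency measurement (sel == 1)
 * for a pseudo-locked region belonging to a resource group named "p0".
 * The path assumes debugfs is mounted at /sys/kernel/debug and that the
 * resctrl debugfs directory is named "resctrl"; adjust as needed.
 * Results appear as pseudo_lock_mem_latency trace events.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd;

	fd = open("/sys/kernel/debug/resctrl/p0/pseudo_lock_measure", O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1", 1) != 1)
		perror("write");
	close(fd);
	return 0;
}
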
1265 | |||
1266 | /** | ||
1267 | * rdtgroup_pseudo_lock_create - Create a pseudo-locked region | ||
1268 | * @rdtgrp: resource group to which pseudo-lock region belongs | ||
1269 | * | ||
1270 | * Called when a resource group in the pseudo-locksetup mode receives a | ||
1271 | * valid schemata that should be pseudo-locked. Since the resource group is | ||
1272 | * in pseudo-locksetup mode the &struct pseudo_lock_region has already been | ||
1273 | * allocated and initialized with the essential information. If a failure | ||
1274 | * occurs the resource group remains in the pseudo-locksetup mode with the | ||
1275 | * &struct pseudo_lock_region associated with it, but cleared of all | ||
1276 | * information and ready for the user to re-attempt pseudo-locking by | ||
1277 | * writing the schemata again. | ||
1278 | * | ||
1279 | * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 | ||
1280 | * on failure. Descriptive error will be written to last_cmd_status buffer. | ||
1281 | */ | ||
1282 | int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) | ||
1283 | { | ||
1284 | struct pseudo_lock_region *plr = rdtgrp->plr; | ||
1285 | struct task_struct *thread; | ||
1286 | unsigned int new_minor; | ||
1287 | struct device *dev; | ||
1288 | int ret; | ||
1289 | |||
1290 | ret = pseudo_lock_region_alloc(plr); | ||
1291 | if (ret < 0) | ||
1292 | return ret; | ||
1293 | |||
1294 | ret = pseudo_lock_cstates_constrain(plr); | ||
1295 | if (ret < 0) { | ||
1296 | ret = -EINVAL; | ||
1297 | goto out_region; | ||
1298 | } | ||
1299 | |||
1300 | plr->thread_done = 0; | ||
1301 | |||
1302 | thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp, | ||
1303 | cpu_to_node(plr->cpu), | ||
1304 | "pseudo_lock/%u", plr->cpu); | ||
1305 | if (IS_ERR(thread)) { | ||
1306 | ret = PTR_ERR(thread); | ||
1307 | rdt_last_cmd_printf("Locking thread returned error %d\n", ret); | ||
1308 | goto out_cstates; | ||
1309 | } | ||
1310 | |||
1311 | kthread_bind(thread, plr->cpu); | ||
1312 | wake_up_process(thread); | ||
1313 | |||
1314 | ret = wait_event_interruptible(plr->lock_thread_wq, | ||
1315 | plr->thread_done == 1); | ||
1316 | if (ret < 0) { | ||
1317 | /* | ||
1318 | * If the thread does not get on the CPU for whatever | ||
1319 | * reason and the process which sets up the region is | ||
1320 | * interrupted then this will leave the thread in a runnable | ||
1321 | * state and once it gets on the CPU it will dereference | ||
1322 | * the cleared, but not freed, plr struct resulting in an | ||
1323 | * empty pseudo-locking loop. | ||
1324 | */ | ||
1325 | rdt_last_cmd_puts("Locking thread interrupted\n"); | ||
1326 | goto out_cstates; | ||
1327 | } | ||
1328 | |||
1329 | ret = pseudo_lock_minor_get(&new_minor); | ||
1330 | if (ret < 0) { | ||
1331 | rdt_last_cmd_puts("Unable to obtain a new minor number\n"); | ||
1332 | goto out_cstates; | ||
1333 | } | ||
1334 | |||
1335 | /* | ||
1336 | * Unlock access but do not release the reference. The | ||
1337 | * pseudo-locked region will still be here on return. | ||
1338 | * | ||
1339 | * The mutex has to be released temporarily to avoid a potential | ||
1340 | * deadlock with the mm->mmap_sem semaphore which is obtained in | ||
1341 | * the device_create() and debugfs_create_dir() callpath below | ||
1342 | * as well as before the mmap() callback is called. | ||
1343 | */ | ||
1344 | mutex_unlock(&rdtgroup_mutex); | ||
1345 | |||
1346 | if (!IS_ERR_OR_NULL(debugfs_resctrl)) { | ||
1347 | plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name, | ||
1348 | debugfs_resctrl); | ||
1349 | if (!IS_ERR_OR_NULL(plr->debugfs_dir)) | ||
1350 | debugfs_create_file("pseudo_lock_measure", 0200, | ||
1351 | plr->debugfs_dir, rdtgrp, | ||
1352 | &pseudo_measure_fops); | ||
1353 | } | ||
1354 | |||
1355 | dev = device_create(pseudo_lock_class, NULL, | ||
1356 | MKDEV(pseudo_lock_major, new_minor), | ||
1357 | rdtgrp, "%s", rdtgrp->kn->name); | ||
1358 | |||
1359 | mutex_lock(&rdtgroup_mutex); | ||
1360 | |||
1361 | if (IS_ERR(dev)) { | ||
1362 | ret = PTR_ERR(dev); | ||
1363 | rdt_last_cmd_printf("Failed to create character device: %d\n", | ||
1364 | ret); | ||
1365 | goto out_debugfs; | ||
1366 | } | ||
1367 | |||
1368 | /* We released the mutex - check if group was removed while we did so */ | ||
1369 | if (rdtgrp->flags & RDT_DELETED) { | ||
1370 | ret = -ENODEV; | ||
1371 | goto out_device; | ||
1372 | } | ||
1373 | |||
1374 | plr->minor = new_minor; | ||
1375 | |||
1376 | rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; | ||
1377 | closid_free(rdtgrp->closid); | ||
1378 | rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); | ||
1379 | rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); | ||
1380 | |||
1381 | ret = 0; | ||
1382 | goto out; | ||
1383 | |||
1384 | out_device: | ||
1385 | device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); | ||
1386 | out_debugfs: | ||
1387 | debugfs_remove_recursive(plr->debugfs_dir); | ||
1388 | pseudo_lock_minor_release(new_minor); | ||
1389 | out_cstates: | ||
1390 | pseudo_lock_cstates_relax(plr); | ||
1391 | out_region: | ||
1392 | pseudo_lock_region_clear(plr); | ||
1393 | out: | ||
1394 | return ret; | ||
1395 | } | ||
1396 | |||
1397 | /** | ||
1398 | * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region | ||
1399 | * @rdtgrp: resource group to which the pseudo-locked region belongs | ||
1400 | * | ||
1401 | * The removal of a pseudo-locked region can be initiated when the resource | ||
1402 | * group is removed via a "rmdir" from userspace or the | ||
1403 | * unmount of the resctrl filesystem. On removal the resource group does | ||
1404 | * not go back to pseudo-locksetup mode before it is removed, instead it is | ||
1405 | * removed directly. There is thus asymmetry with the creation where the | ||
1406 | * &struct pseudo_lock_region is removed here while it was not created in | ||
1407 | * rdtgroup_pseudo_lock_create(). | ||
1408 | * | ||
1409 | * Return: void | ||
1410 | */ | ||
1411 | void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) | ||
1412 | { | ||
1413 | struct pseudo_lock_region *plr = rdtgrp->plr; | ||
1414 | |||
1415 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { | ||
1416 | /* | ||
1417 | * Default group cannot be a pseudo-locked region so we can | ||
1418 | * free closid here. | ||
1419 | */ | ||
1420 | closid_free(rdtgrp->closid); | ||
1421 | goto free; | ||
1422 | } | ||
1423 | |||
1424 | pseudo_lock_cstates_relax(plr); | ||
1425 | debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); | ||
1426 | device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); | ||
1427 | pseudo_lock_minor_release(plr->minor); | ||
1428 | |||
1429 | free: | ||
1430 | pseudo_lock_free(rdtgrp); | ||
1431 | } | ||
1432 | |||
1433 | static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) | ||
1434 | { | ||
1435 | struct rdtgroup *rdtgrp; | ||
1436 | |||
1437 | mutex_lock(&rdtgroup_mutex); | ||
1438 | |||
1439 | rdtgrp = region_find_by_minor(iminor(inode)); | ||
1440 | if (!rdtgrp) { | ||
1441 | mutex_unlock(&rdtgroup_mutex); | ||
1442 | return -ENODEV; | ||
1443 | } | ||
1444 | |||
1445 | filp->private_data = rdtgrp; | ||
1446 | atomic_inc(&rdtgrp->waitcount); | ||
1447 | /* Perform a non-seekable open - llseek is not supported */ | ||
1448 | filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); | ||
1449 | |||
1450 | mutex_unlock(&rdtgroup_mutex); | ||
1451 | |||
1452 | return 0; | ||
1453 | } | ||
1454 | |||
1455 | static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) | ||
1456 | { | ||
1457 | struct rdtgroup *rdtgrp; | ||
1458 | |||
1459 | mutex_lock(&rdtgroup_mutex); | ||
1460 | rdtgrp = filp->private_data; | ||
1461 | WARN_ON(!rdtgrp); | ||
1462 | if (!rdtgrp) { | ||
1463 | mutex_unlock(&rdtgroup_mutex); | ||
1464 | return -ENODEV; | ||
1465 | } | ||
1466 | filp->private_data = NULL; | ||
1467 | atomic_dec(&rdtgrp->waitcount); | ||
1468 | mutex_unlock(&rdtgroup_mutex); | ||
1469 | return 0; | ||
1470 | } | ||
1471 | |||
1472 | static int pseudo_lock_dev_mremap(struct vm_area_struct *area) | ||
1473 | { | ||
1474 | /* Not supported */ | ||
1475 | return -EINVAL; | ||
1476 | } | ||
1477 | |||
1478 | static const struct vm_operations_struct pseudo_mmap_ops = { | ||
1479 | .mremap = pseudo_lock_dev_mremap, | ||
1480 | }; | ||
1481 | |||
1482 | static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) | ||
1483 | { | ||
1484 | unsigned long vsize = vma->vm_end - vma->vm_start; | ||
1485 | unsigned long off = vma->vm_pgoff << PAGE_SHIFT; | ||
1486 | struct pseudo_lock_region *plr; | ||
1487 | struct rdtgroup *rdtgrp; | ||
1488 | unsigned long physical; | ||
1489 | unsigned long psize; | ||
1490 | |||
1491 | mutex_lock(&rdtgroup_mutex); | ||
1492 | |||
1493 | rdtgrp = filp->private_data; | ||
1494 | WARN_ON(!rdtgrp); | ||
1495 | if (!rdtgrp) { | ||
1496 | mutex_unlock(&rdtgroup_mutex); | ||
1497 | return -ENODEV; | ||
1498 | } | ||
1499 | |||
1500 | plr = rdtgrp->plr; | ||
1501 | |||
1502 | if (!plr->d) { | ||
1503 | mutex_unlock(&rdtgroup_mutex); | ||
1504 | return -ENODEV; | ||
1505 | } | ||
1506 | |||
1507 | /* | ||
1508 | * Task is required to run with affinity to the cpus associated | ||
1509 | * with the pseudo-locked region. If this is not the case the task | ||
1510 | * may be scheduled elsewhere and invalidate entries in the | ||
1511 | * pseudo-locked region. | ||
1512 | */ | ||
1513 | if (!cpumask_subset(¤t->cpus_allowed, &plr->d->cpu_mask)) { | ||
1514 | mutex_unlock(&rdtgroup_mutex); | ||
1515 | return -EINVAL; | ||
1516 | } | ||
1517 | |||
1518 | physical = __pa(plr->kmem) >> PAGE_SHIFT; | ||
1519 | psize = plr->size - off; | ||
1520 | |||
1521 | if (off > plr->size) { | ||
1522 | mutex_unlock(&rdtgroup_mutex); | ||
1523 | return -ENOSPC; | ||
1524 | } | ||
1525 | |||
1526 | /* | ||
1527 | * Ensure changes are carried directly to the memory being mapped, | ||
1528 | * do not allow copy-on-write mapping. | ||
1529 | */ | ||
1530 | if (!(vma->vm_flags & VM_SHARED)) { | ||
1531 | mutex_unlock(&rdtgroup_mutex); | ||
1532 | return -EINVAL; | ||
1533 | } | ||
1534 | |||
1535 | if (vsize > psize) { | ||
1536 | mutex_unlock(&rdtgroup_mutex); | ||
1537 | return -ENOSPC; | ||
1538 | } | ||
1539 | |||
1540 | memset(plr->kmem + off, 0, vsize); | ||
1541 | |||
1542 | if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, | ||
1543 | vsize, vma->vm_page_prot)) { | ||
1544 | mutex_unlock(&rdtgroup_mutex); | ||
1545 | return -EAGAIN; | ||
1546 | } | ||
1547 | vma->vm_ops = &pseudo_mmap_ops; | ||
1548 | mutex_unlock(&rdtgroup_mutex); | ||
1549 | return 0; | ||
1550 | } | ||
1551 | |||
1552 | static const struct file_operations pseudo_lock_dev_fops = { | ||
1553 | .owner = THIS_MODULE, | ||
1554 | .llseek = no_llseek, | ||
1555 | .read = NULL, | ||
1556 | .write = NULL, | ||
1557 | .open = pseudo_lock_dev_open, | ||
1558 | .release = pseudo_lock_dev_release, | ||
1559 | .mmap = pseudo_lock_dev_mmap, | ||
1560 | }; | ||
1561 | |||
1562 | static char *pseudo_lock_devnode(struct device *dev, umode_t *mode) | ||
1563 | { | ||
1564 | struct rdtgroup *rdtgrp; | ||
1565 | |||
1566 | rdtgrp = dev_get_drvdata(dev); | ||
1567 | if (mode) | ||
1568 | *mode = 0600; | ||
1569 | return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); | ||
1570 | } | ||
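
Taken together with the device node naming above ("pseudo_lock/%s", i.e. /dev/pseudo_lock/<group name> with mode 0600), an application uses a pseudo-locked region by pinning itself to a CPU of the associated cache instance, opening the character device, and mapping it with MAP_SHARED; copy-on-write mappings and mappings larger than the region are rejected by pseudo_lock_dev_mmap(). The sketch below illustrates this flow under stated assumptions: the group name "p0", CPU 1 and the 256 KiB mapping size are all hypothetical.

/*
 * Hypothetical usage sketch for a pseudo-locked region in a resource
 * group named "p0" whose cache instance includes CPU 1. The group name,
 * CPU number and mapping size are assumptions; the mapping size must not
 * exceed the size of the pseudo-locked region.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t size = 256 * 1024;
	cpu_set_t cpuset;
	void *mem;
	int fd;

	/* Run only on CPUs associated with the pseudo-locked cache instance. */
	CPU_ZERO(&cpuset);
	CPU_SET(1, &cpuset);
	if (sched_setaffinity(0, sizeof(cpuset), &cpuset)) {
		perror("sched_setaffinity");
		return 1;
	}

	fd = open("/dev/pseudo_lock/p0", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* MAP_SHARED is required; copy-on-write mappings are refused. */
	mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (mem == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}

	/* ... access mem; the data stays in the pseudo-locked cache portion ... */

	munmap(mem, size);
	close(fd);
	return 0;
}
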
1571 | |||
1572 | int rdt_pseudo_lock_init(void) | ||
1573 | { | ||
1574 | int ret; | ||
1575 | |||
1576 | ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); | ||
1577 | if (ret < 0) | ||
1578 | return ret; | ||
1579 | |||
1580 | pseudo_lock_major = ret; | ||
1581 | |||
1582 | pseudo_lock_class = class_create(THIS_MODULE, "pseudo_lock"); | ||
1583 | if (IS_ERR(pseudo_lock_class)) { | ||
1584 | ret = PTR_ERR(pseudo_lock_class); | ||
1585 | unregister_chrdev(pseudo_lock_major, "pseudo_lock"); | ||
1586 | return ret; | ||
1587 | } | ||
1588 | |||
1589 | pseudo_lock_class->devnode = pseudo_lock_devnode; | ||
1590 | return 0; | ||
1591 | } | ||
1592 | |||
1593 | void rdt_pseudo_lock_release(void) | ||
1594 | { | ||
1595 | class_destroy(pseudo_lock_class); | ||
1596 | pseudo_lock_class = NULL; | ||
1597 | unregister_chrdev(pseudo_lock_major, "pseudo_lock"); | ||
1598 | pseudo_lock_major = 0; | ||
1599 | } | ||