diff options
| -rw-r--r-- | Documentation/x86/intel_rdt_ui.txt | 380 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/Makefile | 4 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/intel_rdt.c | 11 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/intel_rdt.h | 143 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 129 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 1522 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h | 43 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 808 |
8 files changed, 2965 insertions, 75 deletions
diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt index a16aa2113840..f662d3c530e5 100644 --- a/Documentation/x86/intel_rdt_ui.txt +++ b/Documentation/x86/intel_rdt_ui.txt | |||
| @@ -29,7 +29,11 @@ mount options are: | |||
| 29 | L2 and L3 CDP are controlled separately. | 29 | L2 and L3 CDP are controlled separately. |
| 30 | 30 | ||
| 31 | RDT features are orthogonal. A particular system may support only | 31 | RDT features are orthogonal. A particular system may support only |
| 32 | monitoring, only control, or both monitoring and control. | 32 | monitoring, only control, or both monitoring and control. Cache |
| 33 | pseudo-locking is a unique way of using cache control to "pin" or | ||
| 34 | "lock" data in the cache. Details can be found in | ||
| 35 | "Cache Pseudo-Locking". | ||
| 36 | |||
| 33 | 37 | ||
| 34 | The mount succeeds if either of allocation or monitoring is present, but | 38 | The mount succeeds if either of allocation or monitoring is present, but |
| 35 | only those files and directories supported by the system will be created. | 39 | only those files and directories supported by the system will be created. |
| @@ -65,6 +69,29 @@ related to allocation: | |||
| 65 | some platforms support devices that have their | 69 | some platforms support devices that have their |
| 66 | own settings for cache use which can over-ride | 70 | own settings for cache use which can over-ride |
| 67 | these bits. | 71 | these bits. |
| 72 | "bit_usage": Annotated capacity bitmasks showing how all | ||
| 73 | instances of the resource are used. The legend is: | ||
| 74 | "0" - Corresponding region is unused. When the system's | ||
| 75 | resources have been allocated and a "0" is found | ||
| 76 | in "bit_usage" it is a sign that resources are | ||
| 77 | wasted. | ||
| 78 | "H" - Corresponding region is used by hardware only | ||
| 79 | but available for software use. If a resource | ||
| 80 | has bits set in "shareable_bits" but not all | ||
| 81 | of these bits appear in the resource groups' | ||
| 82 | schematas then the bits appearing in | ||
| 83 | "shareable_bits" but no resource group will | ||
| 84 | be marked as "H". | ||
| 85 | "X" - Corresponding region is available for sharing and | ||
| 86 | used by hardware and software. These are the | ||
| 87 | bits that appear in "shareable_bits" as | ||
| 88 | well as a resource group's allocation. | ||
| 89 | "S" - Corresponding region is used by software | ||
| 90 | and available for sharing. | ||
| 91 | "E" - Corresponding region is used exclusively by | ||
| 92 | one resource group. No sharing allowed. | ||
| 93 | "P" - Corresponding region is pseudo-locked. No | ||
| 94 | sharing allowed. | ||
| 68 | 95 | ||
| 69 | Memory bandwidth(MB) subdirectory contains the following files | 96 | Memory bandwidth(MB) subdirectory contains the following files |
| 70 | with respect to allocation: | 97 | with respect to allocation: |
| @@ -151,6 +178,9 @@ All groups contain the following files: | |||
| 151 | CPUs to/from this group. As with the tasks file a hierarchy is | 178 | CPUs to/from this group. As with the tasks file a hierarchy is |
| 152 | maintained where MON groups may only include CPUs owned by the | 179 | maintained where MON groups may only include CPUs owned by the |
| 153 | parent CTRL_MON group. | 180 | parent CTRL_MON group. |
| 181 | When the resource group is in pseudo-locked mode this file will ||
| 182 | only be readable, reflecting the CPUs associated with the | ||
| 183 | pseudo-locked region. | ||
| 154 | 184 | ||
| 155 | 185 | ||
| 156 | "cpus_list": | 186 | "cpus_list": |
| @@ -163,6 +193,21 @@ When control is enabled all CTRL_MON groups will also contain: | |||
| 163 | A list of all the resources available to this group. | 193 | A list of all the resources available to this group. |
| 164 | Each resource has its own line and format - see below for details. | 194 | Each resource has its own line and format - see below for details. |
| 165 | 195 | ||
| 196 | "size": | ||
| 197 | Mirrors the display of the "schemata" file to display the size in | ||
| 198 | bytes of each allocation instead of the bits representing the | ||
| 199 | allocation. | ||
| 200 | |||
| 201 | "mode": | ||
| 202 | The "mode" of the resource group dictates the sharing of its | ||
| 203 | allocations. A "shareable" resource group allows sharing of its | ||
| 204 | allocations while an "exclusive" resource group does not. A | ||
| 205 | cache pseudo-locked region is created by first writing | ||
| 206 | "pseudo-locksetup" to the "mode" file before writing the cache | ||
| 207 | pseudo-locked region's schemata to the resource group's "schemata" | ||
| 208 | file. On successful pseudo-locked region creation the mode will | ||
| 209 | automatically change to "pseudo-locked". | ||
| 210 | |||
| 166 | When monitoring is enabled all MON groups will also contain: | 211 | When monitoring is enabled all MON groups will also contain: |
| 167 | 212 | ||
| 168 | "mon_data": | 213 | "mon_data": |
| @@ -379,6 +424,170 @@ L3CODE:0=fffff;1=fffff;2=fffff;3=fffff | |||
| 379 | L3DATA:0=fffff;1=fffff;2=3c0;3=fffff | 424 | L3DATA:0=fffff;1=fffff;2=3c0;3=fffff |
| 380 | L3CODE:0=fffff;1=fffff;2=fffff;3=fffff | 425 | L3CODE:0=fffff;1=fffff;2=fffff;3=fffff |
| 381 | 426 | ||
| 427 | Cache Pseudo-Locking | ||
| 428 | -------------------- | ||
| 429 | CAT enables a user to specify the amount of cache space that an | ||
| 430 | application can fill. Cache pseudo-locking builds on the fact that a | ||
| 431 | CPU can still read and write data pre-allocated outside its current | ||
| 432 | allocated area on a cache hit. With cache pseudo-locking, data can be | ||
| 433 | preloaded into a reserved portion of cache that no application can | ||
| 434 | fill, and from that point on will only serve cache hits. The cache | ||
| 435 | pseudo-locked memory is made accessible to user space where an | ||
| 436 | application can map it into its virtual address space and thus have | ||
| 437 | a region of memory with reduced average read latency. | ||
| 438 | |||
| 439 | The creation of a cache pseudo-locked region is triggered by a request | ||
| 440 | from the user to do so that is accompanied by a schemata of the region | ||
| 441 | to be pseudo-locked. The cache pseudo-locked region is created as follows: | ||
| 442 | - Create a CAT allocation CLOSNEW with a CBM matching the schemata | ||
| 443 | from the user of the cache region that will contain the pseudo-locked | ||
| 444 | memory. This region must not overlap with any current CAT allocation/CLOS | ||
| 445 | on the system and no future overlap with this cache region is allowed | ||
| 446 | while the pseudo-locked region exists. | ||
| 447 | - Create a contiguous region of memory of the same size as the cache | ||
| 448 | region. | ||
| 449 | - Flush the cache, disable hardware prefetchers, disable preemption. | ||
| 450 | - Make CLOSNEW the active CLOS and touch the allocated memory to load | ||
| 451 | it into the cache. | ||
| 452 | - Set the previous CLOS as active. | ||
| 453 | - At this point the closid CLOSNEW can be released - the cache | ||
| 454 | pseudo-locked region is protected as long as its CBM does not appear in | ||
| 455 | any CAT allocation. Even though the cache pseudo-locked region will from | ||
| 456 | this point on not appear in any CBM of any CLOS an application running with | ||
| 457 | any CLOS will be able to access the memory in the pseudo-locked region since | ||
| 458 | the region continues to serve cache hits. | ||
| 459 | - The contiguous region of memory loaded into the cache is exposed to | ||
| 460 | user-space as a character device. | ||
| 461 | |||
| 462 | Cache pseudo-locking increases the probability that data will remain | ||
| 463 | in the cache via carefully configuring the CAT feature and controlling | ||
| 464 | application behavior. There is no guarantee that data is placed in | ||
| 465 | cache. Instructions like INVD, WBINVD, CLFLUSH, etc. can still evict | ||
| 466 | “locked” data from cache. Power management C-states may shrink or | ||
| 467 | power off cache. Deeper C-states will automatically be restricted on | ||
| 468 | pseudo-locked region creation. | ||
| 469 | |||
| 470 | It is required that an application using a pseudo-locked region runs | ||
| 471 | with affinity to the cores (or a subset of the cores) associated | ||
| 472 | with the cache on which the pseudo-locked region resides. A sanity check | ||
| 473 | within the code will not allow an application to map pseudo-locked memory | ||
| 474 | unless it runs with affinity to cores associated with the cache on which the | ||
| 475 | pseudo-locked region resides. The sanity check is only done during the | ||
| 476 | initial mmap() handling, there is no enforcement afterwards and the | ||
| 477 | application itself needs to ensure it remains affine to the correct cores. ||
| 478 | |||
| 479 | Pseudo-locking is accomplished in two stages: | ||
| 480 | 1) During the first stage the system administrator allocates a portion | ||
| 481 | of cache that should be dedicated to pseudo-locking. At this time an | ||
| 482 | equivalent portion of memory is allocated, loaded into allocated | ||
| 483 | cache portion, and exposed as a character device. | ||
| 484 | 2) During the second stage a user-space application maps (mmap()) the | ||
| 485 | pseudo-locked memory into its address space. | ||
| 486 | |||
| 487 | Cache Pseudo-Locking Interface | ||
| 488 | ------------------------------ | ||
| 489 | A pseudo-locked region is created using the resctrl interface as follows: | ||
| 490 | |||
| 491 | 1) Create a new resource group by creating a new directory in /sys/fs/resctrl. | ||
| 492 | 2) Change the new resource group's mode to "pseudo-locksetup" by writing | ||
| 493 | "pseudo-locksetup" to the "mode" file. | ||
| 494 | 3) Write the schemata of the pseudo-locked region to the "schemata" file. All | ||
| 495 | bits within the schemata should be "unused" according to the "bit_usage" | ||
| 496 | file. | ||
| 497 | |||
| 498 | On successful pseudo-locked region creation the "mode" file will contain | ||
| 499 | "pseudo-locked" and a new character device with the same name as the resource | ||
| 500 | group will exist in /dev/pseudo_lock. This character device can be mmap()'ed | ||
| 501 | by user space in order to obtain access to the pseudo-locked memory region. | ||
| 502 | |||
| 503 | An example of cache pseudo-locked region creation and usage can be found below. | ||
| 504 | |||
| 505 | Cache Pseudo-Locking Debugging Interface | ||
| 506 | ---------------------------------------- ||
| 507 | The pseudo-locking debugging interface is enabled by default (if | ||
| 508 | CONFIG_DEBUG_FS is enabled) and can be found in /sys/kernel/debug/resctrl. | ||
| 509 | |||
| 510 | There is no explicit way for the kernel to test if a provided memory | ||
| 511 | location is present in the cache. The pseudo-locking debugging interface uses | ||
| 512 | the tracing infrastructure to provide two ways to measure cache residency of | ||
| 513 | the pseudo-locked region: | ||
| 514 | 1) Memory access latency using the pseudo_lock_mem_latency tracepoint. Data | ||
| 515 | from these measurements are best visualized using a hist trigger (see | ||
| 516 | example below). In this test the pseudo-locked region is traversed at | ||
| 517 | a stride of 32 bytes while hardware prefetchers and preemption | ||
| 518 | are disabled. This also provides a substitute visualization of cache | ||
| 519 | hits and misses. | ||
| 520 | 2) Cache hit and miss measurements using model specific precision counters if | ||
| 521 | available. Depending on the levels of cache on the system the pseudo_lock_l2 | ||
| 522 | and pseudo_lock_l3 tracepoints are available. | ||
| 523 | WARNING: triggering this measurement uses from two (for just L2 | ||
| 524 | measurements) to four (for L2 and L3 measurements) precision counters on | ||
| 525 | the system, if any other measurements are in progress the counters and | ||
| 526 | their corresponding event registers will be clobbered. | ||
| 527 | |||
| 528 | When a pseudo-locked region is created a new debugfs directory is created for | ||
| 529 | it in debugfs as /sys/kernel/debug/resctrl/<newdir>. A single | ||
| 530 | write-only file, pseudo_lock_measure, is present in this directory. The | ||
| 531 | measurement on the pseudo-locked region depends on the number, 1 or 2, | ||
| 532 | written to this debugfs file. Since the measurements are recorded with the | ||
| 533 | tracing infrastructure the relevant tracepoints need to be enabled before the | ||
| 534 | measurement is triggered. | ||
| 535 | |||
| 536 | Example of latency debugging interface: | ||
| 537 | In this example a pseudo-locked region named "newlock" was created. Here is | ||
| 538 | how we can measure the latency in cycles of reading from this region and | ||
| 539 | visualize this data with a histogram that is available if CONFIG_HIST_TRIGGERS | ||
| 540 | is set: | ||
| 541 | # :> /sys/kernel/debug/tracing/trace | ||
| 542 | # echo 'hist:keys=latency' > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/trigger | ||
| 543 | # echo 1 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/enable | ||
| 544 | # echo 1 > /sys/kernel/debug/resctrl/newlock/pseudo_lock_measure | ||
| 545 | # echo 0 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/enable | ||
| 546 | # cat /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/hist | ||
| 547 | |||
| 548 | # event histogram | ||
| 549 | # | ||
| 550 | # trigger info: hist:keys=latency:vals=hitcount:sort=hitcount:size=2048 [active] | ||
| 551 | # | ||
| 552 | |||
| 553 | { latency: 456 } hitcount: 1 | ||
| 554 | { latency: 50 } hitcount: 83 | ||
| 555 | { latency: 36 } hitcount: 96 | ||
| 556 | { latency: 44 } hitcount: 174 | ||
| 557 | { latency: 48 } hitcount: 195 | ||
| 558 | { latency: 46 } hitcount: 262 | ||
| 559 | { latency: 42 } hitcount: 693 | ||
| 560 | { latency: 40 } hitcount: 3204 | ||
| 561 | { latency: 38 } hitcount: 3484 | ||
| 562 | |||
| 563 | Totals: | ||
| 564 | Hits: 8192 | ||
| 565 | Entries: 9 | ||
| 566 | Dropped: 0 | ||
| 567 | |||
| 568 | Example of cache hits/misses debugging: | ||
| 569 | In this example a pseudo-locked region named "newlock" was created on the L2 | ||
| 570 | cache of a platform. Here is how we can obtain details of the cache hits | ||
| 571 | and misses using the platform's precision counters. | ||
| 572 | |||
| 573 | # :> /sys/kernel/debug/tracing/trace | ||
| 574 | # echo 1 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_l2/enable | ||
| 575 | # echo 2 > /sys/kernel/debug/resctrl/newlock/pseudo_lock_measure | ||
| 576 | # echo 0 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_l2/enable | ||
| 577 | # cat /sys/kernel/debug/tracing/trace | ||
| 578 | |||
| 579 | # tracer: nop | ||
| 580 | # | ||
| 581 | # _-----=> irqs-off | ||
| 582 | # / _----=> need-resched | ||
| 583 | # | / _---=> hardirq/softirq | ||
| 584 | # || / _--=> preempt-depth | ||
| 585 | # ||| / delay | ||
| 586 | # TASK-PID CPU# |||| TIMESTAMP FUNCTION | ||
| 587 | # | | | |||| | | | ||
| 588 | pseudo_lock_mea-1672 [002] .... 3132.860500: pseudo_lock_l2: hits=4097 miss=0 | ||
| 589 | |||
| 590 | |||
| 382 | Examples for RDT allocation usage: | 591 | Examples for RDT allocation usage: |
| 383 | 592 | ||
| 384 | Example 1 | 593 | Example 1 |
| @@ -502,7 +711,172 @@ siblings and only the real time threads are scheduled on the cores 4-7. | |||
| 502 | 711 | ||
| 503 | # echo F0 > p0/cpus | 712 | # echo F0 > p0/cpus |
| 504 | 713 | ||
| 505 | 4) Locking between applications | 714 | Example 4 |
| 715 | --------- | ||
| 716 | |||
| 717 | The resource groups in previous examples were all in the default "shareable" | ||
| 718 | mode allowing sharing of their cache allocations. If one resource group | ||
| 719 | configures a cache allocation then nothing prevents another resource group ||
| 720 | from overlapping with that allocation. ||
| 721 | |||
| 722 | In this example a new exclusive resource group will be created on a L2 CAT | ||
| 723 | system with two L2 cache instances that can be configured with an 8-bit | ||
| 724 | capacity bitmask. The new exclusive resource group will be configured to use | ||
| 725 | 25% of each cache instance. | ||
| 726 | |||
| 727 | # mount -t resctrl resctrl /sys/fs/resctrl/ | ||
| 728 | # cd /sys/fs/resctrl | ||
| 729 | |||
| 730 | First, we observe that the default group is configured to allocate to all L2 | ||
| 731 | cache: | ||
| 732 | |||
| 733 | # cat schemata | ||
| 734 | L2:0=ff;1=ff | ||
| 735 | |||
| 736 | We could attempt to create the new resource group at this point, but it will | ||
| 737 | fail because of the overlap with the schemata of the default group: | ||
| 738 | # mkdir p0 | ||
| 739 | # echo 'L2:0=0x3;1=0x3' > p0/schemata | ||
| 740 | # cat p0/mode | ||
| 741 | shareable | ||
| 742 | # echo exclusive > p0/mode | ||
| 743 | -sh: echo: write error: Invalid argument | ||
| 744 | # cat info/last_cmd_status | ||
| 745 | schemata overlaps | ||
| 746 | |||
| 747 | To ensure that there is no overlap with another resource group the default | ||
| 748 | resource group's schemata has to change, making it possible for the new | ||
| 749 | resource group to become exclusive. | ||
| 750 | # echo 'L2:0=0xfc;1=0xfc' > schemata | ||
| 751 | # echo exclusive > p0/mode | ||
| 752 | # grep . p0/* | ||
| 753 | p0/cpus:0 | ||
| 754 | p0/mode:exclusive | ||
| 755 | p0/schemata:L2:0=03;1=03 | ||
| 756 | p0/size:L2:0=262144;1=262144 | ||
| 757 | |||
| 758 | A new resource group will on creation not overlap with an exclusive resource | ||
| 759 | group: | ||
| 760 | # mkdir p1 | ||
| 761 | # grep . p1/* | ||
| 762 | p1/cpus:0 | ||
| 763 | p1/mode:shareable | ||
| 764 | p1/schemata:L2:0=fc;1=fc | ||
| 765 | p1/size:L2:0=786432;1=786432 | ||
| 766 | |||
| 767 | The bit_usage will reflect how the cache is used: | ||
| 768 | # cat info/L2/bit_usage | ||
| 769 | 0=SSSSSSEE;1=SSSSSSEE | ||
| 770 | |||
| 771 | A resource group cannot be forced to overlap with an exclusive resource group: | ||
| 772 | # echo 'L2:0=0x1;1=0x1' > p1/schemata | ||
| 773 | -sh: echo: write error: Invalid argument | ||
| 774 | # cat info/last_cmd_status | ||
| 775 | overlaps with exclusive group | ||
| 776 | |||
| 777 | Example of Cache Pseudo-Locking | ||
| 778 | ------------------------------- | ||
| 779 | Lock portion of L2 cache from cache id 1 using CBM 0x3. Pseudo-locked | ||
| 780 | region is exposed at /dev/pseudo_lock/newlock that can be provided to | ||
| 781 | application for argument to mmap(). | ||
| 782 | |||
| 783 | # mount -t resctrl resctrl /sys/fs/resctrl/ | ||
| 784 | # cd /sys/fs/resctrl | ||
| 785 | |||
| 786 | Ensure that there are bits available that can be pseudo-locked, since only | ||
| 787 | unused bits can be pseudo-locked the bits to be pseudo-locked need to be ||
| 788 | removed from the default resource group's schemata: | ||
| 789 | # cat info/L2/bit_usage | ||
| 790 | 0=SSSSSSSS;1=SSSSSSSS | ||
| 791 | # echo 'L2:1=0xfc' > schemata | ||
| 792 | # cat info/L2/bit_usage | ||
| 793 | 0=SSSSSSSS;1=SSSSSS00 | ||
| 794 | |||
| 795 | Create a new resource group that will be associated with the pseudo-locked | ||
| 796 | region, indicate that it will be used for a pseudo-locked region, and | ||
| 797 | configure the requested pseudo-locked region capacity bitmask: | ||
| 798 | |||
| 799 | # mkdir newlock | ||
| 800 | # echo pseudo-locksetup > newlock/mode | ||
| 801 | # echo 'L2:1=0x3' > newlock/schemata | ||
| 802 | |||
| 803 | On success the resource group's mode will change to pseudo-locked, the | ||
| 804 | bit_usage will reflect the pseudo-locked region, and the character device | ||
| 805 | exposing the pseudo-locked region will exist: | ||
| 806 | |||
| 807 | # cat newlock/mode | ||
| 808 | pseudo-locked | ||
| 809 | # cat info/L2/bit_usage | ||
| 810 | 0=SSSSSSSS;1=SSSSSSPP | ||
| 811 | # ls -l /dev/pseudo_lock/newlock | ||
| 812 | crw------- 1 root root 243, 0 Apr 3 05:01 /dev/pseudo_lock/newlock | ||
| 813 | |||
| 814 | /* | ||
| 815 | * Example code to access one page of pseudo-locked cache region | ||
| 816 | * from user space. | ||
| 817 | */ | ||
| 818 | #define _GNU_SOURCE | ||
| 819 | #include <fcntl.h> | ||
| 820 | #include <sched.h> | ||
| 821 | #include <stdio.h> | ||
| 822 | #include <stdlib.h> | ||
| 823 | #include <unistd.h> | ||
| 824 | #include <sys/mman.h> | ||
| 825 | |||
| 826 | /* | ||
| 827 | * It is required that the application runs with affinity to only | ||
| 828 | * cores associated with the pseudo-locked region. Here the cpu | ||
| 829 | * is hardcoded for convenience of example. | ||
| 830 | */ | ||
| 831 | static int cpuid = 2; | ||
| 832 | |||
| 833 | int main(int argc, char *argv[]) | ||
| 834 | { | ||
| 835 | cpu_set_t cpuset; | ||
| 836 | long page_size; | ||
| 837 | void *mapping; | ||
| 838 | int dev_fd; | ||
| 839 | int ret; | ||
| 840 | |||
| 841 | page_size = sysconf(_SC_PAGESIZE); | ||
| 842 | |||
| 843 | CPU_ZERO(&cpuset); | ||
| 844 | CPU_SET(cpuid, &cpuset); | ||
| 845 | ret = sched_setaffinity(0, sizeof(cpuset), &cpuset); | ||
| 846 | if (ret < 0) { | ||
| 847 | perror("sched_setaffinity"); | ||
| 848 | exit(EXIT_FAILURE); | ||
| 849 | } | ||
| 850 | |||
| 851 | dev_fd = open("/dev/pseudo_lock/newlock", O_RDWR); | ||
| 852 | if (dev_fd < 0) { | ||
| 853 | perror("open"); | ||
| 854 | exit(EXIT_FAILURE); | ||
| 855 | } | ||
| 856 | |||
| 857 | mapping = mmap(0, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, | ||
| 858 | dev_fd, 0); | ||
| 859 | if (mapping == MAP_FAILED) { | ||
| 860 | perror("mmap"); | ||
| 861 | close(dev_fd); | ||
| 862 | exit(EXIT_FAILURE); | ||
| 863 | } | ||
| 864 | |||
| 865 | /* Application interacts with pseudo-locked memory @mapping */ | ||
| 866 | |||
| 867 | ret = munmap(mapping, page_size); | ||
| 868 | if (ret < 0) { | ||
| 869 | perror("munmap"); | ||
| 870 | close(dev_fd); | ||
| 871 | exit(EXIT_FAILURE); | ||
| 872 | } | ||
| 873 | |||
| 874 | close(dev_fd); | ||
| 875 | exit(EXIT_SUCCESS); | ||
| 876 | } | ||
| 877 | |||
| 878 | Locking between applications | ||
| 879 | ---------------------------- | ||
| 506 | 880 | ||
| 507 | Certain operations on the resctrl filesystem, composed of read/writes | 881 | Certain operations on the resctrl filesystem, composed of read/writes |
| 508 | to/from multiple files, must be atomic. | 882 | to/from multiple files, must be atomic. |
| @@ -510,7 +884,7 @@ to/from multiple files, must be atomic. | |||
| 510 | As an example, the allocation of an exclusive reservation of L3 cache | 884 | As an example, the allocation of an exclusive reservation of L3 cache |
| 511 | involves: | 885 | involves: |
| 512 | 886 | ||
| 513 | 1. Read the cbmmasks from each directory | 887 | 1. Read the cbmmasks from each directory or the per-resource "bit_usage" |
| 514 | 2. Find a contiguous set of bits in the global CBM bitmask that is clear | 888 | 2. Find a contiguous set of bits in the global CBM bitmask that is clear |
| 515 | in any of the directory cbmmasks | 889 | in any of the directory cbmmasks |
| 516 | 3. Create a new directory | 890 | 3. Create a new directory |
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 7a40196967cb..347137e80bf5 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile | |||
| @@ -35,7 +35,9 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o | |||
| 35 | obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o | 35 | obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o |
| 36 | obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o | 36 | obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o |
| 37 | 37 | ||
| 38 | obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o | 38 | obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o |
| 39 | obj-$(CONFIG_INTEL_RDT) += intel_rdt_ctrlmondata.o intel_rdt_pseudo_lock.o | ||
| 40 | CFLAGS_intel_rdt_pseudo_lock.o = -I$(src) | ||
| 39 | 41 | ||
| 40 | obj-$(CONFIG_X86_MCE) += mcheck/ | 42 | obj-$(CONFIG_X86_MCE) += mcheck/ |
| 41 | obj-$(CONFIG_MTRR) += mtrr/ | 43 | obj-$(CONFIG_MTRR) += mtrr/ |
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index ec4754f81cbd..abb71ac70443 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c | |||
| @@ -859,6 +859,8 @@ static __init bool get_rdt_resources(void) | |||
| 859 | return (rdt_mon_capable || rdt_alloc_capable); | 859 | return (rdt_mon_capable || rdt_alloc_capable); |
| 860 | } | 860 | } |
| 861 | 861 | ||
| 862 | static enum cpuhp_state rdt_online; | ||
| 863 | |||
| 862 | static int __init intel_rdt_late_init(void) | 864 | static int __init intel_rdt_late_init(void) |
| 863 | { | 865 | { |
| 864 | struct rdt_resource *r; | 866 | struct rdt_resource *r; |
| @@ -880,6 +882,7 @@ static int __init intel_rdt_late_init(void) | |||
| 880 | cpuhp_remove_state(state); | 882 | cpuhp_remove_state(state); |
| 881 | return ret; | 883 | return ret; |
| 882 | } | 884 | } |
| 885 | rdt_online = state; | ||
| 883 | 886 | ||
| 884 | for_each_alloc_capable_rdt_resource(r) | 887 | for_each_alloc_capable_rdt_resource(r) |
| 885 | pr_info("Intel RDT %s allocation detected\n", r->name); | 888 | pr_info("Intel RDT %s allocation detected\n", r->name); |
| @@ -891,3 +894,11 @@ static int __init intel_rdt_late_init(void) | |||
| 891 | } | 894 | } |
| 892 | 895 | ||
| 893 | late_initcall(intel_rdt_late_init); | 896 | late_initcall(intel_rdt_late_init); |
| 897 | |||
| 898 | static void __exit intel_rdt_exit(void) | ||
| 899 | { | ||
| 900 | cpuhp_remove_state(rdt_online); | ||
| 901 | rdtgroup_exit(); | ||
| 902 | } | ||
| 903 | |||
| 904 | __exitcall(intel_rdt_exit); | ||
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 39752825e376..4e588f36228f 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h | |||
| @@ -81,6 +81,34 @@ enum rdt_group_type { | |||
| 81 | }; | 81 | }; |
| 82 | 82 | ||
| 83 | /** | 83 | /** |
| 84 | * enum rdtgrp_mode - Mode of a RDT resource group | ||
| 85 | * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations | ||
| 86 | * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed | ||
| 87 | * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking | ||
| 88 | * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations | ||
| 89 | * allowed AND the allocations are Cache Pseudo-Locked | ||
| 90 | * | ||
| 91 | * The mode of a resource group enables control over the allowed overlap | ||
| 92 | * between allocations associated with different resource groups (classes | ||
| 93 | * of service). User is able to modify the mode of a resource group by | ||
| 94 | * writing to the "mode" resctrl file associated with the resource group. | ||
| 95 | * | ||
| 96 | * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by | ||
| 97 | * writing the appropriate text to the "mode" file. A resource group enters | ||
| 98 | * "pseudo-locked" mode after the schemata is written while the resource | ||
| 99 | * group is in "pseudo-locksetup" mode. | ||
| 100 | */ | ||
| 101 | enum rdtgrp_mode { | ||
| 102 | RDT_MODE_SHAREABLE = 0, | ||
| 103 | RDT_MODE_EXCLUSIVE, | ||
| 104 | RDT_MODE_PSEUDO_LOCKSETUP, | ||
| 105 | RDT_MODE_PSEUDO_LOCKED, | ||
| 106 | |||
| 107 | /* Must be last */ | ||
| 108 | RDT_NUM_MODES, | ||
| 109 | }; | ||
| 110 | |||
| 111 | /** | ||
| 84 | * struct mongroup - store mon group's data in resctrl fs. | 112 | * struct mongroup - store mon group's data in resctrl fs. |
| 85 | * @mon_data_kn kernlfs node for the mon_data directory | 113 | * @mon_data_kn kernlfs node for the mon_data directory |
| 86 | * @parent: parent rdtgrp | 114 | * @parent: parent rdtgrp |
| @@ -95,6 +123,43 @@ struct mongroup { | |||
| 95 | }; | 123 | }; |
| 96 | 124 | ||
| 97 | /** | 125 | /** |
| 126 | * struct pseudo_lock_region - pseudo-lock region information | ||
| 127 | * @r: RDT resource to which this pseudo-locked region | ||
| 128 | * belongs | ||
| 129 | * @d: RDT domain to which this pseudo-locked region | ||
| 130 | * belongs | ||
| 131 | * @cbm: bitmask of the pseudo-locked region | ||
| 132 | * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread | ||
| 133 | * completion | ||
| 134 | * @thread_done: variable used by waitqueue to test if pseudo-locking | ||
| 135 | * thread completed | ||
| 136 | * @cpu: core associated with the cache on which the setup code | ||
| 137 | * will be run | ||
| 138 | * @line_size: size of the cache lines | ||
| 139 | * @size: size of pseudo-locked region in bytes | ||
| 140 | * @kmem: the kernel memory associated with pseudo-locked region | ||
| 141 | * @minor: minor number of character device associated with this | ||
| 142 | * region | ||
| 143 | * @debugfs_dir: pointer to this region's directory in the debugfs | ||
| 144 | * filesystem | ||
| 145 | * @pm_reqs: Power management QoS requests related to this region | ||
| 146 | */ | ||
| 147 | struct pseudo_lock_region { | ||
| 148 | struct rdt_resource *r; | ||
| 149 | struct rdt_domain *d; | ||
| 150 | u32 cbm; | ||
| 151 | wait_queue_head_t lock_thread_wq; | ||
| 152 | int thread_done; | ||
| 153 | int cpu; | ||
| 154 | unsigned int line_size; | ||
| 155 | unsigned int size; | ||
| 156 | void *kmem; | ||
| 157 | unsigned int minor; | ||
| 158 | struct dentry *debugfs_dir; | ||
| 159 | struct list_head pm_reqs; | ||
| 160 | }; | ||
| 161 | |||
| 162 | /** | ||
| 98 | * struct rdtgroup - store rdtgroup's data in resctrl file system. | 163 | * struct rdtgroup - store rdtgroup's data in resctrl file system. |
| 99 | * @kn: kernfs node | 164 | * @kn: kernfs node |
| 100 | * @rdtgroup_list: linked list for all rdtgroups | 165 | * @rdtgroup_list: linked list for all rdtgroups |
| @@ -106,16 +171,20 @@ struct mongroup { | |||
| 106 | * @type: indicates type of this rdtgroup - either | 171 | * @type: indicates type of this rdtgroup - either |
| 107 | * monitor only or ctrl_mon group | 172 | * monitor only or ctrl_mon group |
| 108 | * @mon: mongroup related data | 173 | * @mon: mongroup related data |
| 174 | * @mode: mode of resource group | ||
| 175 | * @plr: pseudo-locked region | ||
| 109 | */ | 176 | */ |
| 110 | struct rdtgroup { | 177 | struct rdtgroup { |
| 111 | struct kernfs_node *kn; | 178 | struct kernfs_node *kn; |
| 112 | struct list_head rdtgroup_list; | 179 | struct list_head rdtgroup_list; |
| 113 | u32 closid; | 180 | u32 closid; |
| 114 | struct cpumask cpu_mask; | 181 | struct cpumask cpu_mask; |
| 115 | int flags; | 182 | int flags; |
| 116 | atomic_t waitcount; | 183 | atomic_t waitcount; |
| 117 | enum rdt_group_type type; | 184 | enum rdt_group_type type; |
| 118 | struct mongroup mon; | 185 | struct mongroup mon; |
| 186 | enum rdtgrp_mode mode; | ||
| 187 | struct pseudo_lock_region *plr; | ||
| 119 | }; | 188 | }; |
| 120 | 189 | ||
| 121 | /* rdtgroup.flags */ | 190 | /* rdtgroup.flags */ |
| @@ -148,6 +217,7 @@ extern struct list_head rdt_all_groups; | |||
| 148 | extern int max_name_width, max_data_width; | 217 | extern int max_name_width, max_data_width; |
| 149 | 218 | ||
| 150 | int __init rdtgroup_init(void); | 219 | int __init rdtgroup_init(void); |
| 220 | void __exit rdtgroup_exit(void); | ||
| 151 | 221 | ||
| 152 | /** | 222 | /** |
| 153 | * struct rftype - describe each file in the resctrl file system | 223 | * struct rftype - describe each file in the resctrl file system |
| @@ -216,22 +286,24 @@ struct mbm_state { | |||
| 216 | * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps | 286 | * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps |
| 217 | * @new_ctrl: new ctrl value to be loaded | 287 | * @new_ctrl: new ctrl value to be loaded |
| 218 | * @have_new_ctrl: did user provide new_ctrl for this domain | 288 | * @have_new_ctrl: did user provide new_ctrl for this domain |
| 289 | * @plr: pseudo-locked region (if any) associated with domain | ||
| 219 | */ | 290 | */ |
| 220 | struct rdt_domain { | 291 | struct rdt_domain { |
| 221 | struct list_head list; | 292 | struct list_head list; |
| 222 | int id; | 293 | int id; |
| 223 | struct cpumask cpu_mask; | 294 | struct cpumask cpu_mask; |
| 224 | unsigned long *rmid_busy_llc; | 295 | unsigned long *rmid_busy_llc; |
| 225 | struct mbm_state *mbm_total; | 296 | struct mbm_state *mbm_total; |
| 226 | struct mbm_state *mbm_local; | 297 | struct mbm_state *mbm_local; |
| 227 | struct delayed_work mbm_over; | 298 | struct delayed_work mbm_over; |
| 228 | struct delayed_work cqm_limbo; | 299 | struct delayed_work cqm_limbo; |
| 229 | int mbm_work_cpu; | 300 | int mbm_work_cpu; |
| 230 | int cqm_work_cpu; | 301 | int cqm_work_cpu; |
| 231 | u32 *ctrl_val; | 302 | u32 *ctrl_val; |
| 232 | u32 *mbps_val; | 303 | u32 *mbps_val; |
| 233 | u32 new_ctrl; | 304 | u32 new_ctrl; |
| 234 | bool have_new_ctrl; | 305 | bool have_new_ctrl; |
| 306 | struct pseudo_lock_region *plr; | ||
| 235 | }; | 307 | }; |
| 236 | 308 | ||
| 237 | /** | 309 | /** |
| @@ -351,7 +423,7 @@ struct rdt_resource { | |||
| 351 | struct rdt_cache cache; | 423 | struct rdt_cache cache; |
| 352 | struct rdt_membw membw; | 424 | struct rdt_membw membw; |
| 353 | const char *format_str; | 425 | const char *format_str; |
| 354 | int (*parse_ctrlval) (char *buf, struct rdt_resource *r, | 426 | int (*parse_ctrlval) (void *data, struct rdt_resource *r, |
| 355 | struct rdt_domain *d); | 427 | struct rdt_domain *d); |
| 356 | struct list_head evt_list; | 428 | struct list_head evt_list; |
| 357 | int num_rmid; | 429 | int num_rmid; |
| @@ -359,8 +431,8 @@ struct rdt_resource { | |||
| 359 | unsigned long fflags; | 431 | unsigned long fflags; |
| 360 | }; | 432 | }; |
| 361 | 433 | ||
| 362 | int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); | 434 | int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d); |
| 363 | int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); | 435 | int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d); |
| 364 | 436 | ||
| 365 | extern struct mutex rdtgroup_mutex; | 437 | extern struct mutex rdtgroup_mutex; |
| 366 | 438 | ||
| @@ -368,7 +440,7 @@ extern struct rdt_resource rdt_resources_all[]; | |||
| 368 | extern struct rdtgroup rdtgroup_default; | 440 | extern struct rdtgroup rdtgroup_default; |
| 369 | DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); | 441 | DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); |
| 370 | 442 | ||
| 371 | int __init rdtgroup_init(void); | 443 | extern struct dentry *debugfs_resctrl; |
| 372 | 444 | ||
| 373 | enum { | 445 | enum { |
| 374 | RDT_RESOURCE_L3, | 446 | RDT_RESOURCE_L3, |
| @@ -439,13 +511,32 @@ void rdt_last_cmd_printf(const char *fmt, ...); | |||
| 439 | void rdt_ctrl_update(void *arg); | 511 | void rdt_ctrl_update(void *arg); |
| 440 | struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); | 512 | struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); |
| 441 | void rdtgroup_kn_unlock(struct kernfs_node *kn); | 513 | void rdtgroup_kn_unlock(struct kernfs_node *kn); |
| 514 | int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); | ||
| 515 | int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, | ||
| 516 | umode_t mask); | ||
| 442 | struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, | 517 | struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, |
| 443 | struct list_head **pos); | 518 | struct list_head **pos); |
| 444 | ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, | 519 | ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, |
| 445 | char *buf, size_t nbytes, loff_t off); | 520 | char *buf, size_t nbytes, loff_t off); |
| 446 | int rdtgroup_schemata_show(struct kernfs_open_file *of, | 521 | int rdtgroup_schemata_show(struct kernfs_open_file *of, |
| 447 | struct seq_file *s, void *v); | 522 | struct seq_file *s, void *v); |
| 523 | bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, | ||
| 524 | u32 _cbm, int closid, bool exclusive); | ||
| 525 | unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, | ||
| 526 | u32 cbm); | ||
| 527 | enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); | ||
| 528 | int rdtgroup_tasks_assigned(struct rdtgroup *r); | ||
| 529 | int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); | ||
| 530 | int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); | ||
| 531 | bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm); | ||
| 532 | bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d); | ||
| 533 | int rdt_pseudo_lock_init(void); | ||
| 534 | void rdt_pseudo_lock_release(void); | ||
| 535 | int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); | ||
| 536 | void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); | ||
| 448 | struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); | 537 | struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); |
| 538 | int update_domains(struct rdt_resource *r, int closid); | ||
| 539 | void closid_free(int closid); | ||
| 449 | int alloc_rmid(void); | 540 | int alloc_rmid(void); |
| 450 | void free_rmid(u32 rmid); | 541 | void free_rmid(u32 rmid); |
| 451 | int rdt_get_mon_l3_config(struct rdt_resource *r); | 542 | int rdt_get_mon_l3_config(struct rdt_resource *r); |
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 116d57b248d3..af358ca05160 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | |||
| @@ -64,9 +64,10 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) | |||
| 64 | return true; | 64 | return true; |
| 65 | } | 65 | } |
| 66 | 66 | ||
| 67 | int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) | 67 | int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d) |
| 68 | { | 68 | { |
| 69 | unsigned long data; | 69 | unsigned long data; |
| 70 | char *buf = _buf; | ||
| 70 | 71 | ||
| 71 | if (d->have_new_ctrl) { | 72 | if (d->have_new_ctrl) { |
| 72 | rdt_last_cmd_printf("duplicate domain %d\n", d->id); | 73 | rdt_last_cmd_printf("duplicate domain %d\n", d->id); |
| @@ -87,7 +88,7 @@ int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) | |||
| 87 | * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). | 88 | * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). |
| 88 | * Additionally Haswell requires at least two bits set. | 89 | * Additionally Haswell requires at least two bits set. |
| 89 | */ | 90 | */ |
| 90 | static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) | 91 | static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) |
| 91 | { | 92 | { |
| 92 | unsigned long first_bit, zero_bit, val; | 93 | unsigned long first_bit, zero_bit, val; |
| 93 | unsigned int cbm_len = r->cache.cbm_len; | 94 | unsigned int cbm_len = r->cache.cbm_len; |
| @@ -122,22 +123,64 @@ static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) | |||
| 122 | return true; | 123 | return true; |
| 123 | } | 124 | } |
| 124 | 125 | ||
| 126 | struct rdt_cbm_parse_data { | ||
| 127 | struct rdtgroup *rdtgrp; | ||
| 128 | char *buf; | ||
| 129 | }; | ||
| 130 | |||
| 125 | /* | 131 | /* |
| 126 | * Read one cache bit mask (hex). Check that it is valid for the current | 132 | * Read one cache bit mask (hex). Check that it is valid for the current |
| 127 | * resource type. | 133 | * resource type. |
| 128 | */ | 134 | */ |
| 129 | int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) | 135 | int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) |
| 130 | { | 136 | { |
| 131 | unsigned long data; | 137 | struct rdt_cbm_parse_data *data = _data; |
| 138 | struct rdtgroup *rdtgrp = data->rdtgrp; | ||
| 139 | u32 cbm_val; | ||
| 132 | 140 | ||
| 133 | if (d->have_new_ctrl) { | 141 | if (d->have_new_ctrl) { |
| 134 | rdt_last_cmd_printf("duplicate domain %d\n", d->id); | 142 | rdt_last_cmd_printf("duplicate domain %d\n", d->id); |
| 135 | return -EINVAL; | 143 | return -EINVAL; |
| 136 | } | 144 | } |
| 137 | 145 | ||
| 138 | if(!cbm_validate(buf, &data, r)) | 146 | /* |
| 147 | * Cannot set up more than one pseudo-locked region in a cache | ||
| 148 | * hierarchy. | ||
| 149 | */ | ||
| 150 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && | ||
| 151 | rdtgroup_pseudo_locked_in_hierarchy(d)) { | ||
| 152 | rdt_last_cmd_printf("pseudo-locked region in hierarchy\n"); | ||
| 139 | return -EINVAL; | 153 | return -EINVAL; |
| 140 | d->new_ctrl = data; | 154 | } |
| 155 | |||
| 156 | if (!cbm_validate(data->buf, &cbm_val, r)) | ||
| 157 | return -EINVAL; | ||
| 158 | |||
| 159 | if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || | ||
| 160 | rdtgrp->mode == RDT_MODE_SHAREABLE) && | ||
| 161 | rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { | ||
| 162 | rdt_last_cmd_printf("CBM overlaps with pseudo-locked region\n"); | ||
| 163 | return -EINVAL; | ||
| 164 | } | ||
| 165 | |||
| 166 | /* | ||
| 167 | * The CBM may not overlap with the CBM of another closid if | ||
| 168 | * either is exclusive. | ||
| 169 | */ | ||
| 170 | if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, true)) { | ||
| 171 | rdt_last_cmd_printf("overlaps with exclusive group\n"); | ||
| 172 | return -EINVAL; | ||
| 173 | } | ||
| 174 | |||
| 175 | if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, false)) { | ||
| 176 | if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || | ||
| 177 | rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { | ||
| 178 | rdt_last_cmd_printf("overlaps with other group\n"); | ||
| 179 | return -EINVAL; | ||
| 180 | } | ||
| 181 | } | ||
| 182 | |||
| 183 | d->new_ctrl = cbm_val; | ||
| 141 | d->have_new_ctrl = true; | 184 | d->have_new_ctrl = true; |
| 142 | 185 | ||
| 143 | return 0; | 186 | return 0; |
| @@ -149,8 +192,10 @@ int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) | |||
| 149 | * separated by ";". The "id" is in decimal, and must match one of | 192 | * separated by ";". The "id" is in decimal, and must match one of |
| 150 | * the "id"s for this resource. | 193 | * the "id"s for this resource. |
| 151 | */ | 194 | */ |
| 152 | static int parse_line(char *line, struct rdt_resource *r) | 195 | static int parse_line(char *line, struct rdt_resource *r, |
| 196 | struct rdtgroup *rdtgrp) | ||
| 153 | { | 197 | { |
| 198 | struct rdt_cbm_parse_data data; | ||
| 154 | char *dom = NULL, *id; | 199 | char *dom = NULL, *id; |
| 155 | struct rdt_domain *d; | 200 | struct rdt_domain *d; |
| 156 | unsigned long dom_id; | 201 | unsigned long dom_id; |
| @@ -167,15 +212,32 @@ next: | |||
| 167 | dom = strim(dom); | 212 | dom = strim(dom); |
| 168 | list_for_each_entry(d, &r->domains, list) { | 213 | list_for_each_entry(d, &r->domains, list) { |
| 169 | if (d->id == dom_id) { | 214 | if (d->id == dom_id) { |
| 170 | if (r->parse_ctrlval(dom, r, d)) | 215 | data.buf = dom; |
| 216 | data.rdtgrp = rdtgrp; | ||
| 217 | if (r->parse_ctrlval(&data, r, d)) | ||
| 171 | return -EINVAL; | 218 | return -EINVAL; |
| 219 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { | ||
| 220 | /* | ||
| 221 | * In pseudo-locking setup mode and just | ||
| 222 | * parsed a valid CBM that should be | ||
| 223 | * pseudo-locked. Only one locked region per | ||
| 224 | * resource group and domain so just do | ||
| 225 | * the required initialization for single | ||
| 226 | * region and return. | ||
| 227 | */ | ||
| 228 | rdtgrp->plr->r = r; | ||
| 229 | rdtgrp->plr->d = d; | ||
| 230 | rdtgrp->plr->cbm = d->new_ctrl; | ||
| 231 | d->plr = rdtgrp->plr; | ||
| 232 | return 0; | ||
| 233 | } | ||
| 172 | goto next; | 234 | goto next; |
| 173 | } | 235 | } |
| 174 | } | 236 | } |
| 175 | return -EINVAL; | 237 | return -EINVAL; |
| 176 | } | 238 | } |
| 177 | 239 | ||
| 178 | static int update_domains(struct rdt_resource *r, int closid) | 240 | int update_domains(struct rdt_resource *r, int closid) |
| 179 | { | 241 | { |
| 180 | struct msr_param msr_param; | 242 | struct msr_param msr_param; |
| 181 | cpumask_var_t cpu_mask; | 243 | cpumask_var_t cpu_mask; |
| @@ -220,13 +282,14 @@ done: | |||
| 220 | return 0; | 282 | return 0; |
| 221 | } | 283 | } |
| 222 | 284 | ||
| 223 | static int rdtgroup_parse_resource(char *resname, char *tok, int closid) | 285 | static int rdtgroup_parse_resource(char *resname, char *tok, |
| 286 | struct rdtgroup *rdtgrp) | ||
| 224 | { | 287 | { |
| 225 | struct rdt_resource *r; | 288 | struct rdt_resource *r; |
| 226 | 289 | ||
| 227 | for_each_alloc_enabled_rdt_resource(r) { | 290 | for_each_alloc_enabled_rdt_resource(r) { |
| 228 | if (!strcmp(resname, r->name) && closid < r->num_closid) | 291 | if (!strcmp(resname, r->name) && rdtgrp->closid < r->num_closid) |
| 229 | return parse_line(tok, r); | 292 | return parse_line(tok, r, rdtgrp); |
| 230 | } | 293 | } |
| 231 | rdt_last_cmd_printf("unknown/unsupported resource name '%s'\n", resname); | 294 | rdt_last_cmd_printf("unknown/unsupported resource name '%s'\n", resname); |
| 232 | return -EINVAL; | 295 | return -EINVAL; |
| @@ -239,7 +302,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, | |||
| 239 | struct rdt_domain *dom; | 302 | struct rdt_domain *dom; |
| 240 | struct rdt_resource *r; | 303 | struct rdt_resource *r; |
| 241 | char *tok, *resname; | 304 | char *tok, *resname; |
| 242 | int closid, ret = 0; | 305 | int ret = 0; |
| 243 | 306 | ||
| 244 | /* Valid input requires a trailing newline */ | 307 | /* Valid input requires a trailing newline */ |
| 245 | if (nbytes == 0 || buf[nbytes - 1] != '\n') | 308 | if (nbytes == 0 || buf[nbytes - 1] != '\n') |
| @@ -253,7 +316,15 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, | |||
| 253 | } | 316 | } |
| 254 | rdt_last_cmd_clear(); | 317 | rdt_last_cmd_clear(); |
| 255 | 318 | ||
| 256 | closid = rdtgrp->closid; | 319 | /* |
| 320 | * No changes to pseudo-locked region allowed. It has to be removed | ||
| 321 | * and re-created instead. | ||
| 322 | */ | ||
| 323 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { | ||
| 324 | ret = -EINVAL; | ||
| 325 | rdt_last_cmd_puts("resource group is pseudo-locked\n"); | ||
| 326 | goto out; | ||
| 327 | } | ||
| 257 | 328 | ||
| 258 | for_each_alloc_enabled_rdt_resource(r) { | 329 | for_each_alloc_enabled_rdt_resource(r) { |
| 259 | list_for_each_entry(dom, &r->domains, list) | 330 | list_for_each_entry(dom, &r->domains, list) |
| @@ -272,17 +343,27 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, | |||
| 272 | ret = -EINVAL; | 343 | ret = -EINVAL; |
| 273 | goto out; | 344 | goto out; |
| 274 | } | 345 | } |
| 275 | ret = rdtgroup_parse_resource(resname, tok, closid); | 346 | ret = rdtgroup_parse_resource(resname, tok, rdtgrp); |
| 276 | if (ret) | 347 | if (ret) |
| 277 | goto out; | 348 | goto out; |
| 278 | } | 349 | } |
| 279 | 350 | ||
| 280 | for_each_alloc_enabled_rdt_resource(r) { | 351 | for_each_alloc_enabled_rdt_resource(r) { |
| 281 | ret = update_domains(r, closid); | 352 | ret = update_domains(r, rdtgrp->closid); |
| 282 | if (ret) | 353 | if (ret) |
| 283 | goto out; | 354 | goto out; |
| 284 | } | 355 | } |
| 285 | 356 | ||
| 357 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { | ||
| 358 | /* | ||
| 359 | * If pseudo-locking fails we keep the resource group in | ||
| 360 | * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service | ||
| 361 | * active and updated for just the domain the pseudo-locked | ||
| 362 | * region was requested for. | ||
| 363 | */ | ||
| 364 | ret = rdtgroup_pseudo_lock_create(rdtgrp); | ||
| 365 | } | ||
| 366 | |||
| 286 | out: | 367 | out: |
| 287 | rdtgroup_kn_unlock(of->kn); | 368 | rdtgroup_kn_unlock(of->kn); |
| 288 | return ret ?: nbytes; | 369 | return ret ?: nbytes; |
| @@ -318,10 +399,18 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, | |||
| 318 | 399 | ||
| 319 | rdtgrp = rdtgroup_kn_lock_live(of->kn); | 400 | rdtgrp = rdtgroup_kn_lock_live(of->kn); |
| 320 | if (rdtgrp) { | 401 | if (rdtgrp) { |
| 321 | closid = rdtgrp->closid; | 402 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { |
| 322 | for_each_alloc_enabled_rdt_resource(r) { | 403 | for_each_alloc_enabled_rdt_resource(r) |
| 323 | if (closid < r->num_closid) | 404 | seq_printf(s, "%s:uninitialized\n", r->name); |
| 324 | show_doms(s, r, closid); | 405 | } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { |
| 406 | seq_printf(s, "%s:%d=%x\n", rdtgrp->plr->r->name, | ||
| 407 | rdtgrp->plr->d->id, rdtgrp->plr->cbm); | ||
| 408 | } else { | ||
| 409 | closid = rdtgrp->closid; | ||
| 410 | for_each_alloc_enabled_rdt_resource(r) { | ||
| 411 | if (closid < r->num_closid) | ||
| 412 | show_doms(s, r, closid); | ||
| 413 | } | ||
| 325 | } | 414 | } |
| 326 | } else { | 415 | } else { |
| 327 | ret = -ENOENT; | 416 | ret = -ENOENT; |
diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c new file mode 100644 index 000000000000..40f3903ae5d9 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | |||
| @@ -0,0 +1,1522 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | ||
| 2 | /* | ||
| 3 | * Resource Director Technology (RDT) | ||
| 4 | * | ||
| 5 | * Pseudo-locking support built on top of Cache Allocation Technology (CAT) | ||
| 6 | * | ||
| 7 | * Copyright (C) 2018 Intel Corporation | ||
| 8 | * | ||
| 9 | * Author: Reinette Chatre <reinette.chatre@intel.com> | ||
| 10 | */ | ||
| 11 | |||
| 12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
| 13 | |||
| 14 | #include <linux/cacheinfo.h> | ||
| 15 | #include <linux/cpu.h> | ||
| 16 | #include <linux/cpumask.h> | ||
| 17 | #include <linux/debugfs.h> | ||
| 18 | #include <linux/kthread.h> | ||
| 19 | #include <linux/mman.h> | ||
| 20 | #include <linux/pm_qos.h> | ||
| 21 | #include <linux/slab.h> | ||
| 22 | #include <linux/uaccess.h> | ||
| 23 | |||
| 24 | #include <asm/cacheflush.h> | ||
| 25 | #include <asm/intel-family.h> | ||
| 26 | #include <asm/intel_rdt_sched.h> | ||
| 27 | #include <asm/perf_event.h> | ||
| 28 | |||
| 29 | #include "intel_rdt.h" | ||
| 30 | |||
| 31 | #define CREATE_TRACE_POINTS | ||
| 32 | #include "intel_rdt_pseudo_lock_event.h" | ||
| 33 | |||
| 34 | /* | ||
| 35 | * MSR_MISC_FEATURE_CONTROL register enables the modification of hardware | ||
| 36 | * prefetcher state. Details about this register can be found in the MSR | ||
| 37 | * tables for specific platforms found in Intel's SDM. | ||
| 38 | */ | ||
| 39 | #define MSR_MISC_FEATURE_CONTROL 0x000001a4 | ||
| 40 | |||
| 41 | /* | ||
| 42 | * The bits needed to disable hardware prefetching varies based on the | ||
| 43 | * platform. During initialization we will discover which bits to use. | ||
| 44 | */ | ||
| 45 | static u64 prefetch_disable_bits; | ||
| 46 | |||
| 47 | /* | ||
| 48 | * Major number assigned to and shared by all devices exposing | ||
| 49 | * pseudo-locked regions. | ||
| 50 | */ | ||
| 51 | static unsigned int pseudo_lock_major; | ||
| 52 | static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); | ||
| 53 | static struct class *pseudo_lock_class; | ||
| 54 | |||
| 55 | /** | ||
| 56 | * get_prefetch_disable_bits - prefetch disable bits of supported platforms | ||
| 57 | * | ||
| 58 | * Capture the list of platforms that have been validated to support | ||
| 59 | * pseudo-locking. This includes testing to ensure pseudo-locked regions | ||
| 60 | * with low cache miss rates can be created under variety of load conditions | ||
| 61 | * as well as that these pseudo-locked regions can maintain their low cache | ||
| 62 | * miss rates under variety of load conditions for significant lengths of time. | ||
| 63 | * | ||
| 64 | * After a platform has been validated to support pseudo-locking its | ||
| 65 | * hardware prefetch disable bits are included here as they are documented | ||
| 66 | * in the SDM. | ||
| 67 | * | ||
| 68 | * When adding a platform here also add support for its cache events to | ||
| 69 | * measure_cycles_perf_fn() | ||
| 70 | * | ||
| 71 | * Return: | ||
| 72 | * If platform is supported, the bits to disable hardware prefetchers, 0 | ||
| 73 | * if platform is not supported. | ||
| 74 | */ | ||
| 75 | static u64 get_prefetch_disable_bits(void) | ||
| 76 | { | ||
| 77 | if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || | ||
| 78 | boot_cpu_data.x86 != 6) | ||
| 79 | return 0; | ||
| 80 | |||
| 81 | switch (boot_cpu_data.x86_model) { | ||
| 82 | case INTEL_FAM6_BROADWELL_X: | ||
| 83 | /* | ||
| 84 | * SDM defines bits of MSR_MISC_FEATURE_CONTROL register | ||
| 85 | * as: | ||
| 86 | * 0 L2 Hardware Prefetcher Disable (R/W) | ||
| 87 | * 1 L2 Adjacent Cache Line Prefetcher Disable (R/W) | ||
| 88 | * 2 DCU Hardware Prefetcher Disable (R/W) | ||
| 89 | * 3 DCU IP Prefetcher Disable (R/W) | ||
| 90 | * 63:4 Reserved | ||
| 91 | */ | ||
| 92 | return 0xF; | ||
| 93 | case INTEL_FAM6_ATOM_GOLDMONT: | ||
| 94 | case INTEL_FAM6_ATOM_GEMINI_LAKE: | ||
| 95 | /* | ||
| 96 | * SDM defines bits of MSR_MISC_FEATURE_CONTROL register | ||
| 97 | * as: | ||
| 98 | * 0 L2 Hardware Prefetcher Disable (R/W) | ||
| 99 | * 1 Reserved | ||
| 100 | * 2 DCU Hardware Prefetcher Disable (R/W) | ||
| 101 | * 63:3 Reserved | ||
| 102 | */ | ||
| 103 | return 0x5; | ||
| 104 | } | ||
| 105 | |||
| 106 | return 0; | ||
| 107 | } | ||
| 108 | |||
| 109 | /* | ||
| 110 | * Helper to write 64bit value to MSR without tracing. Used when | ||
| 111 | * use of the cache should be restricted and use of registers used | ||
| 112 | * for local variables avoided. | ||
| 113 | */ | ||
| 114 | static inline void pseudo_wrmsrl_notrace(unsigned int msr, u64 val) | ||
| 115 | { | ||
| 116 | __wrmsr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32)); | ||
| 117 | } | ||
| 118 | |||
| 119 | /** | ||
| 120 | * pseudo_lock_minor_get - Obtain available minor number | ||
| 121 | * @minor: Pointer to where new minor number will be stored | ||
| 122 | * | ||
| 123 | * A bitmask is used to track available minor numbers. Here the next free | ||
| 124 | * minor number is marked as unavailable and returned. | ||
| 125 | * | ||
| 126 | * Return: 0 on success, <0 on failure. | ||
| 127 | */ | ||
| 128 | static int pseudo_lock_minor_get(unsigned int *minor) | ||
| 129 | { | ||
| 130 | unsigned long first_bit; | ||
| 131 | |||
| 132 | first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); | ||
| 133 | |||
| 134 | if (first_bit == MINORBITS) | ||
| 135 | return -ENOSPC; | ||
| 136 | |||
| 137 | __clear_bit(first_bit, &pseudo_lock_minor_avail); | ||
| 138 | *minor = first_bit; | ||
| 139 | |||
| 140 | return 0; | ||
| 141 | } | ||
| 142 | |||
| 143 | /** | ||
| 144 | * pseudo_lock_minor_release - Return minor number to available | ||
| 145 | * @minor: The minor number made available | ||
| 146 | */ | ||
| 147 | static void pseudo_lock_minor_release(unsigned int minor) | ||
| 148 | { | ||
| 149 | __set_bit(minor, &pseudo_lock_minor_avail); | ||
| 150 | } | ||
| 151 | |||
| 152 | /** | ||
| 153 | * region_find_by_minor - Locate a pseudo-lock region by inode minor number | ||
| 154 | * @minor: The minor number of the device representing pseudo-locked region | ||
| 155 | * | ||
| 156 | * When the character device is accessed we need to determine which | ||
| 157 | * pseudo-locked region it belongs to. This is done by matching the minor | ||
| 158 | * number of the device to the pseudo-locked region it belongs. | ||
| 159 | * | ||
| 160 | * Minor numbers are assigned at the time a pseudo-locked region is associated | ||
| 161 | * with a cache instance. | ||
| 162 | * | ||
| 163 | * Return: On success return pointer to resource group owning the pseudo-locked | ||
| 164 | * region, NULL on failure. | ||
| 165 | */ | ||
| 166 | static struct rdtgroup *region_find_by_minor(unsigned int minor) | ||
| 167 | { | ||
| 168 | struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; | ||
| 169 | |||
| 170 | list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { | ||
| 171 | if (rdtgrp->plr && rdtgrp->plr->minor == minor) { | ||
| 172 | rdtgrp_match = rdtgrp; | ||
| 173 | break; | ||
| 174 | } | ||
| 175 | } | ||
| 176 | return rdtgrp_match; | ||
| 177 | } | ||
| 178 | |||
| 179 | /** | ||
| 180 | * pseudo_lock_pm_req - A power management QoS request list entry | ||
| 181 | * @list: Entry within the @pm_reqs list for a pseudo-locked region | ||
| 182 | * @req: PM QoS request | ||
| 183 | */ | ||
| 184 | struct pseudo_lock_pm_req { | ||
| 185 | struct list_head list; | ||
| 186 | struct dev_pm_qos_request req; | ||
| 187 | }; | ||
| 188 | |||
| 189 | static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) | ||
| 190 | { | ||
| 191 | struct pseudo_lock_pm_req *pm_req, *next; | ||
| 192 | |||
| 193 | list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { | ||
| 194 | dev_pm_qos_remove_request(&pm_req->req); | ||
| 195 | list_del(&pm_req->list); | ||
| 196 | kfree(pm_req); | ||
| 197 | } | ||
| 198 | } | ||
| 199 | |||
| 200 | /** | ||
| 201 | * pseudo_lock_cstates_constrain - Restrict cores from entering C6 | ||
| 202 | * | ||
| 203 | * To prevent the cache from being affected by power management entering | ||
| 204 | * C6 has to be avoided. This is accomplished by requesting a latency | ||
| 205 | * requirement lower than lowest C6 exit latency of all supported | ||
| 206 | * platforms as found in the cpuidle state tables in the intel_idle driver. | ||
| 207 | * At this time it is possible to do so with a single latency requirement | ||
| 208 | * for all supported platforms. | ||
| 209 | * | ||
| 210 | * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, | ||
| 211 | * the ACPI latencies need to be considered while keeping in mind that C2 | ||
| 212 | * may be set to map to deeper sleep states. In this case the latency | ||
| 213 | * requirement needs to prevent entering C2 also. | ||
| 214 | */ | ||
| 215 | static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) | ||
| 216 | { | ||
| 217 | struct pseudo_lock_pm_req *pm_req; | ||
| 218 | int cpu; | ||
| 219 | int ret; | ||
| 220 | |||
| 221 | for_each_cpu(cpu, &plr->d->cpu_mask) { | ||
| 222 | pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); | ||
| 223 | if (!pm_req) { | ||
| 224 | rdt_last_cmd_puts("fail allocating mem for PM QoS\n"); | ||
| 225 | ret = -ENOMEM; | ||
| 226 | goto out_err; | ||
| 227 | } | ||
| 228 | ret = dev_pm_qos_add_request(get_cpu_device(cpu), | ||
| 229 | &pm_req->req, | ||
| 230 | DEV_PM_QOS_RESUME_LATENCY, | ||
| 231 | 30); | ||
| 232 | if (ret < 0) { | ||
| 233 | rdt_last_cmd_printf("fail to add latency req cpu%d\n", | ||
| 234 | cpu); | ||
| 235 | kfree(pm_req); | ||
| 236 | ret = -1; | ||
| 237 | goto out_err; | ||
| 238 | } | ||
| 239 | list_add(&pm_req->list, &plr->pm_reqs); | ||
| 240 | } | ||
| 241 | |||
| 242 | return 0; | ||
| 243 | |||
| 244 | out_err: | ||
| 245 | pseudo_lock_cstates_relax(plr); | ||
| 246 | return ret; | ||
| 247 | } | ||
| 248 | |||
| 249 | /** | ||
| 250 | * pseudo_lock_region_clear - Reset pseudo-lock region data | ||
| 251 | * @plr: pseudo-lock region | ||
| 252 | * | ||
| 253 | * All content of the pseudo-locked region is reset - any memory allocated | ||
| 254 | * freed. | ||
| 255 | * | ||
| 256 | * Return: void | ||
| 257 | */ | ||
| 258 | static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) | ||
| 259 | { | ||
| 260 | plr->size = 0; | ||
| 261 | plr->line_size = 0; | ||
| 262 | kfree(plr->kmem); | ||
| 263 | plr->kmem = NULL; | ||
| 264 | plr->r = NULL; | ||
| 265 | if (plr->d) | ||
| 266 | plr->d->plr = NULL; | ||
| 267 | plr->d = NULL; | ||
| 268 | plr->cbm = 0; | ||
| 269 | plr->debugfs_dir = NULL; | ||
| 270 | } | ||
| 271 | |||
| 272 | /** | ||
| 273 | * pseudo_lock_region_init - Initialize pseudo-lock region information | ||
| 274 | * @plr: pseudo-lock region | ||
| 275 | * | ||
| 276 | * Called after user provided a schemata to be pseudo-locked. From the | ||
| 277 | * schemata the &struct pseudo_lock_region is on entry already initialized | ||
| 278 | * with the resource, domain, and capacity bitmask. Here the information | ||
| 279 | * required for pseudo-locking is deduced from this data and &struct | ||
| 280 | * pseudo_lock_region initialized further. This information includes: | ||
| 281 | * - size in bytes of the region to be pseudo-locked | ||
| 282 | * - cache line size to know the stride with which data needs to be accessed | ||
| 283 | * to be pseudo-locked | ||
| 284 | * - a cpu associated with the cache instance on which the pseudo-locking | ||
| 285 | * flow can be executed | ||
| 286 | * | ||
| 287 | * Return: 0 on success, <0 on failure. Descriptive error will be written | ||
| 288 | * to last_cmd_status buffer. | ||
| 289 | */ | ||
| 290 | static int pseudo_lock_region_init(struct pseudo_lock_region *plr) | ||
| 291 | { | ||
| 292 | struct cpu_cacheinfo *ci; | ||
| 293 | int ret; | ||
| 294 | int i; | ||
| 295 | |||
| 296 | /* Pick the first cpu we find that is associated with the cache. */ | ||
| 297 | plr->cpu = cpumask_first(&plr->d->cpu_mask); | ||
| 298 | |||
| 299 | if (!cpu_online(plr->cpu)) { | ||
| 300 | rdt_last_cmd_printf("cpu %u associated with cache not online\n", | ||
| 301 | plr->cpu); | ||
| 302 | ret = -ENODEV; | ||
| 303 | goto out_region; | ||
| 304 | } | ||
| 305 | |||
| 306 | ci = get_cpu_cacheinfo(plr->cpu); | ||
| 307 | |||
| 308 | plr->size = rdtgroup_cbm_to_size(plr->r, plr->d, plr->cbm); | ||
| 309 | |||
| 310 | for (i = 0; i < ci->num_leaves; i++) { | ||
| 311 | if (ci->info_list[i].level == plr->r->cache_level) { | ||
| 312 | plr->line_size = ci->info_list[i].coherency_line_size; | ||
| 313 | return 0; | ||
| 314 | } | ||
| 315 | } | ||
| 316 | |||
| 317 | ret = -1; | ||
| 318 | rdt_last_cmd_puts("unable to determine cache line size\n"); | ||
| 319 | out_region: | ||
| 320 | pseudo_lock_region_clear(plr); | ||
| 321 | return ret; | ||
| 322 | } | ||
| 323 | |||
| 324 | /** | ||
| 325 | * pseudo_lock_init - Initialize a pseudo-lock region | ||
| 326 | * @rdtgrp: resource group to which new pseudo-locked region will belong | ||
| 327 | * | ||
| 328 | * A pseudo-locked region is associated with a resource group. When this | ||
| 329 | * association is created the pseudo-locked region is initialized. The | ||
| 330 | * details of the pseudo-locked region are not known at this time so only | ||
| 331 | * allocation is done and association established. | ||
| 332 | * | ||
| 333 | * Return: 0 on success, <0 on failure | ||
| 334 | */ | ||
| 335 | static int pseudo_lock_init(struct rdtgroup *rdtgrp) | ||
| 336 | { | ||
| 337 | struct pseudo_lock_region *plr; | ||
| 338 | |||
| 339 | plr = kzalloc(sizeof(*plr), GFP_KERNEL); | ||
| 340 | if (!plr) | ||
| 341 | return -ENOMEM; | ||
| 342 | |||
| 343 | init_waitqueue_head(&plr->lock_thread_wq); | ||
| 344 | INIT_LIST_HEAD(&plr->pm_reqs); | ||
| 345 | rdtgrp->plr = plr; | ||
| 346 | return 0; | ||
| 347 | } | ||
| 348 | |||
| 349 | /** | ||
| 350 | * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked | ||
| 351 | * @plr: pseudo-lock region | ||
| 352 | * | ||
| 353 | * Initialize the details required to set up the pseudo-locked region and | ||
| 354 | * allocate the contiguous memory that will be pseudo-locked to the cache. | ||
| 355 | * | ||
| 356 | * Return: 0 on success, <0 on failure. Descriptive error will be written | ||
| 357 | * to last_cmd_status buffer. | ||
| 358 | */ | ||
| 359 | static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) | ||
| 360 | { | ||
| 361 | int ret; | ||
| 362 | |||
| 363 | ret = pseudo_lock_region_init(plr); | ||
| 364 | if (ret < 0) | ||
| 365 | return ret; | ||
| 366 | |||
| 367 | /* | ||
| 368 | * We do not yet support contiguous regions larger than | ||
| 369 | * KMALLOC_MAX_SIZE. | ||
| 370 | */ | ||
| 371 | if (plr->size > KMALLOC_MAX_SIZE) { | ||
| 372 | rdt_last_cmd_puts("requested region exceeds maximum size\n"); | ||
| 373 | ret = -E2BIG; | ||
| 374 | goto out_region; | ||
| 375 | } | ||
| 376 | |||
| 377 | plr->kmem = kzalloc(plr->size, GFP_KERNEL); | ||
| 378 | if (!plr->kmem) { | ||
| 379 | rdt_last_cmd_puts("unable to allocate memory\n"); | ||
| 380 | ret = -ENOMEM; | ||
| 381 | goto out_region; | ||
| 382 | } | ||
| 383 | |||
| 384 | ret = 0; | ||
| 385 | goto out; | ||
| 386 | out_region: | ||
| 387 | pseudo_lock_region_clear(plr); | ||
| 388 | out: | ||
| 389 | return ret; | ||
| 390 | } | ||
| 391 | |||
| 392 | /** | ||
| 393 | * pseudo_lock_free - Free a pseudo-locked region | ||
| 394 | * @rdtgrp: resource group to which pseudo-locked region belonged | ||
| 395 | * | ||
| 396 | * The pseudo-locked region's resources have already been released, or not | ||
| 397 | * yet created at this point. Now it can be freed and disassociated from the | ||
| 398 | * resource group. | ||
| 399 | * | ||
| 400 | * Return: void | ||
| 401 | */ | ||
| 402 | static void pseudo_lock_free(struct rdtgroup *rdtgrp) | ||
| 403 | { | ||
| 404 | pseudo_lock_region_clear(rdtgrp->plr); | ||
| 405 | kfree(rdtgrp->plr); | ||
| 406 | rdtgrp->plr = NULL; | ||
| 407 | } | ||
| 408 | |||
/**
 * pseudo_lock_fn - Load kernel memory into cache
 * @_rdtgrp: resource group to which pseudo-lock region belongs
 *
 * This is the core pseudo-locking flow.
 *
 * First we ensure that the kernel memory cannot be found in the cache.
 * Then, while taking care that there will be as little interference as
 * possible, the memory to be loaded is accessed while core is running
 * with class of service set to the bitmask of the pseudo-locked region.
 * After this is complete no future CAT allocations will be allowed to
 * overlap with this bitmask.
 *
 * Local register variables are utilized to ensure that the memory region
 * to be locked is the only memory access made during the critical locking
 * loop.
 *
 * NOTE(review): this is a kthread function; it appears intended to run
 * pinned to a CPU associated with the pseudo-locked region's cache
 * domain — confirm with the caller that creates the thread.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
static int pseudo_lock_fn(void *_rdtgrp)
{
	struct rdtgroup *rdtgrp = _rdtgrp;
	struct pseudo_lock_region *plr = rdtgrp->plr;
	u32 rmid_p, closid_p;
	unsigned long i;
#ifdef CONFIG_KASAN
	/*
	 * The registers used for local register variables are also used
	 * when KASAN is active. When KASAN is active we use a regular
	 * variable to ensure we always use a valid pointer, but the cost
	 * is that this variable will enter the cache through evicting the
	 * memory we are trying to lock into the cache. Thus expect lower
	 * pseudo-locking success rate when KASAN is active.
	 */
	unsigned int line_size;
	unsigned int size;
	void *mem_r;
#else
	/* Pin the loop operands in registers so the critical loop below
	 * makes no stack accesses that could pollute the locked region. */
	register unsigned int line_size asm("esi");
	register unsigned int size asm("edi");
#ifdef CONFIG_X86_64
	register void *mem_r asm("rbx");
#else
	register void *mem_r asm("ebx");
#endif /* CONFIG_X86_64 */
#endif /* CONFIG_KASAN */

	/*
	 * Make sure none of the allocated memory is cached. If it is we
	 * will get a cache hit in below loop from outside of pseudo-locked
	 * region.
	 * wbinvd (as opposed to clflush/clflushopt) is required to
	 * increase likelihood that allocated cache portion will be filled
	 * with associated memory.
	 */
	native_wbinvd();

	/*
	 * Always called with interrupts enabled. By disabling interrupts
	 * ensure that we will not be preempted during this critical section.
	 */
	local_irq_disable();

	/*
	 * Call wrmsr and rdmsr as directly as possible to avoid tracing
	 * clobbering local register variables or affecting cache accesses.
	 *
	 * Disable the hardware prefetcher so that when the end of the memory
	 * being pseudo-locked is reached the hardware will not read beyond
	 * the buffer and evict pseudo-locked memory read earlier from the
	 * cache.
	 */
	__wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
	/* Remember current CLOSID/RMID so they can be restored afterwards. */
	closid_p = this_cpu_read(pqr_state.cur_closid);
	rmid_p = this_cpu_read(pqr_state.cur_rmid);
	mem_r = plr->kmem;
	size = plr->size;
	line_size = plr->line_size;
	/*
	 * Critical section begin: start by writing the closid associated
	 * with the capacity bitmask of the cache region being
	 * pseudo-locked followed by reading of kernel memory to load it
	 * into the cache.
	 */
	__wrmsr(IA32_PQR_ASSOC, rmid_p, rdtgrp->closid);
	/*
	 * Cache was flushed earlier. Now access kernel memory to read it
	 * into cache region associated with just activated plr->closid.
	 * Loop over data twice:
	 * - In first loop the cache region is shared with the page walker
	 *   as it populates the paging structure caches (including TLB).
	 * - In the second loop the paging structure caches are used and
	 *   cache region is populated with the memory being referenced.
	 */
	for (i = 0; i < size; i += PAGE_SIZE) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			:
			: "r" (mem_r), "r" (i)
			: "%eax", "memory");
	}
	for (i = 0; i < size; i += line_size) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			:
			: "r" (mem_r), "r" (i)
			: "%eax", "memory");
	}
	/*
	 * Critical section end: restore closid with capacity bitmask that
	 * does not overlap with pseudo-locked region.
	 */
	__wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p);

	/* Re-enable the hardware prefetcher(s) */
	wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
	local_irq_enable();

	/* Signal completion to the waiter in pseudo-lock setup. */
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}
| 539 | |||
| 540 | /** | ||
| 541 | * rdtgroup_monitor_in_progress - Test if monitoring in progress | ||
| 542 | * @r: resource group being queried | ||
| 543 | * | ||
| 544 | * Return: 1 if monitor groups have been created for this resource | ||
| 545 | * group, 0 otherwise. | ||
| 546 | */ | ||
| 547 | static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) | ||
| 548 | { | ||
| 549 | return !list_empty(&rdtgrp->mon.crdtgrp_list); | ||
| 550 | } | ||
| 551 | |||
| 552 | /** | ||
| 553 | * rdtgroup_locksetup_user_restrict - Restrict user access to group | ||
| 554 | * @rdtgrp: resource group needing access restricted | ||
| 555 | * | ||
| 556 | * A resource group used for cache pseudo-locking cannot have cpus or tasks | ||
| 557 | * assigned to it. This is communicated to the user by restricting access | ||
| 558 | * to all the files that can be used to make such changes. | ||
| 559 | * | ||
| 560 | * Permissions restored with rdtgroup_locksetup_user_restore() | ||
| 561 | * | ||
| 562 | * Return: 0 on success, <0 on failure. If a failure occurs during the | ||
| 563 | * restriction of access an attempt will be made to restore permissions but | ||
| 564 | * the state of the mode of these files will be uncertain when a failure | ||
| 565 | * occurs. | ||
| 566 | */ | ||
| 567 | static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) | ||
| 568 | { | ||
| 569 | int ret; | ||
| 570 | |||
| 571 | ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); | ||
| 572 | if (ret) | ||
| 573 | return ret; | ||
| 574 | |||
| 575 | ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); | ||
| 576 | if (ret) | ||
| 577 | goto err_tasks; | ||
| 578 | |||
| 579 | ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); | ||
| 580 | if (ret) | ||
| 581 | goto err_cpus; | ||
| 582 | |||
| 583 | if (rdt_mon_capable) { | ||
| 584 | ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); | ||
| 585 | if (ret) | ||
| 586 | goto err_cpus_list; | ||
| 587 | } | ||
| 588 | |||
| 589 | ret = 0; | ||
| 590 | goto out; | ||
| 591 | |||
| 592 | err_cpus_list: | ||
| 593 | rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); | ||
| 594 | err_cpus: | ||
| 595 | rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); | ||
| 596 | err_tasks: | ||
| 597 | rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); | ||
| 598 | out: | ||
| 599 | return ret; | ||
| 600 | } | ||
| 601 | |||
| 602 | /** | ||
| 603 | * rdtgroup_locksetup_user_restore - Restore user access to group | ||
| 604 | * @rdtgrp: resource group needing access restored | ||
| 605 | * | ||
| 606 | * Restore all file access previously removed using | ||
| 607 | * rdtgroup_locksetup_user_restrict() | ||
| 608 | * | ||
| 609 | * Return: 0 on success, <0 on failure. If a failure occurs during the | ||
| 610 | * restoration of access an attempt will be made to restrict permissions | ||
| 611 | * again but the state of the mode of these files will be uncertain when | ||
| 612 | * a failure occurs. | ||
| 613 | */ | ||
| 614 | static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) | ||
| 615 | { | ||
| 616 | int ret; | ||
| 617 | |||
| 618 | ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); | ||
| 619 | if (ret) | ||
| 620 | return ret; | ||
| 621 | |||
| 622 | ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); | ||
| 623 | if (ret) | ||
| 624 | goto err_tasks; | ||
| 625 | |||
| 626 | ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); | ||
| 627 | if (ret) | ||
| 628 | goto err_cpus; | ||
| 629 | |||
| 630 | if (rdt_mon_capable) { | ||
| 631 | ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); | ||
| 632 | if (ret) | ||
| 633 | goto err_cpus_list; | ||
| 634 | } | ||
| 635 | |||
| 636 | ret = 0; | ||
| 637 | goto out; | ||
| 638 | |||
| 639 | err_cpus_list: | ||
| 640 | rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); | ||
| 641 | err_cpus: | ||
| 642 | rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); | ||
| 643 | err_tasks: | ||
| 644 | rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); | ||
| 645 | out: | ||
| 646 | return ret; | ||
| 647 | } | ||
| 648 | |||
| 649 | /** | ||
| 650 | * rdtgroup_locksetup_enter - Resource group enters locksetup mode | ||
| 651 | * @rdtgrp: resource group requested to enter locksetup mode | ||
| 652 | * | ||
| 653 | * A resource group enters locksetup mode to reflect that it would be used | ||
| 654 | * to represent a pseudo-locked region and is in the process of being set | ||
| 655 | * up to do so. A resource group used for a pseudo-locked region would | ||
| 656 | * lose the closid associated with it so we cannot allow it to have any | ||
| 657 | * tasks or cpus assigned nor permit tasks or cpus to be assigned in the | ||
| 658 | * future. Monitoring of a pseudo-locked region is not allowed either. | ||
| 659 | * | ||
| 660 | * The above and more restrictions on a pseudo-locked region are checked | ||
| 661 | * for and enforced before the resource group enters the locksetup mode. | ||
| 662 | * | ||
| 663 | * Returns: 0 if the resource group successfully entered locksetup mode, <0 | ||
| 664 | * on failure. On failure the last_cmd_status buffer is updated with text to | ||
| 665 | * communicate details of failure to the user. | ||
| 666 | */ | ||
| 667 | int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) | ||
| 668 | { | ||
| 669 | int ret; | ||
| 670 | |||
| 671 | /* | ||
| 672 | * The default resource group can neither be removed nor lose the | ||
| 673 | * default closid associated with it. | ||
| 674 | */ | ||
| 675 | if (rdtgrp == &rdtgroup_default) { | ||
| 676 | rdt_last_cmd_puts("cannot pseudo-lock default group\n"); | ||
| 677 | return -EINVAL; | ||
| 678 | } | ||
| 679 | |||
| 680 | /* | ||
| 681 | * Cache Pseudo-locking not supported when CDP is enabled. | ||
| 682 | * | ||
| 683 | * Some things to consider if you would like to enable this | ||
| 684 | * support (using L3 CDP as example): | ||
| 685 | * - When CDP is enabled two separate resources are exposed, | ||
| 686 | * L3DATA and L3CODE, but they are actually on the same cache. | ||
| 687 | * The implication for pseudo-locking is that if a | ||
| 688 | * pseudo-locked region is created on a domain of one | ||
| 689 | * resource (eg. L3CODE), then a pseudo-locked region cannot | ||
| 690 | * be created on that same domain of the other resource | ||
| 691 | * (eg. L3DATA). This is because the creation of a | ||
| 692 | * pseudo-locked region involves a call to wbinvd that will | ||
| 693 | * affect all cache allocations on particular domain. | ||
| 694 | * - Considering the previous, it may be possible to only | ||
| 695 | * expose one of the CDP resources to pseudo-locking and | ||
| 696 | * hide the other. For example, we could consider to only | ||
| 697 | * expose L3DATA and since the L3 cache is unified it is | ||
| 698 | * still possible to place instructions there are execute it. | ||
| 699 | * - If only one region is exposed to pseudo-locking we should | ||
| 700 | * still keep in mind that availability of a portion of cache | ||
| 701 | * for pseudo-locking should take into account both resources. | ||
| 702 | * Similarly, if a pseudo-locked region is created in one | ||
| 703 | * resource, the portion of cache used by it should be made | ||
| 704 | * unavailable to all future allocations from both resources. | ||
| 705 | */ | ||
| 706 | if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled || | ||
| 707 | rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) { | ||
| 708 | rdt_last_cmd_puts("CDP enabled\n"); | ||
| 709 | return -EINVAL; | ||
| 710 | } | ||
| 711 | |||
| 712 | /* | ||
| 713 | * Not knowing the bits to disable prefetching implies that this | ||
| 714 | * platform does not support Cache Pseudo-Locking. | ||
| 715 | */ | ||
| 716 | prefetch_disable_bits = get_prefetch_disable_bits(); | ||
| 717 | if (prefetch_disable_bits == 0) { | ||
| 718 | rdt_last_cmd_puts("pseudo-locking not supported\n"); | ||
| 719 | return -EINVAL; | ||
| 720 | } | ||
| 721 | |||
| 722 | if (rdtgroup_monitor_in_progress(rdtgrp)) { | ||
| 723 | rdt_last_cmd_puts("monitoring in progress\n"); | ||
| 724 | return -EINVAL; | ||
| 725 | } | ||
| 726 | |||
| 727 | if (rdtgroup_tasks_assigned(rdtgrp)) { | ||
| 728 | rdt_last_cmd_puts("tasks assigned to resource group\n"); | ||
| 729 | return -EINVAL; | ||
| 730 | } | ||
| 731 | |||
| 732 | if (!cpumask_empty(&rdtgrp->cpu_mask)) { | ||
| 733 | rdt_last_cmd_puts("CPUs assigned to resource group\n"); | ||
| 734 | return -EINVAL; | ||
| 735 | } | ||
| 736 | |||
| 737 | if (rdtgroup_locksetup_user_restrict(rdtgrp)) { | ||
| 738 | rdt_last_cmd_puts("unable to modify resctrl permissions\n"); | ||
| 739 | return -EIO; | ||
| 740 | } | ||
| 741 | |||
| 742 | ret = pseudo_lock_init(rdtgrp); | ||
| 743 | if (ret) { | ||
| 744 | rdt_last_cmd_puts("unable to init pseudo-lock region\n"); | ||
| 745 | goto out_release; | ||
| 746 | } | ||
| 747 | |||
| 748 | /* | ||
| 749 | * If this system is capable of monitoring a rmid would have been | ||
| 750 | * allocated when the control group was created. This is not needed | ||
| 751 | * anymore when this group would be used for pseudo-locking. This | ||
| 752 | * is safe to call on platforms not capable of monitoring. | ||
| 753 | */ | ||
| 754 | free_rmid(rdtgrp->mon.rmid); | ||
| 755 | |||
| 756 | ret = 0; | ||
| 757 | goto out; | ||
| 758 | |||
| 759 | out_release: | ||
| 760 | rdtgroup_locksetup_user_restore(rdtgrp); | ||
| 761 | out: | ||
| 762 | return ret; | ||
| 763 | } | ||
| 764 | |||
/**
 * rdtgroup_locksetup_exit - resource group exist locksetup mode
 * @rdtgrp: resource group
 *
 * When a resource group exits locksetup mode the earlier restrictions are
 * lifted.
 *
 * Return: 0 on success, <0 on failure
 */
int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
{
	int ret;

	/* Re-allocate the RMID that was released on locksetup entry. */
	if (rdt_mon_capable) {
		ret = alloc_rmid();
		if (ret < 0) {
			rdt_last_cmd_puts("out of RMIDs\n");
			return ret;
		}
		rdtgrp->mon.rmid = ret;
	}

	ret = rdtgroup_locksetup_user_restore(rdtgrp);
	if (ret) {
		/*
		 * NOTE(review): free_rmid() is reached here even when
		 * !rdt_mon_capable, in which case no RMID was allocated
		 * above — presumably free_rmid() is safe on such
		 * platforms (as claimed for the locksetup_enter path);
		 * confirm against free_rmid()'s implementation.
		 */
		free_rmid(rdtgrp->mon.rmid);
		return ret;
	}

	/* Release the (still empty) pseudo-lock region descriptor. */
	pseudo_lock_free(rdtgrp);
	return 0;
}
| 796 | |||
| 797 | /** | ||
| 798 | * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked | ||
| 799 | * @d: RDT domain | ||
| 800 | * @_cbm: CBM to test | ||
| 801 | * | ||
| 802 | * @d represents a cache instance and @_cbm a capacity bitmask that is | ||
| 803 | * considered for it. Determine if @_cbm overlaps with any existing | ||
| 804 | * pseudo-locked region on @d. | ||
| 805 | * | ||
| 806 | * Return: true if @_cbm overlaps with pseudo-locked region on @d, false | ||
| 807 | * otherwise. | ||
| 808 | */ | ||
| 809 | bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm) | ||
| 810 | { | ||
| 811 | unsigned long *cbm = (unsigned long *)&_cbm; | ||
| 812 | unsigned long *cbm_b; | ||
| 813 | unsigned int cbm_len; | ||
| 814 | |||
| 815 | if (d->plr) { | ||
| 816 | cbm_len = d->plr->r->cache.cbm_len; | ||
| 817 | cbm_b = (unsigned long *)&d->plr->cbm; | ||
| 818 | if (bitmap_intersects(cbm, cbm_b, cbm_len)) | ||
| 819 | return true; | ||
| 820 | } | ||
| 821 | return false; | ||
| 822 | } | ||
| 823 | |||
| 824 | /** | ||
| 825 | * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy | ||
| 826 | * @d: RDT domain under test | ||
| 827 | * | ||
| 828 | * The setup of a pseudo-locked region affects all cache instances within | ||
| 829 | * the hierarchy of the region. It is thus essential to know if any | ||
| 830 | * pseudo-locked regions exist within a cache hierarchy to prevent any | ||
| 831 | * attempts to create new pseudo-locked regions in the same hierarchy. | ||
| 832 | * | ||
| 833 | * Return: true if a pseudo-locked region exists in the hierarchy of @d or | ||
| 834 | * if it is not possible to test due to memory allocation issue, | ||
| 835 | * false otherwise. | ||
| 836 | */ | ||
| 837 | bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) | ||
| 838 | { | ||
| 839 | cpumask_var_t cpu_with_psl; | ||
| 840 | struct rdt_resource *r; | ||
| 841 | struct rdt_domain *d_i; | ||
| 842 | bool ret = false; | ||
| 843 | |||
| 844 | if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) | ||
| 845 | return true; | ||
| 846 | |||
| 847 | /* | ||
| 848 | * First determine which cpus have pseudo-locked regions | ||
| 849 | * associated with them. | ||
| 850 | */ | ||
| 851 | for_each_alloc_enabled_rdt_resource(r) { | ||
| 852 | list_for_each_entry(d_i, &r->domains, list) { | ||
| 853 | if (d_i->plr) | ||
| 854 | cpumask_or(cpu_with_psl, cpu_with_psl, | ||
| 855 | &d_i->cpu_mask); | ||
| 856 | } | ||
| 857 | } | ||
| 858 | |||
| 859 | /* | ||
| 860 | * Next test if new pseudo-locked region would intersect with | ||
| 861 | * existing region. | ||
| 862 | */ | ||
| 863 | if (cpumask_intersects(&d->cpu_mask, cpu_with_psl)) | ||
| 864 | ret = true; | ||
| 865 | |||
| 866 | free_cpumask_var(cpu_with_psl); | ||
| 867 | return ret; | ||
| 868 | } | ||
| 869 | |||
/**
 * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory
 * @_plr: pseudo-lock region to measure
 *
 * There is no deterministic way to test if a memory region is cached. One
 * way is to measure how long it takes to read the memory, the speed of
 * access is a good way to learn how close to the cpu the data was. Even
 * more, if the prefetcher is disabled and the memory is read at a stride
 * of half the cache line, then a cache miss will be easy to spot since the
 * read of the first half would be significantly slower than the read of
 * the second half.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
static int measure_cycles_lat_fn(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	unsigned long i;
	u64 start, end;
#ifdef CONFIG_KASAN
	/*
	 * The registers used for local register variables are also used
	 * when KASAN is active. When KASAN is active we use a regular
	 * variable to ensure we always use a valid pointer to access memory.
	 * The cost is that accessing this pointer, which could be in
	 * cache, will be included in the measurement of memory read latency.
	 */
	void *mem_r;
#else
#ifdef CONFIG_X86_64
	register void *mem_r asm("rbx");
#else
	register void *mem_r asm("ebx");
#endif /* CONFIG_X86_64 */
#endif /* CONFIG_KASAN */

	local_irq_disable();
	/*
	 * The wrmsr call may be reordered with the assignment below it.
	 * Call wrmsr as directly as possible to avoid tracing clobbering
	 * local register variable used for memory pointer.
	 */
	__wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
	mem_r = plr->kmem;
	/*
	 * Dummy execute of the time measurement to load the needed
	 * instructions into the L1 instruction cache.
	 */
	start = rdtsc_ordered();
	/*
	 * Stride of 32 bytes: half a cache line on 64-byte-line systems,
	 * matching the technique described in the function header above.
	 * Each iteration emits a latency sample via the tracepoint.
	 */
	for (i = 0; i < plr->size; i += 32) {
		start = rdtsc_ordered();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
		end = rdtsc_ordered();
		trace_pseudo_lock_mem_latency((u32)(end - start));
	}
	/* Re-enable the hardware prefetcher(s) and interrupts. */
	wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
	local_irq_enable();
	/* Signal completion to the waiter in pseudo_lock_measure_cycles(). */
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}
| 934 | |||
/*
 * measure_cycles_perf_fn - Measure cache hits/misses over the pseudo-locked
 * region using raw performance counters.
 *
 * Runs the same read pattern as the locking flow (one read per cache line)
 * with the hardware prefetcher disabled, while model-specific L2 (and on
 * Broadwell also L3) hit/miss events count in fixed PMC slots 0-3. Results
 * are reported through tracepoints. On unsupported CPU models this is a
 * no-op apart from waking the waiter.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
static int measure_cycles_perf_fn(void *_plr)
{
	unsigned long long l3_hits = 0, l3_miss = 0;
	u64 l3_hit_bits = 0, l3_miss_bits = 0;
	struct pseudo_lock_region *plr = _plr;
	unsigned long long l2_hits, l2_miss;
	u64 l2_hit_bits, l2_miss_bits;
	unsigned long i;
#ifdef CONFIG_KASAN
	/*
	 * The registers used for local register variables are also used
	 * when KASAN is active. When KASAN is active we use regular variables
	 * at the cost of including cache access latency to these variables
	 * in the measurements.
	 */
	unsigned int line_size;
	unsigned int size;
	void *mem_r;
#else
	register unsigned int line_size asm("esi");
	register unsigned int size asm("edi");
#ifdef CONFIG_X86_64
	register void *mem_r asm("rbx");
#else
	register void *mem_r asm("ebx");
#endif /* CONFIG_X86_64 */
#endif /* CONFIG_KASAN */

	/*
	 * Non-architectural event for the Goldmont Microarchitecture
	 * from Intel x86 Architecture Software Developer Manual (SDM):
	 * MEM_LOAD_UOPS_RETIRED D1H (event number)
	 * Umask values:
	 *     L1_HIT   01H
	 *     L2_HIT   02H
	 *     L1_MISS  08H
	 *     L2_MISS  10H
	 *
	 * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
	 * has two "no fix" errata associated with it: BDM35 and BDM100. On
	 * this platform we use the following events instead:
	 * L2_RQSTS 24H (Documented in https://download.01.org/perfmon/BDW/)
	 *       REFERENCES FFH
	 *       MISS       3FH
	 * LONGEST_LAT_CACHE 2EH (Documented in SDM)
	 *       REFERENCE  4FH
	 *       MISS       41H
	 */

	/*
	 * Start by setting flags for IA32_PERFEVTSELx:
	 *     OS  (Operating system mode)  0x2
	 *     INT (APIC interrupt enable)  0x10
	 *     EN  (Enable counter)         0x40
	 *
	 * Then add the Umask value and event number to select performance
	 * event.
	 */

	switch (boot_cpu_data.x86_model) {
	case INTEL_FAM6_ATOM_GOLDMONT:
	case INTEL_FAM6_ATOM_GEMINI_LAKE:
		l2_hit_bits = (0x52ULL << 16) | (0x2 << 8) | 0xd1;
		l2_miss_bits = (0x52ULL << 16) | (0x10 << 8) | 0xd1;
		break;
	case INTEL_FAM6_BROADWELL_X:
		/* On BDW the l2_hit_bits count references, not hits */
		l2_hit_bits = (0x52ULL << 16) | (0xff << 8) | 0x24;
		l2_miss_bits = (0x52ULL << 16) | (0x3f << 8) | 0x24;
		/* On BDW the l3_hit_bits count references, not hits */
		l3_hit_bits = (0x52ULL << 16) | (0x4f << 8) | 0x2e;
		l3_miss_bits = (0x52ULL << 16) | (0x41 << 8) | 0x2e;
		break;
	default:
		/* Unsupported model: no counters programmed, just wake waiter. */
		goto out;
	}

	local_irq_disable();
	/*
	 * Call wrmsr directly to avoid the local register variables from
	 * being overwritten due to reordering of their assignment with
	 * the wrmsr calls.
	 */
	__wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
	/* Disable events and reset counters */
	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, 0x0);
	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x0);
	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0, 0x0);
	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 1, 0x0);
	/* l3_hit_bits is only non-zero on the BDW path above. */
	if (l3_hit_bits > 0) {
		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x0);
		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, 0x0);
		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 2, 0x0);
		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 3, 0x0);
	}
	/* Set and enable the L2 counters */
	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, l2_hit_bits);
	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, l2_miss_bits);
	if (l3_hit_bits > 0) {
		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
				      l3_hit_bits);
		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
				      l3_miss_bits);
	}
	mem_r = plr->kmem;
	size = plr->size;
	line_size = plr->line_size;
	/* One read per cache line over the pseudo-locked region. */
	for (i = 0; i < size; i += line_size) {
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	/*
	 * Call wrmsr directly (no tracing) to not influence
	 * the cache access counters as they are disabled.
	 */
	/* Clearing the EN flag (bit 22) stops each counter. */
	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0,
			      l2_hit_bits & ~(0x40ULL << 16));
	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1,
			      l2_miss_bits & ~(0x40ULL << 16));
	if (l3_hit_bits > 0) {
		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
				      l3_hit_bits & ~(0x40ULL << 16));
		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
				      l3_miss_bits & ~(0x40ULL << 16));
	}
	l2_hits = native_read_pmc(0);
	l2_miss = native_read_pmc(1);
	if (l3_hit_bits > 0) {
		l3_hits = native_read_pmc(2);
		l3_miss = native_read_pmc(3);
	}
	wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
	local_irq_enable();
	/*
	 * On BDW we count references and misses, need to adjust. Sometimes
	 * the "hits" counter is a bit more than the references, for
	 * example, x references but x + 1 hits. To not report invalid
	 * hit values in this case we treat that as misses equal to
	 * references.
	 */
	if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
		l2_hits -= (l2_miss > l2_hits ? l2_hits : l2_miss);
	trace_pseudo_lock_l2(l2_hits, l2_miss);
	if (l3_hit_bits > 0) {
		if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
			l3_hits -= (l3_miss > l3_hits ? l3_hits : l3_miss);
		trace_pseudo_lock_l3(l3_hits, l3_miss);
	}

out:
	/* Signal completion to the waiter in pseudo_lock_measure_cycles(). */
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}
| 1091 | |||
/**
 * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
 * @rdtgrp: resource group owning the pseudo-locked region to measure
 * @sel: measurement type: 1 = cycle latency, 2 = perf counter hits/misses
 *
 * The measurement of latency to access a pseudo-locked region should be
 * done from a cpu that is associated with that pseudo-locked region.
 * Determine which cpu is associated with this region and start a thread on
 * that cpu to perform the measurement, wait for that thread to complete.
 *
 * Return: 0 on success, <0 on failure
 */
static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
{
	struct pseudo_lock_region *plr = rdtgrp->plr;
	struct task_struct *thread;
	unsigned int cpu;
	/*
	 * NOTE(review): -1 here doubles as the "invalid @sel" return and
	 * reaches userspace as -EPERM; a dedicated -EINVAL would be
	 * clearer — left unchanged to preserve the user-visible errno.
	 */
	int ret = -1;

	/* Hold cpus_read_lock so the chosen cpu cannot go offline under us. */
	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	/* The group may have been removed while waiting for the locks. */
	if (rdtgrp->flags & RDT_DELETED) {
		ret = -ENODEV;
		goto out;
	}

	plr->thread_done = 0;
	cpu = cpumask_first(&plr->d->cpu_mask);
	if (!cpu_online(cpu)) {
		ret = -ENODEV;
		goto out;
	}

	if (sel == 1)
		thread = kthread_create_on_node(measure_cycles_lat_fn, plr,
						cpu_to_node(cpu),
						"pseudo_lock_measure/%u",
						cpu);
	else if (sel == 2)
		thread = kthread_create_on_node(measure_cycles_perf_fn, plr,
						cpu_to_node(cpu),
						"pseudo_lock_measure/%u",
						cpu);
	else
		goto out;

	if (IS_ERR(thread)) {
		ret = PTR_ERR(thread);
		goto out;
	}
	/* Bind the measurement thread to a cpu of the region's cache domain. */
	kthread_bind(thread, cpu);
	wake_up_process(thread);

	/* Wait (interruptibly) until the measurement thread signals done. */
	ret = wait_event_interruptible(plr->lock_thread_wq,
				       plr->thread_done == 1);
	if (ret < 0)
		goto out;

	ret = 0;

out:
	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
	return ret;
}
| 1156 | |||
| 1157 | static ssize_t pseudo_lock_measure_trigger(struct file *file, | ||
| 1158 | const char __user *user_buf, | ||
| 1159 | size_t count, loff_t *ppos) | ||
| 1160 | { | ||
| 1161 | struct rdtgroup *rdtgrp = file->private_data; | ||
| 1162 | size_t buf_size; | ||
| 1163 | char buf[32]; | ||
| 1164 | int ret; | ||
| 1165 | int sel; | ||
| 1166 | |||
| 1167 | buf_size = min(count, (sizeof(buf) - 1)); | ||
| 1168 | if (copy_from_user(buf, user_buf, buf_size)) | ||
| 1169 | return -EFAULT; | ||
| 1170 | |||
| 1171 | buf[buf_size] = '\0'; | ||
| 1172 | ret = kstrtoint(buf, 10, &sel); | ||
| 1173 | if (ret == 0) { | ||
| 1174 | if (sel != 1) | ||
| 1175 | return -EINVAL; | ||
| 1176 | ret = debugfs_file_get(file->f_path.dentry); | ||
| 1177 | if (ret) | ||
| 1178 | return ret; | ||
| 1179 | ret = pseudo_lock_measure_cycles(rdtgrp, sel); | ||
| 1180 | if (ret == 0) | ||
| 1181 | ret = count; | ||
| 1182 | debugfs_file_put(file->f_path.dentry); | ||
| 1183 | } | ||
| 1184 | |||
| 1185 | return ret; | ||
| 1186 | } | ||
| 1187 | |||
| 1188 | static const struct file_operations pseudo_measure_fops = { | ||
| 1189 | .write = pseudo_lock_measure_trigger, | ||
| 1190 | .open = simple_open, | ||
| 1191 | .llseek = default_llseek, | ||
| 1192 | }; | ||
| 1193 | |||
| 1194 | /** | ||
| 1195 | * rdtgroup_pseudo_lock_create - Create a pseudo-locked region | ||
| 1196 | * @rdtgrp: resource group to which pseudo-lock region belongs | ||
| 1197 | * | ||
| 1198 | * Called when a resource group in the pseudo-locksetup mode receives a | ||
| 1199 | * valid schemata that should be pseudo-locked. Since the resource group is | ||
| 1200 | * in pseudo-locksetup mode the &struct pseudo_lock_region has already been | ||
| 1201 | * allocated and initialized with the essential information. If a failure | ||
| 1202 | * occurs the resource group remains in the pseudo-locksetup mode with the | ||
| 1203 | * &struct pseudo_lock_region associated with it, but cleared from all | ||
| 1204 | * information and ready for the user to re-attempt pseudo-locking by | ||
| 1205 | * writing the schemata again. | ||
| 1206 | * | ||
| 1207 | * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 | ||
| 1208 | * on failure. Descriptive error will be written to last_cmd_status buffer. | ||
| 1209 | */ | ||
| 1210 | int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) | ||
| 1211 | { | ||
| 1212 | struct pseudo_lock_region *plr = rdtgrp->plr; | ||
| 1213 | struct task_struct *thread; | ||
| 1214 | unsigned int new_minor; | ||
| 1215 | struct device *dev; | ||
| 1216 | int ret; | ||
| 1217 | |||
| 1218 | ret = pseudo_lock_region_alloc(plr); | ||
| 1219 | if (ret < 0) | ||
| 1220 | return ret; | ||
| 1221 | |||
| 1222 | ret = pseudo_lock_cstates_constrain(plr); | ||
| 1223 | if (ret < 0) { | ||
| 1224 | ret = -EINVAL; | ||
| 1225 | goto out_region; | ||
| 1226 | } | ||
| 1227 | |||
| 1228 | plr->thread_done = 0; | ||
| 1229 | |||
| 1230 | thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp, | ||
| 1231 | cpu_to_node(plr->cpu), | ||
| 1232 | "pseudo_lock/%u", plr->cpu); | ||
| 1233 | if (IS_ERR(thread)) { | ||
| 1234 | ret = PTR_ERR(thread); | ||
| 1235 | rdt_last_cmd_printf("locking thread returned error %d\n", ret); | ||
| 1236 | goto out_cstates; | ||
| 1237 | } | ||
| 1238 | |||
| 1239 | kthread_bind(thread, plr->cpu); | ||
| 1240 | wake_up_process(thread); | ||
| 1241 | |||
| 1242 | ret = wait_event_interruptible(plr->lock_thread_wq, | ||
| 1243 | plr->thread_done == 1); | ||
| 1244 | if (ret < 0) { | ||
| 1245 | /* | ||
| 1246 | * If the thread does not get on the CPU for whatever | ||
| 1247 | * reason and the process which sets up the region is | ||
| 1248 | * interrupted then this will leave the thread in runnable | ||
| 1249 | * state and once it gets on the CPU it will dereference | ||
| 1250 | * the cleared, but not freed, plr struct resulting in an | ||
| 1251 | * empty pseudo-locking loop. | ||
| 1252 | */ | ||
| 1253 | rdt_last_cmd_puts("locking thread interrupted\n"); | ||
| 1254 | goto out_cstates; | ||
| 1255 | } | ||
| 1256 | |||
| 1257 | ret = pseudo_lock_minor_get(&new_minor); | ||
| 1258 | if (ret < 0) { | ||
| 1259 | rdt_last_cmd_puts("unable to obtain a new minor number\n"); | ||
| 1260 | goto out_cstates; | ||
| 1261 | } | ||
| 1262 | |||
| 1263 | /* | ||
| 1264 | * Unlock access but do not release the reference. The | ||
| 1265 | * pseudo-locked region will still be here on return. | ||
| 1266 | * | ||
| 1267 | * The mutex has to be released temporarily to avoid a potential | ||
| 1268 | * deadlock with the mm->mmap_sem semaphore which is obtained in | ||
| 1269 | * the device_create() and debugfs_create_dir() callpath below | ||
| 1270 | * as well as before the mmap() callback is called. | ||
| 1271 | */ | ||
| 1272 | mutex_unlock(&rdtgroup_mutex); | ||
| 1273 | |||
| 1274 | if (!IS_ERR_OR_NULL(debugfs_resctrl)) { | ||
| 1275 | plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name, | ||
| 1276 | debugfs_resctrl); | ||
| 1277 | if (!IS_ERR_OR_NULL(plr->debugfs_dir)) | ||
| 1278 | debugfs_create_file("pseudo_lock_measure", 0200, | ||
| 1279 | plr->debugfs_dir, rdtgrp, | ||
| 1280 | &pseudo_measure_fops); | ||
| 1281 | } | ||
| 1282 | |||
| 1283 | dev = device_create(pseudo_lock_class, NULL, | ||
| 1284 | MKDEV(pseudo_lock_major, new_minor), | ||
| 1285 | rdtgrp, "%s", rdtgrp->kn->name); | ||
| 1286 | |||
| 1287 | mutex_lock(&rdtgroup_mutex); | ||
| 1288 | |||
| 1289 | if (IS_ERR(dev)) { | ||
| 1290 | ret = PTR_ERR(dev); | ||
| 1291 | rdt_last_cmd_printf("failed to create character device: %d\n", | ||
| 1292 | ret); | ||
| 1293 | goto out_debugfs; | ||
| 1294 | } | ||
| 1295 | |||
| 1296 | /* We released the mutex - check if group was removed while we did so */ | ||
| 1297 | if (rdtgrp->flags & RDT_DELETED) { | ||
| 1298 | ret = -ENODEV; | ||
| 1299 | goto out_device; | ||
| 1300 | } | ||
| 1301 | |||
| 1302 | plr->minor = new_minor; | ||
| 1303 | |||
| 1304 | rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; | ||
| 1305 | closid_free(rdtgrp->closid); | ||
| 1306 | rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); | ||
| 1307 | rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); | ||
| 1308 | |||
| 1309 | ret = 0; | ||
| 1310 | goto out; | ||
| 1311 | |||
| 1312 | out_device: | ||
| 1313 | device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); | ||
| 1314 | out_debugfs: | ||
| 1315 | debugfs_remove_recursive(plr->debugfs_dir); | ||
| 1316 | pseudo_lock_minor_release(new_minor); | ||
| 1317 | out_cstates: | ||
| 1318 | pseudo_lock_cstates_relax(plr); | ||
| 1319 | out_region: | ||
| 1320 | pseudo_lock_region_clear(plr); | ||
| 1321 | out: | ||
| 1322 | return ret; | ||
| 1323 | } | ||
| 1324 | |||
| 1325 | /** | ||
| 1326 | * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region | ||
| 1327 | * @rdtgrp: resource group to which the pseudo-locked region belongs | ||
| 1328 | * | ||
| 1329 | * The removal of a pseudo-locked region can be initiated when the resource | ||
| 1330 | * group is removed from user space via a "rmdir" from userspace or the | ||
| 1331 | * unmount of the resctrl filesystem. On removal the resource group does | ||
| 1332 | * not go back to pseudo-locksetup mode before it is removed, instead it is | ||
| 1333 | * removed directly. There is thus asymmetry with the creation where the | ||
| 1334 | * &struct pseudo_lock_region is removed here while it was not created in | ||
| 1335 | * rdtgroup_pseudo_lock_create(). | ||
| 1336 | * | ||
| 1337 | * Return: void | ||
| 1338 | */ | ||
| 1339 | void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) | ||
| 1340 | { | ||
| 1341 | struct pseudo_lock_region *plr = rdtgrp->plr; | ||
| 1342 | |||
| 1343 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { | ||
| 1344 | /* | ||
| 1345 | * Default group cannot be a pseudo-locked region so we can | ||
| 1346 | * free closid here. | ||
| 1347 | */ | ||
| 1348 | closid_free(rdtgrp->closid); | ||
| 1349 | goto free; | ||
| 1350 | } | ||
| 1351 | |||
| 1352 | pseudo_lock_cstates_relax(plr); | ||
| 1353 | debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); | ||
| 1354 | device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); | ||
| 1355 | pseudo_lock_minor_release(plr->minor); | ||
| 1356 | |||
| 1357 | free: | ||
| 1358 | pseudo_lock_free(rdtgrp); | ||
| 1359 | } | ||
| 1360 | |||
| 1361 | static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) | ||
| 1362 | { | ||
| 1363 | struct rdtgroup *rdtgrp; | ||
| 1364 | |||
| 1365 | mutex_lock(&rdtgroup_mutex); | ||
| 1366 | |||
| 1367 | rdtgrp = region_find_by_minor(iminor(inode)); | ||
| 1368 | if (!rdtgrp) { | ||
| 1369 | mutex_unlock(&rdtgroup_mutex); | ||
| 1370 | return -ENODEV; | ||
| 1371 | } | ||
| 1372 | |||
| 1373 | filp->private_data = rdtgrp; | ||
| 1374 | atomic_inc(&rdtgrp->waitcount); | ||
| 1375 | /* Perform a non-seekable open - llseek is not supported */ | ||
| 1376 | filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); | ||
| 1377 | |||
| 1378 | mutex_unlock(&rdtgroup_mutex); | ||
| 1379 | |||
| 1380 | return 0; | ||
| 1381 | } | ||
| 1382 | |||
| 1383 | static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) | ||
| 1384 | { | ||
| 1385 | struct rdtgroup *rdtgrp; | ||
| 1386 | |||
| 1387 | mutex_lock(&rdtgroup_mutex); | ||
| 1388 | rdtgrp = filp->private_data; | ||
| 1389 | WARN_ON(!rdtgrp); | ||
| 1390 | if (!rdtgrp) { | ||
| 1391 | mutex_unlock(&rdtgroup_mutex); | ||
| 1392 | return -ENODEV; | ||
| 1393 | } | ||
| 1394 | filp->private_data = NULL; | ||
| 1395 | atomic_dec(&rdtgrp->waitcount); | ||
| 1396 | mutex_unlock(&rdtgroup_mutex); | ||
| 1397 | return 0; | ||
| 1398 | } | ||
| 1399 | |||
| 1400 | static int pseudo_lock_dev_mremap(struct vm_area_struct *area) | ||
| 1401 | { | ||
| 1402 | /* Not supported */ | ||
| 1403 | return -EINVAL; | ||
| 1404 | } | ||
| 1405 | |||
| 1406 | static const struct vm_operations_struct pseudo_mmap_ops = { | ||
| 1407 | .mremap = pseudo_lock_dev_mremap, | ||
| 1408 | }; | ||
| 1409 | |||
| 1410 | static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) | ||
| 1411 | { | ||
| 1412 | unsigned long vsize = vma->vm_end - vma->vm_start; | ||
| 1413 | unsigned long off = vma->vm_pgoff << PAGE_SHIFT; | ||
| 1414 | struct pseudo_lock_region *plr; | ||
| 1415 | struct rdtgroup *rdtgrp; | ||
| 1416 | unsigned long physical; | ||
| 1417 | unsigned long psize; | ||
| 1418 | |||
| 1419 | mutex_lock(&rdtgroup_mutex); | ||
| 1420 | |||
| 1421 | rdtgrp = filp->private_data; | ||
| 1422 | WARN_ON(!rdtgrp); | ||
| 1423 | if (!rdtgrp) { | ||
| 1424 | mutex_unlock(&rdtgroup_mutex); | ||
| 1425 | return -ENODEV; | ||
| 1426 | } | ||
| 1427 | |||
| 1428 | plr = rdtgrp->plr; | ||
| 1429 | |||
| 1430 | /* | ||
| 1431 | * Task is required to run with affinity to the cpus associated | ||
| 1432 | * with the pseudo-locked region. If this is not the case the task | ||
| 1433 | * may be scheduled elsewhere and invalidate entries in the | ||
| 1434 | * pseudo-locked region. | ||
| 1435 | */ | ||
| 1436 | if (!cpumask_subset(¤t->cpus_allowed, &plr->d->cpu_mask)) { | ||
| 1437 | mutex_unlock(&rdtgroup_mutex); | ||
| 1438 | return -EINVAL; | ||
| 1439 | } | ||
| 1440 | |||
| 1441 | physical = __pa(plr->kmem) >> PAGE_SHIFT; | ||
| 1442 | psize = plr->size - off; | ||
| 1443 | |||
| 1444 | if (off > plr->size) { | ||
| 1445 | mutex_unlock(&rdtgroup_mutex); | ||
| 1446 | return -ENOSPC; | ||
| 1447 | } | ||
| 1448 | |||
| 1449 | /* | ||
| 1450 | * Ensure changes are carried directly to the memory being mapped, | ||
| 1451 | * do not allow copy-on-write mapping. | ||
| 1452 | */ | ||
| 1453 | if (!(vma->vm_flags & VM_SHARED)) { | ||
| 1454 | mutex_unlock(&rdtgroup_mutex); | ||
| 1455 | return -EINVAL; | ||
| 1456 | } | ||
| 1457 | |||
| 1458 | if (vsize > psize) { | ||
| 1459 | mutex_unlock(&rdtgroup_mutex); | ||
| 1460 | return -ENOSPC; | ||
| 1461 | } | ||
| 1462 | |||
| 1463 | memset(plr->kmem + off, 0, vsize); | ||
| 1464 | |||
| 1465 | if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, | ||
| 1466 | vsize, vma->vm_page_prot)) { | ||
| 1467 | mutex_unlock(&rdtgroup_mutex); | ||
| 1468 | return -EAGAIN; | ||
| 1469 | } | ||
| 1470 | vma->vm_ops = &pseudo_mmap_ops; | ||
| 1471 | mutex_unlock(&rdtgroup_mutex); | ||
| 1472 | return 0; | ||
| 1473 | } | ||
| 1474 | |||
| 1475 | static const struct file_operations pseudo_lock_dev_fops = { | ||
| 1476 | .owner = THIS_MODULE, | ||
| 1477 | .llseek = no_llseek, | ||
| 1478 | .read = NULL, | ||
| 1479 | .write = NULL, | ||
| 1480 | .open = pseudo_lock_dev_open, | ||
| 1481 | .release = pseudo_lock_dev_release, | ||
| 1482 | .mmap = pseudo_lock_dev_mmap, | ||
| 1483 | }; | ||
| 1484 | |||
| 1485 | static char *pseudo_lock_devnode(struct device *dev, umode_t *mode) | ||
| 1486 | { | ||
| 1487 | struct rdtgroup *rdtgrp; | ||
| 1488 | |||
| 1489 | rdtgrp = dev_get_drvdata(dev); | ||
| 1490 | if (mode) | ||
| 1491 | *mode = 0600; | ||
| 1492 | return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); | ||
| 1493 | } | ||
| 1494 | |||
| 1495 | int rdt_pseudo_lock_init(void) | ||
| 1496 | { | ||
| 1497 | int ret; | ||
| 1498 | |||
| 1499 | ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); | ||
| 1500 | if (ret < 0) | ||
| 1501 | return ret; | ||
| 1502 | |||
| 1503 | pseudo_lock_major = ret; | ||
| 1504 | |||
| 1505 | pseudo_lock_class = class_create(THIS_MODULE, "pseudo_lock"); | ||
| 1506 | if (IS_ERR(pseudo_lock_class)) { | ||
| 1507 | ret = PTR_ERR(pseudo_lock_class); | ||
| 1508 | unregister_chrdev(pseudo_lock_major, "pseudo_lock"); | ||
| 1509 | return ret; | ||
| 1510 | } | ||
| 1511 | |||
| 1512 | pseudo_lock_class->devnode = pseudo_lock_devnode; | ||
| 1513 | return 0; | ||
| 1514 | } | ||
| 1515 | |||
| 1516 | void rdt_pseudo_lock_release(void) | ||
| 1517 | { | ||
| 1518 | class_destroy(pseudo_lock_class); | ||
| 1519 | pseudo_lock_class = NULL; | ||
| 1520 | unregister_chrdev(pseudo_lock_major, "pseudo_lock"); | ||
| 1521 | pseudo_lock_major = 0; | ||
| 1522 | } | ||
diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h new file mode 100644 index 000000000000..2c041e6d9f05 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h | |||
| @@ -0,0 +1,43 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
| 2 | #undef TRACE_SYSTEM | ||
| 3 | #define TRACE_SYSTEM resctrl | ||
| 4 | |||
| 5 | #if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) | ||
| 6 | #define _TRACE_PSEUDO_LOCK_H | ||
| 7 | |||
| 8 | #include <linux/tracepoint.h> | ||
| 9 | |||
| 10 | TRACE_EVENT(pseudo_lock_mem_latency, | ||
| 11 | TP_PROTO(u32 latency), | ||
| 12 | TP_ARGS(latency), | ||
| 13 | TP_STRUCT__entry(__field(u32, latency)), | ||
| 14 | TP_fast_assign(__entry->latency = latency), | ||
| 15 | TP_printk("latency=%u", __entry->latency) | ||
| 16 | ); | ||
| 17 | |||
| 18 | TRACE_EVENT(pseudo_lock_l2, | ||
| 19 | TP_PROTO(u64 l2_hits, u64 l2_miss), | ||
| 20 | TP_ARGS(l2_hits, l2_miss), | ||
| 21 | TP_STRUCT__entry(__field(u64, l2_hits) | ||
| 22 | __field(u64, l2_miss)), | ||
| 23 | TP_fast_assign(__entry->l2_hits = l2_hits; | ||
| 24 | __entry->l2_miss = l2_miss;), | ||
| 25 | TP_printk("hits=%llu miss=%llu", | ||
| 26 | __entry->l2_hits, __entry->l2_miss)); | ||
| 27 | |||
| 28 | TRACE_EVENT(pseudo_lock_l3, | ||
| 29 | TP_PROTO(u64 l3_hits, u64 l3_miss), | ||
| 30 | TP_ARGS(l3_hits, l3_miss), | ||
| 31 | TP_STRUCT__entry(__field(u64, l3_hits) | ||
| 32 | __field(u64, l3_miss)), | ||
| 33 | TP_fast_assign(__entry->l3_hits = l3_hits; | ||
| 34 | __entry->l3_miss = l3_miss;), | ||
| 35 | TP_printk("hits=%llu miss=%llu", | ||
| 36 | __entry->l3_hits, __entry->l3_miss)); | ||
| 37 | |||
| 38 | #endif /* _TRACE_PSEUDO_LOCK_H */ | ||
| 39 | |||
| 40 | #undef TRACE_INCLUDE_PATH | ||
| 41 | #define TRACE_INCLUDE_PATH . | ||
| 42 | #define TRACE_INCLUDE_FILE intel_rdt_pseudo_lock_event | ||
| 43 | #include <trace/define_trace.h> | ||
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 749856a2e736..d6d7ea7349d0 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | |||
| @@ -20,7 +20,9 @@ | |||
| 20 | 20 | ||
| 21 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 21 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| 22 | 22 | ||
| 23 | #include <linux/cacheinfo.h> | ||
| 23 | #include <linux/cpu.h> | 24 | #include <linux/cpu.h> |
| 25 | #include <linux/debugfs.h> | ||
| 24 | #include <linux/fs.h> | 26 | #include <linux/fs.h> |
| 25 | #include <linux/sysfs.h> | 27 | #include <linux/sysfs.h> |
| 26 | #include <linux/kernfs.h> | 28 | #include <linux/kernfs.h> |
| @@ -55,6 +57,8 @@ static struct kernfs_node *kn_mondata; | |||
| 55 | static struct seq_buf last_cmd_status; | 57 | static struct seq_buf last_cmd_status; |
| 56 | static char last_cmd_status_buf[512]; | 58 | static char last_cmd_status_buf[512]; |
| 57 | 59 | ||
| 60 | struct dentry *debugfs_resctrl; | ||
| 61 | |||
| 58 | void rdt_last_cmd_clear(void) | 62 | void rdt_last_cmd_clear(void) |
| 59 | { | 63 | { |
| 60 | lockdep_assert_held(&rdtgroup_mutex); | 64 | lockdep_assert_held(&rdtgroup_mutex); |
| @@ -121,11 +125,65 @@ static int closid_alloc(void) | |||
| 121 | return closid; | 125 | return closid; |
| 122 | } | 126 | } |
| 123 | 127 | ||
| 124 | static void closid_free(int closid) | 128 | void closid_free(int closid) |
| 125 | { | 129 | { |
| 126 | closid_free_map |= 1 << closid; | 130 | closid_free_map |= 1 << closid; |
| 127 | } | 131 | } |
| 128 | 132 | ||
| 133 | /** | ||
| 134 | * closid_allocated - test if provided closid is in use | ||
| 135 | * @closid: closid to be tested | ||
| 136 | * | ||
| 137 | * Return: true if @closid is currently associated with a resource group, | ||
| 138 | * false if @closid is free | ||
| 139 | */ | ||
| 140 | static bool closid_allocated(unsigned int closid) | ||
| 141 | { | ||
| 142 | return (closid_free_map & (1 << closid)) == 0; | ||
| 143 | } | ||
| 144 | |||
| 145 | /** | ||
| 146 | * rdtgroup_mode_by_closid - Return mode of resource group with closid | ||
| 147 | * @closid: closid of the resource group | ||
| 148 | * | ||
| 149 | * Each resource group is associated with a @closid. Here the mode | ||
| 150 | * of a resource group can be queried by searching for it using its closid. | ||
| 151 | * | ||
| 152 | * Return: mode as &enum rdtgrp_mode of resource group with closid @closid | ||
| 153 | */ | ||
| 154 | enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) | ||
| 155 | { | ||
| 156 | struct rdtgroup *rdtgrp; | ||
| 157 | |||
| 158 | list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { | ||
| 159 | if (rdtgrp->closid == closid) | ||
| 160 | return rdtgrp->mode; | ||
| 161 | } | ||
| 162 | |||
| 163 | return RDT_NUM_MODES; | ||
| 164 | } | ||
| 165 | |||
| 166 | static const char * const rdt_mode_str[] = { | ||
| 167 | [RDT_MODE_SHAREABLE] = "shareable", | ||
| 168 | [RDT_MODE_EXCLUSIVE] = "exclusive", | ||
| 169 | [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", | ||
| 170 | [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", | ||
| 171 | }; | ||
| 172 | |||
| 173 | /** | ||
| 174 | * rdtgroup_mode_str - Return the string representation of mode | ||
| 175 | * @mode: the resource group mode as &enum rdtgroup_mode | ||
| 176 | * | ||
| 177 | * Return: string representation of valid mode, "unknown" otherwise | ||
| 178 | */ | ||
| 179 | static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) | ||
| 180 | { | ||
| 181 | if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) | ||
| 182 | return "unknown"; | ||
| 183 | |||
| 184 | return rdt_mode_str[mode]; | ||
| 185 | } | ||
| 186 | |||
| 129 | /* set uid and gid of rdtgroup dirs and files to that of the creator */ | 187 | /* set uid and gid of rdtgroup dirs and files to that of the creator */ |
| 130 | static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) | 188 | static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) |
| 131 | { | 189 | { |
| @@ -207,8 +265,12 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, | |||
| 207 | rdtgrp = rdtgroup_kn_lock_live(of->kn); | 265 | rdtgrp = rdtgroup_kn_lock_live(of->kn); |
| 208 | 266 | ||
| 209 | if (rdtgrp) { | 267 | if (rdtgrp) { |
| 210 | seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", | 268 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) |
| 211 | cpumask_pr_args(&rdtgrp->cpu_mask)); | 269 | seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", |
| 270 | cpumask_pr_args(&rdtgrp->plr->d->cpu_mask)); | ||
| 271 | else | ||
| 272 | seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", | ||
| 273 | cpumask_pr_args(&rdtgrp->cpu_mask)); | ||
| 212 | } else { | 274 | } else { |
| 213 | ret = -ENOENT; | 275 | ret = -ENOENT; |
| 214 | } | 276 | } |
| @@ -394,6 +456,13 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, | |||
| 394 | goto unlock; | 456 | goto unlock; |
| 395 | } | 457 | } |
| 396 | 458 | ||
| 459 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || | ||
| 460 | rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { | ||
| 461 | ret = -EINVAL; | ||
| 462 | rdt_last_cmd_puts("pseudo-locking in progress\n"); | ||
| 463 | goto unlock; | ||
| 464 | } | ||
| 465 | |||
| 397 | if (is_cpu_list(of)) | 466 | if (is_cpu_list(of)) |
| 398 | ret = cpulist_parse(buf, newmask); | 467 | ret = cpulist_parse(buf, newmask); |
| 399 | else | 468 | else |
| @@ -509,6 +578,32 @@ static int __rdtgroup_move_task(struct task_struct *tsk, | |||
| 509 | return ret; | 578 | return ret; |
| 510 | } | 579 | } |
| 511 | 580 | ||
| 581 | /** | ||
| 582 | * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group | ||
| 583 | * @r: Resource group | ||
| 584 | * | ||
| 585 | * Return: 1 if tasks have been assigned to @r, 0 otherwise | ||
| 586 | */ | ||
| 587 | int rdtgroup_tasks_assigned(struct rdtgroup *r) | ||
| 588 | { | ||
| 589 | struct task_struct *p, *t; | ||
| 590 | int ret = 0; | ||
| 591 | |||
| 592 | lockdep_assert_held(&rdtgroup_mutex); | ||
| 593 | |||
| 594 | rcu_read_lock(); | ||
| 595 | for_each_process_thread(p, t) { | ||
| 596 | if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || | ||
| 597 | (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) { | ||
| 598 | ret = 1; | ||
| 599 | break; | ||
| 600 | } | ||
| 601 | } | ||
| 602 | rcu_read_unlock(); | ||
| 603 | |||
| 604 | return ret; | ||
| 605 | } | ||
| 606 | |||
| 512 | static int rdtgroup_task_write_permission(struct task_struct *task, | 607 | static int rdtgroup_task_write_permission(struct task_struct *task, |
| 513 | struct kernfs_open_file *of) | 608 | struct kernfs_open_file *of) |
| 514 | { | 609 | { |
| @@ -570,13 +665,22 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, | |||
| 570 | if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) | 665 | if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) |
| 571 | return -EINVAL; | 666 | return -EINVAL; |
| 572 | rdtgrp = rdtgroup_kn_lock_live(of->kn); | 667 | rdtgrp = rdtgroup_kn_lock_live(of->kn); |
| 668 | if (!rdtgrp) { | ||
| 669 | rdtgroup_kn_unlock(of->kn); | ||
| 670 | return -ENOENT; | ||
| 671 | } | ||
| 573 | rdt_last_cmd_clear(); | 672 | rdt_last_cmd_clear(); |
| 574 | 673 | ||
| 575 | if (rdtgrp) | 674 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || |
| 576 | ret = rdtgroup_move_task(pid, rdtgrp, of); | 675 | rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { |
| 577 | else | 676 | ret = -EINVAL; |
| 578 | ret = -ENOENT; | 677 | rdt_last_cmd_puts("pseudo-locking in progress\n"); |
| 678 | goto unlock; | ||
| 679 | } | ||
| 579 | 680 | ||
| 681 | ret = rdtgroup_move_task(pid, rdtgrp, of); | ||
| 682 | |||
| 683 | unlock: | ||
| 580 | rdtgroup_kn_unlock(of->kn); | 684 | rdtgroup_kn_unlock(of->kn); |
| 581 | 685 | ||
| 582 | return ret ?: nbytes; | 686 | return ret ?: nbytes; |
| @@ -662,6 +766,94 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of, | |||
| 662 | return 0; | 766 | return 0; |
| 663 | } | 767 | } |
| 664 | 768 | ||
| 769 | /** | ||
| 770 | * rdt_bit_usage_show - Display current usage of resources | ||
| 771 | * | ||
| 772 | * A domain is a shared resource that can now be allocated differently. Here | ||
| 773 | * we display the current regions of the domain as an annotated bitmask. | ||
| 774 | * For each domain of this resource its allocation bitmask | ||
| 775 | * is annotated as below to indicate the current usage of the corresponding bit: | ||
| 776 | * 0 - currently unused | ||
| 777 | * X - currently available for sharing and used by software and hardware | ||
| 778 | * H - currently used by hardware only but available for software use | ||
| 779 | * S - currently used and shareable by software only | ||
| 780 | * E - currently used exclusively by one resource group | ||
| 781 | * P - currently pseudo-locked by one resource group | ||
| 782 | */ | ||
| 783 | static int rdt_bit_usage_show(struct kernfs_open_file *of, | ||
| 784 | struct seq_file *seq, void *v) | ||
| 785 | { | ||
| 786 | struct rdt_resource *r = of->kn->parent->priv; | ||
| 787 | u32 sw_shareable = 0, hw_shareable = 0; | ||
| 788 | u32 exclusive = 0, pseudo_locked = 0; | ||
| 789 | struct rdt_domain *dom; | ||
| 790 | int i, hwb, swb, excl, psl; | ||
| 791 | enum rdtgrp_mode mode; | ||
| 792 | bool sep = false; | ||
| 793 | u32 *ctrl; | ||
| 794 | |||
| 795 | mutex_lock(&rdtgroup_mutex); | ||
| 796 | hw_shareable = r->cache.shareable_bits; | ||
| 797 | list_for_each_entry(dom, &r->domains, list) { | ||
| 798 | if (sep) | ||
| 799 | seq_putc(seq, ';'); | ||
| 800 | ctrl = dom->ctrl_val; | ||
| 801 | sw_shareable = 0; | ||
| 802 | exclusive = 0; | ||
| 803 | seq_printf(seq, "%d=", dom->id); | ||
| 804 | for (i = 0; i < r->num_closid; i++, ctrl++) { | ||
| 805 | if (!closid_allocated(i)) | ||
| 806 | continue; | ||
| 807 | mode = rdtgroup_mode_by_closid(i); | ||
| 808 | switch (mode) { | ||
| 809 | case RDT_MODE_SHAREABLE: | ||
| 810 | sw_shareable |= *ctrl; | ||
| 811 | break; | ||
| 812 | case RDT_MODE_EXCLUSIVE: | ||
| 813 | exclusive |= *ctrl; | ||
| 814 | break; | ||
| 815 | case RDT_MODE_PSEUDO_LOCKSETUP: | ||
| 816 | /* | ||
| 817 | * RDT_MODE_PSEUDO_LOCKSETUP is possible | ||
| 818 | * here but not included since the CBM | ||
| 819 | * associated with this CLOSID in this mode | ||
| 820 | * is not initialized and no task or cpu can be | ||
| 821 | * assigned this CLOSID. | ||
| 822 | */ | ||
| 823 | break; | ||
| 824 | case RDT_MODE_PSEUDO_LOCKED: | ||
| 825 | case RDT_NUM_MODES: | ||
| 826 | WARN(1, | ||
| 827 | "invalid mode for closid %d\n", i); | ||
| 828 | break; | ||
| 829 | } | ||
| 830 | } | ||
| 831 | for (i = r->cache.cbm_len - 1; i >= 0; i--) { | ||
| 832 | pseudo_locked = dom->plr ? dom->plr->cbm : 0; | ||
| 833 | hwb = test_bit(i, (unsigned long *)&hw_shareable); | ||
| 834 | swb = test_bit(i, (unsigned long *)&sw_shareable); | ||
| 835 | excl = test_bit(i, (unsigned long *)&exclusive); | ||
| 836 | psl = test_bit(i, (unsigned long *)&pseudo_locked); | ||
| 837 | if (hwb && swb) | ||
| 838 | seq_putc(seq, 'X'); | ||
| 839 | else if (hwb && !swb) | ||
| 840 | seq_putc(seq, 'H'); | ||
| 841 | else if (!hwb && swb) | ||
| 842 | seq_putc(seq, 'S'); | ||
| 843 | else if (excl) | ||
| 844 | seq_putc(seq, 'E'); | ||
| 845 | else if (psl) | ||
| 846 | seq_putc(seq, 'P'); | ||
| 847 | else /* Unused bits remain */ | ||
| 848 | seq_putc(seq, '0'); | ||
| 849 | } | ||
| 850 | sep = true; | ||
| 851 | } | ||
| 852 | seq_putc(seq, '\n'); | ||
| 853 | mutex_unlock(&rdtgroup_mutex); | ||
| 854 | return 0; | ||
| 855 | } | ||
| 856 | |||
| 665 | static int rdt_min_bw_show(struct kernfs_open_file *of, | 857 | static int rdt_min_bw_show(struct kernfs_open_file *of, |
| 666 | struct seq_file *seq, void *v) | 858 | struct seq_file *seq, void *v) |
| 667 | { | 859 | { |
| @@ -740,6 +932,269 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, | |||
| 740 | return nbytes; | 932 | return nbytes; |
| 741 | } | 933 | } |
| 742 | 934 | ||
| 935 | /* | ||
| 936 | * rdtgroup_mode_show - Display mode of this resource group | ||
| 937 | */ | ||
| 938 | static int rdtgroup_mode_show(struct kernfs_open_file *of, | ||
| 939 | struct seq_file *s, void *v) | ||
| 940 | { | ||
| 941 | struct rdtgroup *rdtgrp; | ||
| 942 | |||
| 943 | rdtgrp = rdtgroup_kn_lock_live(of->kn); | ||
| 944 | if (!rdtgrp) { | ||
| 945 | rdtgroup_kn_unlock(of->kn); | ||
| 946 | return -ENOENT; | ||
| 947 | } | ||
| 948 | |||
| 949 | seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); | ||
| 950 | |||
| 951 | rdtgroup_kn_unlock(of->kn); | ||
| 952 | return 0; | ||
| 953 | } | ||
| 954 | |||
| 955 | /** | ||
| 956 | * rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other | ||
| 957 | * @r: Resource to which domain instance @d belongs. | ||
| 958 | * @d: The domain instance for which @closid is being tested. | ||
| 959 | * @cbm: Capacity bitmask being tested. | ||
| 960 | * @closid: Intended closid for @cbm. | ||
| 961 | * @exclusive: Only check if overlaps with exclusive resource groups | ||
| 962 | * | ||
| 963 | * Checks if provided @cbm intended to be used for @closid on domain | ||
| 964 | * @d overlaps with any other closids or other hardware usage associated | ||
| 965 | * with this domain. If @exclusive is true then only overlaps with | ||
| 966 | * resource groups in exclusive mode will be considered. If @exclusive | ||
| 967 | * is false then overlaps with any resource group or hardware entities | ||
| 968 | * will be considered. | ||
| 969 | * | ||
| 970 | * Return: false if CBM does not overlap, true if it does. | ||
| 971 | */ | ||
| 972 | bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, | ||
| 973 | u32 _cbm, int closid, bool exclusive) | ||
| 974 | { | ||
| 975 | unsigned long *cbm = (unsigned long *)&_cbm; | ||
| 976 | unsigned long *ctrl_b; | ||
| 977 | enum rdtgrp_mode mode; | ||
| 978 | u32 *ctrl; | ||
| 979 | int i; | ||
| 980 | |||
| 981 | /* Check for any overlap with regions used by hardware directly */ | ||
| 982 | if (!exclusive) { | ||
| 983 | if (bitmap_intersects(cbm, | ||
| 984 | (unsigned long *)&r->cache.shareable_bits, | ||
| 985 | r->cache.cbm_len)) | ||
| 986 | return true; | ||
| 987 | } | ||
| 988 | |||
| 989 | /* Check for overlap with other resource groups */ | ||
| 990 | ctrl = d->ctrl_val; | ||
| 991 | for (i = 0; i < r->num_closid; i++, ctrl++) { | ||
| 992 | ctrl_b = (unsigned long *)ctrl; | ||
| 993 | mode = rdtgroup_mode_by_closid(i); | ||
| 994 | if (closid_allocated(i) && i != closid && | ||
| 995 | mode != RDT_MODE_PSEUDO_LOCKSETUP) { | ||
| 996 | if (bitmap_intersects(cbm, ctrl_b, r->cache.cbm_len)) { | ||
| 997 | if (exclusive) { | ||
| 998 | if (mode == RDT_MODE_EXCLUSIVE) | ||
| 999 | return true; | ||
| 1000 | continue; | ||
| 1001 | } | ||
| 1002 | return true; | ||
| 1003 | } | ||
| 1004 | } | ||
| 1005 | } | ||
| 1006 | |||
| 1007 | return false; | ||
| 1008 | } | ||
| 1009 | |||
| 1010 | /** | ||
| 1011 | * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive | ||
| 1012 | * | ||
| 1013 | * An exclusive resource group implies that there should be no sharing of | ||
| 1014 | * its allocated resources. At the time this group is considered to be | ||
| 1015 | * exclusive this test can determine if its current schemata supports this | ||
| 1016 | * setting by testing for overlap with all other resource groups. | ||
| 1017 | * | ||
| 1018 | * Return: true if resource group can be exclusive, false if there is overlap | ||
| 1019 | * with allocations of other resource groups and thus this resource group | ||
| 1020 | * cannot be exclusive. | ||
| 1021 | */ | ||
| 1022 | static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) | ||
| 1023 | { | ||
| 1024 | int closid = rdtgrp->closid; | ||
| 1025 | struct rdt_resource *r; | ||
| 1026 | struct rdt_domain *d; | ||
| 1027 | |||
| 1028 | for_each_alloc_enabled_rdt_resource(r) { | ||
| 1029 | list_for_each_entry(d, &r->domains, list) { | ||
| 1030 | if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid], | ||
| 1031 | rdtgrp->closid, false)) | ||
| 1032 | return false; | ||
| 1033 | } | ||
| 1034 | } | ||
| 1035 | |||
| 1036 | return true; | ||
| 1037 | } | ||
| 1038 | |||
| 1039 | /** | ||
| 1040 | * rdtgroup_mode_write - Modify the resource group's mode | ||
| 1041 | * | ||
| 1042 | */ | ||
| 1043 | static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, | ||
| 1044 | char *buf, size_t nbytes, loff_t off) | ||
| 1045 | { | ||
| 1046 | struct rdtgroup *rdtgrp; | ||
| 1047 | enum rdtgrp_mode mode; | ||
| 1048 | int ret = 0; | ||
| 1049 | |||
| 1050 | /* Valid input requires a trailing newline */ | ||
| 1051 | if (nbytes == 0 || buf[nbytes - 1] != '\n') | ||
| 1052 | return -EINVAL; | ||
| 1053 | buf[nbytes - 1] = '\0'; | ||
| 1054 | |||
| 1055 | rdtgrp = rdtgroup_kn_lock_live(of->kn); | ||
| 1056 | if (!rdtgrp) { | ||
| 1057 | rdtgroup_kn_unlock(of->kn); | ||
| 1058 | return -ENOENT; | ||
| 1059 | } | ||
| 1060 | |||
| 1061 | rdt_last_cmd_clear(); | ||
| 1062 | |||
| 1063 | mode = rdtgrp->mode; | ||
| 1064 | |||
| 1065 | if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || | ||
| 1066 | (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || | ||
| 1067 | (!strcmp(buf, "pseudo-locksetup") && | ||
| 1068 | mode == RDT_MODE_PSEUDO_LOCKSETUP) || | ||
| 1069 | (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED)) | ||
| 1070 | goto out; | ||
| 1071 | |||
| 1072 | if (mode == RDT_MODE_PSEUDO_LOCKED) { | ||
| 1073 | rdt_last_cmd_printf("cannot change pseudo-locked group\n"); | ||
| 1074 | ret = -EINVAL; | ||
| 1075 | goto out; | ||
| 1076 | } | ||
| 1077 | |||
| 1078 | if (!strcmp(buf, "shareable")) { | ||
| 1079 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { | ||
| 1080 | ret = rdtgroup_locksetup_exit(rdtgrp); | ||
| 1081 | if (ret) | ||
| 1082 | goto out; | ||
| 1083 | } | ||
| 1084 | rdtgrp->mode = RDT_MODE_SHAREABLE; | ||
| 1085 | } else if (!strcmp(buf, "exclusive")) { | ||
| 1086 | if (!rdtgroup_mode_test_exclusive(rdtgrp)) { | ||
| 1087 | rdt_last_cmd_printf("schemata overlaps\n"); | ||
| 1088 | ret = -EINVAL; | ||
| 1089 | goto out; | ||
| 1090 | } | ||
| 1091 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { | ||
| 1092 | ret = rdtgroup_locksetup_exit(rdtgrp); | ||
| 1093 | if (ret) | ||
| 1094 | goto out; | ||
| 1095 | } | ||
| 1096 | rdtgrp->mode = RDT_MODE_EXCLUSIVE; | ||
| 1097 | } else if (!strcmp(buf, "pseudo-locksetup")) { | ||
| 1098 | ret = rdtgroup_locksetup_enter(rdtgrp); | ||
| 1099 | if (ret) | ||
| 1100 | goto out; | ||
| 1101 | rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; | ||
| 1102 | } else { | ||
| 1103 | rdt_last_cmd_printf("unknown/unsupported mode\n"); | ||
| 1104 | ret = -EINVAL; | ||
| 1105 | } | ||
| 1106 | |||
| 1107 | out: | ||
| 1108 | rdtgroup_kn_unlock(of->kn); | ||
| 1109 | return ret ?: nbytes; | ||
| 1110 | } | ||
| 1111 | |||
| 1112 | /** | ||
| 1113 | * rdtgroup_cbm_to_size - Translate CBM to size in bytes | ||
| 1114 | * @r: RDT resource to which @d belongs. | ||
| 1115 | * @d: RDT domain instance. | ||
| 1116 | * @cbm: bitmask for which the size should be computed. | ||
| 1117 | * | ||
| 1118 | * The bitmask provided associated with the RDT domain instance @d will be | ||
| 1119 | * translated into how many bytes it represents. The size in bytes is | ||
| 1120 | * computed by first dividing the total cache size by the CBM length to | ||
| 1121 | * determine how many bytes each bit in the bitmask represents. The result | ||
| 1122 | * is multiplied with the number of bits set in the bitmask. | ||
| 1123 | */ | ||
| 1124 | unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, | ||
| 1125 | struct rdt_domain *d, u32 cbm) | ||
| 1126 | { | ||
| 1127 | struct cpu_cacheinfo *ci; | ||
| 1128 | unsigned int size = 0; | ||
| 1129 | int num_b, i; | ||
| 1130 | |||
| 1131 | num_b = bitmap_weight((unsigned long *)&cbm, r->cache.cbm_len); | ||
| 1132 | ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask)); | ||
| 1133 | for (i = 0; i < ci->num_leaves; i++) { | ||
| 1134 | if (ci->info_list[i].level == r->cache_level) { | ||
| 1135 | size = ci->info_list[i].size / r->cache.cbm_len * num_b; | ||
| 1136 | break; | ||
| 1137 | } | ||
| 1138 | } | ||
| 1139 | |||
| 1140 | return size; | ||
| 1141 | } | ||
| 1142 | |||
| 1143 | /** | ||
| 1144 | * rdtgroup_size_show - Display size in bytes of allocated regions | ||
| 1145 | * | ||
| 1146 | * The "size" file mirrors the layout of the "schemata" file, printing the | ||
| 1147 | * size in bytes of each region instead of the capacity bitmask. | ||
| 1148 | * | ||
| 1149 | */ | ||
| 1150 | static int rdtgroup_size_show(struct kernfs_open_file *of, | ||
| 1151 | struct seq_file *s, void *v) | ||
| 1152 | { | ||
| 1153 | struct rdtgroup *rdtgrp; | ||
| 1154 | struct rdt_resource *r; | ||
| 1155 | struct rdt_domain *d; | ||
| 1156 | unsigned int size; | ||
| 1157 | bool sep = false; | ||
| 1158 | u32 cbm; | ||
| 1159 | |||
| 1160 | rdtgrp = rdtgroup_kn_lock_live(of->kn); | ||
| 1161 | if (!rdtgrp) { | ||
| 1162 | rdtgroup_kn_unlock(of->kn); | ||
| 1163 | return -ENOENT; | ||
| 1164 | } | ||
| 1165 | |||
| 1166 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { | ||
| 1167 | seq_printf(s, "%*s:", max_name_width, rdtgrp->plr->r->name); | ||
| 1168 | size = rdtgroup_cbm_to_size(rdtgrp->plr->r, | ||
| 1169 | rdtgrp->plr->d, | ||
| 1170 | rdtgrp->plr->cbm); | ||
| 1171 | seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); | ||
| 1172 | goto out; | ||
| 1173 | } | ||
| 1174 | |||
| 1175 | for_each_alloc_enabled_rdt_resource(r) { | ||
| 1176 | seq_printf(s, "%*s:", max_name_width, r->name); | ||
| 1177 | list_for_each_entry(d, &r->domains, list) { | ||
| 1178 | if (sep) | ||
| 1179 | seq_putc(s, ';'); | ||
| 1180 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { | ||
| 1181 | size = 0; | ||
| 1182 | } else { | ||
| 1183 | cbm = d->ctrl_val[rdtgrp->closid]; | ||
| 1184 | size = rdtgroup_cbm_to_size(r, d, cbm); | ||
| 1185 | } | ||
| 1186 | seq_printf(s, "%d=%u", d->id, size); | ||
| 1187 | sep = true; | ||
| 1188 | } | ||
| 1189 | seq_putc(s, '\n'); | ||
| 1190 | } | ||
| 1191 | |||
| 1192 | out: | ||
| 1193 | rdtgroup_kn_unlock(of->kn); | ||
| 1194 | |||
| 1195 | return 0; | ||
| 1196 | } | ||
| 1197 | |||
| 743 | /* rdtgroup information files for one cache resource. */ | 1198 | /* rdtgroup information files for one cache resource. */ |
| 744 | static struct rftype res_common_files[] = { | 1199 | static struct rftype res_common_files[] = { |
| 745 | { | 1200 | { |
| @@ -792,6 +1247,13 @@ static struct rftype res_common_files[] = { | |||
| 792 | .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, | 1247 | .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, |
| 793 | }, | 1248 | }, |
| 794 | { | 1249 | { |
| 1250 | .name = "bit_usage", | ||
| 1251 | .mode = 0444, | ||
| 1252 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 1253 | .seq_show = rdt_bit_usage_show, | ||
| 1254 | .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, | ||
| 1255 | }, | ||
| 1256 | { | ||
| 795 | .name = "min_bandwidth", | 1257 | .name = "min_bandwidth", |
| 796 | .mode = 0444, | 1258 | .mode = 0444, |
| 797 | .kf_ops = &rdtgroup_kf_single_ops, | 1259 | .kf_ops = &rdtgroup_kf_single_ops, |
| @@ -853,6 +1315,22 @@ static struct rftype res_common_files[] = { | |||
| 853 | .seq_show = rdtgroup_schemata_show, | 1315 | .seq_show = rdtgroup_schemata_show, |
| 854 | .fflags = RF_CTRL_BASE, | 1316 | .fflags = RF_CTRL_BASE, |
| 855 | }, | 1317 | }, |
| 1318 | { | ||
| 1319 | .name = "mode", | ||
| 1320 | .mode = 0644, | ||
| 1321 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 1322 | .write = rdtgroup_mode_write, | ||
| 1323 | .seq_show = rdtgroup_mode_show, | ||
| 1324 | .fflags = RF_CTRL_BASE, | ||
| 1325 | }, | ||
| 1326 | { | ||
| 1327 | .name = "size", | ||
| 1328 | .mode = 0444, | ||
| 1329 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 1330 | .seq_show = rdtgroup_size_show, | ||
| 1331 | .fflags = RF_CTRL_BASE, | ||
| 1332 | }, | ||
| 1333 | |||
| 856 | }; | 1334 | }; |
| 857 | 1335 | ||
| 858 | static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) | 1336 | static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) |
| @@ -883,6 +1361,103 @@ error: | |||
| 883 | return ret; | 1361 | return ret; |
| 884 | } | 1362 | } |
| 885 | 1363 | ||
| 1364 | /** | ||
| 1365 | * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file | ||
| 1366 | * @r: The resource group with which the file is associated. | ||
| 1367 | * @name: Name of the file | ||
| 1368 | * | ||
| 1369 | * The permissions of named resctrl file, directory, or link are modified | ||
| 1370 | * to not allow read, write, or execute by any user. | ||
| 1371 | * | ||
| 1372 | * WARNING: This function is intended to communicate to the user that the | ||
| 1373 | * resctrl file has been locked down - that it is not relevant to the | ||
| 1374 | * particular state the system finds itself in. It should not be relied | ||
| 1375 | * on to protect from user access because after the file's permissions | ||
| 1376 | * are restricted the user can still change the permissions using chmod | ||
| 1377 | * from the command line. | ||
| 1378 | * | ||
| 1379 | * Return: 0 on success, <0 on failure. | ||
| 1380 | */ | ||
| 1381 | int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) | ||
| 1382 | { | ||
| 1383 | struct iattr iattr = {.ia_valid = ATTR_MODE,}; | ||
| 1384 | struct kernfs_node *kn; | ||
| 1385 | int ret = 0; | ||
| 1386 | |||
| 1387 | kn = kernfs_find_and_get_ns(r->kn, name, NULL); | ||
| 1388 | if (!kn) | ||
| 1389 | return -ENOENT; | ||
| 1390 | |||
| 1391 | switch (kernfs_type(kn)) { | ||
| 1392 | case KERNFS_DIR: | ||
| 1393 | iattr.ia_mode = S_IFDIR; | ||
| 1394 | break; | ||
| 1395 | case KERNFS_FILE: | ||
| 1396 | iattr.ia_mode = S_IFREG; | ||
| 1397 | break; | ||
| 1398 | case KERNFS_LINK: | ||
| 1399 | iattr.ia_mode = S_IFLNK; | ||
| 1400 | break; | ||
| 1401 | } | ||
| 1402 | |||
| 1403 | ret = kernfs_setattr(kn, &iattr); | ||
| 1404 | kernfs_put(kn); | ||
| 1405 | return ret; | ||
| 1406 | } | ||
| 1407 | |||
| 1408 | /** | ||
| 1409 | * rdtgroup_kn_mode_restore - Restore user access to named resctrl file | ||
| 1410 | * @r: The resource group with which the file is associated. | ||
| 1411 | * @name: Name of the file | ||
| 1412 | * @mask: Mask of permissions that should be restored | ||
| 1413 | * | ||
| 1414 | * Restore the permissions of the named file. If @name is a directory the | ||
| 1415 | * permissions of its parent will be used. | ||
| 1416 | * | ||
| 1417 | * Return: 0 on success, <0 on failure. | ||
| 1418 | */ | ||
| 1419 | int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, | ||
| 1420 | umode_t mask) | ||
| 1421 | { | ||
| 1422 | struct iattr iattr = {.ia_valid = ATTR_MODE,}; | ||
| 1423 | struct kernfs_node *kn, *parent; | ||
| 1424 | struct rftype *rfts, *rft; | ||
| 1425 | int ret, len; | ||
| 1426 | |||
| 1427 | rfts = res_common_files; | ||
| 1428 | len = ARRAY_SIZE(res_common_files); | ||
| 1429 | |||
| 1430 | for (rft = rfts; rft < rfts + len; rft++) { | ||
| 1431 | if (!strcmp(rft->name, name)) | ||
| 1432 | iattr.ia_mode = rft->mode & mask; | ||
| 1433 | } | ||
| 1434 | |||
| 1435 | kn = kernfs_find_and_get_ns(r->kn, name, NULL); | ||
| 1436 | if (!kn) | ||
| 1437 | return -ENOENT; | ||
| 1438 | |||
| 1439 | switch (kernfs_type(kn)) { | ||
| 1440 | case KERNFS_DIR: | ||
| 1441 | parent = kernfs_get_parent(kn); | ||
| 1442 | if (parent) { | ||
| 1443 | iattr.ia_mode |= parent->mode; | ||
| 1444 | kernfs_put(parent); | ||
| 1445 | } | ||
| 1446 | iattr.ia_mode |= S_IFDIR; | ||
| 1447 | break; | ||
| 1448 | case KERNFS_FILE: | ||
| 1449 | iattr.ia_mode |= S_IFREG; | ||
| 1450 | break; | ||
| 1451 | case KERNFS_LINK: | ||
| 1452 | iattr.ia_mode |= S_IFLNK; | ||
| 1453 | break; | ||
| 1454 | } | ||
| 1455 | |||
| 1456 | ret = kernfs_setattr(kn, &iattr); | ||
| 1457 | kernfs_put(kn); | ||
| 1458 | return ret; | ||
| 1459 | } | ||
| 1460 | |||
| 886 | static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, | 1461 | static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, |
| 887 | unsigned long fflags) | 1462 | unsigned long fflags) |
| 888 | { | 1463 | { |
| @@ -1224,6 +1799,9 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) | |||
| 1224 | 1799 | ||
| 1225 | if (atomic_dec_and_test(&rdtgrp->waitcount) && | 1800 | if (atomic_dec_and_test(&rdtgrp->waitcount) && |
| 1226 | (rdtgrp->flags & RDT_DELETED)) { | 1801 | (rdtgrp->flags & RDT_DELETED)) { |
| 1802 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || | ||
| 1803 | rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) | ||
| 1804 | rdtgroup_pseudo_lock_remove(rdtgrp); | ||
| 1227 | kernfs_unbreak_active_protection(kn); | 1805 | kernfs_unbreak_active_protection(kn); |
| 1228 | kernfs_put(rdtgrp->kn); | 1806 | kernfs_put(rdtgrp->kn); |
| 1229 | kfree(rdtgrp); | 1807 | kfree(rdtgrp); |
| @@ -1289,10 +1867,16 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, | |||
| 1289 | rdtgroup_default.mon.mon_data_kn = kn_mondata; | 1867 | rdtgroup_default.mon.mon_data_kn = kn_mondata; |
| 1290 | } | 1868 | } |
| 1291 | 1869 | ||
| 1870 | ret = rdt_pseudo_lock_init(); | ||
| 1871 | if (ret) { | ||
| 1872 | dentry = ERR_PTR(ret); | ||
| 1873 | goto out_mondata; | ||
| 1874 | } | ||
| 1875 | |||
| 1292 | dentry = kernfs_mount(fs_type, flags, rdt_root, | 1876 | dentry = kernfs_mount(fs_type, flags, rdt_root, |
| 1293 | RDTGROUP_SUPER_MAGIC, NULL); | 1877 | RDTGROUP_SUPER_MAGIC, NULL); |
| 1294 | if (IS_ERR(dentry)) | 1878 | if (IS_ERR(dentry)) |
| 1295 | goto out_mondata; | 1879 | goto out_psl; |
| 1296 | 1880 | ||
| 1297 | if (rdt_alloc_capable) | 1881 | if (rdt_alloc_capable) |
| 1298 | static_branch_enable_cpuslocked(&rdt_alloc_enable_key); | 1882 | static_branch_enable_cpuslocked(&rdt_alloc_enable_key); |
| @@ -1310,6 +1894,8 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, | |||
| 1310 | 1894 | ||
| 1311 | goto out; | 1895 | goto out; |
| 1312 | 1896 | ||
| 1897 | out_psl: | ||
| 1898 | rdt_pseudo_lock_release(); | ||
| 1313 | out_mondata: | 1899 | out_mondata: |
| 1314 | if (rdt_mon_capable) | 1900 | if (rdt_mon_capable) |
| 1315 | kernfs_remove(kn_mondata); | 1901 | kernfs_remove(kn_mondata); |
| @@ -1447,6 +2033,10 @@ static void rmdir_all_sub(void) | |||
| 1447 | if (rdtgrp == &rdtgroup_default) | 2033 | if (rdtgrp == &rdtgroup_default) |
| 1448 | continue; | 2034 | continue; |
| 1449 | 2035 | ||
| 2036 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || | ||
| 2037 | rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) | ||
| 2038 | rdtgroup_pseudo_lock_remove(rdtgrp); | ||
| 2039 | |||
| 1450 | /* | 2040 | /* |
| 1451 | * Give any CPUs back to the default group. We cannot copy | 2041 | * Give any CPUs back to the default group. We cannot copy |
| 1452 | * cpu_online_mask because a CPU might have executed the | 2042 | * cpu_online_mask because a CPU might have executed the |
| @@ -1483,6 +2073,8 @@ static void rdt_kill_sb(struct super_block *sb) | |||
| 1483 | reset_all_ctrls(r); | 2073 | reset_all_ctrls(r); |
| 1484 | cdp_disable_all(); | 2074 | cdp_disable_all(); |
| 1485 | rmdir_all_sub(); | 2075 | rmdir_all_sub(); |
| 2076 | rdt_pseudo_lock_release(); | ||
| 2077 | rdtgroup_default.mode = RDT_MODE_SHAREABLE; | ||
| 1486 | static_branch_disable_cpuslocked(&rdt_alloc_enable_key); | 2078 | static_branch_disable_cpuslocked(&rdt_alloc_enable_key); |
| 1487 | static_branch_disable_cpuslocked(&rdt_mon_enable_key); | 2079 | static_branch_disable_cpuslocked(&rdt_mon_enable_key); |
| 1488 | static_branch_disable_cpuslocked(&rdt_enable_key); | 2080 | static_branch_disable_cpuslocked(&rdt_enable_key); |
| @@ -1682,6 +2274,114 @@ out_destroy: | |||
| 1682 | return ret; | 2274 | return ret; |
| 1683 | } | 2275 | } |
| 1684 | 2276 | ||
| 2277 | /** | ||
| 2278 | * cbm_ensure_valid - Enforce validity on provided CBM | ||
| 2279 | * @_val: Candidate CBM | ||
| 2280 | * @r: RDT resource to which the CBM belongs | ||
| 2281 | * | ||
| 2282 | * The provided CBM represents all cache portions available for use. This | ||
| 2283 | * may be represented by a bitmap that does not consist of contiguous ones | ||
| 2284 | * and thus be an invalid CBM. | ||
| 2285 | * Here the provided CBM is forced to be a valid CBM by only considering | ||
| 2286 | * the first set of contiguous bits as valid and clearing all bits. | ||
| 2287 | * The intention here is to provide a valid default CBM with which a new | ||
| 2288 | * resource group is initialized. The user can follow this with a | ||
| 2289 | * modification to the CBM if the default does not satisfy the | ||
| 2290 | * requirements. | ||
| 2291 | */ | ||
| 2292 | static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r) | ||
| 2293 | { | ||
| 2294 | /* | ||
| 2295 | * Convert the u32 _val to an unsigned long required by all the bit | ||
| 2296 | * operations within this function. No more than 32 bits of this | ||
| 2297 | * converted value can be accessed because all bit operations are | ||
| 2298 | * additionally provided with cbm_len that is initialized during | ||
| 2299 | * hardware enumeration using five bits from the EAX register and | ||
| 2300 | * thus never can exceed 32 bits. | ||
| 2301 | */ | ||
| 2302 | unsigned long *val = (unsigned long *)_val; | ||
| 2303 | unsigned int cbm_len = r->cache.cbm_len; | ||
| 2304 | unsigned long first_bit, zero_bit; | ||
| 2305 | |||
| 2306 | if (*val == 0) | ||
| 2307 | return; | ||
| 2308 | |||
| 2309 | first_bit = find_first_bit(val, cbm_len); | ||
| 2310 | zero_bit = find_next_zero_bit(val, cbm_len, first_bit); | ||
| 2311 | |||
| 2312 | /* Clear any remaining bits to ensure contiguous region */ | ||
| 2313 | bitmap_clear(val, zero_bit, cbm_len - zero_bit); | ||
| 2314 | } | ||
| 2315 | |||
/**
 * rdtgroup_init_alloc - Initialize the new RDT group's allocations
 * @rdtgrp: resource group whose closid has already been assigned.
 *
 * A new RDT group is being created on an allocation capable (CAT)
 * supporting system. Set this group up to start off with all usable
 * allocations. That is, all shareable and unused bits.
 *
 * All-zero CBM is invalid. If there are no more shareable bits available
 * on any domain then the entire allocation will fail.
 *
 * Return: 0 on success, -ENOSPC if a domain has no usable bits left,
 * or the error from update_domains() if programming hardware fails.
 */
static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
{
	u32 used_b = 0, unused_b = 0;
	u32 closid = rdtgrp->closid;
	struct rdt_resource *r;
	enum rdtgrp_mode mode;
	struct rdt_domain *d;
	int i, ret;
	u32 *ctrl;

	/* First pass: compute a candidate CBM for every domain */
	for_each_alloc_enabled_rdt_resource(r) {
		list_for_each_entry(d, &r->domains, list) {
			d->have_new_ctrl = false;
			/* Start from what hardware already shares */
			d->new_ctrl = r->cache.shareable_bits;
			used_b = r->cache.shareable_bits;
			ctrl = d->ctrl_val;
			/* Accumulate bits used by every other allocated closid */
			for (i = 0; i < r->num_closid; i++, ctrl++) {
				if (closid_allocated(i) && i != closid) {
					mode = rdtgroup_mode_by_closid(i);
					/*
					 * NOTE(review): this break stops
					 * scanning the remaining closids of
					 * this domain as soon as one group in
					 * pseudo-locksetup is found -
					 * presumably intentional, but confirm
					 * later closids need not be counted.
					 */
					if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
						break;
					used_b |= *ctrl;
					/* Shareable bits may also be reused */
					if (mode == RDT_MODE_SHAREABLE)
						d->new_ctrl |= *ctrl;
				}
			}
			/* Pseudo-locked regions are never available */
			if (d->plr && d->plr->cbm > 0)
				used_b |= d->plr->cbm;
			/* Unused bits = complement of used, within cbm_len */
			unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
			unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
			d->new_ctrl |= unused_b;
			/*
			 * Force the initial CBM to be valid, user can
			 * modify the CBM based on system availability.
			 */
			cbm_ensure_valid(&d->new_ctrl, r);
			/* Fail if the result is below the hardware minimum */
			if (bitmap_weight((unsigned long *) &d->new_ctrl,
					  r->cache.cbm_len) <
			    r->cache.min_cbm_bits) {
				rdt_last_cmd_printf("no space on %s:%d\n",
						    r->name, d->id);
				return -ENOSPC;
			}
			d->have_new_ctrl = true;
		}
	}

	/* Second pass: program the computed CBMs into hardware */
	for_each_alloc_enabled_rdt_resource(r) {
		ret = update_domains(r, rdtgrp->closid);
		if (ret < 0) {
			rdt_last_cmd_puts("failed to initialize allocations\n");
			return ret;
		}
		rdtgrp->mode = RDT_MODE_SHAREABLE;
	}

	return 0;
}
| 2384 | |||
| 1685 | static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, | 2385 | static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, |
| 1686 | struct kernfs_node *prgrp_kn, | 2386 | struct kernfs_node *prgrp_kn, |
| 1687 | const char *name, umode_t mode, | 2387 | const char *name, umode_t mode, |
| @@ -1700,6 +2400,14 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, | |||
| 1700 | goto out_unlock; | 2400 | goto out_unlock; |
| 1701 | } | 2401 | } |
| 1702 | 2402 | ||
| 2403 | if (rtype == RDTMON_GROUP && | ||
| 2404 | (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || | ||
| 2405 | prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { | ||
| 2406 | ret = -EINVAL; | ||
| 2407 | rdt_last_cmd_puts("pseudo-locking in progress\n"); | ||
| 2408 | goto out_unlock; | ||
| 2409 | } | ||
| 2410 | |||
| 1703 | /* allocate the rdtgroup. */ | 2411 | /* allocate the rdtgroup. */ |
| 1704 | rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); | 2412 | rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); |
| 1705 | if (!rdtgrp) { | 2413 | if (!rdtgrp) { |
| @@ -1840,6 +2548,10 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, | |||
| 1840 | ret = 0; | 2548 | ret = 0; |
| 1841 | 2549 | ||
| 1842 | rdtgrp->closid = closid; | 2550 | rdtgrp->closid = closid; |
| 2551 | ret = rdtgroup_init_alloc(rdtgrp); | ||
| 2552 | if (ret < 0) | ||
| 2553 | goto out_id_free; | ||
| 2554 | |||
| 1843 | list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); | 2555 | list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); |
| 1844 | 2556 | ||
| 1845 | if (rdt_mon_capable) { | 2557 | if (rdt_mon_capable) { |
| @@ -1850,15 +2562,16 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, | |||
| 1850 | ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); | 2562 | ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); |
| 1851 | if (ret) { | 2563 | if (ret) { |
| 1852 | rdt_last_cmd_puts("kernfs subdir error\n"); | 2564 | rdt_last_cmd_puts("kernfs subdir error\n"); |
| 1853 | goto out_id_free; | 2565 | goto out_del_list; |
| 1854 | } | 2566 | } |
| 1855 | } | 2567 | } |
| 1856 | 2568 | ||
| 1857 | goto out_unlock; | 2569 | goto out_unlock; |
| 1858 | 2570 | ||
| 2571 | out_del_list: | ||
| 2572 | list_del(&rdtgrp->rdtgroup_list); | ||
| 1859 | out_id_free: | 2573 | out_id_free: |
| 1860 | closid_free(closid); | 2574 | closid_free(closid); |
| 1861 | list_del(&rdtgrp->rdtgroup_list); | ||
| 1862 | out_common_fail: | 2575 | out_common_fail: |
| 1863 | mkdir_rdt_prepare_clean(rdtgrp); | 2576 | mkdir_rdt_prepare_clean(rdtgrp); |
| 1864 | out_unlock: | 2577 | out_unlock: |
| @@ -1945,6 +2658,21 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, | |||
| 1945 | return 0; | 2658 | return 0; |
| 1946 | } | 2659 | } |
| 1947 | 2660 | ||
/*
 * rdtgroup_ctrl_remove - Unlink a control group and remove its directory.
 *
 * Marks the group deleted and takes it off the global group list; the
 * struct itself is freed later by rdtgroup_kn_unlock() once the last
 * waiter drops its reference.
 */
static int rdtgroup_ctrl_remove(struct kernfs_node *kn,
				struct rdtgroup *rdtgrp)
{
	rdtgrp->flags = RDT_DELETED;
	list_del(&rdtgrp->rdtgroup_list);

	/*
	 * one extra hold on this, will drop when we kfree(rdtgrp)
	 * in rdtgroup_kn_unlock()
	 */
	kernfs_get(kn);
	kernfs_remove(rdtgrp->kn);
	return 0;
}
| 2675 | |||
| 1948 | static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, | 2676 | static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, |
| 1949 | cpumask_var_t tmpmask) | 2677 | cpumask_var_t tmpmask) |
| 1950 | { | 2678 | { |
| @@ -1970,7 +2698,6 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, | |||
| 1970 | cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); | 2698 | cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); |
| 1971 | update_closid_rmid(tmpmask, NULL); | 2699 | update_closid_rmid(tmpmask, NULL); |
| 1972 | 2700 | ||
| 1973 | rdtgrp->flags = RDT_DELETED; | ||
| 1974 | closid_free(rdtgrp->closid); | 2701 | closid_free(rdtgrp->closid); |
| 1975 | free_rmid(rdtgrp->mon.rmid); | 2702 | free_rmid(rdtgrp->mon.rmid); |
| 1976 | 2703 | ||
| @@ -1979,14 +2706,7 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, | |||
| 1979 | */ | 2706 | */ |
| 1980 | free_all_child_rdtgrp(rdtgrp); | 2707 | free_all_child_rdtgrp(rdtgrp); |
| 1981 | 2708 | ||
| 1982 | list_del(&rdtgrp->rdtgroup_list); | 2709 | rdtgroup_ctrl_remove(kn, rdtgrp); |
| 1983 | |||
| 1984 | /* | ||
| 1985 | * one extra hold on this, will drop when we kfree(rdtgrp) | ||
| 1986 | * in rdtgroup_kn_unlock() | ||
| 1987 | */ | ||
| 1988 | kernfs_get(kn); | ||
| 1989 | kernfs_remove(rdtgrp->kn); | ||
| 1990 | 2710 | ||
| 1991 | return 0; | 2711 | return 0; |
| 1992 | } | 2712 | } |
| @@ -2014,13 +2734,19 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) | |||
| 2014 | * If the rdtgroup is a mon group and parent directory | 2734 | * If the rdtgroup is a mon group and parent directory |
| 2015 | * is a valid "mon_groups" directory, remove the mon group. | 2735 | * is a valid "mon_groups" directory, remove the mon group. |
| 2016 | */ | 2736 | */ |
| 2017 | if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) | 2737 | if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) { |
| 2018 | ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); | 2738 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || |
| 2019 | else if (rdtgrp->type == RDTMON_GROUP && | 2739 | rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { |
| 2020 | is_mon_groups(parent_kn, kn->name)) | 2740 | ret = rdtgroup_ctrl_remove(kn, rdtgrp); |
| 2741 | } else { | ||
| 2742 | ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); | ||
| 2743 | } | ||
| 2744 | } else if (rdtgrp->type == RDTMON_GROUP && | ||
| 2745 | is_mon_groups(parent_kn, kn->name)) { | ||
| 2021 | ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); | 2746 | ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); |
| 2022 | else | 2747 | } else { |
| 2023 | ret = -EPERM; | 2748 | ret = -EPERM; |
| 2749 | } | ||
| 2024 | 2750 | ||
| 2025 | out: | 2751 | out: |
| 2026 | rdtgroup_kn_unlock(kn); | 2752 | rdtgroup_kn_unlock(kn); |
| @@ -2046,7 +2772,8 @@ static int __init rdtgroup_setup_root(void) | |||
| 2046 | int ret; | 2772 | int ret; |
| 2047 | 2773 | ||
| 2048 | rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, | 2774 | rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, |
| 2049 | KERNFS_ROOT_CREATE_DEACTIVATED, | 2775 | KERNFS_ROOT_CREATE_DEACTIVATED | |
| 2776 | KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, | ||
| 2050 | &rdtgroup_default); | 2777 | &rdtgroup_default); |
| 2051 | if (IS_ERR(rdt_root)) | 2778 | if (IS_ERR(rdt_root)) |
| 2052 | return PTR_ERR(rdt_root); | 2779 | return PTR_ERR(rdt_root); |
| @@ -2102,6 +2829,29 @@ int __init rdtgroup_init(void) | |||
| 2102 | if (ret) | 2829 | if (ret) |
| 2103 | goto cleanup_mountpoint; | 2830 | goto cleanup_mountpoint; |
| 2104 | 2831 | ||
| 2832 | /* | ||
| 2833 | * Adding the resctrl debugfs directory here may not be ideal since | ||
| 2834 | * it would let the resctrl debugfs directory appear on the debugfs | ||
| 2835 | * filesystem before the resctrl filesystem is mounted. | ||
| 2836 | * It may also be ok since that would enable debugging of RDT before | ||
| 2837 | * resctrl is mounted. | ||
| 2838 | * The reason why the debugfs directory is created here and not in | ||
| 2839 | * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and | ||
| 2840 | * during the debugfs directory creation also &sb->s_type->i_mutex_key | ||
| 2841 | * (the lockdep class of inode->i_rwsem). Other filesystem | ||
| 2842 | * interactions (eg. SyS_getdents) have the lock ordering: | ||
| 2843 | * &sb->s_type->i_mutex_key --> &mm->mmap_sem | ||
| 2844 | * During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex | ||
| 2845 | * is taken, thus creating dependency: | ||
| 2846 | * &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause | ||
| 2847 | * issues considering the other two lock dependencies. | ||
| 2848 | * By creating the debugfs directory here we avoid a dependency | ||
| 2849 | * that may cause deadlock (even though file operations cannot | ||
| 2850 | * occur until the filesystem is mounted, but I do not know how to | ||
| 2851 | * tell lockdep that). | ||
| 2852 | */ | ||
| 2853 | debugfs_resctrl = debugfs_create_dir("resctrl", NULL); | ||
| 2854 | |||
| 2105 | return 0; | 2855 | return 0; |
| 2106 | 2856 | ||
| 2107 | cleanup_mountpoint: | 2857 | cleanup_mountpoint: |
| @@ -2111,3 +2861,11 @@ cleanup_root: | |||
| 2111 | 2861 | ||
| 2112 | return ret; | 2862 | return ret; |
| 2113 | } | 2863 | } |
| 2864 | |||
/*
 * rdtgroup_exit - Tear down the resctrl interface.
 *
 * Undoes the setup performed at init time in reverse order: remove the
 * debugfs tree, unregister the filesystem type, drop the sysfs mount
 * point and finally destroy the kernfs root. The order matters - the
 * filesystem must be unregistered before its kernfs root is destroyed.
 */
void __exit rdtgroup_exit(void)
{
	debugfs_remove_recursive(debugfs_resctrl);
	unregister_filesystem(&rdt_fs_type);
	sysfs_remove_mount_point(fs_kobj, "resctrl");
	kernfs_destroy_root(rdt_root);
}
