 Documentation/admin-guide/kernel-parameters.rst                       |    1
 Documentation/admin-guide/kernel-parameters.txt                       |    6
 Documentation/x86/intel_rdt_ui.txt                                    |  323
 MAINTAINERS                                                           |    2
 arch/x86/Kconfig                                                      |   12
 arch/x86/events/intel/Makefile                                        |    2
 arch/x86/events/intel/cqm.c                                           | 1766
 arch/x86/include/asm/intel_rdt.h                                      |  286
 arch/x86/include/asm/intel_rdt_common.h                               |   27
 arch/x86/include/asm/intel_rdt_sched.h                                |   92
 arch/x86/kernel/cpu/Makefile                                          |    2
 arch/x86/kernel/cpu/intel_rdt.c                                       |  375
 arch/x86/kernel/cpu/intel_rdt.h                                       |  440
 arch/x86/kernel/cpu/{intel_rdt_schemata.c => intel_rdt_ctrlmondata.c} |   67
 arch/x86/kernel/cpu/intel_rdt_monitor.c                               |  499
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c                              | 1117
 arch/x86/kernel/process_32.c                                          |    2
 arch/x86/kernel/process_64.c                                          |    2
 include/linux/perf_event.h                                            |   18
 include/linux/sched.h                                                 |    5
 kernel/events/core.c                                                  |   14
 21 files changed, 2631 insertions(+), 2427 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index d76ab3907e2b..b2598cc9834c 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -138,6 +138,7 @@ parameter is applicable::
 	PPT	Parallel port support is enabled.
 	PS2	Appropriate PS/2 support is enabled.
 	RAM	RAM disk support is enabled.
+	RDT	Intel Resource Director Technology.
 	S390	S390 architecture is enabled.
 	SCSI	Appropriate SCSI support is enabled.
 	A lot of drivers have their options described inside
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index dad6fa01af95..591d48f3a7de 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3612,6 +3612,12 @@
 			Run specified binary instead of /init from the ramdisk,
 			used for early userspace startup. See initrd.
 
+	rdt=		[HW,X86,RDT]
+			Turn on/off individual RDT features. List is:
+			cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, mba.
+			E.g. to turn on cmt and turn off mba use:
+				rdt=cmt,!mba
+
 	reboot=		[KNL]
 			Format (x86 or x86_64):
 			[w[arm] | c[old] | h[ard] | s[oft] | g[pio]] \
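The new rdt= option is parsed from the kernel command line at boot. As a
minimal sketch (assuming a GRUB based distribution and a kernel built with
CONFIG_INTEL_RDT; file locations and the regeneration command differ between
distributions), enabling CMT while disabling MBA persistently could look like
adding "rdt=cmt,!mba" to GRUB_CMDLINE_LINUX in /etc/default/grub and then:

  # grub2-mkconfig -o /boot/grub2/grub.cfg
  # reboot
  # cat /proc/cmdline        # confirm the option reached the kernel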
diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt
index c491a1b82de2..4d8848e4e224 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -6,8 +6,8 @@ Fenghua Yu <fenghua.yu@intel.com>
 Tony Luck <tony.luck@intel.com>
 Vikas Shivappa <vikas.shivappa@intel.com>
 
-This feature is enabled by the CONFIG_INTEL_RDT_A Kconfig and the
-X86 /proc/cpuinfo flag bits "rdt", "cat_l3" and "cdp_l3".
+This feature is enabled by the CONFIG_INTEL_RDT Kconfig and the
+X86 /proc/cpuinfo flag bits "rdt", "cqm", "cat_l3" and "cdp_l3".
 
 To use the feature mount the file system:
 
@@ -17,6 +17,13 @@ mount options are:
 
 "cdp": Enable code/data prioritization in L3 cache allocations.
 
+RDT features are orthogonal. A particular system may support only
+monitoring, only control, or both monitoring and control.
+
+The mount succeeds if either of allocation or monitoring is present, but
+only those files and directories supported by the system will be created.
+For more details on the behavior of the interface during monitoring
+and allocation, see the "Resource alloc and monitor groups" section.
 
 Info directory
 --------------
@@ -24,7 +31,12 @@ Info directory
 The 'info' directory contains information about the enabled
 resources. Each resource has its own subdirectory. The subdirectory
 names reflect the resource names.
-Cache resource(L3/L2) subdirectory contains the following files:
+
+Each subdirectory contains the following files with respect to
+allocation:
+
+Cache resource(L3/L2) subdirectory contains the following files
+related to allocation:
 
 "num_closids":	The number of CLOSIDs which are valid for this
 		resource. The kernel uses the smallest number of
@@ -36,7 +48,15 @@ Cache resource(L3/L2) subdirectory contains the following files:
 "min_cbm_bits":	The minimum number of consecutive bits which
 		must be set when writing a mask.
 
-Memory bandwitdh(MB) subdirectory contains the following files:
+"shareable_bits":	Bitmask of shareable resource with other executing
+			entities (e.g. I/O). User can use this when
+			setting up exclusive cache partitions. Note that
+			some platforms support devices that have their
+			own settings for cache use which can over-ride
+			these bits.
+
+Memory bandwitdh(MB) subdirectory contains the following files
+with respect to allocation:
 
 "min_bandwidth":	The minimum memory bandwidth percentage which
 			user can request.
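As a small usage sketch for the info files described in this hunk (assuming
an L3 CAT capable system and the usual /sys/fs/resctrl mount point; the exact
set of files present depends on the hardware):

  # mount -t resctrl resctrl /sys/fs/resctrl
  # cat /sys/fs/resctrl/info/L3/num_closids
  # cat /sys/fs/resctrl/info/L3/min_cbm_bits
  # cat /sys/fs/resctrl/info/L3/shareable_bits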
@@ -52,48 +72,152 @@ Memory bandwitdh(MB) subdirectory contains the following files:
 		non-linear. This field is purely informational
 		only.
 
-Resource groups
----------------
+If RDT monitoring is available there will be an "L3_MON" directory
+with the following files:
+
+"num_rmids":		The number of RMIDs available. This is the
+			upper bound for how many "CTRL_MON" + "MON"
+			groups can be created.
+
+"mon_features":	Lists the monitoring events if
+			monitoring is enabled for the resource.
+
+"max_threshold_occupancy":
+			Read/write file provides the largest value (in
+			bytes) at which a previously used LLC_occupancy
+			counter can be considered for re-use.
+
+
+Resource alloc and monitor groups
+---------------------------------
+
 Resource groups are represented as directories in the resctrl file
-system. The default group is the root directory. Other groups may be
-created as desired by the system administrator using the "mkdir(1)"
-command, and removed using "rmdir(1)".
+system. The default group is the root directory which, immediately
+after mounting, owns all the tasks and cpus in the system and can make
+full use of all resources.
+
+On a system with RDT control features additional directories can be
+created in the root directory that specify different amounts of each
+resource (see "schemata" below). The root and these additional top level
+directories are referred to as "CTRL_MON" groups below.
+
+On a system with RDT monitoring the root directory and other top level
+directories contain a directory named "mon_groups" in which additional
+directories can be created to monitor subsets of tasks in the CTRL_MON
+group that is their ancestor. These are called "MON" groups in the rest
+of this document.
+
+Removing a directory will move all tasks and cpus owned by the group it
+represents to the parent. Removing one of the created CTRL_MON groups
+will automatically remove all MON groups below it.
+
+All groups contain the following files:
+
+"tasks":
+	Reading this file shows the list of all tasks that belong to
+	this group. Writing a task id to the file will add a task to the
+	group. If the group is a CTRL_MON group the task is removed from
+	whichever previous CTRL_MON group owned the task and also from
+	any MON group that owned the task. If the group is a MON group,
+	then the task must already belong to the CTRL_MON parent of this
+	group. The task is removed from any previous MON group.
+
+
+"cpus":
+	Reading this file shows a bitmask of the logical CPUs owned by
+	this group. Writing a mask to this file will add and remove
+	CPUs to/from this group. As with the tasks file a hierarchy is
+	maintained where MON groups may only include CPUs owned by the
+	parent CTRL_MON group.
+
 
-There are three files associated with each group:
+"cpus_list":
+	Just like "cpus", only using ranges of CPUs instead of bitmasks.
 
-"tasks": A list of tasks that belongs to this group. Tasks can be
-	added to a group by writing the task ID to the "tasks" file
-	(which will automatically remove them from the previous
-	group to which they belonged). New tasks created by fork(2)
-	and clone(2) are added to the same group as their parent.
-	If a pid is not in any sub partition, it is in root partition
-	(i.e. default partition).
 
-"cpus": A bitmask of logical CPUs assigned to this group. Writing
-	a new mask can add/remove CPUs from this group. Added CPUs
-	are removed from their previous group. Removed ones are
-	given to the default (root) group. You cannot remove CPUs
-	from the default group.
+When control is enabled all CTRL_MON groups will also contain:
 
-"cpus_list": One or more CPU ranges of logical CPUs assigned to this
-	group. Same rules apply like for the "cpus" file.
+"schemata":
+	A list of all the resources available to this group.
+	Each resource has its own line and format - see below for details.
 
-"schemata": A list of all the resources available to this group.
-	Each resource has its own line and format - see below for
-	details.
+When monitoring is enabled all MON groups will also contain:
 
-When a task is running the following rules define which resources
-are available to it:
+"mon_data":
+	This contains a set of files organized by L3 domain and by
+	RDT event. E.g. on a system with two L3 domains there will
+	be subdirectories "mon_L3_00" and "mon_L3_01". Each of these
+	directories have one file per event (e.g. "llc_occupancy",
+	"mbm_total_bytes", and "mbm_local_bytes"). In a MON group these
+	files provide a read out of the current value of the event for
+	all tasks in the group. In CTRL_MON groups these files provide
+	the sum for all tasks in the CTRL_MON group and all tasks in
+	MON groups. Please see example section for more details on usage.
+
+Resource allocation rules
+-------------------------
+When a task is running the following rules define which resources are
+available to it:
 
 1) If the task is a member of a non-default group, then the schemata
    for that group is used.
 
 2) Else if the task belongs to the default group, but is running on a
-   CPU that is assigned to some specific group, then the schemata for
-   the CPU's group is used.
+   CPU that is assigned to some specific group, then the schemata for the
+   CPU's group is used.
 
 3) Otherwise the schemata for the default group is used.
 
+Resource monitoring rules
+-------------------------
+1) If a task is a member of a MON group, or non-default CTRL_MON group
+   then RDT events for the task will be reported in that group.
+
+2) If a task is a member of the default CTRL_MON group, but is running
+   on a CPU that is assigned to some specific group, then the RDT events
+   for the task will be reported in that group.
+
+3) Otherwise RDT events for the task will be reported in the root level
+   "mon_data" group.
+
+
+Notes on cache occupancy monitoring and control
+-----------------------------------------------
+When moving a task from one group to another you should remember that
+this only affects *new* cache allocations by the task. E.g. you may have
+a task in a monitor group showing 3 MB of cache occupancy. If you move
+to a new group and immediately check the occupancy of the old and new
+groups you will likely see that the old group is still showing 3 MB and
+the new group zero. When the task accesses locations still in cache from
+before the move, the h/w does not update any counters. On a busy system
+you will likely see the occupancy in the old group go down as cache lines
+are evicted and re-used while the occupancy in the new group rises as
+the task accesses memory and loads into the cache are counted based on
+membership in the new group.
+
+The same applies to cache allocation control. Moving a task to a group
+with a smaller cache partition will not evict any cache lines. The
+process may continue to use them from the old partition.
+
+Hardware uses CLOSid(Class of service ID) and an RMID(Resource monitoring ID)
+to identify a control group and a monitoring group respectively. Each of
+the resource groups are mapped to these IDs based on the kind of group. The
+number of CLOSid and RMID are limited by the hardware and hence the creation of
+a "CTRL_MON" directory may fail if we run out of either CLOSID or RMID
+and creation of "MON" group may fail if we run out of RMIDs.
+
+max_threshold_occupancy - generic concepts
+------------------------------------------
+
+Note that an RMID once freed may not be immediately available for use as
+the RMID is still tagged the cache lines of the previous user of RMID.
+Hence such RMIDs are placed on limbo list and checked back if the cache
+occupancy has gone down. If there is a time when system has a lot of
+limbo RMIDs but which are not ready to be used, user may see an -EBUSY
+during mkdir.
+
+max_threshold_occupancy is a user configurable value to determine the
+occupancy at which an RMID can be freed.
 
 Schemata files - general concepts
 ---------------------------------
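To make the group semantics above concrete, here is a minimal sketch that
builds a CTRL_MON group with one MON group under it, moves a (hypothetical)
task and a pair of CPUs into it, and raises the RMID re-use threshold. It
assumes both allocation and monitoring are supported and that resctrl is
already mounted:

  # cd /sys/fs/resctrl
  # mkdir grp0                               # new CTRL_MON group
  # mkdir grp0/mon_groups/sub0               # MON group under grp0
  # echo 1234 > grp0/tasks                   # task joins the CTRL_MON group
  # echo 1234 > grp0/mon_groups/sub0/tasks   # now also tracked by sub0
  # echo c > grp0/cpus                       # hand CPUs 2-3 to grp0
  # echo 262144 > info/L3_MON/max_threshold_occupancy
  # rmdir grp0                               # removes sub0 too; tasks and cpus return to the root group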
@@ -143,22 +267,22 @@ SKUs. Using a high bandwidth and a low bandwidth setting on two threads
 sharing a core will result in both threads being throttled to use the
 low bandwidth.
 
-L3 details (code and data prioritization disabled)
---------------------------------------------------
+L3 schemata file details (code and data prioritization disabled)
+----------------------------------------------------------------
 With CDP disabled the L3 schemata format is:
 
 	L3:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
 
-L3 details (CDP enabled via mount option to resctrl)
-----------------------------------------------------
+L3 schemata file details (CDP enabled via mount option to resctrl)
+------------------------------------------------------------------
 When CDP is enabled L3 control is split into two separate resources
 so you can specify independent masks for code and data like this:
 
 	L3data:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
 	L3code:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
 
-L2 details
-----------
+L2 schemata file details
+------------------------
 L2 cache does not support code and data prioritization, so the
 schemata format is always:
 
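Because a cache bit mask has to be a contiguous run of set bits on current
hardware (see "min_cbm_bits" above), it can be convenient to compute it with
shell arithmetic rather than by hand. A sketch, assuming a control group p0
created earlier and a resource with a 20 bit mask:

  # MASK=$(printf '%x' $(( ((1 << 4) - 1) << 8 )))   # 4 consecutive ways starting at bit 8 -> f00
  # echo "L3:0=$MASK;1=$MASK" > /sys/fs/resctrl/p0/schemata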
@@ -185,6 +309,8 @@ L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
 L3DATA:0=fffff;1=fffff;2=3c0;3=fffff
 L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
 
+Examples for RDT allocation usage:
+
 Example 1
 ---------
 On a two socket machine (one L3 cache per socket) with just four bits
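Note that the split L3DATA/L3CODE lines shown above only exist when the
filesystem is mounted with the cdp option; remounting switches between the
two layouts, e.g.:

  # umount /sys/fs/resctrl
  # mount -t resctrl -o cdp resctrl /sys/fs/resctrl
  # cat /sys/fs/resctrl/schemata       # now lists L3DATA: and L3CODE: lines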
@@ -410,3 +536,124 @@ void main(void)
 		/* code to read and write directory contents */
 		resctrl_release_lock(fd);
 	}
+
+Examples for RDT Monitoring along with allocation usage:
+
+Reading monitored data
+----------------------
+Reading an event file (for ex: mon_data/mon_L3_00/llc_occupancy) would
+show the current snapshot of LLC occupancy of the corresponding MON
+group or CTRL_MON group.
+
+
+Example 1 (Monitor CTRL_MON group and subset of tasks in CTRL_MON group)
+---------
+On a two socket machine (one L3 cache per socket) with just four bits
+for cache bit masks
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir p0 p1
+# echo "L3:0=3;1=c" > /sys/fs/resctrl/p0/schemata
+# echo "L3:0=3;1=3" > /sys/fs/resctrl/p1/schemata
+# echo 5678 > p1/tasks
+# echo 5679 > p1/tasks
+
+The default resource group is unmodified, so we have access to all parts
+of all caches (its schemata file reads "L3:0=f;1=f").
+
+Tasks that are under the control of group "p0" may only allocate from the
+"lower" 50% on cache ID 0, and the "upper" 50% of cache ID 1.
+Tasks in group "p1" use the "lower" 50% of cache on both sockets.
+
+Create monitor groups and assign a subset of tasks to each monitor group.
+
+# cd /sys/fs/resctrl/p1/mon_groups
+# mkdir m11 m12
+# echo 5678 > m11/tasks
+# echo 5679 > m12/tasks
+
+fetch data (data shown in bytes)
+
+# cat m11/mon_data/mon_L3_00/llc_occupancy
+16234000
+# cat m11/mon_data/mon_L3_01/llc_occupancy
+14789000
+# cat m12/mon_data/mon_L3_00/llc_occupancy
+16789000
+
+The parent ctrl_mon group shows the aggregated data.
+
+# cat /sys/fs/resctrl/p1/mon_data/mon_l3_00/llc_occupancy
+31234000
+
+Example 2 (Monitor a task from its creation)
+---------
+On a two socket machine (one L3 cache per socket)
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir p0 p1
+
+An RMID is allocated to the group once its created and hence the <cmd>
+below is monitored from its creation.
+
+# echo $$ > /sys/fs/resctrl/p1/tasks
+# <cmd>
+
+Fetch the data
+
+# cat /sys/fs/resctrl/p1/mon_data/mon_l3_00/llc_occupancy
+31789000
+
+Example 3 (Monitor without CAT support or before creating CAT groups)
+---------
+
+Assume a system like HSW has only CQM and no CAT support. In this case
+the resctrl will still mount but cannot create CTRL_MON directories.
+But user can create different MON groups within the root group thereby
+able to monitor all tasks including kernel threads.
+
+This can also be used to profile jobs cache size footprint before being
+able to allocate them to different allocation groups.
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir mon_groups/m01
+# mkdir mon_groups/m02
+
+# echo 3478 > /sys/fs/resctrl/mon_groups/m01/tasks
+# echo 2467 > /sys/fs/resctrl/mon_groups/m02/tasks
+
+Monitor the groups separately and also get per domain data. From the
+below its apparent that the tasks are mostly doing work on
+domain(socket) 0.
+
+# cat /sys/fs/resctrl/mon_groups/m01/mon_L3_00/llc_occupancy
+31234000
+# cat /sys/fs/resctrl/mon_groups/m01/mon_L3_01/llc_occupancy
+34555
+# cat /sys/fs/resctrl/mon_groups/m02/mon_L3_00/llc_occupancy
+31234000
+# cat /sys/fs/resctrl/mon_groups/m02/mon_L3_01/llc_occupancy
+32789
+
+
+Example 4 (Monitor real time tasks)
+-----------------------------------
+
+A single socket system which has real time tasks running on cores 4-7
+and non real time tasks on other cpus. We want to monitor the cache
+occupancy of the real time threads on these cores.
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir p1
+
+Move the cpus 4-7 over to p1
+# echo f0 > p0/cpus
+
+View the llc occupancy snapshot
+
+# cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy
+11234000
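The mbm_total_bytes and mbm_local_bytes files report cumulative byte counts,
so a bandwidth figure is obtained by sampling twice and dividing by the
interval. A rough sketch, assuming group p1 from the examples above and that
"mon_features" lists the MBM events:

  # F=/sys/fs/resctrl/p1/mon_data/mon_L3_00/mbm_total_bytes
  # B0=$(cat $F); sleep 1; B1=$(cat $F)
  # echo "$(( (B1 - B0) / 1048576 )) MiB/s"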
diff --git a/MAINTAINERS b/MAINTAINERS
index b81e93b71c4b..8ef4694af6e8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11121,7 +11121,7 @@ M:	Fenghua Yu <fenghua.yu@intel.com>
 L:	linux-kernel@vger.kernel.org
 S:	Supported
 F:	arch/x86/kernel/cpu/intel_rdt*
-F:	arch/x86/include/asm/intel_rdt*
+F:	arch/x86/include/asm/intel_rdt_sched.h
 F:	Documentation/x86/intel_rdt*
 
 READ-COPY UPDATE (RCU)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b4b27ab016f6..acb366bf6bc1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -429,16 +429,16 @@ config GOLDFISH
 	def_bool y
 	depends on X86_GOLDFISH
 
-config INTEL_RDT_A
-	bool "Intel Resource Director Technology Allocation support"
+config INTEL_RDT
+	bool "Intel Resource Director Technology support"
 	default n
 	depends on X86 && CPU_SUP_INTEL
 	select KERNFS
 	help
-	  Select to enable resource allocation which is a sub-feature of
-	  Intel Resource Director Technology(RDT). More information about
-	  RDT can be found in the Intel x86 Architecture Software
-	  Developer Manual.
+	  Select to enable resource allocation and monitoring which are
+	  sub-features of Intel Resource Director Technology(RDT). More
+	  information about RDT can be found in the Intel x86
+	  Architecture Software Developer Manual.
 
 	  Say N if unsure.
 
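Because the Kconfig symbol is renamed, a .config that previously set
CONFIG_INTEL_RDT_A=y will not pick up the new option automatically. One way
to refresh it, as a sketch using the kernel's scripts/config helper from the
top of the source tree:

  # ./scripts/config --enable INTEL_RDT
  # make olddefconfig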
diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile
index 06c2baa51814..e9d8520a801a 100644
--- a/arch/x86/events/intel/Makefile
+++ b/arch/x86/events/intel/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_CPU_SUP_INTEL)		+= core.o bts.o cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= core.o bts.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= ds.o knc.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= lbr.o p4.o p6.o pt.o
 obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL)	+= intel-rapl-perf.o
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
deleted file mode 100644
index 2521f771f2f5..000000000000
--- a/arch/x86/events/intel/cqm.c
+++ /dev/null
@@ -1,1766 +0,0 @@
| 1 | /* | ||
| 2 | * Intel Cache Quality-of-Service Monitoring (CQM) support. | ||
| 3 | * | ||
| 4 | * Based very, very heavily on work by Peter Zijlstra. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/perf_event.h> | ||
| 8 | #include <linux/slab.h> | ||
| 9 | #include <asm/cpu_device_id.h> | ||
| 10 | #include <asm/intel_rdt_common.h> | ||
| 11 | #include "../perf_event.h" | ||
| 12 | |||
| 13 | #define MSR_IA32_QM_CTR 0x0c8e | ||
| 14 | #define MSR_IA32_QM_EVTSEL 0x0c8d | ||
| 15 | |||
| 16 | #define MBM_CNTR_WIDTH 24 | ||
| 17 | /* | ||
| 18 | * Guaranteed time in ms as per SDM where MBM counters will not overflow. | ||
| 19 | */ | ||
| 20 | #define MBM_CTR_OVERFLOW_TIME 1000 | ||
| 21 | |||
| 22 | static u32 cqm_max_rmid = -1; | ||
| 23 | static unsigned int cqm_l3_scale; /* supposedly cacheline size */ | ||
| 24 | static bool cqm_enabled, mbm_enabled; | ||
| 25 | unsigned int mbm_socket_max; | ||
| 26 | |||
| 27 | /* | ||
| 28 | * The cached intel_pqr_state is strictly per CPU and can never be | ||
| 29 | * updated from a remote CPU. Both functions which modify the state | ||
| 30 | * (intel_cqm_event_start and intel_cqm_event_stop) are called with | ||
| 31 | * interrupts disabled, which is sufficient for the protection. | ||
| 32 | */ | ||
| 33 | DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); | ||
| 34 | static struct hrtimer *mbm_timers; | ||
| 35 | /** | ||
| 36 | * struct sample - mbm event's (local or total) data | ||
| 37 | * @total_bytes #bytes since we began monitoring | ||
| 38 | * @prev_msr previous value of MSR | ||
| 39 | */ | ||
| 40 | struct sample { | ||
| 41 | u64 total_bytes; | ||
| 42 | u64 prev_msr; | ||
| 43 | }; | ||
| 44 | |||
| 45 | /* | ||
| 46 | * samples profiled for total memory bandwidth type events | ||
| 47 | */ | ||
| 48 | static struct sample *mbm_total; | ||
| 49 | /* | ||
| 50 | * samples profiled for local memory bandwidth type events | ||
| 51 | */ | ||
| 52 | static struct sample *mbm_local; | ||
| 53 | |||
| 54 | #define pkg_id topology_physical_package_id(smp_processor_id()) | ||
| 55 | /* | ||
| 56 | * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array. | ||
| 57 | * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of | ||
| 58 | * rmids per socket, an example is given below | ||
| 59 | * RMID1 of Socket0: vrmid = 1 | ||
| 60 | * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1 | ||
| 61 | * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1 | ||
| 62 | */ | ||
| 63 | #define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid) | ||
| 64 | /* | ||
| 65 | * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. | ||
| 66 | * Also protects event->hw.cqm_rmid | ||
| 67 | * | ||
| 68 | * Hold either for stability, both for modification of ->hw.cqm_rmid. | ||
| 69 | */ | ||
| 70 | static DEFINE_MUTEX(cache_mutex); | ||
| 71 | static DEFINE_RAW_SPINLOCK(cache_lock); | ||
| 72 | |||
| 73 | /* | ||
| 74 | * Groups of events that have the same target(s), one RMID per group. | ||
| 75 | */ | ||
| 76 | static LIST_HEAD(cache_groups); | ||
| 77 | |||
| 78 | /* | ||
| 79 | * Mask of CPUs for reading CQM values. We only need one per-socket. | ||
| 80 | */ | ||
| 81 | static cpumask_t cqm_cpumask; | ||
| 82 | |||
| 83 | #define RMID_VAL_ERROR (1ULL << 63) | ||
| 84 | #define RMID_VAL_UNAVAIL (1ULL << 62) | ||
| 85 | |||
| 86 | /* | ||
| 87 | * Event IDs are used to program IA32_QM_EVTSEL before reading event | ||
| 88 | * counter from IA32_QM_CTR | ||
| 89 | */ | ||
| 90 | #define QOS_L3_OCCUP_EVENT_ID 0x01 | ||
| 91 | #define QOS_MBM_TOTAL_EVENT_ID 0x02 | ||
| 92 | #define QOS_MBM_LOCAL_EVENT_ID 0x03 | ||
| 93 | |||
| 94 | /* | ||
| 95 | * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). | ||
| 96 | * | ||
| 97 | * This rmid is always free and is guaranteed to have an associated | ||
| 98 | * near-zero occupancy value, i.e. no cachelines are tagged with this | ||
| 99 | * RMID, once __intel_cqm_rmid_rotate() returns. | ||
| 100 | */ | ||
| 101 | static u32 intel_cqm_rotation_rmid; | ||
| 102 | |||
| 103 | #define INVALID_RMID (-1) | ||
| 104 | |||
| 105 | /* | ||
| 106 | * Is @rmid valid for programming the hardware? | ||
| 107 | * | ||
| 108 | * rmid 0 is reserved by the hardware for all non-monitored tasks, which | ||
| 109 | * means that we should never come across an rmid with that value. | ||
| 110 | * Likewise, an rmid value of -1 is used to indicate "no rmid currently | ||
| 111 | * assigned" and is used as part of the rotation code. | ||
| 112 | */ | ||
| 113 | static inline bool __rmid_valid(u32 rmid) | ||
| 114 | { | ||
| 115 | if (!rmid || rmid == INVALID_RMID) | ||
| 116 | return false; | ||
| 117 | |||
| 118 | return true; | ||
| 119 | } | ||
| 120 | |||
| 121 | static u64 __rmid_read(u32 rmid) | ||
| 122 | { | ||
| 123 | u64 val; | ||
| 124 | |||
| 125 | /* | ||
| 126 | * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, | ||
| 127 | * it just says that to increase confusion. | ||
| 128 | */ | ||
| 129 | wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); | ||
| 130 | rdmsrl(MSR_IA32_QM_CTR, val); | ||
| 131 | |||
| 132 | /* | ||
| 133 | * Aside from the ERROR and UNAVAIL bits, assume this thing returns | ||
| 134 | * the number of cachelines tagged with @rmid. | ||
| 135 | */ | ||
| 136 | return val; | ||
| 137 | } | ||
| 138 | |||
| 139 | enum rmid_recycle_state { | ||
| 140 | RMID_YOUNG = 0, | ||
| 141 | RMID_AVAILABLE, | ||
| 142 | RMID_DIRTY, | ||
| 143 | }; | ||
| 144 | |||
| 145 | struct cqm_rmid_entry { | ||
| 146 | u32 rmid; | ||
| 147 | enum rmid_recycle_state state; | ||
| 148 | struct list_head list; | ||
| 149 | unsigned long queue_time; | ||
| 150 | }; | ||
| 151 | |||
| 152 | /* | ||
| 153 | * cqm_rmid_free_lru - A least recently used list of RMIDs. | ||
| 154 | * | ||
| 155 | * Oldest entry at the head, newest (most recently used) entry at the | ||
| 156 | * tail. This list is never traversed, it's only used to keep track of | ||
| 157 | * the lru order. That is, we only pick entries of the head or insert | ||
| 158 | * them on the tail. | ||
| 159 | * | ||
| 160 | * All entries on the list are 'free', and their RMIDs are not currently | ||
| 161 | * in use. To mark an RMID as in use, remove its entry from the lru | ||
| 162 | * list. | ||
| 163 | * | ||
| 164 | * | ||
| 165 | * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. | ||
| 166 | * | ||
| 167 | * This list is contains RMIDs that no one is currently using but that | ||
| 168 | * may have a non-zero occupancy value associated with them. The | ||
| 169 | * rotation worker moves RMIDs from the limbo list to the free list once | ||
| 170 | * the occupancy value drops below __intel_cqm_threshold. | ||
| 171 | * | ||
| 172 | * Both lists are protected by cache_mutex. | ||
| 173 | */ | ||
| 174 | static LIST_HEAD(cqm_rmid_free_lru); | ||
| 175 | static LIST_HEAD(cqm_rmid_limbo_lru); | ||
| 176 | |||
| 177 | /* | ||
| 178 | * We use a simple array of pointers so that we can lookup a struct | ||
| 179 | * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() | ||
| 180 | * and __put_rmid() from having to worry about dealing with struct | ||
| 181 | * cqm_rmid_entry - they just deal with rmids, i.e. integers. | ||
| 182 | * | ||
| 183 | * Once this array is initialized it is read-only. No locks are required | ||
| 184 | * to access it. | ||
| 185 | * | ||
| 186 | * All entries for all RMIDs can be looked up in the this array at all | ||
| 187 | * times. | ||
| 188 | */ | ||
| 189 | static struct cqm_rmid_entry **cqm_rmid_ptrs; | ||
| 190 | |||
| 191 | static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) | ||
| 192 | { | ||
| 193 | struct cqm_rmid_entry *entry; | ||
| 194 | |||
| 195 | entry = cqm_rmid_ptrs[rmid]; | ||
| 196 | WARN_ON(entry->rmid != rmid); | ||
| 197 | |||
| 198 | return entry; | ||
| 199 | } | ||
| 200 | |||
| 201 | /* | ||
| 202 | * Returns < 0 on fail. | ||
| 203 | * | ||
| 204 | * We expect to be called with cache_mutex held. | ||
| 205 | */ | ||
| 206 | static u32 __get_rmid(void) | ||
| 207 | { | ||
| 208 | struct cqm_rmid_entry *entry; | ||
| 209 | |||
| 210 | lockdep_assert_held(&cache_mutex); | ||
| 211 | |||
| 212 | if (list_empty(&cqm_rmid_free_lru)) | ||
| 213 | return INVALID_RMID; | ||
| 214 | |||
| 215 | entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); | ||
| 216 | list_del(&entry->list); | ||
| 217 | |||
| 218 | return entry->rmid; | ||
| 219 | } | ||
| 220 | |||
| 221 | static void __put_rmid(u32 rmid) | ||
| 222 | { | ||
| 223 | struct cqm_rmid_entry *entry; | ||
| 224 | |||
| 225 | lockdep_assert_held(&cache_mutex); | ||
| 226 | |||
| 227 | WARN_ON(!__rmid_valid(rmid)); | ||
| 228 | entry = __rmid_entry(rmid); | ||
| 229 | |||
| 230 | entry->queue_time = jiffies; | ||
| 231 | entry->state = RMID_YOUNG; | ||
| 232 | |||
| 233 | list_add_tail(&entry->list, &cqm_rmid_limbo_lru); | ||
| 234 | } | ||
| 235 | |||
| 236 | static void cqm_cleanup(void) | ||
| 237 | { | ||
| 238 | int i; | ||
| 239 | |||
| 240 | if (!cqm_rmid_ptrs) | ||
| 241 | return; | ||
| 242 | |||
| 243 | for (i = 0; i < cqm_max_rmid; i++) | ||
| 244 | kfree(cqm_rmid_ptrs[i]); | ||
| 245 | |||
| 246 | kfree(cqm_rmid_ptrs); | ||
| 247 | cqm_rmid_ptrs = NULL; | ||
| 248 | cqm_enabled = false; | ||
| 249 | } | ||
| 250 | |||
| 251 | static int intel_cqm_setup_rmid_cache(void) | ||
| 252 | { | ||
| 253 | struct cqm_rmid_entry *entry; | ||
| 254 | unsigned int nr_rmids; | ||
| 255 | int r = 0; | ||
| 256 | |||
| 257 | nr_rmids = cqm_max_rmid + 1; | ||
| 258 | cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) * | ||
| 259 | nr_rmids, GFP_KERNEL); | ||
| 260 | if (!cqm_rmid_ptrs) | ||
| 261 | return -ENOMEM; | ||
| 262 | |||
| 263 | for (; r <= cqm_max_rmid; r++) { | ||
| 264 | struct cqm_rmid_entry *entry; | ||
| 265 | |||
| 266 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); | ||
| 267 | if (!entry) | ||
| 268 | goto fail; | ||
| 269 | |||
| 270 | INIT_LIST_HEAD(&entry->list); | ||
| 271 | entry->rmid = r; | ||
| 272 | cqm_rmid_ptrs[r] = entry; | ||
| 273 | |||
| 274 | list_add_tail(&entry->list, &cqm_rmid_free_lru); | ||
| 275 | } | ||
| 276 | |||
| 277 | /* | ||
| 278 | * RMID 0 is special and is always allocated. It's used for all | ||
| 279 | * tasks that are not monitored. | ||
| 280 | */ | ||
| 281 | entry = __rmid_entry(0); | ||
| 282 | list_del(&entry->list); | ||
| 283 | |||
| 284 | mutex_lock(&cache_mutex); | ||
| 285 | intel_cqm_rotation_rmid = __get_rmid(); | ||
| 286 | mutex_unlock(&cache_mutex); | ||
| 287 | |||
| 288 | return 0; | ||
| 289 | |||
| 290 | fail: | ||
| 291 | cqm_cleanup(); | ||
| 292 | return -ENOMEM; | ||
| 293 | } | ||
| 294 | |||
| 295 | /* | ||
| 296 | * Determine if @a and @b measure the same set of tasks. | ||
| 297 | * | ||
| 298 | * If @a and @b measure the same set of tasks then we want to share a | ||
| 299 | * single RMID. | ||
| 300 | */ | ||
| 301 | static bool __match_event(struct perf_event *a, struct perf_event *b) | ||
| 302 | { | ||
| 303 | /* Per-cpu and task events don't mix */ | ||
| 304 | if ((a->attach_state & PERF_ATTACH_TASK) != | ||
| 305 | (b->attach_state & PERF_ATTACH_TASK)) | ||
| 306 | return false; | ||
| 307 | |||
| 308 | #ifdef CONFIG_CGROUP_PERF | ||
| 309 | if (a->cgrp != b->cgrp) | ||
| 310 | return false; | ||
| 311 | #endif | ||
| 312 | |||
| 313 | /* If not task event, we're machine wide */ | ||
| 314 | if (!(b->attach_state & PERF_ATTACH_TASK)) | ||
| 315 | return true; | ||
| 316 | |||
| 317 | /* | ||
| 318 | * Events that target same task are placed into the same cache group. | ||
| 319 | * Mark it as a multi event group, so that we update ->count | ||
| 320 | * for every event rather than just the group leader later. | ||
| 321 | */ | ||
| 322 | if (a->hw.target == b->hw.target) { | ||
| 323 | b->hw.is_group_event = true; | ||
| 324 | return true; | ||
| 325 | } | ||
| 326 | |||
| 327 | /* | ||
| 328 | * Are we an inherited event? | ||
| 329 | */ | ||
| 330 | if (b->parent == a) | ||
| 331 | return true; | ||
| 332 | |||
| 333 | return false; | ||
| 334 | } | ||
| 335 | |||
| 336 | #ifdef CONFIG_CGROUP_PERF | ||
| 337 | static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) | ||
| 338 | { | ||
| 339 | if (event->attach_state & PERF_ATTACH_TASK) | ||
| 340 | return perf_cgroup_from_task(event->hw.target, event->ctx); | ||
| 341 | |||
| 342 | return event->cgrp; | ||
| 343 | } | ||
| 344 | #endif | ||
| 345 | |||
| 346 | /* | ||
| 347 | * Determine if @a's tasks intersect with @b's tasks | ||
| 348 | * | ||
| 349 | * There are combinations of events that we explicitly prohibit, | ||
| 350 | * | ||
| 351 | * PROHIBITS | ||
| 352 | * system-wide -> cgroup and task | ||
| 353 | * cgroup -> system-wide | ||
| 354 | * -> task in cgroup | ||
| 355 | * task -> system-wide | ||
| 356 | * -> task in cgroup | ||
| 357 | * | ||
| 358 | * Call this function before allocating an RMID. | ||
| 359 | */ | ||
| 360 | static bool __conflict_event(struct perf_event *a, struct perf_event *b) | ||
| 361 | { | ||
| 362 | #ifdef CONFIG_CGROUP_PERF | ||
| 363 | /* | ||
| 364 | * We can have any number of cgroups but only one system-wide | ||
| 365 | * event at a time. | ||
| 366 | */ | ||
| 367 | if (a->cgrp && b->cgrp) { | ||
| 368 | struct perf_cgroup *ac = a->cgrp; | ||
| 369 | struct perf_cgroup *bc = b->cgrp; | ||
| 370 | |||
| 371 | /* | ||
| 372 | * This condition should have been caught in | ||
| 373 | * __match_event() and we should be sharing an RMID. | ||
| 374 | */ | ||
| 375 | WARN_ON_ONCE(ac == bc); | ||
| 376 | |||
| 377 | if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || | ||
| 378 | cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) | ||
| 379 | return true; | ||
| 380 | |||
| 381 | return false; | ||
| 382 | } | ||
| 383 | |||
| 384 | if (a->cgrp || b->cgrp) { | ||
| 385 | struct perf_cgroup *ac, *bc; | ||
| 386 | |||
| 387 | /* | ||
| 388 | * cgroup and system-wide events are mutually exclusive | ||
| 389 | */ | ||
| 390 | if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || | ||
| 391 | (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) | ||
| 392 | return true; | ||
| 393 | |||
| 394 | /* | ||
| 395 | * Ensure neither event is part of the other's cgroup | ||
| 396 | */ | ||
| 397 | ac = event_to_cgroup(a); | ||
| 398 | bc = event_to_cgroup(b); | ||
| 399 | if (ac == bc) | ||
| 400 | return true; | ||
| 401 | |||
| 402 | /* | ||
| 403 | * Must have cgroup and non-intersecting task events. | ||
| 404 | */ | ||
| 405 | if (!ac || !bc) | ||
| 406 | return false; | ||
| 407 | |||
| 408 | /* | ||
| 409 | * We have cgroup and task events, and the task belongs | ||
| 410 | * to a cgroup. Check for for overlap. | ||
| 411 | */ | ||
| 412 | if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || | ||
| 413 | cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) | ||
| 414 | return true; | ||
| 415 | |||
| 416 | return false; | ||
| 417 | } | ||
| 418 | #endif | ||
| 419 | /* | ||
| 420 | * If one of them is not a task, same story as above with cgroups. | ||
| 421 | */ | ||
| 422 | if (!(a->attach_state & PERF_ATTACH_TASK) || | ||
| 423 | !(b->attach_state & PERF_ATTACH_TASK)) | ||
| 424 | return true; | ||
| 425 | |||
| 426 | /* | ||
| 427 | * Must be non-overlapping. | ||
| 428 | */ | ||
| 429 | return false; | ||
| 430 | } | ||
| 431 | |||
| 432 | struct rmid_read { | ||
| 433 | u32 rmid; | ||
| 434 | u32 evt_type; | ||
| 435 | atomic64_t value; | ||
| 436 | }; | ||
| 437 | |||
| 438 | static void __intel_cqm_event_count(void *info); | ||
| 439 | static void init_mbm_sample(u32 rmid, u32 evt_type); | ||
| 440 | static void __intel_mbm_event_count(void *info); | ||
| 441 | |||
| 442 | static bool is_cqm_event(int e) | ||
| 443 | { | ||
| 444 | return (e == QOS_L3_OCCUP_EVENT_ID); | ||
| 445 | } | ||
| 446 | |||
| 447 | static bool is_mbm_event(int e) | ||
| 448 | { | ||
| 449 | return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); | ||
| 450 | } | ||
| 451 | |||
| 452 | static void cqm_mask_call(struct rmid_read *rr) | ||
| 453 | { | ||
| 454 | if (is_mbm_event(rr->evt_type)) | ||
| 455 | on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1); | ||
| 456 | else | ||
| 457 | on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1); | ||
| 458 | } | ||
| 459 | |||
| 460 | /* | ||
| 461 | * Exchange the RMID of a group of events. | ||
| 462 | */ | ||
| 463 | static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) | ||
| 464 | { | ||
| 465 | struct perf_event *event; | ||
| 466 | struct list_head *head = &group->hw.cqm_group_entry; | ||
| 467 | u32 old_rmid = group->hw.cqm_rmid; | ||
| 468 | |||
| 469 | lockdep_assert_held(&cache_mutex); | ||
| 470 | |||
| 471 | /* | ||
| 472 | * If our RMID is being deallocated, perform a read now. | ||
| 473 | */ | ||
| 474 | if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { | ||
| 475 | struct rmid_read rr = { | ||
| 476 | .rmid = old_rmid, | ||
| 477 | .evt_type = group->attr.config, | ||
| 478 | .value = ATOMIC64_INIT(0), | ||
| 479 | }; | ||
| 480 | |||
| 481 | cqm_mask_call(&rr); | ||
| 482 | local64_set(&group->count, atomic64_read(&rr.value)); | ||
| 483 | } | ||
| 484 | |||
| 485 | raw_spin_lock_irq(&cache_lock); | ||
| 486 | |||
| 487 | group->hw.cqm_rmid = rmid; | ||
| 488 | list_for_each_entry(event, head, hw.cqm_group_entry) | ||
| 489 | event->hw.cqm_rmid = rmid; | ||
| 490 | |||
| 491 | raw_spin_unlock_irq(&cache_lock); | ||
| 492 | |||
| 493 | /* | ||
| 494 | * If the allocation is for mbm, init the mbm stats. | ||
| 495 | * Need to check if each event in the group is mbm event | ||
| 496 | * because there could be multiple type of events in the same group. | ||
| 497 | */ | ||
| 498 | if (__rmid_valid(rmid)) { | ||
| 499 | event = group; | ||
| 500 | if (is_mbm_event(event->attr.config)) | ||
| 501 | init_mbm_sample(rmid, event->attr.config); | ||
| 502 | |||
| 503 | list_for_each_entry(event, head, hw.cqm_group_entry) { | ||
| 504 | if (is_mbm_event(event->attr.config)) | ||
| 505 | init_mbm_sample(rmid, event->attr.config); | ||
| 506 | } | ||
| 507 | } | ||
| 508 | |||
| 509 | return old_rmid; | ||
| 510 | } | ||
| 511 | |||
| 512 | /* | ||
| 513 | * If we fail to assign a new RMID for intel_cqm_rotation_rmid because | ||
| 514 | * cachelines are still tagged with RMIDs in limbo, we progressively | ||
| 515 | * increment the threshold until we find an RMID in limbo with <= | ||
| 516 | * __intel_cqm_threshold lines tagged. This is designed to mitigate the | ||
| 517 | * problem where cachelines tagged with an RMID are not steadily being | ||
| 518 | * evicted. | ||
| 519 | * | ||
| 520 | * On successful rotations we decrease the threshold back towards zero. | ||
| 521 | * | ||
| 522 | * __intel_cqm_max_threshold provides an upper bound on the threshold, | ||
| 523 | * and is measured in bytes because it's exposed to userland. | ||
| 524 | */ | ||
| 525 | static unsigned int __intel_cqm_threshold; | ||
| 526 | static unsigned int __intel_cqm_max_threshold; | ||
| 527 | |||
| 528 | /* | ||
| 529 | * Test whether an RMID has a zero occupancy value on this cpu. | ||
| 530 | */ | ||
| 531 | static void intel_cqm_stable(void *arg) | ||
| 532 | { | ||
| 533 | struct cqm_rmid_entry *entry; | ||
| 534 | |||
| 535 | list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { | ||
| 536 | if (entry->state != RMID_AVAILABLE) | ||
| 537 | break; | ||
| 538 | |||
| 539 | if (__rmid_read(entry->rmid) > __intel_cqm_threshold) | ||
| 540 | entry->state = RMID_DIRTY; | ||
| 541 | } | ||
| 542 | } | ||
| 543 | |||
| 544 | /* | ||
| 545 | * If we have group events waiting for an RMID that don't conflict with | ||
| 546 | * events already running, assign @rmid. | ||
| 547 | */ | ||
| 548 | static bool intel_cqm_sched_in_event(u32 rmid) | ||
| 549 | { | ||
| 550 | struct perf_event *leader, *event; | ||
| 551 | |||
| 552 | lockdep_assert_held(&cache_mutex); | ||
| 553 | |||
| 554 | leader = list_first_entry(&cache_groups, struct perf_event, | ||
| 555 | hw.cqm_groups_entry); | ||
| 556 | event = leader; | ||
| 557 | |||
| 558 | list_for_each_entry_continue(event, &cache_groups, | ||
| 559 | hw.cqm_groups_entry) { | ||
| 560 | if (__rmid_valid(event->hw.cqm_rmid)) | ||
| 561 | continue; | ||
| 562 | |||
| 563 | if (__conflict_event(event, leader)) | ||
| 564 | continue; | ||
| 565 | |||
| 566 | intel_cqm_xchg_rmid(event, rmid); | ||
| 567 | return true; | ||
| 568 | } | ||
| 569 | |||
| 570 | return false; | ||
| 571 | } | ||
| 572 | |||
| 573 | /* | ||
| 574 | * Initially use this constant for both the limbo queue time and the | ||
| 575 | * rotation timer interval, pmu::hrtimer_interval_ms. | ||
| 576 | * | ||
| 577 | * They don't need to be the same, but the two are related since if you | ||
| 578 | * rotate faster than you recycle RMIDs, you may run out of available | ||
| 579 | * RMIDs. | ||
| 580 | */ | ||
| 581 | #define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ | ||
| 582 | |||
| 583 | static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; | ||
| 584 | |||
| 585 | /* | ||
| 586 | * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list | ||
| 587 | * @nr_available: number of freeable RMIDs on the limbo list | ||
| 588 | * | ||
| 589 | * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no | ||
| 590 | * cachelines are tagged with those RMIDs. After this we can reuse them | ||
| 591 | * and know that the current set of active RMIDs is stable. | ||
| 592 | * | ||
| 593 | * Return %true or %false depending on whether stabilization needs to be | ||
| 594 | * reattempted. | ||
| 595 | * | ||
| 596 | * If we return %true then @nr_available is updated to indicate the | ||
| 597 | * number of RMIDs on the limbo list that have been queued for the | ||
| 598 | * minimum queue time (RMID_AVAILABLE), but whose data occupancy values | ||
| 599 | * are above __intel_cqm_threshold. | ||
| 600 | */ | ||
| 601 | static bool intel_cqm_rmid_stabilize(unsigned int *available) | ||
| 602 | { | ||
| 603 | struct cqm_rmid_entry *entry, *tmp; | ||
| 604 | |||
| 605 | lockdep_assert_held(&cache_mutex); | ||
| 606 | |||
| 607 | *available = 0; | ||
| 608 | list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { | ||
| 609 | unsigned long min_queue_time; | ||
| 610 | unsigned long now = jiffies; | ||
| 611 | |||
| 612 | /* | ||
| 613 | * We hold RMIDs placed into limbo for a minimum queue | ||
| 614 | * time. Before the minimum queue time has elapsed we do | ||
| 615 | * not recycle RMIDs. | ||
| 616 | * | ||
| 617 | * The reasoning is that until a sufficient time has | ||
| 618 | * passed since we stopped using an RMID, any RMID | ||
| 619 | * placed onto the limbo list will likely still have | ||
| 620 | * data tagged in the cache, which means we'll probably | ||
| 621 | * fail to recycle it anyway. | ||
| 622 | * | ||
| 623 | * We can save ourselves an expensive IPI by skipping | ||
| 624 | * any RMIDs that have not been queued for the minimum | ||
| 625 | * time. | ||
| 626 | */ | ||
| 627 | min_queue_time = entry->queue_time + | ||
| 628 | msecs_to_jiffies(__rmid_queue_time_ms); | ||
| 629 | |||
| 630 | if (time_after(min_queue_time, now)) | ||
| 631 | break; | ||
| 632 | |||
| 633 | entry->state = RMID_AVAILABLE; | ||
| 634 | (*available)++; | ||
| 635 | } | ||
| 636 | |||
| 637 | /* | ||
| 638 | * Fast return if none of the RMIDs on the limbo list have been | ||
| 639 | * sitting on the queue for the minimum queue time. | ||
| 640 | */ | ||
| 641 | if (!*available) | ||
| 642 | return false; | ||
| 643 | |||
| 644 | /* | ||
| 645 | * Test whether an RMID is free for each package. | ||
| 646 | */ | ||
| 647 | on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); | ||
| 648 | |||
| 649 | list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { | ||
| 650 | /* | ||
| 651 | * Exhausted all RMIDs that have waited min queue time. | ||
| 652 | */ | ||
| 653 | if (entry->state == RMID_YOUNG) | ||
| 654 | break; | ||
| 655 | |||
| 656 | if (entry->state == RMID_DIRTY) | ||
| 657 | continue; | ||
| 658 | |||
| 659 | list_del(&entry->list); /* remove from limbo */ | ||
| 660 | |||
| 661 | /* | ||
| 662 | * The rotation RMID gets priority if it's | ||
| 663 | * currently invalid. In which case, skip adding | ||
| 664 | * the RMID to the the free lru. | ||
| 665 | */ | ||
| 666 | if (!__rmid_valid(intel_cqm_rotation_rmid)) { | ||
| 667 | intel_cqm_rotation_rmid = entry->rmid; | ||
| 668 | continue; | ||
| 669 | } | ||
| 670 | |||
| 671 | /* | ||
| 672 | * If we have groups waiting for RMIDs, hand | ||
| 673 | * them one now provided they don't conflict. | ||
| 674 | */ | ||
| 675 | if (intel_cqm_sched_in_event(entry->rmid)) | ||
| 676 | continue; | ||
| 677 | |||
| 678 | /* | ||
| 679 | * Otherwise place it onto the free list. | ||
| 680 | */ | ||
| 681 | list_add_tail(&entry->list, &cqm_rmid_free_lru); | ||
| 682 | } | ||
| 683 | |||
| 684 | |||
| 685 | return __rmid_valid(intel_cqm_rotation_rmid); | ||
| 686 | } | ||
| 687 | |||
| 688 | /* | ||
| 689 | * Pick a victim group and move it to the tail of the group list. | ||
| 690 | * @next: The first group without an RMID | ||
| 691 | */ | ||
| 692 | static void __intel_cqm_pick_and_rotate(struct perf_event *next) | ||
| 693 | { | ||
| 694 | struct perf_event *rotor; | ||
| 695 | u32 rmid; | ||
| 696 | |||
| 697 | lockdep_assert_held(&cache_mutex); | ||
| 698 | |||
| 699 | rotor = list_first_entry(&cache_groups, struct perf_event, | ||
| 700 | hw.cqm_groups_entry); | ||
| 701 | |||
| 702 | /* | ||
| 703 | * The group at the front of the list should always have a valid | ||
| 704 | * RMID. If it doesn't then no groups have RMIDs assigned and we | ||
| 705 | * don't need to rotate the list. | ||
| 706 | */ | ||
| 707 | if (next == rotor) | ||
| 708 | return; | ||
| 709 | |||
| 710 | rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); | ||
| 711 | __put_rmid(rmid); | ||
| 712 | |||
| 713 | list_rotate_left(&cache_groups); | ||
| 714 | } | ||
| 715 | |||
| 716 | /* | ||
| 717 | * Deallocate the RMIDs from any events that conflict with @event, and | ||
| 718 | * place them on the back of the group list. | ||
| 719 | */ | ||
| 720 | static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) | ||
| 721 | { | ||
| 722 | struct perf_event *group, *g; | ||
| 723 | u32 rmid; | ||
| 724 | |||
| 725 | lockdep_assert_held(&cache_mutex); | ||
| 726 | |||
| 727 | list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { | ||
| 728 | if (group == event) | ||
| 729 | continue; | ||
| 730 | |||
| 731 | rmid = group->hw.cqm_rmid; | ||
| 732 | |||
| 733 | /* | ||
| 734 | * Skip events that don't have a valid RMID. | ||
| 735 | */ | ||
| 736 | if (!__rmid_valid(rmid)) | ||
| 737 | continue; | ||
| 738 | |||
| 739 | /* | ||
| 740 | * No conflict? No problem! Leave the event alone. | ||
| 741 | */ | ||
| 742 | if (!__conflict_event(group, event)) | ||
| 743 | continue; | ||
| 744 | |||
| 745 | intel_cqm_xchg_rmid(group, INVALID_RMID); | ||
| 746 | __put_rmid(rmid); | ||
| 747 | } | ||
| 748 | } | ||
| 749 | |||
| 750 | /* | ||
| 751 | * Attempt to rotate the groups and assign new RMIDs. | ||
| 752 | * | ||
| 753 | * We rotate for two reasons, | ||
| 754 | * 1. To handle the scheduling of conflicting events | ||
| 755 | * 2. To recycle RMIDs | ||
| 756 | * | ||
| 757 | * Rotating RMIDs is complicated because the hardware doesn't give us | ||
| 758 | * any clues. | ||
| 759 | * | ||
| 760 | * There's problems with the hardware interface; when you change the | ||
| 761 | * task:RMID map cachelines retain their 'old' tags, giving a skewed | ||
| 762 | * picture. In order to work around this, we must always keep one free | ||
| 763 | * RMID - intel_cqm_rotation_rmid. | ||
| 764 | * | ||
| 765 | * Rotation works by taking away an RMID from a group (the old RMID), | ||
| 766 | * and assigning the free RMID to another group (the new RMID). We must | ||
| 767 | * then wait for the old RMID to not be used (no cachelines tagged). | ||
| 768 | * This ensure that all cachelines are tagged with 'active' RMIDs. At | ||
| 769 | * this point we can start reading values for the new RMID and treat the | ||
| 770 | * old RMID as the free RMID for the next rotation. | ||
| 771 | * | ||
| 772 | * Return %true or %false depending on whether we did any rotating. | ||
| 773 | */ | ||
| 774 | static bool __intel_cqm_rmid_rotate(void) | ||
| 775 | { | ||
| 776 | struct perf_event *group, *start = NULL; | ||
| 777 | unsigned int threshold_limit; | ||
| 778 | unsigned int nr_needed = 0; | ||
| 779 | unsigned int nr_available; | ||
| 780 | bool rotated = false; | ||
| 781 | |||
| 782 | mutex_lock(&cache_mutex); | ||
| 783 | |||
| 784 | again: | ||
| 785 | /* | ||
| 786 | * Fast path through this function if there are no groups and no | ||
| 787 | * RMIDs that need cleaning. | ||
| 788 | */ | ||
| 789 | if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) | ||
| 790 | goto out; | ||
| 791 | |||
| 792 | list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { | ||
| 793 | if (!__rmid_valid(group->hw.cqm_rmid)) { | ||
| 794 | if (!start) | ||
| 795 | start = group; | ||
| 796 | nr_needed++; | ||
| 797 | } | ||
| 798 | } | ||
| 799 | |||
| 800 | /* | ||
| 801 | * We have some event groups, but they all have RMIDs assigned | ||
| 802 | * and no RMIDs need cleaning. | ||
| 803 | */ | ||
| 804 | if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) | ||
| 805 | goto out; | ||
| 806 | |||
| 807 | if (!nr_needed) | ||
| 808 | goto stabilize; | ||
| 809 | |||
| 810 | /* | ||
| 811 | * We have more event groups without RMIDs than available RMIDs, | ||
| 812 | * or we have event groups that conflict with the ones currently | ||
| 813 | * scheduled. | ||
| 814 | * | ||
| 815 | * We force deallocate the rmid of the group at the head of | ||
| 816 | * cache_groups. The first event group without an RMID then gets | ||
| 817 | * assigned intel_cqm_rotation_rmid. This ensures we always make | ||
| 818 | * forward progress. | ||
| 819 | * | ||
| 820 | * Rotate the cache_groups list so the previous head is now the | ||
| 821 | * tail. | ||
| 822 | */ | ||
| 823 | __intel_cqm_pick_and_rotate(start); | ||
| 824 | |||
| 825 | /* | ||
| 826 | * If the rotation is going to succeed, reduce the threshold so | ||
| 827 | * that we don't needlessly reuse dirty RMIDs. | ||
| 828 | */ | ||
| 829 | if (__rmid_valid(intel_cqm_rotation_rmid)) { | ||
| 830 | intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); | ||
| 831 | intel_cqm_rotation_rmid = __get_rmid(); | ||
| 832 | |||
| 833 | intel_cqm_sched_out_conflicting_events(start); | ||
| 834 | |||
| 835 | if (__intel_cqm_threshold) | ||
| 836 | __intel_cqm_threshold--; | ||
| 837 | } | ||
| 838 | |||
| 839 | rotated = true; | ||
| 840 | |||
| 841 | stabilize: | ||
| 842 | /* | ||
| 843 | * We now need to stabilize the RMID we freed above (if any) to | ||
| 844 | * ensure that the next time we rotate we have an RMID with zero | ||
| 845 | * occupancy value. | ||
| 846 | * | ||
| 847 | * Alternatively, if we didn't need to perform any rotation, | ||
| 848 | * we'll have a bunch of RMIDs in limbo that need stabilizing. | ||
| 849 | */ | ||
| 850 | threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; | ||
| 851 | |||
| 852 | while (intel_cqm_rmid_stabilize(&nr_available) && | ||
| 853 | __intel_cqm_threshold < threshold_limit) { | ||
| 854 | unsigned int steal_limit; | ||
| 855 | |||
| 856 | /* | ||
| 857 | * Don't spin if nobody is actively waiting for an RMID, | ||
| 858 | * the rotation worker will be kicked as soon as an | ||
| 859 | * event needs an RMID anyway. | ||
| 860 | */ | ||
| 861 | if (!nr_needed) | ||
| 862 | break; | ||
| 863 | |||
| 864 | /* Allow max 25% of RMIDs to be in limbo. */ | ||
| 865 | steal_limit = (cqm_max_rmid + 1) / 4; | ||
| 866 | |||
| 867 | /* | ||
| 868 | * We failed to stabilize any RMIDs so our rotation | ||
| 869 | * logic is now stuck. In order to make forward progress | ||
| 870 | * we have a few options: | ||
| 871 | * | ||
| 872 | * 1. rotate ("steal") another RMID | ||
| 873 | * 2. increase the threshold | ||
| 874 | * 3. do nothing | ||
| 875 | * | ||
| 876 | * We do both of 1. and 2. until we hit the steal limit. | ||
| 877 | * | ||
| 878 | * The steal limit prevents all RMIDs ending up on the | ||
| 879 | * limbo list. This can happen if every RMID has a | ||
| 880 | * non-zero occupancy above threshold_limit, and the | ||
| 881 | * occupancy values aren't dropping fast enough. | ||
| 882 | * | ||
| 883 | * Note that there is prioritisation at work here - we'd | ||
| 884 | * rather increase the number of RMIDs on the limbo list | ||
| 885 | * than increase the threshold, because increasing the | ||
| 886 | * threshold skews the event data (because we reuse | ||
| 887 | * dirty RMIDs) - threshold bumps are a last resort. | ||
| 888 | */ | ||
| 889 | if (nr_available < steal_limit) | ||
| 890 | goto again; | ||
| 891 | |||
| 892 | __intel_cqm_threshold++; | ||
| 893 | } | ||
| 894 | |||
| 895 | out: | ||
| 896 | mutex_unlock(&cache_mutex); | ||
| 897 | return rotated; | ||
| 898 | } | ||
| 899 | |||
| 900 | static void intel_cqm_rmid_rotate(struct work_struct *work); | ||
| 901 | |||
| 902 | static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); | ||
| 903 | |||
| 904 | static struct pmu intel_cqm_pmu; | ||
| 905 | |||
| 906 | static void intel_cqm_rmid_rotate(struct work_struct *work) | ||
| 907 | { | ||
| 908 | unsigned long delay; | ||
| 909 | |||
| 910 | __intel_cqm_rmid_rotate(); | ||
| 911 | |||
| 912 | delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); | ||
| 913 | schedule_delayed_work(&intel_cqm_rmid_work, delay); | ||
| 914 | } | ||
| 915 | |||
| 916 | static u64 update_sample(unsigned int rmid, u32 evt_type, int first) | ||
| 917 | { | ||
| 918 | struct sample *mbm_current; | ||
| 919 | u32 vrmid = rmid_2_index(rmid); | ||
| 920 | u64 val, bytes, shift; | ||
| 921 | u32 eventid; | ||
| 922 | |||
| 923 | if (evt_type == QOS_MBM_LOCAL_EVENT_ID) { | ||
| 924 | mbm_current = &mbm_local[vrmid]; | ||
| 925 | eventid = QOS_MBM_LOCAL_EVENT_ID; | ||
| 926 | } else { | ||
| 927 | mbm_current = &mbm_total[vrmid]; | ||
| 928 | eventid = QOS_MBM_TOTAL_EVENT_ID; | ||
| 929 | } | ||
| 930 | |||
| 931 | wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); | ||
| 932 | rdmsrl(MSR_IA32_QM_CTR, val); | ||
| 933 | if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) | ||
| 934 | return mbm_current->total_bytes; | ||
| 935 | |||
| 936 | if (first) { | ||
| 937 | mbm_current->prev_msr = val; | ||
| 938 | mbm_current->total_bytes = 0; | ||
| 939 | return mbm_current->total_bytes; | ||
| 940 | } | ||
| 941 | |||
| 942 | /* | ||
| 943 | * The h/w guarantees that counters will not overflow | ||
| 944 | * so long as we poll them at least once per second. | ||
| 945 | */ | ||
| 946 | shift = 64 - MBM_CNTR_WIDTH; | ||
| 947 | bytes = (val << shift) - (mbm_current->prev_msr << shift); | ||
| 948 | bytes >>= shift; | ||
| 949 | |||
| 950 | bytes *= cqm_l3_scale; | ||
| 951 | |||
| 952 | mbm_current->total_bytes += bytes; | ||
| 953 | mbm_current->prev_msr = val; | ||
| 954 | |||
| 955 | return mbm_current->total_bytes; | ||
| 956 | } | ||
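A note on the shift arithmetic above: it computes a wrap-safe delta for a counter that is only MBM_CNTR_WIDTH bits wide. Both samples are shifted so the counter's top bit lands at bit 63, the subtraction then wraps naturally in 64 bits, and the logical right shift recovers the true byte count. A minimal userspace sketch of the same computation, assuming a 24-bit counter width for illustration (the real width is defined elsewhere in this file):

    #include <stdint.h>
    #include <stdio.h>

    #define CNTR_WIDTH 24   /* assumed for illustration only */

    /* Wrap-safe delta between two raw counter reads, mirroring update_sample(). */
    static uint64_t counter_delta(uint64_t prev, uint64_t cur)
    {
        unsigned int shift = 64 - CNTR_WIDTH;

        return ((cur << shift) - (prev << shift)) >> shift;
    }

    int main(void)
    {
        uint64_t prev = (1ULL << CNTR_WIDTH) - 100;  /* near the top ...      */
        uint64_t cur  = 50;                          /* ... wrapped past zero */

        /* Prints 150: 100 counts up to the wrap point, then 50 more. */
        printf("%llu\n", (unsigned long long)counter_delta(prev, cur));
        return 0;
    }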
| 957 | |||
| 958 | static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type) | ||
| 959 | { | ||
| 960 | return update_sample(rmid, evt_type, 0); | ||
| 961 | } | ||
| 962 | |||
| 963 | static void __intel_mbm_event_init(void *info) | ||
| 964 | { | ||
| 965 | struct rmid_read *rr = info; | ||
| 966 | |||
| 967 | update_sample(rr->rmid, rr->evt_type, 1); | ||
| 968 | } | ||
| 969 | |||
| 970 | static void init_mbm_sample(u32 rmid, u32 evt_type) | ||
| 971 | { | ||
| 972 | struct rmid_read rr = { | ||
| 973 | .rmid = rmid, | ||
| 974 | .evt_type = evt_type, | ||
| 975 | .value = ATOMIC64_INIT(0), | ||
| 976 | }; | ||
| 977 | |||
| 978 | /* on each socket, init sample */ | ||
| 979 | on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1); | ||
| 980 | } | ||
| 981 | |||
| 982 | /* | ||
| 983 | * Find a group and setup RMID. | ||
| 984 | * | ||
| 985 | * If we're part of a group, we use the group's RMID. | ||
| 986 | */ | ||
| 987 | static void intel_cqm_setup_event(struct perf_event *event, | ||
| 988 | struct perf_event **group) | ||
| 989 | { | ||
| 990 | struct perf_event *iter; | ||
| 991 | bool conflict = false; | ||
| 992 | u32 rmid; | ||
| 993 | |||
| 994 | event->hw.is_group_event = false; | ||
| 995 | list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { | ||
| 996 | rmid = iter->hw.cqm_rmid; | ||
| 997 | |||
| 998 | if (__match_event(iter, event)) { | ||
| 999 | /* All tasks in a group share an RMID */ | ||
| 1000 | event->hw.cqm_rmid = rmid; | ||
| 1001 | *group = iter; | ||
| 1002 | if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) | ||
| 1003 | init_mbm_sample(rmid, event->attr.config); | ||
| 1004 | return; | ||
| 1005 | } | ||
| 1006 | |||
| 1007 | /* | ||
| 1008 | * We only care about conflicts for events that are | ||
| 1009 | * actually scheduled in (and hence have a valid RMID). | ||
| 1010 | */ | ||
| 1011 | if (__conflict_event(iter, event) && __rmid_valid(rmid)) | ||
| 1012 | conflict = true; | ||
| 1013 | } | ||
| 1014 | |||
| 1015 | if (conflict) | ||
| 1016 | rmid = INVALID_RMID; | ||
| 1017 | else | ||
| 1018 | rmid = __get_rmid(); | ||
| 1019 | |||
| 1020 | if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) | ||
| 1021 | init_mbm_sample(rmid, event->attr.config); | ||
| 1022 | |||
| 1023 | event->hw.cqm_rmid = rmid; | ||
| 1024 | } | ||
| 1025 | |||
| 1026 | static void intel_cqm_event_read(struct perf_event *event) | ||
| 1027 | { | ||
| 1028 | unsigned long flags; | ||
| 1029 | u32 rmid; | ||
| 1030 | u64 val; | ||
| 1031 | |||
| 1032 | /* | ||
| 1033 | * Task events are handled by intel_cqm_event_count(). | ||
| 1034 | */ | ||
| 1035 | if (event->cpu == -1) | ||
| 1036 | return; | ||
| 1037 | |||
| 1038 | raw_spin_lock_irqsave(&cache_lock, flags); | ||
| 1039 | rmid = event->hw.cqm_rmid; | ||
| 1040 | |||
| 1041 | if (!__rmid_valid(rmid)) | ||
| 1042 | goto out; | ||
| 1043 | |||
| 1044 | if (is_mbm_event(event->attr.config)) | ||
| 1045 | val = rmid_read_mbm(rmid, event->attr.config); | ||
| 1046 | else | ||
| 1047 | val = __rmid_read(rmid); | ||
| 1048 | |||
| 1049 | /* | ||
| 1050 | * Ignore this reading on error states and do not update the value. | ||
| 1051 | */ | ||
| 1052 | if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) | ||
| 1053 | goto out; | ||
| 1054 | |||
| 1055 | local64_set(&event->count, val); | ||
| 1056 | out: | ||
| 1057 | raw_spin_unlock_irqrestore(&cache_lock, flags); | ||
| 1058 | } | ||
| 1059 | |||
| 1060 | static void __intel_cqm_event_count(void *info) | ||
| 1061 | { | ||
| 1062 | struct rmid_read *rr = info; | ||
| 1063 | u64 val; | ||
| 1064 | |||
| 1065 | val = __rmid_read(rr->rmid); | ||
| 1066 | |||
| 1067 | if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) | ||
| 1068 | return; | ||
| 1069 | |||
| 1070 | atomic64_add(val, &rr->value); | ||
| 1071 | } | ||
| 1072 | |||
| 1073 | static inline bool cqm_group_leader(struct perf_event *event) | ||
| 1074 | { | ||
| 1075 | return !list_empty(&event->hw.cqm_groups_entry); | ||
| 1076 | } | ||
| 1077 | |||
| 1078 | static void __intel_mbm_event_count(void *info) | ||
| 1079 | { | ||
| 1080 | struct rmid_read *rr = info; | ||
| 1081 | u64 val; | ||
| 1082 | |||
| 1083 | val = rmid_read_mbm(rr->rmid, rr->evt_type); | ||
| 1084 | if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) | ||
| 1085 | return; | ||
| 1086 | atomic64_add(val, &rr->value); | ||
| 1087 | } | ||
| 1088 | |||
| 1089 | static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer) | ||
| 1090 | { | ||
| 1091 | struct perf_event *iter, *iter1; | ||
| 1092 | int ret = HRTIMER_RESTART; | ||
| 1093 | struct list_head *head; | ||
| 1094 | unsigned long flags; | ||
| 1095 | u32 grp_rmid; | ||
| 1096 | |||
| 1097 | /* | ||
| 1098 | * Need to hold the cache_lock as the timer Event Select MSR reads | ||
| 1099 | * can race with the mbm/cqm count() and mbm_init() reads. | ||
| 1100 | */ | ||
| 1101 | raw_spin_lock_irqsave(&cache_lock, flags); | ||
| 1102 | |||
| 1103 | if (list_empty(&cache_groups)) { | ||
| 1104 | ret = HRTIMER_NORESTART; | ||
| 1105 | goto out; | ||
| 1106 | } | ||
| 1107 | |||
| 1108 | list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { | ||
| 1109 | grp_rmid = iter->hw.cqm_rmid; | ||
| 1110 | if (!__rmid_valid(grp_rmid)) | ||
| 1111 | continue; | ||
| 1112 | if (is_mbm_event(iter->attr.config)) | ||
| 1113 | update_sample(grp_rmid, iter->attr.config, 0); | ||
| 1114 | |||
| 1115 | head = &iter->hw.cqm_group_entry; | ||
| 1116 | if (list_empty(head)) | ||
| 1117 | continue; | ||
| 1118 | list_for_each_entry(iter1, head, hw.cqm_group_entry) { | ||
| 1119 | if (!iter1->hw.is_group_event) | ||
| 1120 | break; | ||
| 1121 | if (is_mbm_event(iter1->attr.config)) | ||
| 1122 | update_sample(iter1->hw.cqm_rmid, | ||
| 1123 | iter1->attr.config, 0); | ||
| 1124 | } | ||
| 1125 | } | ||
| 1126 | |||
| 1127 | hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME)); | ||
| 1128 | out: | ||
| 1129 | raw_spin_unlock_irqrestore(&cache_lock, flags); | ||
| 1130 | |||
| 1131 | return ret; | ||
| 1132 | } | ||
| 1133 | |||
| 1134 | static void __mbm_start_timer(void *info) | ||
| 1135 | { | ||
| 1136 | hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME), | ||
| 1137 | HRTIMER_MODE_REL_PINNED); | ||
| 1138 | } | ||
| 1139 | |||
| 1140 | static void __mbm_stop_timer(void *info) | ||
| 1141 | { | ||
| 1142 | hrtimer_cancel(&mbm_timers[pkg_id]); | ||
| 1143 | } | ||
| 1144 | |||
| 1145 | static void mbm_start_timers(void) | ||
| 1146 | { | ||
| 1147 | on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1); | ||
| 1148 | } | ||
| 1149 | |||
| 1150 | static void mbm_stop_timers(void) | ||
| 1151 | { | ||
| 1152 | on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1); | ||
| 1153 | } | ||
| 1154 | |||
| 1155 | static void mbm_hrtimer_init(void) | ||
| 1156 | { | ||
| 1157 | struct hrtimer *hr; | ||
| 1158 | int i; | ||
| 1159 | |||
| 1160 | for (i = 0; i < mbm_socket_max; i++) { | ||
| 1161 | hr = &mbm_timers[i]; | ||
| 1162 | hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 1163 | hr->function = mbm_hrtimer_handle; | ||
| 1164 | } | ||
| 1165 | } | ||
| 1166 | |||
| 1167 | static u64 intel_cqm_event_count(struct perf_event *event) | ||
| 1168 | { | ||
| 1169 | unsigned long flags; | ||
| 1170 | struct rmid_read rr = { | ||
| 1171 | .evt_type = event->attr.config, | ||
| 1172 | .value = ATOMIC64_INIT(0), | ||
| 1173 | }; | ||
| 1174 | |||
| 1175 | /* | ||
| 1176 | * We only need to worry about task events. System-wide events | ||
| 1177 | * are handled like usual, i.e. entirely with | ||
| 1178 | * intel_cqm_event_read(). | ||
| 1179 | */ | ||
| 1180 | if (event->cpu != -1) | ||
| 1181 | return __perf_event_count(event); | ||
| 1182 | |||
| 1183 | /* | ||
| 1184 | * Only the group leader gets to report values, except in the case of | ||
| 1185 | * multiple events in the same group, where we still need to read the | ||
| 1186 | * other events. This stops us | ||
| 1187 | * reporting duplicate values to userspace, and gives us a clear | ||
| 1188 | * rule for which task gets to report the values. | ||
| 1189 | * | ||
| 1190 | * Note that it is impossible to attribute these values to | ||
| 1191 | * specific packages - we forfeit that ability when we create | ||
| 1192 | * task events. | ||
| 1193 | */ | ||
| 1194 | if (!cqm_group_leader(event) && !event->hw.is_group_event) | ||
| 1195 | return 0; | ||
| 1196 | |||
| 1197 | /* | ||
| 1198 | * Getting up-to-date values requires an SMP IPI which is not | ||
| 1199 | * possible if we're being called in interrupt context. Return | ||
| 1200 | * the cached values instead. | ||
| 1201 | */ | ||
| 1202 | if (unlikely(in_interrupt())) | ||
| 1203 | goto out; | ||
| 1204 | |||
| 1205 | /* | ||
| 1206 | * Notice that we don't perform the reading of an RMID | ||
| 1207 | * atomically, because we can't hold a spin lock across the | ||
| 1208 | * IPIs. | ||
| 1209 | * | ||
| 1210 | * Speculatively perform the read, since @event might be | ||
| 1211 | * assigned a different (possibly invalid) RMID while we're | ||
| 1212 | * busy performing the IPI calls. It's therefore necessary to | ||
| 1213 | * check @event's RMID afterwards, and if it has changed, | ||
| 1214 | * discard the result of the read. | ||
| 1215 | */ | ||
| 1216 | rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); | ||
| 1217 | |||
| 1218 | if (!__rmid_valid(rr.rmid)) | ||
| 1219 | goto out; | ||
| 1220 | |||
| 1221 | cqm_mask_call(&rr); | ||
| 1222 | |||
| 1223 | raw_spin_lock_irqsave(&cache_lock, flags); | ||
| 1224 | if (event->hw.cqm_rmid == rr.rmid) | ||
| 1225 | local64_set(&event->count, atomic64_read(&rr.value)); | ||
| 1226 | raw_spin_unlock_irqrestore(&cache_lock, flags); | ||
| 1227 | out: | ||
| 1228 | return __perf_event_count(event); | ||
| 1229 | } | ||
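The read above is deliberately optimistic: the RMID is snapshotted without the lock, the expensive cross-CPU reads run unlocked, and the result is committed only if the RMID is unchanged once the lock is taken. A hedged userspace sketch of the same pattern, with invented names (not the kernel API):

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    struct counter {
        pthread_mutex_t lock;
        uint32_t id;      /* may be reassigned by another thread */
        uint64_t value;
    };

    static uint64_t slow_read(uint32_t id)
    {
        return (uint64_t)id * 1000;   /* stand-in for the cross-CPU reads */
    }

    static void refresh(struct counter *c)
    {
        uint32_t id = __atomic_load_n(&c->id, __ATOMIC_RELAXED); /* snapshot */
        uint64_t v = slow_read(id);    /* done without holding the lock */

        pthread_mutex_lock(&c->lock);
        if (c->id == id)               /* discard stale result if id changed */
            c->value = v;
        pthread_mutex_unlock(&c->lock);
    }

    int main(void)
    {
        struct counter c = { PTHREAD_MUTEX_INITIALIZER, 7, 0 };

        refresh(&c);
        printf("%llu\n", (unsigned long long)c.value);   /* 7000 */
        return 0;
    }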
| 1230 | |||
| 1231 | static void intel_cqm_event_start(struct perf_event *event, int mode) | ||
| 1232 | { | ||
| 1233 | struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); | ||
| 1234 | u32 rmid = event->hw.cqm_rmid; | ||
| 1235 | |||
| 1236 | if (!(event->hw.cqm_state & PERF_HES_STOPPED)) | ||
| 1237 | return; | ||
| 1238 | |||
| 1239 | event->hw.cqm_state &= ~PERF_HES_STOPPED; | ||
| 1240 | |||
| 1241 | if (state->rmid_usecnt++) { | ||
| 1242 | if (!WARN_ON_ONCE(state->rmid != rmid)) | ||
| 1243 | return; | ||
| 1244 | } else { | ||
| 1245 | WARN_ON_ONCE(state->rmid); | ||
| 1246 | } | ||
| 1247 | |||
| 1248 | state->rmid = rmid; | ||
| 1249 | wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); | ||
| 1250 | } | ||
| 1251 | |||
| 1252 | static void intel_cqm_event_stop(struct perf_event *event, int mode) | ||
| 1253 | { | ||
| 1254 | struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); | ||
| 1255 | |||
| 1256 | if (event->hw.cqm_state & PERF_HES_STOPPED) | ||
| 1257 | return; | ||
| 1258 | |||
| 1259 | event->hw.cqm_state |= PERF_HES_STOPPED; | ||
| 1260 | |||
| 1261 | intel_cqm_event_read(event); | ||
| 1262 | |||
| 1263 | if (!--state->rmid_usecnt) { | ||
| 1264 | state->rmid = 0; | ||
| 1265 | wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); | ||
| 1266 | } else { | ||
| 1267 | WARN_ON_ONCE(!state->rmid); | ||
| 1268 | } | ||
| 1269 | } | ||
| 1270 | |||
| 1271 | static int intel_cqm_event_add(struct perf_event *event, int mode) | ||
| 1272 | { | ||
| 1273 | unsigned long flags; | ||
| 1274 | u32 rmid; | ||
| 1275 | |||
| 1276 | raw_spin_lock_irqsave(&cache_lock, flags); | ||
| 1277 | |||
| 1278 | event->hw.cqm_state = PERF_HES_STOPPED; | ||
| 1279 | rmid = event->hw.cqm_rmid; | ||
| 1280 | |||
| 1281 | if (__rmid_valid(rmid) && (mode & PERF_EF_START)) | ||
| 1282 | intel_cqm_event_start(event, mode); | ||
| 1283 | |||
| 1284 | raw_spin_unlock_irqrestore(&cache_lock, flags); | ||
| 1285 | |||
| 1286 | return 0; | ||
| 1287 | } | ||
| 1288 | |||
| 1289 | static void intel_cqm_event_destroy(struct perf_event *event) | ||
| 1290 | { | ||
| 1291 | struct perf_event *group_other = NULL; | ||
| 1292 | unsigned long flags; | ||
| 1293 | |||
| 1294 | mutex_lock(&cache_mutex); | ||
| 1295 | /* | ||
| 1296 | * Hold the cache_lock as mbm timer handlers could be | ||
| 1297 | * scanning the list of events. | ||
| 1298 | */ | ||
| 1299 | raw_spin_lock_irqsave(&cache_lock, flags); | ||
| 1300 | |||
| 1301 | /* | ||
| 1302 | * If there's another event in this group... | ||
| 1303 | */ | ||
| 1304 | if (!list_empty(&event->hw.cqm_group_entry)) { | ||
| 1305 | group_other = list_first_entry(&event->hw.cqm_group_entry, | ||
| 1306 | struct perf_event, | ||
| 1307 | hw.cqm_group_entry); | ||
| 1308 | list_del(&event->hw.cqm_group_entry); | ||
| 1309 | } | ||
| 1310 | |||
| 1311 | /* | ||
| 1312 | * And we're the group leader.. | ||
| 1313 | */ | ||
| 1314 | if (cqm_group_leader(event)) { | ||
| 1315 | /* | ||
| 1316 | * If there was a group_other, make that leader, otherwise | ||
| 1317 | * destroy the group and return the RMID. | ||
| 1318 | */ | ||
| 1319 | if (group_other) { | ||
| 1320 | list_replace(&event->hw.cqm_groups_entry, | ||
| 1321 | &group_other->hw.cqm_groups_entry); | ||
| 1322 | } else { | ||
| 1323 | u32 rmid = event->hw.cqm_rmid; | ||
| 1324 | |||
| 1325 | if (__rmid_valid(rmid)) | ||
| 1326 | __put_rmid(rmid); | ||
| 1327 | list_del(&event->hw.cqm_groups_entry); | ||
| 1328 | } | ||
| 1329 | } | ||
| 1330 | |||
| 1331 | raw_spin_unlock_irqrestore(&cache_lock, flags); | ||
| 1332 | |||
| 1333 | /* | ||
| 1334 | * Stop the mbm overflow timers when the last event is destroyed. | ||
| 1335 | */ | ||
| 1336 | if (mbm_enabled && list_empty(&cache_groups)) | ||
| 1337 | mbm_stop_timers(); | ||
| 1338 | |||
| 1339 | mutex_unlock(&cache_mutex); | ||
| 1340 | } | ||
| 1341 | |||
| 1342 | static int intel_cqm_event_init(struct perf_event *event) | ||
| 1343 | { | ||
| 1344 | struct perf_event *group = NULL; | ||
| 1345 | bool rotate = false; | ||
| 1346 | unsigned long flags; | ||
| 1347 | |||
| 1348 | if (event->attr.type != intel_cqm_pmu.type) | ||
| 1349 | return -ENOENT; | ||
| 1350 | |||
| 1351 | if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) || | ||
| 1352 | (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) | ||
| 1353 | return -EINVAL; | ||
| 1354 | |||
| 1355 | if ((is_cqm_event(event->attr.config) && !cqm_enabled) || | ||
| 1356 | (is_mbm_event(event->attr.config) && !mbm_enabled)) | ||
| 1357 | return -EINVAL; | ||
| 1358 | |||
| 1359 | /* unsupported modes and filters */ | ||
| 1360 | if (event->attr.exclude_user || | ||
| 1361 | event->attr.exclude_kernel || | ||
| 1362 | event->attr.exclude_hv || | ||
| 1363 | event->attr.exclude_idle || | ||
| 1364 | event->attr.exclude_host || | ||
| 1365 | event->attr.exclude_guest || | ||
| 1366 | event->attr.sample_period) /* no sampling */ | ||
| 1367 | return -EINVAL; | ||
| 1368 | |||
| 1369 | INIT_LIST_HEAD(&event->hw.cqm_group_entry); | ||
| 1370 | INIT_LIST_HEAD(&event->hw.cqm_groups_entry); | ||
| 1371 | |||
| 1372 | event->destroy = intel_cqm_event_destroy; | ||
| 1373 | |||
| 1374 | mutex_lock(&cache_mutex); | ||
| 1375 | |||
| 1376 | /* | ||
| 1377 | * Start the mbm overflow timers when the first event is created. | ||
| 1378 | */ | ||
| 1379 | if (mbm_enabled && list_empty(&cache_groups)) | ||
| 1380 | mbm_start_timers(); | ||
| 1381 | |||
| 1382 | /* Will also set rmid */ | ||
| 1383 | intel_cqm_setup_event(event, &group); | ||
| 1384 | |||
| 1385 | /* | ||
| 1386 | * Hold the cache_lock as mbm timer handlers could be | ||
| 1387 | * scanning the list of events. | ||
| 1388 | */ | ||
| 1389 | raw_spin_lock_irqsave(&cache_lock, flags); | ||
| 1390 | |||
| 1391 | if (group) { | ||
| 1392 | list_add_tail(&event->hw.cqm_group_entry, | ||
| 1393 | &group->hw.cqm_group_entry); | ||
| 1394 | } else { | ||
| 1395 | list_add_tail(&event->hw.cqm_groups_entry, | ||
| 1396 | &cache_groups); | ||
| 1397 | |||
| 1398 | /* | ||
| 1399 | * All RMIDs are either in use or have recently been | ||
| 1400 | * used. Kick the rotation worker to clean/free some. | ||
| 1401 | * | ||
| 1402 | * We only do this for the group leader, rather than for | ||
| 1403 | * every event in a group to save on needless work. | ||
| 1404 | */ | ||
| 1405 | if (!__rmid_valid(event->hw.cqm_rmid)) | ||
| 1406 | rotate = true; | ||
| 1407 | } | ||
| 1408 | |||
| 1409 | raw_spin_unlock_irqrestore(&cache_lock, flags); | ||
| 1410 | mutex_unlock(&cache_mutex); | ||
| 1411 | |||
| 1412 | if (rotate) | ||
| 1413 | schedule_delayed_work(&intel_cqm_rmid_work, 0); | ||
| 1414 | |||
| 1415 | return 0; | ||
| 1416 | } | ||
| 1417 | |||
| 1418 | EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); | ||
| 1419 | EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); | ||
| 1420 | EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); | ||
| 1421 | EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); | ||
| 1422 | EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); | ||
| 1423 | |||
| 1424 | EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02"); | ||
| 1425 | EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1"); | ||
| 1426 | EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB"); | ||
| 1427 | EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6"); | ||
| 1428 | |||
| 1429 | EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03"); | ||
| 1430 | EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1"); | ||
| 1431 | EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB"); | ||
| 1432 | EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6"); | ||
| 1433 | |||
| 1434 | static struct attribute *intel_cqm_events_attr[] = { | ||
| 1435 | EVENT_PTR(intel_cqm_llc), | ||
| 1436 | EVENT_PTR(intel_cqm_llc_pkg), | ||
| 1437 | EVENT_PTR(intel_cqm_llc_unit), | ||
| 1438 | EVENT_PTR(intel_cqm_llc_scale), | ||
| 1439 | EVENT_PTR(intel_cqm_llc_snapshot), | ||
| 1440 | NULL, | ||
| 1441 | }; | ||
| 1442 | |||
| 1443 | static struct attribute *intel_mbm_events_attr[] = { | ||
| 1444 | EVENT_PTR(intel_cqm_total_bytes), | ||
| 1445 | EVENT_PTR(intel_cqm_local_bytes), | ||
| 1446 | EVENT_PTR(intel_cqm_total_bytes_pkg), | ||
| 1447 | EVENT_PTR(intel_cqm_local_bytes_pkg), | ||
| 1448 | EVENT_PTR(intel_cqm_total_bytes_unit), | ||
| 1449 | EVENT_PTR(intel_cqm_local_bytes_unit), | ||
| 1450 | EVENT_PTR(intel_cqm_total_bytes_scale), | ||
| 1451 | EVENT_PTR(intel_cqm_local_bytes_scale), | ||
| 1452 | NULL, | ||
| 1453 | }; | ||
| 1454 | |||
| 1455 | static struct attribute *intel_cmt_mbm_events_attr[] = { | ||
| 1456 | EVENT_PTR(intel_cqm_llc), | ||
| 1457 | EVENT_PTR(intel_cqm_total_bytes), | ||
| 1458 | EVENT_PTR(intel_cqm_local_bytes), | ||
| 1459 | EVENT_PTR(intel_cqm_llc_pkg), | ||
| 1460 | EVENT_PTR(intel_cqm_total_bytes_pkg), | ||
| 1461 | EVENT_PTR(intel_cqm_local_bytes_pkg), | ||
| 1462 | EVENT_PTR(intel_cqm_llc_unit), | ||
| 1463 | EVENT_PTR(intel_cqm_total_bytes_unit), | ||
| 1464 | EVENT_PTR(intel_cqm_local_bytes_unit), | ||
| 1465 | EVENT_PTR(intel_cqm_llc_scale), | ||
| 1466 | EVENT_PTR(intel_cqm_total_bytes_scale), | ||
| 1467 | EVENT_PTR(intel_cqm_local_bytes_scale), | ||
| 1468 | EVENT_PTR(intel_cqm_llc_snapshot), | ||
| 1469 | NULL, | ||
| 1470 | }; | ||
| 1471 | |||
| 1472 | static struct attribute_group intel_cqm_events_group = { | ||
| 1473 | .name = "events", | ||
| 1474 | .attrs = NULL, | ||
| 1475 | }; | ||
| 1476 | |||
| 1477 | PMU_FORMAT_ATTR(event, "config:0-7"); | ||
| 1478 | static struct attribute *intel_cqm_formats_attr[] = { | ||
| 1479 | &format_attr_event.attr, | ||
| 1480 | NULL, | ||
| 1481 | }; | ||
| 1482 | |||
| 1483 | static struct attribute_group intel_cqm_format_group = { | ||
| 1484 | .name = "format", | ||
| 1485 | .attrs = intel_cqm_formats_attr, | ||
| 1486 | }; | ||
| 1487 | |||
| 1488 | static ssize_t | ||
| 1489 | max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, | ||
| 1490 | char *page) | ||
| 1491 | { | ||
| 1492 | ssize_t rv; | ||
| 1493 | |||
| 1494 | mutex_lock(&cache_mutex); | ||
| 1495 | rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); | ||
| 1496 | mutex_unlock(&cache_mutex); | ||
| 1497 | |||
| 1498 | return rv; | ||
| 1499 | } | ||
| 1500 | |||
| 1501 | static ssize_t | ||
| 1502 | max_recycle_threshold_store(struct device *dev, | ||
| 1503 | struct device_attribute *attr, | ||
| 1504 | const char *buf, size_t count) | ||
| 1505 | { | ||
| 1506 | unsigned int bytes, cachelines; | ||
| 1507 | int ret; | ||
| 1508 | |||
| 1509 | ret = kstrtouint(buf, 0, &bytes); | ||
| 1510 | if (ret) | ||
| 1511 | return ret; | ||
| 1512 | |||
| 1513 | mutex_lock(&cache_mutex); | ||
| 1514 | |||
| 1515 | __intel_cqm_max_threshold = bytes; | ||
| 1516 | cachelines = bytes / cqm_l3_scale; | ||
| 1517 | |||
| 1518 | /* | ||
| 1519 | * The new maximum takes effect immediately. | ||
| 1520 | */ | ||
| 1521 | if (__intel_cqm_threshold > cachelines) | ||
| 1522 | __intel_cqm_threshold = cachelines; | ||
| 1523 | |||
| 1524 | mutex_unlock(&cache_mutex); | ||
| 1525 | |||
| 1526 | return count; | ||
| 1527 | } | ||
| 1528 | |||
| 1529 | static DEVICE_ATTR_RW(max_recycle_threshold); | ||
| 1530 | |||
| 1531 | static struct attribute *intel_cqm_attrs[] = { | ||
| 1532 | &dev_attr_max_recycle_threshold.attr, | ||
| 1533 | NULL, | ||
| 1534 | }; | ||
| 1535 | |||
| 1536 | static const struct attribute_group intel_cqm_group = { | ||
| 1537 | .attrs = intel_cqm_attrs, | ||
| 1538 | }; | ||
| 1539 | |||
| 1540 | static const struct attribute_group *intel_cqm_attr_groups[] = { | ||
| 1541 | &intel_cqm_events_group, | ||
| 1542 | &intel_cqm_format_group, | ||
| 1543 | &intel_cqm_group, | ||
| 1544 | NULL, | ||
| 1545 | }; | ||
| 1546 | |||
| 1547 | static struct pmu intel_cqm_pmu = { | ||
| 1548 | .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, | ||
| 1549 | .attr_groups = intel_cqm_attr_groups, | ||
| 1550 | .task_ctx_nr = perf_sw_context, | ||
| 1551 | .event_init = intel_cqm_event_init, | ||
| 1552 | .add = intel_cqm_event_add, | ||
| 1553 | .del = intel_cqm_event_stop, | ||
| 1554 | .start = intel_cqm_event_start, | ||
| 1555 | .stop = intel_cqm_event_stop, | ||
| 1556 | .read = intel_cqm_event_read, | ||
| 1557 | .count = intel_cqm_event_count, | ||
| 1558 | }; | ||
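For context, userspace consumes this PMU through the regular perf syscall: the dynamic type is read from sysfs (derived from the "intel_cqm" name passed to perf_pmu_register() below) and config selects one of the events defined above (0x01 llc_occupancy, 0x02 total_bytes, 0x03 local_bytes). A hedged sketch with minimal error handling:

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <string.h>
    #include <stdio.h>

    int main(void)
    {
        struct perf_event_attr attr;
        unsigned int type;
        long long count;
        FILE *f;
        int fd;

        f = fopen("/sys/bus/event_source/devices/intel_cqm/type", "r");
        if (!f)
            return 1;
        if (fscanf(f, "%u", &type) != 1) {
            fclose(f);
            return 1;
        }
        fclose(f);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = type;
        attr.config = 1;                /* llc_occupancy */

        /* CPU-wide event on CPU 0: pid = -1, cpu = 0 */
        fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
        if (fd < 0)
            return 1;
        if (read(fd, &count, sizeof(count)) == (ssize_t)sizeof(count))
            printf("llc_occupancy: %lld bytes\n", count);
        close(fd);
        return 0;
    }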
| 1559 | |||
| 1560 | static inline void cqm_pick_event_reader(int cpu) | ||
| 1561 | { | ||
| 1562 | int reader; | ||
| 1563 | |||
| 1564 | /* First online cpu in package becomes the reader */ | ||
| 1565 | reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu)); | ||
| 1566 | if (reader >= nr_cpu_ids) | ||
| 1567 | cpumask_set_cpu(cpu, &cqm_cpumask); | ||
| 1568 | } | ||
| 1569 | |||
| 1570 | static int intel_cqm_cpu_starting(unsigned int cpu) | ||
| 1571 | { | ||
| 1572 | struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); | ||
| 1573 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
| 1574 | |||
| 1575 | state->rmid = 0; | ||
| 1576 | state->closid = 0; | ||
| 1577 | state->rmid_usecnt = 0; | ||
| 1578 | |||
| 1579 | WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); | ||
| 1580 | WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); | ||
| 1581 | |||
| 1582 | cqm_pick_event_reader(cpu); | ||
| 1583 | return 0; | ||
| 1584 | } | ||
| 1585 | |||
| 1586 | static int intel_cqm_cpu_exit(unsigned int cpu) | ||
| 1587 | { | ||
| 1588 | int target; | ||
| 1589 | |||
| 1590 | /* Is @cpu the current cqm reader for this package? */ | ||
| 1591 | if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) | ||
| 1592 | return 0; | ||
| 1593 | |||
| 1594 | /* Find another online reader in this package */ | ||
| 1595 | target = cpumask_any_but(topology_core_cpumask(cpu), cpu); | ||
| 1596 | |||
| 1597 | if (target < nr_cpu_ids) | ||
| 1598 | cpumask_set_cpu(target, &cqm_cpumask); | ||
| 1599 | |||
| 1600 | return 0; | ||
| 1601 | } | ||
| 1602 | |||
| 1603 | static const struct x86_cpu_id intel_cqm_match[] = { | ||
| 1604 | { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, | ||
| 1605 | {} | ||
| 1606 | }; | ||
| 1607 | |||
| 1608 | static void mbm_cleanup(void) | ||
| 1609 | { | ||
| 1610 | if (!mbm_enabled) | ||
| 1611 | return; | ||
| 1612 | |||
| 1613 | kfree(mbm_local); | ||
| 1614 | kfree(mbm_total); | ||
| 1615 | mbm_enabled = false; | ||
| 1616 | } | ||
| 1617 | |||
| 1618 | static const struct x86_cpu_id intel_mbm_local_match[] = { | ||
| 1619 | { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL }, | ||
| 1620 | {} | ||
| 1621 | }; | ||
| 1622 | |||
| 1623 | static const struct x86_cpu_id intel_mbm_total_match[] = { | ||
| 1624 | { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL }, | ||
| 1625 | {} | ||
| 1626 | }; | ||
| 1627 | |||
| 1628 | static int intel_mbm_init(void) | ||
| 1629 | { | ||
| 1630 | int ret = 0, array_size, maxid = cqm_max_rmid + 1; | ||
| 1631 | |||
| 1632 | mbm_socket_max = topology_max_packages(); | ||
| 1633 | array_size = sizeof(struct sample) * maxid * mbm_socket_max; | ||
| 1634 | mbm_local = kmalloc(array_size, GFP_KERNEL); | ||
| 1635 | if (!mbm_local) | ||
| 1636 | return -ENOMEM; | ||
| 1637 | |||
| 1638 | mbm_total = kmalloc(array_size, GFP_KERNEL); | ||
| 1639 | if (!mbm_total) { | ||
| 1640 | ret = -ENOMEM; | ||
| 1641 | goto out; | ||
| 1642 | } | ||
| 1643 | |||
| 1644 | array_size = sizeof(struct hrtimer) * mbm_socket_max; | ||
| 1645 | mbm_timers = kmalloc(array_size, GFP_KERNEL); | ||
| 1646 | if (!mbm_timers) { | ||
| 1647 | ret = -ENOMEM; | ||
| 1648 | goto out; | ||
| 1649 | } | ||
| 1650 | mbm_hrtimer_init(); | ||
| 1651 | |||
| 1652 | out: | ||
| 1653 | if (ret) | ||
| 1654 | mbm_cleanup(); | ||
| 1655 | |||
| 1656 | return ret; | ||
| 1657 | } | ||
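mbm_local and mbm_total are flat arrays holding one struct sample per (package, RMID) pair, and update_sample() above picks the slot through rmid_2_index(). A small sketch of that sizing and indexing scheme, with invented names and an assumed package-major layout:

    #include <stdint.h>
    #include <stdlib.h>
    #include <stdio.h>

    struct sample {                 /* simplified stand-in */
        uint64_t total_bytes;
        uint64_t prev_msr;
    };

    /* One slot per (package, rmid); layout assumed package-major, rmid-minor. */
    static unsigned int sample_index(unsigned int pkg, unsigned int rmid,
                                     unsigned int max_rmid)
    {
        return pkg * (max_rmid + 1) + rmid;
    }

    int main(void)
    {
        unsigned int max_rmid = 55, nr_pkgs = 2;
        struct sample *tbl = calloc((size_t)(max_rmid + 1) * nr_pkgs, sizeof(*tbl));

        if (!tbl)
            return 1;
        tbl[sample_index(1, 7, max_rmid)].total_bytes = 4096;
        printf("%llu\n",
               (unsigned long long)tbl[sample_index(1, 7, max_rmid)].total_bytes);
        free(tbl);
        return 0;
    }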
| 1658 | |||
| 1659 | static int __init intel_cqm_init(void) | ||
| 1660 | { | ||
| 1661 | char *str = NULL, scale[20]; | ||
| 1662 | int cpu, ret; | ||
| 1663 | |||
| 1664 | if (x86_match_cpu(intel_cqm_match)) | ||
| 1665 | cqm_enabled = true; | ||
| 1666 | |||
| 1667 | if (x86_match_cpu(intel_mbm_local_match) && | ||
| 1668 | x86_match_cpu(intel_mbm_total_match)) | ||
| 1669 | mbm_enabled = true; | ||
| 1670 | |||
| 1671 | if (!cqm_enabled && !mbm_enabled) | ||
| 1672 | return -ENODEV; | ||
| 1673 | |||
| 1674 | cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; | ||
| 1675 | |||
| 1676 | /* | ||
| 1677 | * It's possible that not all resources support the same number | ||
| 1678 | * of RMIDs. Instead of making scheduling much more complicated | ||
| 1679 | * (where we have to match a task's RMID to a cpu that supports | ||
| 1680 | * that many RMIDs) just find the minimum RMIDs supported across | ||
| 1681 | * all cpus. | ||
| 1682 | * | ||
| 1683 | * Also, check that the scales match on all cpus. | ||
| 1684 | */ | ||
| 1685 | cpus_read_lock(); | ||
| 1686 | for_each_online_cpu(cpu) { | ||
| 1687 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
| 1688 | |||
| 1689 | if (c->x86_cache_max_rmid < cqm_max_rmid) | ||
| 1690 | cqm_max_rmid = c->x86_cache_max_rmid; | ||
| 1691 | |||
| 1692 | if (c->x86_cache_occ_scale != cqm_l3_scale) { | ||
| 1693 | pr_err("Multiple LLC scale values, disabling\n"); | ||
| 1694 | ret = -EINVAL; | ||
| 1695 | goto out; | ||
| 1696 | } | ||
| 1697 | } | ||
| 1698 | |||
| 1699 | /* | ||
| 1700 | * A reasonable upper limit on the max threshold is the number | ||
| 1701 | * of lines tagged per RMID if all RMIDs have the same number of | ||
| 1702 | * lines tagged in the LLC. | ||
| 1703 | * | ||
| 1704 | * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. | ||
| 1705 | */ | ||
| 1706 | __intel_cqm_max_threshold = | ||
| 1707 | boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); | ||
| 1708 | |||
| 1709 | snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); | ||
| 1710 | str = kstrdup(scale, GFP_KERNEL); | ||
| 1711 | if (!str) { | ||
| 1712 | ret = -ENOMEM; | ||
| 1713 | goto out; | ||
| 1714 | } | ||
| 1715 | |||
| 1716 | event_attr_intel_cqm_llc_scale.event_str = str; | ||
| 1717 | |||
| 1718 | ret = intel_cqm_setup_rmid_cache(); | ||
| 1719 | if (ret) | ||
| 1720 | goto out; | ||
| 1721 | |||
| 1722 | if (mbm_enabled) | ||
| 1723 | ret = intel_mbm_init(); | ||
| 1724 | if (ret && !cqm_enabled) | ||
| 1725 | goto out; | ||
| 1726 | |||
| 1727 | if (cqm_enabled && mbm_enabled) | ||
| 1728 | intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr; | ||
| 1729 | else if (!cqm_enabled && mbm_enabled) | ||
| 1730 | intel_cqm_events_group.attrs = intel_mbm_events_attr; | ||
| 1731 | else if (cqm_enabled && !mbm_enabled) | ||
| 1732 | intel_cqm_events_group.attrs = intel_cqm_events_attr; | ||
| 1733 | |||
| 1734 | ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); | ||
| 1735 | if (ret) { | ||
| 1736 | pr_err("Intel CQM perf registration failed: %d\n", ret); | ||
| 1737 | goto out; | ||
| 1738 | } | ||
| 1739 | |||
| 1740 | if (cqm_enabled) | ||
| 1741 | pr_info("Intel CQM monitoring enabled\n"); | ||
| 1742 | if (mbm_enabled) | ||
| 1743 | pr_info("Intel MBM enabled\n"); | ||
| 1744 | |||
| 1745 | /* | ||
| 1746 | * Set up the CPU hotplug notifiers once we are sure cqm | ||
| 1747 | * is enabled, to avoid a notifier leak. | ||
| 1748 | */ | ||
| 1749 | cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING, | ||
| 1750 | "perf/x86/cqm:starting", | ||
| 1751 | intel_cqm_cpu_starting, NULL); | ||
| 1752 | cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE, | ||
| 1753 | "perf/x86/cqm:online", | ||
| 1754 | NULL, intel_cqm_cpu_exit); | ||
| 1755 | out: | ||
| 1756 | cpus_read_unlock(); | ||
| 1757 | |||
| 1758 | if (ret) { | ||
| 1759 | kfree(str); | ||
| 1760 | cqm_cleanup(); | ||
| 1761 | mbm_cleanup(); | ||
| 1762 | } | ||
| 1763 | |||
| 1764 | return ret; | ||
| 1765 | } | ||
| 1766 | device_initcall(intel_cqm_init); | ||
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h deleted file mode 100644 index 597dc4995678..000000000000 --- a/arch/x86/include/asm/intel_rdt.h +++ /dev/null | |||
| @@ -1,286 +0,0 @@ | |||
| 1 | #ifndef _ASM_X86_INTEL_RDT_H | ||
| 2 | #define _ASM_X86_INTEL_RDT_H | ||
| 3 | |||
| 4 | #ifdef CONFIG_INTEL_RDT_A | ||
| 5 | |||
| 6 | #include <linux/sched.h> | ||
| 7 | #include <linux/kernfs.h> | ||
| 8 | #include <linux/jump_label.h> | ||
| 9 | |||
| 10 | #include <asm/intel_rdt_common.h> | ||
| 11 | |||
| 12 | #define IA32_L3_QOS_CFG 0xc81 | ||
| 13 | #define IA32_L3_CBM_BASE 0xc90 | ||
| 14 | #define IA32_L2_CBM_BASE 0xd10 | ||
| 15 | #define IA32_MBA_THRTL_BASE 0xd50 | ||
| 16 | |||
| 17 | #define L3_QOS_CDP_ENABLE 0x01ULL | ||
| 18 | |||
| 19 | /** | ||
| 20 | * struct rdtgroup - store rdtgroup's data in resctrl file system. | ||
| 21 | * @kn: kernfs node | ||
| 22 | * @rdtgroup_list: linked list for all rdtgroups | ||
| 23 | * @closid: closid for this rdtgroup | ||
| 24 | * @cpu_mask: CPUs assigned to this rdtgroup | ||
| 25 | * @flags: status bits | ||
| 26 | * @waitcount: how many cpus expect to find this | ||
| 27 | * group when they acquire rdtgroup_mutex | ||
| 28 | */ | ||
| 29 | struct rdtgroup { | ||
| 30 | struct kernfs_node *kn; | ||
| 31 | struct list_head rdtgroup_list; | ||
| 32 | int closid; | ||
| 33 | struct cpumask cpu_mask; | ||
| 34 | int flags; | ||
| 35 | atomic_t waitcount; | ||
| 36 | }; | ||
| 37 | |||
| 38 | /* rdtgroup.flags */ | ||
| 39 | #define RDT_DELETED 1 | ||
| 40 | |||
| 41 | /* rftype.flags */ | ||
| 42 | #define RFTYPE_FLAGS_CPUS_LIST 1 | ||
| 43 | |||
| 44 | /* List of all resource groups */ | ||
| 45 | extern struct list_head rdt_all_groups; | ||
| 46 | |||
| 47 | extern int max_name_width, max_data_width; | ||
| 48 | |||
| 49 | int __init rdtgroup_init(void); | ||
| 50 | |||
| 51 | /** | ||
| 52 | * struct rftype - describe each file in the resctrl file system | ||
| 53 | * @name: File name | ||
| 54 | * @mode: Access mode | ||
| 55 | * @kf_ops: File operations | ||
| 56 | * @flags: File specific RFTYPE_FLAGS_* flags | ||
| 57 | * @seq_show: Show content of the file | ||
| 58 | * @write: Write to the file | ||
| 59 | */ | ||
| 60 | struct rftype { | ||
| 61 | char *name; | ||
| 62 | umode_t mode; | ||
| 63 | struct kernfs_ops *kf_ops; | ||
| 64 | unsigned long flags; | ||
| 65 | |||
| 66 | int (*seq_show)(struct kernfs_open_file *of, | ||
| 67 | struct seq_file *sf, void *v); | ||
| 68 | /* | ||
| 69 | * write() is the generic write callback which maps directly to | ||
| 70 | * kernfs write operation and overrides all other operations. | ||
| 71 | * Maximum write size is determined by ->max_write_len. | ||
| 72 | */ | ||
| 73 | ssize_t (*write)(struct kernfs_open_file *of, | ||
| 74 | char *buf, size_t nbytes, loff_t off); | ||
| 75 | }; | ||
| 76 | |||
| 77 | /** | ||
| 78 | * struct rdt_domain - group of cpus sharing an RDT resource | ||
| 79 | * @list: all instances of this resource | ||
| 80 | * @id: unique id for this instance | ||
| 81 | * @cpu_mask: which cpus share this resource | ||
| 82 | * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) | ||
| 83 | * @new_ctrl: new ctrl value to be loaded | ||
| 84 | * @have_new_ctrl: did user provide new_ctrl for this domain | ||
| 85 | */ | ||
| 86 | struct rdt_domain { | ||
| 87 | struct list_head list; | ||
| 88 | int id; | ||
| 89 | struct cpumask cpu_mask; | ||
| 90 | u32 *ctrl_val; | ||
| 91 | u32 new_ctrl; | ||
| 92 | bool have_new_ctrl; | ||
| 93 | }; | ||
| 94 | |||
| 95 | /** | ||
| 96 | * struct msr_param - set a range of MSRs from a domain | ||
| 97 | * @res: The resource to use | ||
| 98 | * @low: Beginning index from base MSR | ||
| 99 | * @high: End index | ||
| 100 | */ | ||
| 101 | struct msr_param { | ||
| 102 | struct rdt_resource *res; | ||
| 103 | int low; | ||
| 104 | int high; | ||
| 105 | }; | ||
| 106 | |||
| 107 | /** | ||
| 108 | * struct rdt_cache - Cache allocation related data | ||
| 109 | * @cbm_len: Length of the cache bit mask | ||
| 110 | * @min_cbm_bits: Minimum number of consecutive bits to be set | ||
| 111 | * @cbm_idx_mult: Multiplier of CBM index | ||
| 112 | * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: | ||
| 113 | * closid * cbm_idx_multi + cbm_idx_offset | ||
| 114 | * in a cache bit mask | ||
| 115 | */ | ||
| 116 | struct rdt_cache { | ||
| 117 | unsigned int cbm_len; | ||
| 118 | unsigned int min_cbm_bits; | ||
| 119 | unsigned int cbm_idx_mult; | ||
| 120 | unsigned int cbm_idx_offset; | ||
| 121 | }; | ||
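The multiplier/offset pair lets CDP map one CLOSID onto every second CBM MSR while plain CAT keeps a 1:1 mapping. A tiny sketch of the index computation the comment above describes (the mult/offset values shown are the conventional ones, not read from hardware):

    #include <stdio.h>

    /* CBM MSR index for a CLOSID: closid * cbm_idx_mult + cbm_idx_offset. */
    static unsigned int cbm_idx(unsigned int closid, unsigned int mult,
                                unsigned int offset)
    {
        return closid * mult + offset;
    }

    int main(void)
    {
        /* Plain CAT: mult = 1, offset = 0 -> CLOSID 3 maps to index 3. */
        /* CDP data:  mult = 2, offset = 0 -> CLOSID 3 maps to index 6. */
        /* CDP code:  mult = 2, offset = 1 -> CLOSID 3 maps to index 7. */
        printf("%u %u %u\n", cbm_idx(3, 1, 0), cbm_idx(3, 2, 0), cbm_idx(3, 2, 1));
        return 0;
    }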
| 122 | |||
| 123 | /** | ||
| 124 | * struct rdt_membw - Memory bandwidth allocation related data | ||
| 125 | * @max_delay: Max throttle delay. Delay is the hardware | ||
| 126 | * representation for memory bandwidth. | ||
| 127 | * @min_bw: Minimum memory bandwidth percentage user can request | ||
| 128 | * @bw_gran: Granularity at which the memory bandwidth is allocated | ||
| 129 | * @delay_linear: True if memory B/W delay is in linear scale | ||
| 130 | * @mb_map: Mapping of memory B/W percentage to memory B/W delay | ||
| 131 | */ | ||
| 132 | struct rdt_membw { | ||
| 133 | u32 max_delay; | ||
| 134 | u32 min_bw; | ||
| 135 | u32 bw_gran; | ||
| 136 | u32 delay_linear; | ||
| 137 | u32 *mb_map; | ||
| 138 | }; | ||
| 139 | |||
| 140 | /** | ||
| 141 | * struct rdt_resource - attributes of an RDT resource | ||
| 142 | * @enabled: Is this feature enabled on this machine | ||
| 143 | * @capable: Is this feature available on this machine | ||
| 144 | * @name: Name to use in "schemata" file | ||
| 145 | * @num_closid: Number of CLOSIDs available | ||
| 146 | * @cache_level: Which cache level defines scope of this resource | ||
| 147 | * @default_ctrl: Specifies default cache cbm or memory B/W percent. | ||
| 148 | * @msr_base: Base MSR address for CBMs | ||
| 149 | * @msr_update: Function pointer to update QOS MSRs | ||
| 150 | * @data_width: Character width of data when displaying | ||
| 151 | * @domains: All domains for this resource | ||
| 152 | * @cache: Cache allocation related data | ||
| 153 | * @info_files: resctrl info files for the resource | ||
| 154 | * @nr_info_files: Number of info files | ||
| 155 | * @format_str: Per resource format string to show domain value | ||
| 156 | * @parse_ctrlval: Per resource function pointer to parse control values | ||
| 157 | */ | ||
| 158 | struct rdt_resource { | ||
| 159 | bool enabled; | ||
| 160 | bool capable; | ||
| 161 | char *name; | ||
| 162 | int num_closid; | ||
| 163 | int cache_level; | ||
| 164 | u32 default_ctrl; | ||
| 165 | unsigned int msr_base; | ||
| 166 | void (*msr_update) (struct rdt_domain *d, struct msr_param *m, | ||
| 167 | struct rdt_resource *r); | ||
| 168 | int data_width; | ||
| 169 | struct list_head domains; | ||
| 170 | struct rdt_cache cache; | ||
| 171 | struct rdt_membw membw; | ||
| 172 | struct rftype *info_files; | ||
| 173 | int nr_info_files; | ||
| 174 | const char *format_str; | ||
| 175 | int (*parse_ctrlval) (char *buf, struct rdt_resource *r, | ||
| 176 | struct rdt_domain *d); | ||
| 177 | }; | ||
| 178 | |||
| 179 | void rdt_get_cache_infofile(struct rdt_resource *r); | ||
| 180 | void rdt_get_mba_infofile(struct rdt_resource *r); | ||
| 181 | int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); | ||
| 182 | int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); | ||
| 183 | |||
| 184 | extern struct mutex rdtgroup_mutex; | ||
| 185 | |||
| 186 | extern struct rdt_resource rdt_resources_all[]; | ||
| 187 | extern struct rdtgroup rdtgroup_default; | ||
| 188 | DECLARE_STATIC_KEY_FALSE(rdt_enable_key); | ||
| 189 | |||
| 190 | int __init rdtgroup_init(void); | ||
| 191 | |||
| 192 | enum { | ||
| 193 | RDT_RESOURCE_L3, | ||
| 194 | RDT_RESOURCE_L3DATA, | ||
| 195 | RDT_RESOURCE_L3CODE, | ||
| 196 | RDT_RESOURCE_L2, | ||
| 197 | RDT_RESOURCE_MBA, | ||
| 198 | |||
| 199 | /* Must be the last */ | ||
| 200 | RDT_NUM_RESOURCES, | ||
| 201 | }; | ||
| 202 | |||
| 203 | #define for_each_capable_rdt_resource(r) \ | ||
| 204 | for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ | ||
| 205 | r++) \ | ||
| 206 | if (r->capable) | ||
| 207 | |||
| 208 | #define for_each_enabled_rdt_resource(r) \ | ||
| 209 | for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ | ||
| 210 | r++) \ | ||
| 211 | if (r->enabled) | ||
| 212 | |||
| 213 | /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ | ||
| 214 | union cpuid_0x10_1_eax { | ||
| 215 | struct { | ||
| 216 | unsigned int cbm_len:5; | ||
| 217 | } split; | ||
| 218 | unsigned int full; | ||
| 219 | }; | ||
| 220 | |||
| 221 | /* CPUID.(EAX=10H, ECX=ResID=3).EAX */ | ||
| 222 | union cpuid_0x10_3_eax { | ||
| 223 | struct { | ||
| 224 | unsigned int max_delay:12; | ||
| 225 | } split; | ||
| 226 | unsigned int full; | ||
| 227 | }; | ||
| 228 | |||
| 229 | /* CPUID.(EAX=10H, ECX=ResID).EDX */ | ||
| 230 | union cpuid_0x10_x_edx { | ||
| 231 | struct { | ||
| 232 | unsigned int cos_max:16; | ||
| 233 | } split; | ||
| 234 | unsigned int full; | ||
| 235 | }; | ||
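These unions decode CPUID leaf 0x10. A hedged userspace sketch that reads the same fields with GCC's <cpuid.h> helper; it assumes the CPU enumerates the leaf (CPUID.(7,0):EBX bit 15) and follows the add-one convention that intel_rdt.c applies to the raw cbm_len and cos_max fields:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* CPUID.(EAX=7, ECX=0): EBX bit 15 advertises RDT allocation. */
        __cpuid_count(7, 0, eax, ebx, ecx, edx);
        if (!(ebx & (1 << 15)))
            return 1;

        /* CPUID.(EAX=0x10, ECX=1): L3 cache allocation parameters. */
        __cpuid_count(0x10, 1, eax, ebx, ecx, edx);
        printf("cbm_len   : %u\n", (eax & 0x1f) + 1);    /* EAX[4:0] is length - 1 */
        printf("num_closid: %u\n", (edx & 0xffff) + 1);  /* EDX[15:0] is max COS   */
        return 0;
    }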
| 236 | |||
| 237 | DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); | ||
| 238 | |||
| 239 | void rdt_ctrl_update(void *arg); | ||
| 240 | struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); | ||
| 241 | void rdtgroup_kn_unlock(struct kernfs_node *kn); | ||
| 242 | ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, | ||
| 243 | char *buf, size_t nbytes, loff_t off); | ||
| 244 | int rdtgroup_schemata_show(struct kernfs_open_file *of, | ||
| 245 | struct seq_file *s, void *v); | ||
| 246 | |||
| 247 | /* | ||
| 248 | * intel_rdt_sched_in() - Writes the task's CLOSid to the IA32_PQR_ASSOC MSR | ||
| 249 | * | ||
| 250 | * The following considerations are made so that this has minimal impact | ||
| 251 | * on scheduler hot path: | ||
| 252 | * - This will stay as no-op unless we are running on an Intel SKU | ||
| 253 | * which supports resource control and we enable by mounting the | ||
| 254 | * resctrl file system. | ||
| 255 | * - Caches the per cpu CLOSid values and does the MSR write only | ||
| 256 | * when a task with a different CLOSid is scheduled in. | ||
| 257 | * | ||
| 258 | * Must be called with preemption disabled. | ||
| 259 | */ | ||
| 260 | static inline void intel_rdt_sched_in(void) | ||
| 261 | { | ||
| 262 | if (static_branch_likely(&rdt_enable_key)) { | ||
| 263 | struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); | ||
| 264 | int closid; | ||
| 265 | |||
| 266 | /* | ||
| 267 | * If this task has a closid assigned, use it. | ||
| 268 | * Else use the closid assigned to this cpu. | ||
| 269 | */ | ||
| 270 | closid = current->closid; | ||
| 271 | if (closid == 0) | ||
| 272 | closid = this_cpu_read(cpu_closid); | ||
| 273 | |||
| 274 | if (closid != state->closid) { | ||
| 275 | state->closid = closid; | ||
| 276 | wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid); | ||
| 277 | } | ||
| 278 | } | ||
| 279 | } | ||
| 280 | |||
| 281 | #else | ||
| 282 | |||
| 283 | static inline void intel_rdt_sched_in(void) {} | ||
| 284 | |||
| 285 | #endif /* CONFIG_INTEL_RDT_A */ | ||
| 286 | #endif /* _ASM_X86_INTEL_RDT_H */ | ||
diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h deleted file mode 100644 index b31081b89407..000000000000 --- a/arch/x86/include/asm/intel_rdt_common.h +++ /dev/null | |||
| @@ -1,27 +0,0 @@ | |||
| 1 | #ifndef _ASM_X86_INTEL_RDT_COMMON_H | ||
| 2 | #define _ASM_X86_INTEL_RDT_COMMON_H | ||
| 3 | |||
| 4 | #define MSR_IA32_PQR_ASSOC 0x0c8f | ||
| 5 | |||
| 6 | /** | ||
| 7 | * struct intel_pqr_state - State cache for the PQR MSR | ||
| 8 | * @rmid: The cached Resource Monitoring ID | ||
| 9 | * @closid: The cached Class Of Service ID | ||
| 10 | * @rmid_usecnt: The usage counter for rmid | ||
| 11 | * | ||
| 12 | * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the | ||
| 13 | * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always | ||
| 14 | * contains both parts, so we need to cache them. | ||
| 15 | * | ||
| 16 | * The cache also helps to avoid pointless updates if the value does | ||
| 17 | * not change. | ||
| 18 | */ | ||
| 19 | struct intel_pqr_state { | ||
| 20 | u32 rmid; | ||
| 21 | u32 closid; | ||
| 22 | int rmid_usecnt; | ||
| 23 | }; | ||
| 24 | |||
| 25 | DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); | ||
| 26 | |||
| 27 | #endif /* _ASM_X86_INTEL_RDT_COMMON_H */ | ||
diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h new file mode 100644 index 000000000000..b4bbf8b21512 --- /dev/null +++ b/arch/x86/include/asm/intel_rdt_sched.h | |||
| @@ -0,0 +1,92 @@ | |||
| 1 | #ifndef _ASM_X86_INTEL_RDT_SCHED_H | ||
| 2 | #define _ASM_X86_INTEL_RDT_SCHED_H | ||
| 3 | |||
| 4 | #ifdef CONFIG_INTEL_RDT | ||
| 5 | |||
| 6 | #include <linux/sched.h> | ||
| 7 | #include <linux/jump_label.h> | ||
| 8 | |||
| 9 | #define IA32_PQR_ASSOC 0x0c8f | ||
| 10 | |||
| 11 | /** | ||
| 12 | * struct intel_pqr_state - State cache for the PQR MSR | ||
| 13 | * @cur_rmid: The cached Resource Monitoring ID | ||
| 14 | * @cur_closid: The cached Class Of Service ID | ||
| 15 | * @default_rmid: The user assigned Resource Monitoring ID | ||
| 16 | * @default_closid: The user assigned Class Of Service ID | ||
| 17 | * | ||
| 18 | * The upper 32 bits of IA32_PQR_ASSOC contain closid and the | ||
| 19 | * lower 10 bits rmid. The update to IA32_PQR_ASSOC always | ||
| 20 | * contains both parts, so we need to cache them. This also | ||
| 21 | * stores the user configured per cpu CLOSID and RMID. | ||
| 22 | * | ||
| 23 | * The cache also helps to avoid pointless updates if the value does | ||
| 24 | * not change. | ||
| 25 | */ | ||
| 26 | struct intel_pqr_state { | ||
| 27 | u32 cur_rmid; | ||
| 28 | u32 cur_closid; | ||
| 29 | u32 default_rmid; | ||
| 30 | u32 default_closid; | ||
| 31 | }; | ||
| 32 | |||
| 33 | DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); | ||
| 34 | |||
| 35 | DECLARE_STATIC_KEY_FALSE(rdt_enable_key); | ||
| 36 | DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); | ||
| 37 | DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); | ||
| 38 | |||
| 39 | /* | ||
| 40 | * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to the IA32_PQR_ASSOC MSR | ||
| 41 | * | ||
| 42 | * The following considerations are made so that this has minimal impact | ||
| 43 | * on scheduler hot path: | ||
| 44 | * - This will stay as no-op unless we are running on an Intel SKU | ||
| 45 | * which supports resource control or monitoring and we enable by | ||
| 46 | * mounting the resctrl file system. | ||
| 47 | * - Caches the per cpu CLOSid/RMID values and does the MSR write only | ||
| 48 | * when a task with a different CLOSid/RMID is scheduled in. | ||
| 49 | * - We allocate RMIDs/CLOSids globally in order to keep this as | ||
| 50 | * simple as possible. | ||
| 51 | * Must be called with preemption disabled. | ||
| 52 | */ | ||
| 53 | static void __intel_rdt_sched_in(void) | ||
| 54 | { | ||
| 55 | struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); | ||
| 56 | u32 closid = state->default_closid; | ||
| 57 | u32 rmid = state->default_rmid; | ||
| 58 | |||
| 59 | /* | ||
| 60 | * If this task has a closid/rmid assigned, use it. | ||
| 61 | * Else use the closid/rmid assigned to this cpu. | ||
| 62 | */ | ||
| 63 | if (static_branch_likely(&rdt_alloc_enable_key)) { | ||
| 64 | if (current->closid) | ||
| 65 | closid = current->closid; | ||
| 66 | } | ||
| 67 | |||
| 68 | if (static_branch_likely(&rdt_mon_enable_key)) { | ||
| 69 | if (current->rmid) | ||
| 70 | rmid = current->rmid; | ||
| 71 | } | ||
| 72 | |||
| 73 | if (closid != state->cur_closid || rmid != state->cur_rmid) { | ||
| 74 | state->cur_closid = closid; | ||
| 75 | state->cur_rmid = rmid; | ||
| 76 | wrmsr(IA32_PQR_ASSOC, rmid, closid); | ||
| 77 | } | ||
| 78 | } | ||
| 79 | |||
| 80 | static inline void intel_rdt_sched_in(void) | ||
| 81 | { | ||
| 82 | if (static_branch_likely(&rdt_enable_key)) | ||
| 83 | __intel_rdt_sched_in(); | ||
| 84 | } | ||
| 85 | |||
| 86 | #else | ||
| 87 | |||
| 88 | static inline void intel_rdt_sched_in(void) {} | ||
| 89 | |||
| 90 | #endif /* CONFIG_INTEL_RDT */ | ||
| 91 | |||
| 92 | #endif /* _ASM_X86_INTEL_RDT_SCHED_H */ | ||
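The wrmsr(IA32_PQR_ASSOC, rmid, closid) above puts the RMID in the low word and the CLOSID in the high word of the MSR, matching the struct comment. A sketch of the 64-bit value composition (pure arithmetic, no MSR access; the 10-bit RMID mask follows the comment above):

    #include <stdint.h>
    #include <stdio.h>

    /* IA32_PQR_ASSOC layout: bits 0-9 RMID, bits 32-63 CLOSID, rest reserved. */
    static uint64_t pqr_assoc_val(uint32_t rmid, uint32_t closid)
    {
        return ((uint64_t)closid << 32) | (rmid & 0x3ff);
    }

    int main(void)
    {
        /* rmid = 5, closid = 3 -> 0x0000000300000005 */
        printf("0x%016llx\n", (unsigned long long)pqr_assoc_val(5, 3));
        return 0;
    }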
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cdf82492b770..e17942c131c8 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile | |||
| @@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o | |||
| 33 | obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o | 33 | obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o |
| 34 | obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o | 34 | obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o |
| 35 | 35 | ||
| 36 | obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o | 36 | obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o |
| 37 | 37 | ||
| 38 | obj-$(CONFIG_X86_MCE) += mcheck/ | 38 | obj-$(CONFIG_X86_MCE) += mcheck/ |
| 39 | obj-$(CONFIG_MTRR) += mtrr/ | 39 | obj-$(CONFIG_MTRR) += mtrr/ |
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5b366462f579..cd5fc61ba450 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c | |||
| @@ -30,7 +30,8 @@ | |||
| 30 | #include <linux/cpuhotplug.h> | 30 | #include <linux/cpuhotplug.h> |
| 31 | 31 | ||
| 32 | #include <asm/intel-family.h> | 32 | #include <asm/intel-family.h> |
| 33 | #include <asm/intel_rdt.h> | 33 | #include <asm/intel_rdt_sched.h> |
| 34 | #include "intel_rdt.h" | ||
| 34 | 35 | ||
| 35 | #define MAX_MBA_BW 100u | 36 | #define MAX_MBA_BW 100u |
| 36 | #define MBA_IS_LINEAR 0x4 | 37 | #define MBA_IS_LINEAR 0x4 |
| @@ -38,7 +39,13 @@ | |||
| 38 | /* Mutex to protect rdtgroup access. */ | 39 | /* Mutex to protect rdtgroup access. */ |
| 39 | DEFINE_MUTEX(rdtgroup_mutex); | 40 | DEFINE_MUTEX(rdtgroup_mutex); |
| 40 | 41 | ||
| 41 | DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); | 42 | /* |
| 43 | * The cached intel_pqr_state is strictly per CPU and can never be | ||
| 44 | * updated from a remote CPU. Functions which modify the state | ||
| 45 | * are called with interrupts disabled and no preemption, which | ||
| 46 | * is sufficient for the protection. | ||
| 47 | */ | ||
| 48 | DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); | ||
| 42 | 49 | ||
| 43 | /* | 50 | /* |
| 44 | * Used to store the max resource name width and max resource data width | 51 | * Used to store the max resource name width and max resource data width |
| @@ -46,6 +53,12 @@ DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); | |||
| 46 | */ | 53 | */ |
| 47 | int max_name_width, max_data_width; | 54 | int max_name_width, max_data_width; |
| 48 | 55 | ||
| 56 | /* | ||
| 57 | * Global boolean for rdt_alloc which is true if any | ||
| 58 | * resource allocation is enabled. | ||
| 59 | */ | ||
| 60 | bool rdt_alloc_capable; | ||
| 61 | |||
| 49 | static void | 62 | static void |
| 50 | mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); | 63 | mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); |
| 51 | static void | 64 | static void |
| @@ -54,7 +67,9 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); | |||
| 54 | #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) | 67 | #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) |
| 55 | 68 | ||
| 56 | struct rdt_resource rdt_resources_all[] = { | 69 | struct rdt_resource rdt_resources_all[] = { |
| 70 | [RDT_RESOURCE_L3] = | ||
| 57 | { | 71 | { |
| 72 | .rid = RDT_RESOURCE_L3, | ||
| 58 | .name = "L3", | 73 | .name = "L3", |
| 59 | .domains = domain_init(RDT_RESOURCE_L3), | 74 | .domains = domain_init(RDT_RESOURCE_L3), |
| 60 | .msr_base = IA32_L3_CBM_BASE, | 75 | .msr_base = IA32_L3_CBM_BASE, |
| @@ -67,8 +82,11 @@ struct rdt_resource rdt_resources_all[] = { | |||
| 67 | }, | 82 | }, |
| 68 | .parse_ctrlval = parse_cbm, | 83 | .parse_ctrlval = parse_cbm, |
| 69 | .format_str = "%d=%0*x", | 84 | .format_str = "%d=%0*x", |
| 85 | .fflags = RFTYPE_RES_CACHE, | ||
| 70 | }, | 86 | }, |
| 87 | [RDT_RESOURCE_L3DATA] = | ||
| 71 | { | 88 | { |
| 89 | .rid = RDT_RESOURCE_L3DATA, | ||
| 72 | .name = "L3DATA", | 90 | .name = "L3DATA", |
| 73 | .domains = domain_init(RDT_RESOURCE_L3DATA), | 91 | .domains = domain_init(RDT_RESOURCE_L3DATA), |
| 74 | .msr_base = IA32_L3_CBM_BASE, | 92 | .msr_base = IA32_L3_CBM_BASE, |
| @@ -81,8 +99,11 @@ struct rdt_resource rdt_resources_all[] = { | |||
| 81 | }, | 99 | }, |
| 82 | .parse_ctrlval = parse_cbm, | 100 | .parse_ctrlval = parse_cbm, |
| 83 | .format_str = "%d=%0*x", | 101 | .format_str = "%d=%0*x", |
| 102 | .fflags = RFTYPE_RES_CACHE, | ||
| 84 | }, | 103 | }, |
| 104 | [RDT_RESOURCE_L3CODE] = | ||
| 85 | { | 105 | { |
| 106 | .rid = RDT_RESOURCE_L3CODE, | ||
| 86 | .name = "L3CODE", | 107 | .name = "L3CODE", |
| 87 | .domains = domain_init(RDT_RESOURCE_L3CODE), | 108 | .domains = domain_init(RDT_RESOURCE_L3CODE), |
| 88 | .msr_base = IA32_L3_CBM_BASE, | 109 | .msr_base = IA32_L3_CBM_BASE, |
| @@ -95,8 +116,11 @@ struct rdt_resource rdt_resources_all[] = { | |||
| 95 | }, | 116 | }, |
| 96 | .parse_ctrlval = parse_cbm, | 117 | .parse_ctrlval = parse_cbm, |
| 97 | .format_str = "%d=%0*x", | 118 | .format_str = "%d=%0*x", |
| 119 | .fflags = RFTYPE_RES_CACHE, | ||
| 98 | }, | 120 | }, |
| 121 | [RDT_RESOURCE_L2] = | ||
| 99 | { | 122 | { |
| 123 | .rid = RDT_RESOURCE_L2, | ||
| 100 | .name = "L2", | 124 | .name = "L2", |
| 101 | .domains = domain_init(RDT_RESOURCE_L2), | 125 | .domains = domain_init(RDT_RESOURCE_L2), |
| 102 | .msr_base = IA32_L2_CBM_BASE, | 126 | .msr_base = IA32_L2_CBM_BASE, |
| @@ -109,8 +133,11 @@ struct rdt_resource rdt_resources_all[] = { | |||
| 109 | }, | 133 | }, |
| 110 | .parse_ctrlval = parse_cbm, | 134 | .parse_ctrlval = parse_cbm, |
| 111 | .format_str = "%d=%0*x", | 135 | .format_str = "%d=%0*x", |
| 136 | .fflags = RFTYPE_RES_CACHE, | ||
| 112 | }, | 137 | }, |
| 138 | [RDT_RESOURCE_MBA] = | ||
| 113 | { | 139 | { |
| 140 | .rid = RDT_RESOURCE_MBA, | ||
| 114 | .name = "MB", | 141 | .name = "MB", |
| 115 | .domains = domain_init(RDT_RESOURCE_MBA), | 142 | .domains = domain_init(RDT_RESOURCE_MBA), |
| 116 | .msr_base = IA32_MBA_THRTL_BASE, | 143 | .msr_base = IA32_MBA_THRTL_BASE, |
| @@ -118,6 +145,7 @@ struct rdt_resource rdt_resources_all[] = { | |||
| 118 | .cache_level = 3, | 145 | .cache_level = 3, |
| 119 | .parse_ctrlval = parse_bw, | 146 | .parse_ctrlval = parse_bw, |
| 120 | .format_str = "%d=%*d", | 147 | .format_str = "%d=%*d", |
| 148 | .fflags = RFTYPE_RES_MB, | ||
| 121 | }, | 149 | }, |
| 122 | }; | 150 | }; |
| 123 | 151 | ||
| @@ -144,33 +172,28 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) | |||
| 144 | * is always 20 on hsw server parts. The minimum cache bitmask length | 172 | * is always 20 on hsw server parts. The minimum cache bitmask length |
| 145 | * allowed for HSW server is always 2 bits. Hardcode all of them. | 173 | * allowed for HSW server is always 2 bits. Hardcode all of them. |
| 146 | */ | 174 | */ |
| 147 | static inline bool cache_alloc_hsw_probe(void) | 175 | static inline void cache_alloc_hsw_probe(void) |
| 148 | { | 176 | { |
| 149 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && | 177 | struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; |
| 150 | boot_cpu_data.x86 == 6 && | 178 | u32 l, h, max_cbm = BIT_MASK(20) - 1; |
| 151 | boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { | ||
| 152 | struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; | ||
| 153 | u32 l, h, max_cbm = BIT_MASK(20) - 1; | ||
| 154 | |||
| 155 | if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) | ||
| 156 | return false; | ||
| 157 | rdmsr(IA32_L3_CBM_BASE, l, h); | ||
| 158 | 179 | ||
| 159 | /* If all the bits were set in MSR, return success */ | 180 | if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) |
| 160 | if (l != max_cbm) | 181 | return; |
| 161 | return false; | 182 | rdmsr(IA32_L3_CBM_BASE, l, h); |
| 162 | 183 | ||
| 163 | r->num_closid = 4; | 184 | /* If all the bits were set in MSR, return success */ |
| 164 | r->default_ctrl = max_cbm; | 185 | if (l != max_cbm) |
| 165 | r->cache.cbm_len = 20; | 186 | return; |
| 166 | r->cache.min_cbm_bits = 2; | ||
| 167 | r->capable = true; | ||
| 168 | r->enabled = true; | ||
| 169 | 187 | ||
| 170 | return true; | 188 | r->num_closid = 4; |
| 171 | } | 189 | r->default_ctrl = max_cbm; |
| 190 | r->cache.cbm_len = 20; | ||
| 191 | r->cache.shareable_bits = 0xc0000; | ||
| 192 | r->cache.min_cbm_bits = 2; | ||
| 193 | r->alloc_capable = true; | ||
| 194 | r->alloc_enabled = true; | ||
| 172 | 195 | ||
| 173 | return false; | 196 | rdt_alloc_capable = true; |
| 174 | } | 197 | } |
| 175 | 198 | ||
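The Haswell probe above works by writing an all-ones 20-bit mask to the first L3 CBM MSR and reading it back: if every bit sticks, CAT is assumed present and the parameters are hardcoded. A standalone sketch of the hardcoded values (illustrative only, not part of the patch; the reading of 0xc0000 as the top two ways is an interpretation):

    /* Illustration of the Haswell-server probe constants. */
    #include <stdio.h>

    int main(void)
    {
        unsigned int cbm_len = 20;
        unsigned int max_cbm = (1u << cbm_len) - 1;   /* BIT_MASK(20) - 1 */

        /* 0xfffff: written to and expected back from IA32_L3_CBM_BASE */
        printf("max_cbm        = 0x%x\n", max_cbm);
        /* 0xc0000: bits 18 and 19, i.e. the top two of the twenty mask bits */
        printf("shareable_bits = 0x%x\n", 0x3u << (cbm_len - 2));
        return 0;
    }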
| 176 | /* | 199 | /* |
| @@ -213,15 +236,14 @@ static bool rdt_get_mem_config(struct rdt_resource *r) | |||
| 213 | return false; | 236 | return false; |
| 214 | } | 237 | } |
| 215 | r->data_width = 3; | 238 | r->data_width = 3; |
| 216 | rdt_get_mba_infofile(r); | ||
| 217 | 239 | ||
| 218 | r->capable = true; | 240 | r->alloc_capable = true; |
| 219 | r->enabled = true; | 241 | r->alloc_enabled = true; |
| 220 | 242 | ||
| 221 | return true; | 243 | return true; |
| 222 | } | 244 | } |
| 223 | 245 | ||
| 224 | static void rdt_get_cache_config(int idx, struct rdt_resource *r) | 246 | static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) |
| 225 | { | 247 | { |
| 226 | union cpuid_0x10_1_eax eax; | 248 | union cpuid_0x10_1_eax eax; |
| 227 | union cpuid_0x10_x_edx edx; | 249 | union cpuid_0x10_x_edx edx; |
| @@ -231,10 +253,10 @@ static void rdt_get_cache_config(int idx, struct rdt_resource *r) | |||
| 231 | r->num_closid = edx.split.cos_max + 1; | 253 | r->num_closid = edx.split.cos_max + 1; |
| 232 | r->cache.cbm_len = eax.split.cbm_len + 1; | 254 | r->cache.cbm_len = eax.split.cbm_len + 1; |
| 233 | r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; | 255 | r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; |
| 256 | r->cache.shareable_bits = ebx & r->default_ctrl; | ||
| 234 | r->data_width = (r->cache.cbm_len + 3) / 4; | 257 | r->data_width = (r->cache.cbm_len + 3) / 4; |
| 235 | rdt_get_cache_infofile(r); | 258 | r->alloc_capable = true; |
| 236 | r->capable = true; | 259 | r->alloc_enabled = true; |
| 237 | r->enabled = true; | ||
| 238 | } | 260 | } |
| 239 | 261 | ||
| 240 | static void rdt_get_cdp_l3_config(int type) | 262 | static void rdt_get_cdp_l3_config(int type) |
| @@ -246,12 +268,12 @@ static void rdt_get_cdp_l3_config(int type) | |||
| 246 | r->cache.cbm_len = r_l3->cache.cbm_len; | 268 | r->cache.cbm_len = r_l3->cache.cbm_len; |
| 247 | r->default_ctrl = r_l3->default_ctrl; | 269 | r->default_ctrl = r_l3->default_ctrl; |
| 248 | r->data_width = (r->cache.cbm_len + 3) / 4; | 270 | r->data_width = (r->cache.cbm_len + 3) / 4; |
| 249 | r->capable = true; | 271 | r->alloc_capable = true; |
| 250 | /* | 272 | /* |
| 251 | * By default, CDP is disabled. CDP can be enabled by mount parameter | 273 | * By default, CDP is disabled. CDP can be enabled by mount parameter |
| 252 | * "cdp" during resctrl file system mount time. | 274 | * "cdp" during resctrl file system mount time. |
| 253 | */ | 275 | */ |
| 254 | r->enabled = false; | 276 | r->alloc_enabled = false; |
| 255 | } | 277 | } |
| 256 | 278 | ||
| 257 | static int get_cache_id(int cpu, int level) | 279 | static int get_cache_id(int cpu, int level) |
| @@ -300,6 +322,19 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) | |||
| 300 | wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); | 322 | wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); |
| 301 | } | 323 | } |
| 302 | 324 | ||
| 325 | struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) | ||
| 326 | { | ||
| 327 | struct rdt_domain *d; | ||
| 328 | |||
| 329 | list_for_each_entry(d, &r->domains, list) { | ||
| 330 | /* Find the domain that contains this CPU */ | ||
| 331 | if (cpumask_test_cpu(cpu, &d->cpu_mask)) | ||
| 332 | return d; | ||
| 333 | } | ||
| 334 | |||
| 335 | return NULL; | ||
| 336 | } | ||
| 337 | |||
| 303 | void rdt_ctrl_update(void *arg) | 338 | void rdt_ctrl_update(void *arg) |
| 304 | { | 339 | { |
| 305 | struct msr_param *m = arg; | 340 | struct msr_param *m = arg; |
| @@ -307,12 +342,10 @@ void rdt_ctrl_update(void *arg) | |||
| 307 | int cpu = smp_processor_id(); | 342 | int cpu = smp_processor_id(); |
| 308 | struct rdt_domain *d; | 343 | struct rdt_domain *d; |
| 309 | 344 | ||
| 310 | list_for_each_entry(d, &r->domains, list) { | 345 | d = get_domain_from_cpu(cpu, r); |
| 311 | /* Find the domain that contains this CPU */ | 346 | if (d) { |
| 312 | if (cpumask_test_cpu(cpu, &d->cpu_mask)) { | 347 | r->msr_update(d, m, r); |
| 313 | r->msr_update(d, m, r); | 348 | return; |
| 314 | return; | ||
| 315 | } | ||
| 316 | } | 349 | } |
| 317 | pr_warn_once("cpu %d not found in any domain for resource %s\n", | 350 | pr_warn_once("cpu %d not found in any domain for resource %s\n", |
| 318 | cpu, r->name); | 351 | cpu, r->name); |
| @@ -326,8 +359,8 @@ void rdt_ctrl_update(void *arg) | |||
| 326 | * caller, return the first domain whose id is bigger than the input id. | 359 | * caller, return the first domain whose id is bigger than the input id. |
| 327 | * The domain list is sorted by id in ascending order. | 360 | * The domain list is sorted by id in ascending order. |
| 328 | */ | 361 | */ |
| 329 | static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, | 362 | struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, |
| 330 | struct list_head **pos) | 363 | struct list_head **pos) |
| 331 | { | 364 | { |
| 332 | struct rdt_domain *d; | 365 | struct rdt_domain *d; |
| 333 | struct list_head *l; | 366 | struct list_head *l; |
| @@ -377,6 +410,44 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) | |||
| 377 | return 0; | 410 | return 0; |
| 378 | } | 411 | } |
| 379 | 412 | ||
| 413 | static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) | ||
| 414 | { | ||
| 415 | size_t tsize; | ||
| 416 | |||
| 417 | if (is_llc_occupancy_enabled()) { | ||
| 418 | d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid), | ||
| 419 | sizeof(unsigned long), | ||
| 420 | GFP_KERNEL); | ||
| 421 | if (!d->rmid_busy_llc) | ||
| 422 | return -ENOMEM; | ||
| 423 | INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); | ||
| 424 | } | ||
| 425 | if (is_mbm_total_enabled()) { | ||
| 426 | tsize = sizeof(*d->mbm_total); | ||
| 427 | d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); | ||
| 428 | if (!d->mbm_total) { | ||
| 429 | kfree(d->rmid_busy_llc); | ||
| 430 | return -ENOMEM; | ||
| 431 | } | ||
| 432 | } | ||
| 433 | if (is_mbm_local_enabled()) { | ||
| 434 | tsize = sizeof(*d->mbm_local); | ||
| 435 | d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); | ||
| 436 | if (!d->mbm_local) { | ||
| 437 | kfree(d->rmid_busy_llc); | ||
| 438 | kfree(d->mbm_total); | ||
| 439 | return -ENOMEM; | ||
| 440 | } | ||
| 441 | } | ||
| 442 | |||
| 443 | if (is_mbm_enabled()) { | ||
| 444 | INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); | ||
| 445 | mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); | ||
| 446 | } | ||
| 447 | |||
| 448 | return 0; | ||
| 449 | } | ||
| 450 | |||
| 380 | /* | 451 | /* |
| 381 | * domain_add_cpu - Add a cpu to a resource's domain list. | 452 | * domain_add_cpu - Add a cpu to a resource's domain list. |
| 382 | * | 453 | * |
| @@ -412,14 +483,26 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) | |||
| 412 | return; | 483 | return; |
| 413 | 484 | ||
| 414 | d->id = id; | 485 | d->id = id; |
| 486 | cpumask_set_cpu(cpu, &d->cpu_mask); | ||
| 415 | 487 | ||
| 416 | if (domain_setup_ctrlval(r, d)) { | 488 | if (r->alloc_capable && domain_setup_ctrlval(r, d)) { |
| 489 | kfree(d); | ||
| 490 | return; | ||
| 491 | } | ||
| 492 | |||
| 493 | if (r->mon_capable && domain_setup_mon_state(r, d)) { | ||
| 417 | kfree(d); | 494 | kfree(d); |
| 418 | return; | 495 | return; |
| 419 | } | 496 | } |
| 420 | 497 | ||
| 421 | cpumask_set_cpu(cpu, &d->cpu_mask); | ||
| 422 | list_add_tail(&d->list, add_pos); | 498 | list_add_tail(&d->list, add_pos); |
| 499 | |||
| 500 | /* | ||
| 501 | * If resctrl is mounted, add | ||
| 502 | * per domain monitor data directories. | ||
| 503 | */ | ||
| 504 | if (static_branch_unlikely(&rdt_mon_enable_key)) | ||
| 505 | mkdir_mondata_subdir_allrdtgrp(r, d); | ||
| 423 | } | 506 | } |
| 424 | 507 | ||
| 425 | static void domain_remove_cpu(int cpu, struct rdt_resource *r) | 508 | static void domain_remove_cpu(int cpu, struct rdt_resource *r) |
| @@ -435,19 +518,58 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) | |||
| 435 | 518 | ||
| 436 | cpumask_clear_cpu(cpu, &d->cpu_mask); | 519 | cpumask_clear_cpu(cpu, &d->cpu_mask); |
| 437 | if (cpumask_empty(&d->cpu_mask)) { | 520 | if (cpumask_empty(&d->cpu_mask)) { |
| 521 | /* | ||
| 522 | * If resctrl is mounted, remove all the | ||
| 523 | * per domain monitor data directories. | ||
| 524 | */ | ||
| 525 | if (static_branch_unlikely(&rdt_mon_enable_key)) | ||
| 526 | rmdir_mondata_subdir_allrdtgrp(r, d->id); | ||
| 438 | kfree(d->ctrl_val); | 527 | kfree(d->ctrl_val); |
| 528 | kfree(d->rmid_busy_llc); | ||
| 529 | kfree(d->mbm_total); | ||
| 530 | kfree(d->mbm_local); | ||
| 439 | list_del(&d->list); | 531 | list_del(&d->list); |
| 532 | if (is_mbm_enabled()) | ||
| 533 | cancel_delayed_work(&d->mbm_over); | ||
| 534 | if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { | ||
| 535 | /* | ||
| 536 | * When a package is going down, forcefully | ||
| 537 | * decrement rmid->ebusy. There is no way to know | ||
| 538 | * that the L3 was flushed and hence may lead to | ||
| 539 | * incorrect counts in rare scenarios, but leaving | ||
| 540 | * the RMID as busy creates RMID leaks if the | ||
| 541 | * package never comes back. | ||
| 542 | */ | ||
| 543 | __check_limbo(d, true); | ||
| 544 | cancel_delayed_work(&d->cqm_limbo); | ||
| 545 | } | ||
| 546 | |||
| 440 | kfree(d); | 547 | kfree(d); |
| 548 | return; | ||
| 549 | } | ||
| 550 | |||
| 551 | if (r == &rdt_resources_all[RDT_RESOURCE_L3]) { | ||
| 552 | if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { | ||
| 553 | cancel_delayed_work(&d->mbm_over); | ||
| 554 | mbm_setup_overflow_handler(d, 0); | ||
| 555 | } | ||
| 556 | if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && | ||
| 557 | has_busy_rmid(r, d)) { | ||
| 558 | cancel_delayed_work(&d->cqm_limbo); | ||
| 559 | cqm_setup_limbo_handler(d, 0); | ||
| 560 | } | ||
| 441 | } | 561 | } |
| 442 | } | 562 | } |
| 443 | 563 | ||
| 444 | static void clear_closid(int cpu) | 564 | static void clear_closid_rmid(int cpu) |
| 445 | { | 565 | { |
| 446 | struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); | 566 | struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); |
| 447 | 567 | ||
| 448 | per_cpu(cpu_closid, cpu) = 0; | 568 | state->default_closid = 0; |
| 449 | state->closid = 0; | 569 | state->default_rmid = 0; |
| 450 | wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); | 570 | state->cur_closid = 0; |
| 571 | state->cur_rmid = 0; | ||
| 572 | wrmsr(IA32_PQR_ASSOC, 0, 0); | ||
| 451 | } | 573 | } |
| 452 | 574 | ||
| 453 | static int intel_rdt_online_cpu(unsigned int cpu) | 575 | static int intel_rdt_online_cpu(unsigned int cpu) |
| @@ -459,12 +581,23 @@ static int intel_rdt_online_cpu(unsigned int cpu) | |||
| 459 | domain_add_cpu(cpu, r); | 581 | domain_add_cpu(cpu, r); |
| 460 | /* The cpu is set in default rdtgroup after online. */ | 582 | /* The cpu is set in default rdtgroup after online. */ |
| 461 | cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); | 583 | cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); |
| 462 | clear_closid(cpu); | 584 | clear_closid_rmid(cpu); |
| 463 | mutex_unlock(&rdtgroup_mutex); | 585 | mutex_unlock(&rdtgroup_mutex); |
| 464 | 586 | ||
| 465 | return 0; | 587 | return 0; |
| 466 | } | 588 | } |
| 467 | 589 | ||
| 590 | static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) | ||
| 591 | { | ||
| 592 | struct rdtgroup *cr; | ||
| 593 | |||
| 594 | list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { | ||
| 595 | if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { | ||
| 596 | break; | ||
| 597 | } | ||
| 598 | } | ||
| 599 | } | ||
| 600 | |||
| 468 | static int intel_rdt_offline_cpu(unsigned int cpu) | 601 | static int intel_rdt_offline_cpu(unsigned int cpu) |
| 469 | { | 602 | { |
| 470 | struct rdtgroup *rdtgrp; | 603 | struct rdtgroup *rdtgrp; |
| @@ -474,10 +607,12 @@ static int intel_rdt_offline_cpu(unsigned int cpu) | |||
| 474 | for_each_capable_rdt_resource(r) | 607 | for_each_capable_rdt_resource(r) |
| 475 | domain_remove_cpu(cpu, r); | 608 | domain_remove_cpu(cpu, r); |
| 476 | list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { | 609 | list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { |
| 477 | if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) | 610 | if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { |
| 611 | clear_childcpus(rdtgrp, cpu); | ||
| 478 | break; | 612 | break; |
| 613 | } | ||
| 479 | } | 614 | } |
| 480 | clear_closid(cpu); | 615 | clear_closid_rmid(cpu); |
| 481 | mutex_unlock(&rdtgroup_mutex); | 616 | mutex_unlock(&rdtgroup_mutex); |
| 482 | 617 | ||
| 483 | return 0; | 618 | return 0; |
| @@ -492,7 +627,7 @@ static __init void rdt_init_padding(void) | |||
| 492 | struct rdt_resource *r; | 627 | struct rdt_resource *r; |
| 493 | int cl; | 628 | int cl; |
| 494 | 629 | ||
| 495 | for_each_capable_rdt_resource(r) { | 630 | for_each_alloc_capable_rdt_resource(r) { |
| 496 | cl = strlen(r->name); | 631 | cl = strlen(r->name); |
| 497 | if (cl > max_name_width) | 632 | if (cl > max_name_width) |
| 498 | max_name_width = cl; | 633 | max_name_width = cl; |
| @@ -502,38 +637,153 @@ static __init void rdt_init_padding(void) | |||
| 502 | } | 637 | } |
| 503 | } | 638 | } |
| 504 | 639 | ||
| 505 | static __init bool get_rdt_resources(void) | 640 | enum { |
| 641 | RDT_FLAG_CMT, | ||
| 642 | RDT_FLAG_MBM_TOTAL, | ||
| 643 | RDT_FLAG_MBM_LOCAL, | ||
| 644 | RDT_FLAG_L3_CAT, | ||
| 645 | RDT_FLAG_L3_CDP, | ||
| 646 | RDT_FLAG_L2_CAT, | ||
| 647 | RDT_FLAG_MBA, | ||
| 648 | }; | ||
| 649 | |||
| 650 | #define RDT_OPT(idx, n, f) \ | ||
| 651 | [idx] = { \ | ||
| 652 | .name = n, \ | ||
| 653 | .flag = f \ | ||
| 654 | } | ||
| 655 | |||
| 656 | struct rdt_options { | ||
| 657 | char *name; | ||
| 658 | int flag; | ||
| 659 | bool force_off, force_on; | ||
| 660 | }; | ||
| 661 | |||
| 662 | static struct rdt_options rdt_options[] __initdata = { | ||
| 663 | RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), | ||
| 664 | RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), | ||
| 665 | RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), | ||
| 666 | RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3), | ||
| 667 | RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3), | ||
| 668 | RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), | ||
| 669 | RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), | ||
| 670 | }; | ||
| 671 | #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) | ||
| 672 | |||
| 673 | static int __init set_rdt_options(char *str) | ||
| 674 | { | ||
| 675 | struct rdt_options *o; | ||
| 676 | bool force_off; | ||
| 677 | char *tok; | ||
| 678 | |||
| 679 | if (*str == '=') | ||
| 680 | str++; | ||
| 681 | while ((tok = strsep(&str, ",")) != NULL) { | ||
| 682 | force_off = *tok == '!'; | ||
| 683 | if (force_off) | ||
| 684 | tok++; | ||
| 685 | for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { | ||
| 686 | if (strcmp(tok, o->name) == 0) { | ||
| 687 | if (force_off) | ||
| 688 | o->force_off = true; | ||
| 689 | else | ||
| 690 | o->force_on = true; | ||
| 691 | break; | ||
| 692 | } | ||
| 693 | } | ||
| 694 | } | ||
| 695 | return 1; | ||
| 696 | } | ||
| 697 | __setup("rdt", set_rdt_options); | ||
| 698 | |||
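set_rdt_options() splits the rdt= string on commas and treats a leading '!' as force-off for that feature. Below is a self-contained userspace sketch of the same parsing loop (the option table is trimmed to two entries and the helpers are simplified; it is not the kernel code):

    #define _DEFAULT_SOURCE                 /* for strsep() on glibc */
    #include <stdio.h>
    #include <string.h>
    #include <stdbool.h>

    struct rdt_options { const char *name; bool force_off, force_on; };

    /* Trimmed option table, for illustration only. */
    static struct rdt_options opts[] = { { "cmt" }, { "mba" } };

    int main(void)
    {
        char cmdline[] = "cmt,!mba";        /* the value part of "rdt=cmt,!mba" */
        char *str = cmdline, *tok;
        size_t i;

        while ((tok = strsep(&str, ",")) != NULL) {
            bool force_off = (*tok == '!');

            if (force_off)
                tok++;
            for (i = 0; i < sizeof(opts) / sizeof(opts[0]); i++) {
                if (strcmp(tok, opts[i].name) == 0) {
                    if (force_off)
                        opts[i].force_off = true;
                    else
                        opts[i].force_on = true;
                    break;
                }
            }
        }

        /* Prints: cmt on=1 off=0, then mba on=0 off=1 */
        for (i = 0; i < sizeof(opts) / sizeof(opts[0]); i++)
            printf("%s on=%d off=%d\n", opts[i].name,
                   opts[i].force_on, opts[i].force_off);
        return 0;
    }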
| 699 | static bool __init rdt_cpu_has(int flag) | ||
| 700 | { | ||
| 701 | bool ret = boot_cpu_has(flag); | ||
| 702 | struct rdt_options *o; | ||
| 703 | |||
| 704 | if (!ret) | ||
| 705 | return ret; | ||
| 706 | |||
| 707 | for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { | ||
| 708 | if (flag == o->flag) { | ||
| 709 | if (o->force_off) | ||
| 710 | ret = false; | ||
| 711 | if (o->force_on) | ||
| 712 | ret = true; | ||
| 713 | break; | ||
| 714 | } | ||
| 715 | } | ||
| 716 | return ret; | ||
| 717 | } | ||
| 718 | |||
| 719 | static __init bool get_rdt_alloc_resources(void) | ||
| 506 | { | 720 | { |
| 507 | bool ret = false; | 721 | bool ret = false; |
| 508 | 722 | ||
| 509 | if (cache_alloc_hsw_probe()) | 723 | if (rdt_alloc_capable) |
| 510 | return true; | 724 | return true; |
| 511 | 725 | ||
| 512 | if (!boot_cpu_has(X86_FEATURE_RDT_A)) | 726 | if (!boot_cpu_has(X86_FEATURE_RDT_A)) |
| 513 | return false; | 727 | return false; |
| 514 | 728 | ||
| 515 | if (boot_cpu_has(X86_FEATURE_CAT_L3)) { | 729 | if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { |
| 516 | rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); | 730 | rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); |
| 517 | if (boot_cpu_has(X86_FEATURE_CDP_L3)) { | 731 | if (rdt_cpu_has(X86_FEATURE_CDP_L3)) { |
| 518 | rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); | 732 | rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); |
| 519 | rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); | 733 | rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); |
| 520 | } | 734 | } |
| 521 | ret = true; | 735 | ret = true; |
| 522 | } | 736 | } |
| 523 | if (boot_cpu_has(X86_FEATURE_CAT_L2)) { | 737 | if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { |
| 524 | /* CPUID 0x10.2 fields are same format as 0x10.1 */ | 738 | /* CPUID 0x10.2 fields are same format as 0x10.1 */ |

| 525 | rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); | 739 | rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); |
| 526 | ret = true; | 740 | ret = true; |
| 527 | } | 741 | } |
| 528 | 742 | ||
| 529 | if (boot_cpu_has(X86_FEATURE_MBA)) { | 743 | if (rdt_cpu_has(X86_FEATURE_MBA)) { |
| 530 | if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) | 744 | if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) |
| 531 | ret = true; | 745 | ret = true; |
| 532 | } | 746 | } |
| 533 | |||
| 534 | return ret; | 747 | return ret; |
| 535 | } | 748 | } |
| 536 | 749 | ||
| 750 | static __init bool get_rdt_mon_resources(void) | ||
| 751 | { | ||
| 752 | if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) | ||
| 753 | rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); | ||
| 754 | if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) | ||
| 755 | rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); | ||
| 756 | if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) | ||
| 757 | rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); | ||
| 758 | |||
| 759 | if (!rdt_mon_features) | ||
| 760 | return false; | ||
| 761 | |||
| 762 | return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]); | ||
| 763 | } | ||
| 764 | |||
| 765 | static __init void rdt_quirks(void) | ||
| 766 | { | ||
| 767 | switch (boot_cpu_data.x86_model) { | ||
| 768 | case INTEL_FAM6_HASWELL_X: | ||
| 769 | if (!rdt_options[RDT_FLAG_L3_CAT].force_off) | ||
| 770 | cache_alloc_hsw_probe(); | ||
| 771 | break; | ||
| 772 | case INTEL_FAM6_SKYLAKE_X: | ||
| 773 | if (boot_cpu_data.x86_mask <= 4) | ||
| 774 | set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); | ||
| 775 | } | ||
| 776 | } | ||
| 777 | |||
| 778 | static __init bool get_rdt_resources(void) | ||
| 779 | { | ||
| 780 | rdt_quirks(); | ||
| 781 | rdt_alloc_capable = get_rdt_alloc_resources(); | ||
| 782 | rdt_mon_capable = get_rdt_mon_resources(); | ||
| 783 | |||
| 784 | return (rdt_mon_capable || rdt_alloc_capable); | ||
| 785 | } | ||
| 786 | |||
| 537 | static int __init intel_rdt_late_init(void) | 787 | static int __init intel_rdt_late_init(void) |
| 538 | { | 788 | { |
| 539 | struct rdt_resource *r; | 789 | struct rdt_resource *r; |
| @@ -556,9 +806,12 @@ static int __init intel_rdt_late_init(void) | |||
| 556 | return ret; | 806 | return ret; |
| 557 | } | 807 | } |
| 558 | 808 | ||
| 559 | for_each_capable_rdt_resource(r) | 809 | for_each_alloc_capable_rdt_resource(r) |
| 560 | pr_info("Intel RDT %s allocation detected\n", r->name); | 810 | pr_info("Intel RDT %s allocation detected\n", r->name); |
| 561 | 811 | ||
| 812 | for_each_mon_capable_rdt_resource(r) | ||
| 813 | pr_info("Intel RDT %s monitoring detected\n", r->name); | ||
| 814 | |||
| 562 | return 0; | 815 | return 0; |
| 563 | } | 816 | } |
| 564 | 817 | ||
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h new file mode 100644 index 000000000000..ebaddaeef023 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt.h | |||
| @@ -0,0 +1,440 @@ | |||
| 1 | #ifndef _ASM_X86_INTEL_RDT_H | ||
| 2 | #define _ASM_X86_INTEL_RDT_H | ||
| 3 | |||
| 4 | #include <linux/sched.h> | ||
| 5 | #include <linux/kernfs.h> | ||
| 6 | #include <linux/jump_label.h> | ||
| 7 | |||
| 8 | #define IA32_L3_QOS_CFG 0xc81 | ||
| 9 | #define IA32_L3_CBM_BASE 0xc90 | ||
| 10 | #define IA32_L2_CBM_BASE 0xd10 | ||
| 11 | #define IA32_MBA_THRTL_BASE 0xd50 | ||
| 12 | |||
| 13 | #define L3_QOS_CDP_ENABLE 0x01ULL | ||
| 14 | |||
| 15 | /* | ||
| 16 | * Event IDs are used to program IA32_QM_EVTSEL before reading event | ||
| 17 | * counter from IA32_QM_CTR | ||
| 18 | */ | ||
| 19 | #define QOS_L3_OCCUP_EVENT_ID 0x01 | ||
| 20 | #define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 | ||
| 21 | #define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 | ||
| 22 | |||
| 23 | #define CQM_LIMBOCHECK_INTERVAL 1000 | ||
| 24 | |||
| 25 | #define MBM_CNTR_WIDTH 24 | ||
| 26 | #define MBM_OVERFLOW_INTERVAL 1000 | ||
| 27 | |||
| 28 | #define RMID_VAL_ERROR BIT_ULL(63) | ||
| 29 | #define RMID_VAL_UNAVAIL BIT_ULL(62) | ||
| 30 | |||
| 31 | DECLARE_STATIC_KEY_FALSE(rdt_enable_key); | ||
| 32 | |||
| 33 | /** | ||
| 34 | * struct mon_evt - Entry in the event list of a resource | ||
| 35 | * @evtid: event id | ||
| 36 | * @name: name of the event | ||
| 37 | */ | ||
| 38 | struct mon_evt { | ||
| 39 | u32 evtid; | ||
| 40 | char *name; | ||
| 41 | struct list_head list; | ||
| 42 | }; | ||
| 43 | |||
| 44 | /** | ||
| 45 | * struct mon_data_bits - Monitoring details for each event file | ||
| 46 | * @rid: Resource id associated with the event file. | ||
| 47 | * @evtid: Event id associated with the event file | ||
| 48 | * @domid: The domain to which the event file belongs | ||
| 49 | */ | ||
| 50 | union mon_data_bits { | ||
| 51 | void *priv; | ||
| 52 | struct { | ||
| 53 | unsigned int rid : 10; | ||
| 54 | unsigned int evtid : 8; | ||
| 55 | unsigned int domid : 14; | ||
| 56 | } u; | ||
| 57 | }; | ||
| 58 | |||
| 59 | struct rmid_read { | ||
| 60 | struct rdtgroup *rgrp; | ||
| 61 | struct rdt_domain *d; | ||
| 62 | int evtid; | ||
| 63 | bool first; | ||
| 64 | u64 val; | ||
| 65 | }; | ||
| 66 | |||
| 67 | extern unsigned int intel_cqm_threshold; | ||
| 68 | extern bool rdt_alloc_capable; | ||
| 69 | extern bool rdt_mon_capable; | ||
| 70 | extern unsigned int rdt_mon_features; | ||
| 71 | |||
| 72 | enum rdt_group_type { | ||
| 73 | RDTCTRL_GROUP = 0, | ||
| 74 | RDTMON_GROUP, | ||
| 75 | RDT_NUM_GROUP, | ||
| 76 | }; | ||
| 77 | |||
| 78 | /** | ||
| 79 | * struct mongroup - store mon group's data in resctrl fs. | ||
| 80 | * @mon_data_kn: kernfs node for the mon_data directory | ||
| 81 | * @parent: parent rdtgrp | ||
| 82 | * @crdtgrp_list: child rdtgroup node list | ||
| 83 | * @rmid: rmid for this rdtgroup | ||
| 84 | */ | ||
| 85 | struct mongroup { | ||
| 86 | struct kernfs_node *mon_data_kn; | ||
| 87 | struct rdtgroup *parent; | ||
| 88 | struct list_head crdtgrp_list; | ||
| 89 | u32 rmid; | ||
| 90 | }; | ||
| 91 | |||
| 92 | /** | ||
| 93 | * struct rdtgroup - store rdtgroup's data in resctrl file system. | ||
| 94 | * @kn: kernfs node | ||
| 95 | * @rdtgroup_list: linked list for all rdtgroups | ||
| 96 | * @closid: closid for this rdtgroup | ||
| 97 | * @cpu_mask: CPUs assigned to this rdtgroup | ||
| 98 | * @flags: status bits | ||
| 99 | * @waitcount: how many cpus expect to find this | ||
| 100 | * group when they acquire rdtgroup_mutex | ||
| 101 | * @type: indicates type of this rdtgroup - either | ||
| 102 | * monitor only or ctrl_mon group | ||
| 103 | * @mon: mongroup related data | ||
| 104 | */ | ||
| 105 | struct rdtgroup { | ||
| 106 | struct kernfs_node *kn; | ||
| 107 | struct list_head rdtgroup_list; | ||
| 108 | u32 closid; | ||
| 109 | struct cpumask cpu_mask; | ||
| 110 | int flags; | ||
| 111 | atomic_t waitcount; | ||
| 112 | enum rdt_group_type type; | ||
| 113 | struct mongroup mon; | ||
| 114 | }; | ||
| 115 | |||
| 116 | /* rdtgroup.flags */ | ||
| 117 | #define RDT_DELETED 1 | ||
| 118 | |||
| 119 | /* rftype.flags */ | ||
| 120 | #define RFTYPE_FLAGS_CPUS_LIST 1 | ||
| 121 | |||
| 122 | /* | ||
| 123 | * Define the file type flags for base and info directories. | ||
| 124 | */ | ||
| 125 | #define RFTYPE_INFO BIT(0) | ||
| 126 | #define RFTYPE_BASE BIT(1) | ||
| 127 | #define RF_CTRLSHIFT 4 | ||
| 128 | #define RF_MONSHIFT 5 | ||
| 129 | #define RFTYPE_CTRL BIT(RF_CTRLSHIFT) | ||
| 130 | #define RFTYPE_MON BIT(RF_MONSHIFT) | ||
| 131 | #define RFTYPE_RES_CACHE BIT(8) | ||
| 132 | #define RFTYPE_RES_MB BIT(9) | ||
| 133 | #define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) | ||
| 134 | #define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) | ||
| 135 | #define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) | ||
| 136 | |||
| 137 | /* List of all resource groups */ | ||
| 138 | extern struct list_head rdt_all_groups; | ||
| 139 | |||
| 140 | extern int max_name_width, max_data_width; | ||
| 141 | |||
| 142 | int __init rdtgroup_init(void); | ||
| 143 | |||
| 144 | /** | ||
| 145 | * struct rftype - describe each file in the resctrl file system | ||
| 146 | * @name: File name | ||
| 147 | * @mode: Access mode | ||
| 148 | * @kf_ops: File operations | ||
| 149 | * @flags: File specific RFTYPE_FLAGS_* flags | ||
| 150 | * @fflags: File specific RF_* or RFTYPE_* flags | ||
| 151 | * @seq_show: Show content of the file | ||
| 152 | * @write: Write to the file | ||
| 153 | */ | ||
| 154 | struct rftype { | ||
| 155 | char *name; | ||
| 156 | umode_t mode; | ||
| 157 | struct kernfs_ops *kf_ops; | ||
| 158 | unsigned long flags; | ||
| 159 | unsigned long fflags; | ||
| 160 | |||
| 161 | int (*seq_show)(struct kernfs_open_file *of, | ||
| 162 | struct seq_file *sf, void *v); | ||
| 163 | /* | ||
| 164 | * write() is the generic write callback which maps directly to | ||
| 165 | * kernfs write operation and overrides all other operations. | ||
| 166 | * Maximum write size is determined by ->max_write_len. | ||
| 167 | */ | ||
| 168 | ssize_t (*write)(struct kernfs_open_file *of, | ||
| 169 | char *buf, size_t nbytes, loff_t off); | ||
| 170 | }; | ||
| 171 | |||
| 172 | /** | ||
| 173 | * struct mbm_state - status for each MBM counter in each domain | ||
| 174 | * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) | ||
| 175 | * @prev_msr: Value of IA32_QM_CTR for this RMID last time we read it | ||
| 176 | */ | ||
| 177 | struct mbm_state { | ||
| 178 | u64 chunks; | ||
| 179 | u64 prev_msr; | ||
| 180 | }; | ||
| 181 | |||
| 182 | /** | ||
| 183 | * struct rdt_domain - group of cpus sharing an RDT resource | ||
| 184 | * @list: all instances of this resource | ||
| 185 | * @id: unique id for this instance | ||
| 186 | * @cpu_mask: which cpus share this resource | ||
| 187 | * @rmid_busy_llc: | ||
| 188 | * bitmap of which limbo RMIDs are above threshold | ||
| 189 | * @mbm_total: saved state for MBM total bandwidth | ||
| 190 | * @mbm_local: saved state for MBM local bandwidth | ||
| 191 | * @mbm_over: worker to periodically read MBM h/w counters | ||
| 192 | * @cqm_limbo: worker to periodically read CQM h/w counters | ||
| 193 | * @mbm_work_cpu: | ||
| 194 | * worker cpu for MBM h/w counters | ||
| 195 | * @cqm_work_cpu: | ||
| 196 | * worker cpu for CQM h/w counters | ||
| 197 | * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) | ||
| 198 | * @new_ctrl: new ctrl value to be loaded | ||
| 199 | * @have_new_ctrl: did user provide new_ctrl for this domain | ||
| 200 | */ | ||
| 201 | struct rdt_domain { | ||
| 202 | struct list_head list; | ||
| 203 | int id; | ||
| 204 | struct cpumask cpu_mask; | ||
| 205 | unsigned long *rmid_busy_llc; | ||
| 206 | struct mbm_state *mbm_total; | ||
| 207 | struct mbm_state *mbm_local; | ||
| 208 | struct delayed_work mbm_over; | ||
| 209 | struct delayed_work cqm_limbo; | ||
| 210 | int mbm_work_cpu; | ||
| 211 | int cqm_work_cpu; | ||
| 212 | u32 *ctrl_val; | ||
| 213 | u32 new_ctrl; | ||
| 214 | bool have_new_ctrl; | ||
| 215 | }; | ||
| 216 | |||
| 217 | /** | ||
| 218 | * struct msr_param - set a range of MSRs from a domain | ||
| 219 | * @res: The resource to use | ||
| 220 | * @low: Beginning index from base MSR | ||
| 221 | * @high: End index | ||
| 222 | */ | ||
| 223 | struct msr_param { | ||
| 224 | struct rdt_resource *res; | ||
| 225 | int low; | ||
| 226 | int high; | ||
| 227 | }; | ||
| 228 | |||
| 229 | /** | ||
| 230 | * struct rdt_cache - Cache allocation related data | ||
| 231 | * @cbm_len: Length of the cache bit mask | ||
| 232 | * @min_cbm_bits: Minimum number of consecutive bits to be set | ||
| 233 | * @cbm_idx_mult: Multiplier of CBM index | ||
| 234 | * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: | ||
| 235 | * closid * cbm_idx_multi + cbm_idx_offset | ||
| 236 | * in a cache bit mask | ||
| 237 | * @shareable_bits: Bitmask of shareable resource with other | ||
| 238 | * executing entities | ||
| 239 | */ | ||
| 240 | struct rdt_cache { | ||
| 241 | unsigned int cbm_len; | ||
| 242 | unsigned int min_cbm_bits; | ||
| 243 | unsigned int cbm_idx_mult; | ||
| 244 | unsigned int cbm_idx_offset; | ||
| 245 | unsigned int shareable_bits; | ||
| 246 | }; | ||
| 247 | |||
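The cbm_idx computation documented above (closid * cbm_idx_mult + cbm_idx_offset) is what lets the L3DATA and L3CODE resources interleave within one row of CBM MSRs under CDP. A worked example, assuming the conventional CDP layout of multiplier 2 with offsets 0 and 1 (those values are not visible in this hunk, so treat them as an assumption):

    /* Worked example of cbm_idx(): CLOSID -> index from the resource's MSR base. */
    #include <stdio.h>

    static unsigned int cbm_idx(unsigned int mult, unsigned int offset, unsigned int closid)
    {
        return closid * mult + offset;
    }

    int main(void)
    {
        unsigned int closid = 3;

        /* Plain L3 (no CDP): mult = 1, offset = 0 -> MSR index 3 */
        printf("L3     : %u\n", cbm_idx(1, 0, closid));
        /* Assumed CDP layout: L3DATA mult = 2, offset = 0 -> 6; L3CODE offset = 1 -> 7 */
        printf("L3DATA : %u\n", cbm_idx(2, 0, closid));
        printf("L3CODE : %u\n", cbm_idx(2, 1, closid));
        return 0;
    }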
| 248 | /** | ||
| 249 | * struct rdt_membw - Memory bandwidth allocation related data | ||
| 250 | * @max_delay: Max throttle delay. Delay is the hardware | ||
| 251 | * representation for memory bandwidth. | ||
| 252 | * @min_bw: Minimum memory bandwidth percentage user can request | ||
| 253 | * @bw_gran: Granularity at which the memory bandwidth is allocated | ||
| 254 | * @delay_linear: True if memory B/W delay is in linear scale | ||
| 255 | * @mb_map: Mapping of memory B/W percentage to memory B/W delay | ||
| 256 | */ | ||
| 257 | struct rdt_membw { | ||
| 258 | u32 max_delay; | ||
| 259 | u32 min_bw; | ||
| 260 | u32 bw_gran; | ||
| 261 | u32 delay_linear; | ||
| 262 | u32 *mb_map; | ||
| 263 | }; | ||
| 264 | |||
| 265 | static inline bool is_llc_occupancy_enabled(void) | ||
| 266 | { | ||
| 267 | return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); | ||
| 268 | } | ||
| 269 | |||
| 270 | static inline bool is_mbm_total_enabled(void) | ||
| 271 | { | ||
| 272 | return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); | ||
| 273 | } | ||
| 274 | |||
| 275 | static inline bool is_mbm_local_enabled(void) | ||
| 276 | { | ||
| 277 | return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); | ||
| 278 | } | ||
| 279 | |||
| 280 | static inline bool is_mbm_enabled(void) | ||
| 281 | { | ||
| 282 | return (is_mbm_total_enabled() || is_mbm_local_enabled()); | ||
| 283 | } | ||
| 284 | |||
| 285 | static inline bool is_mbm_event(int e) | ||
| 286 | { | ||
| 287 | return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && | ||
| 288 | e <= QOS_L3_MBM_LOCAL_EVENT_ID); | ||
| 289 | } | ||
| 290 | |||
| 291 | /** | ||
| 292 | * struct rdt_resource - attributes of an RDT resource | ||
| 293 | * @rid: The index of the resource | ||
| 294 | * @alloc_enabled: Is allocation enabled on this machine | ||
| 295 | * @mon_enabled: Is monitoring enabled for this feature | ||
| 296 | * @alloc_capable: Is allocation available on this machine | ||
| 297 | * @mon_capable: Is monitor feature available on this machine | ||
| 298 | * @name: Name to use in "schemata" file | ||
| 299 | * @num_closid: Number of CLOSIDs available | ||
| 300 | * @cache_level: Which cache level defines scope of this resource | ||
| 301 | * @default_ctrl: Specifies default cache cbm or memory B/W percent. | ||
| 302 | * @msr_base: Base MSR address for CBMs | ||
| 303 | * @msr_update: Function pointer to update QOS MSRs | ||
| 304 | * @data_width: Character width of data when displaying | ||
| 305 | * @domains: All domains for this resource | ||
| 306 | * @cache: Cache allocation related data | ||
| 307 | * @format_str: Per resource format string to show domain value | ||
| 308 | * @parse_ctrlval: Per resource function pointer to parse control values | ||
| 309 | * @evt_list: List of monitoring events | ||
| 310 | * @num_rmid: Number of RMIDs available | ||
| 311 | * @mon_scale: cqm counter * mon_scale = occupancy in bytes | ||
| 312 | * @fflags: flags to choose base and info files | ||
| 313 | */ | ||
| 314 | struct rdt_resource { | ||
| 315 | int rid; | ||
| 316 | bool alloc_enabled; | ||
| 317 | bool mon_enabled; | ||
| 318 | bool alloc_capable; | ||
| 319 | bool mon_capable; | ||
| 320 | char *name; | ||
| 321 | int num_closid; | ||
| 322 | int cache_level; | ||
| 323 | u32 default_ctrl; | ||
| 324 | unsigned int msr_base; | ||
| 325 | void (*msr_update) (struct rdt_domain *d, struct msr_param *m, | ||
| 326 | struct rdt_resource *r); | ||
| 327 | int data_width; | ||
| 328 | struct list_head domains; | ||
| 329 | struct rdt_cache cache; | ||
| 330 | struct rdt_membw membw; | ||
| 331 | const char *format_str; | ||
| 332 | int (*parse_ctrlval) (char *buf, struct rdt_resource *r, | ||
| 333 | struct rdt_domain *d); | ||
| 334 | struct list_head evt_list; | ||
| 335 | int num_rmid; | ||
| 336 | unsigned int mon_scale; | ||
| 337 | unsigned long fflags; | ||
| 338 | }; | ||
| 339 | |||
| 340 | int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); | ||
| 341 | int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); | ||
| 342 | |||
| 343 | extern struct mutex rdtgroup_mutex; | ||
| 344 | |||
| 345 | extern struct rdt_resource rdt_resources_all[]; | ||
| 346 | extern struct rdtgroup rdtgroup_default; | ||
| 347 | DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); | ||
| 348 | |||
| 349 | int __init rdtgroup_init(void); | ||
| 350 | |||
| 351 | enum { | ||
| 352 | RDT_RESOURCE_L3, | ||
| 353 | RDT_RESOURCE_L3DATA, | ||
| 354 | RDT_RESOURCE_L3CODE, | ||
| 355 | RDT_RESOURCE_L2, | ||
| 356 | RDT_RESOURCE_MBA, | ||
| 357 | |||
| 358 | /* Must be the last */ | ||
| 359 | RDT_NUM_RESOURCES, | ||
| 360 | }; | ||
| 361 | |||
| 362 | #define for_each_capable_rdt_resource(r) \ | ||
| 363 | for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ | ||
| 364 | r++) \ | ||
| 365 | if (r->alloc_capable || r->mon_capable) | ||
| 366 | |||
| 367 | #define for_each_alloc_capable_rdt_resource(r) \ | ||
| 368 | for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ | ||
| 369 | r++) \ | ||
| 370 | if (r->alloc_capable) | ||
| 371 | |||
| 372 | #define for_each_mon_capable_rdt_resource(r) \ | ||
| 373 | for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ | ||
| 374 | r++) \ | ||
| 375 | if (r->mon_capable) | ||
| 376 | |||
| 377 | #define for_each_alloc_enabled_rdt_resource(r) \ | ||
| 378 | for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ | ||
| 379 | r++) \ | ||
| 380 | if (r->alloc_enabled) | ||
| 381 | |||
| 382 | #define for_each_mon_enabled_rdt_resource(r) \ | ||
| 383 | for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ | ||
| 384 | r++) \ | ||
| 385 | if (r->mon_enabled) | ||
| 386 | |||
| 387 | /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ | ||
| 388 | union cpuid_0x10_1_eax { | ||
| 389 | struct { | ||
| 390 | unsigned int cbm_len:5; | ||
| 391 | } split; | ||
| 392 | unsigned int full; | ||
| 393 | }; | ||
| 394 | |||
| 395 | /* CPUID.(EAX=10H, ECX=ResID=3).EAX */ | ||
| 396 | union cpuid_0x10_3_eax { | ||
| 397 | struct { | ||
| 398 | unsigned int max_delay:12; | ||
| 399 | } split; | ||
| 400 | unsigned int full; | ||
| 401 | }; | ||
| 402 | |||
| 403 | /* CPUID.(EAX=10H, ECX=ResID).EDX */ | ||
| 404 | union cpuid_0x10_x_edx { | ||
| 405 | struct { | ||
| 406 | unsigned int cos_max:16; | ||
| 407 | } split; | ||
| 408 | unsigned int full; | ||
| 409 | }; | ||
| 410 | |||
| 411 | void rdt_ctrl_update(void *arg); | ||
| 412 | struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); | ||
| 413 | void rdtgroup_kn_unlock(struct kernfs_node *kn); | ||
| 414 | struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, | ||
| 415 | struct list_head **pos); | ||
| 416 | ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, | ||
| 417 | char *buf, size_t nbytes, loff_t off); | ||
| 418 | int rdtgroup_schemata_show(struct kernfs_open_file *of, | ||
| 419 | struct seq_file *s, void *v); | ||
| 420 | struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); | ||
| 421 | int alloc_rmid(void); | ||
| 422 | void free_rmid(u32 rmid); | ||
| 423 | int rdt_get_mon_l3_config(struct rdt_resource *r); | ||
| 424 | void mon_event_count(void *info); | ||
| 425 | int rdtgroup_mondata_show(struct seq_file *m, void *arg); | ||
| 426 | void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, | ||
| 427 | unsigned int dom_id); | ||
| 428 | void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, | ||
| 429 | struct rdt_domain *d); | ||
| 430 | void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, | ||
| 431 | struct rdtgroup *rdtgrp, int evtid, int first); | ||
| 432 | void mbm_setup_overflow_handler(struct rdt_domain *dom, | ||
| 433 | unsigned long delay_ms); | ||
| 434 | void mbm_handle_overflow(struct work_struct *work); | ||
| 435 | void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); | ||
| 436 | void cqm_handle_limbo(struct work_struct *work); | ||
| 437 | bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); | ||
| 438 | void __check_limbo(struct rdt_domain *d, bool force_free); | ||
| 439 | |||
| 440 | #endif /* _ASM_X86_INTEL_RDT_H */ | ||
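For reference, the CPUID unions at the end of the header are how rdt_get_cache_alloc_cfg() decodes leaf 0x10; both fields are "minus one" encoded. A standalone sketch with made-up register values standing in for real CPUID output:

    /* Sketch of the CPUID.(EAX=10H, ECX=1) decode using the unions above. */
    #include <stdio.h>

    union cpuid_0x10_1_eax {
        struct { unsigned int cbm_len:5; } split;
        unsigned int full;
    };

    union cpuid_0x10_x_edx {
        struct { unsigned int cos_max:16; } split;
        unsigned int full;
    };

    int main(void)
    {
        union cpuid_0x10_1_eax eax = { .full = 0x0000000a };  /* assumed: cbm_len field = 10 */
        union cpuid_0x10_x_edx edx = { .full = 0x0000000f };  /* assumed: cos_max field = 15 */

        /* Mirrors rdt_get_cache_alloc_cfg(): both fields are "value minus one" encoded */
        printf("num_closid = %u\n", edx.split.cos_max + 1);   /* 16 CLOSIDs */
        printf("cbm_len    = %u\n", eax.split.cbm_len + 1);   /* 11-bit cache bit mask */
        return 0;
    }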
diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 406d7a6532f9..f6ea94f8954a 100644 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | |||
| @@ -26,7 +26,7 @@ | |||
| 26 | #include <linux/kernfs.h> | 26 | #include <linux/kernfs.h> |
| 27 | #include <linux/seq_file.h> | 27 | #include <linux/seq_file.h> |
| 28 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
| 29 | #include <asm/intel_rdt.h> | 29 | #include "intel_rdt.h" |
| 30 | 30 | ||
| 31 | /* | 31 | /* |
| 32 | * Check whether MBA bandwidth percentage value is correct. The value is | 32 | * Check whether MBA bandwidth percentage value is correct. The value is |
| @@ -192,7 +192,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid) | |||
| 192 | { | 192 | { |
| 193 | struct rdt_resource *r; | 193 | struct rdt_resource *r; |
| 194 | 194 | ||
| 195 | for_each_enabled_rdt_resource(r) { | 195 | for_each_alloc_enabled_rdt_resource(r) { |
| 196 | if (!strcmp(resname, r->name) && closid < r->num_closid) | 196 | if (!strcmp(resname, r->name) && closid < r->num_closid) |
| 197 | return parse_line(tok, r); | 197 | return parse_line(tok, r); |
| 198 | } | 198 | } |
| @@ -221,7 +221,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, | |||
| 221 | 221 | ||
| 222 | closid = rdtgrp->closid; | 222 | closid = rdtgrp->closid; |
| 223 | 223 | ||
| 224 | for_each_enabled_rdt_resource(r) { | 224 | for_each_alloc_enabled_rdt_resource(r) { |
| 225 | list_for_each_entry(dom, &r->domains, list) | 225 | list_for_each_entry(dom, &r->domains, list) |
| 226 | dom->have_new_ctrl = false; | 226 | dom->have_new_ctrl = false; |
| 227 | } | 227 | } |
| @@ -237,7 +237,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, | |||
| 237 | goto out; | 237 | goto out; |
| 238 | } | 238 | } |
| 239 | 239 | ||
| 240 | for_each_enabled_rdt_resource(r) { | 240 | for_each_alloc_enabled_rdt_resource(r) { |
| 241 | ret = update_domains(r, closid); | 241 | ret = update_domains(r, closid); |
| 242 | if (ret) | 242 | if (ret) |
| 243 | goto out; | 243 | goto out; |
| @@ -269,12 +269,13 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, | |||
| 269 | { | 269 | { |
| 270 | struct rdtgroup *rdtgrp; | 270 | struct rdtgroup *rdtgrp; |
| 271 | struct rdt_resource *r; | 271 | struct rdt_resource *r; |
| 272 | int closid, ret = 0; | 272 | int ret = 0; |
| 273 | u32 closid; | ||
| 273 | 274 | ||
| 274 | rdtgrp = rdtgroup_kn_lock_live(of->kn); | 275 | rdtgrp = rdtgroup_kn_lock_live(of->kn); |
| 275 | if (rdtgrp) { | 276 | if (rdtgrp) { |
| 276 | closid = rdtgrp->closid; | 277 | closid = rdtgrp->closid; |
| 277 | for_each_enabled_rdt_resource(r) { | 278 | for_each_alloc_enabled_rdt_resource(r) { |
| 278 | if (closid < r->num_closid) | 279 | if (closid < r->num_closid) |
| 279 | show_doms(s, r, closid); | 280 | show_doms(s, r, closid); |
| 280 | } | 281 | } |
| @@ -284,3 +285,57 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, | |||
| 284 | rdtgroup_kn_unlock(of->kn); | 285 | rdtgroup_kn_unlock(of->kn); |
| 285 | return ret; | 286 | return ret; |
| 286 | } | 287 | } |
| 288 | |||
| 289 | void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, | ||
| 290 | struct rdtgroup *rdtgrp, int evtid, int first) | ||
| 291 | { | ||
| 292 | /* | ||
| 293 | * set up the parameters to send to the IPI to read the data. | ||
| 294 | */ | ||
| 295 | rr->rgrp = rdtgrp; | ||
| 296 | rr->evtid = evtid; | ||
| 297 | rr->d = d; | ||
| 298 | rr->val = 0; | ||
| 299 | rr->first = first; | ||
| 300 | |||
| 301 | smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); | ||
| 302 | } | ||
| 303 | |||
| 304 | int rdtgroup_mondata_show(struct seq_file *m, void *arg) | ||
| 305 | { | ||
| 306 | struct kernfs_open_file *of = m->private; | ||
| 307 | u32 resid, evtid, domid; | ||
| 308 | struct rdtgroup *rdtgrp; | ||
| 309 | struct rdt_resource *r; | ||
| 310 | union mon_data_bits md; | ||
| 311 | struct rdt_domain *d; | ||
| 312 | struct rmid_read rr; | ||
| 313 | int ret = 0; | ||
| 314 | |||
| 315 | rdtgrp = rdtgroup_kn_lock_live(of->kn); | ||
| 316 | |||
| 317 | md.priv = of->kn->priv; | ||
| 318 | resid = md.u.rid; | ||
| 319 | domid = md.u.domid; | ||
| 320 | evtid = md.u.evtid; | ||
| 321 | |||
| 322 | r = &rdt_resources_all[resid]; | ||
| 323 | d = rdt_find_domain(r, domid, NULL); | ||
| 324 | if (!d) { | ||
| 325 | ret = -ENOENT; | ||
| 326 | goto out; | ||
| 327 | } | ||
| 328 | |||
| 329 | mon_event_read(&rr, d, rdtgrp, evtid, false); | ||
| 330 | |||
| 331 | if (rr.val & RMID_VAL_ERROR) | ||
| 332 | seq_puts(m, "Error\n"); | ||
| 333 | else if (rr.val & RMID_VAL_UNAVAIL) | ||
| 334 | seq_puts(m, "Unavailable\n"); | ||
| 335 | else | ||
| 336 | seq_printf(m, "%llu\n", rr.val * r->mon_scale); | ||
| 337 | |||
| 338 | out: | ||
| 339 | rdtgroup_kn_unlock(of->kn); | ||
| 340 | return ret; | ||
| 341 | } | ||
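rdtgroup_mondata_show() recovers the resource, domain and event of a mon_data file from the kernfs private pointer via union mon_data_bits. A standalone sketch of that pack/unpack round trip (field widths copied from intel_rdt.h; the example values are arbitrary):

    /* Sketch of the mon_data_bits pack/unpack used for mon_data event files. */
    #include <stdio.h>

    union mon_data_bits {
        void *priv;
        struct {
            unsigned int rid   : 10;
            unsigned int evtid : 8;
            unsigned int domid : 14;
        } u;
    };

    int main(void)
    {
        union mon_data_bits md = { .priv = NULL };

        /* mkdir side: encode resource 0, event 0x02 (MBM total), domain 1 (example values) */
        md.u.rid   = 0;
        md.u.evtid = 0x02;
        md.u.domid = 1;

        /* show side: everything comes back out of the same pointer-sized union */
        printf("rid=%u evtid=0x%02x domid=%u\n",
               (unsigned)md.u.rid, (unsigned)md.u.evtid, (unsigned)md.u.domid);
        return 0;
    }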
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c new file mode 100644 index 000000000000..30827510094b --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c | |||
| @@ -0,0 +1,499 @@ | |||
| 1 | /* | ||
| 2 | * Resource Director Technology(RDT) | ||
| 3 | * - Monitoring code | ||
| 4 | * | ||
| 5 | * Copyright (C) 2017 Intel Corporation | ||
| 6 | * | ||
| 7 | * Author: | ||
| 8 | * Vikas Shivappa <vikas.shivappa@intel.com> | ||
| 9 | * | ||
| 10 | * This replaces the perf-based cqm.c, but we reuse a lot of | ||
| 11 | * code and data structures originally from Peter Zijlstra and Matt Fleming. | ||
| 12 | * | ||
| 13 | * This program is free software; you can redistribute it and/or modify it | ||
| 14 | * under the terms and conditions of the GNU General Public License, | ||
| 15 | * version 2, as published by the Free Software Foundation. | ||
| 16 | * | ||
| 17 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
| 18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 20 | * more details. | ||
| 21 | * | ||
| 22 | * More information about RDT can be found in the Intel (R) x86 Architecture | ||
| 23 | * Software Developer Manual June 2016, volume 3, section 17.17. | ||
| 24 | */ | ||
| 25 | |||
| 26 | #include <linux/module.h> | ||
| 27 | #include <linux/slab.h> | ||
| 28 | #include <asm/cpu_device_id.h> | ||
| 29 | #include "intel_rdt.h" | ||
| 30 | |||
| 31 | #define MSR_IA32_QM_CTR 0x0c8e | ||
| 32 | #define MSR_IA32_QM_EVTSEL 0x0c8d | ||
| 33 | |||
| 34 | struct rmid_entry { | ||
| 35 | u32 rmid; | ||
| 36 | int busy; | ||
| 37 | struct list_head list; | ||
| 38 | }; | ||
| 39 | |||
| 40 | /** | ||
| 41 | * @rmid_free_lru A least recently used list of free RMIDs | ||
| 42 | * These RMIDs are guaranteed to have an occupancy less than the | ||
| 43 | * threshold occupancy | ||
| 44 | */ | ||
| 45 | static LIST_HEAD(rmid_free_lru); | ||
| 46 | |||
| 47 | /** | ||
| 48 | * @rmid_limbo_count count of currently unused but (potentially) | ||
| 49 | * dirty RMIDs. | ||
| 50 | * This counts RMIDs that no one is currently using but that | ||
| 51 | * may have an occupancy value > intel_cqm_threshold. User can change | ||
| 52 | * the threshold occupancy value. | ||
| 53 | */ | ||
| 54 | unsigned int rmid_limbo_count; | ||
| 55 | |||
| 56 | /** | ||
| 57 | * @rmid_entry - The entry in the limbo and free lists. | ||
| 58 | */ | ||
| 59 | static struct rmid_entry *rmid_ptrs; | ||
| 60 | |||
| 61 | /* | ||
| 62 | * Global boolean for rdt_monitor which is true if any | ||
| 63 | * resource monitoring is enabled. | ||
| 64 | */ | ||
| 65 | bool rdt_mon_capable; | ||
| 66 | |||
| 67 | /* | ||
| 68 | * Global to indicate which monitoring events are enabled. | ||
| 69 | */ | ||
| 70 | unsigned int rdt_mon_features; | ||
| 71 | |||
| 72 | /* | ||
| 73 | * This is the threshold cache occupancy at which we will consider an | ||
| 74 | * RMID available for re-allocation. | ||
| 75 | */ | ||
| 76 | unsigned int intel_cqm_threshold; | ||
| 77 | |||
| 78 | static inline struct rmid_entry *__rmid_entry(u32 rmid) | ||
| 79 | { | ||
| 80 | struct rmid_entry *entry; | ||
| 81 | |||
| 82 | entry = &rmid_ptrs[rmid]; | ||
| 83 | WARN_ON(entry->rmid != rmid); | ||
| 84 | |||
| 85 | return entry; | ||
| 86 | } | ||
| 87 | |||
| 88 | static u64 __rmid_read(u32 rmid, u32 eventid) | ||
| 89 | { | ||
| 90 | u64 val; | ||
| 91 | |||
| 92 | /* | ||
| 93 | * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured | ||
| 94 | * with a valid event code for supported resource type and the bits | ||
| 95 | * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, | ||
| 96 | * IA32_QM_CTR.data (bits 61:0) reports the monitored data. | ||
| 97 | * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) | ||
| 98 | * are error bits. | ||
| 99 | */ | ||
| 100 | wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); | ||
| 101 | rdmsrl(MSR_IA32_QM_CTR, val); | ||
| 102 | |||
| 103 | return val; | ||
| 104 | } | ||
| 105 | |||
| 106 | static bool rmid_dirty(struct rmid_entry *entry) | ||
| 107 | { | ||
| 108 | u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); | ||
| 109 | |||
| 110 | return val >= intel_cqm_threshold; | ||
| 111 | } | ||
| 112 | |||
| 113 | /* | ||
| 114 | * Check the RMIDs that are marked as busy for this domain. If the | ||
| 115 | * reported LLC occupancy is below the threshold clear the busy bit and | ||
| 116 | * decrement the count. If the busy count gets to zero on an RMID, we | ||
| 117 | * free the RMID | ||
| 118 | */ | ||
| 119 | void __check_limbo(struct rdt_domain *d, bool force_free) | ||
| 120 | { | ||
| 121 | struct rmid_entry *entry; | ||
| 122 | struct rdt_resource *r; | ||
| 123 | u32 crmid = 1, nrmid; | ||
| 124 | |||
| 125 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | ||
| 126 | |||
| 127 | /* | ||
| 128 | * Skip RMID 0 and start from RMID 1 and check all the RMIDs that | ||
| 129 | * are marked as busy for occupancy < threshold. If the occupancy | ||
| 130 | * is less than the threshold decrement the busy counter of the | ||
| 131 | * RMID and move it to the free list when the counter reaches 0. | ||
| 132 | */ | ||
| 133 | for (;;) { | ||
| 134 | nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); | ||
| 135 | if (nrmid >= r->num_rmid) | ||
| 136 | break; | ||
| 137 | |||
| 138 | entry = __rmid_entry(nrmid); | ||
| 139 | if (force_free || !rmid_dirty(entry)) { | ||
| 140 | clear_bit(entry->rmid, d->rmid_busy_llc); | ||
| 141 | if (!--entry->busy) { | ||
| 142 | rmid_limbo_count--; | ||
| 143 | list_add_tail(&entry->list, &rmid_free_lru); | ||
| 144 | } | ||
| 145 | } | ||
| 146 | crmid = nrmid + 1; | ||
| 147 | } | ||
| 148 | } | ||
| 149 | |||
| 150 | bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) | ||
| 151 | { | ||
| 152 | return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; | ||
| 153 | } | ||
| 154 | |||
| 155 | /* | ||
| 156 | * As of now, RMID allocation is global. | ||
| 157 | * However, we keep track of which packages the RMIDs | ||
| 158 | * are used on to optimize the limbo list management. | ||
| 159 | */ | ||
| 160 | int alloc_rmid(void) | ||
| 161 | { | ||
| 162 | struct rmid_entry *entry; | ||
| 163 | |||
| 164 | lockdep_assert_held(&rdtgroup_mutex); | ||
| 165 | |||
| 166 | if (list_empty(&rmid_free_lru)) | ||
| 167 | return rmid_limbo_count ? -EBUSY : -ENOSPC; | ||
| 168 | |||
| 169 | entry = list_first_entry(&rmid_free_lru, | ||
| 170 | struct rmid_entry, list); | ||
| 171 | list_del(&entry->list); | ||
| 172 | |||
| 173 | return entry->rmid; | ||
| 174 | } | ||
| 175 | |||
| 176 | static void add_rmid_to_limbo(struct rmid_entry *entry) | ||
| 177 | { | ||
| 178 | struct rdt_resource *r; | ||
| 179 | struct rdt_domain *d; | ||
| 180 | int cpu; | ||
| 181 | u64 val; | ||
| 182 | |||
| 183 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | ||
| 184 | |||
| 185 | entry->busy = 0; | ||
| 186 | cpu = get_cpu(); | ||
| 187 | list_for_each_entry(d, &r->domains, list) { | ||
| 188 | if (cpumask_test_cpu(cpu, &d->cpu_mask)) { | ||
| 189 | val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); | ||
| 190 | if (val <= intel_cqm_threshold) | ||
| 191 | continue; | ||
| 192 | } | ||
| 193 | |||
| 194 | /* | ||
| 195 | * For the first limbo RMID in the domain, | ||
| 196 | * set up the limbo worker. | ||
| 197 | */ | ||
| 198 | if (!has_busy_rmid(r, d)) | ||
| 199 | cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); | ||
| 200 | set_bit(entry->rmid, d->rmid_busy_llc); | ||
| 201 | entry->busy++; | ||
| 202 | } | ||
| 203 | put_cpu(); | ||
| 204 | |||
| 205 | if (entry->busy) | ||
| 206 | rmid_limbo_count++; | ||
| 207 | else | ||
| 208 | list_add_tail(&entry->list, &rmid_free_lru); | ||
| 209 | } | ||
| 210 | |||
| 211 | void free_rmid(u32 rmid) | ||
| 212 | { | ||
| 213 | struct rmid_entry *entry; | ||
| 214 | |||
| 215 | if (!rmid) | ||
| 216 | return; | ||
| 217 | |||
| 218 | lockdep_assert_held(&rdtgroup_mutex); | ||
| 219 | |||
| 220 | entry = __rmid_entry(rmid); | ||
| 221 | |||
| 222 | if (is_llc_occupancy_enabled()) | ||
| 223 | add_rmid_to_limbo(entry); | ||
| 224 | else | ||
| 225 | list_add_tail(&entry->list, &rmid_free_lru); | ||
| 226 | } | ||
| 227 | |||
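Taken together, alloc_rmid(), free_rmid() and the limbo machinery give each RMID a simple lifecycle: free, in use, then possibly parked in limbo until its cached LLC occupancy drains below intel_cqm_threshold. A compressed simulation of that lifecycle (the occupancy readings are made up; this is not the kernel code):

    /* Simulated RMID lifecycle: free -> in use -> limbo -> free. */
    #include <stdio.h>

    enum rmid_state { RMID_FREE, RMID_IN_USE, RMID_LIMBO };
    static const char *name[] = { "free", "in use", "limbo" };

    int main(void)
    {
        unsigned long long threshold = 65536;   /* stand-in for intel_cqm_threshold */
        unsigned long long occupancy = 200000;  /* simulated LLC occupancy at free time */
        enum rmid_state state;

        state = RMID_IN_USE;                    /* alloc_rmid(): taken off rmid_free_lru */
        printf("after alloc: %s\n", name[state]);

        /* free_rmid() with llc_occupancy enabled: an RMID whose occupancy is still
         * above the threshold parks on the limbo list instead of the free list. */
        state = (occupancy > threshold) ? RMID_LIMBO : RMID_FREE;
        printf("after free : %s\n", name[state]);

        /* cqm_handle_limbo() re-reads the occupancy periodically; once it drops
         * below the threshold, __check_limbo() returns the RMID to the free list. */
        occupancy = 1024;
        if (state == RMID_LIMBO && occupancy < threshold)
            state = RMID_FREE;
        printf("after limbo: %s\n", name[state]);
        return 0;
    }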
| 228 | static int __mon_event_count(u32 rmid, struct rmid_read *rr) | ||
| 229 | { | ||
| 230 | u64 chunks, shift, tval; | ||
| 231 | struct mbm_state *m; | ||
| 232 | |||
| 233 | tval = __rmid_read(rmid, rr->evtid); | ||
| 234 | if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { | ||
| 235 | rr->val = tval; | ||
| 236 | return -EINVAL; | ||
| 237 | } | ||
| 238 | switch (rr->evtid) { | ||
| 239 | case QOS_L3_OCCUP_EVENT_ID: | ||
| 240 | rr->val += tval; | ||
| 241 | return 0; | ||
| 242 | case QOS_L3_MBM_TOTAL_EVENT_ID: | ||
| 243 | m = &rr->d->mbm_total[rmid]; | ||
| 244 | break; | ||
| 245 | case QOS_L3_MBM_LOCAL_EVENT_ID: | ||
| 246 | m = &rr->d->mbm_local[rmid]; | ||
| 247 | break; | ||
| 248 | default: | ||
| 249 | /* | ||
| 250 | * Code would never reach here because | ||
| 251 | * an invalid event id would fail the __rmid_read. | ||
| 252 | */ | ||
| 253 | return -EINVAL; | ||
| 254 | } | ||
| 255 | |||
| 256 | if (rr->first) { | ||
| 257 | m->prev_msr = tval; | ||
| 258 | m->chunks = 0; | ||
| 259 | return 0; | ||
| 260 | } | ||
| 261 | |||
| 262 | shift = 64 - MBM_CNTR_WIDTH; | ||
| 263 | chunks = (tval << shift) - (m->prev_msr << shift); | ||
| 264 | chunks >>= shift; | ||
| 265 | m->chunks += chunks; | ||
| 266 | m->prev_msr = tval; | ||
| 267 | |||
| 268 | rr->val += m->chunks; | ||
| 269 | return 0; | ||
| 270 | } | ||
| 271 | |||
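The chunks arithmetic in __mon_event_count() handles wraparound of the 24-bit MBM counter: shifting both samples up by 64 - MBM_CNTR_WIDTH before subtracting makes the difference wrap modulo 2^24. A worked example with values chosen to straddle a wrap:

    /* Worked example of the 24-bit MBM counter delta in __mon_event_count(). */
    #include <stdio.h>

    #define MBM_CNTR_WIDTH 24

    int main(void)
    {
        unsigned long long prev_msr = 0xfffff0;  /* previous sample, near the 24-bit limit */
        unsigned long long tval     = 0x000010;  /* current sample, after the counter wrapped */
        unsigned long long shift    = 64 - MBM_CNTR_WIDTH;
        unsigned long long chunks;

        chunks = (tval << shift) - (prev_msr << shift);
        chunks >>= shift;

        /* Prints 0x20: 0x10 to wrap from 0xfffff0 back to zero, plus 0x10 counted since */
        printf("chunks = 0x%llx\n", chunks);
        return 0;
    }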
| 272 | /* | ||
| 273 | * This is called via IPI to read the CQM/MBM counters | ||
| 274 | * on a domain. | ||
| 275 | */ | ||
| 276 | void mon_event_count(void *info) | ||
| 277 | { | ||
| 278 | struct rdtgroup *rdtgrp, *entry; | ||
| 279 | struct rmid_read *rr = info; | ||
| 280 | struct list_head *head; | ||
| 281 | |||
| 282 | rdtgrp = rr->rgrp; | ||
| 283 | |||
| 284 | if (__mon_event_count(rdtgrp->mon.rmid, rr)) | ||
| 285 | return; | ||
| 286 | |||
| 287 | /* | ||
| 288 | * For Ctrl groups read data from child monitor groups. | ||
| 289 | */ | ||
| 290 | head = &rdtgrp->mon.crdtgrp_list; | ||
| 291 | |||
| 292 | if (rdtgrp->type == RDTCTRL_GROUP) { | ||
| 293 | list_for_each_entry(entry, head, mon.crdtgrp_list) { | ||
| 294 | if (__mon_event_count(entry->mon.rmid, rr)) | ||
| 295 | return; | ||
| 296 | } | ||
| 297 | } | ||
| 298 | } | ||
| 299 | |||
| 300 | static void mbm_update(struct rdt_domain *d, int rmid) | ||
| 301 | { | ||
| 302 | struct rmid_read rr; | ||
| 303 | |||
| 304 | rr.first = false; | ||
| 305 | rr.d = d; | ||
| 306 | |||
| 307 | /* | ||
| 308 | * This is protected against concurrent reads from user space, | ||
| 309 | * as both the user and the overflow worker hold the global mutex. | ||
| 310 | */ | ||
| 311 | if (is_mbm_total_enabled()) { | ||
| 312 | rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; | ||
| 313 | __mon_event_count(rmid, &rr); | ||
| 314 | } | ||
| 315 | if (is_mbm_local_enabled()) { | ||
| 316 | rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; | ||
| 317 | __mon_event_count(rmid, &rr); | ||
| 318 | } | ||
| 319 | } | ||
| 320 | |||
| 321 | /* | ||
| 322 | * Handler to scan the limbo list and move back to the free list | ||
| 323 | * any RMIDs whose occupancy has dropped below intel_cqm_threshold. | ||
| 324 | */ | ||
| 325 | void cqm_handle_limbo(struct work_struct *work) | ||
| 326 | { | ||
| 327 | unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); | ||
| 328 | int cpu = smp_processor_id(); | ||
| 329 | struct rdt_resource *r; | ||
| 330 | struct rdt_domain *d; | ||
| 331 | |||
| 332 | mutex_lock(&rdtgroup_mutex); | ||
| 333 | |||
| 334 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | ||
| 335 | d = get_domain_from_cpu(cpu, r); | ||
| 336 | |||
| 337 | if (!d) { | ||
| 338 | pr_warn_once("Failure to get domain for limbo worker\n"); | ||
| 339 | goto out_unlock; | ||
| 340 | } | ||
| 341 | |||
| 342 | __check_limbo(d, false); | ||
| 343 | |||
| 344 | if (has_busy_rmid(r, d)) | ||
| 345 | schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); | ||
| 346 | |||
| 347 | out_unlock: | ||
| 348 | mutex_unlock(&rdtgroup_mutex); | ||
| 349 | } | ||
| 350 | |||
| 351 | void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) | ||
| 352 | { | ||
| 353 | unsigned long delay = msecs_to_jiffies(delay_ms); | ||
| 354 | struct rdt_resource *r; | ||
| 355 | int cpu; | ||
| 356 | |||
| 357 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | ||
| 358 | |||
| 359 | cpu = cpumask_any(&dom->cpu_mask); | ||
| 360 | dom->cqm_work_cpu = cpu; | ||
| 361 | |||
| 362 | schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); | ||
| 363 | } | ||
| 364 | |||
| 365 | void mbm_handle_overflow(struct work_struct *work) | ||
| 366 | { | ||
| 367 | unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); | ||
| 368 | struct rdtgroup *prgrp, *crgrp; | ||
| 369 | int cpu = smp_processor_id(); | ||
| 370 | struct list_head *head; | ||
| 371 | struct rdt_domain *d; | ||
| 372 | |||
| 373 | mutex_lock(&rdtgroup_mutex); | ||
| 374 | |||
| 375 | if (!static_branch_likely(&rdt_enable_key)) | ||
| 376 | goto out_unlock; | ||
| 377 | |||
| 378 | d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); | ||
| 379 | if (!d) | ||
| 380 | goto out_unlock; | ||
| 381 | |||
| 382 | list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { | ||
| 383 | mbm_update(d, prgrp->mon.rmid); | ||
| 384 | |||
| 385 | head = &prgrp->mon.crdtgrp_list; | ||
| 386 | list_for_each_entry(crgrp, head, mon.crdtgrp_list) | ||
| 387 | mbm_update(d, crgrp->mon.rmid); | ||
| 388 | } | ||
| 389 | |||
| 390 | schedule_delayed_work_on(cpu, &d->mbm_over, delay); | ||
| 391 | |||
| 392 | out_unlock: | ||
| 393 | mutex_unlock(&rdtgroup_mutex); | ||
| 394 | } | ||
| 395 | |||
| 396 | void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) | ||
| 397 | { | ||
| 398 | unsigned long delay = msecs_to_jiffies(delay_ms); | ||
| 399 | int cpu; | ||
| 400 | |||
| 401 | if (!static_branch_likely(&rdt_enable_key)) | ||
| 402 | return; | ||
| 403 | cpu = cpumask_any(&dom->cpu_mask); | ||
| 404 | dom->mbm_work_cpu = cpu; | ||
| 405 | schedule_delayed_work_on(cpu, &dom->mbm_over, delay); | ||
| 406 | } | ||
| 407 | |||
| 408 | static int dom_data_init(struct rdt_resource *r) | ||
| 409 | { | ||
| 410 | struct rmid_entry *entry = NULL; | ||
| 411 | int i, nr_rmids; | ||
| 412 | |||
| 413 | nr_rmids = r->num_rmid; | ||
| 414 | rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); | ||
| 415 | if (!rmid_ptrs) | ||
| 416 | return -ENOMEM; | ||
| 417 | |||
| 418 | for (i = 0; i < nr_rmids; i++) { | ||
| 419 | entry = &rmid_ptrs[i]; | ||
| 420 | INIT_LIST_HEAD(&entry->list); | ||
| 421 | |||
| 422 | entry->rmid = i; | ||
| 423 | list_add_tail(&entry->list, &rmid_free_lru); | ||
| 424 | } | ||
| 425 | |||
| 426 | /* | ||
| 427 | * RMID 0 is special and is always allocated. It's used for all | ||
| 428 | * tasks that are not monitored. | ||
| 429 | */ | ||
| 430 | entry = __rmid_entry(0); | ||
| 431 | list_del(&entry->list); | ||
| 432 | |||
| 433 | return 0; | ||
| 434 | } | ||
| 435 | |||
| 436 | static struct mon_evt llc_occupancy_event = { | ||
| 437 | .name = "llc_occupancy", | ||
| 438 | .evtid = QOS_L3_OCCUP_EVENT_ID, | ||
| 439 | }; | ||
| 440 | |||
| 441 | static struct mon_evt mbm_total_event = { | ||
| 442 | .name = "mbm_total_bytes", | ||
| 443 | .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, | ||
| 444 | }; | ||
| 445 | |||
| 446 | static struct mon_evt mbm_local_event = { | ||
| 447 | .name = "mbm_local_bytes", | ||
| 448 | .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, | ||
| 449 | }; | ||
| 450 | |||
| 451 | /* | ||
| 452 | * Initialize the event list for the resource. | ||
| 453 | * | ||
| 454 | * Note that MBM events are also part of the RDT_RESOURCE_L3 resource | ||
| 455 | * because, per the SDM, the total and local memory bandwidth | ||
| 456 | * are enumerated as part of L3 monitoring. | ||
| 457 | */ | ||
| 458 | static void l3_mon_evt_init(struct rdt_resource *r) | ||
| 459 | { | ||
| 460 | INIT_LIST_HEAD(&r->evt_list); | ||
| 461 | |||
| 462 | if (is_llc_occupancy_enabled()) | ||
| 463 | list_add_tail(&llc_occupancy_event.list, &r->evt_list); | ||
| 464 | if (is_mbm_total_enabled()) | ||
| 465 | list_add_tail(&mbm_total_event.list, &r->evt_list); | ||
| 466 | if (is_mbm_local_enabled()) | ||
| 467 | list_add_tail(&mbm_local_event.list, &r->evt_list); | ||
| 468 | } | ||
| 469 | |||
| 470 | int rdt_get_mon_l3_config(struct rdt_resource *r) | ||
| 471 | { | ||
| 472 | int ret; | ||
| 473 | |||
| 474 | r->mon_scale = boot_cpu_data.x86_cache_occ_scale; | ||
| 475 | r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; | ||
| 476 | |||
| 477 | /* | ||
| 478 | * A reasonable upper limit on the max threshold is the number | ||
| 479 | * of lines tagged per RMID if all RMIDs have the same number of | ||
| 480 | * lines tagged in the LLC. | ||
| 481 | * | ||
| 482 | * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. | ||
| 483 | */ | ||
| 484 | intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid; | ||
| 485 | |||
| 486 | /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ | ||
| 487 | intel_cqm_threshold /= r->mon_scale; | ||
| 488 | |||
| 489 | ret = dom_data_init(r); | ||
| 490 | if (ret) | ||
| 491 | return ret; | ||
| 492 | |||
| 493 | l3_mon_evt_init(r); | ||
| 494 | |||
| 495 | r->mon_capable = true; | ||
| 496 | r->mon_enabled = true; | ||
| 497 | |||
| 498 | return 0; | ||
| 499 | } | ||
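rdt_get_mon_l3_config() sets the default occupancy threshold to the LLC size split evenly across all RMIDs, then converts bytes into the hardware's counter units via mon_scale. A worked version of that arithmetic, reusing the 35MB/56-RMID figures from the comment and assuming a 64-byte occupancy scale (the real scale is enumerated by CPUID and varies between parts):

#include <stdio.h>

/*
 * Worked example of the default intel_cqm_threshold computation.
 * The cache size and RMID count come from the comment above; the
 * occupancy scale factor is an assumption for illustration only.
 */
int main(void)
{
	unsigned int cache_size_kb = 35 * 1024;	/* 35MB LLC */
	unsigned int num_rmid = 56;
	unsigned int mon_scale = 64;		/* assumed bytes per counter unit */
	unsigned int threshold;

	threshold = cache_size_kb * 1024 / num_rmid;	/* bytes per RMID: 655360 */
	threshold /= mon_scale;				/* hardware units: 10240 */

	printf("threshold = %u units (%u bytes)\n",
	       threshold, threshold * mon_scale);
	return 0;
}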
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 9257bd9dc664..a869d4a073c5 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | |||
| @@ -32,17 +32,25 @@ | |||
| 32 | 32 | ||
| 33 | #include <uapi/linux/magic.h> | 33 | #include <uapi/linux/magic.h> |
| 34 | 34 | ||
| 35 | #include <asm/intel_rdt.h> | 35 | #include <asm/intel_rdt_sched.h> |
| 36 | #include <asm/intel_rdt_common.h> | 36 | #include "intel_rdt.h" |
| 37 | 37 | ||
| 38 | DEFINE_STATIC_KEY_FALSE(rdt_enable_key); | 38 | DEFINE_STATIC_KEY_FALSE(rdt_enable_key); |
| 39 | struct kernfs_root *rdt_root; | 39 | DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); |
| 40 | DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); | ||
| 41 | static struct kernfs_root *rdt_root; | ||
| 40 | struct rdtgroup rdtgroup_default; | 42 | struct rdtgroup rdtgroup_default; |
| 41 | LIST_HEAD(rdt_all_groups); | 43 | LIST_HEAD(rdt_all_groups); |
| 42 | 44 | ||
| 43 | /* Kernel fs node for "info" directory under root */ | 45 | /* Kernel fs node for "info" directory under root */ |
| 44 | static struct kernfs_node *kn_info; | 46 | static struct kernfs_node *kn_info; |
| 45 | 47 | ||
| 48 | /* Kernel fs node for "mon_groups" directory under root */ | ||
| 49 | static struct kernfs_node *kn_mongrp; | ||
| 50 | |||
| 51 | /* Kernel fs node for "mon_data" directory under root */ | ||
| 52 | static struct kernfs_node *kn_mondata; | ||
| 53 | |||
| 46 | /* | 54 | /* |
| 47 | * Trivial allocator for CLOSIDs. Since h/w only supports a small number, | 55 | * Trivial allocator for CLOSIDs. Since h/w only supports a small number, |
| 48 | * we can keep a bitmap of free CLOSIDs in a single integer. | 56 | * we can keep a bitmap of free CLOSIDs in a single integer. |
| @@ -66,7 +74,7 @@ static void closid_init(void) | |||
| 66 | int rdt_min_closid = 32; | 74 | int rdt_min_closid = 32; |
| 67 | 75 | ||
| 68 | /* Compute rdt_min_closid across all resources */ | 76 | /* Compute rdt_min_closid across all resources */ |
| 69 | for_each_enabled_rdt_resource(r) | 77 | for_each_alloc_enabled_rdt_resource(r) |
| 70 | rdt_min_closid = min(rdt_min_closid, r->num_closid); | 78 | rdt_min_closid = min(rdt_min_closid, r->num_closid); |
| 71 | 79 | ||
| 72 | closid_free_map = BIT_MASK(rdt_min_closid) - 1; | 80 | closid_free_map = BIT_MASK(rdt_min_closid) - 1; |
| @@ -75,9 +83,9 @@ static void closid_init(void) | |||
| 75 | closid_free_map &= ~1; | 83 | closid_free_map &= ~1; |
| 76 | } | 84 | } |
| 77 | 85 | ||
| 78 | int closid_alloc(void) | 86 | static int closid_alloc(void) |
| 79 | { | 87 | { |
| 80 | int closid = ffs(closid_free_map); | 88 | u32 closid = ffs(closid_free_map); |
| 81 | 89 | ||
| 82 | if (closid == 0) | 90 | if (closid == 0) |
| 83 | return -ENOSPC; | 91 | return -ENOSPC; |
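closid_init()/closid_alloc() implement the "bitmap in a single integer" allocator described in the comment: each free CLOSID is a set bit, ffs() picks the lowest one (1-based, with 0 meaning the map is empty), and CLOSID 0 stays reserved for the default group. A stand-alone sketch of the same idea, with -1 standing in for -ENOSPC:

#include <stdio.h>
#include <strings.h>	/* ffs() */

static unsigned int closid_free_map;

static void closid_init(int num_closid)
{
	/* One bit per CLOSID; reserve CLOSID 0 for the default group. */
	closid_free_map = (1U << num_closid) - 1;
	closid_free_map &= ~1U;
}

static int closid_alloc(void)
{
	int closid = ffs(closid_free_map);	/* lowest free ID, 1-based */

	if (closid == 0)
		return -1;			/* no free CLOSIDs left */
	closid--;
	closid_free_map &= ~(1U << closid);	/* mark it used */
	return closid;
}

int main(void)
{
	closid_init(4);
	/* Prints 1, 2, 3, then -1 once the map is exhausted. */
	for (int i = 0; i < 4; i++)
		printf("%d\n", closid_alloc());
	return 0;
}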
| @@ -125,28 +133,6 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) | |||
| 125 | return 0; | 133 | return 0; |
| 126 | } | 134 | } |
| 127 | 135 | ||
| 128 | static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts, | ||
| 129 | int len) | ||
| 130 | { | ||
| 131 | struct rftype *rft; | ||
| 132 | int ret; | ||
| 133 | |||
| 134 | lockdep_assert_held(&rdtgroup_mutex); | ||
| 135 | |||
| 136 | for (rft = rfts; rft < rfts + len; rft++) { | ||
| 137 | ret = rdtgroup_add_file(kn, rft); | ||
| 138 | if (ret) | ||
| 139 | goto error; | ||
| 140 | } | ||
| 141 | |||
| 142 | return 0; | ||
| 143 | error: | ||
| 144 | pr_warn("Failed to add %s, err=%d\n", rft->name, ret); | ||
| 145 | while (--rft >= rfts) | ||
| 146 | kernfs_remove_by_name(kn, rft->name); | ||
| 147 | return ret; | ||
| 148 | } | ||
| 149 | |||
| 150 | static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) | 136 | static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) |
| 151 | { | 137 | { |
| 152 | struct kernfs_open_file *of = m->private; | 138 | struct kernfs_open_file *of = m->private; |
| @@ -174,6 +160,11 @@ static struct kernfs_ops rdtgroup_kf_single_ops = { | |||
| 174 | .seq_show = rdtgroup_seqfile_show, | 160 | .seq_show = rdtgroup_seqfile_show, |
| 175 | }; | 161 | }; |
| 176 | 162 | ||
| 163 | static struct kernfs_ops kf_mondata_ops = { | ||
| 164 | .atomic_write_len = PAGE_SIZE, | ||
| 165 | .seq_show = rdtgroup_mondata_show, | ||
| 166 | }; | ||
| 167 | |||
| 177 | static bool is_cpu_list(struct kernfs_open_file *of) | 168 | static bool is_cpu_list(struct kernfs_open_file *of) |
| 178 | { | 169 | { |
| 179 | struct rftype *rft = of->kn->priv; | 170 | struct rftype *rft = of->kn->priv; |
| @@ -203,13 +194,18 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, | |||
| 203 | /* | 194 | /* |
| 204 | * This is safe against intel_rdt_sched_in() called from __switch_to() | 195 | * This is safe against intel_rdt_sched_in() called from __switch_to() |
| 205 | * because __switch_to() is executed with interrupts disabled. A local call | 196 | * because __switch_to() is executed with interrupts disabled. A local call |
| 206 | * from rdt_update_closid() is protected against __switch_to() because | 197 | * from update_closid_rmid() is protected against __switch_to() because
| 207 | * preemption is disabled. | 198 | * preemption is disabled. |
| 208 | */ | 199 | */ |
| 209 | static void rdt_update_cpu_closid(void *closid) | 200 | static void update_cpu_closid_rmid(void *info) |
| 210 | { | 201 | { |
| 211 | if (closid) | 202 | struct rdtgroup *r = info; |
| 212 | this_cpu_write(cpu_closid, *(int *)closid); | 203 | |
| 204 | if (r) { | ||
| 205 | this_cpu_write(pqr_state.default_closid, r->closid); | ||
| 206 | this_cpu_write(pqr_state.default_rmid, r->mon.rmid); | ||
| 207 | } | ||
| 208 | |||
| 213 | /* | 209 | /* |
| 214 | * We cannot unconditionally write the MSR because the current | 210 | * We cannot unconditionally write the MSR because the current |
| 215 | * executing task might have its own closid selected. Just reuse | 211 | * executing task might have its own closid selected. Just reuse |
| @@ -221,28 +217,128 @@ static void rdt_update_cpu_closid(void *closid) | |||
| 221 | /* | 217 | /* |
| 222 | * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, | 218 | * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, |
| 223 | * | 219 | * |
| 224 | * Per task closids must have been set up before calling this function. | 220 | * Per task closids/rmids must have been set up before calling this function. |
| 225 | * | ||
| 226 | * The per cpu closids are updated with the smp function call, when @closid | ||
| 227 | * is not NULL. If @closid is NULL then all affected percpu closids must | ||
| 228 | * have been set up before calling this function. | ||
| 229 | */ | 221 | */ |
| 230 | static void | 222 | static void |
| 231 | rdt_update_closid(const struct cpumask *cpu_mask, int *closid) | 223 | update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) |
| 232 | { | 224 | { |
| 233 | int cpu = get_cpu(); | 225 | int cpu = get_cpu(); |
| 234 | 226 | ||
| 235 | if (cpumask_test_cpu(cpu, cpu_mask)) | 227 | if (cpumask_test_cpu(cpu, cpu_mask)) |
| 236 | rdt_update_cpu_closid(closid); | 228 | update_cpu_closid_rmid(r); |
| 237 | smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); | 229 | smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1); |
| 238 | put_cpu(); | 230 | put_cpu(); |
| 239 | } | 231 | } |
| 240 | 232 | ||
| 233 | static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, | ||
| 234 | cpumask_var_t tmpmask) | ||
| 235 | { | ||
| 236 | struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; | ||
| 237 | struct list_head *head; | ||
| 238 | |||
| 239 | /* Check whether cpus belong to parent ctrl group */ | ||
| 240 | cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); | ||
| 241 | if (cpumask_weight(tmpmask)) | ||
| 242 | return -EINVAL; | ||
| 243 | |||
| 244 | /* Check whether cpus are dropped from this group */ | ||
| 245 | cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); | ||
| 246 | if (cpumask_weight(tmpmask)) { | ||
| 247 | /* Give any dropped cpus to parent rdtgroup */ | ||
| 248 | cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); | ||
| 249 | update_closid_rmid(tmpmask, prgrp); | ||
| 250 | } | ||
| 251 | |||
| 252 | /* | ||
| 253 | * If we added cpus, remove them from previous group that owned them | ||
| 254 | * and update per-cpu rmid | ||
| 255 | */ | ||
| 256 | cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); | ||
| 257 | if (cpumask_weight(tmpmask)) { | ||
| 258 | head = &prgrp->mon.crdtgrp_list; | ||
| 259 | list_for_each_entry(crgrp, head, mon.crdtgrp_list) { | ||
| 260 | if (crgrp == rdtgrp) | ||
| 261 | continue; | ||
| 262 | cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, | ||
| 263 | tmpmask); | ||
| 264 | } | ||
| 265 | update_closid_rmid(tmpmask, rdtgrp); | ||
| 266 | } | ||
| 267 | |||
| 268 | /* Done pushing/pulling - update this group with new mask */ | ||
| 269 | cpumask_copy(&rdtgrp->cpu_mask, newmask); | ||
| 270 | |||
| 271 | return 0; | ||
| 272 | } | ||
| 273 | |||
| 274 | static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) | ||
| 275 | { | ||
| 276 | struct rdtgroup *crgrp; | ||
| 277 | |||
| 278 | cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); | ||
| 279 | /* update the child mon group masks as well */ | ||
| 280 | list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) | ||
| 281 | cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); | ||
| 282 | } | ||
| 283 | |||
| 284 | static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, | ||
| 285 | cpumask_var_t tmpmask, cpumask_var_t tmpmask1) | ||
| 286 | { | ||
| 287 | struct rdtgroup *r, *crgrp; | ||
| 288 | struct list_head *head; | ||
| 289 | |||
| 290 | /* Check whether cpus are dropped from this group */ | ||
| 291 | cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); | ||
| 292 | if (cpumask_weight(tmpmask)) { | ||
| 293 | /* Can't drop from default group */ | ||
| 294 | if (rdtgrp == &rdtgroup_default) | ||
| 295 | return -EINVAL; | ||
| 296 | |||
| 297 | /* Give any dropped cpus to rdtgroup_default */ | ||
| 298 | cpumask_or(&rdtgroup_default.cpu_mask, | ||
| 299 | &rdtgroup_default.cpu_mask, tmpmask); | ||
| 300 | update_closid_rmid(tmpmask, &rdtgroup_default); | ||
| 301 | } | ||
| 302 | |||
| 303 | /* | ||
| 304 | * If we added cpus, remove them from previous group and | ||
| 305 | * the prev group's child groups that owned them | ||
| 306 | * and update per-cpu closid/rmid. | ||
| 307 | */ | ||
| 308 | cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); | ||
| 309 | if (cpumask_weight(tmpmask)) { | ||
| 310 | list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { | ||
| 311 | if (r == rdtgrp) | ||
| 312 | continue; | ||
| 313 | cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); | ||
| 314 | if (cpumask_weight(tmpmask1)) | ||
| 315 | cpumask_rdtgrp_clear(r, tmpmask1); | ||
| 316 | } | ||
| 317 | update_closid_rmid(tmpmask, rdtgrp); | ||
| 318 | } | ||
| 319 | |||
| 320 | /* Done pushing/pulling - update this group with new mask */ | ||
| 321 | cpumask_copy(&rdtgrp->cpu_mask, newmask); | ||
| 322 | |||
| 323 | /* | ||
| 324 | * Clear child mon group masks since there is a new parent mask | ||
| 325 | * now and update the rmid for the cpus the child lost. | ||
| 326 | */ | ||
| 327 | head = &rdtgrp->mon.crdtgrp_list; | ||
| 328 | list_for_each_entry(crgrp, head, mon.crdtgrp_list) { | ||
| 329 | cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); | ||
| 330 | update_closid_rmid(tmpmask, rdtgrp); | ||
| 331 | cpumask_clear(&crgrp->cpu_mask); | ||
| 332 | } | ||
| 333 | |||
| 334 | return 0; | ||
| 335 | } | ||
| 336 | |||
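Both write paths above reduce to the same two set operations: CPUs present in the old mask but not the new one are pushed back to the parent (or default) group, and CPUs present in the new mask but not the old one are pulled away from whichever group currently owns them. A plain-integer sketch of that arithmetic (illustrative only; the real code operates on struct cpumask with cpumask_andnot()):

#include <stdio.h>

int main(void)
{
	unsigned int old_mask = 0x0f;	/* group currently owns CPUs 0-3 */
	unsigned int new_mask = 0x3c;	/* user writes CPUs 2-5 */

	unsigned int dropped = old_mask & ~new_mask;	/* CPUs 0,1 -> default group */
	unsigned int added   = new_mask & ~old_mask;	/* CPUs 4,5 <- their old owners */

	printf("dropped=%#x added=%#x\n", dropped, added);	/* 0x3 and 0x30 */
	return 0;
}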
| 241 | static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, | 337 | static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, |
| 242 | char *buf, size_t nbytes, loff_t off) | 338 | char *buf, size_t nbytes, loff_t off) |
| 243 | { | 339 | { |
| 244 | cpumask_var_t tmpmask, newmask; | 340 | cpumask_var_t tmpmask, newmask, tmpmask1; |
| 245 | struct rdtgroup *rdtgrp, *r; | 341 | struct rdtgroup *rdtgrp; |
| 246 | int ret; | 342 | int ret; |
| 247 | 343 | ||
| 248 | if (!buf) | 344 | if (!buf) |
| @@ -254,6 +350,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, | |||
| 254 | free_cpumask_var(tmpmask); | 350 | free_cpumask_var(tmpmask); |
| 255 | return -ENOMEM; | 351 | return -ENOMEM; |
| 256 | } | 352 | } |
| 353 | if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { | ||
| 354 | free_cpumask_var(tmpmask); | ||
| 355 | free_cpumask_var(newmask); | ||
| 356 | return -ENOMEM; | ||
| 357 | } | ||
| 257 | 358 | ||
| 258 | rdtgrp = rdtgroup_kn_lock_live(of->kn); | 359 | rdtgrp = rdtgroup_kn_lock_live(of->kn); |
| 259 | if (!rdtgrp) { | 360 | if (!rdtgrp) { |
| @@ -276,41 +377,18 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, | |||
| 276 | goto unlock; | 377 | goto unlock; |
| 277 | } | 378 | } |
| 278 | 379 | ||
| 279 | /* Check whether cpus are dropped from this group */ | 380 | if (rdtgrp->type == RDTCTRL_GROUP) |
| 280 | cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); | 381 | ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); |
| 281 | if (cpumask_weight(tmpmask)) { | 382 | else if (rdtgrp->type == RDTMON_GROUP) |
| 282 | /* Can't drop from default group */ | 383 | ret = cpus_mon_write(rdtgrp, newmask, tmpmask); |
| 283 | if (rdtgrp == &rdtgroup_default) { | 384 | else |
| 284 | ret = -EINVAL; | 385 | ret = -EINVAL; |
| 285 | goto unlock; | ||
| 286 | } | ||
| 287 | /* Give any dropped cpus to rdtgroup_default */ | ||
| 288 | cpumask_or(&rdtgroup_default.cpu_mask, | ||
| 289 | &rdtgroup_default.cpu_mask, tmpmask); | ||
| 290 | rdt_update_closid(tmpmask, &rdtgroup_default.closid); | ||
| 291 | } | ||
| 292 | |||
| 293 | /* | ||
| 294 | * If we added cpus, remove them from previous group that owned them | ||
| 295 | * and update per-cpu closid | ||
| 296 | */ | ||
| 297 | cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); | ||
| 298 | if (cpumask_weight(tmpmask)) { | ||
| 299 | list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { | ||
| 300 | if (r == rdtgrp) | ||
| 301 | continue; | ||
| 302 | cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); | ||
| 303 | } | ||
| 304 | rdt_update_closid(tmpmask, &rdtgrp->closid); | ||
| 305 | } | ||
| 306 | |||
| 307 | /* Done pushing/pulling - update this group with new mask */ | ||
| 308 | cpumask_copy(&rdtgrp->cpu_mask, newmask); | ||
| 309 | 386 | ||
| 310 | unlock: | 387 | unlock: |
| 311 | rdtgroup_kn_unlock(of->kn); | 388 | rdtgroup_kn_unlock(of->kn); |
| 312 | free_cpumask_var(tmpmask); | 389 | free_cpumask_var(tmpmask); |
| 313 | free_cpumask_var(newmask); | 390 | free_cpumask_var(newmask); |
| 391 | free_cpumask_var(tmpmask1); | ||
| 314 | 392 | ||
| 315 | return ret ?: nbytes; | 393 | return ret ?: nbytes; |
| 316 | } | 394 | } |
| @@ -336,6 +414,7 @@ static void move_myself(struct callback_head *head) | |||
| 336 | if (atomic_dec_and_test(&rdtgrp->waitcount) && | 414 | if (atomic_dec_and_test(&rdtgrp->waitcount) && |
| 337 | (rdtgrp->flags & RDT_DELETED)) { | 415 | (rdtgrp->flags & RDT_DELETED)) { |
| 338 | current->closid = 0; | 416 | current->closid = 0; |
| 417 | current->rmid = 0; | ||
| 339 | kfree(rdtgrp); | 418 | kfree(rdtgrp); |
| 340 | } | 419 | } |
| 341 | 420 | ||
| @@ -374,7 +453,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk, | |||
| 374 | atomic_dec(&rdtgrp->waitcount); | 453 | atomic_dec(&rdtgrp->waitcount); |
| 375 | kfree(callback); | 454 | kfree(callback); |
| 376 | } else { | 455 | } else { |
| 377 | tsk->closid = rdtgrp->closid; | 456 | /* |
| 457 | * For ctrl_mon groups move both closid and rmid. | ||
| 458 | * For monitor groups, can move the tasks only from | ||
| 459 | * their parent CTRL group. | ||
| 460 | */ | ||
| 461 | if (rdtgrp->type == RDTCTRL_GROUP) { | ||
| 462 | tsk->closid = rdtgrp->closid; | ||
| 463 | tsk->rmid = rdtgrp->mon.rmid; | ||
| 464 | } else if (rdtgrp->type == RDTMON_GROUP) { | ||
| 465 | if (rdtgrp->mon.parent->closid == tsk->closid) | ||
| 466 | tsk->rmid = rdtgrp->mon.rmid; | ||
| 467 | else | ||
| 468 | ret = -EINVAL; | ||
| 469 | } | ||
| 378 | } | 470 | } |
| 379 | return ret; | 471 | return ret; |
| 380 | } | 472 | } |
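The rules encoded in __rdtgroup_move_task(): moving a task into a ctrl_mon group rewrites both its closid and rmid, while moving it into a monitor group rewrites only the rmid, and only if the task already belongs to that group's parent ctrl_mon group (same closid). A compact restatement with simplified, made-up types (not the kernel's structures):

#include <stdio.h>

struct task { int closid, rmid; };
struct group { int type, closid, rmid, parent_closid; };

enum { CTRL_GROUP, MON_GROUP };

/* Returns 0 on success, -1 if the move is not allowed. */
static int move_task(struct task *t, const struct group *g)
{
	if (g->type == CTRL_GROUP) {
		t->closid = g->closid;	/* ctrl_mon: both IDs follow the group */
		t->rmid = g->rmid;
		return 0;
	}
	/* mon group: only tasks of the parent ctrl_mon group may join */
	if (t->closid != g->parent_closid)
		return -1;
	t->rmid = g->rmid;
	return 0;
}

int main(void)
{
	struct task t = { .closid = 2, .rmid = 5 };
	struct group mon = { .type = MON_GROUP, .rmid = 9, .parent_closid = 3 };

	printf("%d\n", move_task(&t, &mon));	/* -1: wrong parent group */
	t.closid = 3;
	printf("%d\n", move_task(&t, &mon));	/* 0: rmid becomes 9 */
	return 0;
}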
| @@ -454,7 +546,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) | |||
| 454 | 546 | ||
| 455 | rcu_read_lock(); | 547 | rcu_read_lock(); |
| 456 | for_each_process_thread(p, t) { | 548 | for_each_process_thread(p, t) { |
| 457 | if (t->closid == r->closid) | 549 | if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || |
| 550 | (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) | ||
| 458 | seq_printf(s, "%d\n", t->pid); | 551 | seq_printf(s, "%d\n", t->pid); |
| 459 | } | 552 | } |
| 460 | rcu_read_unlock(); | 553 | rcu_read_unlock(); |
| @@ -476,39 +569,6 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, | |||
| 476 | return ret; | 569 | return ret; |
| 477 | } | 570 | } |
| 478 | 571 | ||
| 479 | /* Files in each rdtgroup */ | ||
| 480 | static struct rftype rdtgroup_base_files[] = { | ||
| 481 | { | ||
| 482 | .name = "cpus", | ||
| 483 | .mode = 0644, | ||
| 484 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 485 | .write = rdtgroup_cpus_write, | ||
| 486 | .seq_show = rdtgroup_cpus_show, | ||
| 487 | }, | ||
| 488 | { | ||
| 489 | .name = "cpus_list", | ||
| 490 | .mode = 0644, | ||
| 491 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 492 | .write = rdtgroup_cpus_write, | ||
| 493 | .seq_show = rdtgroup_cpus_show, | ||
| 494 | .flags = RFTYPE_FLAGS_CPUS_LIST, | ||
| 495 | }, | ||
| 496 | { | ||
| 497 | .name = "tasks", | ||
| 498 | .mode = 0644, | ||
| 499 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 500 | .write = rdtgroup_tasks_write, | ||
| 501 | .seq_show = rdtgroup_tasks_show, | ||
| 502 | }, | ||
| 503 | { | ||
| 504 | .name = "schemata", | ||
| 505 | .mode = 0644, | ||
| 506 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 507 | .write = rdtgroup_schemata_write, | ||
| 508 | .seq_show = rdtgroup_schemata_show, | ||
| 509 | }, | ||
| 510 | }; | ||
| 511 | |||
| 512 | static int rdt_num_closids_show(struct kernfs_open_file *of, | 572 | static int rdt_num_closids_show(struct kernfs_open_file *of, |
| 513 | struct seq_file *seq, void *v) | 573 | struct seq_file *seq, void *v) |
| 514 | { | 574 | { |
| @@ -536,6 +596,15 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, | |||
| 536 | return 0; | 596 | return 0; |
| 537 | } | 597 | } |
| 538 | 598 | ||
| 599 | static int rdt_shareable_bits_show(struct kernfs_open_file *of, | ||
| 600 | struct seq_file *seq, void *v) | ||
| 601 | { | ||
| 602 | struct rdt_resource *r = of->kn->parent->priv; | ||
| 603 | |||
| 604 | seq_printf(seq, "%x\n", r->cache.shareable_bits); | ||
| 605 | return 0; | ||
| 606 | } | ||
| 607 | |||
| 539 | static int rdt_min_bw_show(struct kernfs_open_file *of, | 608 | static int rdt_min_bw_show(struct kernfs_open_file *of, |
| 540 | struct seq_file *seq, void *v) | 609 | struct seq_file *seq, void *v) |
| 541 | { | 610 | { |
| @@ -545,6 +614,28 @@ static int rdt_min_bw_show(struct kernfs_open_file *of, | |||
| 545 | return 0; | 614 | return 0; |
| 546 | } | 615 | } |
| 547 | 616 | ||
| 617 | static int rdt_num_rmids_show(struct kernfs_open_file *of, | ||
| 618 | struct seq_file *seq, void *v) | ||
| 619 | { | ||
| 620 | struct rdt_resource *r = of->kn->parent->priv; | ||
| 621 | |||
| 622 | seq_printf(seq, "%d\n", r->num_rmid); | ||
| 623 | |||
| 624 | return 0; | ||
| 625 | } | ||
| 626 | |||
| 627 | static int rdt_mon_features_show(struct kernfs_open_file *of, | ||
| 628 | struct seq_file *seq, void *v) | ||
| 629 | { | ||
| 630 | struct rdt_resource *r = of->kn->parent->priv; | ||
| 631 | struct mon_evt *mevt; | ||
| 632 | |||
| 633 | list_for_each_entry(mevt, &r->evt_list, list) | ||
| 634 | seq_printf(seq, "%s\n", mevt->name); | ||
| 635 | |||
| 636 | return 0; | ||
| 637 | } | ||
| 638 | |||
| 548 | static int rdt_bw_gran_show(struct kernfs_open_file *of, | 639 | static int rdt_bw_gran_show(struct kernfs_open_file *of, |
| 549 | struct seq_file *seq, void *v) | 640 | struct seq_file *seq, void *v) |
| 550 | { | 641 | { |
| @@ -563,74 +654,200 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, | |||
| 563 | return 0; | 654 | return 0; |
| 564 | } | 655 | } |
| 565 | 656 | ||
| 657 | static int max_threshold_occ_show(struct kernfs_open_file *of, | ||
| 658 | struct seq_file *seq, void *v) | ||
| 659 | { | ||
| 660 | struct rdt_resource *r = of->kn->parent->priv; | ||
| 661 | |||
| 662 | seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale); | ||
| 663 | |||
| 664 | return 0; | ||
| 665 | } | ||
| 666 | |||
| 667 | static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, | ||
| 668 | char *buf, size_t nbytes, loff_t off) | ||
| 669 | { | ||
| 670 | struct rdt_resource *r = of->kn->parent->priv; | ||
| 671 | unsigned int bytes; | ||
| 672 | int ret; | ||
| 673 | |||
| 674 | ret = kstrtouint(buf, 0, &bytes); | ||
| 675 | if (ret) | ||
| 676 | return ret; | ||
| 677 | |||
| 678 | if (bytes > (boot_cpu_data.x86_cache_size * 1024)) | ||
| 679 | return -EINVAL; | ||
| 680 | |||
| 681 | intel_cqm_threshold = bytes / r->mon_scale; | ||
| 682 | |||
| 683 | return nbytes; | ||
| 684 | } | ||
| 685 | |||
| 566 | /* rdtgroup information files for one cache resource. */ | 686 | /* rdtgroup information files for one cache resource. */ |
| 567 | static struct rftype res_cache_info_files[] = { | 687 | static struct rftype res_common_files[] = { |
| 568 | { | 688 | { |
| 569 | .name = "num_closids", | 689 | .name = "num_closids", |
| 570 | .mode = 0444, | 690 | .mode = 0444, |
| 571 | .kf_ops = &rdtgroup_kf_single_ops, | 691 | .kf_ops = &rdtgroup_kf_single_ops, |
| 572 | .seq_show = rdt_num_closids_show, | 692 | .seq_show = rdt_num_closids_show, |
| 693 | .fflags = RF_CTRL_INFO, | ||
| 694 | }, | ||
| 695 | { | ||
| 696 | .name = "mon_features", | ||
| 697 | .mode = 0444, | ||
| 698 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 699 | .seq_show = rdt_mon_features_show, | ||
| 700 | .fflags = RF_MON_INFO, | ||
| 701 | }, | ||
| 702 | { | ||
| 703 | .name = "num_rmids", | ||
| 704 | .mode = 0444, | ||
| 705 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 706 | .seq_show = rdt_num_rmids_show, | ||
| 707 | .fflags = RF_MON_INFO, | ||
| 573 | }, | 708 | }, |
| 574 | { | 709 | { |
| 575 | .name = "cbm_mask", | 710 | .name = "cbm_mask", |
| 576 | .mode = 0444, | 711 | .mode = 0444, |
| 577 | .kf_ops = &rdtgroup_kf_single_ops, | 712 | .kf_ops = &rdtgroup_kf_single_ops, |
| 578 | .seq_show = rdt_default_ctrl_show, | 713 | .seq_show = rdt_default_ctrl_show, |
| 714 | .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, | ||
| 579 | }, | 715 | }, |
| 580 | { | 716 | { |
| 581 | .name = "min_cbm_bits", | 717 | .name = "min_cbm_bits", |
| 582 | .mode = 0444, | 718 | .mode = 0444, |
| 583 | .kf_ops = &rdtgroup_kf_single_ops, | 719 | .kf_ops = &rdtgroup_kf_single_ops, |
| 584 | .seq_show = rdt_min_cbm_bits_show, | 720 | .seq_show = rdt_min_cbm_bits_show, |
| 721 | .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, | ||
| 585 | }, | 722 | }, |
| 586 | }; | ||
| 587 | |||
| 588 | /* rdtgroup information files for memory bandwidth. */ | ||
| 589 | static struct rftype res_mba_info_files[] = { | ||
| 590 | { | 723 | { |
| 591 | .name = "num_closids", | 724 | .name = "shareable_bits", |
| 592 | .mode = 0444, | 725 | .mode = 0444, |
| 593 | .kf_ops = &rdtgroup_kf_single_ops, | 726 | .kf_ops = &rdtgroup_kf_single_ops, |
| 594 | .seq_show = rdt_num_closids_show, | 727 | .seq_show = rdt_shareable_bits_show, |
| 728 | .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, | ||
| 595 | }, | 729 | }, |
| 596 | { | 730 | { |
| 597 | .name = "min_bandwidth", | 731 | .name = "min_bandwidth", |
| 598 | .mode = 0444, | 732 | .mode = 0444, |
| 599 | .kf_ops = &rdtgroup_kf_single_ops, | 733 | .kf_ops = &rdtgroup_kf_single_ops, |
| 600 | .seq_show = rdt_min_bw_show, | 734 | .seq_show = rdt_min_bw_show, |
| 735 | .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, | ||
| 601 | }, | 736 | }, |
| 602 | { | 737 | { |
| 603 | .name = "bandwidth_gran", | 738 | .name = "bandwidth_gran", |
| 604 | .mode = 0444, | 739 | .mode = 0444, |
| 605 | .kf_ops = &rdtgroup_kf_single_ops, | 740 | .kf_ops = &rdtgroup_kf_single_ops, |
| 606 | .seq_show = rdt_bw_gran_show, | 741 | .seq_show = rdt_bw_gran_show, |
| 742 | .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, | ||
| 607 | }, | 743 | }, |
| 608 | { | 744 | { |
| 609 | .name = "delay_linear", | 745 | .name = "delay_linear", |
| 610 | .mode = 0444, | 746 | .mode = 0444, |
| 611 | .kf_ops = &rdtgroup_kf_single_ops, | 747 | .kf_ops = &rdtgroup_kf_single_ops, |
| 612 | .seq_show = rdt_delay_linear_show, | 748 | .seq_show = rdt_delay_linear_show, |
| 749 | .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, | ||
| 750 | }, | ||
| 751 | { | ||
| 752 | .name = "max_threshold_occupancy", | ||
| 753 | .mode = 0644, | ||
| 754 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 755 | .write = max_threshold_occ_write, | ||
| 756 | .seq_show = max_threshold_occ_show, | ||
| 757 | .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, | ||
| 758 | }, | ||
| 759 | { | ||
| 760 | .name = "cpus", | ||
| 761 | .mode = 0644, | ||
| 762 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 763 | .write = rdtgroup_cpus_write, | ||
| 764 | .seq_show = rdtgroup_cpus_show, | ||
| 765 | .fflags = RFTYPE_BASE, | ||
| 766 | }, | ||
| 767 | { | ||
| 768 | .name = "cpus_list", | ||
| 769 | .mode = 0644, | ||
| 770 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 771 | .write = rdtgroup_cpus_write, | ||
| 772 | .seq_show = rdtgroup_cpus_show, | ||
| 773 | .flags = RFTYPE_FLAGS_CPUS_LIST, | ||
| 774 | .fflags = RFTYPE_BASE, | ||
| 775 | }, | ||
| 776 | { | ||
| 777 | .name = "tasks", | ||
| 778 | .mode = 0644, | ||
| 779 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 780 | .write = rdtgroup_tasks_write, | ||
| 781 | .seq_show = rdtgroup_tasks_show, | ||
| 782 | .fflags = RFTYPE_BASE, | ||
| 783 | }, | ||
| 784 | { | ||
| 785 | .name = "schemata", | ||
| 786 | .mode = 0644, | ||
| 787 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 788 | .write = rdtgroup_schemata_write, | ||
| 789 | .seq_show = rdtgroup_schemata_show, | ||
| 790 | .fflags = RF_CTRL_BASE, | ||
| 613 | }, | 791 | }, |
| 614 | }; | 792 | }; |
| 615 | 793 | ||
| 616 | void rdt_get_mba_infofile(struct rdt_resource *r) | 794 | static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) |
| 617 | { | 795 | { |
| 618 | r->info_files = res_mba_info_files; | 796 | struct rftype *rfts, *rft; |
| 619 | r->nr_info_files = ARRAY_SIZE(res_mba_info_files); | 797 | int ret, len; |
| 798 | |||
| 799 | rfts = res_common_files; | ||
| 800 | len = ARRAY_SIZE(res_common_files); | ||
| 801 | |||
| 802 | lockdep_assert_held(&rdtgroup_mutex); | ||
| 803 | |||
| 804 | for (rft = rfts; rft < rfts + len; rft++) { | ||
| 805 | if ((fflags & rft->fflags) == rft->fflags) { | ||
| 806 | ret = rdtgroup_add_file(kn, rft); | ||
| 807 | if (ret) | ||
| 808 | goto error; | ||
| 809 | } | ||
| 810 | } | ||
| 811 | |||
| 812 | return 0; | ||
| 813 | error: | ||
| 814 | pr_warn("Failed to add %s, err=%d\n", rft->name, ret); | ||
| 815 | while (--rft >= rfts) { | ||
| 816 | if ((fflags & rft->fflags) == rft->fflags) | ||
| 817 | kernfs_remove_by_name(kn, rft->name); | ||
| 818 | } | ||
| 819 | return ret; | ||
| 620 | } | 820 | } |
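The filter in rdtgroup_add_files() is a subset test: a file is instantiated only when every flag it declares in rft->fflags is also present in the fflags the caller passes in. A tiny illustration with made-up flag values (the real RF_*/RFTYPE_* constants are defined elsewhere in the patch):

#include <stdbool.h>
#include <stdio.h>

#define F_CTRL_INFO	0x1	/* illustrative values, not the kernel's */
#define F_MON_INFO	0x2
#define F_RES_CACHE	0x4

static bool file_wanted(unsigned long caller, unsigned long file)
{
	/* Create the file only if the caller provides every required flag. */
	return (caller & file) == file;
}

int main(void)
{
	unsigned long caller = F_CTRL_INFO | F_RES_CACHE;   /* e.g. an L3 info dir */

	printf("%d\n", file_wanted(caller, F_CTRL_INFO));		 /* 1 */
	printf("%d\n", file_wanted(caller, F_CTRL_INFO | F_RES_CACHE));  /* 1 */
	printf("%d\n", file_wanted(caller, F_MON_INFO));		 /* 0 */
	return 0;
}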
| 621 | 821 | ||
| 622 | void rdt_get_cache_infofile(struct rdt_resource *r) | 822 | static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, |
| 823 | unsigned long fflags) | ||
| 623 | { | 824 | { |
| 624 | r->info_files = res_cache_info_files; | 825 | struct kernfs_node *kn_subdir; |
| 625 | r->nr_info_files = ARRAY_SIZE(res_cache_info_files); | 826 | int ret; |
| 827 | |||
| 828 | kn_subdir = kernfs_create_dir(kn_info, name, | ||
| 829 | kn_info->mode, r); | ||
| 830 | if (IS_ERR(kn_subdir)) | ||
| 831 | return PTR_ERR(kn_subdir); | ||
| 832 | |||
| 833 | kernfs_get(kn_subdir); | ||
| 834 | ret = rdtgroup_kn_set_ugid(kn_subdir); | ||
| 835 | if (ret) | ||
| 836 | return ret; | ||
| 837 | |||
| 838 | ret = rdtgroup_add_files(kn_subdir, fflags); | ||
| 839 | if (!ret) | ||
| 840 | kernfs_activate(kn_subdir); | ||
| 841 | |||
| 842 | return ret; | ||
| 626 | } | 843 | } |
| 627 | 844 | ||
| 628 | static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) | 845 | static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) |
| 629 | { | 846 | { |
| 630 | struct kernfs_node *kn_subdir; | ||
| 631 | struct rftype *res_info_files; | ||
| 632 | struct rdt_resource *r; | 847 | struct rdt_resource *r; |
| 633 | int ret, len; | 848 | unsigned long fflags; |
| 849 | char name[32]; | ||
| 850 | int ret; | ||
| 634 | 851 | ||
| 635 | /* create the directory */ | 852 | /* create the directory */ |
| 636 | kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); | 853 | kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); |
| @@ -638,25 +855,19 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) | |||
| 638 | return PTR_ERR(kn_info); | 855 | return PTR_ERR(kn_info); |
| 639 | kernfs_get(kn_info); | 856 | kernfs_get(kn_info); |
| 640 | 857 | ||
| 641 | for_each_enabled_rdt_resource(r) { | 858 | for_each_alloc_enabled_rdt_resource(r) { |
| 642 | kn_subdir = kernfs_create_dir(kn_info, r->name, | 859 | fflags = r->fflags | RF_CTRL_INFO; |
| 643 | kn_info->mode, r); | 860 | ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); |
| 644 | if (IS_ERR(kn_subdir)) { | ||
| 645 | ret = PTR_ERR(kn_subdir); | ||
| 646 | goto out_destroy; | ||
| 647 | } | ||
| 648 | kernfs_get(kn_subdir); | ||
| 649 | ret = rdtgroup_kn_set_ugid(kn_subdir); | ||
| 650 | if (ret) | 861 | if (ret) |
| 651 | goto out_destroy; | 862 | goto out_destroy; |
| 863 | } | ||
| 652 | 864 | ||
| 653 | res_info_files = r->info_files; | 865 | for_each_mon_enabled_rdt_resource(r) { |
| 654 | len = r->nr_info_files; | 866 | fflags = r->fflags | RF_MON_INFO; |
| 655 | 867 | sprintf(name, "%s_MON", r->name); | |
| 656 | ret = rdtgroup_add_files(kn_subdir, res_info_files, len); | 868 | ret = rdtgroup_mkdir_info_resdir(r, name, fflags); |
| 657 | if (ret) | 869 | if (ret) |
| 658 | goto out_destroy; | 870 | goto out_destroy; |
| 659 | kernfs_activate(kn_subdir); | ||
| 660 | } | 871 | } |
| 661 | 872 | ||
| 662 | /* | 873 | /* |
| @@ -678,6 +889,39 @@ out_destroy: | |||
| 678 | return ret; | 889 | return ret; |
| 679 | } | 890 | } |
| 680 | 891 | ||
| 892 | static int | ||
| 893 | mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, | ||
| 894 | char *name, struct kernfs_node **dest_kn) | ||
| 895 | { | ||
| 896 | struct kernfs_node *kn; | ||
| 897 | int ret; | ||
| 898 | |||
| 899 | /* create the directory */ | ||
| 900 | kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); | ||
| 901 | if (IS_ERR(kn)) | ||
| 902 | return PTR_ERR(kn); | ||
| 903 | |||
| 904 | if (dest_kn) | ||
| 905 | *dest_kn = kn; | ||
| 906 | |||
| 907 | /* | ||
| 908 | * This extra ref will be put in kernfs_remove() and guarantees | ||
| 909 | * that @rdtgrp->kn is always accessible. | ||
| 910 | */ | ||
| 911 | kernfs_get(kn); | ||
| 912 | |||
| 913 | ret = rdtgroup_kn_set_ugid(kn); | ||
| 914 | if (ret) | ||
| 915 | goto out_destroy; | ||
| 916 | |||
| 917 | kernfs_activate(kn); | ||
| 918 | |||
| 919 | return 0; | ||
| 920 | |||
| 921 | out_destroy: | ||
| 922 | kernfs_remove(kn); | ||
| 923 | return ret; | ||
| 924 | } | ||
| 681 | static void l3_qos_cfg_update(void *arg) | 925 | static void l3_qos_cfg_update(void *arg) |
| 682 | { | 926 | { |
| 683 | bool *enable = arg; | 927 | bool *enable = arg; |
| @@ -718,14 +962,15 @@ static int cdp_enable(void) | |||
| 718 | struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; | 962 | struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; |
| 719 | int ret; | 963 | int ret; |
| 720 | 964 | ||
| 721 | if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) | 965 | if (!r_l3->alloc_capable || !r_l3data->alloc_capable || |
| 966 | !r_l3code->alloc_capable) | ||
| 722 | return -EINVAL; | 967 | return -EINVAL; |
| 723 | 968 | ||
| 724 | ret = set_l3_qos_cfg(r_l3, true); | 969 | ret = set_l3_qos_cfg(r_l3, true); |
| 725 | if (!ret) { | 970 | if (!ret) { |
| 726 | r_l3->enabled = false; | 971 | r_l3->alloc_enabled = false; |
| 727 | r_l3data->enabled = true; | 972 | r_l3data->alloc_enabled = true; |
| 728 | r_l3code->enabled = true; | 973 | r_l3code->alloc_enabled = true; |
| 729 | } | 974 | } |
| 730 | return ret; | 975 | return ret; |
| 731 | } | 976 | } |
| @@ -734,11 +979,11 @@ static void cdp_disable(void) | |||
| 734 | { | 979 | { |
| 735 | struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; | 980 | struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; |
| 736 | 981 | ||
| 737 | r->enabled = r->capable; | 982 | r->alloc_enabled = r->alloc_capable; |
| 738 | 983 | ||
| 739 | if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { | 984 | if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) { |
| 740 | rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; | 985 | rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false; |
| 741 | rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; | 986 | rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false; |
| 742 | set_l3_qos_cfg(r, false); | 987 | set_l3_qos_cfg(r, false); |
| 743 | } | 988 | } |
| 744 | } | 989 | } |
| @@ -823,10 +1068,16 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) | |||
| 823 | } | 1068 | } |
| 824 | } | 1069 | } |
| 825 | 1070 | ||
| 1071 | static int mkdir_mondata_all(struct kernfs_node *parent_kn, | ||
| 1072 | struct rdtgroup *prgrp, | ||
| 1073 | struct kernfs_node **mon_data_kn); | ||
| 1074 | |||
| 826 | static struct dentry *rdt_mount(struct file_system_type *fs_type, | 1075 | static struct dentry *rdt_mount(struct file_system_type *fs_type, |
| 827 | int flags, const char *unused_dev_name, | 1076 | int flags, const char *unused_dev_name, |
| 828 | void *data) | 1077 | void *data) |
| 829 | { | 1078 | { |
| 1079 | struct rdt_domain *dom; | ||
| 1080 | struct rdt_resource *r; | ||
| 830 | struct dentry *dentry; | 1081 | struct dentry *dentry; |
| 831 | int ret; | 1082 | int ret; |
| 832 | 1083 | ||
| @@ -853,15 +1104,54 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, | |||
| 853 | goto out_cdp; | 1104 | goto out_cdp; |
| 854 | } | 1105 | } |
| 855 | 1106 | ||
| 1107 | if (rdt_mon_capable) { | ||
| 1108 | ret = mongroup_create_dir(rdtgroup_default.kn, | ||
| 1109 | NULL, "mon_groups", | ||
| 1110 | &kn_mongrp); | ||
| 1111 | if (ret) { | ||
| 1112 | dentry = ERR_PTR(ret); | ||
| 1113 | goto out_info; | ||
| 1114 | } | ||
| 1115 | kernfs_get(kn_mongrp); | ||
| 1116 | |||
| 1117 | ret = mkdir_mondata_all(rdtgroup_default.kn, | ||
| 1118 | &rdtgroup_default, &kn_mondata); | ||
| 1119 | if (ret) { | ||
| 1120 | dentry = ERR_PTR(ret); | ||
| 1121 | goto out_mongrp; | ||
| 1122 | } | ||
| 1123 | kernfs_get(kn_mondata); | ||
| 1124 | rdtgroup_default.mon.mon_data_kn = kn_mondata; | ||
| 1125 | } | ||
| 1126 | |||
| 856 | dentry = kernfs_mount(fs_type, flags, rdt_root, | 1127 | dentry = kernfs_mount(fs_type, flags, rdt_root, |
| 857 | RDTGROUP_SUPER_MAGIC, NULL); | 1128 | RDTGROUP_SUPER_MAGIC, NULL); |
| 858 | if (IS_ERR(dentry)) | 1129 | if (IS_ERR(dentry)) |
| 859 | goto out_destroy; | 1130 | goto out_mondata; |
| 1131 | |||
| 1132 | if (rdt_alloc_capable) | ||
| 1133 | static_branch_enable(&rdt_alloc_enable_key); | ||
| 1134 | if (rdt_mon_capable) | ||
| 1135 | static_branch_enable(&rdt_mon_enable_key); | ||
| 1136 | |||
| 1137 | if (rdt_alloc_capable || rdt_mon_capable) | ||
| 1138 | static_branch_enable(&rdt_enable_key); | ||
| 1139 | |||
| 1140 | if (is_mbm_enabled()) { | ||
| 1141 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | ||
| 1142 | list_for_each_entry(dom, &r->domains, list) | ||
| 1143 | mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); | ||
| 1144 | } | ||
| 860 | 1145 | ||
| 861 | static_branch_enable(&rdt_enable_key); | ||
| 862 | goto out; | 1146 | goto out; |
| 863 | 1147 | ||
| 864 | out_destroy: | 1148 | out_mondata: |
| 1149 | if (rdt_mon_capable) | ||
| 1150 | kernfs_remove(kn_mondata); | ||
| 1151 | out_mongrp: | ||
| 1152 | if (rdt_mon_capable) | ||
| 1153 | kernfs_remove(kn_mongrp); | ||
| 1154 | out_info: | ||
| 865 | kernfs_remove(kn_info); | 1155 | kernfs_remove(kn_info); |
| 866 | out_cdp: | 1156 | out_cdp: |
| 867 | cdp_disable(); | 1157 | cdp_disable(); |
| @@ -909,6 +1199,18 @@ static int reset_all_ctrls(struct rdt_resource *r) | |||
| 909 | return 0; | 1199 | return 0; |
| 910 | } | 1200 | } |
| 911 | 1201 | ||
| 1202 | static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) | ||
| 1203 | { | ||
| 1204 | return (rdt_alloc_capable && | ||
| 1205 | (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); | ||
| 1206 | } | ||
| 1207 | |||
| 1208 | static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) | ||
| 1209 | { | ||
| 1210 | return (rdt_mon_capable && | ||
| 1211 | (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); | ||
| 1212 | } | ||
| 1213 | |||
| 912 | /* | 1214 | /* |
| 913 | * Move tasks from one to the other group. If @from is NULL, then all tasks | 1215 | * Move tasks from one to the other group. If @from is NULL, then all tasks |
| 914 | * in the systems are moved unconditionally (used for teardown). | 1216 | * in the systems are moved unconditionally (used for teardown). |
| @@ -924,8 +1226,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, | |||
| 924 | 1226 | ||
| 925 | read_lock(&tasklist_lock); | 1227 | read_lock(&tasklist_lock); |
| 926 | for_each_process_thread(p, t) { | 1228 | for_each_process_thread(p, t) { |
| 927 | if (!from || t->closid == from->closid) { | 1229 | if (!from || is_closid_match(t, from) || |
| 1230 | is_rmid_match(t, from)) { | ||
| 928 | t->closid = to->closid; | 1231 | t->closid = to->closid; |
| 1232 | t->rmid = to->mon.rmid; | ||
| 1233 | |||
| 929 | #ifdef CONFIG_SMP | 1234 | #ifdef CONFIG_SMP |
| 930 | /* | 1235 | /* |
| 931 | * This is safe on x86 w/o barriers as the ordering | 1236 | * This is safe on x86 w/o barriers as the ordering |
| @@ -944,6 +1249,19 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, | |||
| 944 | read_unlock(&tasklist_lock); | 1249 | read_unlock(&tasklist_lock); |
| 945 | } | 1250 | } |
| 946 | 1251 | ||
| 1252 | static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) | ||
| 1253 | { | ||
| 1254 | struct rdtgroup *sentry, *stmp; | ||
| 1255 | struct list_head *head; | ||
| 1256 | |||
| 1257 | head = &rdtgrp->mon.crdtgrp_list; | ||
| 1258 | list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { | ||
| 1259 | free_rmid(sentry->mon.rmid); | ||
| 1260 | list_del(&sentry->mon.crdtgrp_list); | ||
| 1261 | kfree(sentry); | ||
| 1262 | } | ||
| 1263 | } | ||
| 1264 | |||
| 947 | /* | 1265 | /* |
| 948 | * Forcibly remove all of subdirectories under root. | 1266 | * Forcibly remove all of subdirectories under root. |
| 949 | */ | 1267 | */ |
| @@ -955,6 +1273,9 @@ static void rmdir_all_sub(void) | |||
| 955 | rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); | 1273 | rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); |
| 956 | 1274 | ||
| 957 | list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { | 1275 | list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { |
| 1276 | /* Free any child rmids */ | ||
| 1277 | free_all_child_rdtgrp(rdtgrp); | ||
| 1278 | |||
| 958 | /* Remove each rdtgroup other than root */ | 1279 | /* Remove each rdtgroup other than root */ |
| 959 | if (rdtgrp == &rdtgroup_default) | 1280 | if (rdtgrp == &rdtgroup_default) |
| 960 | continue; | 1281 | continue; |
| @@ -967,16 +1288,20 @@ static void rmdir_all_sub(void) | |||
| 967 | cpumask_or(&rdtgroup_default.cpu_mask, | 1288 | cpumask_or(&rdtgroup_default.cpu_mask, |
| 968 | &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); | 1289 | &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); |
| 969 | 1290 | ||
| 1291 | free_rmid(rdtgrp->mon.rmid); | ||
| 1292 | |||
| 970 | kernfs_remove(rdtgrp->kn); | 1293 | kernfs_remove(rdtgrp->kn); |
| 971 | list_del(&rdtgrp->rdtgroup_list); | 1294 | list_del(&rdtgrp->rdtgroup_list); |
| 972 | kfree(rdtgrp); | 1295 | kfree(rdtgrp); |
| 973 | } | 1296 | } |
| 974 | /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ | 1297 | /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ |
| 975 | get_online_cpus(); | 1298 | get_online_cpus(); |
| 976 | rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); | 1299 | update_closid_rmid(cpu_online_mask, &rdtgroup_default); |
| 977 | put_online_cpus(); | 1300 | put_online_cpus(); |
| 978 | 1301 | ||
| 979 | kernfs_remove(kn_info); | 1302 | kernfs_remove(kn_info); |
| 1303 | kernfs_remove(kn_mongrp); | ||
| 1304 | kernfs_remove(kn_mondata); | ||
| 980 | } | 1305 | } |
| 981 | 1306 | ||
| 982 | static void rdt_kill_sb(struct super_block *sb) | 1307 | static void rdt_kill_sb(struct super_block *sb) |
| @@ -986,10 +1311,12 @@ static void rdt_kill_sb(struct super_block *sb) | |||
| 986 | mutex_lock(&rdtgroup_mutex); | 1311 | mutex_lock(&rdtgroup_mutex); |
| 987 | 1312 | ||
| 988 | /* Put everything back to default values. */ | 1313 |
| 989 | for_each_enabled_rdt_resource(r) | 1314 | for_each_alloc_enabled_rdt_resource(r) |
| 990 | reset_all_ctrls(r); | 1315 | reset_all_ctrls(r); |
| 991 | cdp_disable(); | 1316 | cdp_disable(); |
| 992 | rmdir_all_sub(); | 1317 | rmdir_all_sub(); |
| 1318 | static_branch_disable(&rdt_alloc_enable_key); | ||
| 1319 | static_branch_disable(&rdt_mon_enable_key); | ||
| 993 | static_branch_disable(&rdt_enable_key); | 1320 | static_branch_disable(&rdt_enable_key); |
| 994 | kernfs_kill_sb(sb); | 1321 | kernfs_kill_sb(sb); |
| 995 | mutex_unlock(&rdtgroup_mutex); | 1322 | mutex_unlock(&rdtgroup_mutex); |
| @@ -1001,46 +1328,223 @@ static struct file_system_type rdt_fs_type = { | |||
| 1001 | .kill_sb = rdt_kill_sb, | 1328 | .kill_sb = rdt_kill_sb, |
| 1002 | }; | 1329 | }; |
| 1003 | 1330 | ||
| 1004 | static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | 1331 | static int mon_addfile(struct kernfs_node *parent_kn, const char *name, |
| 1005 | umode_t mode) | 1332 | void *priv) |
| 1006 | { | 1333 | { |
| 1007 | struct rdtgroup *parent, *rdtgrp; | ||
| 1008 | struct kernfs_node *kn; | 1334 | struct kernfs_node *kn; |
| 1009 | int ret, closid; | 1335 | int ret = 0; |
| 1010 | 1336 | ||
| 1011 | /* Only allow mkdir in the root directory */ | 1337 | kn = __kernfs_create_file(parent_kn, name, 0444, 0, |
| 1012 | if (parent_kn != rdtgroup_default.kn) | 1338 | &kf_mondata_ops, priv, NULL, NULL); |
| 1013 | return -EPERM; | 1339 | if (IS_ERR(kn)) |
| 1340 | return PTR_ERR(kn); | ||
| 1014 | 1341 | ||
| 1015 | /* Do not accept '\n' to avoid unparsable situation. */ | 1342 | ret = rdtgroup_kn_set_ugid(kn); |
| 1016 | if (strchr(name, '\n')) | 1343 | if (ret) { |
| 1017 | return -EINVAL; | 1344 | kernfs_remove(kn); |
| 1345 | return ret; | ||
| 1346 | } | ||
| 1018 | 1347 | ||
| 1019 | parent = rdtgroup_kn_lock_live(parent_kn); | 1348 | return ret; |
| 1020 | if (!parent) { | 1349 | } |
| 1021 | ret = -ENODEV; | 1350 | |
| 1022 | goto out_unlock; | 1351 | /* |
| 1352 | * Remove all subdirectories of mon_data of ctrl_mon groups | ||
| 1353 | * and monitor groups with given domain id. | ||
| 1354 | */ | ||
| 1355 | void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id) | ||
| 1356 | { | ||
| 1357 | struct rdtgroup *prgrp, *crgrp; | ||
| 1358 | char name[32]; | ||
| 1359 | |||
| 1360 | if (!r->mon_enabled) | ||
| 1361 | return; | ||
| 1362 | |||
| 1363 | list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { | ||
| 1364 | sprintf(name, "mon_%s_%02d", r->name, dom_id); | ||
| 1365 | kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); | ||
| 1366 | |||
| 1367 | list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) | ||
| 1368 | kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); | ||
| 1023 | } | 1369 | } |
| 1370 | } | ||
| 1024 | 1371 | ||
| 1025 | ret = closid_alloc(); | 1372 | static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, |
| 1026 | if (ret < 0) | 1373 | struct rdt_domain *d, |
| 1374 | struct rdt_resource *r, struct rdtgroup *prgrp) | ||
| 1375 | { | ||
| 1376 | union mon_data_bits priv; | ||
| 1377 | struct kernfs_node *kn; | ||
| 1378 | struct mon_evt *mevt; | ||
| 1379 | struct rmid_read rr; | ||
| 1380 | char name[32]; | ||
| 1381 | int ret; | ||
| 1382 | |||
| 1383 | sprintf(name, "mon_%s_%02d", r->name, d->id); | ||
| 1384 | /* create the directory */ | ||
| 1385 | kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); | ||
| 1386 | if (IS_ERR(kn)) | ||
| 1387 | return PTR_ERR(kn); | ||
| 1388 | |||
| 1389 | /* | ||
| 1390 | * This extra ref will be put in kernfs_remove() and guarantees | ||
| 1391 | * that kn is always accessible. | ||
| 1392 | */ | ||
| 1393 | kernfs_get(kn); | ||
| 1394 | ret = rdtgroup_kn_set_ugid(kn); | ||
| 1395 | if (ret) | ||
| 1396 | goto out_destroy; | ||
| 1397 | |||
| 1398 | if (WARN_ON(list_empty(&r->evt_list))) { | ||
| 1399 | ret = -EPERM; | ||
| 1400 | goto out_destroy; | ||
| 1401 | } | ||
| 1402 | |||
| 1403 | priv.u.rid = r->rid; | ||
| 1404 | priv.u.domid = d->id; | ||
| 1405 | list_for_each_entry(mevt, &r->evt_list, list) { | ||
| 1406 | priv.u.evtid = mevt->evtid; | ||
| 1407 | ret = mon_addfile(kn, mevt->name, priv.priv); | ||
| 1408 | if (ret) | ||
| 1409 | goto out_destroy; | ||
| 1410 | |||
| 1411 | if (is_mbm_event(mevt->evtid)) | ||
| 1412 | mon_event_read(&rr, d, prgrp, mevt->evtid, true); | ||
| 1413 | } | ||
| 1414 | kernfs_activate(kn); | ||
| 1415 | return 0; | ||
| 1416 | |||
| 1417 | out_destroy: | ||
| 1418 | kernfs_remove(kn); | ||
| 1419 | return ret; | ||
| 1420 | } | ||
| 1421 | |||
| 1422 | /* | ||
| 1423 | * Add all subdirectories of mon_data for "ctrl_mon" groups | ||
| 1424 | * and "monitor" groups with given domain id. | ||
| 1425 | */ | ||
| 1426 | void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, | ||
| 1427 | struct rdt_domain *d) | ||
| 1428 | { | ||
| 1429 | struct kernfs_node *parent_kn; | ||
| 1430 | struct rdtgroup *prgrp, *crgrp; | ||
| 1431 | struct list_head *head; | ||
| 1432 | |||
| 1433 | if (!r->mon_enabled) | ||
| 1434 | return; | ||
| 1435 | |||
| 1436 | list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { | ||
| 1437 | parent_kn = prgrp->mon.mon_data_kn; | ||
| 1438 | mkdir_mondata_subdir(parent_kn, d, r, prgrp); | ||
| 1439 | |||
| 1440 | head = &prgrp->mon.crdtgrp_list; | ||
| 1441 | list_for_each_entry(crgrp, head, mon.crdtgrp_list) { | ||
| 1442 | parent_kn = crgrp->mon.mon_data_kn; | ||
| 1443 | mkdir_mondata_subdir(parent_kn, d, r, crgrp); | ||
| 1444 | } | ||
| 1445 | } | ||
| 1446 | } | ||
| 1447 | |||
| 1448 | static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, | ||
| 1449 | struct rdt_resource *r, | ||
| 1450 | struct rdtgroup *prgrp) | ||
| 1451 | { | ||
| 1452 | struct rdt_domain *dom; | ||
| 1453 | int ret; | ||
| 1454 | |||
| 1455 | list_for_each_entry(dom, &r->domains, list) { | ||
| 1456 | ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); | ||
| 1457 | if (ret) | ||
| 1458 | return ret; | ||
| 1459 | } | ||
| 1460 | |||
| 1461 | return 0; | ||
| 1462 | } | ||
| 1463 | |||
| 1464 | /* | ||
| 1465 | * This creates a directory mon_data which contains the monitored data. | ||
| 1466 | * | ||
| 1467 | * mon_data has one directory for each domain, named in the | ||
| 1468 | * format mon_<domain_name>_<domain_id>. For example, a mon_data | ||
| 1469 | * directory with L3 domains looks like this: | ||
| 1470 | * ./mon_data: | ||
| 1471 | * mon_L3_00 | ||
| 1472 | * mon_L3_01 | ||
| 1473 | * mon_L3_02 | ||
| 1474 | * ... | ||
| 1475 | * | ||
| 1476 | * Each domain directory has one file per event: | ||
| 1477 | * ./mon_L3_00/: | ||
| 1478 | * llc_occupancy | ||
| 1479 | * | ||
| 1480 | */ | ||
| 1481 | static int mkdir_mondata_all(struct kernfs_node *parent_kn, | ||
| 1482 | struct rdtgroup *prgrp, | ||
| 1483 | struct kernfs_node **dest_kn) | ||
| 1484 | { | ||
| 1485 | struct rdt_resource *r; | ||
| 1486 | struct kernfs_node *kn; | ||
| 1487 | int ret; | ||
| 1488 | |||
| 1489 | /* | ||
| 1490 | * Create the mon_data directory first. | ||
| 1491 | */ | ||
| 1492 | ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn); | ||
| 1493 | if (ret) | ||
| 1494 | return ret; | ||
| 1495 | |||
| 1496 | if (dest_kn) | ||
| 1497 | *dest_kn = kn; | ||
| 1498 | |||
| 1499 | /* | ||
| 1500 | * Create the subdirectories for each domain. Note that all events | ||
| 1501 | * in a domain like L3 are grouped into a resource whose domain is L3 | ||
| 1502 | */ | ||
| 1503 | for_each_mon_enabled_rdt_resource(r) { | ||
| 1504 | ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); | ||
| 1505 | if (ret) | ||
| 1506 | goto out_destroy; | ||
| 1507 | } | ||
| 1508 | |||
| 1509 | return 0; | ||
| 1510 | |||
| 1511 | out_destroy: | ||
| 1512 | kernfs_remove(kn); | ||
| 1513 | return ret; | ||
| 1514 | } | ||
| 1515 | |||
| 1516 | static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, | ||
| 1517 | struct kernfs_node *prgrp_kn, | ||
| 1518 | const char *name, umode_t mode, | ||
| 1519 | enum rdt_group_type rtype, struct rdtgroup **r) | ||
| 1520 | { | ||
| 1521 | struct rdtgroup *prdtgrp, *rdtgrp; | ||
| 1522 | struct kernfs_node *kn; | ||
| 1523 | uint files = 0; | ||
| 1524 | int ret; | ||
| 1525 | |||
| 1526 | prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); | ||
| 1527 | if (!prdtgrp) { | ||
| 1528 | ret = -ENODEV; | ||
| 1027 | goto out_unlock; | 1529 | goto out_unlock; |
| 1028 | closid = ret; | 1530 | } |
| 1029 | 1531 | ||
| 1030 | /* allocate the rdtgroup. */ | 1532 | /* allocate the rdtgroup. */ |
| 1031 | rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); | 1533 | rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); |
| 1032 | if (!rdtgrp) { | 1534 | if (!rdtgrp) { |
| 1033 | ret = -ENOSPC; | 1535 | ret = -ENOSPC; |
| 1034 | goto out_closid_free; | 1536 | goto out_unlock; |
| 1035 | } | 1537 | } |
| 1036 | rdtgrp->closid = closid; | 1538 | *r = rdtgrp; |
| 1037 | list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); | 1539 | rdtgrp->mon.parent = prdtgrp; |
| 1540 | rdtgrp->type = rtype; | ||
| 1541 | INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); | ||
| 1038 | 1542 | ||
| 1039 | /* kernfs creates the directory for rdtgrp */ | 1543 | /* kernfs creates the directory for rdtgrp */ |
| 1040 | kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); | 1544 | kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); |
| 1041 | if (IS_ERR(kn)) { | 1545 | if (IS_ERR(kn)) { |
| 1042 | ret = PTR_ERR(kn); | 1546 | ret = PTR_ERR(kn); |
| 1043 | goto out_cancel_ref; | 1547 | goto out_free_rgrp; |
| 1044 | } | 1548 | } |
| 1045 | rdtgrp->kn = kn; | 1549 | rdtgrp->kn = kn; |
| 1046 | 1550 | ||
| @@ -1056,43 +1560,211 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
| 1056 | if (ret) | 1560 | if (ret) |
| 1057 | goto out_destroy; | 1561 | goto out_destroy; |
| 1058 | 1562 | ||
| 1059 | ret = rdtgroup_add_files(kn, rdtgroup_base_files, | 1563 | files = RFTYPE_BASE | RFTYPE_CTRL; |
| 1060 | ARRAY_SIZE(rdtgroup_base_files)); | 1564 | files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); |
| 1565 | ret = rdtgroup_add_files(kn, files); | ||
| 1061 | if (ret) | 1566 | if (ret) |
| 1062 | goto out_destroy; | 1567 | goto out_destroy; |
| 1063 | 1568 | ||
| 1569 | if (rdt_mon_capable) { | ||
| 1570 | ret = alloc_rmid(); | ||
| 1571 | if (ret < 0) | ||
| 1572 | goto out_destroy; | ||
| 1573 | rdtgrp->mon.rmid = ret; | ||
| 1574 | |||
| 1575 | ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); | ||
| 1576 | if (ret) | ||
| 1577 | goto out_idfree; | ||
| 1578 | } | ||
| 1064 | kernfs_activate(kn); | 1579 | kernfs_activate(kn); |
| 1065 | 1580 | ||
| 1066 | ret = 0; | 1581 | /* |
| 1067 | goto out_unlock; | 1582 | * The caller unlocks the prgrp_kn upon success. |
| 1583 | */ | ||
| 1584 | return 0; | ||
| 1068 | 1585 | ||
| 1586 | out_idfree: | ||
| 1587 | free_rmid(rdtgrp->mon.rmid); | ||
| 1069 | out_destroy: | 1588 | out_destroy: |
| 1070 | kernfs_remove(rdtgrp->kn); | 1589 | kernfs_remove(rdtgrp->kn); |
| 1071 | out_cancel_ref: | 1590 | out_free_rgrp: |
| 1072 | list_del(&rdtgrp->rdtgroup_list); | ||
| 1073 | kfree(rdtgrp); | 1591 | kfree(rdtgrp); |
| 1074 | out_closid_free: | ||
| 1075 | closid_free(closid); | ||
| 1076 | out_unlock: | 1592 | out_unlock: |
| 1077 | rdtgroup_kn_unlock(parent_kn); | 1593 | rdtgroup_kn_unlock(prgrp_kn); |
| 1078 | return ret; | 1594 | return ret; |
| 1079 | } | 1595 | } |
| 1080 | 1596 | ||
| 1081 | static int rdtgroup_rmdir(struct kernfs_node *kn) | 1597 | static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) |
| 1598 | { | ||
| 1599 | kernfs_remove(rgrp->kn); | ||
| 1600 | free_rmid(rgrp->mon.rmid); | ||
| 1601 | kfree(rgrp); | ||
| 1602 | } | ||
| 1603 | |||
| 1604 | /* | ||
| 1605 | * Create a monitor group under the "mon_groups" directory of a control | ||
| 1606 | * and monitor group (ctrl_mon). This is a resource group | ||
| 1607 | * to monitor a subset of tasks and cpus in its parent ctrl_mon group. | ||
| 1608 | */ | ||
| 1609 | static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, | ||
| 1610 | struct kernfs_node *prgrp_kn, | ||
| 1611 | const char *name, | ||
| 1612 | umode_t mode) | ||
| 1613 | { | ||
| 1614 | struct rdtgroup *rdtgrp, *prgrp; | ||
| 1615 | int ret; | ||
| 1616 | |||
| 1617 | ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP, | ||
| 1618 | &rdtgrp); | ||
| 1619 | if (ret) | ||
| 1620 | return ret; | ||
| 1621 | |||
| 1622 | prgrp = rdtgrp->mon.parent; | ||
| 1623 | rdtgrp->closid = prgrp->closid; | ||
| 1624 | |||
| 1625 | /* | ||
| 1626 | * Add the rdtgrp to the list of rdtgrps the parent | ||
| 1627 | * ctrl_mon group has to track. | ||
| 1628 | */ | ||
| 1629 | list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); | ||
| 1630 | |||
| 1631 | rdtgroup_kn_unlock(prgrp_kn); | ||
| 1632 | return ret; | ||
| 1633 | } | ||
| 1634 | |||
| 1635 | /* | ||
| 1636 | * These are rdtgroups created under the root directory. They can be used | ||
| 1637 | * to allocate and monitor resources. | ||
| 1638 | */ | ||
| 1639 | static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, | ||
| 1640 | struct kernfs_node *prgrp_kn, | ||
| 1641 | const char *name, umode_t mode) | ||
| 1082 | { | 1642 | { |
| 1083 | int ret, cpu, closid = rdtgroup_default.closid; | ||
| 1084 | struct rdtgroup *rdtgrp; | 1643 | struct rdtgroup *rdtgrp; |
| 1085 | cpumask_var_t tmpmask; | 1644 | struct kernfs_node *kn; |
| 1645 | u32 closid; | ||
| 1646 | int ret; | ||
| 1086 | 1647 | ||
| 1087 | if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) | 1648 | ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP, |
| 1088 | return -ENOMEM; | 1649 | &rdtgrp); |
| 1650 | if (ret) | ||
| 1651 | return ret; | ||
| 1089 | 1652 | ||
| 1090 | rdtgrp = rdtgroup_kn_lock_live(kn); | 1653 | kn = rdtgrp->kn; |
| 1091 | if (!rdtgrp) { | 1654 | ret = closid_alloc(); |
| 1092 | ret = -EPERM; | 1655 | if (ret < 0) |
| 1093 | goto out; | 1656 | goto out_common_fail; |
| 1657 | closid = ret; | ||
| 1658 | |||
| 1659 | rdtgrp->closid = closid; | ||
| 1660 | list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); | ||
| 1661 | |||
| 1662 | if (rdt_mon_capable) { | ||
| 1663 | /* | ||
| 1664 | * Create an empty mon_groups directory to hold the subset | ||
| 1665 | * of tasks and cpus to monitor. | ||
| 1666 | */ | ||
| 1667 | ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); | ||
| 1668 | if (ret) | ||
| 1669 | goto out_id_free; | ||
| 1094 | } | 1670 | } |
| 1095 | 1671 | ||
| 1672 | goto out_unlock; | ||
| 1673 | |||
| 1674 | out_id_free: | ||
| 1675 | closid_free(closid); | ||
| 1676 | list_del(&rdtgrp->rdtgroup_list); | ||
| 1677 | out_common_fail: | ||
| 1678 | mkdir_rdt_prepare_clean(rdtgrp); | ||
| 1679 | out_unlock: | ||
| 1680 | rdtgroup_kn_unlock(prgrp_kn); | ||
| 1681 | return ret; | ||
| 1682 | } | ||
| 1683 | |||
| 1684 | /* | ||
| 1685 | * We allow creating mon groups only within a directory called "mon_groups" | ||
| 1686 | * which is present in every ctrl_mon group. Check if this is a valid | ||
| 1687 | * "mon_groups" directory. | ||
| 1688 | * | ||
| 1689 | * 1. The directory should be named "mon_groups". | ||
| 1690 | * 2. The mon group itself should "not" be named "mon_groups". | ||
| 1691 | * This makes sure the "mon_groups" directory always has a ctrl_mon group | ||
| 1692 | * as parent. | ||
| 1693 | */ | ||
| 1694 | static bool is_mon_groups(struct kernfs_node *kn, const char *name) | ||
| 1695 | { | ||
| 1696 | return (!strcmp(kn->name, "mon_groups") && | ||
| 1697 | strcmp(name, "mon_groups")); | ||
| 1698 | } | ||
| 1699 | |||
| 1700 | static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | ||
| 1701 | umode_t mode) | ||
| 1702 | { | ||
| 1703 | /* Do not accept '\n' to avoid an unparsable situation. */ | ||
| 1704 | if (strchr(name, '\n')) | ||
| 1705 | return -EINVAL; | ||
| 1706 | |||
| 1707 | /* | ||
| 1708 | * If the parent directory is the root directory and RDT | ||
| 1709 | * allocation is supported, add a control and monitoring | ||
| 1710 | * subdirectory | ||
| 1711 | */ | ||
| 1712 | if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) | ||
| 1713 | return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode); | ||
| 1714 | |||
| 1715 | /* | ||
| 1716 | * If RDT monitoring is supported and the parent directory is a valid | ||
| 1717 | * "mon_groups" directory, add a monitoring subdirectory. | ||
| 1718 | */ | ||
| 1719 | if (rdt_mon_capable && is_mon_groups(parent_kn, name)) | ||
| 1720 | return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode); | ||
| 1721 | |||
| 1722 | return -EPERM; | ||
| 1723 | } | ||
| 1724 | |||
| 1725 | static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, | ||
| 1726 | cpumask_var_t tmpmask) | ||
| 1727 | { | ||
| 1728 | struct rdtgroup *prdtgrp = rdtgrp->mon.parent; | ||
| 1729 | int cpu; | ||
| 1730 | |||
| 1731 | /* Give any tasks back to the parent group */ | ||
| 1732 | rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); | ||
| 1733 | |||
| 1734 | /* Update per cpu rmid of the moved CPUs first */ | ||
| 1735 | for_each_cpu(cpu, &rdtgrp->cpu_mask) | ||
| 1736 | per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid; | ||
| 1737 | /* | ||
| 1738 | * Update the MSR on moved CPUs and CPUs which have moved | ||
| 1739 | * task running on them. | ||
| 1740 | */ | ||
| 1741 | cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); | ||
| 1742 | update_closid_rmid(tmpmask, NULL); | ||
| 1743 | |||
| 1744 | rdtgrp->flags = RDT_DELETED; | ||
| 1745 | free_rmid(rdtgrp->mon.rmid); | ||
| 1746 | |||
| 1747 | /* | ||
| 1748 | * Remove the rdtgrp from the parent ctrl_mon group's list | ||
| 1749 | */ | ||
| 1750 | WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); | ||
| 1751 | list_del(&rdtgrp->mon.crdtgrp_list); | ||
| 1752 | |||
| 1753 | /* | ||
| 1754 | * one extra hold on this, will drop when we kfree(rdtgrp) | ||
| 1755 | * in rdtgroup_kn_unlock() | ||
| 1756 | */ | ||
| 1757 | kernfs_get(kn); | ||
| 1758 | kernfs_remove(rdtgrp->kn); | ||
| 1759 | |||
| 1760 | return 0; | ||
| 1761 | } | ||
| 1762 | |||
| 1763 | static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, | ||
| 1764 | cpumask_var_t tmpmask) | ||
| 1765 | { | ||
| 1766 | int cpu; | ||
| 1767 | |||
| 1096 | /* Give any tasks back to the default group */ | 1768 | /* Give any tasks back to the default group */ |
| 1097 | rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); | 1769 | rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); |
| 1098 | 1770 | ||
| @@ -1100,18 +1772,28 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) | |||
| 1100 | cpumask_or(&rdtgroup_default.cpu_mask, | 1772 | cpumask_or(&rdtgroup_default.cpu_mask, |
| 1101 | &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); | 1773 | &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); |
| 1102 | 1774 | ||
| 1103 | /* Update per cpu closid of the moved CPUs first */ | 1775 | /* Update per cpu closid and rmid of the moved CPUs first */ |
| 1104 | for_each_cpu(cpu, &rdtgrp->cpu_mask) | 1776 | for_each_cpu(cpu, &rdtgrp->cpu_mask) { |
| 1105 | per_cpu(cpu_closid, cpu) = closid; | 1777 | per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid; |
| 1778 | per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid; | ||
| 1779 | } | ||
| 1780 | |||
| 1106 | /* | 1781 | /* |
| 1107 | * Update the MSR on moved CPUs and CPUs which have moved | 1782 | * Update the MSR on moved CPUs and CPUs which have moved |
| 1108 | * task running on them. | 1783 | * task running on them. |
| 1109 | */ | 1784 | */ |
| 1110 | cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); | 1785 | cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); |
| 1111 | rdt_update_closid(tmpmask, NULL); | 1786 | update_closid_rmid(tmpmask, NULL); |
| 1112 | 1787 | ||
| 1113 | rdtgrp->flags = RDT_DELETED; | 1788 | rdtgrp->flags = RDT_DELETED; |
| 1114 | closid_free(rdtgrp->closid); | 1789 | closid_free(rdtgrp->closid); |
| 1790 | free_rmid(rdtgrp->mon.rmid); | ||
| 1791 | |||
| 1792 | /* | ||
| 1793 | * Free all the child monitor group rmids. | ||
| 1794 | */ | ||
| 1795 | free_all_child_rdtgrp(rdtgrp); | ||
| 1796 | |||
| 1115 | list_del(&rdtgrp->rdtgroup_list); | 1797 | list_del(&rdtgrp->rdtgroup_list); |
| 1116 | 1798 | ||
| 1117 | /* | 1799 | /* |
| @@ -1120,7 +1802,41 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) | |||
| 1120 | */ | 1802 | */ |
| 1121 | kernfs_get(kn); | 1803 | kernfs_get(kn); |
| 1122 | kernfs_remove(rdtgrp->kn); | 1804 | kernfs_remove(rdtgrp->kn); |
| 1123 | ret = 0; | 1805 | |
| 1806 | return 0; | ||
| 1807 | } | ||
| 1808 | |||
| 1809 | static int rdtgroup_rmdir(struct kernfs_node *kn) | ||
| 1810 | { | ||
| 1811 | struct kernfs_node *parent_kn = kn->parent; | ||
| 1812 | struct rdtgroup *rdtgrp; | ||
| 1813 | cpumask_var_t tmpmask; | ||
| 1814 | int ret = 0; | ||
| 1815 | |||
| 1816 | if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) | ||
| 1817 | return -ENOMEM; | ||
| 1818 | |||
| 1819 | rdtgrp = rdtgroup_kn_lock_live(kn); | ||
| 1820 | if (!rdtgrp) { | ||
| 1821 | ret = -EPERM; | ||
| 1822 | goto out; | ||
| 1823 | } | ||
| 1824 | |||
| 1825 | /* | ||
| 1826 | * If the rdtgroup is a ctrl_mon group and parent directory | ||
| 1827 | * is the root directory, remove the ctrl_mon group. | ||
| 1828 | * | ||
| 1829 | * If the rdtgroup is a mon group and parent directory | ||
| 1830 | * is a valid "mon_groups" directory, remove the mon group. | ||
| 1831 | */ | ||
| 1832 | if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) | ||
| 1833 | ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); | ||
| 1834 | else if (rdtgrp->type == RDTMON_GROUP && | ||
| 1835 | is_mon_groups(parent_kn, kn->name)) | ||
| 1836 | ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); | ||
| 1837 | else | ||
| 1838 | ret = -EPERM; | ||
| 1839 | |||
| 1124 | out: | 1840 | out: |
| 1125 | rdtgroup_kn_unlock(kn); | 1841 | rdtgroup_kn_unlock(kn); |
| 1126 | free_cpumask_var(tmpmask); | 1842 | free_cpumask_var(tmpmask); |
| @@ -1129,7 +1845,7 @@ out: | |||
| 1129 | 1845 | ||
| 1130 | static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) | 1846 | static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) |
| 1131 | { | 1847 | { |
| 1132 | if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) | 1848 | if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) |
| 1133 | seq_puts(seq, ",cdp"); | 1849 | seq_puts(seq, ",cdp"); |
| 1134 | return 0; | 1850 | return 0; |
| 1135 | } | 1851 | } |
| @@ -1153,10 +1869,13 @@ static int __init rdtgroup_setup_root(void) | |||
| 1153 | mutex_lock(&rdtgroup_mutex); | 1869 | mutex_lock(&rdtgroup_mutex); |
| 1154 | 1870 | ||
| 1155 | rdtgroup_default.closid = 0; | 1871 | rdtgroup_default.closid = 0; |
| 1872 | rdtgroup_default.mon.rmid = 0; | ||
| 1873 | rdtgroup_default.type = RDTCTRL_GROUP; | ||
| 1874 | INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); | ||
| 1875 | |||
| 1156 | list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); | 1876 | list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); |
| 1157 | 1877 | ||
| 1158 | ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, | 1878 | ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE); |
| 1159 | ARRAY_SIZE(rdtgroup_base_files)); | ||
| 1160 | if (ret) { | 1879 | if (ret) { |
| 1161 | kernfs_destroy_root(rdt_root); | 1880 | kernfs_destroy_root(rdt_root); |
| 1162 | goto out; | 1881 | goto out; |
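The mkdir/rmdir plumbing above is only reachable through the resctrl filesystem. As a rough illustration of the interface it implements, the sketch below creates a monitoring group under an existing ctrl_mon group's "mon_groups" directory (which dispatches to rdtgroup_mkdir_mon()), moves the calling task into it, and reads the llc_occupancy file that mkdir_mondata_all() instantiates per L3 domain. The mount point /sys/fs/resctrl, the group name "mygrp", and the domain name mon_L3_00 are assumptions for the example, not part of the patch; the actual domain directories depend on the system topology.

```c
/*
 * Illustrative userspace sketch only. The mount point, group name and
 * domain id are assumptions; adjust them to the local system.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	const char *grp = "/sys/fs/resctrl/mon_groups/mygrp"; /* hypothetical */
	char path[256];
	char buf[64];
	FILE *f;

	/* mkdir(2) inside a "mon_groups" directory reaches rdtgroup_mkdir_mon() */
	if (mkdir(grp, 0755) && errno != EEXIST) {
		perror("mkdir");
		return 1;
	}

	/* Move the current task into the new monitor group */
	snprintf(path, sizeof(path), "%s/tasks", grp);
	f = fopen(path, "w");
	if (!f) {
		perror("fopen tasks");
		return 1;
	}
	fprintf(f, "%d\n", (int)getpid());
	fclose(f);

	/* Read LLC occupancy for L3 domain 0, created by mkdir_mondata_all() */
	snprintf(path, sizeof(path), "%s/mon_data/mon_L3_00/llc_occupancy", grp);
	f = fopen(path, "r");
	if (!f) {
		perror("fopen llc_occupancy");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("llc_occupancy: %s", buf);
	fclose(f);
	return 0;
}
```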
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index efc5eeb58292..11966251cd42 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
| @@ -56,7 +56,7 @@ | |||
| 56 | #include <asm/debugreg.h> | 56 | #include <asm/debugreg.h> |
| 57 | #include <asm/switch_to.h> | 57 | #include <asm/switch_to.h> |
| 58 | #include <asm/vm86.h> | 58 | #include <asm/vm86.h> |
| 59 | #include <asm/intel_rdt.h> | 59 | #include <asm/intel_rdt_sched.h> |
| 60 | #include <asm/proto.h> | 60 | #include <asm/proto.h> |
| 61 | 61 | ||
| 62 | void __show_regs(struct pt_regs *regs, int all) | 62 | void __show_regs(struct pt_regs *regs, int all) |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c85269a76511..302e7b2572d1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
| @@ -52,7 +52,7 @@ | |||
| 52 | #include <asm/switch_to.h> | 52 | #include <asm/switch_to.h> |
| 53 | #include <asm/xen/hypervisor.h> | 53 | #include <asm/xen/hypervisor.h> |
| 54 | #include <asm/vdso.h> | 54 | #include <asm/vdso.h> |
| 55 | #include <asm/intel_rdt.h> | 55 | #include <asm/intel_rdt_sched.h> |
| 56 | #include <asm/unistd.h> | 56 | #include <asm/unistd.h> |
| 57 | #ifdef CONFIG_IA32_EMULATION | 57 | #ifdef CONFIG_IA32_EMULATION |
| 58 | /* Not included via unistd.h */ | 58 | /* Not included via unistd.h */ |
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 718ba163c1b9..8e22f24ded6a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h | |||
| @@ -139,14 +139,6 @@ struct hw_perf_event { | |||
| 139 | /* for tp_event->class */ | 139 | /* for tp_event->class */ |
| 140 | struct list_head tp_list; | 140 | struct list_head tp_list; |
| 141 | }; | 141 | }; |
| 142 | struct { /* intel_cqm */ | ||
| 143 | int cqm_state; | ||
| 144 | u32 cqm_rmid; | ||
| 145 | int is_group_event; | ||
| 146 | struct list_head cqm_events_entry; | ||
| 147 | struct list_head cqm_groups_entry; | ||
| 148 | struct list_head cqm_group_entry; | ||
| 149 | }; | ||
| 150 | struct { /* amd_power */ | 142 | struct { /* amd_power */ |
| 151 | u64 pwr_acc; | 143 | u64 pwr_acc; |
| 152 | u64 ptsc; | 144 | u64 ptsc; |
| @@ -414,11 +406,6 @@ struct pmu { | |||
| 414 | 406 | ||
| 415 | 407 | ||
| 416 | /* | 408 | /* |
| 417 | * Return the count value for a counter. | ||
| 418 | */ | ||
| 419 | u64 (*count) (struct perf_event *event); /*optional*/ | ||
| 420 | |||
| 421 | /* | ||
| 422 | * Set up pmu-private data structures for an AUX area | 409 | * Set up pmu-private data structures for an AUX area |
| 423 | */ | 410 | */ |
| 424 | void *(*setup_aux) (int cpu, void **pages, | 411 | void *(*setup_aux) (int cpu, void **pages, |
| @@ -1112,11 +1099,6 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, | |||
| 1112 | __perf_event_task_sched_out(prev, next); | 1099 | __perf_event_task_sched_out(prev, next); |
| 1113 | } | 1100 | } |
| 1114 | 1101 | ||
| 1115 | static inline u64 __perf_event_count(struct perf_event *event) | ||
| 1116 | { | ||
| 1117 | return local64_read(&event->count) + atomic64_read(&event->child_count); | ||
| 1118 | } | ||
| 1119 | |||
| 1120 | extern void perf_event_mmap(struct vm_area_struct *vma); | 1102 | extern void perf_event_mmap(struct vm_area_struct *vma); |
| 1121 | extern struct perf_guest_info_callbacks *perf_guest_cbs; | 1103 | extern struct perf_guest_info_callbacks *perf_guest_cbs; |
| 1122 | extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); | 1104 | extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 9ba42c663fba..68b38335d33c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -909,8 +909,9 @@ struct task_struct { | |||
| 909 | /* cg_list protected by css_set_lock and tsk->alloc_lock: */ | 909 | /* cg_list protected by css_set_lock and tsk->alloc_lock: */ |
| 910 | struct list_head cg_list; | 910 | struct list_head cg_list; |
| 911 | #endif | 911 | #endif |
| 912 | #ifdef CONFIG_INTEL_RDT_A | 912 | #ifdef CONFIG_INTEL_RDT |
| 913 | int closid; | 913 | u32 closid; |
| 914 | u32 rmid; | ||
| 914 | #endif | 915 | #endif |
| 915 | #ifdef CONFIG_FUTEX | 916 | #ifdef CONFIG_FUTEX |
| 916 | struct robust_list_head __user *robust_list; | 917 | struct robust_list_head __user *robust_list; |
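The new per-task closid and rmid fields in task_struct are consumed at context switch by the code behind asm/intel_rdt_sched.h: a non-zero task value overrides the per-CPU default that rdtgroup_rmdir_mon()/rdtgroup_rmdir_ctrl() reset above. The following is a standalone model of that precedence under stated assumptions; the struct and function names are placeholders, and the real code writes the IA32_PQR_ASSOC MSR rather than printing.

```c
/* Simplified, hypothetical model of CLOSID/RMID selection at sched-in. */
#include <stdint.h>
#include <stdio.h>

struct task {
	uint32_t closid;	/* 0 means "use the CPU default" */
	uint32_t rmid;
};

struct cpu_defaults {
	uint32_t default_closid;	/* from the group owning this CPU */
	uint32_t default_rmid;
};

static void sched_in(const struct task *t, const struct cpu_defaults *cpu)
{
	uint32_t closid = cpu->default_closid;
	uint32_t rmid = cpu->default_rmid;

	/* A task placed in a specific resource group overrides the CPU default */
	if (t->closid)
		closid = t->closid;
	if (t->rmid)
		rmid = t->rmid;

	printf("PQR_ASSOC <- rmid=%u closid=%u\n", rmid, closid);
}

int main(void)
{
	struct cpu_defaults cpu = { .default_closid = 1, .default_rmid = 2 };
	struct task in_group = { .closid = 3, .rmid = 7 };
	struct task in_default = { 0, 0 };

	sched_in(&in_group, &cpu);	/* uses the task's own ids */
	sched_in(&in_default, &cpu);	/* falls back to the CPU defaults */
	return 0;
}
```

Treating zero as "unassigned" works here because closid 0 and rmid 0 belong to the default group, which matches how the removal paths in the diff fall back to rdtgroup_default.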
diff --git a/kernel/events/core.c b/kernel/events/core.c index ce64f3fed5c6..294f1927f944 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -3673,10 +3673,7 @@ unlock: | |||
| 3673 | 3673 | ||
| 3674 | static inline u64 perf_event_count(struct perf_event *event) | 3674 | static inline u64 perf_event_count(struct perf_event *event) |
| 3675 | { | 3675 | { |
| 3676 | if (event->pmu->count) | 3676 | return local64_read(&event->count) + atomic64_read(&event->child_count); |
| 3677 | return event->pmu->count(event); | ||
| 3678 | |||
| 3679 | return __perf_event_count(event); | ||
| 3680 | } | 3677 | } |
| 3681 | 3678 | ||
| 3682 | /* | 3679 | /* |
| @@ -3707,15 +3704,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value) | |||
| 3707 | goto out; | 3704 | goto out; |
| 3708 | } | 3705 | } |
| 3709 | 3706 | ||
| 3710 | /* | ||
| 3711 | * It must not have a pmu::count method, those are not | ||
| 3712 | * NMI safe. | ||
| 3713 | */ | ||
| 3714 | if (event->pmu->count) { | ||
| 3715 | ret = -EOPNOTSUPP; | ||
| 3716 | goto out; | ||
| 3717 | } | ||
| 3718 | |||
| 3719 | /* If this is a per-task event, it must be for current */ | 3707 | /* If this is a per-task event, it must be for current */ |
| 3720 | if ((event->attach_state & PERF_ATTACH_TASK) && | 3708 | if ((event->attach_state & PERF_ATTACH_TASK) && |
| 3721 | event->hw.target != current) { | 3709 | event->hw.target != current) { |
