author		Linus Torvalds <torvalds@linux-foundation.org>	2017-09-04 16:56:37 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-09-04 16:56:37 -0400
commit		f57091767add2b79d76aac41b83b192d8ba1dce7 (patch)
tree		652672c006ac87ba099deec8ca2b0949e6726d84
parent		d725c7ac8b96cbdc28266895c6f7080c55bf2f23 (diff)
parent		d56593eb5eda8f593db92927059697bbf89bc4b3 (diff)
Merge branch 'x86-cache-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 cache quality monitoring update from Thomas Gleixner:
 "This update provides a complete rewrite of the Cache Quality
  Monitoring (CQM) facility.

  The existing CQM support was duct taped into perf with a lot of
  issues and the attempts to fix those turned out to be incomplete
  and horrible.

  After lengthy discussions it was decided to integrate the CQM
  support into the Resource Director Technology (RDT) facility,
  which is the obvious choice as in hardware CQM is part of RDT.
  This allowed adding Memory Bandwidth Monitoring support on top.

  As a result the mechanisms for allocating cache/memory bandwidth
  and the corresponding monitoring mechanisms are integrated into a
  single management facility with a consistent user interface"

* 'x86-cache-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (37 commits)
  x86/intel_rdt: Turn off most RDT features on Skylake
  x86/intel_rdt: Add command line options for resource director technology
  x86/intel_rdt: Move special case code for Haswell to a quirk function
  x86/intel_rdt: Remove redundant ternary operator on return
  x86/intel_rdt/cqm: Improve limbo list processing
  x86/intel_rdt/mbm: Fix MBM overflow handler during CPU hotplug
  x86/intel_rdt: Modify the intel_pqr_state for better performance
  x86/intel_rdt/cqm: Clear the default RMID during hotcpu
  x86/intel_rdt: Show bitmask of shareable resource with other executing units
  x86/intel_rdt/mbm: Handle counter overflow
  x86/intel_rdt/mbm: Add mbm counter initialization
  x86/intel_rdt/mbm: Basic counting of MBM events (total and local)
  x86/intel_rdt/cqm: Add CPU hotplug support
  x86/intel_rdt/cqm: Add sched_in support
  x86/intel_rdt: Introduce rdt_enable_key for scheduling
  x86/intel_rdt/cqm: Add mount,umount support
  x86/intel_rdt/cqm: Add rmdir support
  x86/intel_rdt: Separate the ctrl bits from rmdir
  x86/intel_rdt/cqm: Add mon_data
  x86/intel_rdt: Prepare for RDT monitor data support
  ...
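As a quick, hedged illustration of the unified interface this merge describes (paths follow the resctrl documentation added below; which directories and events actually appear depends on what the CPU supports, and the listing shown is only an example):

# mount -t resctrl resctrl /sys/fs/resctrl
# ls /sys/fs/resctrl/info
L3  L3_MON  MB
# cat /sys/fs/resctrl/info/L3_MON/mon_features
llc_occupancy
mbm_total_bytes
mbm_local_bytes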
-rw-r--r--  Documentation/admin-guide/kernel-parameters.rst   |    1
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt   |    6
-rw-r--r--  Documentation/x86/intel_rdt_ui.txt                |  323
-rw-r--r--  MAINTAINERS                                       |    2
-rw-r--r--  arch/x86/Kconfig                                  |   12
-rw-r--r--  arch/x86/events/intel/Makefile                    |    2
-rw-r--r--  arch/x86/events/intel/cqm.c                       | 1766
-rw-r--r--  arch/x86/include/asm/intel_rdt.h                  |  286
-rw-r--r--  arch/x86/include/asm/intel_rdt_common.h           |   27
-rw-r--r--  arch/x86/include/asm/intel_rdt_sched.h            |   92
-rw-r--r--  arch/x86/kernel/cpu/Makefile                      |    2
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt.c                   |  375
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt.h                   |  440
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c (renamed from arch/x86/kernel/cpu/intel_rdt_schemata.c) | 67
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt_monitor.c           |  499
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt_rdtgroup.c          | 1117
-rw-r--r--  arch/x86/kernel/process_32.c                      |    2
-rw-r--r--  arch/x86/kernel/process_64.c                      |    2
-rw-r--r--  include/linux/perf_event.h                        |   18
-rw-r--r--  include/linux/sched.h                             |    5
-rw-r--r--  kernel/events/core.c                              |   14
21 files changed, 2631 insertions, 2427 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index d76ab3907e2b..b2598cc9834c 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -138,6 +138,7 @@ parameter is applicable::
 	PPT	Parallel port support is enabled.
 	PS2	Appropriate PS/2 support is enabled.
 	RAM	RAM disk support is enabled.
+	RDT	Intel Resource Director Technology.
 	S390	S390 architecture is enabled.
 	SCSI	Appropriate SCSI support is enabled.
 	A lot of drivers have their options described inside
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index dad6fa01af95..591d48f3a7de 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3612,6 +3612,12 @@
 			Run specified binary instead of /init from the ramdisk,
 			used for early userspace startup. See initrd.
 
+	rdt=		[HW,X86,RDT]
+			Turn on/off individual RDT features. List is:
+			cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, mba.
+			E.g. to turn on cmt and turn off mba use:
+				rdt=cmt,!mba
+
 	reboot=		[KNL]
 			Format (x86 or x86_64):
 			[w[arm] | c[old] | h[ard] | s[oft] | g[pio]] \
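For illustration only, a hedged sketch of checking that the new boot parameter took effect; the /proc/cpuinfo flag names shown ("rdt", "cqm", "cat_l3", "cdp_l3", "mba") follow the resctrl documentation in this series and vary by CPU, and the command-line output is abridged:

# cat /proc/cmdline
... rdt=cmt,!mba ...
# grep -o 'rdt\|cqm\|cat_l3\|cdp_l3\|mba' /proc/cpuinfo | sort -u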
diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt
index c491a1b82de2..4d8848e4e224 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -6,8 +6,8 @@ Fenghua Yu <fenghua.yu@intel.com>
 Tony Luck <tony.luck@intel.com>
 Vikas Shivappa <vikas.shivappa@intel.com>
 
-This feature is enabled by the CONFIG_INTEL_RDT_A Kconfig and the
-X86 /proc/cpuinfo flag bits "rdt", "cat_l3" and "cdp_l3".
+This feature is enabled by the CONFIG_INTEL_RDT Kconfig and the
+X86 /proc/cpuinfo flag bits "rdt", "cqm", "cat_l3" and "cdp_l3".
 
 To use the feature mount the file system:
 
@@ -17,6 +17,13 @@ mount options are:
 
 "cdp": Enable code/data prioritization in L3 cache allocations.
 
+RDT features are orthogonal. A particular system may support only
+monitoring, only control, or both monitoring and control.
+
+The mount succeeds if either of allocation or monitoring is present, but
+only those files and directories supported by the system will be created.
+For more details on the behavior of the interface during monitoring
+and allocation, see the "Resource alloc and monitor groups" section.
 
 Info directory
 --------------
@@ -24,7 +31,12 @@ Info directory
 The 'info' directory contains information about the enabled
 resources. Each resource has its own subdirectory. The subdirectory
 names reflect the resource names.
-Cache resource(L3/L2) subdirectory contains the following files:
+
+Each subdirectory contains the following files with respect to
+allocation:
+
+Cache resource(L3/L2) subdirectory contains the following files
+related to allocation:
 
 "num_closids":		The number of CLOSIDs which are valid for this
 			resource. The kernel uses the smallest number of
@@ -36,7 +48,15 @@ Cache resource(L3/L2) subdirectory contains the following files:
36"min_cbm_bits": The minimum number of consecutive bits which 48"min_cbm_bits": The minimum number of consecutive bits which
37 must be set when writing a mask. 49 must be set when writing a mask.
38 50
39Memory bandwitdh(MB) subdirectory contains the following files: 51"shareable_bits": Bitmask of shareable resource with other executing
52 entities (e.g. I/O). User can use this when
53 setting up exclusive cache partitions. Note that
54 some platforms support devices that have their
55 own settings for cache use which can over-ride
56 these bits.
57
58Memory bandwitdh(MB) subdirectory contains the following files
59with respect to allocation:
40 60
41"min_bandwidth": The minimum memory bandwidth percentage which 61"min_bandwidth": The minimum memory bandwidth percentage which
42 user can request. 62 user can request.
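A hedged sketch of reading the allocation parameters listed above from the info directory; the values shown are made up for illustration and will differ per system:

# cd /sys/fs/resctrl/info
# cat L3/num_closids
16
# cat L3/min_cbm_bits
1
# cat L3/shareable_bits
c0000
# cat MB/min_bandwidth
10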
@@ -52,48 +72,152 @@ Memory bandwitdh(MB) subdirectory contains the following files:
 		non-linear. This field is purely informational
 		only.
 
-Resource groups
----------------
+If RDT monitoring is available there will be an "L3_MON" directory
+with the following files:
+
+"num_rmids":		The number of RMIDs available. This is the
+			upper bound for how many "CTRL_MON" + "MON"
+			groups can be created.
+
+"mon_features":	Lists the monitoring events if
+			monitoring is enabled for the resource.
+
+"max_threshold_occupancy":
+			Read/write file provides the largest value (in
+			bytes) at which a previously used LLC_occupancy
+			counter can be considered for re-use.
+
+
+Resource alloc and monitor groups
+---------------------------------
+
 Resource groups are represented as directories in the resctrl file
-system. The default group is the root directory. Other groups may be
-created as desired by the system administrator using the "mkdir(1)"
-command, and removed using "rmdir(1)".
+system. The default group is the root directory which, immediately
+after mounting, owns all the tasks and cpus in the system and can make
+full use of all resources.
+
+On a system with RDT control features additional directories can be
+created in the root directory that specify different amounts of each
+resource (see "schemata" below). The root and these additional top level
+directories are referred to as "CTRL_MON" groups below.
+
+On a system with RDT monitoring the root directory and other top level
+directories contain a directory named "mon_groups" in which additional
+directories can be created to monitor subsets of tasks in the CTRL_MON
+group that is their ancestor. These are called "MON" groups in the rest
+of this document.
+
+Removing a directory will move all tasks and cpus owned by the group it
+represents to the parent. Removing one of the created CTRL_MON groups
+will automatically remove all MON groups below it.
+
+All groups contain the following files:
+
+"tasks":
+	Reading this file shows the list of all tasks that belong to
+	this group. Writing a task id to the file will add a task to the
+	group. If the group is a CTRL_MON group the task is removed from
+	whichever previous CTRL_MON group owned the task and also from
+	any MON group that owned the task. If the group is a MON group,
+	then the task must already belong to the CTRL_MON parent of this
+	group. The task is removed from any previous MON group.
+
+
+"cpus":
+	Reading this file shows a bitmask of the logical CPUs owned by
+	this group. Writing a mask to this file will add and remove
+	CPUs to/from this group. As with the tasks file a hierarchy is
+	maintained where MON groups may only include CPUs owned by the
+	parent CTRL_MON group.
+
 
-There are three files associated with each group:
+"cpus_list":
+	Just like "cpus", only using ranges of CPUs instead of bitmasks.
 
-"tasks": A list of tasks that belongs to this group. Tasks can be
-	added to a group by writing the task ID to the "tasks" file
-	(which will automatically remove them from the previous
-	group to which they belonged). New tasks created by fork(2)
-	and clone(2) are added to the same group as their parent.
-	If a pid is not in any sub partition, it is in root partition
-	(i.e. default partition).
 
-"cpus": A bitmask of logical CPUs assigned to this group. Writing
-	a new mask can add/remove CPUs from this group. Added CPUs
-	are removed from their previous group. Removed ones are
-	given to the default (root) group. You cannot remove CPUs
-	from the default group.
+When control is enabled all CTRL_MON groups will also contain:
 
-"cpus_list": One or more CPU ranges of logical CPUs assigned to this
-	group. Same rules apply like for the "cpus" file.
+"schemata":
+	A list of all the resources available to this group.
+	Each resource has its own line and format - see below for details.
 
-"schemata": A list of all the resources available to this group.
-	Each resource has its own line and format - see below for
-	details.
+When monitoring is enabled all MON groups will also contain:
 
-When a task is running the following rules define which resources
-are available to it:
+"mon_data":
+	This contains a set of files organized by L3 domain and by
+	RDT event. E.g. on a system with two L3 domains there will
+	be subdirectories "mon_L3_00" and "mon_L3_01". Each of these
+	directories have one file per event (e.g. "llc_occupancy",
+	"mbm_total_bytes", and "mbm_local_bytes"). In a MON group these
+	files provide a read out of the current value of the event for
+	all tasks in the group. In CTRL_MON groups these files provide
+	the sum for all tasks in the CTRL_MON group and all tasks in
+	MON groups. Please see example section for more details on usage.
+
+Resource allocation rules
+-------------------------
+When a task is running the following rules define which resources are
+available to it:
 
 1) If the task is a member of a non-default group, then the schemata
-for that group is used.
+   for that group is used.
 
 2) Else if the task belongs to the default group, but is running on a
-CPU that is assigned to some specific group, then the schemata for
-the CPU's group is used.
+   CPU that is assigned to some specific group, then the schemata for the
+   CPU's group is used.
 
 3) Otherwise the schemata for the default group is used.
 
+Resource monitoring rules
+-------------------------
+1) If a task is a member of a MON group, or non-default CTRL_MON group
+   then RDT events for the task will be reported in that group.
+
+2) If a task is a member of the default CTRL_MON group, but is running
+   on a CPU that is assigned to some specific group, then the RDT events
+   for the task will be reported in that group.
+
+3) Otherwise RDT events for the task will be reported in the root level
+   "mon_data" group.
+
+
+Notes on cache occupancy monitoring and control
+-----------------------------------------------
+When moving a task from one group to another you should remember that
+this only affects *new* cache allocations by the task. E.g. you may have
+a task in a monitor group showing 3 MB of cache occupancy. If you move
+to a new group and immediately check the occupancy of the old and new
+groups you will likely see that the old group is still showing 3 MB and
+the new group zero. When the task accesses locations still in cache from
+before the move, the h/w does not update any counters. On a busy system
+you will likely see the occupancy in the old group go down as cache lines
+are evicted and re-used while the occupancy in the new group rises as
+the task accesses memory and loads into the cache are counted based on
+membership in the new group.
+
+The same applies to cache allocation control. Moving a task to a group
+with a smaller cache partition will not evict any cache lines. The
+process may continue to use them from the old partition.
+
+Hardware uses CLOSid(Class of service ID) and an RMID(Resource monitoring ID)
+to identify a control group and a monitoring group respectively. Each of
+the resource groups are mapped to these IDs based on the kind of group. The
+number of CLOSid and RMID are limited by the hardware and hence the creation of
+a "CTRL_MON" directory may fail if we run out of either CLOSID or RMID
+and creation of "MON" group may fail if we run out of RMIDs.
+
+max_threshold_occupancy - generic concepts
+------------------------------------------
+
+Note that an RMID once freed may not be immediately available for use as
+the RMID is still tagged in the cache lines of the previous user of RMID.
+Hence such RMIDs are placed on limbo list and checked back if the cache
+occupancy has gone down. If there is a time when system has a lot of
+limbo RMIDs but which are not ready to be used, user may see an -EBUSY
+during mkdir.
+
+max_threshold_occupancy is a user configurable value to determine the
+occupancy at which an RMID can be freed.
 
 Schemata files - general concepts
 ---------------------------------
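A hedged sketch of the max_threshold_occupancy tuning described above; the file sits under info/L3_MON and the byte values are arbitrary examples:

# cat /sys/fs/resctrl/info/L3_MON/max_threshold_occupancy
393216
# echo 131072 > /sys/fs/resctrl/info/L3_MON/max_threshold_occupancy

Raising the value lets limbo RMIDs be considered free sooner (which helps when mkdir returns -EBUSY), at the cost of attributing more residual occupancy from the previous user to the new group.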
@@ -143,22 +267,22 @@ SKUs. Using a high bandwidth and a low bandwidth setting on two threads
 sharing a core will result in both threads being throttled to use the
 low bandwidth.
 
-L3 details (code and data prioritization disabled)
---------------------------------------------------
+L3 schemata file details (code and data prioritization disabled)
+----------------------------------------------------------------
 With CDP disabled the L3 schemata format is:
 
 	L3:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
 
-L3 details (CDP enabled via mount option to resctrl)
-----------------------------------------------------
+L3 schemata file details (CDP enabled via mount option to resctrl)
+------------------------------------------------------------------
 When CDP is enabled L3 control is split into two separate resources
 so you can specify independent masks for code and data like this:
 
 	L3data:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
 	L3code:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
 
-L2 details
-----------
+L2 schemata file details
+------------------------
 L2 cache does not support code and data prioritization, so the
 schemata format is always:
 
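Since cache bit masks are built from consecutive bits (see "min_cbm_bits" above), a small hedged helper for composing one is shown here; N ways starting at bit OFFSET, written to a hypothetical group "p0":

# N=4; OFFSET=4
# printf 'L3:0=%x;1=%x\n' $(( ((1 << N) - 1) << OFFSET )) $(( (1 << N) - 1 ))
L3:0=f0;1=f
# printf 'L3:0=%x;1=%x\n' $(( ((1 << N) - 1) << OFFSET )) $(( (1 << N) - 1 )) > /sys/fs/resctrl/p0/schemata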
@@ -185,6 +309,8 @@ L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
 L3DATA:0=fffff;1=fffff;2=3c0;3=fffff
 L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
 
+Examples for RDT allocation usage:
+
 Example 1
 ---------
 On a two socket machine (one L3 cache per socket) with just four bits
@@ -410,3 +536,124 @@ void main(void)
 	/* code to read and write directory contents */
 	resctrl_release_lock(fd);
 }
+
+Examples for RDT Monitoring along with allocation usage:
+
+Reading monitored data
+----------------------
+Reading an event file (for ex: mon_data/mon_L3_00/llc_occupancy) would
+show the current snapshot of LLC occupancy of the corresponding MON
+group or CTRL_MON group.
+
+
+Example 1 (Monitor CTRL_MON group and subset of tasks in CTRL_MON group)
+---------
+On a two socket machine (one L3 cache per socket) with just four bits
+for cache bit masks
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir p0 p1
+# echo "L3:0=3;1=c" > /sys/fs/resctrl/p0/schemata
+# echo "L3:0=3;1=3" > /sys/fs/resctrl/p1/schemata
+# echo 5678 > p1/tasks
+# echo 5679 > p1/tasks
+
+The default resource group is unmodified, so we have access to all parts
+of all caches (its schemata file reads "L3:0=f;1=f").
+
+Tasks that are under the control of group "p0" may only allocate from the
+"lower" 50% on cache ID 0, and the "upper" 50% of cache ID 1.
+Tasks in group "p1" use the "lower" 50% of cache on both sockets.
+
+Create monitor groups and assign a subset of tasks to each monitor group.
+
+# cd /sys/fs/resctrl/p1/mon_groups
+# mkdir m11 m12
+# echo 5678 > m11/tasks
+# echo 5679 > m12/tasks
+
+fetch data (data shown in bytes)
+
+# cat m11/mon_data/mon_L3_00/llc_occupancy
+16234000
+# cat m11/mon_data/mon_L3_01/llc_occupancy
+14789000
+# cat m12/mon_data/mon_L3_00/llc_occupancy
+16789000
+
+The parent ctrl_mon group shows the aggregated data.
+
+# cat /sys/fs/resctrl/p1/mon_data/mon_l3_00/llc_occupancy
+31234000
+
+Example 2 (Monitor a task from its creation)
+---------
+On a two socket machine (one L3 cache per socket)
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir p0 p1
+
+An RMID is allocated to the group once it is created and hence the <cmd>
+below is monitored from its creation.
+
+# echo $$ > /sys/fs/resctrl/p1/tasks
+# <cmd>
+
+Fetch the data
+
+# cat /sys/fs/resctrl/p1/mon_data/mon_l3_00/llc_occupancy
+31789000
+
+Example 3 (Monitor without CAT support or before creating CAT groups)
+---------
+
+Assume a system like HSW has only CQM and no CAT support. In this case
+the resctrl will still mount but cannot create CTRL_MON directories.
+But user can create different MON groups within the root group thereby
+able to monitor all tasks including kernel threads.
+
+This can also be used to profile jobs cache size footprint before being
+able to allocate them to different allocation groups.
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir mon_groups/m01
+# mkdir mon_groups/m02
+
+# echo 3478 > /sys/fs/resctrl/mon_groups/m01/tasks
+# echo 2467 > /sys/fs/resctrl/mon_groups/m02/tasks
+
+Monitor the groups separately and also get per domain data. From the
+below it is apparent that the tasks are mostly doing work on
+domain(socket) 0.
+
+# cat /sys/fs/resctrl/mon_groups/m01/mon_L3_00/llc_occupancy
+31234000
+# cat /sys/fs/resctrl/mon_groups/m01/mon_L3_01/llc_occupancy
+34555
+# cat /sys/fs/resctrl/mon_groups/m02/mon_L3_00/llc_occupancy
+31234000
+# cat /sys/fs/resctrl/mon_groups/m02/mon_L3_01/llc_occupancy
+32789
+
+
+Example 4 (Monitor real time tasks)
+-----------------------------------
+
+A single socket system which has real time tasks running on cores 4-7
+and non real time tasks on other cpus. We want to monitor the cache
+occupancy of the real time threads on these cores.
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir p1
+
+Move the cpus 4-7 over to p1
+# echo f0 > p1/cpus
+
+View the llc occupancy snapshot
+
+# cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy
+11234000
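The mbm_total_bytes and mbm_local_bytes files described earlier report bytes accumulated since monitoring of the group began (see the MBM sample handling in the removed cqm.c below), so a bandwidth figure has to be derived by sampling; a hedged sketch, assuming a group "p1" and a one second interval:

# f=/sys/fs/resctrl/p1/mon_data/mon_L3_00/mbm_total_bytes
# b1=$(cat $f); sleep 1; b2=$(cat $f)
# echo "$(( (b2 - b1) / 1048576 )) MiB/s"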
diff --git a/MAINTAINERS b/MAINTAINERS
index b81e93b71c4b..8ef4694af6e8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11121,7 +11121,7 @@ M: Fenghua Yu <fenghua.yu@intel.com>
 L:	linux-kernel@vger.kernel.org
 S:	Supported
 F:	arch/x86/kernel/cpu/intel_rdt*
-F:	arch/x86/include/asm/intel_rdt*
+F:	arch/x86/include/asm/intel_rdt_sched.h
 F:	Documentation/x86/intel_rdt*
 
 READ-COPY UPDATE (RCU)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b4b27ab016f6..acb366bf6bc1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -429,16 +429,16 @@ config GOLDFISH
 	def_bool y
 	depends on X86_GOLDFISH
 
-config INTEL_RDT_A
-	bool "Intel Resource Director Technology Allocation support"
+config INTEL_RDT
+	bool "Intel Resource Director Technology support"
 	default n
 	depends on X86 && CPU_SUP_INTEL
 	select KERNFS
 	help
-	  Select to enable resource allocation which is a sub-feature of
-	  Intel Resource Director Technology(RDT). More information about
-	  RDT can be found in the Intel x86 Architecture Software
-	  Developer Manual.
+	  Select to enable resource allocation and monitoring which are
+	  sub-features of Intel Resource Director Technology(RDT). More
+	  information about RDT can be found in the Intel x86
+	  Architecture Software Developer Manual.
 
 	  Say N if unsure.
 
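A hedged sketch of switching an existing kernel configuration over to the renamed option, using the kernel's own scripts/config helper from the top of the source tree (assumes a .config is already present; an old CONFIG_INTEL_RDT_A entry is simply superseded by the new symbol):

# scripts/config --enable INTEL_RDT
# grep '^CONFIG_INTEL_RDT=' .config
CONFIG_INTEL_RDT=y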
diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile
index 06c2baa51814..e9d8520a801a 100644
--- a/arch/x86/events/intel/Makefile
+++ b/arch/x86/events/intel/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_CPU_SUP_INTEL)		+= core.o bts.o cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= core.o bts.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= ds.o knc.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= lbr.o p4.o p6.o pt.o
 obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL)	+= intel-rapl-perf.o
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
deleted file mode 100644
index 2521f771f2f5..000000000000
--- a/arch/x86/events/intel/cqm.c
+++ /dev/null
@@ -1,1766 +0,0 @@
1/*
2 * Intel Cache Quality-of-Service Monitoring (CQM) support.
3 *
4 * Based very, very heavily on work by Peter Zijlstra.
5 */
6
7#include <linux/perf_event.h>
8#include <linux/slab.h>
9#include <asm/cpu_device_id.h>
10#include <asm/intel_rdt_common.h>
11#include "../perf_event.h"
12
13#define MSR_IA32_QM_CTR 0x0c8e
14#define MSR_IA32_QM_EVTSEL 0x0c8d
15
16#define MBM_CNTR_WIDTH 24
17/*
18 * Guaranteed time in ms as per SDM where MBM counters will not overflow.
19 */
20#define MBM_CTR_OVERFLOW_TIME 1000
21
22static u32 cqm_max_rmid = -1;
23static unsigned int cqm_l3_scale; /* supposedly cacheline size */
24static bool cqm_enabled, mbm_enabled;
25unsigned int mbm_socket_max;
26
27/*
28 * The cached intel_pqr_state is strictly per CPU and can never be
29 * updated from a remote CPU. Both functions which modify the state
30 * (intel_cqm_event_start and intel_cqm_event_stop) are called with
31 * interrupts disabled, which is sufficient for the protection.
32 */
33DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
34static struct hrtimer *mbm_timers;
35/**
36 * struct sample - mbm event's (local or total) data
37 * @total_bytes #bytes since we began monitoring
38 * @prev_msr previous value of MSR
39 */
40struct sample {
41 u64 total_bytes;
42 u64 prev_msr;
43};
44
45/*
46 * samples profiled for total memory bandwidth type events
47 */
48static struct sample *mbm_total;
49/*
50 * samples profiled for local memory bandwidth type events
51 */
52static struct sample *mbm_local;
53
54#define pkg_id topology_physical_package_id(smp_processor_id())
55/*
56 * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array.
57 * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of
58 * rmids per socket, an example is given below
59 * RMID1 of Socket0: vrmid = 1
60 * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1
61 * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1
62 */
63#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid)
64/*
65 * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
66 * Also protects event->hw.cqm_rmid
67 *
68 * Hold either for stability, both for modification of ->hw.cqm_rmid.
69 */
70static DEFINE_MUTEX(cache_mutex);
71static DEFINE_RAW_SPINLOCK(cache_lock);
72
73/*
74 * Groups of events that have the same target(s), one RMID per group.
75 */
76static LIST_HEAD(cache_groups);
77
78/*
79 * Mask of CPUs for reading CQM values. We only need one per-socket.
80 */
81static cpumask_t cqm_cpumask;
82
83#define RMID_VAL_ERROR (1ULL << 63)
84#define RMID_VAL_UNAVAIL (1ULL << 62)
85
86/*
87 * Event IDs are used to program IA32_QM_EVTSEL before reading event
88 * counter from IA32_QM_CTR
89 */
90#define QOS_L3_OCCUP_EVENT_ID 0x01
91#define QOS_MBM_TOTAL_EVENT_ID 0x02
92#define QOS_MBM_LOCAL_EVENT_ID 0x03
93
94/*
95 * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
96 *
97 * This rmid is always free and is guaranteed to have an associated
98 * near-zero occupancy value, i.e. no cachelines are tagged with this
99 * RMID, once __intel_cqm_rmid_rotate() returns.
100 */
101static u32 intel_cqm_rotation_rmid;
102
103#define INVALID_RMID (-1)
104
105/*
106 * Is @rmid valid for programming the hardware?
107 *
108 * rmid 0 is reserved by the hardware for all non-monitored tasks, which
109 * means that we should never come across an rmid with that value.
110 * Likewise, an rmid value of -1 is used to indicate "no rmid currently
111 * assigned" and is used as part of the rotation code.
112 */
113static inline bool __rmid_valid(u32 rmid)
114{
115 if (!rmid || rmid == INVALID_RMID)
116 return false;
117
118 return true;
119}
120
121static u64 __rmid_read(u32 rmid)
122{
123 u64 val;
124
125 /*
126 * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
127 * it just says that to increase confusion.
128 */
129 wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
130 rdmsrl(MSR_IA32_QM_CTR, val);
131
132 /*
133 * Aside from the ERROR and UNAVAIL bits, assume this thing returns
134 * the number of cachelines tagged with @rmid.
135 */
136 return val;
137}
138
139enum rmid_recycle_state {
140 RMID_YOUNG = 0,
141 RMID_AVAILABLE,
142 RMID_DIRTY,
143};
144
145struct cqm_rmid_entry {
146 u32 rmid;
147 enum rmid_recycle_state state;
148 struct list_head list;
149 unsigned long queue_time;
150};
151
152/*
153 * cqm_rmid_free_lru - A least recently used list of RMIDs.
154 *
155 * Oldest entry at the head, newest (most recently used) entry at the
156 * tail. This list is never traversed, it's only used to keep track of
157 * the lru order. That is, we only pick entries of the head or insert
158 * them on the tail.
159 *
160 * All entries on the list are 'free', and their RMIDs are not currently
161 * in use. To mark an RMID as in use, remove its entry from the lru
162 * list.
163 *
164 *
165 * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
166 *
167 * This list is contains RMIDs that no one is currently using but that
168 * may have a non-zero occupancy value associated with them. The
169 * rotation worker moves RMIDs from the limbo list to the free list once
170 * the occupancy value drops below __intel_cqm_threshold.
171 *
172 * Both lists are protected by cache_mutex.
173 */
174static LIST_HEAD(cqm_rmid_free_lru);
175static LIST_HEAD(cqm_rmid_limbo_lru);
176
177/*
178 * We use a simple array of pointers so that we can lookup a struct
179 * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
180 * and __put_rmid() from having to worry about dealing with struct
181 * cqm_rmid_entry - they just deal with rmids, i.e. integers.
182 *
183 * Once this array is initialized it is read-only. No locks are required
184 * to access it.
185 *
186 * All entries for all RMIDs can be looked up in the this array at all
187 * times.
188 */
189static struct cqm_rmid_entry **cqm_rmid_ptrs;
190
191static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
192{
193 struct cqm_rmid_entry *entry;
194
195 entry = cqm_rmid_ptrs[rmid];
196 WARN_ON(entry->rmid != rmid);
197
198 return entry;
199}
200
201/*
202 * Returns < 0 on fail.
203 *
204 * We expect to be called with cache_mutex held.
205 */
206static u32 __get_rmid(void)
207{
208 struct cqm_rmid_entry *entry;
209
210 lockdep_assert_held(&cache_mutex);
211
212 if (list_empty(&cqm_rmid_free_lru))
213 return INVALID_RMID;
214
215 entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
216 list_del(&entry->list);
217
218 return entry->rmid;
219}
220
221static void __put_rmid(u32 rmid)
222{
223 struct cqm_rmid_entry *entry;
224
225 lockdep_assert_held(&cache_mutex);
226
227 WARN_ON(!__rmid_valid(rmid));
228 entry = __rmid_entry(rmid);
229
230 entry->queue_time = jiffies;
231 entry->state = RMID_YOUNG;
232
233 list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
234}
235
236static void cqm_cleanup(void)
237{
238 int i;
239
240 if (!cqm_rmid_ptrs)
241 return;
242
243 for (i = 0; i < cqm_max_rmid; i++)
244 kfree(cqm_rmid_ptrs[i]);
245
246 kfree(cqm_rmid_ptrs);
247 cqm_rmid_ptrs = NULL;
248 cqm_enabled = false;
249}
250
251static int intel_cqm_setup_rmid_cache(void)
252{
253 struct cqm_rmid_entry *entry;
254 unsigned int nr_rmids;
255 int r = 0;
256
257 nr_rmids = cqm_max_rmid + 1;
258 cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) *
259 nr_rmids, GFP_KERNEL);
260 if (!cqm_rmid_ptrs)
261 return -ENOMEM;
262
263 for (; r <= cqm_max_rmid; r++) {
264 struct cqm_rmid_entry *entry;
265
266 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
267 if (!entry)
268 goto fail;
269
270 INIT_LIST_HEAD(&entry->list);
271 entry->rmid = r;
272 cqm_rmid_ptrs[r] = entry;
273
274 list_add_tail(&entry->list, &cqm_rmid_free_lru);
275 }
276
277 /*
278 * RMID 0 is special and is always allocated. It's used for all
279 * tasks that are not monitored.
280 */
281 entry = __rmid_entry(0);
282 list_del(&entry->list);
283
284 mutex_lock(&cache_mutex);
285 intel_cqm_rotation_rmid = __get_rmid();
286 mutex_unlock(&cache_mutex);
287
288 return 0;
289
290fail:
291 cqm_cleanup();
292 return -ENOMEM;
293}
294
295/*
296 * Determine if @a and @b measure the same set of tasks.
297 *
298 * If @a and @b measure the same set of tasks then we want to share a
299 * single RMID.
300 */
301static bool __match_event(struct perf_event *a, struct perf_event *b)
302{
303 /* Per-cpu and task events don't mix */
304 if ((a->attach_state & PERF_ATTACH_TASK) !=
305 (b->attach_state & PERF_ATTACH_TASK))
306 return false;
307
308#ifdef CONFIG_CGROUP_PERF
309 if (a->cgrp != b->cgrp)
310 return false;
311#endif
312
313 /* If not task event, we're machine wide */
314 if (!(b->attach_state & PERF_ATTACH_TASK))
315 return true;
316
317 /*
318 * Events that target same task are placed into the same cache group.
319 * Mark it as a multi event group, so that we update ->count
320 * for every event rather than just the group leader later.
321 */
322 if (a->hw.target == b->hw.target) {
323 b->hw.is_group_event = true;
324 return true;
325 }
326
327 /*
328 * Are we an inherited event?
329 */
330 if (b->parent == a)
331 return true;
332
333 return false;
334}
335
336#ifdef CONFIG_CGROUP_PERF
337static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
338{
339 if (event->attach_state & PERF_ATTACH_TASK)
340 return perf_cgroup_from_task(event->hw.target, event->ctx);
341
342 return event->cgrp;
343}
344#endif
345
346/*
347 * Determine if @a's tasks intersect with @b's tasks
348 *
349 * There are combinations of events that we explicitly prohibit,
350 *
351 * PROHIBITS
352 * system-wide -> cgroup and task
353 * cgroup -> system-wide
354 * -> task in cgroup
355 * task -> system-wide
356 * -> task in cgroup
357 *
358 * Call this function before allocating an RMID.
359 */
360static bool __conflict_event(struct perf_event *a, struct perf_event *b)
361{
362#ifdef CONFIG_CGROUP_PERF
363 /*
364 * We can have any number of cgroups but only one system-wide
365 * event at a time.
366 */
367 if (a->cgrp && b->cgrp) {
368 struct perf_cgroup *ac = a->cgrp;
369 struct perf_cgroup *bc = b->cgrp;
370
371 /*
372 * This condition should have been caught in
373 * __match_event() and we should be sharing an RMID.
374 */
375 WARN_ON_ONCE(ac == bc);
376
377 if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
378 cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
379 return true;
380
381 return false;
382 }
383
384 if (a->cgrp || b->cgrp) {
385 struct perf_cgroup *ac, *bc;
386
387 /*
388 * cgroup and system-wide events are mutually exclusive
389 */
390 if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
391 (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
392 return true;
393
394 /*
395 * Ensure neither event is part of the other's cgroup
396 */
397 ac = event_to_cgroup(a);
398 bc = event_to_cgroup(b);
399 if (ac == bc)
400 return true;
401
402 /*
403 * Must have cgroup and non-intersecting task events.
404 */
405 if (!ac || !bc)
406 return false;
407
408 /*
409 * We have cgroup and task events, and the task belongs
410 * to a cgroup. Check for for overlap.
411 */
412 if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
413 cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
414 return true;
415
416 return false;
417 }
418#endif
419 /*
420 * If one of them is not a task, same story as above with cgroups.
421 */
422 if (!(a->attach_state & PERF_ATTACH_TASK) ||
423 !(b->attach_state & PERF_ATTACH_TASK))
424 return true;
425
426 /*
427 * Must be non-overlapping.
428 */
429 return false;
430}
431
432struct rmid_read {
433 u32 rmid;
434 u32 evt_type;
435 atomic64_t value;
436};
437
438static void __intel_cqm_event_count(void *info);
439static void init_mbm_sample(u32 rmid, u32 evt_type);
440static void __intel_mbm_event_count(void *info);
441
442static bool is_cqm_event(int e)
443{
444 return (e == QOS_L3_OCCUP_EVENT_ID);
445}
446
447static bool is_mbm_event(int e)
448{
449 return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID);
450}
451
452static void cqm_mask_call(struct rmid_read *rr)
453{
454 if (is_mbm_event(rr->evt_type))
455 on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1);
456 else
457 on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1);
458}
459
460/*
461 * Exchange the RMID of a group of events.
462 */
463static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
464{
465 struct perf_event *event;
466 struct list_head *head = &group->hw.cqm_group_entry;
467 u32 old_rmid = group->hw.cqm_rmid;
468
469 lockdep_assert_held(&cache_mutex);
470
471 /*
472 * If our RMID is being deallocated, perform a read now.
473 */
474 if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
475 struct rmid_read rr = {
476 .rmid = old_rmid,
477 .evt_type = group->attr.config,
478 .value = ATOMIC64_INIT(0),
479 };
480
481 cqm_mask_call(&rr);
482 local64_set(&group->count, atomic64_read(&rr.value));
483 }
484
485 raw_spin_lock_irq(&cache_lock);
486
487 group->hw.cqm_rmid = rmid;
488 list_for_each_entry(event, head, hw.cqm_group_entry)
489 event->hw.cqm_rmid = rmid;
490
491 raw_spin_unlock_irq(&cache_lock);
492
493 /*
494 * If the allocation is for mbm, init the mbm stats.
495 * Need to check if each event in the group is mbm event
496 * because there could be multiple type of events in the same group.
497 */
498 if (__rmid_valid(rmid)) {
499 event = group;
500 if (is_mbm_event(event->attr.config))
501 init_mbm_sample(rmid, event->attr.config);
502
503 list_for_each_entry(event, head, hw.cqm_group_entry) {
504 if (is_mbm_event(event->attr.config))
505 init_mbm_sample(rmid, event->attr.config);
506 }
507 }
508
509 return old_rmid;
510}
511
512/*
513 * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
514 * cachelines are still tagged with RMIDs in limbo, we progressively
515 * increment the threshold until we find an RMID in limbo with <=
516 * __intel_cqm_threshold lines tagged. This is designed to mitigate the
517 * problem where cachelines tagged with an RMID are not steadily being
518 * evicted.
519 *
520 * On successful rotations we decrease the threshold back towards zero.
521 *
522 * __intel_cqm_max_threshold provides an upper bound on the threshold,
523 * and is measured in bytes because it's exposed to userland.
524 */
525static unsigned int __intel_cqm_threshold;
526static unsigned int __intel_cqm_max_threshold;
527
528/*
529 * Test whether an RMID has a zero occupancy value on this cpu.
530 */
531static void intel_cqm_stable(void *arg)
532{
533 struct cqm_rmid_entry *entry;
534
535 list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
536 if (entry->state != RMID_AVAILABLE)
537 break;
538
539 if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
540 entry->state = RMID_DIRTY;
541 }
542}
543
544/*
545 * If we have group events waiting for an RMID that don't conflict with
546 * events already running, assign @rmid.
547 */
548static bool intel_cqm_sched_in_event(u32 rmid)
549{
550 struct perf_event *leader, *event;
551
552 lockdep_assert_held(&cache_mutex);
553
554 leader = list_first_entry(&cache_groups, struct perf_event,
555 hw.cqm_groups_entry);
556 event = leader;
557
558 list_for_each_entry_continue(event, &cache_groups,
559 hw.cqm_groups_entry) {
560 if (__rmid_valid(event->hw.cqm_rmid))
561 continue;
562
563 if (__conflict_event(event, leader))
564 continue;
565
566 intel_cqm_xchg_rmid(event, rmid);
567 return true;
568 }
569
570 return false;
571}
572
573/*
574 * Initially use this constant for both the limbo queue time and the
575 * rotation timer interval, pmu::hrtimer_interval_ms.
576 *
577 * They don't need to be the same, but the two are related since if you
578 * rotate faster than you recycle RMIDs, you may run out of available
579 * RMIDs.
580 */
581#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */
582
583static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
584
585/*
586 * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
587 * @nr_available: number of freeable RMIDs on the limbo list
588 *
589 * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
590 * cachelines are tagged with those RMIDs. After this we can reuse them
591 * and know that the current set of active RMIDs is stable.
592 *
593 * Return %true or %false depending on whether stabilization needs to be
594 * reattempted.
595 *
596 * If we return %true then @nr_available is updated to indicate the
597 * number of RMIDs on the limbo list that have been queued for the
598 * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
599 * are above __intel_cqm_threshold.
600 */
601static bool intel_cqm_rmid_stabilize(unsigned int *available)
602{
603 struct cqm_rmid_entry *entry, *tmp;
604
605 lockdep_assert_held(&cache_mutex);
606
607 *available = 0;
608 list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
609 unsigned long min_queue_time;
610 unsigned long now = jiffies;
611
612 /*
613 * We hold RMIDs placed into limbo for a minimum queue
614 * time. Before the minimum queue time has elapsed we do
615 * not recycle RMIDs.
616 *
617 * The reasoning is that until a sufficient time has
618 * passed since we stopped using an RMID, any RMID
619 * placed onto the limbo list will likely still have
620 * data tagged in the cache, which means we'll probably
621 * fail to recycle it anyway.
622 *
623 * We can save ourselves an expensive IPI by skipping
624 * any RMIDs that have not been queued for the minimum
625 * time.
626 */
627 min_queue_time = entry->queue_time +
628 msecs_to_jiffies(__rmid_queue_time_ms);
629
630 if (time_after(min_queue_time, now))
631 break;
632
633 entry->state = RMID_AVAILABLE;
634 (*available)++;
635 }
636
637 /*
638 * Fast return if none of the RMIDs on the limbo list have been
639 * sitting on the queue for the minimum queue time.
640 */
641 if (!*available)
642 return false;
643
644 /*
645 * Test whether an RMID is free for each package.
646 */
647 on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
648
649 list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
650 /*
651 * Exhausted all RMIDs that have waited min queue time.
652 */
653 if (entry->state == RMID_YOUNG)
654 break;
655
656 if (entry->state == RMID_DIRTY)
657 continue;
658
659 list_del(&entry->list); /* remove from limbo */
660
661 /*
662 * The rotation RMID gets priority if it's
663 * currently invalid. In which case, skip adding
664 * the RMID to the the free lru.
665 */
666 if (!__rmid_valid(intel_cqm_rotation_rmid)) {
667 intel_cqm_rotation_rmid = entry->rmid;
668 continue;
669 }
670
671 /*
672 * If we have groups waiting for RMIDs, hand
673 * them one now provided they don't conflict.
674 */
675 if (intel_cqm_sched_in_event(entry->rmid))
676 continue;
677
678 /*
679 * Otherwise place it onto the free list.
680 */
681 list_add_tail(&entry->list, &cqm_rmid_free_lru);
682 }
683
684
685 return __rmid_valid(intel_cqm_rotation_rmid);
686}
687
688/*
689 * Pick a victim group and move it to the tail of the group list.
690 * @next: The first group without an RMID
691 */
692static void __intel_cqm_pick_and_rotate(struct perf_event *next)
693{
694 struct perf_event *rotor;
695 u32 rmid;
696
697 lockdep_assert_held(&cache_mutex);
698
699 rotor = list_first_entry(&cache_groups, struct perf_event,
700 hw.cqm_groups_entry);
701
702 /*
703 * The group at the front of the list should always have a valid
704 * RMID. If it doesn't then no groups have RMIDs assigned and we
705 * don't need to rotate the list.
706 */
707 if (next == rotor)
708 return;
709
710 rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
711 __put_rmid(rmid);
712
713 list_rotate_left(&cache_groups);
714}
715
716/*
717 * Deallocate the RMIDs from any events that conflict with @event, and
718 * place them on the back of the group list.
719 */
720static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
721{
722 struct perf_event *group, *g;
723 u32 rmid;
724
725 lockdep_assert_held(&cache_mutex);
726
727 list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
728 if (group == event)
729 continue;
730
731 rmid = group->hw.cqm_rmid;
732
733 /*
734 * Skip events that don't have a valid RMID.
735 */
736 if (!__rmid_valid(rmid))
737 continue;
738
739 /*
740 * No conflict? No problem! Leave the event alone.
741 */
742 if (!__conflict_event(group, event))
743 continue;
744
745 intel_cqm_xchg_rmid(group, INVALID_RMID);
746 __put_rmid(rmid);
747 }
748}
749
750/*
751 * Attempt to rotate the groups and assign new RMIDs.
752 *
753 * We rotate for two reasons,
754 * 1. To handle the scheduling of conflicting events
755 * 2. To recycle RMIDs
756 *
757 * Rotating RMIDs is complicated because the hardware doesn't give us
758 * any clues.
759 *
760 * There's problems with the hardware interface; when you change the
761 * task:RMID map cachelines retain their 'old' tags, giving a skewed
762 * picture. In order to work around this, we must always keep one free
763 * RMID - intel_cqm_rotation_rmid.
764 *
765 * Rotation works by taking away an RMID from a group (the old RMID),
766 * and assigning the free RMID to another group (the new RMID). We must
767 * then wait for the old RMID to not be used (no cachelines tagged).
768 * This ensure that all cachelines are tagged with 'active' RMIDs. At
769 * this point we can start reading values for the new RMID and treat the
770 * old RMID as the free RMID for the next rotation.
771 *
772 * Return %true or %false depending on whether we did any rotating.
773 */
774static bool __intel_cqm_rmid_rotate(void)
775{
776 struct perf_event *group, *start = NULL;
777 unsigned int threshold_limit;
778 unsigned int nr_needed = 0;
779 unsigned int nr_available;
780 bool rotated = false;
781
782 mutex_lock(&cache_mutex);
783
784again:
785 /*
786 * Fast path through this function if there are no groups and no
787 * RMIDs that need cleaning.
788 */
789 if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
790 goto out;
791
792 list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
793 if (!__rmid_valid(group->hw.cqm_rmid)) {
794 if (!start)
795 start = group;
796 nr_needed++;
797 }
798 }
799
800 /*
801 * We have some event groups, but they all have RMIDs assigned
802 * and no RMIDs need cleaning.
803 */
804 if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
805 goto out;
806
807 if (!nr_needed)
808 goto stabilize;
809
810 /*
811 * We have more event groups without RMIDs than available RMIDs,
812 * or we have event groups that conflict with the ones currently
813 * scheduled.
814 *
815 * We force deallocate the rmid of the group at the head of
816 * cache_groups. The first event group without an RMID then gets
817 * assigned intel_cqm_rotation_rmid. This ensures we always make
818 * forward progress.
819 *
820 * Rotate the cache_groups list so the previous head is now the
821 * tail.
822 */
823 __intel_cqm_pick_and_rotate(start);
824
825 /*
826 * If the rotation is going to succeed, reduce the threshold so
827 * that we don't needlessly reuse dirty RMIDs.
828 */
829 if (__rmid_valid(intel_cqm_rotation_rmid)) {
830 intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
831 intel_cqm_rotation_rmid = __get_rmid();
832
833 intel_cqm_sched_out_conflicting_events(start);
834
835 if (__intel_cqm_threshold)
836 __intel_cqm_threshold--;
837 }
838
839 rotated = true;
840
841stabilize:
842 /*
843 * We now need to stablize the RMID we freed above (if any) to
844 * ensure that the next time we rotate we have an RMID with zero
845 * occupancy value.
846 *
847 * Alternatively, if we didn't need to perform any rotation,
848 * we'll have a bunch of RMIDs in limbo that need stabilizing.
849 */
850 threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
851
852 while (intel_cqm_rmid_stabilize(&nr_available) &&
853 __intel_cqm_threshold < threshold_limit) {
854 unsigned int steal_limit;
855
856 /*
857 * Don't spin if nobody is actively waiting for an RMID,
858 * the rotation worker will be kicked as soon as an
859 * event needs an RMID anyway.
860 */
861 if (!nr_needed)
862 break;
863
864 /* Allow max 25% of RMIDs to be in limbo. */
865 steal_limit = (cqm_max_rmid + 1) / 4;
866
867 /*
868 * We failed to stabilize any RMIDs so our rotation
869 * logic is now stuck. In order to make forward progress
870 * we have a few options:
871 *
872 * 1. rotate ("steal") another RMID
873 * 2. increase the threshold
874 * 3. do nothing
875 *
876 * We do both of 1. and 2. until we hit the steal limit.
877 *
878 * The steal limit prevents all RMIDs ending up on the
879 * limbo list. This can happen if every RMID has a
880 * non-zero occupancy above threshold_limit, and the
881 * occupancy values aren't dropping fast enough.
882 *
883 * Note that there is prioritisation at work here - we'd
884 * rather increase the number of RMIDs on the limbo list
885 * than increase the threshold, because increasing the
886 * threshold skews the event data (because we reuse
887 * dirty RMIDs) - threshold bumps are a last resort.
888 */
889 if (nr_available < steal_limit)
890 goto again;
891
892 __intel_cqm_threshold++;
893 }
894
895out:
896 mutex_unlock(&cache_mutex);
897 return rotated;
898}
899
900static void intel_cqm_rmid_rotate(struct work_struct *work);
901
902static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
903
904static struct pmu intel_cqm_pmu;
905
906static void intel_cqm_rmid_rotate(struct work_struct *work)
907{
908 unsigned long delay;
909
910 __intel_cqm_rmid_rotate();
911
912 delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
913 schedule_delayed_work(&intel_cqm_rmid_work, delay);
914}
915
916static u64 update_sample(unsigned int rmid, u32 evt_type, int first)
917{
918 struct sample *mbm_current;
919 u32 vrmid = rmid_2_index(rmid);
920 u64 val, bytes, shift;
921 u32 eventid;
922
923 if (evt_type == QOS_MBM_LOCAL_EVENT_ID) {
924 mbm_current = &mbm_local[vrmid];
925 eventid = QOS_MBM_LOCAL_EVENT_ID;
926 } else {
927 mbm_current = &mbm_total[vrmid];
928 eventid = QOS_MBM_TOTAL_EVENT_ID;
929 }
930
931 wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
932 rdmsrl(MSR_IA32_QM_CTR, val);
933 if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
934 return mbm_current->total_bytes;
935
936 if (first) {
937 mbm_current->prev_msr = val;
938 mbm_current->total_bytes = 0;
939 return mbm_current->total_bytes;
940 }
941
942 /*
943 * The h/w guarantees that counters will not overflow
944 * so long as we poll them at least once per second.
945 */
946 shift = 64 - MBM_CNTR_WIDTH;
947 bytes = (val << shift) - (mbm_current->prev_msr << shift);
948 bytes >>= shift;
949
950 bytes *= cqm_l3_scale;
951
952 mbm_current->total_bytes += bytes;
953 mbm_current->prev_msr = val;
954
955 return mbm_current->total_bytes;
956}
957
958static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type)
959{
960 return update_sample(rmid, evt_type, 0);
961}
962
963static void __intel_mbm_event_init(void *info)
964{
965 struct rmid_read *rr = info;
966
967 update_sample(rr->rmid, rr->evt_type, 1);
968}
969
970static void init_mbm_sample(u32 rmid, u32 evt_type)
971{
972 struct rmid_read rr = {
973 .rmid = rmid,
974 .evt_type = evt_type,
975 .value = ATOMIC64_INIT(0),
976 };
977
978 /* on each socket, init sample */
979 on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
980}
981
982/*
983 * Find a group and setup RMID.
984 *
985 * If we're part of a group, we use the group's RMID.
986 */
987static void intel_cqm_setup_event(struct perf_event *event,
988 struct perf_event **group)
989{
990 struct perf_event *iter;
991 bool conflict = false;
992 u32 rmid;
993
994 event->hw.is_group_event = false;
995 list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
996 rmid = iter->hw.cqm_rmid;
997
998 if (__match_event(iter, event)) {
999 /* All tasks in a group share an RMID */
1000 event->hw.cqm_rmid = rmid;
1001 *group = iter;
1002 if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
1003 init_mbm_sample(rmid, event->attr.config);
1004 return;
1005 }
1006
1007 /*
1008 * We only care about conflicts for events that are
1009 * actually scheduled in (and hence have a valid RMID).
1010 */
1011 if (__conflict_event(iter, event) && __rmid_valid(rmid))
1012 conflict = true;
1013 }
1014
1015 if (conflict)
1016 rmid = INVALID_RMID;
1017 else
1018 rmid = __get_rmid();
1019
1020 if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
1021 init_mbm_sample(rmid, event->attr.config);
1022
1023 event->hw.cqm_rmid = rmid;
1024}
1025
1026static void intel_cqm_event_read(struct perf_event *event)
1027{
1028 unsigned long flags;
1029 u32 rmid;
1030 u64 val;
1031
1032 /*
1033 * Task events are handled by intel_cqm_event_count().
1034 */
1035 if (event->cpu == -1)
1036 return;
1037
1038 raw_spin_lock_irqsave(&cache_lock, flags);
1039 rmid = event->hw.cqm_rmid;
1040
1041 if (!__rmid_valid(rmid))
1042 goto out;
1043
1044 if (is_mbm_event(event->attr.config))
1045 val = rmid_read_mbm(rmid, event->attr.config);
1046 else
1047 val = __rmid_read(rmid);
1048
1049 /*
1050 * Ignore this reading on error states and do not update the value.
1051 */
1052 if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
1053 goto out;
1054
1055 local64_set(&event->count, val);
1056out:
1057 raw_spin_unlock_irqrestore(&cache_lock, flags);
1058}
1059
1060static void __intel_cqm_event_count(void *info)
1061{
1062 struct rmid_read *rr = info;
1063 u64 val;
1064
1065 val = __rmid_read(rr->rmid);
1066
1067 if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
1068 return;
1069
1070 atomic64_add(val, &rr->value);
1071}
1072
1073static inline bool cqm_group_leader(struct perf_event *event)
1074{
1075 return !list_empty(&event->hw.cqm_groups_entry);
1076}
1077
1078static void __intel_mbm_event_count(void *info)
1079{
1080 struct rmid_read *rr = info;
1081 u64 val;
1082
1083 val = rmid_read_mbm(rr->rmid, rr->evt_type);
1084 if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
1085 return;
1086 atomic64_add(val, &rr->value);
1087}
1088
1089static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer)
1090{
1091 struct perf_event *iter, *iter1;
1092 int ret = HRTIMER_RESTART;
1093 struct list_head *head;
1094 unsigned long flags;
1095 u32 grp_rmid;
1096
1097 /*
1098 * Need to cache_lock as the timer Event Select MSR reads
1099 * can race with the mbm/cqm count() and mbm_init() reads.
1100 */
1101 raw_spin_lock_irqsave(&cache_lock, flags);
1102
1103 if (list_empty(&cache_groups)) {
1104 ret = HRTIMER_NORESTART;
1105 goto out;
1106 }
1107
1108 list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
1109 grp_rmid = iter->hw.cqm_rmid;
1110 if (!__rmid_valid(grp_rmid))
1111 continue;
1112 if (is_mbm_event(iter->attr.config))
1113 update_sample(grp_rmid, iter->attr.config, 0);
1114
1115 head = &iter->hw.cqm_group_entry;
1116 if (list_empty(head))
1117 continue;
1118 list_for_each_entry(iter1, head, hw.cqm_group_entry) {
1119 if (!iter1->hw.is_group_event)
1120 break;
1121 if (is_mbm_event(iter1->attr.config))
1122 update_sample(iter1->hw.cqm_rmid,
1123 iter1->attr.config, 0);
1124 }
1125 }
1126
1127 hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME));
1128out:
1129 raw_spin_unlock_irqrestore(&cache_lock, flags);
1130
1131 return ret;
1132}
1133
1134static void __mbm_start_timer(void *info)
1135{
1136 hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME),
1137 HRTIMER_MODE_REL_PINNED);
1138}
1139
1140static void __mbm_stop_timer(void *info)
1141{
1142 hrtimer_cancel(&mbm_timers[pkg_id]);
1143}
1144
1145static void mbm_start_timers(void)
1146{
1147 on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1);
1148}
1149
1150static void mbm_stop_timers(void)
1151{
1152 on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1);
1153}
1154
1155static void mbm_hrtimer_init(void)
1156{
1157 struct hrtimer *hr;
1158 int i;
1159
1160 for (i = 0; i < mbm_socket_max; i++) {
1161 hr = &mbm_timers[i];
1162 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1163 hr->function = mbm_hrtimer_handle;
1164 }
1165}
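The per-package timers above exist so that every MBM-monitored RMID is sampled at least once per MBM_CTR_OVERFLOW_TIME, which means at most one hardware counter wrap can occur between two reads. A minimal sketch of the wrap-safe delta this makes possible (illustrative only, not code from this file; it assumes the driver's 24-bit MBM counter width):

	/* Hypothetical helper: wrap-safe delta of two raw IA32_QM_CTR samples,
	 * assuming a 24-bit counter and at most one wrap between reads. */
	#define MBM_CNTR_WIDTH	24

	static u64 mbm_delta(u64 cur_msr, u64 prev_msr)
	{
		int shift = 64 - MBM_CNTR_WIDTH;

		/* Shift out the status bits, subtract modulo 2^24, shift back. */
		return ((cur_msr << shift) - (prev_msr << shift)) >> shift;
	}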
1166
1167static u64 intel_cqm_event_count(struct perf_event *event)
1168{
1169 unsigned long flags;
1170 struct rmid_read rr = {
1171 .evt_type = event->attr.config,
1172 .value = ATOMIC64_INIT(0),
1173 };
1174
1175 /*
1176 * We only need to worry about task events. System-wide events
1177 * are handled like usual, i.e. entirely with
1178 * intel_cqm_event_read().
1179 */
1180 if (event->cpu != -1)
1181 return __perf_event_count(event);
1182
1183 /*
1184	 * Only the group leader gets to report values, except when there are
1185	 * multiple events in the same group and we still need to read the
1186	 * other events. This stops us reporting duplicate values to
1187	 * userspace, and gives us a clear rule for which task gets to
1188	 * report the values.
1189 *
1190 * Note that it is impossible to attribute these values to
1191 * specific packages - we forfeit that ability when we create
1192 * task events.
1193 */
1194 if (!cqm_group_leader(event) && !event->hw.is_group_event)
1195 return 0;
1196
1197 /*
1198 * Getting up-to-date values requires an SMP IPI which is not
1199 * possible if we're being called in interrupt context. Return
1200 * the cached values instead.
1201 */
1202 if (unlikely(in_interrupt()))
1203 goto out;
1204
1205 /*
1206 * Notice that we don't perform the reading of an RMID
1207 * atomically, because we can't hold a spin lock across the
1208 * IPIs.
1209 *
1210 * Speculatively perform the read, since @event might be
1211 * assigned a different (possibly invalid) RMID while we're
1212 * busying performing the IPI calls. It's therefore necessary to
1213 * check @event's RMID afterwards, and if it has changed,
1214 * discard the result of the read.
1215 */
1216 rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
1217
1218 if (!__rmid_valid(rr.rmid))
1219 goto out;
1220
1221 cqm_mask_call(&rr);
1222
1223 raw_spin_lock_irqsave(&cache_lock, flags);
1224 if (event->hw.cqm_rmid == rr.rmid)
1225 local64_set(&event->count, atomic64_read(&rr.value));
1226 raw_spin_unlock_irqrestore(&cache_lock, flags);
1227out:
1228 return __perf_event_count(event);
1229}
1230
1231static void intel_cqm_event_start(struct perf_event *event, int mode)
1232{
1233 struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
1234 u32 rmid = event->hw.cqm_rmid;
1235
1236 if (!(event->hw.cqm_state & PERF_HES_STOPPED))
1237 return;
1238
1239 event->hw.cqm_state &= ~PERF_HES_STOPPED;
1240
1241 if (state->rmid_usecnt++) {
1242 if (!WARN_ON_ONCE(state->rmid != rmid))
1243 return;
1244 } else {
1245 WARN_ON_ONCE(state->rmid);
1246 }
1247
1248 state->rmid = rmid;
1249 wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
1250}
1251
1252static void intel_cqm_event_stop(struct perf_event *event, int mode)
1253{
1254 struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
1255
1256 if (event->hw.cqm_state & PERF_HES_STOPPED)
1257 return;
1258
1259 event->hw.cqm_state |= PERF_HES_STOPPED;
1260
1261 intel_cqm_event_read(event);
1262
1263 if (!--state->rmid_usecnt) {
1264 state->rmid = 0;
1265 wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
1266 } else {
1267 WARN_ON_ONCE(!state->rmid);
1268 }
1269}
1270
1271static int intel_cqm_event_add(struct perf_event *event, int mode)
1272{
1273 unsigned long flags;
1274 u32 rmid;
1275
1276 raw_spin_lock_irqsave(&cache_lock, flags);
1277
1278 event->hw.cqm_state = PERF_HES_STOPPED;
1279 rmid = event->hw.cqm_rmid;
1280
1281 if (__rmid_valid(rmid) && (mode & PERF_EF_START))
1282 intel_cqm_event_start(event, mode);
1283
1284 raw_spin_unlock_irqrestore(&cache_lock, flags);
1285
1286 return 0;
1287}
1288
1289static void intel_cqm_event_destroy(struct perf_event *event)
1290{
1291 struct perf_event *group_other = NULL;
1292 unsigned long flags;
1293
1294 mutex_lock(&cache_mutex);
1295 /*
1296 * Hold the cache_lock as mbm timer handlers could be
1297 * scanning the list of events.
1298 */
1299 raw_spin_lock_irqsave(&cache_lock, flags);
1300
1301 /*
1302 * If there's another event in this group...
1303 */
1304 if (!list_empty(&event->hw.cqm_group_entry)) {
1305 group_other = list_first_entry(&event->hw.cqm_group_entry,
1306 struct perf_event,
1307 hw.cqm_group_entry);
1308 list_del(&event->hw.cqm_group_entry);
1309 }
1310
1311 /*
1312	 * And if we're the group leader...
1313 */
1314 if (cqm_group_leader(event)) {
1315 /*
1316 * If there was a group_other, make that leader, otherwise
1317 * destroy the group and return the RMID.
1318 */
1319 if (group_other) {
1320 list_replace(&event->hw.cqm_groups_entry,
1321 &group_other->hw.cqm_groups_entry);
1322 } else {
1323 u32 rmid = event->hw.cqm_rmid;
1324
1325 if (__rmid_valid(rmid))
1326 __put_rmid(rmid);
1327 list_del(&event->hw.cqm_groups_entry);
1328 }
1329 }
1330
1331 raw_spin_unlock_irqrestore(&cache_lock, flags);
1332
1333 /*
1334 * Stop the mbm overflow timers when the last event is destroyed.
1335 */
1336 if (mbm_enabled && list_empty(&cache_groups))
1337 mbm_stop_timers();
1338
1339 mutex_unlock(&cache_mutex);
1340}
1341
1342static int intel_cqm_event_init(struct perf_event *event)
1343{
1344 struct perf_event *group = NULL;
1345 bool rotate = false;
1346 unsigned long flags;
1347
1348 if (event->attr.type != intel_cqm_pmu.type)
1349 return -ENOENT;
1350
1351 if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) ||
1352 (event->attr.config > QOS_MBM_LOCAL_EVENT_ID))
1353 return -EINVAL;
1354
1355 if ((is_cqm_event(event->attr.config) && !cqm_enabled) ||
1356 (is_mbm_event(event->attr.config) && !mbm_enabled))
1357 return -EINVAL;
1358
1359 /* unsupported modes and filters */
1360 if (event->attr.exclude_user ||
1361 event->attr.exclude_kernel ||
1362 event->attr.exclude_hv ||
1363 event->attr.exclude_idle ||
1364 event->attr.exclude_host ||
1365 event->attr.exclude_guest ||
1366 event->attr.sample_period) /* no sampling */
1367 return -EINVAL;
1368
1369 INIT_LIST_HEAD(&event->hw.cqm_group_entry);
1370 INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
1371
1372 event->destroy = intel_cqm_event_destroy;
1373
1374 mutex_lock(&cache_mutex);
1375
1376 /*
1377 * Start the mbm overflow timers when the first event is created.
1378 */
1379 if (mbm_enabled && list_empty(&cache_groups))
1380 mbm_start_timers();
1381
1382 /* Will also set rmid */
1383 intel_cqm_setup_event(event, &group);
1384
1385 /*
1386	 * Hold the cache_lock as mbm timer handlers could be
1387 * scanning the list of events.
1388 */
1389 raw_spin_lock_irqsave(&cache_lock, flags);
1390
1391 if (group) {
1392 list_add_tail(&event->hw.cqm_group_entry,
1393 &group->hw.cqm_group_entry);
1394 } else {
1395 list_add_tail(&event->hw.cqm_groups_entry,
1396 &cache_groups);
1397
1398 /*
1399 * All RMIDs are either in use or have recently been
1400 * used. Kick the rotation worker to clean/free some.
1401 *
1402 * We only do this for the group leader, rather than for
1403 * every event in a group to save on needless work.
1404 */
1405 if (!__rmid_valid(event->hw.cqm_rmid))
1406 rotate = true;
1407 }
1408
1409 raw_spin_unlock_irqrestore(&cache_lock, flags);
1410 mutex_unlock(&cache_mutex);
1411
1412 if (rotate)
1413 schedule_delayed_work(&intel_cqm_rmid_work, 0);
1414
1415 return 0;
1416}
1417
1418EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
1419EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
1420EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
1421EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
1422EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
1423
1424EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02");
1425EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1");
1426EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB");
1427EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6");
1428
1429EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03");
1430EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1");
1431EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB");
1432EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6");
1433
1434static struct attribute *intel_cqm_events_attr[] = {
1435 EVENT_PTR(intel_cqm_llc),
1436 EVENT_PTR(intel_cqm_llc_pkg),
1437 EVENT_PTR(intel_cqm_llc_unit),
1438 EVENT_PTR(intel_cqm_llc_scale),
1439 EVENT_PTR(intel_cqm_llc_snapshot),
1440 NULL,
1441};
1442
1443static struct attribute *intel_mbm_events_attr[] = {
1444 EVENT_PTR(intel_cqm_total_bytes),
1445 EVENT_PTR(intel_cqm_local_bytes),
1446 EVENT_PTR(intel_cqm_total_bytes_pkg),
1447 EVENT_PTR(intel_cqm_local_bytes_pkg),
1448 EVENT_PTR(intel_cqm_total_bytes_unit),
1449 EVENT_PTR(intel_cqm_local_bytes_unit),
1450 EVENT_PTR(intel_cqm_total_bytes_scale),
1451 EVENT_PTR(intel_cqm_local_bytes_scale),
1452 NULL,
1453};
1454
1455static struct attribute *intel_cmt_mbm_events_attr[] = {
1456 EVENT_PTR(intel_cqm_llc),
1457 EVENT_PTR(intel_cqm_total_bytes),
1458 EVENT_PTR(intel_cqm_local_bytes),
1459 EVENT_PTR(intel_cqm_llc_pkg),
1460 EVENT_PTR(intel_cqm_total_bytes_pkg),
1461 EVENT_PTR(intel_cqm_local_bytes_pkg),
1462 EVENT_PTR(intel_cqm_llc_unit),
1463 EVENT_PTR(intel_cqm_total_bytes_unit),
1464 EVENT_PTR(intel_cqm_local_bytes_unit),
1465 EVENT_PTR(intel_cqm_llc_scale),
1466 EVENT_PTR(intel_cqm_total_bytes_scale),
1467 EVENT_PTR(intel_cqm_local_bytes_scale),
1468 EVENT_PTR(intel_cqm_llc_snapshot),
1469 NULL,
1470};
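The attribute strings above are exported through sysfs once the "intel_cqm" PMU below is registered, and are what perf tooling parses. As a rough userspace illustration (not part of this file; the sysfs path follows the standard event_source layout and error handling is trimmed), a task event for llc_occupancy could be opened like this:

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/types.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
				    int cpu, int group_fd, unsigned long flags)
	{
		return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
	}

	int main(void)
	{
		struct perf_event_attr attr;
		uint64_t count;
		int type, fd;
		FILE *f;

		/* PMU type id published for the registered "intel_cqm" PMU. */
		f = fopen("/sys/bus/event_source/devices/intel_cqm/type", "r");
		if (!f || fscanf(f, "%d", &type) != 1)
			return 1;
		fclose(f);

		memset(&attr, 0, sizeof(attr));
		attr.type = type;
		attr.size = sizeof(attr);
		attr.config = 0x01;	/* llc_occupancy, per "event=0x01" above */

		/* Task event on the current process, any CPU (cpu == -1). */
		fd = perf_event_open(&attr, 0, -1, -1, 0);
		if (fd < 0)
			return 1;

		sleep(1);
		if (read(fd, &count, sizeof(count)) == sizeof(count))
			printf("raw llc_occupancy: %llu (multiply by the sysfs scale)\n",
			       (unsigned long long)count);
		close(fd);
		return 0;
	}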
1471
1472static struct attribute_group intel_cqm_events_group = {
1473 .name = "events",
1474 .attrs = NULL,
1475};
1476
1477PMU_FORMAT_ATTR(event, "config:0-7");
1478static struct attribute *intel_cqm_formats_attr[] = {
1479 &format_attr_event.attr,
1480 NULL,
1481};
1482
1483static struct attribute_group intel_cqm_format_group = {
1484 .name = "format",
1485 .attrs = intel_cqm_formats_attr,
1486};
1487
1488static ssize_t
1489max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
1490 char *page)
1491{
1492 ssize_t rv;
1493
1494 mutex_lock(&cache_mutex);
1495 rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
1496 mutex_unlock(&cache_mutex);
1497
1498 return rv;
1499}
1500
1501static ssize_t
1502max_recycle_threshold_store(struct device *dev,
1503 struct device_attribute *attr,
1504 const char *buf, size_t count)
1505{
1506 unsigned int bytes, cachelines;
1507 int ret;
1508
1509 ret = kstrtouint(buf, 0, &bytes);
1510 if (ret)
1511 return ret;
1512
1513 mutex_lock(&cache_mutex);
1514
1515 __intel_cqm_max_threshold = bytes;
1516 cachelines = bytes / cqm_l3_scale;
1517
1518 /*
1519 * The new maximum takes effect immediately.
1520 */
1521 if (__intel_cqm_threshold > cachelines)
1522 __intel_cqm_threshold = cachelines;
1523
1524 mutex_unlock(&cache_mutex);
1525
1526 return count;
1527}
1528
1529static DEVICE_ATTR_RW(max_recycle_threshold);
1530
1531static struct attribute *intel_cqm_attrs[] = {
1532 &dev_attr_max_recycle_threshold.attr,
1533 NULL,
1534};
1535
1536static const struct attribute_group intel_cqm_group = {
1537 .attrs = intel_cqm_attrs,
1538};
1539
1540static const struct attribute_group *intel_cqm_attr_groups[] = {
1541 &intel_cqm_events_group,
1542 &intel_cqm_format_group,
1543 &intel_cqm_group,
1544 NULL,
1545};
1546
1547static struct pmu intel_cqm_pmu = {
1548 .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
1549 .attr_groups = intel_cqm_attr_groups,
1550 .task_ctx_nr = perf_sw_context,
1551 .event_init = intel_cqm_event_init,
1552 .add = intel_cqm_event_add,
1553 .del = intel_cqm_event_stop,
1554 .start = intel_cqm_event_start,
1555 .stop = intel_cqm_event_stop,
1556 .read = intel_cqm_event_read,
1557 .count = intel_cqm_event_count,
1558};
1559
1560static inline void cqm_pick_event_reader(int cpu)
1561{
1562 int reader;
1563
1564 /* First online cpu in package becomes the reader */
1565 reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu));
1566 if (reader >= nr_cpu_ids)
1567 cpumask_set_cpu(cpu, &cqm_cpumask);
1568}
1569
1570static int intel_cqm_cpu_starting(unsigned int cpu)
1571{
1572 struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
1573 struct cpuinfo_x86 *c = &cpu_data(cpu);
1574
1575 state->rmid = 0;
1576 state->closid = 0;
1577 state->rmid_usecnt = 0;
1578
1579 WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
1580 WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
1581
1582 cqm_pick_event_reader(cpu);
1583 return 0;
1584}
1585
1586static int intel_cqm_cpu_exit(unsigned int cpu)
1587{
1588 int target;
1589
1590 /* Is @cpu the current cqm reader for this package ? */
1591 if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
1592 return 0;
1593
1594 /* Find another online reader in this package */
1595 target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
1596
1597 if (target < nr_cpu_ids)
1598 cpumask_set_cpu(target, &cqm_cpumask);
1599
1600 return 0;
1601}
1602
1603static const struct x86_cpu_id intel_cqm_match[] = {
1604 { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
1605 {}
1606};
1607
1608static void mbm_cleanup(void)
1609{
1610 if (!mbm_enabled)
1611 return;
1612
1613 kfree(mbm_local);
1614 kfree(mbm_total);
1615 mbm_enabled = false;
1616}
1617
1618static const struct x86_cpu_id intel_mbm_local_match[] = {
1619 { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
1620 {}
1621};
1622
1623static const struct x86_cpu_id intel_mbm_total_match[] = {
1624 { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL },
1625 {}
1626};
1627
1628static int intel_mbm_init(void)
1629{
1630 int ret = 0, array_size, maxid = cqm_max_rmid + 1;
1631
1632 mbm_socket_max = topology_max_packages();
1633 array_size = sizeof(struct sample) * maxid * mbm_socket_max;
1634 mbm_local = kmalloc(array_size, GFP_KERNEL);
1635 if (!mbm_local)
1636 return -ENOMEM;
1637
1638 mbm_total = kmalloc(array_size, GFP_KERNEL);
1639 if (!mbm_total) {
1640 ret = -ENOMEM;
1641 goto out;
1642 }
1643
1644 array_size = sizeof(struct hrtimer) * mbm_socket_max;
1645 mbm_timers = kmalloc(array_size, GFP_KERNEL);
1646 if (!mbm_timers) {
1647 ret = -ENOMEM;
1648 goto out;
1649 }
1650 mbm_hrtimer_init();
1651
1652out:
1653 if (ret)
1654 mbm_cleanup();
1655
1656 return ret;
1657}
1658
1659static int __init intel_cqm_init(void)
1660{
1661 char *str = NULL, scale[20];
1662 int cpu, ret;
1663
1664 if (x86_match_cpu(intel_cqm_match))
1665 cqm_enabled = true;
1666
1667 if (x86_match_cpu(intel_mbm_local_match) &&
1668 x86_match_cpu(intel_mbm_total_match))
1669 mbm_enabled = true;
1670
1671 if (!cqm_enabled && !mbm_enabled)
1672 return -ENODEV;
1673
1674 cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
1675
1676 /*
1677 * It's possible that not all resources support the same number
1678 * of RMIDs. Instead of making scheduling much more complicated
1679 * (where we have to match a task's RMID to a cpu that supports
1680	 * that many RMIDs) just find the minimum number of RMIDs supported
1681 * all cpus.
1682 *
1683 * Also, check that the scales match on all cpus.
1684 */
1685 cpus_read_lock();
1686 for_each_online_cpu(cpu) {
1687 struct cpuinfo_x86 *c = &cpu_data(cpu);
1688
1689 if (c->x86_cache_max_rmid < cqm_max_rmid)
1690 cqm_max_rmid = c->x86_cache_max_rmid;
1691
1692 if (c->x86_cache_occ_scale != cqm_l3_scale) {
1693 pr_err("Multiple LLC scale values, disabling\n");
1694 ret = -EINVAL;
1695 goto out;
1696 }
1697 }
1698
1699 /*
1700 * A reasonable upper limit on the max threshold is the number
1701 * of lines tagged per RMID if all RMIDs have the same number of
1702 * lines tagged in the LLC.
1703 *
1704 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
1705 */
1706 __intel_cqm_max_threshold =
1707 boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
1708
1709 snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
1710 str = kstrdup(scale, GFP_KERNEL);
1711 if (!str) {
1712 ret = -ENOMEM;
1713 goto out;
1714 }
1715
1716 event_attr_intel_cqm_llc_scale.event_str = str;
1717
1718 ret = intel_cqm_setup_rmid_cache();
1719 if (ret)
1720 goto out;
1721
1722 if (mbm_enabled)
1723 ret = intel_mbm_init();
1724 if (ret && !cqm_enabled)
1725 goto out;
1726
1727 if (cqm_enabled && mbm_enabled)
1728 intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr;
1729 else if (!cqm_enabled && mbm_enabled)
1730 intel_cqm_events_group.attrs = intel_mbm_events_attr;
1731 else if (cqm_enabled && !mbm_enabled)
1732 intel_cqm_events_group.attrs = intel_cqm_events_attr;
1733
1734 ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
1735 if (ret) {
1736 pr_err("Intel CQM perf registration failed: %d\n", ret);
1737 goto out;
1738 }
1739
1740 if (cqm_enabled)
1741 pr_info("Intel CQM monitoring enabled\n");
1742 if (mbm_enabled)
1743 pr_info("Intel MBM enabled\n");
1744
1745 /*
1746 * Setup the hot cpu notifier once we are sure cqm
1747 * is enabled to avoid notifier leak.
1748 */
1749 cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING,
1750 "perf/x86/cqm:starting",
1751 intel_cqm_cpu_starting, NULL);
1752 cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE,
1753 "perf/x86/cqm:online",
1754 NULL, intel_cqm_cpu_exit);
1755out:
1756 cpus_read_unlock();
1757
1758 if (ret) {
1759 kfree(str);
1760 cqm_cleanup();
1761 mbm_cleanup();
1762 }
1763
1764 return ret;
1765}
1766device_initcall(intel_cqm_init);
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
deleted file mode 100644
index 597dc4995678..000000000000
--- a/arch/x86/include/asm/intel_rdt.h
+++ /dev/null
@@ -1,286 +0,0 @@
1#ifndef _ASM_X86_INTEL_RDT_H
2#define _ASM_X86_INTEL_RDT_H
3
4#ifdef CONFIG_INTEL_RDT_A
5
6#include <linux/sched.h>
7#include <linux/kernfs.h>
8#include <linux/jump_label.h>
9
10#include <asm/intel_rdt_common.h>
11
12#define IA32_L3_QOS_CFG 0xc81
13#define IA32_L3_CBM_BASE 0xc90
14#define IA32_L2_CBM_BASE 0xd10
15#define IA32_MBA_THRTL_BASE 0xd50
16
17#define L3_QOS_CDP_ENABLE 0x01ULL
18
19/**
20 * struct rdtgroup - store rdtgroup's data in resctrl file system.
21 * @kn: kernfs node
22 * @rdtgroup_list: linked list for all rdtgroups
23 * @closid: closid for this rdtgroup
24 * @cpu_mask: CPUs assigned to this rdtgroup
25 * @flags: status bits
26 * @waitcount: how many cpus expect to find this
27 * group when they acquire rdtgroup_mutex
28 */
29struct rdtgroup {
30 struct kernfs_node *kn;
31 struct list_head rdtgroup_list;
32 int closid;
33 struct cpumask cpu_mask;
34 int flags;
35 atomic_t waitcount;
36};
37
38/* rdtgroup.flags */
39#define RDT_DELETED 1
40
41/* rftype.flags */
42#define RFTYPE_FLAGS_CPUS_LIST 1
43
44/* List of all resource groups */
45extern struct list_head rdt_all_groups;
46
47extern int max_name_width, max_data_width;
48
49int __init rdtgroup_init(void);
50
51/**
52 * struct rftype - describe each file in the resctrl file system
53 * @name: File name
54 * @mode: Access mode
55 * @kf_ops: File operations
56 * @flags: File specific RFTYPE_FLAGS_* flags
57 * @seq_show: Show content of the file
58 * @write: Write to the file
59 */
60struct rftype {
61 char *name;
62 umode_t mode;
63 struct kernfs_ops *kf_ops;
64 unsigned long flags;
65
66 int (*seq_show)(struct kernfs_open_file *of,
67 struct seq_file *sf, void *v);
68 /*
69 * write() is the generic write callback which maps directly to
70 * kernfs write operation and overrides all other operations.
71 * Maximum write size is determined by ->max_write_len.
72 */
73 ssize_t (*write)(struct kernfs_open_file *of,
74 char *buf, size_t nbytes, loff_t off);
75};
76
77/**
78 * struct rdt_domain - group of cpus sharing an RDT resource
79 * @list: all instances of this resource
80 * @id: unique id for this instance
81 * @cpu_mask: which cpus share this resource
82 * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
83 * @new_ctrl: new ctrl value to be loaded
84 * @have_new_ctrl: did user provide new_ctrl for this domain
85 */
86struct rdt_domain {
87 struct list_head list;
88 int id;
89 struct cpumask cpu_mask;
90 u32 *ctrl_val;
91 u32 new_ctrl;
92 bool have_new_ctrl;
93};
94
95/**
96 * struct msr_param - set a range of MSRs from a domain
97 * @res: The resource to use
98 * @low: Beginning index from base MSR
99 * @high: End index
100 */
101struct msr_param {
102 struct rdt_resource *res;
103 int low;
104 int high;
105};
106
107/**
108 * struct rdt_cache - Cache allocation related data
109 * @cbm_len: Length of the cache bit mask
110 * @min_cbm_bits: Minimum number of consecutive bits to be set
111 * @cbm_idx_mult: Multiplier of CBM index
112 * @cbm_idx_offset: Offset of CBM index. CBM index is computed by:
113 *		closid * cbm_idx_mult + cbm_idx_offset
114 * in a cache bit mask
115 */
116struct rdt_cache {
117 unsigned int cbm_len;
118 unsigned int min_cbm_bits;
119 unsigned int cbm_idx_mult;
120 unsigned int cbm_idx_offset;
121};
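A quick worked example of the index formula documented above (illustrative values; the actual computation lives in cbm_idx() in intel_rdt.c):

	/* cbm_idx = closid * cbm_idx_mult + cbm_idx_offset
	 * e.g. cbm_idx_mult = 2, cbm_idx_offset = 1:
	 *   CLOSID 0 -> msr_base + 1, CLOSID 1 -> msr_base + 3, ...
	 * while the sibling resource with offset 0 maps to msr_base + 2n,
	 * which is how CDP interleaves data and code bitmasks in the same
	 * IA32_L3_CBM_BASE range. */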
122
123/**
124 * struct rdt_membw - Memory bandwidth allocation related data
125 * @max_delay: Max throttle delay. Delay is the hardware
126 * representation for memory bandwidth.
127 * @min_bw: Minimum memory bandwidth percentage user can request
128 * @bw_gran: Granularity at which the memory bandwidth is allocated
129 * @delay_linear: True if memory B/W delay is in linear scale
130 * @mb_map: Mapping of memory B/W percentage to memory B/W delay
131 */
132struct rdt_membw {
133 u32 max_delay;
134 u32 min_bw;
135 u32 bw_gran;
136 u32 delay_linear;
137 u32 *mb_map;
138};
139
140/**
141 * struct rdt_resource - attributes of an RDT resource
142 * @enabled: Is this feature enabled on this machine
143 * @capable: Is this feature available on this machine
144 * @name: Name to use in "schemata" file
145 * @num_closid: Number of CLOSIDs available
146 * @cache_level: Which cache level defines scope of this resource
147 * @default_ctrl: Specifies default cache cbm or memory B/W percent.
148 * @msr_base: Base MSR address for CBMs
149 * @msr_update: Function pointer to update QOS MSRs
150 * @data_width: Character width of data when displaying
151 * @domains: All domains for this resource
152 * @cache: Cache allocation related data
153 * @info_files: resctrl info files for the resource
154 * @nr_info_files: Number of info files
155 * @format_str: Per resource format string to show domain value
156 * @parse_ctrlval: Per resource function pointer to parse control values
157 */
158struct rdt_resource {
159 bool enabled;
160 bool capable;
161 char *name;
162 int num_closid;
163 int cache_level;
164 u32 default_ctrl;
165 unsigned int msr_base;
166 void (*msr_update) (struct rdt_domain *d, struct msr_param *m,
167 struct rdt_resource *r);
168 int data_width;
169 struct list_head domains;
170 struct rdt_cache cache;
171 struct rdt_membw membw;
172 struct rftype *info_files;
173 int nr_info_files;
174 const char *format_str;
175 int (*parse_ctrlval) (char *buf, struct rdt_resource *r,
176 struct rdt_domain *d);
177};
178
179void rdt_get_cache_infofile(struct rdt_resource *r);
180void rdt_get_mba_infofile(struct rdt_resource *r);
181int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d);
182int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d);
183
184extern struct mutex rdtgroup_mutex;
185
186extern struct rdt_resource rdt_resources_all[];
187extern struct rdtgroup rdtgroup_default;
188DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
189
190int __init rdtgroup_init(void);
191
192enum {
193 RDT_RESOURCE_L3,
194 RDT_RESOURCE_L3DATA,
195 RDT_RESOURCE_L3CODE,
196 RDT_RESOURCE_L2,
197 RDT_RESOURCE_MBA,
198
199 /* Must be the last */
200 RDT_NUM_RESOURCES,
201};
202
203#define for_each_capable_rdt_resource(r) \
204 for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
205 r++) \
206 if (r->capable)
207
208#define for_each_enabled_rdt_resource(r) \
209 for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
210 r++) \
211 if (r->enabled)
212
213/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
214union cpuid_0x10_1_eax {
215 struct {
216 unsigned int cbm_len:5;
217 } split;
218 unsigned int full;
219};
220
221/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
222union cpuid_0x10_3_eax {
223 struct {
224 unsigned int max_delay:12;
225 } split;
226 unsigned int full;
227};
228
229/* CPUID.(EAX=10H, ECX=ResID).EDX */
230union cpuid_0x10_x_edx {
231 struct {
232 unsigned int cos_max:16;
233 } split;
234 unsigned int full;
235};
236
237DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid);
238
239void rdt_ctrl_update(void *arg);
240struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
241void rdtgroup_kn_unlock(struct kernfs_node *kn);
242ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
243 char *buf, size_t nbytes, loff_t off);
244int rdtgroup_schemata_show(struct kernfs_open_file *of,
245 struct seq_file *s, void *v);
246
247/*
248 * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
249 *
250 * The following considerations are made so that this has minimal impact
251 * on scheduler hot path:
252 * - This will stay as no-op unless we are running on an Intel SKU
253 * which supports resource control and we enable by mounting the
254 * resctrl file system.
255 * - Caches the per cpu CLOSid values and does the MSR write only
256 * when a task with a different CLOSid is scheduled in.
257 *
258 * Must be called with preemption disabled.
259 */
260static inline void intel_rdt_sched_in(void)
261{
262 if (static_branch_likely(&rdt_enable_key)) {
263 struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
264 int closid;
265
266 /*
267 * If this task has a closid assigned, use it.
268 * Else use the closid assigned to this cpu.
269 */
270 closid = current->closid;
271 if (closid == 0)
272 closid = this_cpu_read(cpu_closid);
273
274 if (closid != state->closid) {
275 state->closid = closid;
276 wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
277 }
278 }
279}
280
281#else
282
283static inline void intel_rdt_sched_in(void) {}
284
285#endif /* CONFIG_INTEL_RDT_A */
286#endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h
deleted file mode 100644
index b31081b89407..000000000000
--- a/arch/x86/include/asm/intel_rdt_common.h
+++ /dev/null
@@ -1,27 +0,0 @@
1#ifndef _ASM_X86_INTEL_RDT_COMMON_H
2#define _ASM_X86_INTEL_RDT_COMMON_H
3
4#define MSR_IA32_PQR_ASSOC 0x0c8f
5
6/**
7 * struct intel_pqr_state - State cache for the PQR MSR
8 * @rmid: The cached Resource Monitoring ID
9 * @closid: The cached Class Of Service ID
10 * @rmid_usecnt: The usage counter for rmid
11 *
12 * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
13 * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
14 * contains both parts, so we need to cache them.
15 *
16 * The cache also helps to avoid pointless updates if the value does
17 * not change.
18 */
19struct intel_pqr_state {
20 u32 rmid;
21 u32 closid;
22 int rmid_usecnt;
23};
24
25DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
26
27#endif /* _ASM_X86_INTEL_RDT_COMMON_H */
diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h
new file mode 100644
index 000000000000..b4bbf8b21512
--- /dev/null
+++ b/arch/x86/include/asm/intel_rdt_sched.h
@@ -0,0 +1,92 @@
1#ifndef _ASM_X86_INTEL_RDT_SCHED_H
2#define _ASM_X86_INTEL_RDT_SCHED_H
3
4#ifdef CONFIG_INTEL_RDT
5
6#include <linux/sched.h>
7#include <linux/jump_label.h>
8
9#define IA32_PQR_ASSOC 0x0c8f
10
11/**
12 * struct intel_pqr_state - State cache for the PQR MSR
13 * @cur_rmid: The cached Resource Monitoring ID
14 * @cur_closid: The cached Class Of Service ID
15 * @default_rmid: The user assigned Resource Monitoring ID
16 * @default_closid: The user assigned cached Class Of Service ID
17 *
18 * The upper 32 bits of IA32_PQR_ASSOC contain closid and the
19 * lower 10 bits rmid. The update to IA32_PQR_ASSOC always
20 * contains both parts, so we need to cache them. This also
21 * stores the user configured per cpu CLOSID and RMID.
22 *
23 * The cache also helps to avoid pointless updates if the value does
24 * not change.
25 */
26struct intel_pqr_state {
27 u32 cur_rmid;
28 u32 cur_closid;
29 u32 default_rmid;
30 u32 default_closid;
31};
32
33DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
34
35DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
36DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
37DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
38
39/*
40 * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
41 *
42 * The following considerations are made so that this has minimal impact
43 * on scheduler hot path:
44 * - This will stay as no-op unless we are running on an Intel SKU
45 * which supports resource control or monitoring and we enable by
46 * mounting the resctrl file system.
47 * - Caches the per cpu CLOSid/RMID values and does the MSR write only
48 * when a task with a different CLOSid/RMID is scheduled in.
49 * - We allocate RMIDs/CLOSids globally in order to keep this as
50 * simple as possible.
51 * Must be called with preemption disabled.
52 */
53static void __intel_rdt_sched_in(void)
54{
55 struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
56 u32 closid = state->default_closid;
57 u32 rmid = state->default_rmid;
58
59 /*
60 * If this task has a closid/rmid assigned, use it.
61 * Else use the closid/rmid assigned to this cpu.
62 */
63 if (static_branch_likely(&rdt_alloc_enable_key)) {
64 if (current->closid)
65 closid = current->closid;
66 }
67
68 if (static_branch_likely(&rdt_mon_enable_key)) {
69 if (current->rmid)
70 rmid = current->rmid;
71 }
72
73 if (closid != state->cur_closid || rmid != state->cur_rmid) {
74 state->cur_closid = closid;
75 state->cur_rmid = rmid;
76 wrmsr(IA32_PQR_ASSOC, rmid, closid);
77 }
78}
79
80static inline void intel_rdt_sched_in(void)
81{
82 if (static_branch_likely(&rdt_enable_key))
83 __intel_rdt_sched_in();
84}
85
86#else
87
88static inline void intel_rdt_sched_in(void) {}
89
90#endif /* CONFIG_INTEL_RDT */
91
92#endif /* _ASM_X86_INTEL_RDT_SCHED_H */
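For reference, the wrmsr() in __intel_rdt_sched_in() above composes the 64-bit IA32_PQR_ASSOC value described in the struct comment: RMID in the low bits, CLOSID in the upper 32 bits. A minimal sketch of that packing (illustrative only, not part of the header):

	/* wrmsr(msr, lo, hi) writes EDX:EAX, so the MSR ends up holding: */
	static inline u64 pqr_assoc_val(u32 rmid, u32 closid)
	{
		return ((u64)closid << 32) | rmid;
	}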
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index cdf82492b770..e17942c131c8 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
33obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 33obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
34obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 34obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
35 35
36obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o 36obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o
37 37
38obj-$(CONFIG_X86_MCE) += mcheck/ 38obj-$(CONFIG_X86_MCE) += mcheck/
39obj-$(CONFIG_MTRR) += mtrr/ 39obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 5b366462f579..cd5fc61ba450 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -30,7 +30,8 @@
30#include <linux/cpuhotplug.h> 30#include <linux/cpuhotplug.h>
31 31
32#include <asm/intel-family.h> 32#include <asm/intel-family.h>
33#include <asm/intel_rdt.h> 33#include <asm/intel_rdt_sched.h>
34#include "intel_rdt.h"
34 35
35#define MAX_MBA_BW 100u 36#define MAX_MBA_BW 100u
36#define MBA_IS_LINEAR 0x4 37#define MBA_IS_LINEAR 0x4
@@ -38,7 +39,13 @@
38/* Mutex to protect rdtgroup access. */ 39/* Mutex to protect rdtgroup access. */
39DEFINE_MUTEX(rdtgroup_mutex); 40DEFINE_MUTEX(rdtgroup_mutex);
40 41
41DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); 42/*
43 * The cached intel_pqr_state is strictly per CPU and can never be
44 * updated from a remote CPU. Functions which modify the state
45 * are called with interrupts disabled and no preemption, which
46 * is sufficient for the protection.
47 */
48DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
42 49
43/* 50/*
44 * Used to store the max resource name width and max resource data width 51 * Used to store the max resource name width and max resource data width
@@ -46,6 +53,12 @@ DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
46 */ 53 */
47int max_name_width, max_data_width; 54int max_name_width, max_data_width;
48 55
56/*
57 * Global boolean for rdt_alloc which is true if any
58 * resource allocation is enabled.
59 */
60bool rdt_alloc_capable;
61
49static void 62static void
50mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); 63mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
51static void 64static void
@@ -54,7 +67,9 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
54#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) 67#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains)
55 68
56struct rdt_resource rdt_resources_all[] = { 69struct rdt_resource rdt_resources_all[] = {
70 [RDT_RESOURCE_L3] =
57 { 71 {
72 .rid = RDT_RESOURCE_L3,
58 .name = "L3", 73 .name = "L3",
59 .domains = domain_init(RDT_RESOURCE_L3), 74 .domains = domain_init(RDT_RESOURCE_L3),
60 .msr_base = IA32_L3_CBM_BASE, 75 .msr_base = IA32_L3_CBM_BASE,
@@ -67,8 +82,11 @@ struct rdt_resource rdt_resources_all[] = {
67 }, 82 },
68 .parse_ctrlval = parse_cbm, 83 .parse_ctrlval = parse_cbm,
69 .format_str = "%d=%0*x", 84 .format_str = "%d=%0*x",
85 .fflags = RFTYPE_RES_CACHE,
70 }, 86 },
87 [RDT_RESOURCE_L3DATA] =
71 { 88 {
89 .rid = RDT_RESOURCE_L3DATA,
72 .name = "L3DATA", 90 .name = "L3DATA",
73 .domains = domain_init(RDT_RESOURCE_L3DATA), 91 .domains = domain_init(RDT_RESOURCE_L3DATA),
74 .msr_base = IA32_L3_CBM_BASE, 92 .msr_base = IA32_L3_CBM_BASE,
@@ -81,8 +99,11 @@ struct rdt_resource rdt_resources_all[] = {
81 }, 99 },
82 .parse_ctrlval = parse_cbm, 100 .parse_ctrlval = parse_cbm,
83 .format_str = "%d=%0*x", 101 .format_str = "%d=%0*x",
102 .fflags = RFTYPE_RES_CACHE,
84 }, 103 },
104 [RDT_RESOURCE_L3CODE] =
85 { 105 {
106 .rid = RDT_RESOURCE_L3CODE,
86 .name = "L3CODE", 107 .name = "L3CODE",
87 .domains = domain_init(RDT_RESOURCE_L3CODE), 108 .domains = domain_init(RDT_RESOURCE_L3CODE),
88 .msr_base = IA32_L3_CBM_BASE, 109 .msr_base = IA32_L3_CBM_BASE,
@@ -95,8 +116,11 @@ struct rdt_resource rdt_resources_all[] = {
95 }, 116 },
96 .parse_ctrlval = parse_cbm, 117 .parse_ctrlval = parse_cbm,
97 .format_str = "%d=%0*x", 118 .format_str = "%d=%0*x",
119 .fflags = RFTYPE_RES_CACHE,
98 }, 120 },
121 [RDT_RESOURCE_L2] =
99 { 122 {
123 .rid = RDT_RESOURCE_L2,
100 .name = "L2", 124 .name = "L2",
101 .domains = domain_init(RDT_RESOURCE_L2), 125 .domains = domain_init(RDT_RESOURCE_L2),
102 .msr_base = IA32_L2_CBM_BASE, 126 .msr_base = IA32_L2_CBM_BASE,
@@ -109,8 +133,11 @@ struct rdt_resource rdt_resources_all[] = {
109 }, 133 },
110 .parse_ctrlval = parse_cbm, 134 .parse_ctrlval = parse_cbm,
111 .format_str = "%d=%0*x", 135 .format_str = "%d=%0*x",
136 .fflags = RFTYPE_RES_CACHE,
112 }, 137 },
138 [RDT_RESOURCE_MBA] =
113 { 139 {
140 .rid = RDT_RESOURCE_MBA,
114 .name = "MB", 141 .name = "MB",
115 .domains = domain_init(RDT_RESOURCE_MBA), 142 .domains = domain_init(RDT_RESOURCE_MBA),
116 .msr_base = IA32_MBA_THRTL_BASE, 143 .msr_base = IA32_MBA_THRTL_BASE,
@@ -118,6 +145,7 @@ struct rdt_resource rdt_resources_all[] = {
118 .cache_level = 3, 145 .cache_level = 3,
119 .parse_ctrlval = parse_bw, 146 .parse_ctrlval = parse_bw,
120 .format_str = "%d=%*d", 147 .format_str = "%d=%*d",
148 .fflags = RFTYPE_RES_MB,
121 }, 149 },
122}; 150};
123 151
@@ -144,33 +172,28 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid)
144 * is always 20 on hsw server parts. The minimum cache bitmask length 172 * is always 20 on hsw server parts. The minimum cache bitmask length
145 * allowed for HSW server is always 2 bits. Hardcode all of them. 173 * allowed for HSW server is always 2 bits. Hardcode all of them.
146 */ 174 */
147static inline bool cache_alloc_hsw_probe(void) 175static inline void cache_alloc_hsw_probe(void)
148{ 176{
149 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && 177 struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
150 boot_cpu_data.x86 == 6 && 178 u32 l, h, max_cbm = BIT_MASK(20) - 1;
151 boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) {
152 struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
153 u32 l, h, max_cbm = BIT_MASK(20) - 1;
154
155 if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
156 return false;
157 rdmsr(IA32_L3_CBM_BASE, l, h);
158 179
159 /* If all the bits were set in MSR, return success */ 180 if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
160 if (l != max_cbm) 181 return;
161 return false; 182 rdmsr(IA32_L3_CBM_BASE, l, h);
162 183
163 r->num_closid = 4; 184 /* If all the bits were set in MSR, return success */
164 r->default_ctrl = max_cbm; 185 if (l != max_cbm)
165 r->cache.cbm_len = 20; 186 return;
166 r->cache.min_cbm_bits = 2;
167 r->capable = true;
168 r->enabled = true;
169 187
170 return true; 188 r->num_closid = 4;
171 } 189 r->default_ctrl = max_cbm;
190 r->cache.cbm_len = 20;
191 r->cache.shareable_bits = 0xc0000;
192 r->cache.min_cbm_bits = 2;
193 r->alloc_capable = true;
194 r->alloc_enabled = true;
172 195
173 return false; 196 rdt_alloc_capable = true;
174} 197}
175 198
176/* 199/*
@@ -213,15 +236,14 @@ static bool rdt_get_mem_config(struct rdt_resource *r)
213 return false; 236 return false;
214 } 237 }
215 r->data_width = 3; 238 r->data_width = 3;
216 rdt_get_mba_infofile(r);
217 239
218 r->capable = true; 240 r->alloc_capable = true;
219 r->enabled = true; 241 r->alloc_enabled = true;
220 242
221 return true; 243 return true;
222} 244}
223 245
224static void rdt_get_cache_config(int idx, struct rdt_resource *r) 246static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
225{ 247{
226 union cpuid_0x10_1_eax eax; 248 union cpuid_0x10_1_eax eax;
227 union cpuid_0x10_x_edx edx; 249 union cpuid_0x10_x_edx edx;
@@ -231,10 +253,10 @@ static void rdt_get_cache_config(int idx, struct rdt_resource *r)
231 r->num_closid = edx.split.cos_max + 1; 253 r->num_closid = edx.split.cos_max + 1;
232 r->cache.cbm_len = eax.split.cbm_len + 1; 254 r->cache.cbm_len = eax.split.cbm_len + 1;
233 r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; 255 r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
256 r->cache.shareable_bits = ebx & r->default_ctrl;
234 r->data_width = (r->cache.cbm_len + 3) / 4; 257 r->data_width = (r->cache.cbm_len + 3) / 4;
235 rdt_get_cache_infofile(r); 258 r->alloc_capable = true;
236 r->capable = true; 259 r->alloc_enabled = true;
237 r->enabled = true;
238} 260}
239 261
240static void rdt_get_cdp_l3_config(int type) 262static void rdt_get_cdp_l3_config(int type)
@@ -246,12 +268,12 @@ static void rdt_get_cdp_l3_config(int type)
246 r->cache.cbm_len = r_l3->cache.cbm_len; 268 r->cache.cbm_len = r_l3->cache.cbm_len;
247 r->default_ctrl = r_l3->default_ctrl; 269 r->default_ctrl = r_l3->default_ctrl;
248 r->data_width = (r->cache.cbm_len + 3) / 4; 270 r->data_width = (r->cache.cbm_len + 3) / 4;
249 r->capable = true; 271 r->alloc_capable = true;
250 /* 272 /*
251 * By default, CDP is disabled. CDP can be enabled by mount parameter 273 * By default, CDP is disabled. CDP can be enabled by mount parameter
252 * "cdp" during resctrl file system mount time. 274 * "cdp" during resctrl file system mount time.
253 */ 275 */
254 r->enabled = false; 276 r->alloc_enabled = false;
255} 277}
256 278
257static int get_cache_id(int cpu, int level) 279static int get_cache_id(int cpu, int level)
@@ -300,6 +322,19 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
300 wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); 322 wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]);
301} 323}
302 324
325struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
326{
327 struct rdt_domain *d;
328
329 list_for_each_entry(d, &r->domains, list) {
330 /* Find the domain that contains this CPU */
331 if (cpumask_test_cpu(cpu, &d->cpu_mask))
332 return d;
333 }
334
335 return NULL;
336}
337
303void rdt_ctrl_update(void *arg) 338void rdt_ctrl_update(void *arg)
304{ 339{
305 struct msr_param *m = arg; 340 struct msr_param *m = arg;
@@ -307,12 +342,10 @@ void rdt_ctrl_update(void *arg)
307 int cpu = smp_processor_id(); 342 int cpu = smp_processor_id();
308 struct rdt_domain *d; 343 struct rdt_domain *d;
309 344
310 list_for_each_entry(d, &r->domains, list) { 345 d = get_domain_from_cpu(cpu, r);
311 /* Find the domain that contains this CPU */ 346 if (d) {
312 if (cpumask_test_cpu(cpu, &d->cpu_mask)) { 347 r->msr_update(d, m, r);
313 r->msr_update(d, m, r); 348 return;
314 return;
315 }
316 } 349 }
317 pr_warn_once("cpu %d not found in any domain for resource %s\n", 350 pr_warn_once("cpu %d not found in any domain for resource %s\n",
318 cpu, r->name); 351 cpu, r->name);
@@ -326,8 +359,8 @@ void rdt_ctrl_update(void *arg)
326 * caller, return the first domain whose id is bigger than the input id. 359 * caller, return the first domain whose id is bigger than the input id.
327 * The domain list is sorted by id in ascending order. 360 * The domain list is sorted by id in ascending order.
328 */ 361 */
329static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, 362struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
330 struct list_head **pos) 363 struct list_head **pos)
331{ 364{
332 struct rdt_domain *d; 365 struct rdt_domain *d;
333 struct list_head *l; 366 struct list_head *l;
@@ -377,6 +410,44 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
377 return 0; 410 return 0;
378} 411}
379 412
413static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
414{
415 size_t tsize;
416
417 if (is_llc_occupancy_enabled()) {
418 d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid),
419 sizeof(unsigned long),
420 GFP_KERNEL);
421 if (!d->rmid_busy_llc)
422 return -ENOMEM;
423 INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
424 }
425 if (is_mbm_total_enabled()) {
426 tsize = sizeof(*d->mbm_total);
427 d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
428 if (!d->mbm_total) {
429 kfree(d->rmid_busy_llc);
430 return -ENOMEM;
431 }
432 }
433 if (is_mbm_local_enabled()) {
434 tsize = sizeof(*d->mbm_local);
435 d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
436 if (!d->mbm_local) {
437 kfree(d->rmid_busy_llc);
438 kfree(d->mbm_total);
439 return -ENOMEM;
440 }
441 }
442
443 if (is_mbm_enabled()) {
444 INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
445 mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL);
446 }
447
448 return 0;
449}
450
380/* 451/*
381 * domain_add_cpu - Add a cpu to a resource's domain list. 452 * domain_add_cpu - Add a cpu to a resource's domain list.
382 * 453 *
@@ -412,14 +483,26 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
412 return; 483 return;
413 484
414 d->id = id; 485 d->id = id;
486 cpumask_set_cpu(cpu, &d->cpu_mask);
415 487
416 if (domain_setup_ctrlval(r, d)) { 488 if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
489 kfree(d);
490 return;
491 }
492
493 if (r->mon_capable && domain_setup_mon_state(r, d)) {
417 kfree(d); 494 kfree(d);
418 return; 495 return;
419 } 496 }
420 497
421 cpumask_set_cpu(cpu, &d->cpu_mask);
422 list_add_tail(&d->list, add_pos); 498 list_add_tail(&d->list, add_pos);
499
500 /*
501 * If resctrl is mounted, add
502 * per domain monitor data directories.
503 */
504 if (static_branch_unlikely(&rdt_mon_enable_key))
505 mkdir_mondata_subdir_allrdtgrp(r, d);
423} 506}
424 507
425static void domain_remove_cpu(int cpu, struct rdt_resource *r) 508static void domain_remove_cpu(int cpu, struct rdt_resource *r)
@@ -435,19 +518,58 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
435 518
436 cpumask_clear_cpu(cpu, &d->cpu_mask); 519 cpumask_clear_cpu(cpu, &d->cpu_mask);
437 if (cpumask_empty(&d->cpu_mask)) { 520 if (cpumask_empty(&d->cpu_mask)) {
521 /*
522 * If resctrl is mounted, remove all the
523 * per domain monitor data directories.
524 */
525 if (static_branch_unlikely(&rdt_mon_enable_key))
526 rmdir_mondata_subdir_allrdtgrp(r, d->id);
438 kfree(d->ctrl_val); 527 kfree(d->ctrl_val);
528 kfree(d->rmid_busy_llc);
529 kfree(d->mbm_total);
530 kfree(d->mbm_local);
439 list_del(&d->list); 531 list_del(&d->list);
532 if (is_mbm_enabled())
533 cancel_delayed_work(&d->mbm_over);
534 if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) {
535 /*
536 * When a package is going down, forcefully
537 * decrement rmid->ebusy. There is no way to know
538 * that the L3 was flushed and hence may lead to
539 * incorrect counts in rare scenarios, but leaving
540 * the RMID as busy creates RMID leaks if the
541 * package never comes back.
542 */
543 __check_limbo(d, true);
544 cancel_delayed_work(&d->cqm_limbo);
545 }
546
440 kfree(d); 547 kfree(d);
548 return;
549 }
550
551 if (r == &rdt_resources_all[RDT_RESOURCE_L3]) {
552 if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
553 cancel_delayed_work(&d->mbm_over);
554 mbm_setup_overflow_handler(d, 0);
555 }
556 if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
557 has_busy_rmid(r, d)) {
558 cancel_delayed_work(&d->cqm_limbo);
559 cqm_setup_limbo_handler(d, 0);
560 }
441 } 561 }
442} 562}
443 563
444static void clear_closid(int cpu) 564static void clear_closid_rmid(int cpu)
445{ 565{
446 struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); 566 struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
447 567
448 per_cpu(cpu_closid, cpu) = 0; 568 state->default_closid = 0;
449 state->closid = 0; 569 state->default_rmid = 0;
450 wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); 570 state->cur_closid = 0;
571 state->cur_rmid = 0;
572 wrmsr(IA32_PQR_ASSOC, 0, 0);
451} 573}
452 574
453static int intel_rdt_online_cpu(unsigned int cpu) 575static int intel_rdt_online_cpu(unsigned int cpu)
@@ -459,12 +581,23 @@ static int intel_rdt_online_cpu(unsigned int cpu)
459 domain_add_cpu(cpu, r); 581 domain_add_cpu(cpu, r);
460 /* The cpu is set in default rdtgroup after online. */ 582 /* The cpu is set in default rdtgroup after online. */
461 cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); 583 cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
462 clear_closid(cpu); 584 clear_closid_rmid(cpu);
463 mutex_unlock(&rdtgroup_mutex); 585 mutex_unlock(&rdtgroup_mutex);
464 586
465 return 0; 587 return 0;
466} 588}
467 589
590static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
591{
592 struct rdtgroup *cr;
593
594 list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
595 if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) {
596 break;
597 }
598 }
599}
600
468static int intel_rdt_offline_cpu(unsigned int cpu) 601static int intel_rdt_offline_cpu(unsigned int cpu)
469{ 602{
470 struct rdtgroup *rdtgrp; 603 struct rdtgroup *rdtgrp;
@@ -474,10 +607,12 @@ static int intel_rdt_offline_cpu(unsigned int cpu)
474 for_each_capable_rdt_resource(r) 607 for_each_capable_rdt_resource(r)
475 domain_remove_cpu(cpu, r); 608 domain_remove_cpu(cpu, r);
476 list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { 609 list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
477 if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) 610 if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
611 clear_childcpus(rdtgrp, cpu);
478 break; 612 break;
613 }
479 } 614 }
480 clear_closid(cpu); 615 clear_closid_rmid(cpu);
481 mutex_unlock(&rdtgroup_mutex); 616 mutex_unlock(&rdtgroup_mutex);
482 617
483 return 0; 618 return 0;
@@ -492,7 +627,7 @@ static __init void rdt_init_padding(void)
492 struct rdt_resource *r; 627 struct rdt_resource *r;
493 int cl; 628 int cl;
494 629
495 for_each_capable_rdt_resource(r) { 630 for_each_alloc_capable_rdt_resource(r) {
496 cl = strlen(r->name); 631 cl = strlen(r->name);
497 if (cl > max_name_width) 632 if (cl > max_name_width)
498 max_name_width = cl; 633 max_name_width = cl;
@@ -502,38 +637,153 @@ static __init void rdt_init_padding(void)
502 } 637 }
503} 638}
504 639
505static __init bool get_rdt_resources(void) 640enum {
641 RDT_FLAG_CMT,
642 RDT_FLAG_MBM_TOTAL,
643 RDT_FLAG_MBM_LOCAL,
644 RDT_FLAG_L3_CAT,
645 RDT_FLAG_L3_CDP,
646 RDT_FLAG_L2_CAT,
647 RDT_FLAG_MBA,
648};
649
650#define RDT_OPT(idx, n, f) \
651[idx] = { \
652 .name = n, \
653 .flag = f \
654}
655
656struct rdt_options {
657 char *name;
658 int flag;
659 bool force_off, force_on;
660};
661
662static struct rdt_options rdt_options[] __initdata = {
663 RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC),
664 RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
665 RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
666 RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3),
667 RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3),
668 RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2),
669 RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA),
670};
671#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
672
673static int __init set_rdt_options(char *str)
674{
675 struct rdt_options *o;
676 bool force_off;
677 char *tok;
678
679 if (*str == '=')
680 str++;
681 while ((tok = strsep(&str, ",")) != NULL) {
682 force_off = *tok == '!';
683 if (force_off)
684 tok++;
685 for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
686 if (strcmp(tok, o->name) == 0) {
687 if (force_off)
688 o->force_off = true;
689 else
690 o->force_on = true;
691 break;
692 }
693 }
694 }
695 return 1;
696}
697__setup("rdt", set_rdt_options);
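Based on the parser above, the new "rdt=" boot option takes a comma-separated list of the rdt_options[] names; a leading '!' forces a feature off, while a bare name forces it back on where a quirk (such as the Skylake one later in this file) would otherwise disable it. The CPU must still advertise the feature either way. An illustrative command line:

	rdt=!l3cat,cmt

would turn off L3 cache allocation and keep CMT enabled despite any quirk.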
698
699static bool __init rdt_cpu_has(int flag)
700{
701 bool ret = boot_cpu_has(flag);
702 struct rdt_options *o;
703
704 if (!ret)
705 return ret;
706
707 for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
708 if (flag == o->flag) {
709 if (o->force_off)
710 ret = false;
711 if (o->force_on)
712 ret = true;
713 break;
714 }
715 }
716 return ret;
717}
718
719static __init bool get_rdt_alloc_resources(void)
506{ 720{
507 bool ret = false; 721 bool ret = false;
508 722
509 if (cache_alloc_hsw_probe()) 723 if (rdt_alloc_capable)
510 return true; 724 return true;
511 725
512 if (!boot_cpu_has(X86_FEATURE_RDT_A)) 726 if (!boot_cpu_has(X86_FEATURE_RDT_A))
513 return false; 727 return false;
514 728
515 if (boot_cpu_has(X86_FEATURE_CAT_L3)) { 729 if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
516 rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); 730 rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]);
517 if (boot_cpu_has(X86_FEATURE_CDP_L3)) { 731 if (rdt_cpu_has(X86_FEATURE_CDP_L3)) {
518 rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); 732 rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
519 rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); 733 rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
520 } 734 }
521 ret = true; 735 ret = true;
522 } 736 }
523 if (boot_cpu_has(X86_FEATURE_CAT_L2)) { 737 if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
524 /* CPUID 0x10.2 fields are same format as 0x10.1 */ 738 /* CPUID 0x10.2 fields are same format as 0x10.1 */
525 rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); 739 rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]);
526 ret = true; 740 ret = true;
527 } 741 }
528 742
529 if (boot_cpu_has(X86_FEATURE_MBA)) { 743 if (rdt_cpu_has(X86_FEATURE_MBA)) {
530 if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) 744 if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA]))
531 ret = true; 745 ret = true;
532 } 746 }
533
534 return ret; 747 return ret;
535} 748}
536 749
750static __init bool get_rdt_mon_resources(void)
751{
752 if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC))
753 rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID);
754 if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL))
755 rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID);
756 if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))
757 rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID);
758
759 if (!rdt_mon_features)
760 return false;
761
762 return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]);
763}
764
765static __init void rdt_quirks(void)
766{
767 switch (boot_cpu_data.x86_model) {
768 case INTEL_FAM6_HASWELL_X:
769 if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
770 cache_alloc_hsw_probe();
771 break;
772 case INTEL_FAM6_SKYLAKE_X:
773 if (boot_cpu_data.x86_mask <= 4)
774 set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
775 }
776}
777
778static __init bool get_rdt_resources(void)
779{
780 rdt_quirks();
781 rdt_alloc_capable = get_rdt_alloc_resources();
782 rdt_mon_capable = get_rdt_mon_resources();
783
784 return (rdt_mon_capable || rdt_alloc_capable);
785}
786
537static int __init intel_rdt_late_init(void) 787static int __init intel_rdt_late_init(void)
538{ 788{
539 struct rdt_resource *r; 789 struct rdt_resource *r;
@@ -556,9 +806,12 @@ static int __init intel_rdt_late_init(void)
556 return ret; 806 return ret;
557 } 807 }
558 808
559 for_each_capable_rdt_resource(r) 809 for_each_alloc_capable_rdt_resource(r)
560 pr_info("Intel RDT %s allocation detected\n", r->name); 810 pr_info("Intel RDT %s allocation detected\n", r->name);
561 811
812 for_each_mon_capable_rdt_resource(r)
813 pr_info("Intel RDT %s monitoring detected\n", r->name);
814
562 return 0; 815 return 0;
563} 816}
564 817
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
new file mode 100644
index 000000000000..ebaddaeef023
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -0,0 +1,440 @@
1#ifndef _ASM_X86_INTEL_RDT_H
2#define _ASM_X86_INTEL_RDT_H
3
4#include <linux/sched.h>
5#include <linux/kernfs.h>
6#include <linux/jump_label.h>
7
8#define IA32_L3_QOS_CFG 0xc81
9#define IA32_L3_CBM_BASE 0xc90
10#define IA32_L2_CBM_BASE 0xd10
11#define IA32_MBA_THRTL_BASE 0xd50
12
13#define L3_QOS_CDP_ENABLE 0x01ULL
14
15/*
16 * Event IDs are used to program IA32_QM_EVTSEL before reading event
17 * counter from IA32_QM_CTR
18 */
19#define QOS_L3_OCCUP_EVENT_ID 0x01
20#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02
21#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03
22
23#define CQM_LIMBOCHECK_INTERVAL 1000
24
25#define MBM_CNTR_WIDTH 24
26#define MBM_OVERFLOW_INTERVAL 1000
27
28#define RMID_VAL_ERROR BIT_ULL(63)
29#define RMID_VAL_UNAVAIL BIT_ULL(62)
30
31DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
32
33/**
34 * struct mon_evt - Entry in the event list of a resource
35 * @evtid: event id
36 * @name: name of the event
37 */
38struct mon_evt {
39 u32 evtid;
40 char *name;
41 struct list_head list;
42};
43
44/**
45 * struct mon_data_bits - Monitoring details for each event file
46 * @rid: Resource id associated with the event file.
47 * @evtid: Event id associated with the event file
48 * @domid: The domain to which the event file belongs
49 */
50union mon_data_bits {
51 void *priv;
52 struct {
53 unsigned int rid : 10;
54 unsigned int evtid : 8;
55 unsigned int domid : 14;
56 } u;
57};
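The union above lets the (resource, event, domain) triple travel as a single pointer-sized value, which the resctrl code can stash in a kernfs node's private data and decode when a mon_data file is read. A minimal encode/decode sketch (illustrative only; the helper names are made up):

	static inline void *mon_priv_encode(unsigned int rid, unsigned int evtid,
					    unsigned int domid)
	{
		union mon_data_bits md = {
			.u = { .rid = rid, .evtid = evtid, .domid = domid },
		};

		return md.priv;
	}

	static inline union mon_data_bits mon_priv_decode(void *priv)
	{
		union mon_data_bits md = { .priv = priv };

		return md;
	}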
58
59struct rmid_read {
60 struct rdtgroup *rgrp;
61 struct rdt_domain *d;
62 int evtid;
63 bool first;
64 u64 val;
65};
66
67extern unsigned int intel_cqm_threshold;
68extern bool rdt_alloc_capable;
69extern bool rdt_mon_capable;
70extern unsigned int rdt_mon_features;
71
72enum rdt_group_type {
73 RDTCTRL_GROUP = 0,
74 RDTMON_GROUP,
75 RDT_NUM_GROUP,
76};
77
78/**
79 * struct mongroup - store mon group's data in resctrl fs.
80 * @mon_data_kn:		kernfs node for the mon_data directory
81 * @parent: parent rdtgrp
82 * @crdtgrp_list: child rdtgroup node list
83 * @rmid: rmid for this rdtgroup
84 */
85struct mongroup {
86 struct kernfs_node *mon_data_kn;
87 struct rdtgroup *parent;
88 struct list_head crdtgrp_list;
89 u32 rmid;
90};
91
92/**
93 * struct rdtgroup - store rdtgroup's data in resctrl file system.
94 * @kn: kernfs node
95 * @rdtgroup_list: linked list for all rdtgroups
96 * @closid: closid for this rdtgroup
97 * @cpu_mask: CPUs assigned to this rdtgroup
98 * @flags: status bits
99 * @waitcount: how many cpus expect to find this
100 * group when they acquire rdtgroup_mutex
101 * @type: indicates type of this rdtgroup - either
102 * monitor only or ctrl_mon group
103 * @mon: mongroup related data
104 */
105struct rdtgroup {
106 struct kernfs_node *kn;
107 struct list_head rdtgroup_list;
108 u32 closid;
109 struct cpumask cpu_mask;
110 int flags;
111 atomic_t waitcount;
112 enum rdt_group_type type;
113 struct mongroup mon;
114};
115
116/* rdtgroup.flags */
117#define RDT_DELETED 1
118
119/* rftype.flags */
120#define RFTYPE_FLAGS_CPUS_LIST 1
121
122/*
123 * Define the file type flags for base and info directories.
124 */
125#define RFTYPE_INFO BIT(0)
126#define RFTYPE_BASE BIT(1)
127#define RF_CTRLSHIFT 4
128#define RF_MONSHIFT 5
129#define RFTYPE_CTRL BIT(RF_CTRLSHIFT)
130#define RFTYPE_MON BIT(RF_MONSHIFT)
131#define RFTYPE_RES_CACHE BIT(8)
132#define RFTYPE_RES_MB BIT(9)
133#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
134#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
135#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
136
137/* List of all resource groups */
138extern struct list_head rdt_all_groups;
139
140extern int max_name_width, max_data_width;
141
142int __init rdtgroup_init(void);
143
144/**
145 * struct rftype - describe each file in the resctrl file system
146 * @name: File name
147 * @mode: Access mode
148 * @kf_ops: File operations
149 * @flags: File specific RFTYPE_FLAGS_* flags
150 * @fflags: File specific RF_* or RFTYPE_* flags
151 * @seq_show: Show content of the file
152 * @write: Write to the file
153 */
154struct rftype {
155 char *name;
156 umode_t mode;
157 struct kernfs_ops *kf_ops;
158 unsigned long flags;
159 unsigned long fflags;
160
161 int (*seq_show)(struct kernfs_open_file *of,
162 struct seq_file *sf, void *v);
163 /*
164 * write() is the generic write callback which maps directly to
165 * kernfs write operation and overrides all other operations.
166 * Maximum write size is determined by ->max_write_len.
167 */
168 ssize_t (*write)(struct kernfs_open_file *of,
169 char *buf, size_t nbytes, loff_t off);
170};
171
172/**
173 * struct mbm_state - status for each MBM counter in each domain
174 * @chunks:	Total data moved (multiply by rdt_resource.mon_scale to get bytes)
175 * @prev_msr:	Value of IA32_QM_CTR for this RMID last time we read it
176 */
177struct mbm_state {
178 u64 chunks;
179 u64 prev_msr;
180};
181
182/**
183 * struct rdt_domain - group of cpus sharing an RDT resource
184 * @list: all instances of this resource
185 * @id: unique id for this instance
186 * @cpu_mask: which cpus share this resource
187 * @rmid_busy_llc:
188 * bitmap of which limbo RMIDs are above threshold
189 * @mbm_total: saved state for MBM total bandwidth
190 * @mbm_local: saved state for MBM local bandwidth
191 * @mbm_over: worker to periodically read MBM h/w counters
192 * @cqm_limbo: worker to periodically read CQM h/w counters
193 * @mbm_work_cpu:
194 * worker cpu for MBM h/w counters
195 * @cqm_work_cpu:
196 * worker cpu for CQM h/w counters
197 * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
198 * @new_ctrl: new ctrl value to be loaded
199 * @have_new_ctrl: did user provide new_ctrl for this domain
200 */
201struct rdt_domain {
202 struct list_head list;
203 int id;
204 struct cpumask cpu_mask;
205 unsigned long *rmid_busy_llc;
206 struct mbm_state *mbm_total;
207 struct mbm_state *mbm_local;
208 struct delayed_work mbm_over;
209 struct delayed_work cqm_limbo;
210 int mbm_work_cpu;
211 int cqm_work_cpu;
212 u32 *ctrl_val;
213 u32 new_ctrl;
214 bool have_new_ctrl;
215};
216
217/**
218 * struct msr_param - set a range of MSRs from a domain
219 * @res: The resource to use
220 * @low: Beginning index from base MSR
221 * @high: End index
222 */
223struct msr_param {
224 struct rdt_resource *res;
225 int low;
226 int high;
227};
228
229/**
230 * struct rdt_cache - Cache allocation related data
231 * @cbm_len: Length of the cache bit mask
232 * @min_cbm_bits: Minimum number of consecutive bits to be set
233 * @cbm_idx_mult: Multiplier of CBM index
234 * @cbm_idx_offset: Offset of CBM index. CBM index is computed by:
235 *			closid * cbm_idx_mult + cbm_idx_offset
236 * in a cache bit mask
237 * @shareable_bits: Bitmask of shareable resource with other
238 * executing entities
239 */
240struct rdt_cache {
241 unsigned int cbm_len;
242 unsigned int min_cbm_bits;
243 unsigned int cbm_idx_mult;
244 unsigned int cbm_idx_offset;
245 unsigned int shareable_bits;
246};
247
248/**
249 * struct rdt_membw - Memory bandwidth allocation related data
250 * @max_delay: Max throttle delay. Delay is the hardware
251 * representation for memory bandwidth.
252 * @min_bw: Minimum memory bandwidth percentage user can request
253 * @bw_gran: Granularity at which the memory bandwidth is allocated
254 * @delay_linear: True if memory B/W delay is in linear scale
255 * @mb_map: Mapping of memory B/W percentage to memory B/W delay
256 */
257struct rdt_membw {
258 u32 max_delay;
259 u32 min_bw;
260 u32 bw_gran;
261 u32 delay_linear;
262 u32 *mb_map;
263};
264
265static inline bool is_llc_occupancy_enabled(void)
266{
267 return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID));
268}
269
270static inline bool is_mbm_total_enabled(void)
271{
272 return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID));
273}
274
275static inline bool is_mbm_local_enabled(void)
276{
277 return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID));
278}
279
280static inline bool is_mbm_enabled(void)
281{
282 return (is_mbm_total_enabled() || is_mbm_local_enabled());
283}
284
285static inline bool is_mbm_event(int e)
286{
287 return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
288 e <= QOS_L3_MBM_LOCAL_EVENT_ID);
289}
290
291/**
292 * struct rdt_resource - attributes of an RDT resource
293 * @rid: The index of the resource
294 * @alloc_enabled: Is allocation enabled on this machine
295 * @mon_enabled: Is monitoring enabled for this feature
296 * @alloc_capable: Is allocation available on this machine
297 * @mon_capable: Is monitor feature available on this machine
298 * @name: Name to use in "schemata" file
299 * @num_closid: Number of CLOSIDs available
300 * @cache_level: Which cache level defines scope of this resource
301 * @default_ctrl: Specifies default cache cbm or memory B/W percent.
302 * @msr_base: Base MSR address for CBMs
303 * @msr_update: Function pointer to update QOS MSRs
304 * @data_width: Character width of data when displaying
305 * @domains: All domains for this resource
306 * @cache: Cache allocation related data
307 * @format_str: Per resource format string to show domain value
308 * @parse_ctrlval: Per resource function pointer to parse control values
309 * @evt_list: List of monitoring events
310 * @num_rmid: Number of RMIDs available
311 * @mon_scale: cqm counter * mon_scale = occupancy in bytes
312 * @fflags: flags to choose base and info files
313 */
314struct rdt_resource {
315 int rid;
316 bool alloc_enabled;
317 bool mon_enabled;
318 bool alloc_capable;
319 bool mon_capable;
320 char *name;
321 int num_closid;
322 int cache_level;
323 u32 default_ctrl;
324 unsigned int msr_base;
325 void (*msr_update) (struct rdt_domain *d, struct msr_param *m,
326 struct rdt_resource *r);
327 int data_width;
328 struct list_head domains;
329 struct rdt_cache cache;
330 struct rdt_membw membw;
331 const char *format_str;
332 int (*parse_ctrlval) (char *buf, struct rdt_resource *r,
333 struct rdt_domain *d);
334 struct list_head evt_list;
335 int num_rmid;
336 unsigned int mon_scale;
337 unsigned long fflags;
338};
339
340int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d);
341int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d);
342
343extern struct mutex rdtgroup_mutex;
344
345extern struct rdt_resource rdt_resources_all[];
346extern struct rdtgroup rdtgroup_default;
347DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
348
349int __init rdtgroup_init(void);
350
351enum {
352 RDT_RESOURCE_L3,
353 RDT_RESOURCE_L3DATA,
354 RDT_RESOURCE_L3CODE,
355 RDT_RESOURCE_L2,
356 RDT_RESOURCE_MBA,
357
358 /* Must be the last */
359 RDT_NUM_RESOURCES,
360};
361
362#define for_each_capable_rdt_resource(r) \
363 for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
364 r++) \
365 if (r->alloc_capable || r->mon_capable)
366
367#define for_each_alloc_capable_rdt_resource(r) \
368 for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
369 r++) \
370 if (r->alloc_capable)
371
372#define for_each_mon_capable_rdt_resource(r) \
373 for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
374 r++) \
375 if (r->mon_capable)
376
377#define for_each_alloc_enabled_rdt_resource(r) \
378 for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
379 r++) \
380 if (r->alloc_enabled)
381
382#define for_each_mon_enabled_rdt_resource(r) \
383 for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
384 r++) \
385 if (r->mon_enabled)
386
387/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
388union cpuid_0x10_1_eax {
389 struct {
390 unsigned int cbm_len:5;
391 } split;
392 unsigned int full;
393};
394
395/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
396union cpuid_0x10_3_eax {
397 struct {
398 unsigned int max_delay:12;
399 } split;
400 unsigned int full;
401};
402
403/* CPUID.(EAX=10H, ECX=ResID).EDX */
404union cpuid_0x10_x_edx {
405 struct {
406 unsigned int cos_max:16;
407 } split;
408 unsigned int full;
409};
410
411void rdt_ctrl_update(void *arg);
412struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
413void rdtgroup_kn_unlock(struct kernfs_node *kn);
414struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
415 struct list_head **pos);
416ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
417 char *buf, size_t nbytes, loff_t off);
418int rdtgroup_schemata_show(struct kernfs_open_file *of,
419 struct seq_file *s, void *v);
420struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
421int alloc_rmid(void);
422void free_rmid(u32 rmid);
423int rdt_get_mon_l3_config(struct rdt_resource *r);
424void mon_event_count(void *info);
425int rdtgroup_mondata_show(struct seq_file *m, void *arg);
426void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
427 unsigned int dom_id);
428void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
429 struct rdt_domain *d);
430void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
431 struct rdtgroup *rdtgrp, int evtid, int first);
432void mbm_setup_overflow_handler(struct rdt_domain *dom,
433 unsigned long delay_ms);
434void mbm_handle_overflow(struct work_struct *work);
435void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
436void cqm_handle_limbo(struct work_struct *work);
437bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
438void __check_limbo(struct rdt_domain *d, bool force_free);
439
440#endif /* _ASM_X86_INTEL_RDT_H */
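One detail worth calling out in the header above is union mon_data_bits: the resource id, event id and domain id (10 + 8 + 14 bits) are packed into a single pointer-sized value, so each mon_data event file can carry its identity in kn->priv without a separate allocation. A hedged user-space sketch of the same pack/unpack round trip, with the field widths copied from the header and the sample values invented:

#include <assert.h>
#include <stdio.h>

/* Mirrors union mon_data_bits: 10 + 8 + 14 = 32 bits, stored in a void *. */
union mon_data_bits {
	void *priv;
	struct {
		unsigned int rid   : 10;
		unsigned int evtid : 8;
		unsigned int domid : 14;
	} u;
};

/* Pack the ids the way the mon_data setup code would before storing kn->priv. */
static void *pack(unsigned int rid, unsigned int evtid, unsigned int domid)
{
	union mon_data_bits md = { .priv = NULL };

	md.u.rid = rid;
	md.u.evtid = evtid;
	md.u.domid = domid;
	return md.priv;
}

int main(void)
{
	/* Example: resource 0 (L3), MBM total event (0x02), domain 1. */
	void *priv = pack(0, 0x02, 1);
	union mon_data_bits md = { .priv = priv };

	assert(md.u.rid == 0 && md.u.evtid == 0x02 && md.u.domid == 1);
	printf("rid=%u evtid=%u domid=%u\n",
	       (unsigned)md.u.rid, (unsigned)md.u.evtid, (unsigned)md.u.domid);
	return 0;
}

rdtgroup_mondata_show() in the next file performs exactly the unpack half of this round trip on of->kn->priv.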
diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
index 406d7a6532f9..f6ea94f8954a 100644
--- a/arch/x86/kernel/cpu/intel_rdt_schemata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -26,7 +26,7 @@
26#include <linux/kernfs.h> 26#include <linux/kernfs.h>
27#include <linux/seq_file.h> 27#include <linux/seq_file.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <asm/intel_rdt.h> 29#include "intel_rdt.h"
30 30
31/* 31/*
32 * Check whether MBA bandwidth percentage value is correct. The value is 32 * Check whether MBA bandwidth percentage value is correct. The value is
@@ -192,7 +192,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid)
192{ 192{
193 struct rdt_resource *r; 193 struct rdt_resource *r;
194 194
195 for_each_enabled_rdt_resource(r) { 195 for_each_alloc_enabled_rdt_resource(r) {
196 if (!strcmp(resname, r->name) && closid < r->num_closid) 196 if (!strcmp(resname, r->name) && closid < r->num_closid)
197 return parse_line(tok, r); 197 return parse_line(tok, r);
198 } 198 }
@@ -221,7 +221,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
221 221
222 closid = rdtgrp->closid; 222 closid = rdtgrp->closid;
223 223
224 for_each_enabled_rdt_resource(r) { 224 for_each_alloc_enabled_rdt_resource(r) {
225 list_for_each_entry(dom, &r->domains, list) 225 list_for_each_entry(dom, &r->domains, list)
226 dom->have_new_ctrl = false; 226 dom->have_new_ctrl = false;
227 } 227 }
@@ -237,7 +237,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
237 goto out; 237 goto out;
238 } 238 }
239 239
240 for_each_enabled_rdt_resource(r) { 240 for_each_alloc_enabled_rdt_resource(r) {
241 ret = update_domains(r, closid); 241 ret = update_domains(r, closid);
242 if (ret) 242 if (ret)
243 goto out; 243 goto out;
@@ -269,12 +269,13 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
269{ 269{
270 struct rdtgroup *rdtgrp; 270 struct rdtgroup *rdtgrp;
271 struct rdt_resource *r; 271 struct rdt_resource *r;
272 int closid, ret = 0; 272 int ret = 0;
273 u32 closid;
273 274
274 rdtgrp = rdtgroup_kn_lock_live(of->kn); 275 rdtgrp = rdtgroup_kn_lock_live(of->kn);
275 if (rdtgrp) { 276 if (rdtgrp) {
276 closid = rdtgrp->closid; 277 closid = rdtgrp->closid;
277 for_each_enabled_rdt_resource(r) { 278 for_each_alloc_enabled_rdt_resource(r) {
278 if (closid < r->num_closid) 279 if (closid < r->num_closid)
279 show_doms(s, r, closid); 280 show_doms(s, r, closid);
280 } 281 }
@@ -284,3 +285,57 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
284 rdtgroup_kn_unlock(of->kn); 285 rdtgroup_kn_unlock(of->kn);
285 return ret; 286 return ret;
286} 287}
288
289void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
290 struct rdtgroup *rdtgrp, int evtid, int first)
291{
292 /*
293 * setup the parameters to send to the IPI to read the data.
294 */
295 rr->rgrp = rdtgrp;
296 rr->evtid = evtid;
297 rr->d = d;
298 rr->val = 0;
299 rr->first = first;
300
301 smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
302}
303
304int rdtgroup_mondata_show(struct seq_file *m, void *arg)
305{
306 struct kernfs_open_file *of = m->private;
307 u32 resid, evtid, domid;
308 struct rdtgroup *rdtgrp;
309 struct rdt_resource *r;
310 union mon_data_bits md;
311 struct rdt_domain *d;
312 struct rmid_read rr;
313 int ret = 0;
314
315 rdtgrp = rdtgroup_kn_lock_live(of->kn);
316
317 md.priv = of->kn->priv;
318 resid = md.u.rid;
319 domid = md.u.domid;
320 evtid = md.u.evtid;
321
322 r = &rdt_resources_all[resid];
323 d = rdt_find_domain(r, domid, NULL);
324 if (!d) {
325 ret = -ENOENT;
326 goto out;
327 }
328
329 mon_event_read(&rr, d, rdtgrp, evtid, false);
330
331 if (rr.val & RMID_VAL_ERROR)
332 seq_puts(m, "Error\n");
333 else if (rr.val & RMID_VAL_UNAVAIL)
334 seq_puts(m, "Unavailable\n");
335 else
336 seq_printf(m, "%llu\n", rr.val * r->mon_scale);
337
338out:
339 rdtgroup_kn_unlock(of->kn);
340 return ret;
341}
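rdtgroup_mondata_show() above turns the raw counter into user-visible output in two steps: check the two status bits at the top of the IA32_QM_CTR value, then multiply the remaining count by the resource's mon_scale factor to get bytes. A small sketch of just that decode step, using an invented counter value and scale factor to show the arithmetic:

#include <inttypes.h>
#include <stdio.h>

#define RMID_VAL_ERROR   (1ULL << 63)	/* invalid RMID/event was programmed */
#define RMID_VAL_UNAVAIL (1ULL << 62)	/* data not available, retry later */

/* Turn a raw IA32_QM_CTR style value into what the event file would show. */
static void report(uint64_t raw, unsigned int mon_scale)
{
	if (raw & RMID_VAL_ERROR)
		puts("Error");
	else if (raw & RMID_VAL_UNAVAIL)
		puts("Unavailable");
	else
		/* Hardware counts in units of mon_scale bytes. */
		printf("%" PRIu64 "\n", raw * mon_scale);
}

int main(void)
{
	/* Assumed example: 1000 hardware chunks, 64KiB upscaling factor. */
	report(1000, 65536);			/* prints 65536000 */
	report(RMID_VAL_UNAVAIL, 65536);	/* prints "Unavailable" */
	return 0;
}

With the 64KiB scale factor used in the example, a reading of 1000 hardware chunks is reported as 65536000 bytes.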
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
new file mode 100644
index 000000000000..30827510094b
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -0,0 +1,499 @@
1/*
2 * Resource Director Technology(RDT)
3 * - Monitoring code
4 *
5 * Copyright (C) 2017 Intel Corporation
6 *
7 * Author:
8 * Vikas Shivappa <vikas.shivappa@intel.com>
9 *
10 * This replaces the perf-based cqm.c, but we reuse a lot of
11 * code and data structures originally from Peter Zijlstra and Matt Fleming.
12 *
13 * This program is free software; you can redistribute it and/or modify it
14 * under the terms and conditions of the GNU General Public License,
15 * version 2, as published by the Free Software Foundation.
16 *
17 * This program is distributed in the hope it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 * more details.
21 *
22 * More information about RDT can be found in the Intel (R) x86 Architecture
23 * Software Developer Manual June 2016, volume 3, section 17.17.
24 */
25
26#include <linux/module.h>
27#include <linux/slab.h>
28#include <asm/cpu_device_id.h>
29#include "intel_rdt.h"
30
31#define MSR_IA32_QM_CTR 0x0c8e
32#define MSR_IA32_QM_EVTSEL 0x0c8d
33
34struct rmid_entry {
35 u32 rmid;
36 int busy;
37 struct list_head list;
38};
39
40/**
41 * @rmid_free_lru: A least recently used list of free RMIDs
42 * These RMIDs are guaranteed to have an occupancy less than the
43 * threshold occupancy
44 */
45static LIST_HEAD(rmid_free_lru);
46
47/**
48 * @rmid_limbo_count: count of currently unused but (potentially)
49 *     dirty RMIDs.
50 *     This counts RMIDs that no one is currently using but that
51 *     may have an occupancy value > intel_cqm_threshold. User can change
52 * the threshold occupancy value.
53 */
54unsigned int rmid_limbo_count;
55
56/**
57 * @rmid_entry - The entry in the limbo and free lists.
58 */
59static struct rmid_entry *rmid_ptrs;
60
61/*
62 * Global boolean for rdt_monitor which is true if any
63 * resource monitoring is enabled.
64 */
65bool rdt_mon_capable;
66
67/*
68 * Global to indicate which monitoring events are enabled.
69 */
70unsigned int rdt_mon_features;
71
72/*
73 * This is the threshold cache occupancy at which we will consider an
74 * RMID available for re-allocation.
75 */
76unsigned int intel_cqm_threshold;
77
78static inline struct rmid_entry *__rmid_entry(u32 rmid)
79{
80 struct rmid_entry *entry;
81
82 entry = &rmid_ptrs[rmid];
83 WARN_ON(entry->rmid != rmid);
84
85 return entry;
86}
87
88static u64 __rmid_read(u32 rmid, u32 eventid)
89{
90 u64 val;
91
92 /*
93 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
94 * with a valid event code for supported resource type and the bits
95 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
96 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
97 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
98 * are error bits.
99 */
100 wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
101 rdmsrl(MSR_IA32_QM_CTR, val);
102
103 return val;
104}
105
106static bool rmid_dirty(struct rmid_entry *entry)
107{
108 u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
109
110 return val >= intel_cqm_threshold;
111}
112
113/*
114 * Check the RMIDs that are marked as busy for this domain. If the
115 * reported LLC occupancy is below the threshold, clear the busy bit and
116 * decrement the count. If the busy count gets to zero on an RMID, we
117 * free the RMID.
118 */
119void __check_limbo(struct rdt_domain *d, bool force_free)
120{
121 struct rmid_entry *entry;
122 struct rdt_resource *r;
123 u32 crmid = 1, nrmid;
124
125 r = &rdt_resources_all[RDT_RESOURCE_L3];
126
127 /*
128 * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
129 * are marked as busy for occupancy < threshold. If the occupancy
130 * is less than the threshold decrement the busy counter of the
131 * RMID and move it to the free list when the counter reaches 0.
132 */
133 for (;;) {
134 nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid);
135 if (nrmid >= r->num_rmid)
136 break;
137
138 entry = __rmid_entry(nrmid);
139 if (force_free || !rmid_dirty(entry)) {
140 clear_bit(entry->rmid, d->rmid_busy_llc);
141 if (!--entry->busy) {
142 rmid_limbo_count--;
143 list_add_tail(&entry->list, &rmid_free_lru);
144 }
145 }
146 crmid = nrmid + 1;
147 }
148}
149
150bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d)
151{
152 return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid;
153}
154
155/*
156 * As of now the RMIDs allocation is global.
157 * However, we keep track of which packages the RMIDs
158 * are used on to optimize the limbo list management.
159 */
160int alloc_rmid(void)
161{
162 struct rmid_entry *entry;
163
164 lockdep_assert_held(&rdtgroup_mutex);
165
166 if (list_empty(&rmid_free_lru))
167 return rmid_limbo_count ? -EBUSY : -ENOSPC;
168
169 entry = list_first_entry(&rmid_free_lru,
170 struct rmid_entry, list);
171 list_del(&entry->list);
172
173 return entry->rmid;
174}
175
176static void add_rmid_to_limbo(struct rmid_entry *entry)
177{
178 struct rdt_resource *r;
179 struct rdt_domain *d;
180 int cpu;
181 u64 val;
182
183 r = &rdt_resources_all[RDT_RESOURCE_L3];
184
185 entry->busy = 0;
186 cpu = get_cpu();
187 list_for_each_entry(d, &r->domains, list) {
188 if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
189 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
190 if (val <= intel_cqm_threshold)
191 continue;
192 }
193
194 /*
195 * For the first limbo RMID in the domain,
196		 * set up the limbo worker.
197 */
198 if (!has_busy_rmid(r, d))
199 cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL);
200 set_bit(entry->rmid, d->rmid_busy_llc);
201 entry->busy++;
202 }
203 put_cpu();
204
205 if (entry->busy)
206 rmid_limbo_count++;
207 else
208 list_add_tail(&entry->list, &rmid_free_lru);
209}
210
211void free_rmid(u32 rmid)
212{
213 struct rmid_entry *entry;
214
215 if (!rmid)
216 return;
217
218 lockdep_assert_held(&rdtgroup_mutex);
219
220 entry = __rmid_entry(rmid);
221
222 if (is_llc_occupancy_enabled())
223 add_rmid_to_limbo(entry);
224 else
225 list_add_tail(&entry->list, &rmid_free_lru);
226}
227
228static int __mon_event_count(u32 rmid, struct rmid_read *rr)
229{
230 u64 chunks, shift, tval;
231 struct mbm_state *m;
232
233 tval = __rmid_read(rmid, rr->evtid);
234 if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
235 rr->val = tval;
236 return -EINVAL;
237 }
238 switch (rr->evtid) {
239 case QOS_L3_OCCUP_EVENT_ID:
240 rr->val += tval;
241 return 0;
242 case QOS_L3_MBM_TOTAL_EVENT_ID:
243 m = &rr->d->mbm_total[rmid];
244 break;
245 case QOS_L3_MBM_LOCAL_EVENT_ID:
246 m = &rr->d->mbm_local[rmid];
247 break;
248 default:
249 /*
250 * Code would never reach here because
251 * an invalid event id would fail the __rmid_read.
252 */
253 return -EINVAL;
254 }
255
256 if (rr->first) {
257 m->prev_msr = tval;
258 m->chunks = 0;
259 return 0;
260 }
261
262 shift = 64 - MBM_CNTR_WIDTH;
263 chunks = (tval << shift) - (m->prev_msr << shift);
264 chunks >>= shift;
265 m->chunks += chunks;
266 m->prev_msr = tval;
267
268 rr->val += m->chunks;
269 return 0;
270}
271
272/*
273 * This is called via IPI to read the CQM/MBM counters
274 * on a domain.
275 */
276void mon_event_count(void *info)
277{
278 struct rdtgroup *rdtgrp, *entry;
279 struct rmid_read *rr = info;
280 struct list_head *head;
281
282 rdtgrp = rr->rgrp;
283
284 if (__mon_event_count(rdtgrp->mon.rmid, rr))
285 return;
286
287 /*
288 * For Ctrl groups read data from child monitor groups.
289 */
290 head = &rdtgrp->mon.crdtgrp_list;
291
292 if (rdtgrp->type == RDTCTRL_GROUP) {
293 list_for_each_entry(entry, head, mon.crdtgrp_list) {
294 if (__mon_event_count(entry->mon.rmid, rr))
295 return;
296 }
297 }
298}
299
300static void mbm_update(struct rdt_domain *d, int rmid)
301{
302 struct rmid_read rr;
303
304 rr.first = false;
305 rr.d = d;
306
307 /*
308	 * This is protected from concurrent reads from user space
309	 * as both the user and the overflow handler hold the global mutex.
310 */
311 if (is_mbm_total_enabled()) {
312 rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
313 __mon_event_count(rmid, &rr);
314 }
315 if (is_mbm_local_enabled()) {
316 rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
317 __mon_event_count(rmid, &rr);
318 }
319}
320
321/*
322 * Handler to scan the limbo list and move the RMIDs whose
323 * occupancy < threshold_occupancy to the free list.
324 */
325void cqm_handle_limbo(struct work_struct *work)
326{
327 unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
328 int cpu = smp_processor_id();
329 struct rdt_resource *r;
330 struct rdt_domain *d;
331
332 mutex_lock(&rdtgroup_mutex);
333
334 r = &rdt_resources_all[RDT_RESOURCE_L3];
335 d = get_domain_from_cpu(cpu, r);
336
337 if (!d) {
338 pr_warn_once("Failure to get domain for limbo worker\n");
339 goto out_unlock;
340 }
341
342 __check_limbo(d, false);
343
344 if (has_busy_rmid(r, d))
345 schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);
346
347out_unlock:
348 mutex_unlock(&rdtgroup_mutex);
349}
350
351void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
352{
353 unsigned long delay = msecs_to_jiffies(delay_ms);
354 struct rdt_resource *r;
355 int cpu;
356
357 r = &rdt_resources_all[RDT_RESOURCE_L3];
358
359 cpu = cpumask_any(&dom->cpu_mask);
360 dom->cqm_work_cpu = cpu;
361
362 schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
363}
364
365void mbm_handle_overflow(struct work_struct *work)
366{
367 unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
368 struct rdtgroup *prgrp, *crgrp;
369 int cpu = smp_processor_id();
370 struct list_head *head;
371 struct rdt_domain *d;
372
373 mutex_lock(&rdtgroup_mutex);
374
375 if (!static_branch_likely(&rdt_enable_key))
376 goto out_unlock;
377
378 d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]);
379 if (!d)
380 goto out_unlock;
381
382 list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
383 mbm_update(d, prgrp->mon.rmid);
384
385 head = &prgrp->mon.crdtgrp_list;
386 list_for_each_entry(crgrp, head, mon.crdtgrp_list)
387 mbm_update(d, crgrp->mon.rmid);
388 }
389
390 schedule_delayed_work_on(cpu, &d->mbm_over, delay);
391
392out_unlock:
393 mutex_unlock(&rdtgroup_mutex);
394}
395
396void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
397{
398 unsigned long delay = msecs_to_jiffies(delay_ms);
399 int cpu;
400
401 if (!static_branch_likely(&rdt_enable_key))
402 return;
403 cpu = cpumask_any(&dom->cpu_mask);
404 dom->mbm_work_cpu = cpu;
405 schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
406}
407
408static int dom_data_init(struct rdt_resource *r)
409{
410 struct rmid_entry *entry = NULL;
411 int i, nr_rmids;
412
413 nr_rmids = r->num_rmid;
414 rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL);
415 if (!rmid_ptrs)
416 return -ENOMEM;
417
418 for (i = 0; i < nr_rmids; i++) {
419 entry = &rmid_ptrs[i];
420 INIT_LIST_HEAD(&entry->list);
421
422 entry->rmid = i;
423 list_add_tail(&entry->list, &rmid_free_lru);
424 }
425
426 /*
427 * RMID 0 is special and is always allocated. It's used for all
428 * tasks that are not monitored.
429 */
430 entry = __rmid_entry(0);
431 list_del(&entry->list);
432
433 return 0;
434}
435
436static struct mon_evt llc_occupancy_event = {
437 .name = "llc_occupancy",
438 .evtid = QOS_L3_OCCUP_EVENT_ID,
439};
440
441static struct mon_evt mbm_total_event = {
442 .name = "mbm_total_bytes",
443 .evtid = QOS_L3_MBM_TOTAL_EVENT_ID,
444};
445
446static struct mon_evt mbm_local_event = {
447 .name = "mbm_local_bytes",
448 .evtid = QOS_L3_MBM_LOCAL_EVENT_ID,
449};
450
451/*
452 * Initialize the event list for the resource.
453 *
454 * Note that MBM events are also part of RDT_RESOURCE_L3 resource
455 * because as per the SDM the total and local memory bandwidth
456 * are enumerated as part of L3 monitoring.
457 */
458static void l3_mon_evt_init(struct rdt_resource *r)
459{
460 INIT_LIST_HEAD(&r->evt_list);
461
462 if (is_llc_occupancy_enabled())
463 list_add_tail(&llc_occupancy_event.list, &r->evt_list);
464 if (is_mbm_total_enabled())
465 list_add_tail(&mbm_total_event.list, &r->evt_list);
466 if (is_mbm_local_enabled())
467 list_add_tail(&mbm_local_event.list, &r->evt_list);
468}
469
470int rdt_get_mon_l3_config(struct rdt_resource *r)
471{
472 int ret;
473
474 r->mon_scale = boot_cpu_data.x86_cache_occ_scale;
475 r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
476
477 /*
478 * A reasonable upper limit on the max threshold is the number
479 * of lines tagged per RMID if all RMIDs have the same number of
480 * lines tagged in the LLC.
481 *
482 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
483 */
484 intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid;
485
486 /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
487 intel_cqm_threshold /= r->mon_scale;
488
489 ret = dom_data_init(r);
490 if (ret)
491 return ret;
492
493 l3_mon_evt_init(r);
494
495 r->mon_capable = true;
496 r->mon_enabled = true;
497
498 return 0;
499}
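Two bits of arithmetic in the monitoring code above deserve a closer look. The MBM delta in __mon_event_count() handles wrap-around of the 24-bit hardware counter by shifting both readings into the top of a u64, letting unsigned subtraction discard the wrapped bits, and shifting back down. A self-contained sketch of that computation (MBM_CNTR_WIDTH comes from the header, the sample readings are invented):

#include <inttypes.h>
#include <stdio.h>

#define MBM_CNTR_WIDTH 24

/*
 * Delta between two readings of a free-running 24-bit counter.
 * Matches the shift trick in __mon_event_count(): work in the top
 * bits of a 64-bit value so modular subtraction absorbs wrap-around.
 */
static uint64_t mbm_delta(uint64_t cur, uint64_t prev)
{
	unsigned int shift = 64 - MBM_CNTR_WIDTH;

	return ((cur << shift) - (prev << shift)) >> shift;
}

int main(void)
{
	/* No wrap: 0x000200 - 0x000100 = 0x100 chunks. */
	printf("0x%" PRIx64 "\n", mbm_delta(0x000200, 0x000100));

	/* Wrap: counter went from 0xfffff0 past zero to 0x000010. */
	printf("0x%" PRIx64 "\n", mbm_delta(0x000010, 0xfffff0));	/* 0x20 */

	return 0;
}

The other one is the limbo threshold in rdt_get_mon_l3_config(): cache size divided by the number of RMIDs, then divided by mon_scale to convert bytes into counter units. For the 35MB / 56 RMID example in the comment that works out to roughly 640KB per RMID, i.e. the ~1.8% of the LLC the comment mentions.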
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 9257bd9dc664..a869d4a073c5 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -32,17 +32,25 @@
32 32
33#include <uapi/linux/magic.h> 33#include <uapi/linux/magic.h>
34 34
35#include <asm/intel_rdt.h> 35#include <asm/intel_rdt_sched.h>
36#include <asm/intel_rdt_common.h> 36#include "intel_rdt.h"
37 37
38DEFINE_STATIC_KEY_FALSE(rdt_enable_key); 38DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
39struct kernfs_root *rdt_root; 39DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
40DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
41static struct kernfs_root *rdt_root;
40struct rdtgroup rdtgroup_default; 42struct rdtgroup rdtgroup_default;
41LIST_HEAD(rdt_all_groups); 43LIST_HEAD(rdt_all_groups);
42 44
43/* Kernel fs node for "info" directory under root */ 45/* Kernel fs node for "info" directory under root */
44static struct kernfs_node *kn_info; 46static struct kernfs_node *kn_info;
45 47
48/* Kernel fs node for "mon_groups" directory under root */
49static struct kernfs_node *kn_mongrp;
50
51/* Kernel fs node for "mon_data" directory under root */
52static struct kernfs_node *kn_mondata;
53
46/* 54/*
47 * Trivial allocator for CLOSIDs. Since h/w only supports a small number, 55 * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
48 * we can keep a bitmap of free CLOSIDs in a single integer. 56 * we can keep a bitmap of free CLOSIDs in a single integer.
@@ -66,7 +74,7 @@ static void closid_init(void)
66 int rdt_min_closid = 32; 74 int rdt_min_closid = 32;
67 75
68 /* Compute rdt_min_closid across all resources */ 76 /* Compute rdt_min_closid across all resources */
69 for_each_enabled_rdt_resource(r) 77 for_each_alloc_enabled_rdt_resource(r)
70 rdt_min_closid = min(rdt_min_closid, r->num_closid); 78 rdt_min_closid = min(rdt_min_closid, r->num_closid);
71 79
72 closid_free_map = BIT_MASK(rdt_min_closid) - 1; 80 closid_free_map = BIT_MASK(rdt_min_closid) - 1;
@@ -75,9 +83,9 @@ static void closid_init(void)
75 closid_free_map &= ~1; 83 closid_free_map &= ~1;
76} 84}
77 85
78int closid_alloc(void) 86static int closid_alloc(void)
79{ 87{
80 int closid = ffs(closid_free_map); 88 u32 closid = ffs(closid_free_map);
81 89
82 if (closid == 0) 90 if (closid == 0)
83 return -ENOSPC; 91 return -ENOSPC;
@@ -125,28 +133,6 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
125 return 0; 133 return 0;
126} 134}
127 135
128static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts,
129 int len)
130{
131 struct rftype *rft;
132 int ret;
133
134 lockdep_assert_held(&rdtgroup_mutex);
135
136 for (rft = rfts; rft < rfts + len; rft++) {
137 ret = rdtgroup_add_file(kn, rft);
138 if (ret)
139 goto error;
140 }
141
142 return 0;
143error:
144 pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
145 while (--rft >= rfts)
146 kernfs_remove_by_name(kn, rft->name);
147 return ret;
148}
149
150static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) 136static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
151{ 137{
152 struct kernfs_open_file *of = m->private; 138 struct kernfs_open_file *of = m->private;
@@ -174,6 +160,11 @@ static struct kernfs_ops rdtgroup_kf_single_ops = {
174 .seq_show = rdtgroup_seqfile_show, 160 .seq_show = rdtgroup_seqfile_show,
175}; 161};
176 162
163static struct kernfs_ops kf_mondata_ops = {
164 .atomic_write_len = PAGE_SIZE,
165 .seq_show = rdtgroup_mondata_show,
166};
167
177static bool is_cpu_list(struct kernfs_open_file *of) 168static bool is_cpu_list(struct kernfs_open_file *of)
178{ 169{
179 struct rftype *rft = of->kn->priv; 170 struct rftype *rft = of->kn->priv;
@@ -203,13 +194,18 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
203/* 194/*
204 * This is safe against intel_rdt_sched_in() called from __switch_to() 195 * This is safe against intel_rdt_sched_in() called from __switch_to()
205 * because __switch_to() is executed with interrupts disabled. A local call 196 * because __switch_to() is executed with interrupts disabled. A local call
206 * from rdt_update_closid() is protected against __switch_to() because	197 * from update_closid_rmid() is protected against __switch_to() because
207 * preemption is disabled. 198 * preemption is disabled.
208 */ 199 */
209static void rdt_update_cpu_closid(void *closid) 200static void update_cpu_closid_rmid(void *info)
210{ 201{
211 if (closid) 202 struct rdtgroup *r = info;
212 this_cpu_write(cpu_closid, *(int *)closid); 203
204 if (r) {
205 this_cpu_write(pqr_state.default_closid, r->closid);
206 this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
207 }
208
213 /* 209 /*
214 * We cannot unconditionally write the MSR because the current 210 * We cannot unconditionally write the MSR because the current
215 * executing task might have its own closid selected. Just reuse 211 * executing task might have its own closid selected. Just reuse
@@ -221,28 +217,128 @@ static void rdt_update_cpu_closid(void *closid)
221/* 217/*
222 * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, 218 * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
223 * 219 *
224 * Per task closids must have been set up before calling this function. 220 * Per task closids/rmids must have been set up before calling this function.
225 *
226 * The per cpu closids are updated with the smp function call, when @closid
227 * is not NULL. If @closid is NULL then all affected percpu closids must
228 * have been set up before calling this function.
229 */ 221 */
230static void 222static void
231rdt_update_closid(const struct cpumask *cpu_mask, int *closid) 223update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
232{ 224{
233 int cpu = get_cpu(); 225 int cpu = get_cpu();
234 226
235 if (cpumask_test_cpu(cpu, cpu_mask)) 227 if (cpumask_test_cpu(cpu, cpu_mask))
236 rdt_update_cpu_closid(closid); 228 update_cpu_closid_rmid(r);
237 smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); 229 smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
238 put_cpu(); 230 put_cpu();
239} 231}
240 232
233static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
234 cpumask_var_t tmpmask)
235{
236 struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
237 struct list_head *head;
238
239 /* Check whether cpus belong to parent ctrl group */
240 cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
241 if (cpumask_weight(tmpmask))
242 return -EINVAL;
243
244 /* Check whether cpus are dropped from this group */
245 cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
246 if (cpumask_weight(tmpmask)) {
247 /* Give any dropped cpus to parent rdtgroup */
248 cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
249 update_closid_rmid(tmpmask, prgrp);
250 }
251
252 /*
253 * If we added cpus, remove them from previous group that owned them
254 * and update per-cpu rmid
255 */
256 cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
257 if (cpumask_weight(tmpmask)) {
258 head = &prgrp->mon.crdtgrp_list;
259 list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
260 if (crgrp == rdtgrp)
261 continue;
262 cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
263 tmpmask);
264 }
265 update_closid_rmid(tmpmask, rdtgrp);
266 }
267
268 /* Done pushing/pulling - update this group with new mask */
269 cpumask_copy(&rdtgrp->cpu_mask, newmask);
270
271 return 0;
272}
273
274static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
275{
276 struct rdtgroup *crgrp;
277
278 cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
279	/* update the child mon group masks as well */
280 list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
281 cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
282}
283
284static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
285 cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
286{
287 struct rdtgroup *r, *crgrp;
288 struct list_head *head;
289
290 /* Check whether cpus are dropped from this group */
291 cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
292 if (cpumask_weight(tmpmask)) {
293 /* Can't drop from default group */
294 if (rdtgrp == &rdtgroup_default)
295 return -EINVAL;
296
297 /* Give any dropped cpus to rdtgroup_default */
298 cpumask_or(&rdtgroup_default.cpu_mask,
299 &rdtgroup_default.cpu_mask, tmpmask);
300 update_closid_rmid(tmpmask, &rdtgroup_default);
301 }
302
303 /*
304 * If we added cpus, remove them from previous group and
305 * the prev group's child groups that owned them
306 * and update per-cpu closid/rmid.
307 */
308 cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
309 if (cpumask_weight(tmpmask)) {
310 list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
311 if (r == rdtgrp)
312 continue;
313 cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
314 if (cpumask_weight(tmpmask1))
315 cpumask_rdtgrp_clear(r, tmpmask1);
316 }
317 update_closid_rmid(tmpmask, rdtgrp);
318 }
319
320 /* Done pushing/pulling - update this group with new mask */
321 cpumask_copy(&rdtgrp->cpu_mask, newmask);
322
323 /*
324 * Clear child mon group masks since there is a new parent mask
325 * now and update the rmid for the cpus the child lost.
326 */
327 head = &rdtgrp->mon.crdtgrp_list;
328 list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
329 cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
330 update_closid_rmid(tmpmask, rdtgrp);
331 cpumask_clear(&crgrp->cpu_mask);
332 }
333
334 return 0;
335}
336
241static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, 337static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
242 char *buf, size_t nbytes, loff_t off) 338 char *buf, size_t nbytes, loff_t off)
243{ 339{
244 cpumask_var_t tmpmask, newmask; 340 cpumask_var_t tmpmask, newmask, tmpmask1;
245 struct rdtgroup *rdtgrp, *r; 341 struct rdtgroup *rdtgrp;
246 int ret; 342 int ret;
247 343
248 if (!buf) 344 if (!buf)
@@ -254,6 +350,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
254 free_cpumask_var(tmpmask); 350 free_cpumask_var(tmpmask);
255 return -ENOMEM; 351 return -ENOMEM;
256 } 352 }
353 if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
354 free_cpumask_var(tmpmask);
355 free_cpumask_var(newmask);
356 return -ENOMEM;
357 }
257 358
258 rdtgrp = rdtgroup_kn_lock_live(of->kn); 359 rdtgrp = rdtgroup_kn_lock_live(of->kn);
259 if (!rdtgrp) { 360 if (!rdtgrp) {
@@ -276,41 +377,18 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
276 goto unlock; 377 goto unlock;
277 } 378 }
278 379
279 /* Check whether cpus are dropped from this group */ 380 if (rdtgrp->type == RDTCTRL_GROUP)
280 cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); 381 ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
281 if (cpumask_weight(tmpmask)) { 382 else if (rdtgrp->type == RDTMON_GROUP)
282 /* Can't drop from default group */ 383 ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
283 if (rdtgrp == &rdtgroup_default) { 384 else
284 ret = -EINVAL; 385 ret = -EINVAL;
285 goto unlock;
286 }
287 /* Give any dropped cpus to rdtgroup_default */
288 cpumask_or(&rdtgroup_default.cpu_mask,
289 &rdtgroup_default.cpu_mask, tmpmask);
290 rdt_update_closid(tmpmask, &rdtgroup_default.closid);
291 }
292
293 /*
294 * If we added cpus, remove them from previous group that owned them
295 * and update per-cpu closid
296 */
297 cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
298 if (cpumask_weight(tmpmask)) {
299 list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
300 if (r == rdtgrp)
301 continue;
302 cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask);
303 }
304 rdt_update_closid(tmpmask, &rdtgrp->closid);
305 }
306
307 /* Done pushing/pulling - update this group with new mask */
308 cpumask_copy(&rdtgrp->cpu_mask, newmask);
309 386
310unlock: 387unlock:
311 rdtgroup_kn_unlock(of->kn); 388 rdtgroup_kn_unlock(of->kn);
312 free_cpumask_var(tmpmask); 389 free_cpumask_var(tmpmask);
313 free_cpumask_var(newmask); 390 free_cpumask_var(newmask);
391 free_cpumask_var(tmpmask1);
314 392
315 return ret ?: nbytes; 393 return ret ?: nbytes;
316} 394}
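The cpus_ctrl_write()/cpus_mon_write() helpers above reduce the whole "cpus" file update to three mask operations: new & ~old yields the cpus being added, old & ~new the cpus being dropped, and dropped cpus are OR-ed back into the parent (or default) group before update_closid_rmid() refreshes the per-cpu state. A toy sketch of that push/pull bookkeeping, using 64-bit masks in place of cpumasks (the real code of course uses cpumask_var_t and also walks every other group that owned the added cpus):

#include <inttypes.h>
#include <stdio.h>

/* Toy stand-in for a cpumask: one bit per cpu, up to 64 cpus. */
typedef uint64_t mask_t;

int main(void)
{
	mask_t group   = 0x0f;	/* group currently owns cpus 0-3  */
	mask_t def     = 0xf0;	/* default group owns cpus 4-7    */
	mask_t newmask = 0x3c;	/* user writes cpus 2-5 to "cpus" */

	mask_t dropped = group & ~newmask;	/* cpus 0-1 leave the group */
	mask_t added   = newmask & ~group;	/* cpus 4-5 join the group  */

	def |= dropped;		/* dropped cpus fall back to the default group   */
	def &= ~added;		/* added cpus leave their previous owner (def)   */
	group = newmask;	/* finally adopt the new mask                    */

	printf("group=0x%" PRIx64 " default=0x%" PRIx64 "\n", group, def);
	/* prints group=0x3c default=0xc3 */
	return 0;
}

The printed result shows cpus 0-1 falling back to the default group while cpus 4-5 are pulled out of it.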
@@ -336,6 +414,7 @@ static void move_myself(struct callback_head *head)
336 if (atomic_dec_and_test(&rdtgrp->waitcount) && 414 if (atomic_dec_and_test(&rdtgrp->waitcount) &&
337 (rdtgrp->flags & RDT_DELETED)) { 415 (rdtgrp->flags & RDT_DELETED)) {
338 current->closid = 0; 416 current->closid = 0;
417 current->rmid = 0;
339 kfree(rdtgrp); 418 kfree(rdtgrp);
340 } 419 }
341 420
@@ -374,7 +453,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
374 atomic_dec(&rdtgrp->waitcount); 453 atomic_dec(&rdtgrp->waitcount);
375 kfree(callback); 454 kfree(callback);
376 } else { 455 } else {
377 tsk->closid = rdtgrp->closid; 456 /*
457 * For ctrl_mon groups move both closid and rmid.
458 * For monitor groups, can move the tasks only from
459 * their parent CTRL group.
460 */
461 if (rdtgrp->type == RDTCTRL_GROUP) {
462 tsk->closid = rdtgrp->closid;
463 tsk->rmid = rdtgrp->mon.rmid;
464 } else if (rdtgrp->type == RDTMON_GROUP) {
465 if (rdtgrp->mon.parent->closid == tsk->closid)
466 tsk->rmid = rdtgrp->mon.rmid;
467 else
468 ret = -EINVAL;
469 }
378 } 470 }
379 return ret; 471 return ret;
380} 472}
@@ -454,7 +546,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
454 546
455 rcu_read_lock(); 547 rcu_read_lock();
456 for_each_process_thread(p, t) { 548 for_each_process_thread(p, t) {
457 if (t->closid == r->closid) 549 if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
550 (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid))
458 seq_printf(s, "%d\n", t->pid); 551 seq_printf(s, "%d\n", t->pid);
459 } 552 }
460 rcu_read_unlock(); 553 rcu_read_unlock();
@@ -476,39 +569,6 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of,
476 return ret; 569 return ret;
477} 570}
478 571
479/* Files in each rdtgroup */
480static struct rftype rdtgroup_base_files[] = {
481 {
482 .name = "cpus",
483 .mode = 0644,
484 .kf_ops = &rdtgroup_kf_single_ops,
485 .write = rdtgroup_cpus_write,
486 .seq_show = rdtgroup_cpus_show,
487 },
488 {
489 .name = "cpus_list",
490 .mode = 0644,
491 .kf_ops = &rdtgroup_kf_single_ops,
492 .write = rdtgroup_cpus_write,
493 .seq_show = rdtgroup_cpus_show,
494 .flags = RFTYPE_FLAGS_CPUS_LIST,
495 },
496 {
497 .name = "tasks",
498 .mode = 0644,
499 .kf_ops = &rdtgroup_kf_single_ops,
500 .write = rdtgroup_tasks_write,
501 .seq_show = rdtgroup_tasks_show,
502 },
503 {
504 .name = "schemata",
505 .mode = 0644,
506 .kf_ops = &rdtgroup_kf_single_ops,
507 .write = rdtgroup_schemata_write,
508 .seq_show = rdtgroup_schemata_show,
509 },
510};
511
512static int rdt_num_closids_show(struct kernfs_open_file *of, 572static int rdt_num_closids_show(struct kernfs_open_file *of,
513 struct seq_file *seq, void *v) 573 struct seq_file *seq, void *v)
514{ 574{
@@ -536,6 +596,15 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
536 return 0; 596 return 0;
537} 597}
538 598
599static int rdt_shareable_bits_show(struct kernfs_open_file *of,
600 struct seq_file *seq, void *v)
601{
602 struct rdt_resource *r = of->kn->parent->priv;
603
604 seq_printf(seq, "%x\n", r->cache.shareable_bits);
605 return 0;
606}
607
539static int rdt_min_bw_show(struct kernfs_open_file *of, 608static int rdt_min_bw_show(struct kernfs_open_file *of,
540 struct seq_file *seq, void *v) 609 struct seq_file *seq, void *v)
541{ 610{
@@ -545,6 +614,28 @@ static int rdt_min_bw_show(struct kernfs_open_file *of,
545 return 0; 614 return 0;
546} 615}
547 616
617static int rdt_num_rmids_show(struct kernfs_open_file *of,
618 struct seq_file *seq, void *v)
619{
620 struct rdt_resource *r = of->kn->parent->priv;
621
622 seq_printf(seq, "%d\n", r->num_rmid);
623
624 return 0;
625}
626
627static int rdt_mon_features_show(struct kernfs_open_file *of,
628 struct seq_file *seq, void *v)
629{
630 struct rdt_resource *r = of->kn->parent->priv;
631 struct mon_evt *mevt;
632
633 list_for_each_entry(mevt, &r->evt_list, list)
634 seq_printf(seq, "%s\n", mevt->name);
635
636 return 0;
637}
638
548static int rdt_bw_gran_show(struct kernfs_open_file *of, 639static int rdt_bw_gran_show(struct kernfs_open_file *of,
549 struct seq_file *seq, void *v) 640 struct seq_file *seq, void *v)
550{ 641{
@@ -563,74 +654,200 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of,
563 return 0; 654 return 0;
564} 655}
565 656
657static int max_threshold_occ_show(struct kernfs_open_file *of,
658 struct seq_file *seq, void *v)
659{
660 struct rdt_resource *r = of->kn->parent->priv;
661
662 seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale);
663
664 return 0;
665}
666
667static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
668 char *buf, size_t nbytes, loff_t off)
669{
670 struct rdt_resource *r = of->kn->parent->priv;
671 unsigned int bytes;
672 int ret;
673
674 ret = kstrtouint(buf, 0, &bytes);
675 if (ret)
676 return ret;
677
678 if (bytes > (boot_cpu_data.x86_cache_size * 1024))
679 return -EINVAL;
680
681 intel_cqm_threshold = bytes / r->mon_scale;
682
683 return nbytes;
684}
685
566/* rdtgroup information files for one cache resource. */ 686/* rdtgroup information files for one cache resource. */
567static struct rftype res_cache_info_files[] = { 687static struct rftype res_common_files[] = {
568 { 688 {
569 .name = "num_closids", 689 .name = "num_closids",
570 .mode = 0444, 690 .mode = 0444,
571 .kf_ops = &rdtgroup_kf_single_ops, 691 .kf_ops = &rdtgroup_kf_single_ops,
572 .seq_show = rdt_num_closids_show, 692 .seq_show = rdt_num_closids_show,
693 .fflags = RF_CTRL_INFO,
694 },
695 {
696 .name = "mon_features",
697 .mode = 0444,
698 .kf_ops = &rdtgroup_kf_single_ops,
699 .seq_show = rdt_mon_features_show,
700 .fflags = RF_MON_INFO,
701 },
702 {
703 .name = "num_rmids",
704 .mode = 0444,
705 .kf_ops = &rdtgroup_kf_single_ops,
706 .seq_show = rdt_num_rmids_show,
707 .fflags = RF_MON_INFO,
573 }, 708 },
574 { 709 {
575 .name = "cbm_mask", 710 .name = "cbm_mask",
576 .mode = 0444, 711 .mode = 0444,
577 .kf_ops = &rdtgroup_kf_single_ops, 712 .kf_ops = &rdtgroup_kf_single_ops,
578 .seq_show = rdt_default_ctrl_show, 713 .seq_show = rdt_default_ctrl_show,
714 .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
579 }, 715 },
580 { 716 {
581 .name = "min_cbm_bits", 717 .name = "min_cbm_bits",
582 .mode = 0444, 718 .mode = 0444,
583 .kf_ops = &rdtgroup_kf_single_ops, 719 .kf_ops = &rdtgroup_kf_single_ops,
584 .seq_show = rdt_min_cbm_bits_show, 720 .seq_show = rdt_min_cbm_bits_show,
721 .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
585 }, 722 },
586};
587
588/* rdtgroup information files for memory bandwidth. */
589static struct rftype res_mba_info_files[] = {
590 { 723 {
591 .name = "num_closids", 724 .name = "shareable_bits",
592 .mode = 0444, 725 .mode = 0444,
593 .kf_ops = &rdtgroup_kf_single_ops, 726 .kf_ops = &rdtgroup_kf_single_ops,
594 .seq_show = rdt_num_closids_show, 727 .seq_show = rdt_shareable_bits_show,
728 .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
595 }, 729 },
596 { 730 {
597 .name = "min_bandwidth", 731 .name = "min_bandwidth",
598 .mode = 0444, 732 .mode = 0444,
599 .kf_ops = &rdtgroup_kf_single_ops, 733 .kf_ops = &rdtgroup_kf_single_ops,
600 .seq_show = rdt_min_bw_show, 734 .seq_show = rdt_min_bw_show,
735 .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
601 }, 736 },
602 { 737 {
603 .name = "bandwidth_gran", 738 .name = "bandwidth_gran",
604 .mode = 0444, 739 .mode = 0444,
605 .kf_ops = &rdtgroup_kf_single_ops, 740 .kf_ops = &rdtgroup_kf_single_ops,
606 .seq_show = rdt_bw_gran_show, 741 .seq_show = rdt_bw_gran_show,
742 .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
607 }, 743 },
608 { 744 {
609 .name = "delay_linear", 745 .name = "delay_linear",
610 .mode = 0444, 746 .mode = 0444,
611 .kf_ops = &rdtgroup_kf_single_ops, 747 .kf_ops = &rdtgroup_kf_single_ops,
612 .seq_show = rdt_delay_linear_show, 748 .seq_show = rdt_delay_linear_show,
749 .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
750 },
751 {
752 .name = "max_threshold_occupancy",
753 .mode = 0644,
754 .kf_ops = &rdtgroup_kf_single_ops,
755 .write = max_threshold_occ_write,
756 .seq_show = max_threshold_occ_show,
757 .fflags = RF_MON_INFO | RFTYPE_RES_CACHE,
758 },
759 {
760 .name = "cpus",
761 .mode = 0644,
762 .kf_ops = &rdtgroup_kf_single_ops,
763 .write = rdtgroup_cpus_write,
764 .seq_show = rdtgroup_cpus_show,
765 .fflags = RFTYPE_BASE,
766 },
767 {
768 .name = "cpus_list",
769 .mode = 0644,
770 .kf_ops = &rdtgroup_kf_single_ops,
771 .write = rdtgroup_cpus_write,
772 .seq_show = rdtgroup_cpus_show,
773 .flags = RFTYPE_FLAGS_CPUS_LIST,
774 .fflags = RFTYPE_BASE,
775 },
776 {
777 .name = "tasks",
778 .mode = 0644,
779 .kf_ops = &rdtgroup_kf_single_ops,
780 .write = rdtgroup_tasks_write,
781 .seq_show = rdtgroup_tasks_show,
782 .fflags = RFTYPE_BASE,
783 },
784 {
785 .name = "schemata",
786 .mode = 0644,
787 .kf_ops = &rdtgroup_kf_single_ops,
788 .write = rdtgroup_schemata_write,
789 .seq_show = rdtgroup_schemata_show,
790 .fflags = RF_CTRL_BASE,
613 }, 791 },
614}; 792};
615 793
616void rdt_get_mba_infofile(struct rdt_resource *r) 794static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
617{ 795{
618 r->info_files = res_mba_info_files; 796 struct rftype *rfts, *rft;
619 r->nr_info_files = ARRAY_SIZE(res_mba_info_files); 797 int ret, len;
798
799 rfts = res_common_files;
800 len = ARRAY_SIZE(res_common_files);
801
802 lockdep_assert_held(&rdtgroup_mutex);
803
804 for (rft = rfts; rft < rfts + len; rft++) {
805 if ((fflags & rft->fflags) == rft->fflags) {
806 ret = rdtgroup_add_file(kn, rft);
807 if (ret)
808 goto error;
809 }
810 }
811
812 return 0;
813error:
814 pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
815 while (--rft >= rfts) {
816 if ((fflags & rft->fflags) == rft->fflags)
817 kernfs_remove_by_name(kn, rft->name);
818 }
819 return ret;
620} 820}
621 821
622void rdt_get_cache_infofile(struct rdt_resource *r) 822static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
823 unsigned long fflags)
623{ 824{
624 r->info_files = res_cache_info_files; 825 struct kernfs_node *kn_subdir;
625 r->nr_info_files = ARRAY_SIZE(res_cache_info_files); 826 int ret;
827
828 kn_subdir = kernfs_create_dir(kn_info, name,
829 kn_info->mode, r);
830 if (IS_ERR(kn_subdir))
831 return PTR_ERR(kn_subdir);
832
833 kernfs_get(kn_subdir);
834 ret = rdtgroup_kn_set_ugid(kn_subdir);
835 if (ret)
836 return ret;
837
838 ret = rdtgroup_add_files(kn_subdir, fflags);
839 if (!ret)
840 kernfs_activate(kn_subdir);
841
842 return ret;
626} 843}
627 844
628static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) 845static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
629{ 846{
630 struct kernfs_node *kn_subdir;
631 struct rftype *res_info_files;
632 struct rdt_resource *r; 847 struct rdt_resource *r;
633 int ret, len; 848 unsigned long fflags;
849 char name[32];
850 int ret;
634 851
635 /* create the directory */ 852 /* create the directory */
636 kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); 853 kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
@@ -638,25 +855,19 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
638 return PTR_ERR(kn_info); 855 return PTR_ERR(kn_info);
639 kernfs_get(kn_info); 856 kernfs_get(kn_info);
640 857
641 for_each_enabled_rdt_resource(r) { 858 for_each_alloc_enabled_rdt_resource(r) {
642 kn_subdir = kernfs_create_dir(kn_info, r->name, 859 fflags = r->fflags | RF_CTRL_INFO;
643 kn_info->mode, r); 860 ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags);
644 if (IS_ERR(kn_subdir)) {
645 ret = PTR_ERR(kn_subdir);
646 goto out_destroy;
647 }
648 kernfs_get(kn_subdir);
649 ret = rdtgroup_kn_set_ugid(kn_subdir);
650 if (ret) 861 if (ret)
651 goto out_destroy; 862 goto out_destroy;
863 }
652 864
653 res_info_files = r->info_files; 865 for_each_mon_enabled_rdt_resource(r) {
654 len = r->nr_info_files; 866 fflags = r->fflags | RF_MON_INFO;
655 867 sprintf(name, "%s_MON", r->name);
656 ret = rdtgroup_add_files(kn_subdir, res_info_files, len); 868 ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
657 if (ret) 869 if (ret)
658 goto out_destroy; 870 goto out_destroy;
659 kernfs_activate(kn_subdir);
660 } 871 }
661 872
662 /* 873 /*
@@ -678,6 +889,39 @@ out_destroy:
678 return ret; 889 return ret;
679} 890}
680 891
892static int
893mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
894 char *name, struct kernfs_node **dest_kn)
895{
896 struct kernfs_node *kn;
897 int ret;
898
899 /* create the directory */
900 kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
901 if (IS_ERR(kn))
902 return PTR_ERR(kn);
903
904 if (dest_kn)
905 *dest_kn = kn;
906
907 /*
908 * This extra ref will be put in kernfs_remove() and guarantees
909 * that @rdtgrp->kn is always accessible.
910 */
911 kernfs_get(kn);
912
913 ret = rdtgroup_kn_set_ugid(kn);
914 if (ret)
915 goto out_destroy;
916
917 kernfs_activate(kn);
918
919 return 0;
920
921out_destroy:
922 kernfs_remove(kn);
923 return ret;
924}
681static void l3_qos_cfg_update(void *arg)	 925static void l3_qos_cfg_update(void *arg)
682{ 926{
683 bool *enable = arg; 927 bool *enable = arg;
@@ -718,14 +962,15 @@ static int cdp_enable(void)
718 struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; 962 struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
719 int ret; 963 int ret;
720 964
721 if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) 965 if (!r_l3->alloc_capable || !r_l3data->alloc_capable ||
966 !r_l3code->alloc_capable)
722 return -EINVAL; 967 return -EINVAL;
723 968
724 ret = set_l3_qos_cfg(r_l3, true); 969 ret = set_l3_qos_cfg(r_l3, true);
725 if (!ret) { 970 if (!ret) {
726 r_l3->enabled = false; 971 r_l3->alloc_enabled = false;
727 r_l3data->enabled = true; 972 r_l3data->alloc_enabled = true;
728 r_l3code->enabled = true; 973 r_l3code->alloc_enabled = true;
729 } 974 }
730 return ret; 975 return ret;
731} 976}
@@ -734,11 +979,11 @@ static void cdp_disable(void)
734{ 979{
735 struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; 980 struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
736 981
737 r->enabled = r->capable; 982 r->alloc_enabled = r->alloc_capable;
738 983
739 if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { 984 if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) {
740 rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; 985 rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false;
741 rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; 986 rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false;
742 set_l3_qos_cfg(r, false); 987 set_l3_qos_cfg(r, false);
743 } 988 }
744} 989}
@@ -823,10 +1068,16 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)
823 } 1068 }
824} 1069}
825 1070
1071static int mkdir_mondata_all(struct kernfs_node *parent_kn,
1072 struct rdtgroup *prgrp,
1073 struct kernfs_node **mon_data_kn);
1074
826static struct dentry *rdt_mount(struct file_system_type *fs_type, 1075static struct dentry *rdt_mount(struct file_system_type *fs_type,
827 int flags, const char *unused_dev_name, 1076 int flags, const char *unused_dev_name,
828 void *data) 1077 void *data)
829{ 1078{
1079 struct rdt_domain *dom;
1080 struct rdt_resource *r;
830 struct dentry *dentry; 1081 struct dentry *dentry;
831 int ret; 1082 int ret;
832 1083
@@ -853,15 +1104,54 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
853 goto out_cdp; 1104 goto out_cdp;
854 } 1105 }
855 1106
1107 if (rdt_mon_capable) {
1108 ret = mongroup_create_dir(rdtgroup_default.kn,
1109 NULL, "mon_groups",
1110 &kn_mongrp);
1111 if (ret) {
1112 dentry = ERR_PTR(ret);
1113 goto out_info;
1114 }
1115 kernfs_get(kn_mongrp);
1116
1117 ret = mkdir_mondata_all(rdtgroup_default.kn,
1118 &rdtgroup_default, &kn_mondata);
1119 if (ret) {
1120 dentry = ERR_PTR(ret);
1121 goto out_mongrp;
1122 }
1123 kernfs_get(kn_mondata);
1124 rdtgroup_default.mon.mon_data_kn = kn_mondata;
1125 }
1126
856 dentry = kernfs_mount(fs_type, flags, rdt_root, 1127 dentry = kernfs_mount(fs_type, flags, rdt_root,
857 RDTGROUP_SUPER_MAGIC, NULL); 1128 RDTGROUP_SUPER_MAGIC, NULL);
858 if (IS_ERR(dentry)) 1129 if (IS_ERR(dentry))
859 goto out_destroy; 1130 goto out_mondata;
1131
1132 if (rdt_alloc_capable)
1133 static_branch_enable(&rdt_alloc_enable_key);
1134 if (rdt_mon_capable)
1135 static_branch_enable(&rdt_mon_enable_key);
1136
1137 if (rdt_alloc_capable || rdt_mon_capable)
1138 static_branch_enable(&rdt_enable_key);
1139
1140 if (is_mbm_enabled()) {
1141 r = &rdt_resources_all[RDT_RESOURCE_L3];
1142 list_for_each_entry(dom, &r->domains, list)
1143 mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
1144 }
860 1145
861 static_branch_enable(&rdt_enable_key);
862 goto out; 1146 goto out;
863 1147
864out_destroy: 1148out_mondata:
1149 if (rdt_mon_capable)
1150 kernfs_remove(kn_mondata);
1151out_mongrp:
1152 if (rdt_mon_capable)
1153 kernfs_remove(kn_mongrp);
1154out_info:
865 kernfs_remove(kn_info); 1155 kernfs_remove(kn_info);
866out_cdp: 1156out_cdp:
867 cdp_disable(); 1157 cdp_disable();
@@ -909,6 +1199,18 @@ static int reset_all_ctrls(struct rdt_resource *r)
909 return 0; 1199 return 0;
910} 1200}
911 1201
1202static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
1203{
1204 return (rdt_alloc_capable &&
1205 (r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
1206}
1207
1208static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
1209{
1210 return (rdt_mon_capable &&
1211 (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
1212}
1213
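The two helpers above define group membership for the task-move logic that follows: a task belongs to a control group when its closid matches and to a monitor group when its rmid matches. From userspace those per-task IDs are set by writing a PID to a group's tasks file; a minimal sketch, assuming resctrl is mounted and a hypothetical group named grp0 exists:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/*
	 * "grp0" is a made-up group name; the kernel updates the caller's
	 * closid/rmid, which is what is_closid_match()/is_rmid_match() test.
	 */
	FILE *f = fopen("/sys/fs/resctrl/grp0/tasks", "w");

	if (!f) {
		perror("grp0/tasks");
		return 1;
	}
	fprintf(f, "%d\n", getpid());
	return fclose(f) ? 1 : 0;
}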
912/* 1214/*
913 * Move tasks from one to the other group. If @from is NULL, then all tasks 1215 * Move tasks from one to the other group. If @from is NULL, then all tasks
914 * in the systems are moved unconditionally (used for teardown). 1216 * in the systems are moved unconditionally (used for teardown).
@@ -924,8 +1226,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
924 1226
925 read_lock(&tasklist_lock); 1227 read_lock(&tasklist_lock);
926 for_each_process_thread(p, t) { 1228 for_each_process_thread(p, t) {
927 if (!from || t->closid == from->closid) { 1229 if (!from || is_closid_match(t, from) ||
1230 is_rmid_match(t, from)) {
928 t->closid = to->closid; 1231 t->closid = to->closid;
1232 t->rmid = to->mon.rmid;
1233
929#ifdef CONFIG_SMP 1234#ifdef CONFIG_SMP
930 /* 1235 /*
931 * This is safe on x86 w/o barriers as the ordering 1236 * This is safe on x86 w/o barriers as the ordering
@@ -944,6 +1249,19 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
944 read_unlock(&tasklist_lock); 1249 read_unlock(&tasklist_lock);
945} 1250}
946 1251
1252static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
1253{
1254 struct rdtgroup *sentry, *stmp;
1255 struct list_head *head;
1256
1257 head = &rdtgrp->mon.crdtgrp_list;
1258 list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
1259 free_rmid(sentry->mon.rmid);
1260 list_del(&sentry->mon.crdtgrp_list);
1261 kfree(sentry);
1262 }
1263}
1264
947/* 1265/*
948 * Forcibly remove all of the subdirectories under root. 1266 * Forcibly remove all of the subdirectories under root.
949 */ 1267 */
@@ -955,6 +1273,9 @@ static void rmdir_all_sub(void)
955 rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); 1273 rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
956 1274
957 list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { 1275 list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
1276 /* Free any child rmids */
1277 free_all_child_rdtgrp(rdtgrp);
1278
958 /* Remove each rdtgroup other than root */ 1279 /* Remove each rdtgroup other than root */
959 if (rdtgrp == &rdtgroup_default) 1280 if (rdtgrp == &rdtgroup_default)
960 continue; 1281 continue;
@@ -967,16 +1288,20 @@ static void rmdir_all_sub(void)
967 cpumask_or(&rdtgroup_default.cpu_mask, 1288 cpumask_or(&rdtgroup_default.cpu_mask,
968 &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); 1289 &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
969 1290
1291 free_rmid(rdtgrp->mon.rmid);
1292
970 kernfs_remove(rdtgrp->kn); 1293 kernfs_remove(rdtgrp->kn);
971 list_del(&rdtgrp->rdtgroup_list); 1294 list_del(&rdtgrp->rdtgroup_list);
972 kfree(rdtgrp); 1295 kfree(rdtgrp);
973 } 1296 }
974 /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ 1297 /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
975 get_online_cpus(); 1298 get_online_cpus();
976 rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); 1299 update_closid_rmid(cpu_online_mask, &rdtgroup_default);
977 put_online_cpus(); 1300 put_online_cpus();
978 1301
979 kernfs_remove(kn_info); 1302 kernfs_remove(kn_info);
1303 kernfs_remove(kn_mongrp);
1304 kernfs_remove(kn_mondata);
980} 1305}
981 1306
982static void rdt_kill_sb(struct super_block *sb) 1307static void rdt_kill_sb(struct super_block *sb)
@@ -986,10 +1311,12 @@ static void rdt_kill_sb(struct super_block *sb)
986 mutex_lock(&rdtgroup_mutex); 1311 mutex_lock(&rdtgroup_mutex);
987 1312
988 /* Put everything back to default values. */ 1313 /* Put everything back to default values. */
989 for_each_enabled_rdt_resource(r) 1314 for_each_alloc_enabled_rdt_resource(r)
990 reset_all_ctrls(r); 1315 reset_all_ctrls(r);
991 cdp_disable(); 1316 cdp_disable();
992 rmdir_all_sub(); 1317 rmdir_all_sub();
1318 static_branch_disable(&rdt_alloc_enable_key);
1319 static_branch_disable(&rdt_mon_enable_key);
993 static_branch_disable(&rdt_enable_key); 1320 static_branch_disable(&rdt_enable_key);
994 kernfs_kill_sb(sb); 1321 kernfs_kill_sb(sb);
995 mutex_unlock(&rdtgroup_mutex); 1322 mutex_unlock(&rdtgroup_mutex);
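rdt_kill_sb() is the unmount counterpart of rdt_mount(): it resets every alloc-enabled resource, disables CDP, tears down all groups (returning their RMIDs) and turns the three static keys back off. A hedged sketch of the userspace trigger, assuming the mount point used earlier:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Unmounting invokes rdt_kill_sb() and restores the default state. */
	if (umount("/sys/fs/resctrl")) {
		perror("umount resctrl");
		return 1;
	}
	return 0;
}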
@@ -1001,46 +1328,223 @@ static struct file_system_type rdt_fs_type = {
1001 .kill_sb = rdt_kill_sb, 1328 .kill_sb = rdt_kill_sb,
1002}; 1329};
1003 1330
1004static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, 1331static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
1005 umode_t mode) 1332 void *priv)
1006{ 1333{
1007 struct rdtgroup *parent, *rdtgrp;
1008 struct kernfs_node *kn; 1334 struct kernfs_node *kn;
1009 int ret, closid; 1335 int ret = 0;
1010 1336
1011 /* Only allow mkdir in the root directory */ 1337 kn = __kernfs_create_file(parent_kn, name, 0444, 0,
1012 if (parent_kn != rdtgroup_default.kn) 1338 &kf_mondata_ops, priv, NULL, NULL);
1013 return -EPERM; 1339 if (IS_ERR(kn))
1340 return PTR_ERR(kn);
1014 1341
1015 /* Do not accept '\n' to avoid unparsable situation. */ 1342 ret = rdtgroup_kn_set_ugid(kn);
1016 if (strchr(name, '\n')) 1343 if (ret) {
1017 return -EINVAL; 1344 kernfs_remove(kn);
1345 return ret;
1346 }
1018 1347
1019 parent = rdtgroup_kn_lock_live(parent_kn); 1348 return ret;
1020 if (!parent) { 1349}
1021 ret = -ENODEV; 1350
1022 goto out_unlock; 1351/*
1352 * Remove all subdirectories of mon_data of ctrl_mon groups
1353 * and monitor groups with given domain id.
1354 */
1355void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
1356{
1357 struct rdtgroup *prgrp, *crgrp;
1358 char name[32];
1359
1360 if (!r->mon_enabled)
1361 return;
1362
1363 list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
1364 sprintf(name, "mon_%s_%02d", r->name, dom_id);
1365 kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
1366
1367 list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
1368 kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
1023 } 1369 }
1370}
1024 1371
1025 ret = closid_alloc(); 1372static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
1026 if (ret < 0) 1373 struct rdt_domain *d,
1374 struct rdt_resource *r, struct rdtgroup *prgrp)
1375{
1376 union mon_data_bits priv;
1377 struct kernfs_node *kn;
1378 struct mon_evt *mevt;
1379 struct rmid_read rr;
1380 char name[32];
1381 int ret;
1382
1383 sprintf(name, "mon_%s_%02d", r->name, d->id);
1384 /* create the directory */
1385 kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
1386 if (IS_ERR(kn))
1387 return PTR_ERR(kn);
1388
1389 /*
1390 * This extra ref will be put in kernfs_remove() and guarantees
1391 * that kn is always accessible.
1392 */
1393 kernfs_get(kn);
1394 ret = rdtgroup_kn_set_ugid(kn);
1395 if (ret)
1396 goto out_destroy;
1397
1398 if (WARN_ON(list_empty(&r->evt_list))) {
1399 ret = -EPERM;
1400 goto out_destroy;
1401 }
1402
1403 priv.u.rid = r->rid;
1404 priv.u.domid = d->id;
1405 list_for_each_entry(mevt, &r->evt_list, list) {
1406 priv.u.evtid = mevt->evtid;
1407 ret = mon_addfile(kn, mevt->name, priv.priv);
1408 if (ret)
1409 goto out_destroy;
1410
1411 if (is_mbm_event(mevt->evtid))
1412 mon_event_read(&rr, d, prgrp, mevt->evtid, true);
1413 }
1414 kernfs_activate(kn);
1415 return 0;
1416
1417out_destroy:
1418 kernfs_remove(kn);
1419 return ret;
1420}
1421
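mkdir_mondata_subdir() stores the (resource id, domain id, event id) triple for each event file in a single kernfs priv pointer via union mon_data_bits. That union is defined elsewhere; the sketch below only illustrates the packing pattern, with made-up field widths rather than the real layout from intel_rdt.h.

#include <assert.h>
#include <stdio.h>

/* Illustrative stand-in for union mon_data_bits; widths are assumptions. */
union mon_priv {
	void *priv;
	struct {
		unsigned int rid   : 10;
		unsigned int domid : 14;
		unsigned int evtid : 8;
	} u;
};

int main(void)
{
	union mon_priv p = { .priv = NULL };

	p.u.rid = 1;	/* resource, e.g. L3 */
	p.u.domid = 0;	/* domain (cache) id */
	p.u.evtid = 2;	/* monitoring event id */

	/* The whole triple travels as one void * and is unpacked on read. */
	assert(sizeof(p) == sizeof(void *));
	printf("rid=%u domid=%u evtid=%u\n", p.u.rid, p.u.domid, p.u.evtid);
	return 0;
}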
1422/*
1423 * Add all subdirectories of mon_data for "ctrl_mon" groups
1424 * and "monitor" groups with given domain id.
1425 */
1426void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
1427 struct rdt_domain *d)
1428{
1429 struct kernfs_node *parent_kn;
1430 struct rdtgroup *prgrp, *crgrp;
1431 struct list_head *head;
1432
1433 if (!r->mon_enabled)
1434 return;
1435
1436 list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
1437 parent_kn = prgrp->mon.mon_data_kn;
1438 mkdir_mondata_subdir(parent_kn, d, r, prgrp);
1439
1440 head = &prgrp->mon.crdtgrp_list;
1441 list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
1442 parent_kn = crgrp->mon.mon_data_kn;
1443 mkdir_mondata_subdir(parent_kn, d, r, crgrp);
1444 }
1445 }
1446}
1447
1448static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
1449 struct rdt_resource *r,
1450 struct rdtgroup *prgrp)
1451{
1452 struct rdt_domain *dom;
1453 int ret;
1454
1455 list_for_each_entry(dom, &r->domains, list) {
1456 ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
1457 if (ret)
1458 return ret;
1459 }
1460
1461 return 0;
1462}
1463
1464/*
1465 * This creates a directory mon_data which contains the monitored data.
1466 *
1467 * mon_data has one directory for each domain, which is named
1468 * in the format mon_<domain_name>_<domain_id>. For example, a mon_data
1469 * with L3 domain looks as below:
1470 * ./mon_data:
1471 * mon_L3_00
1472 * mon_L3_01
1473 * mon_L3_02
1474 * ...
1475 *
1476 * Each domain directory has one file per event:
1477 * ./mon_L3_00/:
1478 * llc_occupancy
1479 *
1480 */
1481static int mkdir_mondata_all(struct kernfs_node *parent_kn,
1482 struct rdtgroup *prgrp,
1483 struct kernfs_node **dest_kn)
1484{
1485 struct rdt_resource *r;
1486 struct kernfs_node *kn;
1487 int ret;
1488
1489 /*
1490 * Create the mon_data directory first.
1491 */
1492 ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn);
1493 if (ret)
1494 return ret;
1495
1496 if (dest_kn)
1497 *dest_kn = kn;
1498
1499 /*
1500 * Create the subdirectories for each domain. Note that all events
1501 * in a domain like L3 are grouped into a resource whose domain is L3
1502 */
1503 for_each_mon_enabled_rdt_resource(r) {
1504 ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
1505 if (ret)
1506 goto out_destroy;
1507 }
1508
1509 return 0;
1510
1511out_destroy:
1512 kernfs_remove(kn);
1513 return ret;
1514}
1515
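The layout described in the comment above mkdir_mondata_all() is what userspace reads the monitoring counters from; each event file returns a single value. A minimal sketch, assuming an L3 occupancy event on domain 0 and the usual /sys/fs/resctrl mount point:

#include <stdio.h>

int main(void)
{
	unsigned long long bytes;
	FILE *f = fopen("/sys/fs/resctrl/mon_data/mon_L3_00/llc_occupancy", "r");

	if (!f) {
		perror("llc_occupancy");
		return 1;
	}
	if (fscanf(f, "%llu", &bytes) == 1)
		printf("L3 occupancy, domain 0: %llu bytes\n", bytes);
	fclose(f);
	return 0;
}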
1516static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
1517 struct kernfs_node *prgrp_kn,
1518 const char *name, umode_t mode,
1519 enum rdt_group_type rtype, struct rdtgroup **r)
1520{
1521 struct rdtgroup *prdtgrp, *rdtgrp;
1522 struct kernfs_node *kn;
1523 uint files = 0;
1524 int ret;
1525
1526 prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
1527 if (!prdtgrp) {
1528 ret = -ENODEV;
1027 goto out_unlock; 1529 goto out_unlock;
1028 closid = ret; 1530 }
1029 1531
1030 /* allocate the rdtgroup. */ 1532 /* allocate the rdtgroup. */
1031 rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); 1533 rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
1032 if (!rdtgrp) { 1534 if (!rdtgrp) {
1033 ret = -ENOSPC; 1535 ret = -ENOSPC;
1034 goto out_closid_free; 1536 goto out_unlock;
1035 } 1537 }
1036 rdtgrp->closid = closid; 1538 *r = rdtgrp;
1037 list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); 1539 rdtgrp->mon.parent = prdtgrp;
1540 rdtgrp->type = rtype;
1541 INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
1038 1542
1039 /* kernfs creates the directory for rdtgrp */ 1543 /* kernfs creates the directory for rdtgrp */
1040 kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); 1544 kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
1041 if (IS_ERR(kn)) { 1545 if (IS_ERR(kn)) {
1042 ret = PTR_ERR(kn); 1546 ret = PTR_ERR(kn);
1043 goto out_cancel_ref; 1547 goto out_free_rgrp;
1044 } 1548 }
1045 rdtgrp->kn = kn; 1549 rdtgrp->kn = kn;
1046 1550
@@ -1056,43 +1560,211 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
1056 if (ret) 1560 if (ret)
1057 goto out_destroy; 1561 goto out_destroy;
1058 1562
1059 ret = rdtgroup_add_files(kn, rdtgroup_base_files, 1563 files = RFTYPE_BASE | RFTYPE_CTRL;
1060 ARRAY_SIZE(rdtgroup_base_files)); 1564 files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
1565 ret = rdtgroup_add_files(kn, files);
1061 if (ret) 1566 if (ret)
1062 goto out_destroy; 1567 goto out_destroy;
1063 1568
1569 if (rdt_mon_capable) {
1570 ret = alloc_rmid();
1571 if (ret < 0)
1572 goto out_destroy;
1573 rdtgrp->mon.rmid = ret;
1574
1575 ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
1576 if (ret)
1577 goto out_idfree;
1578 }
1064 kernfs_activate(kn); 1579 kernfs_activate(kn);
1065 1580
1066 ret = 0; 1581 /*
1067 goto out_unlock; 1582 * The caller unlocks the prgrp_kn upon success.
1583 */
1584 return 0;
1068 1585
1586out_idfree:
1587 free_rmid(rdtgrp->mon.rmid);
1069out_destroy: 1588out_destroy:
1070 kernfs_remove(rdtgrp->kn); 1589 kernfs_remove(rdtgrp->kn);
1071out_cancel_ref: 1590out_free_rgrp:
1072 list_del(&rdtgrp->rdtgroup_list);
1073 kfree(rdtgrp); 1591 kfree(rdtgrp);
1074out_closid_free:
1075 closid_free(closid);
1076out_unlock: 1592out_unlock:
1077 rdtgroup_kn_unlock(parent_kn); 1593 rdtgroup_kn_unlock(prgrp_kn);
1078 return ret; 1594 return ret;
1079} 1595}
1080 1596
1081static int rdtgroup_rmdir(struct kernfs_node *kn) 1597static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
1598{
1599 kernfs_remove(rgrp->kn);
1600 free_rmid(rgrp->mon.rmid);
1601 kfree(rgrp);
1602}
1603
1604/*
1605 * Create a monitor group under "mon_groups" directory of a control
1606 * and monitor group (ctrl_mon). This is a resource group
1607 * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
1608 */
1609static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
1610 struct kernfs_node *prgrp_kn,
1611 const char *name,
1612 umode_t mode)
1613{
1614 struct rdtgroup *rdtgrp, *prgrp;
1615 int ret;
1616
1617 ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
1618 &rdtgrp);
1619 if (ret)
1620 return ret;
1621
1622 prgrp = rdtgrp->mon.parent;
1623 rdtgrp->closid = prgrp->closid;
1624
1625 /*
1626 * Add the rdtgrp to the list of rdtgrps the parent
1627 * ctrl_mon group has to track.
1628 */
1629 list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
1630
1631 rdtgroup_kn_unlock(prgrp_kn);
1632 return ret;
1633}
1634
1635/*
1636 * These are rdtgroups created under the root directory. Can be used
1637 * to allocate and monitor resources.
1638 */
1639static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
1640 struct kernfs_node *prgrp_kn,
1641 const char *name, umode_t mode)
1082{ 1642{
1083 int ret, cpu, closid = rdtgroup_default.closid;
1084 struct rdtgroup *rdtgrp; 1643 struct rdtgroup *rdtgrp;
1085 cpumask_var_t tmpmask; 1644 struct kernfs_node *kn;
1645 u32 closid;
1646 int ret;
1086 1647
1087 if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) 1648 ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
1088 return -ENOMEM; 1649 &rdtgrp);
1650 if (ret)
1651 return ret;
1089 1652
1090 rdtgrp = rdtgroup_kn_lock_live(kn); 1653 kn = rdtgrp->kn;
1091 if (!rdtgrp) { 1654 ret = closid_alloc();
1092 ret = -EPERM; 1655 if (ret < 0)
1093 goto out; 1656 goto out_common_fail;
1657 closid = ret;
1658
1659 rdtgrp->closid = closid;
1660 list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
1661
1662 if (rdt_mon_capable) {
1663 /*
1664 * Create an empty mon_groups directory to hold the subset
1665 * of tasks and cpus to monitor.
1666 */
1667 ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
1668 if (ret)
1669 goto out_id_free;
1094 } 1670 }
1095 1671
1672 goto out_unlock;
1673
1674out_id_free:
1675 closid_free(closid);
1676 list_del(&rdtgrp->rdtgroup_list);
1677out_common_fail:
1678 mkdir_rdt_prepare_clean(rdtgrp);
1679out_unlock:
1680 rdtgroup_kn_unlock(prgrp_kn);
1681 return ret;
1682}
1683
1684/*
1685 * We allow creating mon groups only within a directory called "mon_groups"
1686 * which is present in every ctrl_mon group. Check if this is a valid
1687 * "mon_groups" directory.
1688 *
1689 * 1. The directory should be named "mon_groups".
1690 * 2. The mon group itself should "not" be named "mon_groups".
1691 * This makes sure "mon_groups" directory always has a ctrl_mon group
1692 * as parent.
1693 */
1694static bool is_mon_groups(struct kernfs_node *kn, const char *name)
1695{
1696 return (!strcmp(kn->name, "mon_groups") &&
1697 strcmp(name, "mon_groups"));
1698}
1699
1700static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
1701 umode_t mode)
1702{
1703 /* Do not accept '\n' to avoid unparsable situation. */
1704 if (strchr(name, '\n'))
1705 return -EINVAL;
1706
1707 /*
1708 * If the parent directory is the root directory and RDT
1709 * allocation is supported, add a control and monitoring
1710 * subdirectory
1711 */
1712 if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
1713 return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);
1714
1715 /*
1716 * If RDT monitoring is supported and the parent directory is a valid
1717 * "mon_groups" directory, add a monitoring subdirectory.
1718 */
1719 if (rdt_mon_capable && is_mon_groups(parent_kn, name))
1720 return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode);
1721
1722 return -EPERM;
1723}
1724
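rdtgroup_mkdir() thus accepts exactly two shapes of mkdir: a directory under the resctrl root becomes a ctrl_mon group (new CLOSID, plus an RMID when monitoring is available), and a directory under an existing group's mon_groups becomes a mon group that inherits the parent's CLOSID. A sketch with made-up group names:

#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	/* rdtgroup_mkdir_ctrl_mon() path: allocates a CLOSID (and an RMID). */
	if (mkdir("/sys/fs/resctrl/grp0", 0755) && errno != EEXIST)
		perror("mkdir grp0");

	/* rdtgroup_mkdir_mon() path: shares grp0's CLOSID, new RMID only. */
	if (mkdir("/sys/fs/resctrl/grp0/mon_groups/m0", 0755) && errno != EEXIST)
		perror("mkdir m0");

	return 0;
}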
1725static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
1726 cpumask_var_t tmpmask)
1727{
1728 struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
1729 int cpu;
1730
1731 /* Give any tasks back to the parent group */
1732 rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
1733
1734 /* Update per cpu rmid of the moved CPUs first */
1735 for_each_cpu(cpu, &rdtgrp->cpu_mask)
1736 per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
1737 /*
1738 * Update the MSR on moved CPUs and CPUs which have moved
1739 * task running on them.
1740 */
1741 cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
1742 update_closid_rmid(tmpmask, NULL);
1743
1744 rdtgrp->flags = RDT_DELETED;
1745 free_rmid(rdtgrp->mon.rmid);
1746
1747 /*
1748 * Remove the rdtgrp from the parent ctrl_mon group's list
1749 */
1750 WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
1751 list_del(&rdtgrp->mon.crdtgrp_list);
1752
1753 /*
1754 * One extra hold on this; it will be dropped when we kfree(rdtgrp)
1755 * in rdtgroup_kn_unlock()
1756 */
1757 kernfs_get(kn);
1758 kernfs_remove(rdtgrp->kn);
1759
1760 return 0;
1761}
1762
1763static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
1764 cpumask_var_t tmpmask)
1765{
1766 int cpu;
1767
1096 /* Give any tasks back to the default group */ 1768 /* Give any tasks back to the default group */
1097 rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); 1769 rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
1098 1770
@@ -1100,18 +1772,28 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
1100 cpumask_or(&rdtgroup_default.cpu_mask, 1772 cpumask_or(&rdtgroup_default.cpu_mask,
1101 &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); 1773 &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
1102 1774
1103 /* Update per cpu closid of the moved CPUs first */ 1775 /* Update per cpu closid and rmid of the moved CPUs first */
1104 for_each_cpu(cpu, &rdtgrp->cpu_mask) 1776 for_each_cpu(cpu, &rdtgrp->cpu_mask) {
1105 per_cpu(cpu_closid, cpu) = closid; 1777 per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
1778 per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
1779 }
1780
1106 /* 1781 /*
1107 * Update the MSR on moved CPUs and CPUs which have moved 1782 * Update the MSR on moved CPUs and CPUs which have moved
1108 * task running on them. 1783 * task running on them.
1109 */ 1784 */
1110 cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); 1785 cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
1111 rdt_update_closid(tmpmask, NULL); 1786 update_closid_rmid(tmpmask, NULL);
1112 1787
1113 rdtgrp->flags = RDT_DELETED; 1788 rdtgrp->flags = RDT_DELETED;
1114 closid_free(rdtgrp->closid); 1789 closid_free(rdtgrp->closid);
1790 free_rmid(rdtgrp->mon.rmid);
1791
1792 /*
1793 * Free all the child monitor group rmids.
1794 */
1795 free_all_child_rdtgrp(rdtgrp);
1796
1115 list_del(&rdtgrp->rdtgroup_list); 1797 list_del(&rdtgrp->rdtgroup_list);
1116 1798
1117 /* 1799 /*
@@ -1120,7 +1802,41 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
1120 */ 1802 */
1121 kernfs_get(kn); 1803 kernfs_get(kn);
1122 kernfs_remove(rdtgrp->kn); 1804 kernfs_remove(rdtgrp->kn);
1123 ret = 0; 1805
1806 return 0;
1807}
1808
1809static int rdtgroup_rmdir(struct kernfs_node *kn)
1810{
1811 struct kernfs_node *parent_kn = kn->parent;
1812 struct rdtgroup *rdtgrp;
1813 cpumask_var_t tmpmask;
1814 int ret = 0;
1815
1816 if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
1817 return -ENOMEM;
1818
1819 rdtgrp = rdtgroup_kn_lock_live(kn);
1820 if (!rdtgrp) {
1821 ret = -EPERM;
1822 goto out;
1823 }
1824
1825 /*
1826 * If the rdtgroup is a ctrl_mon group and parent directory
1827 * is the root directory, remove the ctrl_mon group.
1828 *
1829 * If the rdtgroup is a mon group and parent directory
1830 * is a valid "mon_groups" directory, remove the mon group.
1831 */
1832 if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn)
1833 ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
1834 else if (rdtgrp->type == RDTMON_GROUP &&
1835 is_mon_groups(parent_kn, kn->name))
1836 ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask);
1837 else
1838 ret = -EPERM;
1839
1124out: 1840out:
1125 rdtgroup_kn_unlock(kn); 1841 rdtgroup_kn_unlock(kn);
1126 free_cpumask_var(tmpmask); 1842 free_cpumask_var(tmpmask);
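rdtgroup_rmdir() mirrors the mkdir dispatch: removing a directory under a mon_groups directory takes the rdtgroup_rmdir_mon() path, removing one directly under the root takes rdtgroup_rmdir_ctrl(), and anything else is rejected. A counterpart sketch to the mkdir example above, using the same made-up names:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Remove the mon group first, then its parent ctrl_mon group. */
	if (rmdir("/sys/fs/resctrl/grp0/mon_groups/m0"))
		perror("rmdir m0");	/* rdtgroup_rmdir_mon() */
	if (rmdir("/sys/fs/resctrl/grp0"))
		perror("rmdir grp0");	/* rdtgroup_rmdir_ctrl() */
	return 0;
}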
@@ -1129,7 +1845,7 @@ out:
1129 1845
1130static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) 1846static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
1131{ 1847{
1132 if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) 1848 if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
1133 seq_puts(seq, ",cdp"); 1849 seq_puts(seq, ",cdp");
1134 return 0; 1850 return 0;
1135} 1851}
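The ",cdp" string echoed by rdtgroup_show_options() is the mount option that enables code/data prioritization, i.e. the L3DATA/L3CODE split handled by cdp_enable()/cdp_disable() earlier in this file. A hedged sketch of passing it at mount time (CDP-capable hardware assumed):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("resctrl", "/sys/fs/resctrl", "resctrl", 0, "cdp")) {
		perror("mount -o cdp");
		return 1;
	}
	/* /proc/mounts will now show the ",cdp" option via show_options. */
	return 0;
}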
@@ -1153,10 +1869,13 @@ static int __init rdtgroup_setup_root(void)
1153 mutex_lock(&rdtgroup_mutex); 1869 mutex_lock(&rdtgroup_mutex);
1154 1870
1155 rdtgroup_default.closid = 0; 1871 rdtgroup_default.closid = 0;
1872 rdtgroup_default.mon.rmid = 0;
1873 rdtgroup_default.type = RDTCTRL_GROUP;
1874 INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
1875
1156 list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); 1876 list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
1157 1877
1158 ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, 1878 ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE);
1159 ARRAY_SIZE(rdtgroup_base_files));
1160 if (ret) { 1879 if (ret) {
1161 kernfs_destroy_root(rdt_root); 1880 kernfs_destroy_root(rdt_root);
1162 goto out; 1881 goto out;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index efc5eeb58292..11966251cd42 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -56,7 +56,7 @@
56#include <asm/debugreg.h> 56#include <asm/debugreg.h>
57#include <asm/switch_to.h> 57#include <asm/switch_to.h>
58#include <asm/vm86.h> 58#include <asm/vm86.h>
59#include <asm/intel_rdt.h> 59#include <asm/intel_rdt_sched.h>
60#include <asm/proto.h> 60#include <asm/proto.h>
61 61
62void __show_regs(struct pt_regs *regs, int all) 62void __show_regs(struct pt_regs *regs, int all)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c85269a76511..302e7b2572d1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
52#include <asm/switch_to.h> 52#include <asm/switch_to.h>
53#include <asm/xen/hypervisor.h> 53#include <asm/xen/hypervisor.h>
54#include <asm/vdso.h> 54#include <asm/vdso.h>
55#include <asm/intel_rdt.h> 55#include <asm/intel_rdt_sched.h>
56#include <asm/unistd.h> 56#include <asm/unistd.h>
57#ifdef CONFIG_IA32_EMULATION 57#ifdef CONFIG_IA32_EMULATION
58/* Not included via unistd.h */ 58/* Not included via unistd.h */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 718ba163c1b9..8e22f24ded6a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -139,14 +139,6 @@ struct hw_perf_event {
139 /* for tp_event->class */ 139 /* for tp_event->class */
140 struct list_head tp_list; 140 struct list_head tp_list;
141 }; 141 };
142 struct { /* intel_cqm */
143 int cqm_state;
144 u32 cqm_rmid;
145 int is_group_event;
146 struct list_head cqm_events_entry;
147 struct list_head cqm_groups_entry;
148 struct list_head cqm_group_entry;
149 };
150 struct { /* amd_power */ 142 struct { /* amd_power */
151 u64 pwr_acc; 143 u64 pwr_acc;
152 u64 ptsc; 144 u64 ptsc;
@@ -414,11 +406,6 @@ struct pmu {
414 406
415 407
416 /* 408 /*
417 * Return the count value for a counter.
418 */
419 u64 (*count) (struct perf_event *event); /*optional*/
420
421 /*
422 * Set up pmu-private data structures for an AUX area 409 * Set up pmu-private data structures for an AUX area
423 */ 410 */
424 void *(*setup_aux) (int cpu, void **pages, 411 void *(*setup_aux) (int cpu, void **pages,
@@ -1112,11 +1099,6 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
1112 __perf_event_task_sched_out(prev, next); 1099 __perf_event_task_sched_out(prev, next);
1113} 1100}
1114 1101
1115static inline u64 __perf_event_count(struct perf_event *event)
1116{
1117 return local64_read(&event->count) + atomic64_read(&event->child_count);
1118}
1119
1120extern void perf_event_mmap(struct vm_area_struct *vma); 1102extern void perf_event_mmap(struct vm_area_struct *vma);
1121extern struct perf_guest_info_callbacks *perf_guest_cbs; 1103extern struct perf_guest_info_callbacks *perf_guest_cbs;
1122extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); 1104extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9ba42c663fba..68b38335d33c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -909,8 +909,9 @@ struct task_struct {
909 /* cg_list protected by css_set_lock and tsk->alloc_lock: */ 909 /* cg_list protected by css_set_lock and tsk->alloc_lock: */
910 struct list_head cg_list; 910 struct list_head cg_list;
911#endif 911#endif
912#ifdef CONFIG_INTEL_RDT_A 912#ifdef CONFIG_INTEL_RDT
913 int closid; 913 u32 closid;
914 u32 rmid;
914#endif 915#endif
915#ifdef CONFIG_FUTEX 916#ifdef CONFIG_FUTEX
916 struct robust_list_head __user *robust_list; 917 struct robust_list_head __user *robust_list;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ce64f3fed5c6..294f1927f944 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3673,10 +3673,7 @@ unlock:
3673 3673
3674static inline u64 perf_event_count(struct perf_event *event) 3674static inline u64 perf_event_count(struct perf_event *event)
3675{ 3675{
3676 if (event->pmu->count) 3676 return local64_read(&event->count) + atomic64_read(&event->child_count);
3677 return event->pmu->count(event);
3678
3679 return __perf_event_count(event);
3680} 3677}
3681 3678
3682/* 3679/*
@@ -3707,15 +3704,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
3707 goto out; 3704 goto out;
3708 } 3705 }
3709 3706
3710 /*
3711 * It must not have a pmu::count method, those are not
3712 * NMI safe.
3713 */
3714 if (event->pmu->count) {
3715 ret = -EOPNOTSUPP;
3716 goto out;
3717 }
3718
3719 /* If this is a per-task event, it must be for current */ 3707 /* If this is a per-task event, it must be for current */
3720 if ((event->attach_state & PERF_ATTACH_TASK) && 3708 if ((event->attach_state & PERF_ATTACH_TASK) &&
3721 event->hw.target != current) { 3709 event->hw.target != current) {