 Documentation/admin-guide/kernel-parameters.rst                       |    1
 Documentation/admin-guide/kernel-parameters.txt                       |    6
 Documentation/x86/intel_rdt_ui.txt                                    |  323
 MAINTAINERS                                                           |    2
 arch/x86/Kconfig                                                      |   12
 arch/x86/events/intel/Makefile                                        |    2
 arch/x86/events/intel/cqm.c                                           | 1766
 arch/x86/include/asm/intel_rdt.h                                      |  286
 arch/x86/include/asm/intel_rdt_common.h                               |   27
 arch/x86/include/asm/intel_rdt_sched.h                                |   92
 arch/x86/kernel/cpu/Makefile                                          |    2
 arch/x86/kernel/cpu/intel_rdt.c                                       |  375
 arch/x86/kernel/cpu/intel_rdt.h                                       |  440
 arch/x86/kernel/cpu/{intel_rdt_schemata.c => intel_rdt_ctrlmondata.c} |   67
 arch/x86/kernel/cpu/intel_rdt_monitor.c                               |  499
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c                              | 1117
 arch/x86/kernel/process_32.c                                          |    2
 arch/x86/kernel/process_64.c                                          |    2
 include/linux/perf_event.h                                            |   18
 include/linux/sched.h                                                 |    5
 kernel/events/core.c                                                  |   14
 21 files changed, 2631 insertions(+), 2427 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index d76ab3907e2b..b2598cc9834c 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -138,6 +138,7 @@ parameter is applicable::
 	PPT	Parallel port support is enabled.
 	PS2	Appropriate PS/2 support is enabled.
 	RAM	RAM disk support is enabled.
+	RDT	Intel Resource Director Technology.
 	S390	S390 architecture is enabled.
 	SCSI	Appropriate SCSI support is enabled.
 	A lot of drivers have their options described inside
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index dad6fa01af95..591d48f3a7de 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3612,6 +3612,12 @@
 			Run specified binary instead of /init from the ramdisk,
 			used for early userspace startup. See initrd.
 
+	rdt=		[HW,X86,RDT]
+			Turn on/off individual RDT features. List is:
+			cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, mba.
+			E.g. to turn on cmt and turn off mba use:
+				rdt=cmt,!mba
+
 	reboot=		[KNL]
 			Format (x86 or x86_64):
 			[w[arm] | c[old] | h[ard] | s[oft] | g[pio]] \
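The new rdt= option is parsed from the kernel command line at boot. As a
minimal sketch (assuming a GRUB based distribution and a kernel built with
CONFIG_INTEL_RDT; file locations and the regeneration command differ between
distributions), enabling CMT while disabling MBA persistently could look like
adding "rdt=cmt,!mba" to GRUB_CMDLINE_LINUX in /etc/default/grub and then:

  # grub2-mkconfig -o /boot/grub2/grub.cfg
  # reboot
  # cat /proc/cmdline        # confirm the option reached the kernel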
diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt
index c491a1b82de2..4d8848e4e224 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -6,8 +6,8 @@ Fenghua Yu <fenghua.yu@intel.com>
 Tony Luck <tony.luck@intel.com>
 Vikas Shivappa <vikas.shivappa@intel.com>
 
-This feature is enabled by the CONFIG_INTEL_RDT_A Kconfig and the
-X86 /proc/cpuinfo flag bits "rdt", "cat_l3" and "cdp_l3".
+This feature is enabled by the CONFIG_INTEL_RDT Kconfig and the
+X86 /proc/cpuinfo flag bits "rdt", "cqm", "cat_l3" and "cdp_l3".
 
 To use the feature mount the file system:
 
@@ -17,6 +17,13 @@ mount options are:
 
 "cdp": Enable code/data prioritization in L3 cache allocations.
 
+RDT features are orthogonal. A particular system may support only
+monitoring, only control, or both monitoring and control.
+
+The mount succeeds if either of allocation or monitoring is present, but
+only those files and directories supported by the system will be created.
+For more details on the behavior of the interface during monitoring
+and allocation, see the "Resource alloc and monitor groups" section.
 
 Info directory
 --------------
@@ -24,7 +31,12 @@ Info directory
 The 'info' directory contains information about the enabled
 resources. Each resource has its own subdirectory. The subdirectory
 names reflect the resource names.
-Cache resource(L3/L2) subdirectory contains the following files:
+
+Each subdirectory contains the following files with respect to
+allocation:
+
+Cache resource(L3/L2) subdirectory contains the following files
+related to allocation:
 
 "num_closids":	The number of CLOSIDs which are valid for this
 		resource. The kernel uses the smallest number of
@@ -36,7 +48,15 @@ Cache resource(L3/L2) subdirectory contains the following files:
 "min_cbm_bits":	The minimum number of consecutive bits which
 		must be set when writing a mask.
 
-Memory bandwitdh(MB) subdirectory contains the following files:
+"shareable_bits":	Bitmask of shareable resource with other executing
+			entities (e.g. I/O). User can use this when
+			setting up exclusive cache partitions. Note that
+			some platforms support devices that have their
+			own settings for cache use which can over-ride
+			these bits.
+
+Memory bandwitdh(MB) subdirectory contains the following files
+with respect to allocation:
 
 "min_bandwidth":	The minimum memory bandwidth percentage which
 			user can request.
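As a small usage sketch for the info files described in this hunk (assuming
an L3 CAT capable system and the usual /sys/fs/resctrl mount point; the exact
set of files present depends on the hardware):

  # mount -t resctrl resctrl /sys/fs/resctrl
  # cat /sys/fs/resctrl/info/L3/num_closids
  # cat /sys/fs/resctrl/info/L3/min_cbm_bits
  # cat /sys/fs/resctrl/info/L3/shareable_bits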
@@ -52,48 +72,152 @@ Memory bandwitdh(MB) subdirectory contains the following files:
 		non-linear. This field is purely informational
 		only.
 
-Resource groups
----------------
+If RDT monitoring is available there will be an "L3_MON" directory
+with the following files:
+
+"num_rmids":		The number of RMIDs available. This is the
+			upper bound for how many "CTRL_MON" + "MON"
+			groups can be created.
+
+"mon_features":	Lists the monitoring events if
+			monitoring is enabled for the resource.
+
+"max_threshold_occupancy":
+			Read/write file provides the largest value (in
+			bytes) at which a previously used LLC_occupancy
+			counter can be considered for re-use.
+
+
+Resource alloc and monitor groups
+---------------------------------
+
 Resource groups are represented as directories in the resctrl file
-system. The default group is the root directory. Other groups may be
-created as desired by the system administrator using the "mkdir(1)"
-command, and removed using "rmdir(1)".
+system. The default group is the root directory which, immediately
+after mounting, owns all the tasks and cpus in the system and can make
+full use of all resources.
+
+On a system with RDT control features additional directories can be
+created in the root directory that specify different amounts of each
+resource (see "schemata" below). The root and these additional top level
+directories are referred to as "CTRL_MON" groups below.
+
+On a system with RDT monitoring the root directory and other top level
+directories contain a directory named "mon_groups" in which additional
+directories can be created to monitor subsets of tasks in the CTRL_MON
+group that is their ancestor. These are called "MON" groups in the rest
+of this document.
+
+Removing a directory will move all tasks and cpus owned by the group it
+represents to the parent. Removing one of the created CTRL_MON groups
+will automatically remove all MON groups below it.
+
+All groups contain the following files:
+
+"tasks":
+	Reading this file shows the list of all tasks that belong to
+	this group. Writing a task id to the file will add a task to the
+	group. If the group is a CTRL_MON group the task is removed from
+	whichever previous CTRL_MON group owned the task and also from
+	any MON group that owned the task. If the group is a MON group,
+	then the task must already belong to the CTRL_MON parent of this
+	group. The task is removed from any previous MON group.
+
+
+"cpus":
+	Reading this file shows a bitmask of the logical CPUs owned by
+	this group. Writing a mask to this file will add and remove
+	CPUs to/from this group. As with the tasks file a hierarchy is
+	maintained where MON groups may only include CPUs owned by the
+	parent CTRL_MON group.
+
 
-There are three files associated with each group:
+"cpus_list":
+	Just like "cpus", only using ranges of CPUs instead of bitmasks.
 
-"tasks": A list of tasks that belongs to this group. Tasks can be
-	added to a group by writing the task ID to the "tasks" file
-	(which will automatically remove them from the previous
-	group to which they belonged). New tasks created by fork(2)
-	and clone(2) are added to the same group as their parent.
-	If a pid is not in any sub partition, it is in root partition
-	(i.e. default partition).
 
-"cpus": A bitmask of logical CPUs assigned to this group. Writing
-	a new mask can add/remove CPUs from this group. Added CPUs
-	are removed from their previous group. Removed ones are
-	given to the default (root) group. You cannot remove CPUs
-	from the default group.
+When control is enabled all CTRL_MON groups will also contain:
 
-"cpus_list": One or more CPU ranges of logical CPUs assigned to this
-	group. Same rules apply like for the "cpus" file.
+"schemata":
+	A list of all the resources available to this group.
+	Each resource has its own line and format - see below for details.
 
-"schemata": A list of all the resources available to this group.
-	Each resource has its own line and format - see below for
-	details.
+When monitoring is enabled all MON groups will also contain:
 
-When a task is running the following rules define which resources
-are available to it:
+"mon_data":
+	This contains a set of files organized by L3 domain and by
+	RDT event. E.g. on a system with two L3 domains there will
+	be subdirectories "mon_L3_00" and "mon_L3_01". Each of these
+	directories have one file per event (e.g. "llc_occupancy",
+	"mbm_total_bytes", and "mbm_local_bytes"). In a MON group these
+	files provide a read out of the current value of the event for
+	all tasks in the group. In CTRL_MON groups these files provide
+	the sum for all tasks in the CTRL_MON group and all tasks in
+	MON groups. Please see example section for more details on usage.
+
+Resource allocation rules
+-------------------------
+When a task is running the following rules define which resources are
+available to it:
 
 1) If the task is a member of a non-default group, then the schemata
    for that group is used.
 
 2) Else if the task belongs to the default group, but is running on a
-   CPU that is assigned to some specific group, then the schemata for
-   the CPU's group is used.
+   CPU that is assigned to some specific group, then the schemata for the
+   CPU's group is used.
 
 3) Otherwise the schemata for the default group is used.
 
+Resource monitoring rules
+-------------------------
+1) If a task is a member of a MON group, or non-default CTRL_MON group
+   then RDT events for the task will be reported in that group.
+
+2) If a task is a member of the default CTRL_MON group, but is running
+   on a CPU that is assigned to some specific group, then the RDT events
+   for the task will be reported in that group.
+
+3) Otherwise RDT events for the task will be reported in the root level
+   "mon_data" group.
+
+
+Notes on cache occupancy monitoring and control
+-----------------------------------------------
+When moving a task from one group to another you should remember that
+this only affects *new* cache allocations by the task. E.g. you may have
+a task in a monitor group showing 3 MB of cache occupancy. If you move
+to a new group and immediately check the occupancy of the old and new
+groups you will likely see that the old group is still showing 3 MB and
+the new group zero. When the task accesses locations still in cache from
+before the move, the h/w does not update any counters. On a busy system
+you will likely see the occupancy in the old group go down as cache lines
+are evicted and re-used while the occupancy in the new group rises as
+the task accesses memory and loads into the cache are counted based on
+membership in the new group.
+
+The same applies to cache allocation control. Moving a task to a group
+with a smaller cache partition will not evict any cache lines. The
+process may continue to use them from the old partition.
+
+Hardware uses CLOSid(Class of service ID) and an RMID(Resource monitoring ID)
+to identify a control group and a monitoring group respectively. Each of
+the resource groups are mapped to these IDs based on the kind of group. The
+number of CLOSid and RMID are limited by the hardware and hence the creation of
+a "CTRL_MON" directory may fail if we run out of either CLOSID or RMID
+and creation of "MON" group may fail if we run out of RMIDs.
+
+max_threshold_occupancy - generic concepts
+------------------------------------------
+
+Note that an RMID once freed may not be immediately available for use as
+the RMID is still tagged the cache lines of the previous user of RMID.
+Hence such RMIDs are placed on limbo list and checked back if the cache
+occupancy has gone down. If there is a time when system has a lot of
+limbo RMIDs but which are not ready to be used, user may see an -EBUSY
+during mkdir.
+
+max_threshold_occupancy is a user configurable value to determine the
+occupancy at which an RMID can be freed.
 
 Schemata files - general concepts
 ---------------------------------
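To make the group semantics above concrete, here is a minimal sketch that
builds a CTRL_MON group with one MON group under it, moves a (hypothetical)
task and a pair of CPUs into it, and raises the RMID re-use threshold. It
assumes both allocation and monitoring are supported and that resctrl is
already mounted:

  # cd /sys/fs/resctrl
  # mkdir grp0                               # new CTRL_MON group
  # mkdir grp0/mon_groups/sub0               # MON group under grp0
  # echo 1234 > grp0/tasks                   # task joins the CTRL_MON group
  # echo 1234 > grp0/mon_groups/sub0/tasks   # now also tracked by sub0
  # echo c > grp0/cpus                       # hand CPUs 2-3 to grp0
  # echo 262144 > info/L3_MON/max_threshold_occupancy
  # rmdir grp0                               # removes sub0 too; tasks and cpus return to the root group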
@@ -143,22 +267,22 @@ SKUs. Using a high bandwidth and a low bandwidth setting on two threads
 sharing a core will result in both threads being throttled to use the
 low bandwidth.
 
-L3 details (code and data prioritization disabled)
---------------------------------------------------
+L3 schemata file details (code and data prioritization disabled)
+----------------------------------------------------------------
 With CDP disabled the L3 schemata format is:
 
 	L3:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
 
-L3 details (CDP enabled via mount option to resctrl)
-----------------------------------------------------
+L3 schemata file details (CDP enabled via mount option to resctrl)
+------------------------------------------------------------------
 When CDP is enabled L3 control is split into two separate resources
 so you can specify independent masks for code and data like this:
 
 	L3data:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
 	L3code:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
 
-L2 details
-----------
+L2 schemata file details
+------------------------
 L2 cache does not support code and data prioritization, so the
 schemata format is always:
 
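Because a cache bit mask has to be a contiguous run of set bits on current
hardware (see "min_cbm_bits" above), it can be convenient to compute it with
shell arithmetic rather than by hand. A sketch, assuming a control group p0
created earlier and a resource with a 20 bit mask:

  # MASK=$(printf '%x' $(( ((1 << 4) - 1) << 8 )))   # 4 consecutive ways starting at bit 8 -> f00
  # echo "L3:0=$MASK;1=$MASK" > /sys/fs/resctrl/p0/schemata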
@@ -185,6 +309,8 @@ L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
 L3DATA:0=fffff;1=fffff;2=3c0;3=fffff
 L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
 
+Examples for RDT allocation usage:
+
 Example 1
 ---------
 On a two socket machine (one L3 cache per socket) with just four bits
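Note that the split L3DATA/L3CODE lines shown above only exist when the
filesystem is mounted with the cdp option; remounting switches between the
two layouts, e.g.:

  # umount /sys/fs/resctrl
  # mount -t resctrl -o cdp resctrl /sys/fs/resctrl
  # cat /sys/fs/resctrl/schemata       # now lists L3DATA: and L3CODE: lines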
@@ -410,3 +536,124 @@ void main(void)
 		/* code to read and write directory contents */
 		resctrl_release_lock(fd);
 	}
+
+Examples for RDT Monitoring along with allocation usage:
+
+Reading monitored data
+----------------------
+Reading an event file (for ex: mon_data/mon_L3_00/llc_occupancy) would
+show the current snapshot of LLC occupancy of the corresponding MON
+group or CTRL_MON group.
+
+
+Example 1 (Monitor CTRL_MON group and subset of tasks in CTRL_MON group)
+---------
+On a two socket machine (one L3 cache per socket) with just four bits
+for cache bit masks
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir p0 p1
+# echo "L3:0=3;1=c" > /sys/fs/resctrl/p0/schemata
+# echo "L3:0=3;1=3" > /sys/fs/resctrl/p1/schemata
+# echo 5678 > p1/tasks
+# echo 5679 > p1/tasks
+
+The default resource group is unmodified, so we have access to all parts
+of all caches (its schemata file reads "L3:0=f;1=f").
+
+Tasks that are under the control of group "p0" may only allocate from the
+"lower" 50% on cache ID 0, and the "upper" 50% of cache ID 1.
+Tasks in group "p1" use the "lower" 50% of cache on both sockets.
+
+Create monitor groups and assign a subset of tasks to each monitor group.
+
+# cd /sys/fs/resctrl/p1/mon_groups
+# mkdir m11 m12
+# echo 5678 > m11/tasks
+# echo 5679 > m12/tasks
+
+fetch data (data shown in bytes)
+
+# cat m11/mon_data/mon_L3_00/llc_occupancy
+16234000
+# cat m11/mon_data/mon_L3_01/llc_occupancy
+14789000
+# cat m12/mon_data/mon_L3_00/llc_occupancy
+16789000
+
+The parent ctrl_mon group shows the aggregated data.
+
+# cat /sys/fs/resctrl/p1/mon_data/mon_l3_00/llc_occupancy
+31234000
+
+Example 2 (Monitor a task from its creation)
+---------
+On a two socket machine (one L3 cache per socket)
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir p0 p1
+
+An RMID is allocated to the group once its created and hence the <cmd>
+below is monitored from its creation.
+
+# echo $$ > /sys/fs/resctrl/p1/tasks
+# <cmd>
+
+Fetch the data
+
+# cat /sys/fs/resctrl/p1/mon_data/mon_l3_00/llc_occupancy
+31789000
+
+Example 3 (Monitor without CAT support or before creating CAT groups)
+---------
+
+Assume a system like HSW has only CQM and no CAT support. In this case
+the resctrl will still mount but cannot create CTRL_MON directories.
+But user can create different MON groups within the root group thereby
+able to monitor all tasks including kernel threads.
+
+This can also be used to profile jobs cache size footprint before being
+able to allocate them to different allocation groups.
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir mon_groups/m01
+# mkdir mon_groups/m02
+
+# echo 3478 > /sys/fs/resctrl/mon_groups/m01/tasks
+# echo 2467 > /sys/fs/resctrl/mon_groups/m02/tasks
+
+Monitor the groups separately and also get per domain data. From the
+below its apparent that the tasks are mostly doing work on
+domain(socket) 0.
+
+# cat /sys/fs/resctrl/mon_groups/m01/mon_L3_00/llc_occupancy
+31234000
+# cat /sys/fs/resctrl/mon_groups/m01/mon_L3_01/llc_occupancy
+34555
+# cat /sys/fs/resctrl/mon_groups/m02/mon_L3_00/llc_occupancy
+31234000
+# cat /sys/fs/resctrl/mon_groups/m02/mon_L3_01/llc_occupancy
+32789
+
+
+Example 4 (Monitor real time tasks)
+-----------------------------------
+
+A single socket system which has real time tasks running on cores 4-7
+and non real time tasks on other cpus. We want to monitor the cache
+occupancy of the real time threads on these cores.
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir p1
+
+Move the cpus 4-7 over to p1
+# echo f0 > p0/cpus
+
+View the llc occupancy snapshot
+
+# cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy
+11234000
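The mbm_total_bytes and mbm_local_bytes files report cumulative byte counts,
so a bandwidth figure is obtained by sampling twice and dividing by the
interval. A rough sketch, assuming group p1 from the examples above and that
"mon_features" lists the MBM events:

  # F=/sys/fs/resctrl/p1/mon_data/mon_L3_00/mbm_total_bytes
  # B0=$(cat $F); sleep 1; B1=$(cat $F)
  # echo "$(( (B1 - B0) / 1048576 )) MiB/s"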
diff --git a/MAINTAINERS b/MAINTAINERS
index b81e93b71c4b..8ef4694af6e8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11121,7 +11121,7 @@ M:	Fenghua Yu <fenghua.yu@intel.com>
 L:	linux-kernel@vger.kernel.org
 S:	Supported
 F:	arch/x86/kernel/cpu/intel_rdt*
-F:	arch/x86/include/asm/intel_rdt*
+F:	arch/x86/include/asm/intel_rdt_sched.h
 F:	Documentation/x86/intel_rdt*
 
 READ-COPY UPDATE (RCU)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b4b27ab016f6..acb366bf6bc1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -429,16 +429,16 @@ config GOLDFISH
 	def_bool y
 	depends on X86_GOLDFISH
 
-config INTEL_RDT_A
-	bool "Intel Resource Director Technology Allocation support"
+config INTEL_RDT
+	bool "Intel Resource Director Technology support"
 	default n
 	depends on X86 && CPU_SUP_INTEL
 	select KERNFS
 	help
-	  Select to enable resource allocation which is a sub-feature of
-	  Intel Resource Director Technology(RDT). More information about
-	  RDT can be found in the Intel x86 Architecture Software
-	  Developer Manual.
+	  Select to enable resource allocation and monitoring which are
+	  sub-features of Intel Resource Director Technology(RDT). More
+	  information about RDT can be found in the Intel x86
+	  Architecture Software Developer Manual.
 
 	  Say N if unsure.
 
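Because the Kconfig symbol is renamed, a .config that previously set
CONFIG_INTEL_RDT_A=y will not pick up the new option automatically. One way
to refresh it, as a sketch using the kernel's scripts/config helper from the
top of the source tree:

  # ./scripts/config --enable INTEL_RDT
  # make olddefconfig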
diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile
index 06c2baa51814..e9d8520a801a 100644
--- a/arch/x86/events/intel/Makefile
+++ b/arch/x86/events/intel/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_CPU_SUP_INTEL)		+= core.o bts.o cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= core.o bts.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= ds.o knc.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= lbr.o p4.o p6.o pt.o
 obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL)	+= intel-rapl-perf.o
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
deleted file mode 100644
index 2521f771f2f5..000000000000
--- a/arch/x86/events/intel/cqm.c
+++ /dev/null
@@ -1,1766 +0,0 @@
| 1 | /* | ||
| 2 | * Intel Cache Quality-of-Service Monitoring (CQM) support. | ||
| 3 | * | ||
| 4 | * Based very, very heavily on work by Peter Zijlstra. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/perf_event.h> | ||
| 8 | #include <linux/slab.h> | ||
| 9 | #include <asm/cpu_device_id.h> | ||
| 10 | #include <asm/intel_rdt_common.h> | ||
| 11 | #include "../perf_event.h" | ||
| 12 | |||
| 13 | #define MSR_IA32_QM_CTR 0x0c8e | ||
| 14 | #define MSR_IA32_QM_EVTSEL 0x0c8d | ||
| 15 | |||
| 16 | #define MBM_CNTR_WIDTH 24 | ||
| 17 | /* | ||
| 18 | * Guaranteed time in ms as per SDM where MBM counters will not overflow. | ||
| 19 | */ | ||
| 20 | #define MBM_CTR_OVERFLOW_TIME 1000 | ||
| 21 | |||
| 22 | static u32 cqm_max_rmid = -1; | ||
| 23 | static unsigned int cqm_l3_scale; /* supposedly cacheline size */ | ||
| 24 | static bool cqm_enabled, mbm_enabled; | ||
| 25 | unsigned int mbm_socket_max; | ||
| 26 | |||
| 27 | /* | ||
| 28 | * The cached intel_pqr_state is strictly per CPU and can never be | ||
| 29 | * updated from a remote CPU. Both functions which modify the state | ||
| 30 | * (intel_cqm_event_start and intel_cqm_event_stop) are called with | ||
| 31 | * interrupts disabled, which is sufficient for the protection. | ||
| 32 | */ | ||
| 33 | DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); | ||
| 34 | static struct hrtimer *mbm_timers; | ||
| 35 | /** | ||
| 36 | * struct sample - mbm event's (local or total) data | ||
| 37 | * @total_bytes #bytes since we began monitoring | ||
| 38 | * @prev_msr previous value of MSR | ||
| 39 | */ | ||
| 40 | struct sample { | ||
| 41 | u64 total_bytes; | ||
| 42 | u64 prev_msr; | ||
| 43 | }; | ||
| 44 | |||
| 45 | /* | ||
| 46 | * samples profiled for total memory bandwidth type events | ||
| 47 | */ | ||
| 48 | static struct sample *mbm_total; | ||
| 49 | /* | ||
| 50 | * samples profiled for local memory bandwidth type events | ||
| 51 | */ | ||
| 52 | static struct sample *mbm_local; | ||
| 53 | |||
| 54 | #define pkg_id topology_physical_package_id(smp_processor_id()) | ||
| 55 | /* | ||
| 56 | * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array. | ||
| 57 | * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of | ||
| 58 | * rmids per socket, an example is given below | ||
| 59 | * RMID1 of Socket0: vrmid = 1 | ||
| 60 | * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1 | ||
| 61 | * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1 | ||
| 62 | */ | ||
| 63 | #define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid) | ||
| 64 | /* | ||
| 65 | * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. | ||
| 66 | * Also protects event->hw.cqm_rmid | ||
| 67 | * | ||
| 68 | * Hold either for stability, both for modification of ->hw.cqm_rmid. | ||
| 69 | */ | ||
| 70 | static DEFINE_MUTEX(cache_mutex); | ||
| 71 | static DEFINE_RAW_SPINLOCK(cache_lock); | ||
| 72 | |||
| 73 | /* | ||
| 74 | * Groups of events that have the same target(s), one RMID per group. | ||
| 75 | */ | ||
| 76 | static LIST_HEAD(cache_groups); | ||
| 77 | |||
| 78 | /* | ||
| 79 | * Mask of CPUs for reading CQM values. We only need one per-socket. | ||
| 80 | */ | ||
| 81 | static cpumask_t cqm_cpumask; | ||
| 82 | |||
| 83 | #define RMID_VAL_ERROR (1ULL << 63) | ||
| 84 | #define RMID_VAL_UNAVAIL (1ULL << 62) | ||
| 85 | |||
| 86 | /* | ||
| 87 | * Event IDs are used to program IA32_QM_EVTSEL before reading event | ||
| 88 | * counter from IA32_QM_CTR | ||
| 89 | */ | ||
| 90 | #define QOS_L3_OCCUP_EVENT_ID 0x01 | ||
| 91 | #define QOS_MBM_TOTAL_EVENT_ID 0x02 | ||
| 92 | #define QOS_MBM_LOCAL_EVENT_ID 0x03 | ||
| 93 | |||
| 94 | /* | ||
| 95 | * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). | ||
| 96 | * | ||
| 97 | * This rmid is always free and is guaranteed to have an associated | ||
| 98 | * near-zero occupancy value, i.e. no cachelines are tagged with this | ||
| 99 | * RMID, once __intel_cqm_rmid_rotate() returns. | ||
| 100 | */ | ||
| 101 | static u32 intel_cqm_rotation_rmid; | ||
| 102 | |||
| 103 | #define INVALID_RMID (-1) | ||
| 104 | |||
| 105 | /* | ||
| 106 | * Is @rmid valid for programming the hardware? | ||
| 107 | * | ||
| 108 | * rmid 0 is reserved by the hardware for all non-monitored tasks, which | ||
| 109 | * means that we should never come across an rmid with that value. | ||
| 110 | * Likewise, an rmid value of -1 is used to indicate "no rmid currently | ||
| 111 | * assigned" and is used as part of the rotation code. | ||
| 112 | */ | ||
| 113 | static inline bool __rmid_valid(u32 rmid) | ||
| 114 | { | ||
| 115 | if (!rmid || rmid == INVALID_RMID) | ||
| 116 | return false; | ||
| 117 | |||
| 118 | return true; | ||
| 119 | } | ||
| 120 | |||
| 121 | static u64 __rmid_read(u32 rmid) | ||
| 122 | { | ||
| 123 | u64 val; | ||
| 124 | |||
| 125 | /* | ||
| 126 | * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, | ||
| 127 | * it just says that to increase confusion. | ||
| 128 | */ | ||
| 129 | wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); | ||
| 130 | rdmsrl(MSR_IA32_QM_CTR, val); | ||
| 131 | |||
| 132 | /* | ||
| 133 | * Aside from the ERROR and UNAVAIL bits, assume this thing returns | ||
| 134 | * the number of cachelines tagged with @rmid. | ||
| 135 | */ | ||
| 136 | return val; | ||
| 137 | } | ||
| 138 | |||
| 139 | enum rmid_recycle_state { | ||
| 140 | RMID_YOUNG = 0, | ||
| 141 | RMID_AVAILABLE, | ||
| 142 | RMID_DIRTY, | ||
| 143 | }; | ||
| 144 | |||
| 145 | struct cqm_rmid_entry { | ||
| 146 | u32 rmid; | ||
| 147 | enum rmid_recycle_state state; | ||
| 148 | struct list_head list; | ||
| 149 | unsigned long queue_time; | ||
| 150 | }; | ||
| 151 | |||
| 152 | /* | ||
| 153 | * cqm_rmid_free_lru - A least recently used list of RMIDs. | ||
| 154 | * | ||
| 155 | * Oldest entry at the head, newest (most recently used) entry at the | ||
| 156 | * tail. This list is never traversed, it's only used to keep track of | ||
| 157 | * the lru order. That is, we only pick entries of the head or insert | ||
| 158 | * them on the tail. | ||
| 159 | * | ||
| 160 | * All entries on the list are 'free', and their RMIDs are not currently | ||
| 161 | * in use. To mark an RMID as in use, remove its entry from the lru | ||
| 162 | * list. | ||
| 163 | * | ||
| 164 | * | ||
| 165 | * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. | ||
| 166 | * | ||
| 167 | * This list is contains RMIDs that no one is currently using but that | ||
| 168 | * may have a non-zero occupancy value associated with them. The | ||
| 169 | * rotation worker moves RMIDs from the limbo list to the free list once | ||
| 170 | * the occupancy value drops below __intel_cqm_threshold. | ||
| 171 | * | ||
| 172 | * Both lists are protected by cache_mutex. | ||
| 173 | */ | ||
| 174 | static LIST_HEAD(cqm_rmid_free_lru); | ||
| 175 | static LIST_HEAD(cqm_rmid_limbo_lru); | ||
| 176 | |||
| 177 | /* | ||
| 178 | * We use a simple array of pointers so that we can lookup a struct | ||
| 179 | * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() | ||
| 180 | * and __put_rmid() from having to worry about dealing with struct | ||
| 181 | * cqm_rmid_entry - they just deal with rmids, i.e. integers. | ||
| 182 | * | ||
| 183 | * Once this array is initialized it is read-only. No locks are required | ||
| 184 | * to access it. | ||
| 185 | * | ||
| 186 | * All entries for all RMIDs can be looked up in the this array at all | ||
| 187 | * times. | ||
| 188 | */ | ||
| 189 | static struct cqm_rmid_entry **cqm_rmid_ptrs; | ||
| 190 | |||
| 191 | static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) | ||
| 192 | { | ||
| 193 | struct cqm_rmid_entry *entry; | ||
| 194 | |||
| 195 | entry = cqm_rmid_ptrs[rmid]; | ||
| 196 | WARN_ON(entry->rmid != rmid); | ||
| 197 | |||
| 198 | return entry; | ||
| 199 | } | ||
| 200 | |||
| 201 | /* | ||
| 202 | * Returns < 0 on fail. | ||
| 203 | * | ||
| 204 | * We expect to be called with cache_mutex held. | ||
| 205 | */ | ||
| 206 | static u32 __get_rmid(void) | ||
| 207 | { | ||
| 208 | struct cqm_rmid_entry *entry; | ||
| 209 | |||
| 210 | lockdep_assert_held(&cache_mutex); | ||
| 211 | |||
| 212 | if (list_empty(&cqm_rmid_free_lru)) | ||
| 213 | return INVALID_RMID; | ||
| 214 | |||
| 215 | entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); | ||
| 216 | list_del(&entry->list); | ||
| 217 | |||
| 218 | return entry->rmid; | ||
| 219 | } | ||
| 220 | |||
| 221 | static void __put_rmid(u32 rmid) | ||
| 222 | { | ||
| 223 | struct cqm_rmid_entry *entry; | ||
| 224 | |||
| 225 | lockdep_assert_held(&cache_mutex); | ||
| 226 | |||
| 227 | WARN_ON(!__rmid_valid(rmid)); | ||
| 228 | entry = __rmid_entry(rmid); | ||
| 229 | |||
| 230 | entry->queue_time = jiffies; | ||
| 231 | entry->state = RMID_YOUNG; | ||
| 232 | |||
| 233 | list_add_tail(&entry->list, &cqm_rmid_limbo_lru); | ||
| 234 | } | ||
| 235 | |||
| 236 | static void cqm_cleanup(void) | ||
| 237 | { | ||
| 238 | int i; | ||
| 239 | |||
| 240 | if (!cqm_rmid_ptrs) | ||
| 241 | return; | ||
| 242 | |||
| 243 | for (i = 0; i < cqm_max_rmid; i++) | ||
| 244 | kfree(cqm_rmid_ptrs[i]); | ||
| 245 | |||
| 246 | kfree(cqm_rmid_ptrs); | ||
| 247 | cqm_rmid_ptrs = NULL; | ||
| 248 | cqm_enabled = false; | ||
| 249 | } | ||
| 250 | |||
| 251 | static int intel_cqm_setup_rmid_cache(void) | ||
| 252 | { | ||
| 253 | struct cqm_rmid_entry *entry; | ||
| 254 | unsigned int nr_rmids; | ||
| 255 | int r = 0; | ||
| 256 | |||
| 257 | nr_rmids = cqm_max_rmid + 1; | ||
| 258 | cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) * | ||
| 259 | nr_rmids, GFP_KERNEL); | ||
| 260 | if (!cqm_rmid_ptrs) | ||
| 261 | return -ENOMEM; | ||
| 262 | |||
| 263 | for (; r <= cqm_max_rmid; r++) { | ||
| 264 | struct cqm_rmid_entry *entry; | ||
| 265 | |||
| 266 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); | ||
| 267 | if (!entry) | ||
| 268 | goto fail; | ||
| 269 | |||
| 270 | INIT_LIST_HEAD(&entry->list); | ||
| 271 | entry->rmid = r; | ||
| 272 | cqm_rmid_ptrs[r] = entry; | ||
| 273 | |||
| 274 | list_add_tail(&entry->list, &cqm_rmid_free_lru); | ||
| 275 | } | ||
| 276 | |||
| 277 | /* | ||
| 278 | * RMID 0 is special and is always allocated. It's used for all | ||
| 279 | * tasks that are not monitored. | ||
| 280 | */ | ||
| 281 | entry = __rmid_entry(0); | ||
| 282 | list_del(&entry->list); | ||
| 283 | |||
| 284 | mutex_lock(&cache_mutex); | ||
| 285 | intel_cqm_rotation_rmid = __get_rmid(); | ||
| 286 | mutex_unlock(&cache_mutex); | ||
| 287 | |||
| 288 | return 0; | ||
| 289 | |||
| 290 | fail: | ||
| 291 | cqm_cleanup(); | ||
| 292 | return -ENOMEM; | ||
| 293 | } | ||
| 294 | |||
| 295 | /* | ||
| 296 | * Determine if @a and @b measure the same set of tasks. | ||
| 297 | * | ||
| 298 | * If @a and @b measure the same set of tasks then we want to share a | ||
| 299 | * single RMID. | ||
| 300 | */ | ||
| 301 | static bool __match_event(struct perf_event *a, struct perf_event *b) | ||
| 302 | { | ||
| 303 | /* Per-cpu and task events don't mix */ | ||
| 304 | if ((a->attach_state & PERF_ATTACH_TASK) != | ||
| 305 | (b->attach_state & PERF_ATTACH_TASK)) | ||
| 306 | return false; | ||
| 307 | |||
| 308 | #ifdef CONFIG_CGROUP_PERF | ||
| 309 | if (a->cgrp != b->cgrp) | ||
| 310 | return false; | ||
| 311 | #endif | ||
| 312 | |||
| 313 | /* If not task event, we're machine wide */ | ||
| 314 | if (!(b->attach_state & PERF_ATTACH_TASK)) | ||
| 315 | return true; | ||
| 316 | |||
| 317 | /* | ||
| 318 | * Events that target same task are placed into the same cache group. | ||
| 319 | * Mark it as a multi event group, so that we update ->count | ||
| 320 | * for every event rather than just the group leader later. | ||
| 321 | */ | ||
| 322 | if (a->hw.target == b->hw.target) { | ||
| 323 | b->hw.is_group_event = true; | ||
| 324 | return true; | ||
| 325 | } | ||
| 326 | |||
| 327 | /* | ||
| 328 | * Are we an inherited event? | ||
| 329 | */ | ||
| 330 | if (b->parent == a) | ||
| 331 | return true; | ||
| 332 | |||
| 333 | return false; | ||
| 334 | } | ||
| 335 | |||
| 336 | #ifdef CONFIG_CGROUP_PERF | ||
| 337 | static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) | ||
| 338 | { | ||
| 339 | if (event->attach_state & PERF_ATTACH_TASK) | ||
| 340 | return perf_cgroup_from_task(event->hw.target, event->ctx); | ||
| 341 | |||
| 342 | return event->cgrp; | ||
| 343 | } | ||
| 344 | #endif | ||
| 345 | |||
| 346 | /* | ||
| 347 | * Determine if @a's tasks intersect with @b's tasks | ||
| 348 | * | ||
| 349 | * There are combinations of events that we explicitly prohibit, | ||
| 350 | * | ||
| 351 | * PROHIBITS | ||
| 352 | * system-wide -> cgroup and task | ||
| 353 | * cgroup -> system-wide | ||
| 354 | * -> task in cgroup | ||
| 355 | * task -> system-wide | ||
| 356 | * -> task in cgroup | ||
| 357 | * | ||
| 358 | * Call this function before allocating an RMID. | ||
| 359 | */ | ||
| 360 | static bool __conflict_event(struct perf_event *a, struct perf_event *b) | ||
| 361 | { | ||
| 362 | #ifdef CONFIG_CGROUP_PERF | ||
| 363 | /* | ||
| 364 | * We can have any number of cgroups but only one system-wide | ||
| 365 | * event at a time. | ||
| 366 | */ | ||
| 367 | if (a->cgrp && b->cgrp) { | ||
| 368 | struct perf_cgroup *ac = a->cgrp; | ||
| 369 | struct perf_cgroup *bc = b->cgrp; | ||
| 370 | |||
| 371 | /* | ||
| 372 | * This condition should have been caught in | ||
| 373 | * __match_event() and we should be sharing an RMID. | ||
| 374 | */ | ||
| 375 | WARN_ON_ONCE(ac == bc); | ||
| 376 | |||
| 377 | if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || | ||
| 378 | cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) | ||
| 379 | return true; | ||
| 380 | |||
| 381 | return false; | ||
| 382 | } | ||
| 383 | |||
| 384 | if (a->cgrp || b->cgrp) { | ||
| 385 | struct perf_cgroup *ac, *bc; | ||
| 386 | |||
| 387 | /* | ||
| 388 | * cgroup and system-wide events are mutually exclusive | ||
| 389 | */ | ||
| 390 | if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || | ||
| 391 | (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) | ||
| 392 | return true; | ||
| 393 | |||
| 394 | /* | ||
| 395 | * Ensure neither event is part of the other's cgroup | ||
| 396 | */ | ||
| 397 | ac = event_to_cgroup(a); | ||
| 398 | bc = event_to_cgroup(b); | ||
| 399 | if (ac == bc) | ||
| 400 | return true; | ||
| 401 | |||
| 402 | /* | ||
| 403 | * Must have cgroup and non-intersecting task events. | ||
| 404 | */ | ||
| 405 | if (!ac || !bc) | ||
| 406 | return false; | ||
| 407 | |||
| 408 | /* | ||
| 409 | * We have cgroup and task events, and the task belongs | ||
| 410 | * to a cgroup. Check for for overlap. | ||
| 411 | */ | ||
| 412 | if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || | ||
| 413 | cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) | ||
| 414 | return true; | ||
| 415 | |||
| 416 | return false; | ||
| 417 | } | ||
| 418 | #endif | ||
| 419 | /* | ||
| 420 | * If one of them is not a task, same story as above with cgroups. | ||
| 421 | */ | ||
| 422 | if (!(a->attach_state & PERF_ATTACH_TASK) || | ||
| 423 | !(b->attach_state & PERF_ATTACH_TASK)) | ||
| 424 | return true; | ||
| 425 | |||
| 426 | /* | ||
| 427 | * Must be non-overlapping. | ||
| 428 | */ | ||
| 429 | return false; | ||
| 430 | } | ||
| 431 | |||
| 432 | struct rmid_read { | ||
| 433 | u32 rmid; | ||
| 434 | u32 evt_type; | ||
| 435 | atomic64_t value; | ||
| 436 | }; | ||
| 437 | |||
| 438 | static void __intel_cqm_event_count(void *info); | ||
| 439 | static void init_mbm_sample(u32 rmid, u32 evt_type); | ||
| 440 | static void __intel_mbm_event_count(void *info); | ||
| 441 | |||
| 442 | static bool is_cqm_event(int e) | ||
| 443 | { | ||
| 444 | return (e == QOS_L3_OCCUP_EVENT_ID); | ||
| 445 | } | ||
| 446 | |||
| 447 | static bool is_mbm_event(int e) | ||
| 448 | { | ||
| 449 | return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); | ||
| 450 | } | ||
| 451 | |||
| 452 | static void cqm_mask_call(struct rmid_read *rr) | ||
| 453 | { | ||
| 454 | if (is_mbm_event(rr->evt_type)) | ||
| 455 | on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1); | ||
| 456 | else | ||
| 457 | on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1); | ||
| 458 | } | ||
| 459 | |||
| 460 | /* | ||
| 461 | * Exchange the RMID of a group of events. | ||
| 462 | */ | ||
| 463 | static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) | ||
| 464 | { | ||
| 465 | struct perf_event *event; | ||
| 466 | struct list_head *head = &group->hw.cqm_group_entry; | ||
| 467 | u32 old_rmid = group->hw.cqm_rmid; | ||
| 468 | |||
| 469 | lockdep_assert_held(&cache_mutex); | ||
| 470 | |||
| 471 | /* | ||
| 472 | * If our RMID is being deallocated, perform a read now. | ||
| 473 | */ | ||
| 474 | if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { | ||
| 475 | struct rmid_read rr = { | ||
| 476 | .rmid = old_rmid, | ||
| 477 | .evt_type = group->attr.config, | ||
| 478 | .value = ATOMIC64_INIT(0), | ||
| 479 | }; | ||
| 480 | |||
| 481 | cqm_mask_call(&rr); | ||
| 482 | local64_set(&group->count, atomic64_read(&rr.value)); | ||
| 483 | } | ||
| 484 | |||
| 485 | raw_spin_lock_irq(&cache_lock); | ||
| 486 | |||
| 487 | group->hw.cqm_rmid = rmid; | ||
| 488 | list_for_each_entry(event, head, hw.cqm_group_entry) | ||
| 489 | event->hw.cqm_rmid = rmid; | ||
| 490 | |||
| 491 | raw_spin_unlock_irq(&cache_lock); | ||
| 492 | |||
| 493 | /* | ||
| 494 | * If the allocation is for mbm, init the mbm stats. | ||
| 495 | * Need to check if each event in the group is mbm event | ||
| 496 | * because there could be multiple type of events in the same group. | ||
| 497 | */ | ||
| 498 | if (__rmid_valid(rmid)) { | ||
| 499 | event = group; | ||
| 500 | if (is_mbm_event(event->attr.config)) | ||
| 501 | init_mbm_sample(rmid, event->attr.config); | ||
| 502 | |||
| 503 | list_for_each_entry(event, head, hw.cqm_group_entry) { | ||
| 504 | if (is_mbm_event(event->attr.config)) | ||
| 505 | init_mbm_sample(rmid, event->attr.config); | ||
| 506 | } | ||
| 507 | } | ||
| 508 | |||
| 509 | return old_rmid; | ||
| 510 | } | ||
| 511 | |||
| 512 | /* | ||
| 513 | * If we fail to assign a new RMID for intel_cqm_rotation_rmid because | ||
| 514 | * cachelines are still tagged with RMIDs in limbo, we progressively | ||
| 515 | * increment the threshold until we find an RMID in limbo with <= | ||
| 516 | * __intel_cqm_threshold lines tagged. This is designed to mitigate the | ||
| 517 | * problem where cachelines tagged with an RMID are not steadily being | ||
| 518 | * evicted. | ||
| 519 | * | ||
| 520 | * On successful rotations we decrease the threshold back towards zero. | ||
| 521 | * | ||
| 522 | * __intel_cqm_max_threshold provides an upper bound on the threshold, | ||
| 523 | * and is measured in bytes because it's exposed to userland. | ||
| 524 | */ | ||
| 525 | static unsigned int __intel_cqm_threshold; | ||
| 526 | static unsigned int __intel_cqm_max_threshold; | ||
| 527 | |||
| 528 | /* | ||
| 529 | * Test whether an RMID has a zero occupancy value on this cpu. | ||
| 530 | */ | ||
| 531 | static void intel_cqm_stable(void *arg) | ||
| 532 | { | ||
| 533 | struct cqm_rmid_entry *entry; | ||
| 534 | |||
| 535 | list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { | ||
| 536 | if (entry->state != RMID_AVAILABLE) | ||
| 537 | break; | ||
| 538 | |||
| 539 | if (__rmid_read(entry->rmid) > __intel_cqm_threshold) | ||
| 540 | entry->state = RMID_DIRTY; | ||
| 541 | } | ||
| 542 | } | ||
| 543 | |||
| 544 | /* | ||
| 545 | * If we have group events waiting for an RMID that don't conflict with | ||
| 546 | * events already running, assign @rmid. | ||
| 547 | */ | ||
| 548 | static bool intel_cqm_sched_in_event(u32 rmid) | ||
| 549 | { | ||
| 550 | struct perf_event *leader, *event; | ||
| 551 | |||
| 552 | lockdep_assert_held(&cache_mutex); | ||
| 553 | |||
| 554 | leader = list_first_entry(&cache_groups, struct perf_event, | ||
| 555 | hw.cqm_groups_entry); | ||
| 556 | event = leader; | ||
| 557 | |||
| 558 | list_for_each_entry_continue(event, &cache_groups, | ||
| 559 | hw.cqm_groups_entry) { | ||
| 560 | if (__rmid_valid(event->hw.cqm_rmid)) | ||
| 561 | continue; | ||
| 562 | |||
| 563 | if (__conflict_event(event, leader)) | ||
| 564 | continue; | ||
| 565 | |||
| 566 | intel_cqm_xchg_rmid(event, rmid); | ||
| 567 | return true; | ||
| 568 | } | ||
| 569 | |||
| 570 | return false; | ||
| 571 | } | ||
| 572 | |||
| 573 | /* | ||
| 574 | * Initially use this constant for both the limbo queue time and the | ||
| 575 | * rotation timer interval, pmu::hrtimer_interval_ms. | ||
| 576 | * | ||
| 577 | * They don't need to be the same, but the two are related since if you | ||
| 578 | * rotate faster than you recycle RMIDs, you may run out of available | ||
| 579 | * RMIDs. | ||
| 580 | */ | ||
| 581 | #define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ | ||
| 582 | |||
| 583 | static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; | ||
| 584 | |||
| 585 | /* | ||
| 586 | * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list | ||
| 587 | * @nr_available: number of freeable RMIDs on the limbo list | ||
| 588 | * | ||
| 589 | * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no | ||
| 590 | * cachelines are tagged with those RMIDs. After this we can reuse them | ||
| 591 | * and know that the current set of active RMIDs is stable. | ||
| 592 | * | ||
| 593 | * Return %true or %false depending on whether stabilization needs to be | ||
| 594 | * reattempted. | ||
| 595 | * | ||
| 596 | * If we return %true then @nr_available is updated to indicate the | ||
| 597 | * number of RMIDs on the limbo list that have been queued for the | ||
| 598 | * minimum queue time (RMID_AVAILABLE), but whose data occupancy values | ||
| 599 | * are above __intel_cqm_threshold. | ||
| 600 | */ | ||
| 601 | static bool intel_cqm_rmid_stabilize(unsigned int *available) | ||
| 602 | { | ||
| 603 | struct cqm_rmid_entry *entry, *tmp; | ||
| 604 | |||
| 605 | lockdep_assert_held(&cache_mutex); | ||
| 606 | |||
| 607 | *available = 0; | ||
| 608 | list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { | ||
| 609 | unsigned long min_queue_time; | ||
| 610 | unsigned long now = jiffies; | ||
| 611 | |||
| 612 | /* | ||
| 613 | * We hold RMIDs placed into limbo for a minimum queue | ||
| 614 | * time. Before the minimum queue time has elapsed we do | ||
| 615 | * not recycle RMIDs. | ||
| 616 | * | ||
| 617 | * The reasoning is that until a sufficient time has | ||
| 618 | * passed since we stopped using an RMID, any RMID | ||
| 619 | * placed onto the limbo list will likely still have | ||
| 620 | * data tagged in the cache, which means we'll probably | ||
| 621 | * fail to recycle it anyway. | ||
| 622 | * | ||
| 623 | * We can save ourselves an expensive IPI by skipping | ||
| 624 | * any RMIDs that have not been queued for the minimum | ||
| 625 | * time. | ||
| 626 | */ | ||
| 627 | min_queue_time = entry->queue_time + | ||
| 628 | msecs_to_jiffies(__rmid_queue_time_ms); | ||
| 629 | |||
| 630 | if (time_after(min_queue_time, now)) | ||
| 631 | break; | ||
| 632 | |||
| 633 | entry->state = RMID_AVAILABLE; | ||
| 634 | (*available)++; | ||
| 635 | } | ||
| 636 | |||
| 637 | /* | ||
| 638 | * Fast return if none of the RMIDs on the limbo list have been | ||
| 639 | * sitting on the queue for the minimum queue time. | ||
| 640 | */ | ||
| 641 | if (!*available) | ||
| 642 | return false; | ||
| 643 | |||
| 644 | /* | ||
| 645 | * Test whether an RMID is free for each package. | ||
| 646 | */ | ||
| 647 | on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); | ||
| 648 | |||
| 649 | list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { | ||
| 650 | /* | ||
| 651 | * Exhausted all RMIDs that have waited min queue time. | ||
| 652 | */ | ||
| 653 | if (entry->state == RMID_YOUNG) | ||
| 654 | break; | ||
| 655 | |||
| 656 | if (entry->state == RMID_DIRTY) | ||
| 657 | continue; | ||
| 658 | |||
| 659 | list_del(&entry->list); /* remove from limbo */ | ||
| 660 | |||
| 661 | /* | ||
| 662 | * The rotation RMID gets priority if it's | ||
| 663 | * currently invalid. In which case, skip adding | ||
| 664 | * the RMID to the the free lru. | ||
| 665 | */ | ||
| 666 | if (!__rmid_valid(intel_cqm_rotation_rmid)) { | ||
| 667 | intel_cqm_rotation_rmid = entry->rmid; | ||
| 668 | continue; | ||
| 669 | } | ||
| 670 | |||
| 671 | /* | ||
| 672 | * If we have groups waiting for RMIDs, hand | ||
| 673 | * them one now provided they don't conflict. | ||
| 674 | */ | ||
| 675 | if (intel_cqm_sched_in_event(entry->rmid)) | ||
| 676 | continue; | ||
| 677 | |||
| 678 | /* | ||
| 679 | * Otherwise place it onto the free list. | ||
| 680 | */ | ||
| 681 | list_add_tail(&entry->list, &cqm_rmid_free_lru); | ||
| 682 | } | ||
| 683 | |||
| 684 | |||
| 685 | return __rmid_valid(intel_cqm_rotation_rmid); | ||
| 686 | } | ||
| 687 | |||
| 688 | /* | ||
| 689 | * Pick a victim group and move it to the tail of the group list. | ||
| 690 | * @next: The first group without an RMID | ||
| 691 | */ | ||
| 692 | static void __intel_cqm_pick_and_rotate(struct perf_event *next) | ||
| 693 | { | ||
| 694 | struct perf_event *rotor; | ||
| 695 | u32 rmid; | ||
| 696 | |||
| 697 | lockdep_assert_held(&cache_mutex); | ||
| 698 | |||
| 699 | rotor = list_first_entry(&cache_groups, struct perf_event, | ||
| 700 | hw.cqm_groups_entry); | ||
| 701 | |||
| 702 | /* | ||
| 703 | * The group at the front of the list should always have a valid | ||
| 704 | * RMID. If it doesn't then no groups have RMIDs assigned and we | ||
| 705 | * don't need to rotate the list. | ||
| 706 | */ | ||
| 707 | if (next == rotor) | ||
| 708 | return; | ||
| 709 | |||
| 710 | rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); | ||
| 711 | __put_rmid(rmid); | ||
| 712 | |||
| 713 | list_rotate_left(&cache_groups); | ||
| 714 | } | ||
| 715 | |||
| 716 | /* | ||
| 717 | * Deallocate the RMIDs from any events that conflict with @event, and | ||
| 718 | * place them on the back of the group list. | ||
| 719 | */ | ||
| 720 | static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) | ||
| 721 | { | ||
| 722 | struct perf_event *group, *g; | ||
| 723 | u32 rmid; | ||
| 724 | |||
| 725 | lockdep_assert_held(&cache_mutex); | ||
| 726 | |||
| 727 | list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { | ||
| 728 | if (group == event) | ||
| 729 | continue; | ||
| 730 | |||
| 731 | rmid = group->hw.cqm_rmid; | ||
| 732 | |||
| 733 | /* | ||
| 734 | * Skip events that don't have a valid RMID. | ||
| 735 | */ | ||
| 736 | if (!__rmid_valid(rmid)) | ||
| 737 | continue; | ||
| 738 | |||
| 739 | /* | ||
| 740 | * No conflict? No problem! Leave the event alone. | ||
| 741 | */ | ||
| 742 | if (!__conflict_event(group, event)) | ||
| 743 | continue; | ||
| 744 | |||
| 745 | intel_cqm_xchg_rmid(group, INVALID_RMID); | ||
| 746 | __put_rmid(rmid); | ||
| 747 | } | ||
| 748 | } | ||
| 749 | |||
| 750 | /* | ||
| 751 | * Attempt to rotate the groups and assign new RMIDs. | ||
| 752 | * | ||
| 753 | * We rotate for two reasons, | ||
| 754 | * 1. To handle the scheduling of conflicting events | ||
| 755 | * 2. To recycle RMIDs | ||
| 756 | * | ||
| 757 | * Rotating RMIDs is complicated because the hardware doesn't give us | ||
| 758 | * any clues. | ||
| 759 | * | ||
| 760 | * There's problems with the hardware interface; when you change the | ||
| 761 | * task:RMID map cachelines retain their 'old' tags, giving a skewed | ||
| 762 | * picture. In order to work around this, we must always keep one free | ||
| 763 | * RMID - intel_cqm_rotation_rmid. | ||
| 764 | * | ||
| 765 | * Rotation works by taking away an RMID from a group (the old RMID), | ||
| 766 | * and assigning the free RMID to another group (the new RMID). We must | ||
| 767 | * then wait for the old RMID to not be used (no cachelines tagged). | ||
| 768 | * This ensure that all cachelines are tagged with 'active' RMIDs. At | ||
| 769 | * this point we can start reading values for the new RMID and treat the | ||
| 770 | * old RMID as the free RMID for the next rotation. | ||
| 771 | * | ||
| 772 | * Return %true or %false depending on whether we did any rotating. | ||
| 773 | */ | ||
| 774 | static bool __intel_cqm_rmid_rotate(void) | ||
| 775 | { | ||
| 776 | struct perf_event *group, *start = NULL; | ||
| 777 | unsigned int threshold_limit; | ||
| 778 | unsigned int nr_needed = 0; | ||
| 779 | unsigned int nr_available; | ||
| 780 | bool rotated = false; | ||
| 781 | |||
| 782 | mutex_lock(&cache_mutex); | ||
| 783 | |||
| 784 | again: | ||
| 785 | /* | ||
| 786 | * Fast path through this function if there are no groups and no | ||
| 787 | * RMIDs that need cleaning. | ||
| 788 | */ | ||
| 789 | if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) | ||
| 790 | goto out; | ||
| 791 | |||
| 792 | list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { | ||
| 793 | if (!__rmid_valid(group->hw.cqm_rmid)) { | ||
| 794 | if (!start) | ||
| 795 | start = group; | ||
| 796 | nr_needed++; | ||
| 797 | } | ||
| 798 | } | ||
| 799 | |||
| 800 | /* | ||
| 801 | * We have some event groups, but they all have RMIDs assigned | ||
| 802 | * and no RMIDs need cleaning. | ||
| 803 | */ | ||
| 804 | if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) | ||
| 805 | goto out; | ||
| 806 | |||
| 807 | if (!nr_needed) | ||
| 808 | goto stabilize; | ||
| 809 | |||
| 810 | /* | ||
| 811 | * We have more event groups without RMIDs than available RMIDs, | ||
| 812 | * or we have event groups that conflict with the ones currently | ||
| 813 | * scheduled. | ||
| 814 | * | ||
| 815 | * We force deallocate the rmid of the group at the head of | ||
| 816 | * cache_groups. The first event group without an RMID then gets | ||
| 817 | * assigned intel_cqm_rotation_rmid. This ensures we always make | ||
| 818 | * forward progress. | ||
| 819 | * | ||
| 820 | * Rotate the cache_groups list so the previous head is now the | ||
| 821 | * tail. | ||
| 822 | */ | ||
| 823 | __intel_cqm_pick_and_rotate(start); | ||
| 824 | |||
| 825 | /* | ||
| 826 | * If the rotation is going to succeed, reduce the threshold so | ||
| 827 | * that we don't needlessly reuse dirty RMIDs. | ||
| 828 | */ | ||
| 829 | if (__rmid_valid(intel_cqm_rotation_rmid)) { | ||
| 830 | intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); | ||
| 831 | intel_cqm_rotation_rmid = __get_rmid(); | ||
| 832 | |||
| 833 | intel_cqm_sched_out_conflicting_events(start); | ||
| 834 | |||
| 835 | if (__intel_cqm_threshold) | ||
| 836 | __intel_cqm_threshold--; | ||
| 837 | } | ||
| 838 | |||
| 839 | rotated = true; | ||
| 840 | |||
| 841 | stabilize: | ||
| 842 | /* | ||
| 843 | * We now need to stabilize the RMID we freed above (if any) to | ||
| 844 | * ensure that the next time we rotate we have an RMID with zero | ||
| 845 | * occupancy value. | ||
| 846 | * | ||
| 847 | * Alternatively, if we didn't need to perform any rotation, | ||
| 848 | * we'll have a bunch of RMIDs in limbo that need stabilizing. | ||
| 849 | */ | ||
| 850 | threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; | ||
| 851 | |||
| 852 | while (intel_cqm_rmid_stabilize(&nr_available) && | ||
| 853 | __intel_cqm_threshold < threshold_limit) { | ||
| 854 | unsigned int steal_limit; | ||
| 855 | |||
| 856 | /* | ||
| 857 | * Don't spin if nobody is actively waiting for an RMID, | ||
| 858 | * the rotation worker will be kicked as soon as an | ||
| 859 | * event needs an RMID anyway. | ||
| 860 | */ | ||
| 861 | if (!nr_needed) | ||
| 862 | break; | ||
| 863 | |||
| 864 | /* Allow max 25% of RMIDs to be in limbo. */ | ||
| 865 | steal_limit = (cqm_max_rmid + 1) / 4; | ||
| 866 | |||
| 867 | /* | ||
| 868 | * We failed to stabilize any RMIDs so our rotation | ||
| 869 | * logic is now stuck. In order to make forward progress | ||
| 870 | * we have a few options: | ||
| 871 | * | ||
| 872 | * 1. rotate ("steal") another RMID | ||
| 873 | * 2. increase the threshold | ||
| 874 | * 3. do nothing | ||
| 875 | * | ||
| 876 | * We do both of 1. and 2. until we hit the steal limit. | ||
| 877 | * | ||
| 878 | * The steal limit prevents all RMIDs ending up on the | ||
| 879 | * limbo list. This can happen if every RMID has a | ||
| 880 | * non-zero occupancy above threshold_limit, and the | ||
| 881 | * occupancy values aren't dropping fast enough. | ||
| 882 | * | ||
| 883 | * Note that there is prioritisation at work here - we'd | ||
| 884 | * rather increase the number of RMIDs on the limbo list | ||
| 885 | * than increase the threshold, because increasing the | ||
| 886 | * threshold skews the event data (because we reuse | ||
| 887 | * dirty RMIDs) - threshold bumps are a last resort. | ||
| 888 | */ | ||
| 889 | if (nr_available < steal_limit) | ||
| 890 | goto again; | ||
| 891 | |||
| 892 | __intel_cqm_threshold++; | ||
| 893 | } | ||
| 894 | |||
| 895 | out: | ||
| 896 | mutex_unlock(&cache_mutex); | ||
| 897 | return rotated; | ||
| 898 | } | ||
| 899 | |||
| 900 | static void intel_cqm_rmid_rotate(struct work_struct *work); | ||
| 901 | |||
| 902 | static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); | ||
| 903 | |||
| 904 | static struct pmu intel_cqm_pmu; | ||
| 905 | |||
| 906 | static void intel_cqm_rmid_rotate(struct work_struct *work) | ||
| 907 | { | ||
| 908 | unsigned long delay; | ||
| 909 | |||
| 910 | __intel_cqm_rmid_rotate(); | ||
| 911 | |||
| 912 | delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); | ||
| 913 | schedule_delayed_work(&intel_cqm_rmid_work, delay); | ||
| 914 | } | ||
| 915 | |||
| 916 | static u64 update_sample(unsigned int rmid, u32 evt_type, int first) | ||
| 917 | { | ||
| 918 | struct sample *mbm_current; | ||
| 919 | u32 vrmid = rmid_2_index(rmid); | ||
| 920 | u64 val, bytes, shift; | ||
| 921 | u32 eventid; | ||
| 922 | |||
| 923 | if (evt_type == QOS_MBM_LOCAL_EVENT_ID) { | ||
| 924 | mbm_current = &mbm_local[vrmid]; | ||
| 925 | eventid = QOS_MBM_LOCAL_EVENT_ID; | ||
| 926 | } else { | ||
| 927 | mbm_current = &mbm_total[vrmid]; | ||
| 928 | eventid = QOS_MBM_TOTAL_EVENT_ID; | ||
| 929 | } | ||
| 930 | |||
| 931 | wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); | ||
| 932 | rdmsrl(MSR_IA32_QM_CTR, val); | ||
| 933 | if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) | ||
| 934 | return mbm_current->total_bytes; | ||
| 935 | |||
| 936 | if (first) { | ||
| 937 | mbm_current->prev_msr = val; | ||
| 938 | mbm_current->total_bytes = 0; | ||
| 939 | return mbm_current->total_bytes; | ||
| 940 | } | ||
| 941 | |||
| 942 | /* | ||
| 943 | * The h/w guarantees that counters will not overflow | ||
| 944 | * so long as we poll them at least once per second. | ||
| 945 | */ | ||
| 946 | shift = 64 - MBM_CNTR_WIDTH; | ||
| 947 | bytes = (val << shift) - (mbm_current->prev_msr << shift); | ||
| 948 | bytes >>= shift; | ||
| 949 | |||
| 950 | bytes *= cqm_l3_scale; | ||
| 951 | |||
| 952 | mbm_current->total_bytes += bytes; | ||
| 953 | mbm_current->prev_msr = val; | ||
| 954 | |||
| 955 | return mbm_current->total_bytes; | ||
| 956 | } | ||
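A note on the shift arithmetic above: it computes a wrap-safe delta for a counter that is only MBM_CNTR_WIDTH bits wide. Both samples are shifted so the counter's top bit lands at bit 63, the subtraction then wraps naturally in 64 bits, and the logical right shift recovers the true byte count. A minimal userspace sketch of the same computation, assuming a 24-bit counter width for illustration (the real width is defined elsewhere in this file):

    #include <stdint.h>
    #include <stdio.h>

    #define CNTR_WIDTH 24   /* assumed for illustration only */

    /* Wrap-safe delta between two raw counter reads, mirroring update_sample(). */
    static uint64_t counter_delta(uint64_t prev, uint64_t cur)
    {
        unsigned int shift = 64 - CNTR_WIDTH;

        return ((cur << shift) - (prev << shift)) >> shift;
    }

    int main(void)
    {
        uint64_t prev = (1ULL << CNTR_WIDTH) - 100;  /* near the top ...      */
        uint64_t cur  = 50;                          /* ... wrapped past zero */

        /* Prints 150: 100 counts up to the wrap point, then 50 more. */
        printf("%llu\n", (unsigned long long)counter_delta(prev, cur));
        return 0;
    }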
| 957 | |||
| 958 | static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type) | ||
| 959 | { | ||
| 960 | return update_sample(rmid, evt_type, 0); | ||
| 961 | } | ||
| 962 | |||
| 963 | static void __intel_mbm_event_init(void *info) | ||
| 964 | { | ||
| 965 | struct rmid_read *rr = info; | ||
| 966 | |||
| 967 | update_sample(rr->rmid, rr->evt_type, 1); | ||
| 968 | } | ||
| 969 | |||
| 970 | static void init_mbm_sample(u32 rmid, u32 evt_type) | ||
| 971 | { | ||
| 972 | struct rmid_read rr = { | ||
| 973 | .rmid = rmid, | ||
| 974 | .evt_type = evt_type, | ||
| 975 | .value = ATOMIC64_INIT(0), | ||
| 976 | }; | ||
| 977 | |||
| 978 | /* on each socket, init sample */ | ||
| 979 | on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1); | ||
| 980 | } | ||
| 981 | |||
| 982 | /* | ||
| 983 | * Find a group and setup RMID. | ||
| 984 | * | ||
| 985 | * If we're part of a group, we use the group's RMID. | ||
| 986 | */ | ||
| 987 | static void intel_cqm_setup_event(struct perf_event *event, | ||
| 988 | struct perf_event **group) | ||
| 989 | { | ||
| 990 | struct perf_event *iter; | ||
| 991 | bool conflict = false; | ||
| 992 | u32 rmid; | ||
| 993 | |||
| 994 | event->hw.is_group_event = false; | ||
| 995 | list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { | ||
| 996 | rmid = iter->hw.cqm_rmid; | ||
| 997 | |||
| 998 | if (__match_event(iter, event)) { | ||
| 999 | /* All tasks in a group share an RMID */ | ||
| 1000 | event->hw.cqm_rmid = rmid; | ||
| 1001 | *group = iter; | ||
| 1002 | if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) | ||
| 1003 | init_mbm_sample(rmid, event->attr.config); | ||
| 1004 | return; | ||
| 1005 | } | ||
| 1006 | |||
| 1007 | /* | ||
| 1008 | * We only care about conflicts for events that are | ||
| 1009 | * actually scheduled in (and hence have a valid RMID). | ||
| 1010 | */ | ||
| 1011 | if (__conflict_event(iter, event) && __rmid_valid(rmid)) | ||
| 1012 | conflict = true; | ||
| 1013 | } | ||
| 1014 | |||
| 1015 | if (conflict) | ||
| 1016 | rmid = INVALID_RMID; | ||
| 1017 | else | ||
| 1018 | rmid = __get_rmid(); | ||
| 1019 | |||
| 1020 | if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) | ||
| 1021 | init_mbm_sample(rmid, event->attr.config); | ||
| 1022 | |||
| 1023 | event->hw.cqm_rmid = rmid; | ||
| 1024 | } | ||
| 1025 | |||
| 1026 | static void intel_cqm_event_read(struct perf_event *event) | ||
| 1027 | { | ||
| 1028 | unsigned long flags; | ||
| 1029 | u32 rmid; | ||
| 1030 | u64 val; | ||
| 1031 | |||
| 1032 | /* | ||
| 1033 | * Task events are handled by intel_cqm_event_count(). | ||
| 1034 | */ | ||
| 1035 | if (event->cpu == -1) | ||
| 1036 | return; | ||
| 1037 | |||
| 1038 | raw_spin_lock_irqsave(&cache_lock, flags); | ||
| 1039 | rmid = event->hw.cqm_rmid; | ||
| 1040 | |||
| 1041 | if (!__rmid_valid(rmid)) | ||
| 1042 | goto out; | ||
| 1043 | |||
| 1044 | if (is_mbm_event(event->attr.config)) | ||
| 1045 | val = rmid_read_mbm(rmid, event->attr.config); | ||
| 1046 | else | ||
| 1047 | val = __rmid_read(rmid); | ||
| 1048 | |||
| 1049 | /* | ||
| 1050 | * Ignore this reading on error states and do not update the value. | ||
| 1051 | */ | ||
| 1052 | if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) | ||
| 1053 | goto out; | ||
| 1054 | |||
| 1055 | local64_set(&event->count, val); | ||
| 1056 | out: | ||
| 1057 | raw_spin_unlock_irqrestore(&cache_lock, flags); | ||
| 1058 | } | ||
| 1059 | |||
| 1060 | static void __intel_cqm_event_count(void *info) | ||
| 1061 | { | ||
| 1062 | struct rmid_read *rr = info; | ||
| 1063 | u64 val; | ||
| 1064 | |||
| 1065 | val = __rmid_read(rr->rmid); | ||
| 1066 | |||
| 1067 | if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) | ||
| 1068 | return; | ||
| 1069 | |||
| 1070 | atomic64_add(val, &rr->value); | ||
| 1071 | } | ||
| 1072 | |||
| 1073 | static inline bool cqm_group_leader(struct perf_event *event) | ||
| 1074 | { | ||
| 1075 | return !list_empty(&event->hw.cqm_groups_entry); | ||
| 1076 | } | ||
| 1077 | |||
| 1078 | static void __intel_mbm_event_count(void *info) | ||
| 1079 | { | ||
| 1080 | struct rmid_read *rr = info; | ||
| 1081 | u64 val; | ||
| 1082 | |||
| 1083 | val = rmid_read_mbm(rr->rmid, rr->evt_type); | ||
| 1084 | if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) | ||
| 1085 | return; | ||
| 1086 | atomic64_add(val, &rr->value); | ||
| 1087 | } | ||
| 1088 | |||
| 1089 | static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer) | ||
| 1090 | { | ||
| 1091 | struct perf_event *iter, *iter1; | ||
| 1092 | int ret = HRTIMER_RESTART; | ||
| 1093 | struct list_head *head; | ||
| 1094 | unsigned long flags; | ||
| 1095 | u32 grp_rmid; | ||
| 1096 | |||
| 1097 | /* | ||
| 1098 | * Need to hold the cache_lock as the timer Event Select MSR reads | ||
| 1099 | * can race with the mbm/cqm count() and mbm_init() reads. | ||
| 1100 | */ | ||
| 1101 | raw_spin_lock_irqsave(&cache_lock, flags); | ||
| 1102 | |||
| 1103 | if (list_empty(&cache_groups)) { | ||
| 1104 | ret = HRTIMER_NORESTART; | ||
| 1105 | goto out; | ||
| 1106 | } | ||
| 1107 | |||
| 1108 | list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { | ||
| 1109 | grp_rmid = iter->hw.cqm_rmid; | ||
| 1110 | if (!__rmid_valid(grp_rmid)) | ||
| 1111 | continue; | ||
| 1112 | if (is_mbm_event(iter->attr.config)) | ||
| 1113 | update_sample(grp_rmid, iter->attr.config, 0); | ||
| 1114 | |||
| 1115 | head = &iter->hw.cqm_group_entry; | ||
| 1116 | if (list_empty(head)) | ||
| 1117 | continue; | ||
| 1118 | list_for_each_entry(iter1, head, hw.cqm_group_entry) { | ||
| 1119 | if (!iter1->hw.is_group_event) | ||
| 1120 | break; | ||
| 1121 | if (is_mbm_event(iter1->attr.config)) | ||
| 1122 | update_sample(iter1->hw.cqm_rmid, | ||
| 1123 | iter1->attr.config, 0); | ||
| 1124 | } | ||
| 1125 | } | ||
| 1126 | |||
| 1127 | hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME)); | ||
| 1128 | out: | ||
| 1129 | raw_spin_unlock_irqrestore(&cache_lock, flags); | ||
| 1130 | |||
| 1131 | return ret; | ||
| 1132 | } | ||
| 1133 | |||
| 1134 | static void __mbm_start_timer(void *info) | ||
| 1135 | { | ||
| 1136 | hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME), | ||
| 1137 | HRTIMER_MODE_REL_PINNED); | ||
| 1138 | } | ||
| 1139 | |||
| 1140 | static void __mbm_stop_timer(void *info) | ||
| 1141 | { | ||
| 1142 | hrtimer_cancel(&mbm_timers[pkg_id]); | ||
| 1143 | } | ||
| 1144 | |||
| 1145 | static void mbm_start_timers(void) | ||
| 1146 | { | ||
| 1147 | on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1); | ||
| 1148 | } | ||
| 1149 | |||
| 1150 | static void mbm_stop_timers(void) | ||
| 1151 | { | ||
| 1152 | on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1); | ||
| 1153 | } | ||
| 1154 | |||
| 1155 | static void mbm_hrtimer_init(void) | ||
| 1156 | { | ||
| 1157 | struct hrtimer *hr; | ||
| 1158 | int i; | ||
| 1159 | |||
| 1160 | for (i = 0; i < mbm_socket_max; i++) { | ||
| 1161 | hr = &mbm_timers[i]; | ||
| 1162 | hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 1163 | hr->function = mbm_hrtimer_handle; | ||
| 1164 | } | ||
| 1165 | } | ||
| 1166 | |||
| 1167 | static u64 intel_cqm_event_count(struct perf_event *event) | ||
| 1168 | { | ||
| 1169 | unsigned long flags; | ||
| 1170 | struct rmid_read rr = { | ||
| 1171 | .evt_type = event->attr.config, | ||
| 1172 | .value = ATOMIC64_INIT(0), | ||
| 1173 | }; | ||
| 1174 | |||
| 1175 | /* | ||
| 1176 | * We only need to worry about task events. System-wide events | ||
| 1177 | * are handled like usual, i.e. entirely with | ||
| 1178 | * intel_cqm_event_read(). | ||
| 1179 | */ | ||
| 1180 | if (event->cpu != -1) | ||
| 1181 | return __perf_event_count(event); | ||
| 1182 | |||
| 1183 | /* | ||
| 1184 | * Only the group leader gets to report values, except in the case of | ||
| 1185 | * multiple events in the same group, where we still need to read the | ||
| 1186 | * other events. This stops us | ||
| 1187 | * reporting duplicate values to userspace, and gives us a clear | ||
| 1188 | * rule for which task gets to report the values. | ||
| 1189 | * | ||
| 1190 | * Note that it is impossible to attribute these values to | ||
| 1191 | * specific packages - we forfeit that ability when we create | ||
| 1192 | * task events. | ||
| 1193 | */ | ||
| 1194 | if (!cqm_group_leader(event) && !event->hw.is_group_event) | ||
| 1195 | return 0; | ||
| 1196 | |||
| 1197 | /* | ||
| 1198 | * Getting up-to-date values requires an SMP IPI which is not | ||
| 1199 | * possible if we're being called in interrupt context. Return | ||
| 1200 | * the cached values instead. | ||
| 1201 | */ | ||
| 1202 | if (unlikely(in_interrupt())) | ||
| 1203 | goto out; | ||
| 1204 | |||
| 1205 | /* | ||
| 1206 | * Notice that we don't perform the reading of an RMID | ||
| 1207 | * atomically, because we can't hold a spin lock across the | ||
| 1208 | * IPIs. | ||
| 1209 | * | ||
| 1210 | * Speculatively perform the read, since @event might be | ||
| 1211 | * assigned a different (possibly invalid) RMID while we're | ||
| 1212 | * busy performing the IPI calls. It's therefore necessary to | ||
| 1213 | * check @event's RMID afterwards, and if it has changed, | ||
| 1214 | * discard the result of the read. | ||
| 1215 | */ | ||
| 1216 | rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); | ||
| 1217 | |||
| 1218 | if (!__rmid_valid(rr.rmid)) | ||
| 1219 | goto out; | ||
| 1220 | |||
| 1221 | cqm_mask_call(&rr); | ||
| 1222 | |||
| 1223 | raw_spin_lock_irqsave(&cache_lock, flags); | ||
| 1224 | if (event->hw.cqm_rmid == rr.rmid) | ||
| 1225 | local64_set(&event->count, atomic64_read(&rr.value)); | ||
| 1226 | raw_spin_unlock_irqrestore(&cache_lock, flags); | ||
| 1227 | out: | ||
| 1228 | return __perf_event_count(event); | ||
| 1229 | } | ||
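The read above is deliberately optimistic: the RMID is snapshotted without the lock, the expensive cross-CPU reads run unlocked, and the result is committed only if the RMID is unchanged once the lock is taken. A hedged userspace sketch of the same pattern, with invented names (not the kernel API):

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    struct counter {
        pthread_mutex_t lock;
        uint32_t id;      /* may be reassigned by another thread */
        uint64_t value;
    };

    static uint64_t slow_read(uint32_t id)
    {
        return (uint64_t)id * 1000;   /* stand-in for the cross-CPU reads */
    }

    static void refresh(struct counter *c)
    {
        uint32_t id = __atomic_load_n(&c->id, __ATOMIC_RELAXED); /* snapshot */
        uint64_t v = slow_read(id);    /* done without holding the lock */

        pthread_mutex_lock(&c->lock);
        if (c->id == id)               /* discard stale result if id changed */
            c->value = v;
        pthread_mutex_unlock(&c->lock);
    }

    int main(void)
    {
        struct counter c = { PTHREAD_MUTEX_INITIALIZER, 7, 0 };

        refresh(&c);
        printf("%llu\n", (unsigned long long)c.value);   /* 7000 */
        return 0;
    }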
| 1230 | |||
| 1231 | static void intel_cqm_event_start(struct perf_event *event, int mode) | ||
| 1232 | { | ||
| 1233 | struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); | ||
| 1234 | u32 rmid = event->hw.cqm_rmid; | ||
| 1235 | |||
| 1236 | if (!(event->hw.cqm_state & PERF_HES_STOPPED)) | ||
| 1237 | return; | ||
| 1238 | |||
| 1239 | event->hw.cqm_state &= ~PERF_HES_STOPPED; | ||
| 1240 | |||
| 1241 | if (state->rmid_usecnt++) { | ||
| 1242 | if (!WARN_ON_ONCE(state->rmid != rmid)) | ||
| 1243 | return; | ||
| 1244 | } else { | ||
| 1245 | WARN_ON_ONCE(state->rmid); | ||
| 1246 | } | ||
| 1247 | |||
| 1248 | state->rmid = rmid; | ||
| 1249 | wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); | ||
| 1250 | } | ||
| 1251 | |||
| 1252 | static void intel_cqm_event_stop(struct perf_event *event, int mode) | ||
| 1253 | { | ||
| 1254 | struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); | ||
| 1255 | |||
| 1256 | if (event->hw.cqm_state & PERF_HES_STOPPED) | ||
| 1257 | return; | ||
| 1258 | |||
| 1259 | event->hw.cqm_state |= PERF_HES_STOPPED; | ||
| 1260 | |||
| 1261 | intel_cqm_event_read(event); | ||
| 1262 | |||
| 1263 | if (!--state->rmid_usecnt) { | ||
| 1264 | state->rmid = 0; | ||
| 1265 | wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); | ||
| 1266 | } else { | ||
| 1267 | WARN_ON_ONCE(!state->rmid); | ||
| 1268 | } | ||
| 1269 | } | ||
| 1270 | |||
| 1271 | static int intel_cqm_event_add(struct perf_event *event, int mode) | ||
| 1272 | { | ||
| 1273 | unsigned long flags; | ||
| 1274 | u32 rmid; | ||
| 1275 | |||
| 1276 | raw_spin_lock_irqsave(&cache_lock, flags); | ||
| 1277 | |||
| 1278 | event->hw.cqm_state = PERF_HES_STOPPED; | ||
| 1279 | rmid = event->hw.cqm_rmid; | ||
| 1280 | |||
| 1281 | if (__rmid_valid(rmid) && (mode & PERF_EF_START)) | ||
| 1282 | intel_cqm_event_start(event, mode); | ||
| 1283 | |||
| 1284 | raw_spin_unlock_irqrestore(&cache_lock, flags); | ||
| 1285 | |||
| 1286 | return 0; | ||
| 1287 | } | ||
| 1288 | |||
| 1289 | static void intel_cqm_event_destroy(struct perf_event *event) | ||
| 1290 | { | ||
| 1291 | struct perf_event *group_other = NULL; | ||
| 1292 | unsigned long flags; | ||
| 1293 | |||
| 1294 | mutex_lock(&cache_mutex); | ||
| 1295 | /* | ||
| 1296 | * Hold the cache_lock as mbm timer handlers could be | ||
| 1297 | * scanning the list of events. | ||
| 1298 | */ | ||
| 1299 | raw_spin_lock_irqsave(&cache_lock, flags); | ||
| 1300 | |||
| 1301 | /* | ||
| 1302 | * If there's another event in this group... | ||
| 1303 | */ | ||
| 1304 | if (!list_empty(&event->hw.cqm_group_entry)) { | ||
| 1305 | group_other = list_first_entry(&event->hw.cqm_group_entry, | ||
| 1306 | struct perf_event, | ||
| 1307 | hw.cqm_group_entry); | ||
| 1308 | list_del(&event->hw.cqm_group_entry); | ||
| 1309 | } | ||
| 1310 | |||
| 1311 | /* | ||
| 1312 | * And we're the group leader.. | ||
| 1313 | */ | ||
| 1314 | if (cqm_group_leader(event)) { | ||
| 1315 | /* | ||
| 1316 | * If there was a group_other, make that leader, otherwise | ||
| 1317 | * destroy the group and return the RMID. | ||
| 1318 | */ | ||
| 1319 | if (group_other) { | ||
| 1320 | list_replace(&event->hw.cqm_groups_entry, | ||
| 1321 | &group_other->hw.cqm_groups_entry); | ||
| 1322 | } else { | ||
| 1323 | u32 rmid = event->hw.cqm_rmid; | ||
| 1324 | |||
| 1325 | if (__rmid_valid(rmid)) | ||
| 1326 | __put_rmid(rmid); | ||
| 1327 | list_del(&event->hw.cqm_groups_entry); | ||
| 1328 | } | ||
| 1329 | } | ||
| 1330 | |||
| 1331 | raw_spin_unlock_irqrestore(&cache_lock, flags); | ||
| 1332 | |||
| 1333 | /* | ||
| 1334 | * Stop the mbm overflow timers when the last event is destroyed. | ||
| 1335 | */ | ||
| 1336 | if (mbm_enabled && list_empty(&cache_groups)) | ||
| 1337 | mbm_stop_timers(); | ||
| 1338 | |||
| 1339 | mutex_unlock(&cache_mutex); | ||
| 1340 | } | ||
| 1341 | |||
| 1342 | static int intel_cqm_event_init(struct perf_event *event) | ||
| 1343 | { | ||
| 1344 | struct perf_event *group = NULL; | ||
| 1345 | bool rotate = false; | ||
| 1346 | unsigned long flags; | ||
| 1347 | |||
| 1348 | if (event->attr.type != intel_cqm_pmu.type) | ||
| 1349 | return -ENOENT; | ||
| 1350 | |||
| 1351 | if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) || | ||
| 1352 | (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) | ||
| 1353 | return -EINVAL; | ||
| 1354 | |||
| 1355 | if ((is_cqm_event(event->attr.config) && !cqm_enabled) || | ||
| 1356 | (is_mbm_event(event->attr.config) && !mbm_enabled)) | ||
| 1357 | return -EINVAL; | ||
| 1358 | |||
| 1359 | /* unsupported modes and filters */ | ||
| 1360 | if (event->attr.exclude_user || | ||
| 1361 | event->attr.exclude_kernel || | ||
| 1362 | event->attr.exclude_hv || | ||
| 1363 | event->attr.exclude_idle || | ||
| 1364 | event->attr.exclude_host || | ||
| 1365 | event->attr.exclude_guest || | ||
| 1366 | event->attr.sample_period) /* no sampling */ | ||
| 1367 | return -EINVAL; | ||
| 1368 | |||
| 1369 | INIT_LIST_HEAD(&event->hw.cqm_group_entry); | ||
| 1370 | INIT_LIST_HEAD(&event->hw.cqm_groups_entry); | ||
| 1371 | |||
| 1372 | event->destroy = intel_cqm_event_destroy; | ||
| 1373 | |||
| 1374 | mutex_lock(&cache_mutex); | ||
| 1375 | |||
| 1376 | /* | ||
| 1377 | * Start the mbm overflow timers when the first event is created. | ||
| 1378 | */ | ||
| 1379 | if (mbm_enabled && list_empty(&cache_groups)) | ||
| 1380 | mbm_start_timers(); | ||
| 1381 | |||
| 1382 | /* Will also set rmid */ | ||
| 1383 | intel_cqm_setup_event(event, &group); | ||
| 1384 | |||
| 1385 | /* | ||
| 1386 | * Hold the cache_lock as mbm timer handlers could be | ||
| 1387 | * scanning the list of events. | ||
| 1388 | */ | ||
| 1389 | raw_spin_lock_irqsave(&cache_lock, flags); | ||
| 1390 | |||
| 1391 | if (group) { | ||
| 1392 | list_add_tail(&event->hw.cqm_group_entry, | ||
| 1393 | &group->hw.cqm_group_entry); | ||
| 1394 | } else { | ||
| 1395 | list_add_tail(&event->hw.cqm_groups_entry, | ||
| 1396 | &cache_groups); | ||
| 1397 | |||
| 1398 | /* | ||
| 1399 | * All RMIDs are either in use or have recently been | ||
| 1400 | * used. Kick the rotation worker to clean/free some. | ||
| 1401 | * | ||
| 1402 | * We only do this for the group leader, rather than for | ||
| 1403 | * every event in a group to save on needless work. | ||
| 1404 | */ | ||
| 1405 | if (!__rmid_valid(event->hw.cqm_rmid)) | ||
| 1406 | rotate = true; | ||
| 1407 | } | ||
| 1408 | |||
| 1409 | raw_spin_unlock_irqrestore(&cache_lock, flags); | ||
| 1410 | mutex_unlock(&cache_mutex); | ||
| 1411 | |||
| 1412 | if (rotate) | ||
| 1413 | schedule_delayed_work(&intel_cqm_rmid_work, 0); | ||
| 1414 | |||
| 1415 | return 0; | ||
| 1416 | } | ||
| 1417 | |||
| 1418 | EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); | ||
| 1419 | EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); | ||
| 1420 | EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); | ||
| 1421 | EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); | ||
| 1422 | EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); | ||
| 1423 | |||
| 1424 | EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02"); | ||
| 1425 | EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1"); | ||
| 1426 | EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB"); | ||
| 1427 | EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6"); | ||
| 1428 | |||
| 1429 | EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03"); | ||
| 1430 | EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1"); | ||
| 1431 | EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB"); | ||
| 1432 | EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6"); | ||
| 1433 | |||
| 1434 | static struct attribute *intel_cqm_events_attr[] = { | ||
| 1435 | EVENT_PTR(intel_cqm_llc), | ||
| 1436 | EVENT_PTR(intel_cqm_llc_pkg), | ||
| 1437 | EVENT_PTR(intel_cqm_llc_unit), | ||
| 1438 | EVENT_PTR(intel_cqm_llc_scale), | ||
| 1439 | EVENT_PTR(intel_cqm_llc_snapshot), | ||
| 1440 | NULL, | ||
| 1441 | }; | ||
| 1442 | |||
| 1443 | static struct attribute *intel_mbm_events_attr[] = { | ||
| 1444 | EVENT_PTR(intel_cqm_total_bytes), | ||
| 1445 | EVENT_PTR(intel_cqm_local_bytes), | ||
| 1446 | EVENT_PTR(intel_cqm_total_bytes_pkg), | ||
| 1447 | EVENT_PTR(intel_cqm_local_bytes_pkg), | ||
| 1448 | EVENT_PTR(intel_cqm_total_bytes_unit), | ||
| 1449 | EVENT_PTR(intel_cqm_local_bytes_unit), | ||
| 1450 | EVENT_PTR(intel_cqm_total_bytes_scale), | ||
| 1451 | EVENT_PTR(intel_cqm_local_bytes_scale), | ||
| 1452 | NULL, | ||
| 1453 | }; | ||
| 1454 | |||
| 1455 | static struct attribute *intel_cmt_mbm_events_attr[] = { | ||
| 1456 | EVENT_PTR(intel_cqm_llc), | ||
| 1457 | EVENT_PTR(intel_cqm_total_bytes), | ||
| 1458 | EVENT_PTR(intel_cqm_local_bytes), | ||
| 1459 | EVENT_PTR(intel_cqm_llc_pkg), | ||
| 1460 | EVENT_PTR(intel_cqm_total_bytes_pkg), | ||
| 1461 | EVENT_PTR(intel_cqm_local_bytes_pkg), | ||
| 1462 | EVENT_PTR(intel_cqm_llc_unit), | ||
| 1463 | EVENT_PTR(intel_cqm_total_bytes_unit), | ||
| 1464 | EVENT_PTR(intel_cqm_local_bytes_unit), | ||
| 1465 | EVENT_PTR(intel_cqm_llc_scale), | ||
| 1466 | EVENT_PTR(intel_cqm_total_bytes_scale), | ||
| 1467 | EVENT_PTR(intel_cqm_local_bytes_scale), | ||
| 1468 | EVENT_PTR(intel_cqm_llc_snapshot), | ||
| 1469 | NULL, | ||
| 1470 | }; | ||
| 1471 | |||
| 1472 | static struct attribute_group intel_cqm_events_group = { | ||
| 1473 | .name = "events", | ||
| 1474 | .attrs = NULL, | ||
| 1475 | }; | ||
| 1476 | |||
| 1477 | PMU_FORMAT_ATTR(event, "config:0-7"); | ||
| 1478 | static struct attribute *intel_cqm_formats_attr[] = { | ||
| 1479 | &format_attr_event.attr, | ||
| 1480 | NULL, | ||
| 1481 | }; | ||
| 1482 | |||
| 1483 | static struct attribute_group intel_cqm_format_group = { | ||
| 1484 | .name = "format", | ||
| 1485 | .attrs = intel_cqm_formats_attr, | ||
| 1486 | }; | ||
| 1487 | |||
| 1488 | static ssize_t | ||
| 1489 | max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, | ||
| 1490 | char *page) | ||
| 1491 | { | ||
| 1492 | ssize_t rv; | ||
| 1493 | |||
| 1494 | mutex_lock(&cache_mutex); | ||
| 1495 | rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); | ||
| 1496 | mutex_unlock(&cache_mutex); | ||
| 1497 | |||
| 1498 | return rv; | ||
| 1499 | } | ||
| 1500 | |||
| 1501 | static ssize_t | ||
| 1502 | max_recycle_threshold_store(struct device *dev, | ||
| 1503 | struct device_attribute *attr, | ||
| 1504 | const char *buf, size_t count) | ||
| 1505 | { | ||
| 1506 | unsigned int bytes, cachelines; | ||
| 1507 | int ret; | ||
| 1508 | |||
| 1509 | ret = kstrtouint(buf, 0, &bytes); | ||
| 1510 | if (ret) | ||
| 1511 | return ret; | ||
| 1512 | |||
| 1513 | mutex_lock(&cache_mutex); | ||
| 1514 | |||
| 1515 | __intel_cqm_max_threshold = bytes; | ||
| 1516 | cachelines = bytes / cqm_l3_scale; | ||
| 1517 | |||
| 1518 | /* | ||
| 1519 | * The new maximum takes effect immediately. | ||
| 1520 | */ | ||
| 1521 | if (__intel_cqm_threshold > cachelines) | ||
| 1522 | __intel_cqm_threshold = cachelines; | ||
| 1523 | |||
| 1524 | mutex_unlock(&cache_mutex); | ||
| 1525 | |||
| 1526 | return count; | ||
| 1527 | } | ||
| 1528 | |||
| 1529 | static DEVICE_ATTR_RW(max_recycle_threshold); | ||
| 1530 | |||
| 1531 | static struct attribute *intel_cqm_attrs[] = { | ||
| 1532 | &dev_attr_max_recycle_threshold.attr, | ||
| 1533 | NULL, | ||
| 1534 | }; | ||
| 1535 | |||
| 1536 | static const struct attribute_group intel_cqm_group = { | ||
| 1537 | .attrs = intel_cqm_attrs, | ||
| 1538 | }; | ||
| 1539 | |||
| 1540 | static const struct attribute_group *intel_cqm_attr_groups[] = { | ||
| 1541 | &intel_cqm_events_group, | ||
| 1542 | &intel_cqm_format_group, | ||
| 1543 | &intel_cqm_group, | ||
| 1544 | NULL, | ||
| 1545 | }; | ||
| 1546 | |||
| 1547 | static struct pmu intel_cqm_pmu = { | ||
| 1548 | .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, | ||
| 1549 | .attr_groups = intel_cqm_attr_groups, | ||
| 1550 | .task_ctx_nr = perf_sw_context, | ||
| 1551 | .event_init = intel_cqm_event_init, | ||
| 1552 | .add = intel_cqm_event_add, | ||
| 1553 | .del = intel_cqm_event_stop, | ||
| 1554 | .start = intel_cqm_event_start, | ||
| 1555 | .stop = intel_cqm_event_stop, | ||
| 1556 | .read = intel_cqm_event_read, | ||
| 1557 | .count = intel_cqm_event_count, | ||
| 1558 | }; | ||
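For context, userspace consumes this PMU through the regular perf syscall: the dynamic type is read from sysfs (derived from the "intel_cqm" name passed to perf_pmu_register() below) and config selects one of the events defined above (0x01 llc_occupancy, 0x02 total_bytes, 0x03 local_bytes). A hedged sketch with minimal error handling:

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <string.h>
    #include <stdio.h>

    int main(void)
    {
        struct perf_event_attr attr;
        unsigned int type;
        long long count;
        FILE *f;
        int fd;

        f = fopen("/sys/bus/event_source/devices/intel_cqm/type", "r");
        if (!f)
            return 1;
        if (fscanf(f, "%u", &type) != 1) {
            fclose(f);
            return 1;
        }
        fclose(f);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = type;
        attr.config = 1;                /* llc_occupancy */

        /* CPU-wide event on CPU 0: pid = -1, cpu = 0 */
        fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
        if (fd < 0)
            return 1;
        if (read(fd, &count, sizeof(count)) == (ssize_t)sizeof(count))
            printf("llc_occupancy: %lld bytes\n", count);
        close(fd);
        return 0;
    }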
| 1559 | |||
| 1560 | static inline void cqm_pick_event_reader(int cpu) | ||
| 1561 | { | ||
| 1562 | int reader; | ||
| 1563 | |||
| 1564 | /* First online cpu in package becomes the reader */ | ||
| 1565 | reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu)); | ||
| 1566 | if (reader >= nr_cpu_ids) | ||
| 1567 | cpumask_set_cpu(cpu, &cqm_cpumask); | ||
| 1568 | } | ||
| 1569 | |||
| 1570 | static int intel_cqm_cpu_starting(unsigned int cpu) | ||
| 1571 | { | ||
| 1572 | struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); | ||
| 1573 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
| 1574 | |||
| 1575 | state->rmid = 0; | ||
| 1576 | state->closid = 0; | ||
| 1577 | state->rmid_usecnt = 0; | ||
| 1578 | |||
| 1579 | WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); | ||
| 1580 | WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); | ||
| 1581 | |||
| 1582 | cqm_pick_event_reader(cpu); | ||
| 1583 | return 0; | ||
| 1584 | } | ||
| 1585 | |||
| 1586 | static int intel_cqm_cpu_exit(unsigned int cpu) | ||
| 1587 | { | ||
| 1588 | int target; | ||
| 1589 | |||
| 1590 | /* Is @cpu the current cqm reader for this package? */ | ||
| 1591 | if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) | ||
| 1592 | return 0; | ||
| 1593 | |||
| 1594 | /* Find another online reader in this package */ | ||
| 1595 | target = cpumask_any_but(topology_core_cpumask(cpu), cpu); | ||
| 1596 | |||
| 1597 | if (target < nr_cpu_ids) | ||
| 1598 | cpumask_set_cpu(target, &cqm_cpumask); | ||
| 1599 | |||
| 1600 | return 0; | ||
| 1601 | } | ||
| 1602 | |||
| 1603 | static const struct x86_cpu_id intel_cqm_match[] = { | ||
| 1604 | { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, | ||
| 1605 | {} | ||
| 1606 | }; | ||
| 1607 | |||
| 1608 | static void mbm_cleanup(void) | ||
| 1609 | { | ||
| 1610 | if (!mbm_enabled) | ||
| 1611 | return; | ||
| 1612 | |||
| 1613 | kfree(mbm_local); | ||
| 1614 | kfree(mbm_total); | ||
| 1615 | mbm_enabled = false; | ||
| 1616 | } | ||
| 1617 | |||
| 1618 | static const struct x86_cpu_id intel_mbm_local_match[] = { | ||
| 1619 | { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL }, | ||
| 1620 | {} | ||
| 1621 | }; | ||
| 1622 | |||
| 1623 | static const struct x86_cpu_id intel_mbm_total_match[] = { | ||
| 1624 | { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL }, | ||
| 1625 | {} | ||
| 1626 | }; | ||
| 1627 | |||
| 1628 | static int intel_mbm_init(void) | ||
| 1629 | { | ||
| 1630 | int ret = 0, array_size, maxid = cqm_max_rmid + 1; | ||
| 1631 | |||
| 1632 | mbm_socket_max = topology_max_packages(); | ||
| 1633 | array_size = sizeof(struct sample) * maxid * mbm_socket_max; | ||
| 1634 | mbm_local = kmalloc(array_size, GFP_KERNEL); | ||
| 1635 | if (!mbm_local) | ||
| 1636 | return -ENOMEM; | ||
| 1637 | |||
| 1638 | mbm_total = kmalloc(array_size, GFP_KERNEL); | ||
| 1639 | if (!mbm_total) { | ||
| 1640 | ret = -ENOMEM; | ||
| 1641 | goto out; | ||
| 1642 | } | ||
| 1643 | |||
| 1644 | array_size = sizeof(struct hrtimer) * mbm_socket_max; | ||
| 1645 | mbm_timers = kmalloc(array_size, GFP_KERNEL); | ||
| 1646 | if (!mbm_timers) { | ||
| 1647 | ret = -ENOMEM; | ||
| 1648 | goto out; | ||
| 1649 | } | ||
| 1650 | mbm_hrtimer_init(); | ||
| 1651 | |||
| 1652 | out: | ||
| 1653 | if (ret) | ||
| 1654 | mbm_cleanup(); | ||
| 1655 | |||
| 1656 | return ret; | ||
| 1657 | } | ||
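mbm_local and mbm_total are flat arrays holding one struct sample per (package, RMID) pair, and update_sample() above picks the slot through rmid_2_index(). A small sketch of that sizing and indexing scheme, with invented names and an assumed package-major layout:

    #include <stdint.h>
    #include <stdlib.h>
    #include <stdio.h>

    struct sample {                 /* simplified stand-in */
        uint64_t total_bytes;
        uint64_t prev_msr;
    };

    /* One slot per (package, rmid); layout assumed package-major, rmid-minor. */
    static unsigned int sample_index(unsigned int pkg, unsigned int rmid,
                                     unsigned int max_rmid)
    {
        return pkg * (max_rmid + 1) + rmid;
    }

    int main(void)
    {
        unsigned int max_rmid = 55, nr_pkgs = 2;
        struct sample *tbl = calloc((size_t)(max_rmid + 1) * nr_pkgs, sizeof(*tbl));

        if (!tbl)
            return 1;
        tbl[sample_index(1, 7, max_rmid)].total_bytes = 4096;
        printf("%llu\n",
               (unsigned long long)tbl[sample_index(1, 7, max_rmid)].total_bytes);
        free(tbl);
        return 0;
    }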
| 1658 | |||
| 1659 | static int __init intel_cqm_init(void) | ||
| 1660 | { | ||
| 1661 | char *str = NULL, scale[20]; | ||
| 1662 | int cpu, ret; | ||
| 1663 | |||
| 1664 | if (x86_match_cpu(intel_cqm_match)) | ||
| 1665 | cqm_enabled = true; | ||
| 1666 | |||
| 1667 | if (x86_match_cpu(intel_mbm_local_match) && | ||
| 1668 | x86_match_cpu(intel_mbm_total_match)) | ||
| 1669 | mbm_enabled = true; | ||
| 1670 | |||
| 1671 | if (!cqm_enabled && !mbm_enabled) | ||
| 1672 | return -ENODEV; | ||
| 1673 | |||
| 1674 | cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; | ||
| 1675 | |||
| 1676 | /* | ||
| 1677 | * It's possible that not all resources support the same number | ||
| 1678 | * of RMIDs. Instead of making scheduling much more complicated | ||
| 1679 | * (where we have to match a task's RMID to a cpu that supports | ||
| 1680 | * that many RMIDs) just find the minimum RMIDs supported across | ||
| 1681 | * all cpus. | ||
| 1682 | * | ||
| 1683 | * Also, check that the scales match on all cpus. | ||
| 1684 | */ | ||
| 1685 | cpus_read_lock(); | ||
| 1686 | for_each_online_cpu(cpu) { | ||
| 1687 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
| 1688 | |||
| 1689 | if (c->x86_cache_max_rmid < cqm_max_rmid) | ||
| 1690 | cqm_max_rmid = c->x86_cache_max_rmid; | ||
| 1691 | |||
| 1692 | if (c->x86_cache_occ_scale != cqm_l3_scale) { | ||
| 1693 | pr_err("Multiple LLC scale values, disabling\n"); | ||
| 1694 | ret = -EINVAL; | ||
| 1695 | goto out; | ||
| 1696 | } | ||
| 1697 | } | ||
| 1698 | |||
| 1699 | /* | ||
| 1700 | * A reasonable upper limit on the max threshold is the number | ||
| 1701 | * of lines tagged per RMID if all RMIDs have the same number of | ||
| 1702 | * lines tagged in the LLC. | ||
| 1703 | * | ||
| 1704 | * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. | ||
| 1705 | */ | ||
| 1706 | __intel_cqm_max_threshold = | ||
| 1707 | boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); | ||
| 1708 | |||
| 1709 | snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); | ||
| 1710 | str = kstrdup(scale, GFP_KERNEL); | ||
| 1711 | if (!str) { | ||
| 1712 | ret = -ENOMEM; | ||
| 1713 | goto out; | ||
| 1714 | } | ||
| 1715 | |||
| 1716 | event_attr_intel_cqm_llc_scale.event_str = str; | ||
| 1717 | |||
| 1718 | ret = intel_cqm_setup_rmid_cache(); | ||
| 1719 | if (ret) | ||
| 1720 | goto out; | ||
| 1721 | |||
| 1722 | if (mbm_enabled) | ||
| 1723 | ret = intel_mbm_init(); | ||
| 1724 | if (ret && !cqm_enabled) | ||
| 1725 | goto out; | ||
| 1726 | |||
| 1727 | if (cqm_enabled && mbm_enabled) | ||
| 1728 | intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr; | ||
| 1729 | else if (!cqm_enabled && mbm_enabled) | ||
| 1730 | intel_cqm_events_group.attrs = intel_mbm_events_attr; | ||
| 1731 | else if (cqm_enabled && !mbm_enabled) | ||
| 1732 | intel_cqm_events_group.attrs = intel_cqm_events_attr; | ||
| 1733 | |||
| 1734 | ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); | ||
| 1735 | if (ret) { | ||
| 1736 | pr_err("Intel CQM perf registration failed: %d\n", ret); | ||
| 1737 | goto out; | ||
| 1738 | } | ||
| 1739 | |||
| 1740 | if (cqm_enabled) | ||
| 1741 | pr_info("Intel CQM monitoring enabled\n"); | ||
| 1742 | if (mbm_enabled) | ||
| 1743 | pr_info("Intel MBM enabled\n"); | ||
| 1744 | |||
| 1745 | /* | ||
| 1746 | * Set up the CPU hotplug notifiers once we are sure cqm | ||
| 1747 | * is enabled, to avoid a notifier leak. | ||
| 1748 | */ | ||
| 1749 | cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING, | ||
| 1750 | "perf/x86/cqm:starting", | ||
| 1751 | intel_cqm_cpu_starting, NULL); | ||
| 1752 | cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE, | ||
| 1753 | "perf/x86/cqm:online", | ||
| 1754 | NULL, intel_cqm_cpu_exit); | ||
| 1755 | out: | ||
| 1756 | cpus_read_unlock(); | ||
| 1757 | |||
| 1758 | if (ret) { | ||
| 1759 | kfree(str); | ||
| 1760 | cqm_cleanup(); | ||
| 1761 | mbm_cleanup(); | ||
| 1762 | } | ||
| 1763 | |||
| 1764 | return ret; | ||
| 1765 | } | ||
| 1766 | device_initcall(intel_cqm_init); | ||
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h deleted file mode 100644 index 597dc4995678..000000000000 --- a/arch/x86/include/asm/intel_rdt.h +++ /dev/null | |||
| @@ -1,286 +0,0 @@ | |||
| 1 | #ifndef _ASM_X86_INTEL_RDT_H | ||
| 2 | #define _ASM_X86_INTEL_RDT_H | ||
| 3 | |||
| 4 | #ifdef CONFIG_INTEL_RDT_A | ||
| 5 | |||
| 6 | #include <linux/sched.h> | ||
| 7 | #include <linux/kernfs.h> | ||
| 8 | #include <linux/jump_label.h> | ||
| 9 | |||
| 10 | #include <asm/intel_rdt_common.h> | ||
| 11 | |||
| 12 | #define IA32_L3_QOS_CFG 0xc81 | ||
| 13 | #define IA32_L3_CBM_BASE 0xc90 | ||
| 14 | #define IA32_L2_CBM_BASE 0xd10 | ||
| 15 | #define IA32_MBA_THRTL_BASE 0xd50 | ||
| 16 | |||
| 17 | #define L3_QOS_CDP_ENABLE 0x01ULL | ||
| 18 | |||
| 19 | /** | ||
| 20 | * struct rdtgroup - store rdtgroup's data in resctrl file system. | ||
| 21 | * @kn: kernfs node | ||
| 22 | * @rdtgroup_list: linked list for all rdtgroups | ||
| 23 | * @closid: closid for this rdtgroup | ||
| 24 | * @cpu_mask: CPUs assigned to this rdtgroup | ||
| 25 | * @flags: status bits | ||
| 26 | * @waitcount: how many cpus expect to find this | ||
| 27 | * group when they acquire rdtgroup_mutex | ||
| 28 | */ | ||
| 29 | struct rdtgroup { | ||
| 30 | struct kernfs_node *kn; | ||
| 31 | struct list_head rdtgroup_list; | ||
| 32 | int closid; | ||
| 33 | struct cpumask cpu_mask; | ||
| 34 | int flags; | ||
| 35 | atomic_t waitcount; | ||
| 36 | }; | ||
| 37 | |||
| 38 | /* rdtgroup.flags */ | ||
| 39 | #define RDT_DELETED 1 | ||
| 40 | |||
| 41 | /* rftype.flags */ | ||
| 42 | #define RFTYPE_FLAGS_CPUS_LIST 1 | ||
| 43 | |||
| 44 | /* List of all resource groups */ | ||
| 45 | extern struct list_head rdt_all_groups; | ||
| 46 | |||
| 47 | extern int max_name_width, max_data_width; | ||
| 48 | |||
| 49 | int __init rdtgroup_init(void); | ||
| 50 | |||
| 51 | /** | ||
| 52 | * struct rftype - describe each file in the resctrl file system | ||
| 53 | * @name: File name | ||
| 54 | * @mode: Access mode | ||
| 55 | * @kf_ops: File operations | ||
| 56 | * @flags: File specific RFTYPE_FLAGS_* flags | ||
| 57 | * @seq_show: Show content of the file | ||
| 58 | * @write: Write to the file | ||
| 59 | */ | ||
| 60 | struct rftype { | ||
| 61 | char *name; | ||
| 62 | umode_t mode; | ||
| 63 | struct kernfs_ops *kf_ops; | ||
| 64 | unsigned long flags; | ||
| 65 | |||
| 66 | int (*seq_show)(struct kernfs_open_file *of, | ||
| 67 | struct seq_file *sf, void *v); | ||
| 68 | /* | ||
| 69 | * write() is the generic write callback which maps directly to | ||
| 70 | * kernfs write operation and overrides all other operations. | ||
| 71 | * Maximum write size is determined by ->max_write_len. | ||
| 72 | */ | ||
| 73 | ssize_t (*write)(struct kernfs_open_file *of, | ||
| 74 | char *buf, size_t nbytes, loff_t off); | ||
| 75 | }; | ||
| 76 | |||
| 77 | /** | ||
| 78 | * struct rdt_domain - group of cpus sharing an RDT resource | ||
| 79 | * @list: all instances of this resource | ||
| 80 | * @id: unique id for this instance | ||
| 81 | * @cpu_mask: which cpus share this resource | ||
| 82 | * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) | ||
| 83 | * @new_ctrl: new ctrl value to be loaded | ||
| 84 | * @have_new_ctrl: did user provide new_ctrl for this domain | ||
| 85 | */ | ||
| 86 | struct rdt_domain { | ||
| 87 | struct list_head list; | ||
| 88 | int id; | ||
| 89 | struct cpumask cpu_mask; | ||
| 90 | u32 *ctrl_val; | ||
| 91 | u32 new_ctrl; | ||
| 92 | bool have_new_ctrl; | ||
| 93 | }; | ||
| 94 | |||
| 95 | /** | ||
| 96 | * struct msr_param - set a range of MSRs from a domain | ||
| 97 | * @res: The resource to use | ||
| 98 | * @low: Beginning index from base MSR | ||
| 99 | * @high: End index | ||
| 100 | */ | ||
| 101 | struct msr_param { | ||
| 102 | struct rdt_resource *res; | ||
| 103 | int low; | ||
| 104 | int high; | ||
| 105 | }; | ||
| 106 | |||
| 107 | /** | ||
| 108 | * struct rdt_cache - Cache allocation related data | ||
| 109 | * @cbm_len: Length of the cache bit mask | ||
| 110 | * @min_cbm_bits: Minimum number of consecutive bits to be set | ||
| 111 | * @cbm_idx_mult: Multiplier of CBM index | ||
| 112 | * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: | ||
| 113 | * closid * cbm_idx_multi + cbm_idx_offset | ||
| 114 | * in a cache bit mask | ||
| 115 | */ | ||
| 116 | struct rdt_cache { | ||
| 117 | unsigned int cbm_len; | ||
| 118 | unsigned int min_cbm_bits; | ||
| 119 | unsigned int cbm_idx_mult; | ||
| 120 | unsigned int cbm_idx_offset; | ||
| 121 | }; | ||
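The multiplier/offset pair lets CDP map one CLOSID onto every second CBM MSR while plain CAT keeps a 1:1 mapping. A tiny sketch of the index computation the comment above describes (the mult/offset values shown are the conventional ones, not read from hardware):

    #include <stdio.h>

    /* CBM MSR index for a CLOSID: closid * cbm_idx_mult + cbm_idx_offset. */
    static unsigned int cbm_idx(unsigned int closid, unsigned int mult,
                                unsigned int offset)
    {
        return closid * mult + offset;
    }

    int main(void)
    {
        /* Plain CAT: mult = 1, offset = 0 -> CLOSID 3 maps to index 3. */
        /* CDP data:  mult = 2, offset = 0 -> CLOSID 3 maps to index 6. */
        /* CDP code:  mult = 2, offset = 1 -> CLOSID 3 maps to index 7. */
        printf("%u %u %u\n", cbm_idx(3, 1, 0), cbm_idx(3, 2, 0), cbm_idx(3, 2, 1));
        return 0;
    }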
| 122 | |||
| 123 | /** | ||
| 124 | * struct rdt_membw - Memory bandwidth allocation related data | ||
| 125 | * @max_delay: Max throttle delay. Delay is the hardware | ||
| 126 | * representation for memory bandwidth. | ||
| 127 | * @min_bw: Minimum memory bandwidth percentage user can request | ||
| 128 | * @bw_gran: Granularity at which the memory bandwidth is allocated | ||
| 129 | * @delay_linear: True if memory B/W delay is in linear scale | ||
| 130 | * @mb_map: Mapping of memory B/W percentage to memory B/W delay | ||
| 131 | */ | ||
| 132 | struct rdt_membw { | ||
| 133 | u32 max_delay; | ||
| 134 | u32 min_bw; | ||
| 135 | u32 bw_gran; | ||
| 136 | u32 delay_linear; | ||
| 137 | u32 *mb_map; | ||
| 138 | }; | ||
| 139 | |||
| 140 | /** | ||
| 141 | * struct rdt_resource - attributes of an RDT resource | ||
| 142 | * @enabled: Is this feature enabled on this machine | ||
| 143 | * @capable: Is this feature available on this machine | ||
| 144 | * @name: Name to use in "schemata" file | ||
| 145 | * @num_closid: Number of CLOSIDs available | ||
| 146 | * @cache_level: Which cache level defines scope of this resource | ||
| 147 | * @default_ctrl: Specifies default cache cbm or memory B/W percent. | ||
| 148 | * @msr_base: Base MSR address for CBMs | ||
| 149 | * @msr_update: Function pointer to update QOS MSRs | ||
| 150 | * @data_width: Character width of data when displaying | ||
| 151 | * @domains: All domains for this resource | ||
| 152 | * @cache: Cache allocation related data | ||
| 153 | * @info_files: resctrl info files for the resource | ||
| 154 | * @nr_info_files: Number of info files | ||
| 155 | * @format_str: Per resource format string to show domain value | ||
| 156 | * @parse_ctrlval: Per resource function pointer to parse control values | ||
| 157 | */ | ||
| 158 | struct rdt_resource { | ||
| 159 | bool enabled; | ||
| 160 | bool capable; | ||
| 161 | char *name; | ||
| 162 | int num_closid; | ||
| 163 | int cache_level; | ||
| 164 | u32 default_ctrl; | ||
| 165 | unsigned int msr_base; | ||
| 166 | void (*msr_update) (struct rdt_domain *d, struct msr_param *m, | ||
| 167 | struct rdt_resource *r); | ||
| 168 | int data_width; | ||
| 169 | struct list_head domains; | ||
| 170 | struct rdt_cache cache; | ||
| 171 | struct rdt_membw membw; | ||
| 172 | struct rftype *info_files; | ||
| 173 | int nr_info_files; | ||
| 174 | const char *format_str; | ||
| 175 | int (*parse_ctrlval) (char *buf, struct rdt_resource *r, | ||
| 176 | struct rdt_domain *d); | ||
| 177 | }; | ||
| 178 | |||
| 179 | void rdt_get_cache_infofile(struct rdt_resource *r); | ||
| 180 | void rdt_get_mba_infofile(struct rdt_resource *r); | ||
| 181 | int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); | ||
| 182 | int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); | ||
| 183 | |||
| 184 | extern struct mutex rdtgroup_mutex; | ||
| 185 | |||
| 186 | extern struct rdt_resource rdt_resources_all[]; | ||
| 187 | extern struct rdtgroup rdtgroup_default; | ||
| 188 | DECLARE_STATIC_KEY_FALSE(rdt_enable_key); | ||
| 189 | |||
| 190 | int __init rdtgroup_init(void); | ||
| 191 | |||
| 192 | enum { | ||
| 193 | RDT_RESOURCE_L3, | ||
| 194 | RDT_RESOURCE_L3DATA, | ||
| 195 | RDT_RESOURCE_L3CODE, | ||
| 196 | RDT_RESOURCE_L2, | ||
| 197 | RDT_RESOURCE_MBA, | ||
| 198 | |||
| 199 | /* Must be the last */ | ||
| 200 | RDT_NUM_RESOURCES, | ||
| 201 | }; | ||
| 202 | |||
| 203 | #define for_each_capable_rdt_resource(r) \ | ||
| 204 | for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ | ||
| 205 | r++) \ | ||
| 206 | if (r->capable) | ||
| 207 | |||
| 208 | #define for_each_enabled_rdt_resource(r) \ | ||
| 209 | for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ | ||
| 210 | r++) \ | ||
| 211 | if (r->enabled) | ||
| 212 | |||
| 213 | /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ | ||
| 214 | union cpuid_0x10_1_eax { | ||
| 215 | struct { | ||
| 216 | unsigned int cbm_len:5; | ||
| 217 | } split; | ||
| 218 | unsigned int full; | ||
| 219 | }; | ||
| 220 | |||
| 221 | /* CPUID.(EAX=10H, ECX=ResID=3).EAX */ | ||
| 222 | union cpuid_0x10_3_eax { | ||
| 223 | struct { | ||
| 224 | unsigned int max_delay:12; | ||
| 225 | } split; | ||
| 226 | unsigned int full; | ||
| 227 | }; | ||
| 228 | |||
| 229 | /* CPUID.(EAX=10H, ECX=ResID).EDX */ | ||
| 230 | union cpuid_0x10_x_edx { | ||
| 231 | struct { | ||
| 232 | unsigned int cos_max:16; | ||
| 233 | } split; | ||
| 234 | unsigned int full; | ||
| 235 | }; | ||
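These unions decode CPUID leaf 0x10. A hedged userspace sketch that reads the same fields with GCC's <cpuid.h> helper; it assumes the CPU enumerates the leaf (CPUID.(7,0):EBX bit 15) and follows the add-one convention that intel_rdt.c applies to the raw cbm_len and cos_max fields:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* CPUID.(EAX=7, ECX=0): EBX bit 15 advertises RDT allocation. */
        __cpuid_count(7, 0, eax, ebx, ecx, edx);
        if (!(ebx & (1 << 15)))
            return 1;

        /* CPUID.(EAX=0x10, ECX=1): L3 cache allocation parameters. */
        __cpuid_count(0x10, 1, eax, ebx, ecx, edx);
        printf("cbm_len   : %u\n", (eax & 0x1f) + 1);    /* EAX[4:0] is length - 1 */
        printf("num_closid: %u\n", (edx & 0xffff) + 1);  /* EDX[15:0] is max COS   */
        return 0;
    }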
| 236 | |||
| 237 | DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); | ||
| 238 | |||
| 239 | void rdt_ctrl_update(void *arg); | ||
| 240 | struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); | ||
| 241 | void rdtgroup_kn_unlock(struct kernfs_node *kn); | ||
| 242 | ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, | ||
| 243 | char *buf, size_t nbytes, loff_t off); | ||
| 244 | int rdtgroup_schemata_show(struct kernfs_open_file *of, | ||
| 245 | struct seq_file *s, void *v); | ||
| 246 | |||
| 247 | /* | ||
| 248 | * intel_rdt_sched_in() - Writes the task's CLOSid to the IA32_PQR_ASSOC MSR | ||
| 249 | * | ||
| 250 | * The following considerations are made so that this has minimal impact | ||
| 251 | * on scheduler hot path: | ||
| 252 | * - This will stay as no-op unless we are running on an Intel SKU | ||
| 253 | * which supports resource control and we enable by mounting the | ||
| 254 | * resctrl file system. | ||
| 255 | * - Caches the per cpu CLOSid values and does the MSR write only | ||
| 256 | * when a task with a different CLOSid is scheduled in. | ||
| 257 | * | ||
| 258 | * Must be called with preemption disabled. | ||
| 259 | */ | ||
| 260 | static inline void intel_rdt_sched_in(void) | ||
| 261 | { | ||
| 262 | if (static_branch_likely(&rdt_enable_key)) { | ||
| 263 | struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); | ||
| 264 | int closid; | ||
| 265 | |||
| 266 | /* | ||
| 267 | * If this task has a closid assigned, use it. | ||
| 268 | * Else use the closid assigned to this cpu. | ||
| 269 | */ | ||
| 270 | closid = current->closid; | ||
| 271 | if (closid == 0) | ||
| 272 | closid = this_cpu_read(cpu_closid); | ||
| 273 | |||
| 274 | if (closid != state->closid) { | ||
| 275 | state->closid = closid; | ||
| 276 | wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid); | ||
| 277 | } | ||
| 278 | } | ||
| 279 | } | ||
| 280 | |||
| 281 | #else | ||
| 282 | |||
| 283 | static inline void intel_rdt_sched_in(void) {} | ||
| 284 | |||
| 285 | #endif /* CONFIG_INTEL_RDT_A */ | ||
| 286 | #endif /* _ASM_X86_INTEL_RDT_H */ | ||
diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h deleted file mode 100644 index b31081b89407..000000000000 --- a/arch/x86/include/asm/intel_rdt_common.h +++ /dev/null | |||
| @@ -1,27 +0,0 @@ | |||
| 1 | #ifndef _ASM_X86_INTEL_RDT_COMMON_H | ||
| 2 | #define _ASM_X86_INTEL_RDT_COMMON_H | ||
| 3 | |||
| 4 | #define MSR_IA32_PQR_ASSOC 0x0c8f | ||
| 5 | |||
| 6 | /** | ||
| 7 | * struct intel_pqr_state - State cache for the PQR MSR | ||
| 8 | * @rmid: The cached Resource Monitoring ID | ||
| 9 | * @closid: The cached Class Of Service ID | ||
| 10 | * @rmid_usecnt: The usage counter for rmid | ||
| 11 | * | ||
| 12 | * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the | ||
| 13 | * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always | ||
| 14 | * contains both parts, so we need to cache them. | ||
| 15 | * | ||
| 16 | * The cache also helps to avoid pointless updates if the value does | ||
| 17 | * not change. | ||
| 18 | */ | ||
| 19 | struct intel_pqr_state { | ||
| 20 | u32 rmid; | ||
| 21 | u32 closid; | ||
| 22 | int rmid_usecnt; | ||
| 23 | }; | ||
| 24 | |||
| 25 | DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); | ||
| 26 | |||
| 27 | #endif /* _ASM_X86_INTEL_RDT_COMMON_H */ | ||
diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h new file mode 100644 index 000000000000..b4bbf8b21512 --- /dev/null +++ b/arch/x86/include/asm/intel_rdt_sched.h | |||
| @@ -0,0 +1,92 @@ | |||
| 1 | #ifndef _ASM_X86_INTEL_RDT_SCHED_H | ||
| 2 | #define _ASM_X86_INTEL_RDT_SCHED_H | ||
| 3 | |||
| 4 | #ifdef CONFIG_INTEL_RDT | ||
| 5 | |||
| 6 | #include <linux/sched.h> | ||
| 7 | #include <linux/jump_label.h> | ||
| 8 | |||
| 9 | #define IA32_PQR_ASSOC 0x0c8f | ||
| 10 | |||
| 11 | /** | ||
| 12 | * struct intel_pqr_state - State cache for the PQR MSR | ||
| 13 | * @cur_rmid: The cached Resource Monitoring ID | ||
| 14 | * @cur_closid: The cached Class Of Service ID | ||
| 15 | * @default_rmid: The user assigned Resource Monitoring ID | ||
| 16 | * @default_closid: The user assigned Class Of Service ID | ||
| 17 | * | ||
| 18 | * The upper 32 bits of IA32_PQR_ASSOC contain closid and the | ||
| 19 | * lower 10 bits rmid. The update to IA32_PQR_ASSOC always | ||
| 20 | * contains both parts, so we need to cache them. This also | ||
| 21 | * stores the user configured per cpu CLOSID and RMID. | ||
| 22 | * | ||
| 23 | * The cache also helps to avoid pointless updates if the value does | ||
| 24 | * not change. | ||
| 25 | */ | ||
| 26 | struct intel_pqr_state { | ||
| 27 | u32 cur_rmid; | ||
| 28 | u32 cur_closid; | ||
| 29 | u32 default_rmid; | ||
| 30 | u32 default_closid; | ||
| 31 | }; | ||
| 32 | |||
| 33 | DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); | ||
| 34 | |||
| 35 | DECLARE_STATIC_KEY_FALSE(rdt_enable_key); | ||
| 36 | DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); | ||
| 37 | DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); | ||
| 38 | |||
| 39 | /* | ||
| 40 | * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to the IA32_PQR_ASSOC MSR | ||
| 41 | * | ||
| 42 | * The following considerations are made so that this has minimal impact | ||
| 43 | * on scheduler hot path: | ||
| 44 | * - This will stay as no-op unless we are running on an Intel SKU | ||
| 45 | * which supports resource control or monitoring and we enable by | ||
| 46 | * mounting the resctrl file system. | ||
| 47 | * - Caches the per cpu CLOSid/RMID values and does the MSR write only | ||
| 48 | * when a task with a different CLOSid/RMID is scheduled in. | ||
| 49 | * - We allocate RMIDs/CLOSids globally in order to keep this as | ||
| 50 | * simple as possible. | ||
| 51 | * Must be called with preemption disabled. | ||
| 52 | */ | ||
| 53 | static void __intel_rdt_sched_in(void) | ||
| 54 | { | ||
| 55 | struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); | ||
| 56 | u32 closid = state->default_closid; | ||
| 57 | u32 rmid = state->default_rmid; | ||
| 58 | |||
| 59 | /* | ||
| 60 | * If this task has a closid/rmid assigned, use it. | ||
| 61 | * Else use the closid/rmid assigned to this cpu. | ||
| 62 | */ | ||
| 63 | if (static_branch_likely(&rdt_alloc_enable_key)) { | ||
| 64 | if (current->closid) | ||
| 65 | closid = current->closid; | ||
| 66 | } | ||
| 67 | |||
| 68 | if (static_branch_likely(&rdt_mon_enable_key)) { | ||
| 69 | if (current->rmid) | ||
| 70 | rmid = current->rmid; | ||
| 71 | } | ||
| 72 | |||
| 73 | if (closid != state->cur_closid || rmid != state->cur_rmid) { | ||
| 74 | state->cur_closid = closid; | ||
| 75 | state->cur_rmid = rmid; | ||
| 76 | wrmsr(IA32_PQR_ASSOC, rmid, closid); | ||
| 77 | } | ||
| 78 | } | ||
| 79 | |||
| 80 | static inline void intel_rdt_sched_in(void) | ||
| 81 | { | ||
| 82 | if (static_branch_likely(&rdt_enable_key)) | ||
| 83 | __intel_rdt_sched_in(); | ||
| 84 | } | ||
| 85 | |||
| 86 | #else | ||
| 87 | |||
| 88 | static inline void intel_rdt_sched_in(void) {} | ||
| 89 | |||
| 90 | #endif /* CONFIG_INTEL_RDT */ | ||
| 91 | |||
| 92 | #endif /* _ASM_X86_INTEL_RDT_SCHED_H */ | ||
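The wrmsr(IA32_PQR_ASSOC, rmid, closid) above puts the RMID in the low word and the CLOSID in the high word of the MSR, matching the struct comment. A sketch of the 64-bit value composition (pure arithmetic, no MSR access; the 10-bit RMID mask follows the comment above):

    #include <stdint.h>
    #include <stdio.h>

    /* IA32_PQR_ASSOC layout: bits 0-9 RMID, bits 32-63 CLOSID, rest reserved. */
    static uint64_t pqr_assoc_val(uint32_t rmid, uint32_t closid)
    {
        return ((uint64_t)closid << 32) | (rmid & 0x3ff);
    }

    int main(void)
    {
        /* rmid = 5, closid = 3 -> 0x0000000300000005 */
        printf("0x%016llx\n", (unsigned long long)pqr_assoc_val(5, 3));
        return 0;
    }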
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cdf82492b770..e17942c131c8 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile | |||
| @@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o | |||
| 33 | obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o | 33 | obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o |
| 34 | obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o | 34 | obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o |
| 35 | 35 | ||
| 36 | obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o | 36 | obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o |
| 37 | 37 | ||
| 38 | obj-$(CONFIG_X86_MCE) += mcheck/ | 38 | obj-$(CONFIG_X86_MCE) += mcheck/ |
| 39 | obj-$(CONFIG_MTRR) += mtrr/ | 39 | obj-$(CONFIG_MTRR) += mtrr/ |
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5b366462f579..cd5fc61ba450 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c | |||
| @@ -30,7 +30,8 @@ | |||
| 30 | #include <linux/cpuhotplug.h> | 30 | #include <linux/cpuhotplug.h> |
| 31 | 31 | ||
| 32 | #include <asm/intel-family.h> | 32 | #include <asm/intel-family.h> |
| 33 | #include <asm/intel_rdt.h> | 33 | #include <asm/intel_rdt_sched.h> |
| 34 | #include "intel_rdt.h" | ||
| 34 | 35 | ||
| 35 | #define MAX_MBA_BW 100u | 36 | #define MAX_MBA_BW 100u |
| 36 | #define MBA_IS_LINEAR 0x4 | 37 | #define MBA_IS_LINEAR 0x4 |
| @@ -38,7 +39,13 @@ | |||
| 38 | /* Mutex to protect rdtgroup access. */ | 39 | /* Mutex to protect rdtgroup access. */ |
| 39 | DEFINE_MUTEX(rdtgroup_mutex); | 40 | DEFINE_MUTEX(rdtgroup_mutex); |
| 40 | 41 | ||
| 41 | DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); | 42 | /* |
| 43 | * The cached intel_pqr_state is strictly per CPU and can never be | ||
| 44 | * updated from a remote CPU. Functions which modify the state | ||
| 45 | * are called with interrupts disabled and no preemption, which | ||
| 46 | * is sufficient for the protection. | ||
| 47 | */ | ||
| 48 | DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); | ||
| 42 | 49 | ||
| 43 | /* | 50 | /* |
| 44 | * Used to store the max resource name width and max resource data width | 51 | * Used to store the max resource name width and max resource data width |
| @@ -46,6 +53,12 @@ DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); | |||
| 46 | */ | 53 | */ |
| 47 | int max_name_width, max_data_width; | 54 | int max_name_width, max_data_width; |
| 48 | 55 | ||
| 56 | /* | ||
| 57 | * Global boolean for rdt_alloc which is true if any | ||
| 58 | * resource allocation is enabled. | ||
| 59 | */ | ||
| 60 | bool rdt_alloc_capable; | ||
| 61 | |||
| 49 | static void | 62 | static void |
| 50 | mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); | 63 | mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); |
| 51 | static void | 64 | static void |
| @@ -54,7 +67,9 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); | |||
| 54 | #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) | 67 | #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) |
| 55 | 68 | ||
| 56 | struct rdt_resource rdt_resources_all[] = { | 69 | struct rdt_resource rdt_resources_all[] = { |
| 70 | [RDT_RESOURCE_L3] = | ||
| 57 | { | 71 | { |
| 72 | .rid = RDT_RESOURCE_L3, | ||
| 58 | .name = "L3", | 73 | .name = "L3", |
| 59 | .domains = domain_init(RDT_RESOURCE_L3), | 74 | .domains = domain_init(RDT_RESOURCE_L3), |
| 60 | .msr_base = IA32_L3_CBM_BASE, | 75 | .msr_base = IA32_L3_CBM_BASE, |
| @@ -67,8 +82,11 @@ struct rdt_resource rdt_resources_all[] = { | |||
| 67 | }, | 82 | }, |
| 68 | .parse_ctrlval = parse_cbm, | 83 | .parse_ctrlval = parse_cbm, |
| 69 | .format_str = "%d=%0*x", | 84 | .format_str = "%d=%0*x", |
| 85 | .fflags = RFTYPE_RES_CACHE, | ||
| 70 | }, | 86 | }, |
| 87 | [RDT_RESOURCE_L3DATA] = | ||
| 71 | { | 88 | { |
| 89 | .rid = RDT_RESOURCE_L3DATA, | ||
| 72 | .name = "L3DATA", | 90 | .name = "L3DATA", |
| 73 | .domains = domain_init(RDT_RESOURCE_L3DATA), | 91 | .domains = domain_init(RDT_RESOURCE_L3DATA), |
| 74 | .msr_base = IA32_L3_CBM_BASE, | 92 | .msr_base = IA32_L3_CBM_BASE, |
| @@ -81,8 +99,11 @@ struct rdt_resource rdt_resources_all[] = { | |||
| 81 | }, | 99 | }, |
| 82 | .parse_ctrlval = parse_cbm, | 100 | .parse_ctrlval = parse_cbm, |
| 83 | .format_str = "%d=%0*x", | 101 | .format_str = "%d=%0*x", |
| 102 | .fflags = RFTYPE_RES_CACHE, | ||
| 84 | }, | 103 | }, |
| 104 | [RDT_RESOURCE_L3CODE] = | ||
| 85 | { | 105 | { |
| 106 | .rid = RDT_RESOURCE_L3CODE, | ||
| 86 | .name = "L3CODE", | 107 | .name = "L3CODE", |
| 87 | .domains = domain_init(RDT_RESOURCE_L3CODE), | 108 | .domains = domain_init(RDT_RESOURCE_L3CODE), |
| 88 | .msr_base = IA32_L3_CBM_BASE, | 109 | .msr_base = IA32_L3_CBM_BASE, |
| @@ -95,8 +116,11 @@ struct rdt_resource rdt_resources_all[] = { | |||
| 95 | }, | 116 | }, |
| 96 | .parse_ctrlval = parse_cbm, | 117 | .parse_ctrlval = parse_cbm, |
| 97 | .format_str = "%d=%0*x", | 118 | .format_str = "%d=%0*x", |
| 119 | .fflags = RFTYPE_RES_CACHE, | ||
| 98 | }, | 120 | }, |
| 121 | [RDT_RESOURCE_L2] = | ||
| 99 | { | 122 | { |
| 123 | .rid = RDT_RESOURCE_L2, | ||
| 100 | .name = "L2", | 124 | .name = "L2", |
| 101 | .domains = domain_init(RDT_RESOURCE_L2), | 125 | .domains = domain_init(RDT_RESOURCE_L2), |
| 102 | .msr_base = IA32_L2_CBM_BASE, | 126 | .msr_base = IA32_L2_CBM_BASE, |
| @@ -109,8 +133,11 @@ struct rdt_resource rdt_resources_all[] = { | |||
| 109 | }, | 133 | }, |
| 110 | .parse_ctrlval = parse_cbm, | 134 | .parse_ctrlval = parse_cbm, |
| 111 | .format_str = "%d=%0*x", | 135 | .format_str = "%d=%0*x", |
| 136 | .fflags = RFTYPE_RES_CACHE, | ||
| 112 | }, | 137 | }, |
| 138 | [RDT_RESOURCE_MBA] = | ||
| 113 | { | 139 | { |
| 140 | .rid = RDT_RESOURCE_MBA, | ||
| 114 | .name = "MB", | 141 | .name = "MB", |
| 115 | .domains = domain_init(RDT_RESOURCE_MBA), | 142 | .domains = domain_init(RDT_RESOURCE_MBA), |
| 116 | .msr_base = IA32_MBA_THRTL_BASE, | 143 | .msr_base = IA32_MBA_THRTL_BASE, |
| @@ -118,6 +145,7 @@ struct rdt_resource rdt_resources_all[] = { | |||
| 118 | .cache_level = 3, | 145 | .cache_level = 3, |
| 119 | .parse_ctrlval = parse_bw, | 146 | .parse_ctrlval = parse_bw, |
| 120 | .format_str = "%d=%*d", | 147 | .format_str = "%d=%*d", |
| 148 | .fflags = RFTYPE_RES_MB, | ||
| 121 | }, | 149 | }, |
| 122 | }; | 150 | }; |
| 123 | 151 | ||
| @@ -144,33 +172,28 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) | |||
| 144 | * is always 20 on hsw server parts. The minimum cache bitmask length | 172 | * is always 20 on hsw server parts. The minimum cache bitmask length |
| 145 | * allowed for HSW server is always 2 bits. Hardcode all of them. | 173 | * allowed for HSW server is always 2 bits. Hardcode all of them. |
| 146 | */ | 174 | */ |
| 147 | static inline bool cache_alloc_hsw_probe(void) | 175 | static inline void cache_alloc_hsw_probe(void) |
| 148 | { | 176 | { |
| 149 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && | 177 | struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; |
| 150 | boot_cpu_data.x86 == 6 && | 178 | u32 l, h, max_cbm = BIT_MASK(20) - 1; |
| 151 | boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { | ||
| 152 | struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; | ||
| 153 | u32 l, h, max_cbm = BIT_MASK(20) - 1; | ||
| 154 | |||
| 155 | if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) | ||
| 156 | return false; | ||
| 157 | rdmsr(IA32_L3_CBM_BASE, l, h); | ||
| 158 | 179 | ||
| 159 | /* If all the bits were set in MSR, return success */ | 180 | if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) |
| 160 | if (l != max_cbm) | 181 | return; |
| 161 | return false; | 182 | rdmsr(IA32_L3_CBM_BASE, l, h); |
| 162 | 183 | ||
| 163 | r->num_closid = 4; | 184 | /* If all the bits were set in MSR, return success */ |
| 164 | r->default_ctrl = max_cbm; | 185 | if (l != max_cbm) |
| 165 | r->cache.cbm_len = 20; | 186 | return; |
| 166 | r->cache.min_cbm_bits = 2; | ||
| 167 | r->capable = true; | ||
| 168 | r->enabled = true; | ||
| 169 | 187 | ||
| 170 | return true; | 188 | r->num_closid = 4; |
| 171 | } | 189 | r->default_ctrl = max_cbm; |
| 190 | r->cache.cbm_len = 20; | ||
| 191 | r->cache.shareable_bits = 0xc0000; | ||
| 192 | r->cache.min_cbm_bits = 2; | ||
| 193 | r->alloc_capable = true; | ||
| 194 | r->alloc_enabled = true; | ||
| 172 | 195 | ||
| 173 | return false; | 196 | rdt_alloc_capable = true; |
| 174 | } | 197 | } |
| 175 | 198 | ||
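The Haswell probe above works by writing an all-ones 20-bit mask to the first L3 CBM MSR and reading it back: if every bit sticks, CAT is assumed present and the parameters are hardcoded. A standalone sketch of the hardcoded values (illustrative only, not part of the patch; the reading of 0xc0000 as the top two ways is an interpretation):

    /* Illustration of the Haswell-server probe constants. */
    #include <stdio.h>

    int main(void)
    {
        unsigned int cbm_len = 20;
        unsigned int max_cbm = (1u << cbm_len) - 1;   /* BIT_MASK(20) - 1 */

        /* 0xfffff: written to and expected back from IA32_L3_CBM_BASE */
        printf("max_cbm        = 0x%x\n", max_cbm);
        /* 0xc0000: bits 18 and 19, i.e. the top two of the twenty mask bits */
        printf("shareable_bits = 0x%x\n", 0x3u << (cbm_len - 2));
        return 0;
    }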
| 176 | /* | 199 | /* |
| @@ -213,15 +236,14 @@ static bool rdt_get_mem_config(struct rdt_resource *r) | |||
| 213 | return false; | 236 | return false; |
| 214 | } | 237 | } |
| 215 | r->data_width = 3; | 238 | r->data_width = 3; |
| 216 | rdt_get_mba_infofile(r); | ||
| 217 | 239 | ||
| 218 | r->capable = true; | 240 | r->alloc_capable = true; |
| 219 | r->enabled = true; | 241 | r->alloc_enabled = true; |
| 220 | 242 | ||
| 221 | return true; | 243 | return true; |
| 222 | } | 244 | } |
| 223 | 245 | ||
| 224 | static void rdt_get_cache_config(int idx, struct rdt_resource *r) | 246 | static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) |
| 225 | { | 247 | { |
| 226 | union cpuid_0x10_1_eax eax; | 248 | union cpuid_0x10_1_eax eax; |
| 227 | union cpuid_0x10_x_edx edx; | 249 | union cpuid_0x10_x_edx edx; |
| @@ -231,10 +253,10 @@ static void rdt_get_cache_config(int idx, struct rdt_resource *r) | |||
| 231 | r->num_closid = edx.split.cos_max + 1; | 253 | r->num_closid = edx.split.cos_max + 1; |
| 232 | r->cache.cbm_len = eax.split.cbm_len + 1; | 254 | r->cache.cbm_len = eax.split.cbm_len + 1; |
| 233 | r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; | 255 | r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; |
| 256 | r->cache.shareable_bits = ebx & r->default_ctrl; | ||
| 234 | r->data_width = (r->cache.cbm_len + 3) / 4; | 257 | r->data_width = (r->cache.cbm_len + 3) / 4; |
| 235 | rdt_get_cache_infofile(r); | 258 | r->alloc_capable = true; |
| 236 | r->capable = true; | 259 | r->alloc_enabled = true; |
| 237 | r->enabled = true; | ||
| 238 | } | 260 | } |
| 239 | 261 | ||
| 240 | static void rdt_get_cdp_l3_config(int type) | 262 | static void rdt_get_cdp_l3_config(int type) |
| @@ -246,12 +268,12 @@ static void rdt_get_cdp_l3_config(int type) | |||
| 246 | r->cache.cbm_len = r_l3->cache.cbm_len; | 268 | r->cache.cbm_len = r_l3->cache.cbm_len; |
| 247 | r->default_ctrl = r_l3->default_ctrl; | 269 | r->default_ctrl = r_l3->default_ctrl; |
| 248 | r->data_width = (r->cache.cbm_len + 3) / 4; | 270 | r->data_width = (r->cache.cbm_len + 3) / 4; |
| 249 | r->capable = true; | 271 | r->alloc_capable = true; |
| 250 | /* | 272 | /* |
| 251 | * By default, CDP is disabled. CDP can be enabled by mount parameter | 273 | * By default, CDP is disabled. CDP can be enabled by mount parameter |
| 252 | * "cdp" during resctrl file system mount time. | 274 | * "cdp" during resctrl file system mount time. |
| 253 | */ | 275 | */ |
| 254 | r->enabled = false; | 276 | r->alloc_enabled = false; |
| 255 | } | 277 | } |
| 256 | 278 | ||
| 257 | static int get_cache_id(int cpu, int level) | 279 | static int get_cache_id(int cpu, int level) |
| @@ -300,6 +322,19 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) | |||
| 300 | wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); | 322 | wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); |
| 301 | } | 323 | } |
| 302 | 324 | ||
| 325 | struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) | ||
| 326 | { | ||
| 327 | struct rdt_domain *d; | ||
| 328 | |||
| 329 | list_for_each_entry(d, &r->domains, list) { | ||
| 330 | /* Find the domain that contains this CPU */ | ||
| 331 | if (cpumask_test_cpu(cpu, &d->cpu_mask)) | ||
| 332 | return d; | ||
| 333 | } | ||
| 334 | |||
| 335 | return NULL; | ||
| 336 | } | ||
| 337 | |||
| 303 | void rdt_ctrl_update(void *arg) | 338 | void rdt_ctrl_update(void *arg) |
| 304 | { | 339 | { |
| 305 | struct msr_param *m = arg; | 340 | struct msr_param *m = arg; |
| @@ -307,12 +342,10 @@ void rdt_ctrl_update(void *arg) | |||
| 307 | int cpu = smp_processor_id(); | 342 | int cpu = smp_processor_id(); |
| 308 | struct rdt_domain *d; | 343 | struct rdt_domain *d; |
| 309 | 344 | ||
| 310 | list_for_each_entry(d, &r->domains, list) { | 345 | d = get_domain_from_cpu(cpu, r); |
| 311 | /* Find the domain that contains this CPU */ | 346 | if (d) { |
| 312 | if (cpumask_test_cpu(cpu, &d->cpu_mask)) { | 347 | r->msr_update(d, m, r); |
| 313 | r->msr_update(d, m, r); | 348 | return; |
| 314 | return; | ||
| 315 | } | ||
| 316 | } | 349 | } |
| 317 | pr_warn_once("cpu %d not found in any domain for resource %s\n", | 350 | pr_warn_once("cpu %d not found in any domain for resource %s\n", |
| 318 | cpu, r->name); | 351 | cpu, r->name); |
| @@ -326,8 +359,8 @@ void rdt_ctrl_update(void *arg) | |||
| 326 | * caller, return the first domain whose id is bigger than the input id. | 359 | * caller, return the first domain whose id is bigger than the input id. |
| 327 | * The domain list is sorted by id in ascending order. | 360 | * The domain list is sorted by id in ascending order. |
| 328 | */ | 361 | */ |
| 329 | static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, | 362 | struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, |
| 330 | struct list_head **pos) | 363 | struct list_head **pos) |
| 331 | { | 364 | { |
| 332 | struct rdt_domain *d; | 365 | struct rdt_domain *d; |
| 333 | struct list_head *l; | 366 | struct list_head *l; |
| @@ -377,6 +410,44 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) | |||
| 377 | return 0; | 410 | return 0; |
| 378 | } | 411 | } |
| 379 | 412 | ||
| 413 | static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) | ||
| 414 | { | ||
| 415 | size_t tsize; | ||
| 416 | |||
| 417 | if (is_llc_occupancy_enabled()) { | ||
| 418 | d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid), | ||
| 419 | sizeof(unsigned long), | ||
| 420 | GFP_KERNEL); | ||
| 421 | if (!d->rmid_busy_llc) | ||
| 422 | return -ENOMEM; | ||
| 423 | INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); | ||
| 424 | } | ||
| 425 | if (is_mbm_total_enabled()) { | ||
| 426 | tsize = sizeof(*d->mbm_total); | ||
| 427 | d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); | ||
| 428 | if (!d->mbm_total) { | ||
| 429 | kfree(d->rmid_busy_llc); | ||
| 430 | return -ENOMEM; | ||
| 431 | } | ||
| 432 | } | ||
| 433 | if (is_mbm_local_enabled()) { | ||
| 434 | tsize = sizeof(*d->mbm_local); | ||
| 435 | d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); | ||
| 436 | if (!d->mbm_local) { | ||
| 437 | kfree(d->rmid_busy_llc); | ||
| 438 | kfree(d->mbm_total); | ||
| 439 | return -ENOMEM; | ||
| 440 | } | ||
| 441 | } | ||
| 442 | |||
| 443 | if (is_mbm_enabled()) { | ||
| 444 | INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); | ||
| 445 | mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); | ||
| 446 | } | ||
| 447 | |||
| 448 | return 0; | ||
| 449 | } | ||
| 450 | |||
| 380 | /* | 451 | /* |
| 381 | * domain_add_cpu - Add a cpu to a resource's domain list. | 452 | * domain_add_cpu - Add a cpu to a resource's domain list. |
| 382 | * | 453 | * |
| @@ -412,14 +483,26 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) | |||
| 412 | return; | 483 | return; |
| 413 | 484 | ||
| 414 | d->id = id; | 485 | d->id = id; |
| 486 | cpumask_set_cpu(cpu, &d->cpu_mask); | ||
| 415 | 487 | ||
| 416 | if (domain_setup_ctrlval(r, d)) { | 488 | if (r->alloc_capable && domain_setup_ctrlval(r, d)) { |
| 489 | kfree(d); | ||
| 490 | return; | ||
| 491 | } | ||
| 492 | |||
| 493 | if (r->mon_capable && domain_setup_mon_state(r, d)) { | ||
| 417 | kfree(d); | 494 | kfree(d); |
| 418 | return; | 495 | return; |
| 419 | } | 496 | } |
| 420 | 497 | ||
| 421 | cpumask_set_cpu(cpu, &d->cpu_mask); | ||
| 422 | list_add_tail(&d->list, add_pos); | 498 | list_add_tail(&d->list, add_pos); |
| 499 | |||
| 500 | /* | ||
| 501 | * If resctrl is mounted, add | ||
| 502 | * per domain monitor data directories. | ||
| 503 | */ | ||
| 504 | if (static_branch_unlikely(&rdt_mon_enable_key)) | ||
| 505 | mkdir_mondata_subdir_allrdtgrp(r, d); | ||
| 423 | } | 506 | } |
| 424 | 507 | ||
| 425 | static void domain_remove_cpu(int cpu, struct rdt_resource *r) | 508 | static void domain_remove_cpu(int cpu, struct rdt_resource *r) |
| @@ -435,19 +518,58 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) | |||
| 435 | 518 | ||
| 436 | cpumask_clear_cpu(cpu, &d->cpu_mask); | 519 | cpumask_clear_cpu(cpu, &d->cpu_mask); |
| 437 | if (cpumask_empty(&d->cpu_mask)) { | 520 | if (cpumask_empty(&d->cpu_mask)) { |
| 521 | /* | ||
| 522 | * If resctrl is mounted, remove all the | ||
| 523 | * per domain monitor data directories. | ||
| 524 | */ | ||
| 525 | if (static_branch_unlikely(&rdt_mon_enable_key)) | ||
| 526 | rmdir_mondata_subdir_allrdtgrp(r, d->id); | ||
| 438 | kfree(d->ctrl_val); | 527 | kfree(d->ctrl_val); |
| 528 | kfree(d->rmid_busy_llc); | ||
| 529 | kfree(d->mbm_total); | ||
| 530 | kfree(d->mbm_local); | ||
| 439 | list_del(&d->list); | 531 | list_del(&d->list); |
| 532 | if (is_mbm_enabled()) | ||
| 533 | cancel_delayed_work(&d->mbm_over); | ||
| 534 | if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { | ||
| 535 | /* | ||
| 536 | * When a package is going down, forcefully | ||
| 537 | * decrement rmid->ebusy. There is no way to know | ||
| 538 | * that the L3 was flushed and hence may lead to | ||
| 539 | * incorrect counts in rare scenarios, but leaving | ||
| 540 | * the RMID as busy creates RMID leaks if the | ||
| 541 | * package never comes back. | ||
| 542 | */ | ||
| 543 | __check_limbo(d, true); | ||
| 544 | cancel_delayed_work(&d->cqm_limbo); | ||
| 545 | } | ||
| 546 | |||
| 440 | kfree(d); | 547 | kfree(d); |
| 548 | return; | ||
| 549 | } | ||
| 550 | |||
| 551 | if (r == &rdt_resources_all[RDT_RESOURCE_L3]) { | ||
| 552 | if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { | ||
| 553 | cancel_delayed_work(&d->mbm_over); | ||
| 554 | mbm_setup_overflow_handler(d, 0); | ||
| 555 | } | ||
| 556 | if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && | ||
| 557 | has_busy_rmid(r, d)) { | ||
| 558 | cancel_delayed_work(&d->cqm_limbo); | ||
| 559 | cqm_setup_limbo_handler(d, 0); | ||
| 560 | } | ||
| 441 | } | 561 | } |
| 442 | } | 562 | } |
| 443 | 563 | ||
| 444 | static void clear_closid(int cpu) | 564 | static void clear_closid_rmid(int cpu) |
| 445 | { | 565 | { |
| 446 | struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); | 566 | struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); |
| 447 | 567 | ||
| 448 | per_cpu(cpu_closid, cpu) = 0; | 568 | state->default_closid = 0; |
| 449 | state->closid = 0; | 569 | state->default_rmid = 0; |
| 450 | wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); | 570 | state->cur_closid = 0; |
| 571 | state->cur_rmid = 0; | ||
| 572 | wrmsr(IA32_PQR_ASSOC, 0, 0); | ||
| 451 | } | 573 | } |
| 452 | 574 | ||
| 453 | static int intel_rdt_online_cpu(unsigned int cpu) | 575 | static int intel_rdt_online_cpu(unsigned int cpu) |
| @@ -459,12 +581,23 @@ static int intel_rdt_online_cpu(unsigned int cpu) | |||
| 459 | domain_add_cpu(cpu, r); | 581 | domain_add_cpu(cpu, r); |
| 460 | /* The cpu is set in default rdtgroup after online. */ | 582 | /* The cpu is set in default rdtgroup after online. */ |
| 461 | cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); | 583 | cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); |
| 462 | clear_closid(cpu); | 584 | clear_closid_rmid(cpu); |
| 463 | mutex_unlock(&rdtgroup_mutex); | 585 | mutex_unlock(&rdtgroup_mutex); |
| 464 | 586 | ||
| 465 | return 0; | 587 | return 0; |
| 466 | } | 588 | } |
| 467 | 589 | ||
| 590 | static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) | ||
| 591 | { | ||
| 592 | struct rdtgroup *cr; | ||
| 593 | |||
| 594 | list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { | ||
| 595 | if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { | ||
| 596 | break; | ||
| 597 | } | ||
| 598 | } | ||
| 599 | } | ||
| 600 | |||
| 468 | static int intel_rdt_offline_cpu(unsigned int cpu) | 601 | static int intel_rdt_offline_cpu(unsigned int cpu) |
| 469 | { | 602 | { |
| 470 | struct rdtgroup *rdtgrp; | 603 | struct rdtgroup *rdtgrp; |
| @@ -474,10 +607,12 @@ static int intel_rdt_offline_cpu(unsigned int cpu) | |||
| 474 | for_each_capable_rdt_resource(r) | 607 | for_each_capable_rdt_resource(r) |
| 475 | domain_remove_cpu(cpu, r); | 608 | domain_remove_cpu(cpu, r); |
| 476 | list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { | 609 | list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { |
| 477 | if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) | 610 | if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { |
| 611 | clear_childcpus(rdtgrp, cpu); | ||
| 478 | break; | 612 | break; |
| 613 | } | ||
| 479 | } | 614 | } |
| 480 | clear_closid(cpu); | 615 | clear_closid_rmid(cpu); |
| 481 | mutex_unlock(&rdtgroup_mutex); | 616 | mutex_unlock(&rdtgroup_mutex); |
| 482 | 617 | ||
| 483 | return 0; | 618 | return 0; |
| @@ -492,7 +627,7 @@ static __init void rdt_init_padding(void) | |||
| 492 | struct rdt_resource *r; | 627 | struct rdt_resource *r; |
| 493 | int cl; | 628 | int cl; |
| 494 | 629 | ||
| 495 | for_each_capable_rdt_resource(r) { | 630 | for_each_alloc_capable_rdt_resource(r) { |
| 496 | cl = strlen(r->name); | 631 | cl = strlen(r->name); |
| 497 | if (cl > max_name_width) | 632 | if (cl > max_name_width) |
| 498 | max_name_width = cl; | 633 | max_name_width = cl; |
| @@ -502,38 +637,153 @@ static __init void rdt_init_padding(void) | |||
| 502 | } | 637 | } |
| 503 | } | 638 | } |
| 504 | 639 | ||
| 505 | static __init bool get_rdt_resources(void) | 640 | enum { |
| 641 | RDT_FLAG_CMT, | ||
| 642 | RDT_FLAG_MBM_TOTAL, | ||
| 643 | RDT_FLAG_MBM_LOCAL, | ||
| 644 | RDT_FLAG_L3_CAT, | ||
| 645 | RDT_FLAG_L3_CDP, | ||
| 646 | RDT_FLAG_L2_CAT, | ||
| 647 | RDT_FLAG_MBA, | ||
| 648 | }; | ||
| 649 | |||
| 650 | #define RDT_OPT(idx, n, f) \ | ||
| 651 | [idx] = { \ | ||
| 652 | .name = n, \ | ||
| 653 | .flag = f \ | ||
| 654 | } | ||
| 655 | |||
| 656 | struct rdt_options { | ||
| 657 | char *name; | ||
| 658 | int flag; | ||
| 659 | bool force_off, force_on; | ||
| 660 | }; | ||
| 661 | |||
| 662 | static struct rdt_options rdt_options[] __initdata = { | ||
| 663 | RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), | ||
| 664 | RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), | ||
| 665 | RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), | ||
| 666 | RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3), | ||
| 667 | RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3), | ||
| 668 | RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), | ||
| 669 | RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), | ||
| 670 | }; | ||
| 671 | #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) | ||
| 672 | |||
| 673 | static int __init set_rdt_options(char *str) | ||
| 674 | { | ||
| 675 | struct rdt_options *o; | ||
| 676 | bool force_off; | ||
| 677 | char *tok; | ||
| 678 | |||
| 679 | if (*str == '=') | ||
| 680 | str++; | ||
| 681 | while ((tok = strsep(&str, ",")) != NULL) { | ||
| 682 | force_off = *tok == '!'; | ||
| 683 | if (force_off) | ||
| 684 | tok++; | ||
| 685 | for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { | ||
| 686 | if (strcmp(tok, o->name) == 0) { | ||
| 687 | if (force_off) | ||
| 688 | o->force_off = true; | ||
| 689 | else | ||
| 690 | o->force_on = true; | ||
| 691 | break; | ||
| 692 | } | ||
| 693 | } | ||
| 694 | } | ||
| 695 | return 1; | ||
| 696 | } | ||
| 697 | __setup("rdt", set_rdt_options); | ||
| 698 | |||
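set_rdt_options() splits the rdt= string on commas and treats a leading '!' as force-off for that feature. Below is a self-contained userspace sketch of the same parsing loop (the option table is trimmed to two entries and the helpers are simplified; it is not the kernel code):

    #define _DEFAULT_SOURCE                 /* for strsep() on glibc */
    #include <stdio.h>
    #include <string.h>
    #include <stdbool.h>

    struct rdt_options { const char *name; bool force_off, force_on; };

    /* Trimmed option table, for illustration only. */
    static struct rdt_options opts[] = { { "cmt" }, { "mba" } };

    int main(void)
    {
        char cmdline[] = "cmt,!mba";        /* the value part of "rdt=cmt,!mba" */
        char *str = cmdline, *tok;
        size_t i;

        while ((tok = strsep(&str, ",")) != NULL) {
            bool force_off = (*tok == '!');

            if (force_off)
                tok++;
            for (i = 0; i < sizeof(opts) / sizeof(opts[0]); i++) {
                if (strcmp(tok, opts[i].name) == 0) {
                    if (force_off)
                        opts[i].force_off = true;
                    else
                        opts[i].force_on = true;
                    break;
                }
            }
        }

        /* Prints: cmt on=1 off=0, then mba on=0 off=1 */
        for (i = 0; i < sizeof(opts) / sizeof(opts[0]); i++)
            printf("%s on=%d off=%d\n", opts[i].name,
                   opts[i].force_on, opts[i].force_off);
        return 0;
    }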
| 699 | static bool __init rdt_cpu_has(int flag) | ||
| 700 | { | ||
| 701 | bool ret = boot_cpu_has(flag); | ||
| 702 | struct rdt_options *o; | ||
| 703 | |||
| 704 | if (!ret) | ||
| 705 | return ret; | ||
| 706 | |||
| 707 | for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { | ||
| 708 | if (flag == o->flag) { | ||
| 709 | if (o->force_off) | ||
| 710 | ret = false; | ||
| 711 | if (o->force_on) | ||
| 712 | ret = true; | ||
| 713 | break; | ||
| 714 | } | ||
| 715 | } | ||
| 716 | return ret; | ||
| 717 | } | ||
| 718 | |||
| 719 | static __init bool get_rdt_alloc_resources(void) | ||
| 506 | { | 720 | { |
| 507 | bool ret = false; | 721 | bool ret = false; |
| 508 | 722 | ||
| 509 | if (cache_alloc_hsw_probe()) | 723 | if (rdt_alloc_capable) |
| 510 | return true; | 724 | return true; |
| 511 | 725 | ||
| 512 | if (!boot_cpu_has(X86_FEATURE_RDT_A)) | 726 | if (!boot_cpu_has(X86_FEATURE_RDT_A)) |
| 513 | return false; | 727 | return false; |
| 514 | 728 | ||
| 515 | if (boot_cpu_has(X86_FEATURE_CAT_L3)) { | 729 | if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { |
| 516 | rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); | 730 | rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); |
| 517 | if (boot_cpu_has(X86_FEATURE_CDP_L3)) { | 731 | if (rdt_cpu_has(X86_FEATURE_CDP_L3)) { |
| 518 | rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); | 732 | rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); |
| 519 | rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); | 733 | rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); |
| 520 | } | 734 | } |
| 521 | ret = true; | 735 | ret = true; |
| 522 | } | 736 | } |
| 523 | if (boot_cpu_has(X86_FEATURE_CAT_L2)) { | 737 | if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { |
| 524 | /* CPUID 0x10.2 fields are same format as 0x10.1 */ | 738 | /* CPUID 0x10.2 fields are same format as 0x10.1 */ |

| 525 | rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); | 739 | rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); |
| 526 | ret = true; | 740 | ret = true; |
| 527 | } | 741 | } |
| 528 | 742 | ||
| 529 | if (boot_cpu_has(X86_FEATURE_MBA)) { | 743 | if (rdt_cpu_has(X86_FEATURE_MBA)) { |
| 530 | if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) | 744 | if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) |
| 531 | ret = true; | 745 | ret = true; |
| 532 | } | 746 | } |
| 533 | |||
| 534 | return ret; | 747 | return ret; |
| 535 | } | 748 | } |
| 536 | 749 | ||
| 750 | static __init bool get_rdt_mon_resources(void) | ||
| 751 | { | ||
| 752 | if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) | ||
| 753 | rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); | ||
| 754 | if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) | ||
| 755 | rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); | ||
| 756 | if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) | ||
| 757 | rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); | ||
| 758 | |||
| 759 | if (!rdt_mon_features) | ||
| 760 | return false; | ||
| 761 | |||
| 762 | return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]); | ||
| 763 | } | ||
| 764 | |||
| 765 | static __init void rdt_quirks(void) | ||
| 766 | { | ||
| 767 | switch (boot_cpu_data.x86_model) { | ||
| 768 | case INTEL_FAM6_HASWELL_X: | ||
| 769 | if (!rdt_options[RDT_FLAG_L3_CAT].force_off) | ||
| 770 | cache_alloc_hsw_probe(); | ||
| 771 | break; | ||
| 772 | case INTEL_FAM6_SKYLAKE_X: | ||
| 773 | if (boot_cpu_data.x86_mask <= 4) | ||
| 774 | set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); | ||
| 775 | } | ||
| 776 | } | ||
| 777 | |||
| 778 | static __init bool get_rdt_resources(void) | ||
| 779 | { | ||
| 780 | rdt_quirks(); | ||
| 781 | rdt_alloc_capable = get_rdt_alloc_resources(); | ||
| 782 | rdt_mon_capable = get_rdt_mon_resources(); | ||
| 783 | |||
| 784 | return (rdt_mon_capable || rdt_alloc_capable); | ||
| 785 | } | ||
| 786 | |||
| 537 | static int __init intel_rdt_late_init(void) | 787 | static int __init intel_rdt_late_init(void) |
| 538 | { | 788 | { |
| 539 | struct rdt_resource *r; | 789 | struct rdt_resource *r; |
| @@ -556,9 +806,12 @@ static int __init intel_rdt_late_init(void) | |||
| 556 | return ret; | 806 | return ret; |
| 557 | } | 807 | } |
| 558 | 808 | ||
| 559 | for_each_capable_rdt_resource(r) | 809 | for_each_alloc_capable_rdt_resource(r) |
| 560 | pr_info("Intel RDT %s allocation detected\n", r->name); | 810 | pr_info("Intel RDT %s allocation detected\n", r->name); |
| 561 | 811 | ||
| 812 | for_each_mon_capable_rdt_resource(r) | ||
| 813 | pr_info("Intel RDT %s monitoring detected\n", r->name); | ||
| 814 | |||
| 562 | return 0; | 815 | return 0; |
| 563 | } | 816 | } |
| 564 | 817 | ||
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h new file mode 100644 index 000000000000..ebaddaeef023 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt.h | |||
| @@ -0,0 +1,440 @@ | |||
| 1 | #ifndef _ASM_X86_INTEL_RDT_H | ||
| 2 | #define _ASM_X86_INTEL_RDT_H | ||
| 3 | |||
| 4 | #include <linux/sched.h> | ||
| 5 | #include <linux/kernfs.h> | ||
| 6 | #include <linux/jump_label.h> | ||
| 7 | |||
| 8 | #define IA32_L3_QOS_CFG 0xc81 | ||
| 9 | #define IA32_L3_CBM_BASE 0xc90 | ||
| 10 | #define IA32_L2_CBM_BASE 0xd10 | ||
| 11 | #define IA32_MBA_THRTL_BASE 0xd50 | ||
| 12 | |||
| 13 | #define L3_QOS_CDP_ENABLE 0x01ULL | ||
| 14 | |||
| 15 | /* | ||
| 16 | * Event IDs are used to program IA32_QM_EVTSEL before reading event | ||
| 17 | * counter from IA32_QM_CTR | ||
| 18 | */ | ||
| 19 | #define QOS_L3_OCCUP_EVENT_ID 0x01 | ||
| 20 | #define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 | ||
| 21 | #define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 | ||
| 22 | |||
| 23 | #define CQM_LIMBOCHECK_INTERVAL 1000 | ||
| 24 | |||
| 25 | #define MBM_CNTR_WIDTH 24 | ||
| 26 | #define MBM_OVERFLOW_INTERVAL 1000 | ||
| 27 | |||
| 28 | #define RMID_VAL_ERROR BIT_ULL(63) | ||
| 29 | #define RMID_VAL_UNAVAIL BIT_ULL(62) | ||
| 30 | |||
| 31 | DECLARE_STATIC_KEY_FALSE(rdt_enable_key); | ||
| 32 | |||
| 33 | /** | ||
| 34 | * struct mon_evt - Entry in the event list of a resource | ||
| 35 | * @evtid: event id | ||
| 36 | * @name: name of the event | ||
| 37 | */ | ||
| 38 | struct mon_evt { | ||
| 39 | u32 evtid; | ||
| 40 | char *name; | ||
| 41 | struct list_head list; | ||
| 42 | }; | ||
| 43 | |||
| 44 | /** | ||
| 45 | * struct mon_data_bits - Monitoring details for each event file | ||
| 46 | * @rid: Resource id associated with the event file. | ||
| 47 | * @evtid: Event id associated with the event file | ||
| 48 | * @domid: The domain to which the event file belongs | ||
| 49 | */ | ||
| 50 | union mon_data_bits { | ||
| 51 | void *priv; | ||
| 52 | struct { | ||
| 53 | unsigned int rid : 10; | ||
| 54 | unsigned int evtid : 8; | ||
| 55 | unsigned int domid : 14; | ||
| 56 | } u; | ||
| 57 | }; | ||
| 58 | |||
| 59 | struct rmid_read { | ||
| 60 | struct rdtgroup *rgrp; | ||
| 61 | struct rdt_domain *d; | ||
| 62 | int evtid; | ||
| 63 | bool first; | ||
| 64 | u64 val; | ||
| 65 | }; | ||
| 66 | |||
| 67 | extern unsigned int intel_cqm_threshold; | ||
| 68 | extern bool rdt_alloc_capable; | ||
| 69 | extern bool rdt_mon_capable; | ||
| 70 | extern unsigned int rdt_mon_features; | ||
| 71 | |||
| 72 | enum rdt_group_type { | ||
| 73 | RDTCTRL_GROUP = 0, | ||
| 74 | RDTMON_GROUP, | ||
| 75 | RDT_NUM_GROUP, | ||
| 76 | }; | ||
| 77 | |||
| 78 | /** | ||
| 79 | * struct mongroup - store mon group's data in resctrl fs. | ||
| 80 | * @mon_data_kn: kernfs node for the mon_data directory | ||
| 81 | * @parent: parent rdtgrp | ||
| 82 | * @crdtgrp_list: child rdtgroup node list | ||
| 83 | * @rmid: rmid for this rdtgroup | ||
| 84 | */ | ||
| 85 | struct mongroup { | ||
| 86 | struct kernfs_node *mon_data_kn; | ||
| 87 | struct rdtgroup *parent; | ||
| 88 | struct list_head crdtgrp_list; | ||
| 89 | u32 rmid; | ||
| 90 | }; | ||
| 91 | |||
| 92 | /** | ||
| 93 | * struct rdtgroup - store rdtgroup's data in resctrl file system. | ||
| 94 | * @kn: kernfs node | ||
| 95 | * @rdtgroup_list: linked list for all rdtgroups | ||
| 96 | * @closid: closid for this rdtgroup | ||
| 97 | * @cpu_mask: CPUs assigned to this rdtgroup | ||
| 98 | * @flags: status bits | ||
| 99 | * @waitcount: how many cpus expect to find this | ||
| 100 | * group when they acquire rdtgroup_mutex | ||
| 101 | * @type: indicates type of this rdtgroup - either | ||
| 102 | * monitor only or ctrl_mon group | ||
| 103 | * @mon: mongroup related data | ||
| 104 | */ | ||
| 105 | struct rdtgroup { | ||
| 106 | struct kernfs_node *kn; | ||
| 107 | struct list_head rdtgroup_list; | ||
| 108 | u32 closid; | ||
| 109 | struct cpumask cpu_mask; | ||
| 110 | int flags; | ||
| 111 | atomic_t waitcount; | ||
| 112 | enum rdt_group_type type; | ||
| 113 | struct mongroup mon; | ||
| 114 | }; | ||
| 115 | |||
| 116 | /* rdtgroup.flags */ | ||
| 117 | #define RDT_DELETED 1 | ||
| 118 | |||
| 119 | /* rftype.flags */ | ||
| 120 | #define RFTYPE_FLAGS_CPUS_LIST 1 | ||
| 121 | |||
| 122 | /* | ||
| 123 | * Define the file type flags for base and info directories. | ||
| 124 | */ | ||
| 125 | #define RFTYPE_INFO BIT(0) | ||
| 126 | #define RFTYPE_BASE BIT(1) | ||
| 127 | #define RF_CTRLSHIFT 4 | ||
| 128 | #define RF_MONSHIFT 5 | ||
| 129 | #define RFTYPE_CTRL BIT(RF_CTRLSHIFT) | ||
| 130 | #define RFTYPE_MON BIT(RF_MONSHIFT) | ||
| 131 | #define RFTYPE_RES_CACHE BIT(8) | ||
| 132 | #define RFTYPE_RES_MB BIT(9) | ||
| 133 | #define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) | ||
| 134 | #define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) | ||
| 135 | #define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) | ||
| 136 | |||
| 137 | /* List of all resource groups */ | ||
| 138 | extern struct list_head rdt_all_groups; | ||
| 139 | |||
| 140 | extern int max_name_width, max_data_width; | ||
| 141 | |||
| 142 | int __init rdtgroup_init(void); | ||
| 143 | |||
| 144 | /** | ||
| 145 | * struct rftype - describe each file in the resctrl file system | ||
| 146 | * @name: File name | ||
| 147 | * @mode: Access mode | ||
| 148 | * @kf_ops: File operations | ||
| 149 | * @flags: File specific RFTYPE_FLAGS_* flags | ||
| 150 | * @fflags: File specific RF_* or RFTYPE_* flags | ||
| 151 | * @seq_show: Show content of the file | ||
| 152 | * @write: Write to the file | ||
| 153 | */ | ||
| 154 | struct rftype { | ||
| 155 | char *name; | ||
| 156 | umode_t mode; | ||
| 157 | struct kernfs_ops *kf_ops; | ||
| 158 | unsigned long flags; | ||
| 159 | unsigned long fflags; | ||
| 160 | |||
| 161 | int (*seq_show)(struct kernfs_open_file *of, | ||
| 162 | struct seq_file *sf, void *v); | ||
| 163 | /* | ||
| 164 | * write() is the generic write callback which maps directly to | ||
| 165 | * kernfs write operation and overrides all other operations. | ||
| 166 | * Maximum write size is determined by ->max_write_len. | ||
| 167 | */ | ||
| 168 | ssize_t (*write)(struct kernfs_open_file *of, | ||
| 169 | char *buf, size_t nbytes, loff_t off); | ||
| 170 | }; | ||
| 171 | |||
| 172 | /** | ||
| 173 | * struct mbm_state - status for each MBM counter in each domain | ||
| 174 | * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) | ||
| 175 | * @prev_msr: Value of IA32_QM_CTR for this RMID last time we read it | ||
| 176 | */ | ||
| 177 | struct mbm_state { | ||
| 178 | u64 chunks; | ||
| 179 | u64 prev_msr; | ||
| 180 | }; | ||
| 181 | |||
| 182 | /** | ||
| 183 | * struct rdt_domain - group of cpus sharing an RDT resource | ||
| 184 | * @list: all instances of this resource | ||
| 185 | * @id: unique id for this instance | ||
| 186 | * @cpu_mask: which cpus share this resource | ||
| 187 | * @rmid_busy_llc: | ||
| 188 | * bitmap of which limbo RMIDs are above threshold | ||
| 189 | * @mbm_total: saved state for MBM total bandwidth | ||
| 190 | * @mbm_local: saved state for MBM local bandwidth | ||
| 191 | * @mbm_over: worker to periodically read MBM h/w counters | ||
| 192 | * @cqm_limbo: worker to periodically read CQM h/w counters | ||
| 193 | * @mbm_work_cpu: | ||
| 194 | * worker cpu for MBM h/w counters | ||
| 195 | * @cqm_work_cpu: | ||
| 196 | * worker cpu for CQM h/w counters | ||
| 197 | * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) | ||
| 198 | * @new_ctrl: new ctrl value to be loaded | ||
| 199 | * @have_new_ctrl: did user provide new_ctrl for this domain | ||
| 200 | */ | ||
| 201 | struct rdt_domain { | ||
| 202 | struct list_head list; | ||
| 203 | int id; | ||
| 204 | struct cpumask cpu_mask; | ||
| 205 | unsigned long *rmid_busy_llc; | ||
| 206 | struct mbm_state *mbm_total; | ||
| 207 | struct mbm_state *mbm_local; | ||
| 208 | struct delayed_work mbm_over; | ||
| 209 | struct delayed_work cqm_limbo; | ||
| 210 | int mbm_work_cpu; | ||
| 211 | int cqm_work_cpu; | ||
| 212 | u32 *ctrl_val; | ||
| 213 | u32 new_ctrl; | ||
| 214 | bool have_new_ctrl; | ||
| 215 | }; | ||
| 216 | |||
| 217 | /** | ||
| 218 | * struct msr_param - set a range of MSRs from a domain | ||
| 219 | * @res: The resource to use | ||
| 220 | * @low: Beginning index from base MSR | ||
| 221 | * @high: End index | ||
| 222 | */ | ||
| 223 | struct msr_param { | ||
| 224 | struct rdt_resource *res; | ||
| 225 | int low; | ||
| 226 | int high; | ||
| 227 | }; | ||
| 228 | |||
| 229 | /** | ||
| 230 | * struct rdt_cache - Cache allocation related data | ||
| 231 | * @cbm_len: Length of the cache bit mask | ||
| 232 | * @min_cbm_bits: Minimum number of consecutive bits to be set | ||
| 233 | * @cbm_idx_mult: Multiplier of CBM index | ||
| 234 | * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: | ||
| 235 | * closid * cbm_idx_multi + cbm_idx_offset | ||
| 236 | * in a cache bit mask | ||
| 237 | * @shareable_bits: Bitmask of shareable resource with other | ||
| 238 | * executing entities | ||
| 239 | */ | ||
| 240 | struct rdt_cache { | ||
| 241 | unsigned int cbm_len; | ||
| 242 | unsigned int min_cbm_bits; | ||
| 243 | unsigned int cbm_idx_mult; | ||
| 244 | unsigned int cbm_idx_offset; | ||
| 245 | unsigned int shareable_bits; | ||
| 246 | }; | ||
| 247 | |||
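The cbm_idx computation documented above (closid * cbm_idx_mult + cbm_idx_offset) is what lets the L3DATA and L3CODE resources interleave within one row of CBM MSRs under CDP. A worked example, assuming the conventional CDP layout of multiplier 2 with offsets 0 and 1 (those values are not visible in this hunk, so treat them as an assumption):

    /* Worked example of cbm_idx(): CLOSID -> index from the resource's MSR base. */
    #include <stdio.h>

    static unsigned int cbm_idx(unsigned int mult, unsigned int offset, unsigned int closid)
    {
        return closid * mult + offset;
    }

    int main(void)
    {
        unsigned int closid = 3;

        /* Plain L3 (no CDP): mult = 1, offset = 0 -> MSR index 3 */
        printf("L3     : %u\n", cbm_idx(1, 0, closid));
        /* Assumed CDP layout: L3DATA mult = 2, offset = 0 -> 6; L3CODE offset = 1 -> 7 */
        printf("L3DATA : %u\n", cbm_idx(2, 0, closid));
        printf("L3CODE : %u\n", cbm_idx(2, 1, closid));
        return 0;
    }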
| 248 | /** | ||
| 249 | * struct rdt_membw - Memory bandwidth allocation related data | ||
| 250 | * @max_delay: Max throttle delay. Delay is the hardware | ||
| 251 | * representation for memory bandwidth. | ||
| 252 | * @min_bw: Minimum memory bandwidth percentage user can request | ||
| 253 | * @bw_gran: Granularity at which the memory bandwidth is allocated | ||
| 254 | * @delay_linear: True if memory B/W delay is in linear scale | ||
| 255 | * @mb_map: Mapping of memory B/W percentage to memory B/W delay | ||
| 256 | */ | ||
| 257 | struct rdt_membw { | ||
| 258 | u32 max_delay; | ||
| 259 | u32 min_bw; | ||
| 260 | u32 bw_gran; | ||
| 261 | u32 delay_linear; | ||
| 262 | u32 *mb_map; | ||
| 263 | }; | ||
| 264 | |||
| 265 | static inline bool is_llc_occupancy_enabled(void) | ||
| 266 | { | ||
| 267 | return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); | ||
| 268 | } | ||
| 269 | |||
| 270 | static inline bool is_mbm_total_enabled(void) | ||
| 271 | { | ||
| 272 | return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); | ||
| 273 | } | ||
| 274 | |||
| 275 | static inline bool is_mbm_local_enabled(void) | ||
| 276 | { | ||
| 277 | return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); | ||
| 278 | } | ||
| 279 | |||
| 280 | static inline bool is_mbm_enabled(void) | ||
| 281 | { | ||
| 282 | return (is_mbm_total_enabled() || is_mbm_local_enabled()); | ||
| 283 | } | ||
| 284 | |||
| 285 | static inline bool is_mbm_event(int e) | ||
| 286 | { | ||
| 287 | return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && | ||
| 288 | e <= QOS_L3_MBM_LOCAL_EVENT_ID); | ||
| 289 | } | ||
| 290 | |||
| 291 | /** | ||
| 292 | * struct rdt_resource - attributes of an RDT resource | ||
| 293 | * @rid: The index of the resource | ||
| 294 | * @alloc_enabled: Is allocation enabled on this machine | ||
| 295 | * @mon_enabled: Is monitoring enabled for this feature | ||
| 296 | * @alloc_capable: Is allocation available on this machine | ||
| 297 | * @mon_capable: Is monitor feature available on this machine | ||
| 298 | * @name: Name to use in "schemata" file | ||
| 299 | * @num_closid: Number of CLOSIDs available | ||
| 300 | * @cache_level: Which cache level defines scope of this resource | ||
| 301 | * @default_ctrl: Specifies default cache cbm or memory B/W percent. | ||
| 302 | * @msr_base: Base MSR address for CBMs | ||
| 303 | * @msr_update: Function pointer to update QOS MSRs | ||
| 304 | * @data_width: Character width of data when displaying | ||
| 305 | * @domains: All domains for this resource | ||
| 306 | * @cache: Cache allocation related data | ||
| 307 | * @format_str: Per resource format string to show domain value | ||
| 308 | * @parse_ctrlval: Per resource function pointer to parse control values | ||
| 309 | * @evt_list: List of monitoring events | ||
| 310 | * @num_rmid: Number of RMIDs available | ||
| 311 | * @mon_scale: cqm counter * mon_scale = occupancy in bytes | ||
| 312 | * @fflags: flags to choose base and info files | ||
| 313 | */ | ||
| 314 | struct rdt_resource { | ||
| 315 | int rid; | ||
| 316 | bool alloc_enabled; | ||
| 317 | bool mon_enabled; | ||
| 318 | bool alloc_capable; | ||
| 319 | bool mon_capable; | ||
| 320 | char *name; | ||
| 321 | int num_closid; | ||
| 322 | int cache_level; | ||
| 323 | u32 default_ctrl; | ||
| 324 | unsigned int msr_base; | ||
| 325 | void (*msr_update) (struct rdt_domain *d, struct msr_param *m, | ||
| 326 | struct rdt_resource *r); | ||
| 327 | int data_width; | ||
| 328 | struct list_head domains; | ||
| 329 | struct rdt_cache cache; | ||
| 330 | struct rdt_membw membw; | ||
| 331 | const char *format_str; | ||
| 332 | int (*parse_ctrlval) (char *buf, struct rdt_resource *r, | ||
| 333 | struct rdt_domain *d); | ||
| 334 | struct list_head evt_list; | ||
| 335 | int num_rmid; | ||
| 336 | unsigned int mon_scale; | ||
| 337 | unsigned long fflags; | ||
| 338 | }; | ||
| 339 | |||
| 340 | int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); | ||
| 341 | int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); | ||
| 342 | |||
| 343 | extern struct mutex rdtgroup_mutex; | ||
| 344 | |||
| 345 | extern struct rdt_resource rdt_resources_all[]; | ||
| 346 | extern struct rdtgroup rdtgroup_default; | ||
| 347 | DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); | ||
| 348 | |||
| 349 | int __init rdtgroup_init(void); | ||
| 350 | |||
| 351 | enum { | ||
| 352 | RDT_RESOURCE_L3, | ||
| 353 | RDT_RESOURCE_L3DATA, | ||
| 354 | RDT_RESOURCE_L3CODE, | ||
| 355 | RDT_RESOURCE_L2, | ||
| 356 | RDT_RESOURCE_MBA, | ||
| 357 | |||
| 358 | /* Must be the last */ | ||
| 359 | RDT_NUM_RESOURCES, | ||
| 360 | }; | ||
| 361 | |||
| 362 | #define for_each_capable_rdt_resource(r) \ | ||
| 363 | for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ | ||
| 364 | r++) \ | ||
| 365 | if (r->alloc_capable || r->mon_capable) | ||
| 366 | |||
| 367 | #define for_each_alloc_capable_rdt_resource(r) \ | ||
| 368 | for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ | ||
| 369 | r++) \ | ||
| 370 | if (r->alloc_capable) | ||
| 371 | |||
| 372 | #define for_each_mon_capable_rdt_resource(r) \ | ||
| 373 | for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ | ||
| 374 | r++) \ | ||
| 375 | if (r->mon_capable) | ||
| 376 | |||
| 377 | #define for_each_alloc_enabled_rdt_resource(r) \ | ||
| 378 | for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ | ||
| 379 | r++) \ | ||
| 380 | if (r->alloc_enabled) | ||
| 381 | |||
| 382 | #define for_each_mon_enabled_rdt_resource(r) \ | ||
| 383 | for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ | ||
| 384 | r++) \ | ||
| 385 | if (r->mon_enabled) | ||
| 386 | |||
| 387 | /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ | ||
| 388 | union cpuid_0x10_1_eax { | ||
| 389 | struct { | ||
| 390 | unsigned int cbm_len:5; | ||
| 391 | } split; | ||
| 392 | unsigned int full; | ||
| 393 | }; | ||
| 394 | |||
| 395 | /* CPUID.(EAX=10H, ECX=ResID=3).EAX */ | ||
| 396 | union cpuid_0x10_3_eax { | ||
| 397 | struct { | ||
| 398 | unsigned int max_delay:12; | ||
| 399 | } split; | ||
| 400 | unsigned int full; | ||
| 401 | }; | ||
| 402 | |||
| 403 | /* CPUID.(EAX=10H, ECX=ResID).EDX */ | ||
| 404 | union cpuid_0x10_x_edx { | ||
| 405 | struct { | ||
| 406 | unsigned int cos_max:16; | ||
| 407 | } split; | ||
| 408 | unsigned int full; | ||
| 409 | }; | ||
| 410 | |||
| 411 | void rdt_ctrl_update(void *arg); | ||
| 412 | struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); | ||
| 413 | void rdtgroup_kn_unlock(struct kernfs_node *kn); | ||
| 414 | struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, | ||
| 415 | struct list_head **pos); | ||
| 416 | ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, | ||
| 417 | char *buf, size_t nbytes, loff_t off); | ||
| 418 | int rdtgroup_schemata_show(struct kernfs_open_file *of, | ||
| 419 | struct seq_file *s, void *v); | ||
| 420 | struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); | ||
| 421 | int alloc_rmid(void); | ||
| 422 | void free_rmid(u32 rmid); | ||
| 423 | int rdt_get_mon_l3_config(struct rdt_resource *r); | ||
| 424 | void mon_event_count(void *info); | ||
| 425 | int rdtgroup_mondata_show(struct seq_file *m, void *arg); | ||
| 426 | void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, | ||
| 427 | unsigned int dom_id); | ||
| 428 | void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, | ||
| 429 | struct rdt_domain *d); | ||
| 430 | void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, | ||
| 431 | struct rdtgroup *rdtgrp, int evtid, int first); | ||
| 432 | void mbm_setup_overflow_handler(struct rdt_domain *dom, | ||
| 433 | unsigned long delay_ms); | ||
| 434 | void mbm_handle_overflow(struct work_struct *work); | ||
| 435 | void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); | ||
| 436 | void cqm_handle_limbo(struct work_struct *work); | ||
| 437 | bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); | ||
| 438 | void __check_limbo(struct rdt_domain *d, bool force_free); | ||
| 439 | |||
| 440 | #endif /* _ASM_X86_INTEL_RDT_H */ | ||
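For reference, the CPUID unions at the end of the header are how rdt_get_cache_alloc_cfg() decodes leaf 0x10; both fields are "minus one" encoded. A standalone sketch with made-up register values standing in for real CPUID output:

    /* Sketch of the CPUID.(EAX=10H, ECX=1) decode using the unions above. */
    #include <stdio.h>

    union cpuid_0x10_1_eax {
        struct { unsigned int cbm_len:5; } split;
        unsigned int full;
    };

    union cpuid_0x10_x_edx {
        struct { unsigned int cos_max:16; } split;
        unsigned int full;
    };

    int main(void)
    {
        union cpuid_0x10_1_eax eax = { .full = 0x0000000a };  /* assumed: cbm_len field = 10 */
        union cpuid_0x10_x_edx edx = { .full = 0x0000000f };  /* assumed: cos_max field = 15 */

        /* Mirrors rdt_get_cache_alloc_cfg(): both fields are "value minus one" encoded */
        printf("num_closid = %u\n", edx.split.cos_max + 1);   /* 16 CLOSIDs */
        printf("cbm_len    = %u\n", eax.split.cbm_len + 1);   /* 11-bit cache bit mask */
        return 0;
    }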
diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 406d7a6532f9..f6ea94f8954a 100644 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | |||
| @@ -26,7 +26,7 @@ | |||
| 26 | #include <linux/kernfs.h> | 26 | #include <linux/kernfs.h> |
| 27 | #include <linux/seq_file.h> | 27 | #include <linux/seq_file.h> |
| 28 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
| 29 | #include <asm/intel_rdt.h> | 29 | #include "intel_rdt.h" |
| 30 | 30 | ||
| 31 | /* | 31 | /* |
| 32 | * Check whether MBA bandwidth percentage value is correct. The value is | 32 | * Check whether MBA bandwidth percentage value is correct. The value is |
| @@ -192,7 +192,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid) | |||
| 192 | { | 192 | { |
| 193 | struct rdt_resource *r; | 193 | struct rdt_resource *r; |
| 194 | 194 | ||
| 195 | for_each_enabled_rdt_resource(r) { | 195 | for_each_alloc_enabled_rdt_resource(r) { |
| 196 | if (!strcmp(resname, r->name) && closid < r->num_closid) | 196 | if (!strcmp(resname, r->name) && closid < r->num_closid) |
| 197 | return parse_line(tok, r); | 197 | return parse_line(tok, r); |
| 198 | } | 198 | } |
| @@ -221,7 +221,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, | |||
| 221 | 221 | ||
| 222 | closid = rdtgrp->closid; | 222 | closid = rdtgrp->closid; |
| 223 | 223 | ||
| 224 | for_each_enabled_rdt_resource(r) { | 224 | for_each_alloc_enabled_rdt_resource(r) { |
| 225 | list_for_each_entry(dom, &r->domains, list) | 225 | list_for_each_entry(dom, &r->domains, list) |
| 226 | dom->have_new_ctrl = false; | 226 | dom->have_new_ctrl = false; |
| 227 | } | 227 | } |
| @@ -237,7 +237,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, | |||
| 237 | goto out; | 237 | goto out; |
| 238 | } | 238 | } |
| 239 | 239 | ||
| 240 | for_each_enabled_rdt_resource(r) { | 240 | for_each_alloc_enabled_rdt_resource(r) { |
| 241 | ret = update_domains(r, closid); | 241 | ret = update_domains(r, closid); |
| 242 | if (ret) | 242 | if (ret) |
| 243 | goto out; | 243 | goto out; |
| @@ -269,12 +269,13 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, | |||
| 269 | { | 269 | { |
| 270 | struct rdtgroup *rdtgrp; | 270 | struct rdtgroup *rdtgrp; |
| 271 | struct rdt_resource *r; | 271 | struct rdt_resource *r; |
| 272 | int closid, ret = 0; | 272 | int ret = 0; |
| 273 | u32 closid; | ||
| 273 | 274 | ||
| 274 | rdtgrp = rdtgroup_kn_lock_live(of->kn); | 275 | rdtgrp = rdtgroup_kn_lock_live(of->kn); |
| 275 | if (rdtgrp) { | 276 | if (rdtgrp) { |
| 276 | closid = rdtgrp->closid; | 277 | closid = rdtgrp->closid; |
| 277 | for_each_enabled_rdt_resource(r) { | 278 | for_each_alloc_enabled_rdt_resource(r) { |
| 278 | if (closid < r->num_closid) | 279 | if (closid < r->num_closid) |
| 279 | show_doms(s, r, closid); | 280 | show_doms(s, r, closid); |
| 280 | } | 281 | } |
| @@ -284,3 +285,57 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, | |||
| 284 | rdtgroup_kn_unlock(of->kn); | 285 | rdtgroup_kn_unlock(of->kn); |
| 285 | return ret; | 286 | return ret; |
| 286 | } | 287 | } |
| 288 | |||
| 289 | void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, | ||
| 290 | struct rdtgroup *rdtgrp, int evtid, int first) | ||
| 291 | { | ||
| 292 | /* | ||
| 293 | * set up the parameters to send to the IPI to read the data. | ||
| 294 | */ | ||
| 295 | rr->rgrp = rdtgrp; | ||
| 296 | rr->evtid = evtid; | ||
| 297 | rr->d = d; | ||
| 298 | rr->val = 0; | ||
| 299 | rr->first = first; | ||
| 300 | |||
| 301 | smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); | ||
| 302 | } | ||
| 303 | |||
| 304 | int rdtgroup_mondata_show(struct seq_file *m, void *arg) | ||
| 305 | { | ||
| 306 | struct kernfs_open_file *of = m->private; | ||
| 307 | u32 resid, evtid, domid; | ||
| 308 | struct rdtgroup *rdtgrp; | ||
| 309 | struct rdt_resource *r; | ||
| 310 | union mon_data_bits md; | ||
| 311 | struct rdt_domain *d; | ||
| 312 | struct rmid_read rr; | ||
| 313 | int ret = 0; | ||
| 314 | |||
| 315 | rdtgrp = rdtgroup_kn_lock_live(of->kn); | ||
| 316 | |||
| 317 | md.priv = of->kn->priv; | ||
| 318 | resid = md.u.rid; | ||
| 319 | domid = md.u.domid; | ||
| 320 | evtid = md.u.evtid; | ||
| 321 | |||
| 322 | r = &rdt_resources_all[resid]; | ||
| 323 | d = rdt_find_domain(r, domid, NULL); | ||
| 324 | if (!d) { | ||
| 325 | ret = -ENOENT; | ||
| 326 | goto out; | ||
| 327 | } | ||
| 328 | |||
| 329 | mon_event_read(&rr, d, rdtgrp, evtid, false); | ||
| 330 | |||
| 331 | if (rr.val & RMID_VAL_ERROR) | ||
| 332 | seq_puts(m, "Error\n"); | ||
| 333 | else if (rr.val & RMID_VAL_UNAVAIL) | ||
| 334 | seq_puts(m, "Unavailable\n"); | ||
| 335 | else | ||
| 336 | seq_printf(m, "%llu\n", rr.val * r->mon_scale); | ||
| 337 | |||
| 338 | out: | ||
| 339 | rdtgroup_kn_unlock(of->kn); | ||
| 340 | return ret; | ||
| 341 | } | ||
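rdtgroup_mondata_show() recovers the resource, domain and event of a mon_data file from the kernfs private pointer via union mon_data_bits. A standalone sketch of that pack/unpack round trip (field widths copied from intel_rdt.h; the example values are arbitrary):

    /* Sketch of the mon_data_bits pack/unpack used for mon_data event files. */
    #include <stdio.h>

    union mon_data_bits {
        void *priv;
        struct {
            unsigned int rid   : 10;
            unsigned int evtid : 8;
            unsigned int domid : 14;
        } u;
    };

    int main(void)
    {
        union mon_data_bits md = { .priv = NULL };

        /* mkdir side: encode resource 0, event 0x02 (MBM total), domain 1 (example values) */
        md.u.rid   = 0;
        md.u.evtid = 0x02;
        md.u.domid = 1;

        /* show side: everything comes back out of the same pointer-sized union */
        printf("rid=%u evtid=0x%02x domid=%u\n",
               (unsigned)md.u.rid, (unsigned)md.u.evtid, (unsigned)md.u.domid);
        return 0;
    }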
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c new file mode 100644 index 000000000000..30827510094b --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c | |||
| @@ -0,0 +1,499 @@ | |||
| 1 | /* | ||
| 2 | * Resource Director Technology(RDT) | ||
| 3 | * - Monitoring code | ||
| 4 | * | ||
| 5 | * Copyright (C) 2017 Intel Corporation | ||
| 6 | * | ||
| 7 | * Author: | ||
| 8 | * Vikas Shivappa <vikas.shivappa@intel.com> | ||
| 9 | * | ||
| 10 | * This replaces the perf-based cqm.c, but we reuse a lot of | ||
| 11 | * code and data structures originally from Peter Zijlstra and Matt Fleming. | ||
| 12 | * | ||
| 13 | * This program is free software; you can redistribute it and/or modify it | ||
| 14 | * under the terms and conditions of the GNU General Public License, | ||
| 15 | * version 2, as published by the Free Software Foundation. | ||
| 16 | * | ||
| 17 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
| 18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 20 | * more details. | ||
| 21 | * | ||
| 22 | * More information about RDT can be found in the Intel (R) x86 Architecture | ||
| 23 | * Software Developer Manual June 2016, volume 3, section 17.17. | ||
| 24 | */ | ||
| 25 | |||
| 26 | #include <linux/module.h> | ||
| 27 | #include <linux/slab.h> | ||
| 28 | #include <asm/cpu_device_id.h> | ||
| 29 | #include "intel_rdt.h" | ||
| 30 | |||
| 31 | #define MSR_IA32_QM_CTR 0x0c8e | ||
| 32 | #define MSR_IA32_QM_EVTSEL 0x0c8d | ||
| 33 | |||
| 34 | struct rmid_entry { | ||
| 35 | u32 rmid; | ||
| 36 | int busy; | ||
| 37 | struct list_head list; | ||
| 38 | }; | ||
| 39 | |||
| 40 | /** | ||
| 41 | * @rmid_free_lru A least recently used list of free RMIDs | ||
| 42 | * These RMIDs are guaranteed to have an occupancy less than the | ||
| 43 | * threshold occupancy | ||
| 44 | */ | ||
| 45 | static LIST_HEAD(rmid_free_lru); | ||
| 46 | |||
| 47 | /** | ||
| 48 | * @rmid_limbo_count count of currently unused but (potentially) | ||
| 49 | * dirty RMIDs. | ||
| 50 | * This counts RMIDs that no one is currently using but that | ||
| 51 | * may have an occupancy value > intel_cqm_threshold. User can change | ||
| 52 | * the threshold occupancy value. | ||
| 53 | */ | ||
| 54 | unsigned int rmid_limbo_count; | ||
| 55 | |||
| 56 | /** | ||
| 57 | * @rmid_entry - The entry in the limbo and free lists. | ||
| 58 | */ | ||
| 59 | static struct rmid_entry *rmid_ptrs; | ||
| 60 | |||
| 61 | /* | ||
| 62 | * Global boolean for rdt_monitor which is true if any | ||
| 63 | * resource monitoring is enabled. | ||
| 64 | */ | ||
| 65 | bool rdt_mon_capable; | ||
| 66 | |||
| 67 | /* | ||
| 68 | * Global to indicate which monitoring events are enabled. | ||
| 69 | */ | ||
| 70 | unsigned int rdt_mon_features; | ||
| 71 | |||
| 72 | /* | ||
| 73 | * This is the threshold cache occupancy at which we will consider an | ||
| 74 | * RMID available for re-allocation. | ||
| 75 | */ | ||
| 76 | unsigned int intel_cqm_threshold; | ||
| 77 | |||
| 78 | static inline struct rmid_entry *__rmid_entry(u32 rmid) | ||
| 79 | { | ||
| 80 | struct rmid_entry *entry; | ||
| 81 | |||
| 82 | entry = &rmid_ptrs[rmid]; | ||
| 83 | WARN_ON(entry->rmid != rmid); | ||
| 84 | |||
| 85 | return entry; | ||
| 86 | } | ||
| 87 | |||
| 88 | static u64 __rmid_read(u32 rmid, u32 eventid) | ||
| 89 | { | ||
| 90 | u64 val; | ||
| 91 | |||
| 92 | /* | ||
| 93 | * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured | ||
| 94 | * with a valid event code for supported resource type and the bits | ||
| 95 | * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, | ||
| 96 | * IA32_QM_CTR.data (bits 61:0) reports the monitored data. | ||
| 97 | * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) | ||
| 98 | * are error bits. | ||
| 99 | */ | ||
| 100 | wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); | ||
| 101 | rdmsrl(MSR_IA32_QM_CTR, val); | ||
| 102 | |||
| 103 | return val; | ||
| 104 | } | ||
| 105 | |||
| 106 | static bool rmid_dirty(struct rmid_entry *entry) | ||
| 107 | { | ||
| 108 | u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); | ||
| 109 | |||
| 110 | return val >= intel_cqm_threshold; | ||
| 111 | } | ||
| 112 | |||
| 113 | /* | ||
| 114 | * Check the RMIDs that are marked as busy for this domain. If the | ||
| 115 | * reported LLC occupancy is below the threshold clear the busy bit and | ||
| 116 | * decrement the count. If the busy count gets to zero on an RMID, we | ||
| 117 | * free the RMID | ||
| 118 | */ | ||
| 119 | void __check_limbo(struct rdt_domain *d, bool force_free) | ||
| 120 | { | ||
| 121 | struct rmid_entry *entry; | ||
| 122 | struct rdt_resource *r; | ||
| 123 | u32 crmid = 1, nrmid; | ||
| 124 | |||
| 125 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | ||
| 126 | |||
| 127 | /* | ||
| 128 | * Skip RMID 0 and start from RMID 1 and check all the RMIDs that | ||
| 129 | * are marked as busy for occupancy < threshold. If the occupancy | ||
| 130 | * is less than the threshold decrement the busy counter of the | ||
| 131 | * RMID and move it to the free list when the counter reaches 0. | ||
| 132 | */ | ||
| 133 | for (;;) { | ||
| 134 | nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); | ||
| 135 | if (nrmid >= r->num_rmid) | ||
| 136 | break; | ||
| 137 | |||
| 138 | entry = __rmid_entry(nrmid); | ||
| 139 | if (force_free || !rmid_dirty(entry)) { | ||
| 140 | clear_bit(entry->rmid, d->rmid_busy_llc); | ||
| 141 | if (!--entry->busy) { | ||
| 142 | rmid_limbo_count--; | ||
| 143 | list_add_tail(&entry->list, &rmid_free_lru); | ||
| 144 | } | ||
| 145 | } | ||
| 146 | crmid = nrmid + 1; | ||
| 147 | } | ||
| 148 | } | ||
| 149 | |||
| 150 | bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) | ||
| 151 | { | ||
| 152 | return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; | ||
| 153 | } | ||
| 154 | |||
| 155 | /* | ||
| 156 | * As of now, RMID allocation is global. | ||
| 157 | * However, we keep track of which packages the RMIDs | ||
| 158 | * are used on to optimize the limbo list management. | ||
| 159 | */ | ||
| 160 | int alloc_rmid(void) | ||
| 161 | { | ||
| 162 | struct rmid_entry *entry; | ||
| 163 | |||
| 164 | lockdep_assert_held(&rdtgroup_mutex); | ||
| 165 | |||
| 166 | if (list_empty(&rmid_free_lru)) | ||
| 167 | return rmid_limbo_count ? -EBUSY : -ENOSPC; | ||
| 168 | |||
| 169 | entry = list_first_entry(&rmid_free_lru, | ||
| 170 | struct rmid_entry, list); | ||
| 171 | list_del(&entry->list); | ||
| 172 | |||
| 173 | return entry->rmid; | ||
| 174 | } | ||
| 175 | |||
| 176 | static void add_rmid_to_limbo(struct rmid_entry *entry) | ||
| 177 | { | ||
| 178 | struct rdt_resource *r; | ||
| 179 | struct rdt_domain *d; | ||
| 180 | int cpu; | ||
| 181 | u64 val; | ||
| 182 | |||
| 183 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | ||
| 184 | |||
| 185 | entry->busy = 0; | ||
| 186 | cpu = get_cpu(); | ||
| 187 | list_for_each_entry(d, &r->domains, list) { | ||
| 188 | if (cpumask_test_cpu(cpu, &d->cpu_mask)) { | ||
| 189 | val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); | ||
| 190 | if (val <= intel_cqm_threshold) | ||
| 191 | continue; | ||
| 192 | } | ||
| 193 | |||
| 194 | /* | ||
| 195 | * For the first limbo RMID in the domain, | ||
| 196 | * set up the limbo worker. | ||
| 197 | */ | ||
| 198 | if (!has_busy_rmid(r, d)) | ||
| 199 | cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); | ||
| 200 | set_bit(entry->rmid, d->rmid_busy_llc); | ||
| 201 | entry->busy++; | ||
| 202 | } | ||
| 203 | put_cpu(); | ||
| 204 | |||
| 205 | if (entry->busy) | ||
| 206 | rmid_limbo_count++; | ||
| 207 | else | ||
| 208 | list_add_tail(&entry->list, &rmid_free_lru); | ||
| 209 | } | ||
| 210 | |||
| 211 | void free_rmid(u32 rmid) | ||
| 212 | { | ||
| 213 | struct rmid_entry *entry; | ||
| 214 | |||
| 215 | if (!rmid) | ||
| 216 | return; | ||
| 217 | |||
| 218 | lockdep_assert_held(&rdtgroup_mutex); | ||
| 219 | |||
| 220 | entry = __rmid_entry(rmid); | ||
| 221 | |||
| 222 | if (is_llc_occupancy_enabled()) | ||
| 223 | add_rmid_to_limbo(entry); | ||
| 224 | else | ||
| 225 | list_add_tail(&entry->list, &rmid_free_lru); | ||
| 226 | } | ||
| 227 | |||
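Taken together, alloc_rmid(), free_rmid() and the limbo machinery give each RMID a simple lifecycle: free, in use, then possibly parked in limbo until its cached LLC occupancy drains below intel_cqm_threshold. A compressed simulation of that lifecycle (the occupancy readings are made up; this is not the kernel code):

    /* Simulated RMID lifecycle: free -> in use -> limbo -> free. */
    #include <stdio.h>

    enum rmid_state { RMID_FREE, RMID_IN_USE, RMID_LIMBO };
    static const char *name[] = { "free", "in use", "limbo" };

    int main(void)
    {
        unsigned long long threshold = 65536;   /* stand-in for intel_cqm_threshold */
        unsigned long long occupancy = 200000;  /* simulated LLC occupancy at free time */
        enum rmid_state state;

        state = RMID_IN_USE;                    /* alloc_rmid(): taken off rmid_free_lru */
        printf("after alloc: %s\n", name[state]);

        /* free_rmid() with llc_occupancy enabled: an RMID whose occupancy is still
         * above the threshold parks on the limbo list instead of the free list. */
        state = (occupancy > threshold) ? RMID_LIMBO : RMID_FREE;
        printf("after free : %s\n", name[state]);

        /* cqm_handle_limbo() re-reads the occupancy periodically; once it drops
         * below the threshold, __check_limbo() returns the RMID to the free list. */
        occupancy = 1024;
        if (state == RMID_LIMBO && occupancy < threshold)
            state = RMID_FREE;
        printf("after limbo: %s\n", name[state]);
        return 0;
    }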
| 228 | static int __mon_event_count(u32 rmid, struct rmid_read *rr) | ||
| 229 | { | ||
| 230 | u64 chunks, shift, tval; | ||
| 231 | struct mbm_state *m; | ||
| 232 | |||
| 233 | tval = __rmid_read(rmid, rr->evtid); | ||
| 234 | if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { | ||
| 235 | rr->val = tval; | ||
| 236 | return -EINVAL; | ||
| 237 | } | ||
| 238 | switch (rr->evtid) { | ||
| 239 | case QOS_L3_OCCUP_EVENT_ID: | ||
| 240 | rr->val += tval; | ||
| 241 | return 0; | ||
| 242 | case QOS_L3_MBM_TOTAL_EVENT_ID: | ||
| 243 | m = &rr->d->mbm_total[rmid]; | ||
| 244 | break; | ||
| 245 | case QOS_L3_MBM_LOCAL_EVENT_ID: | ||
| 246 | m = &rr->d->mbm_local[rmid]; | ||
| 247 | break; | ||
| 248 | default: | ||
| 249 | /* | ||
| 250 | * Code would never reach here because | ||
| 251 | * an invalid event id would fail the __rmid_read. | ||
| 252 | */ | ||
| 253 | return -EINVAL; | ||
| 254 | } | ||
| 255 | |||
| 256 | if (rr->first) { | ||
| 257 | m->prev_msr = tval; | ||
| 258 | m->chunks = 0; | ||
| 259 | return 0; | ||
| 260 | } | ||
| 261 | |||
| 262 | shift = 64 - MBM_CNTR_WIDTH; | ||
| 263 | chunks = (tval << shift) - (m->prev_msr << shift); | ||
| 264 | chunks >>= shift; | ||
| 265 | m->chunks += chunks; | ||
| 266 | m->prev_msr = tval; | ||
| 267 | |||
| 268 | rr->val += m->chunks; | ||
| 269 | return 0; | ||
| 270 | } | ||
| 271 | |||
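The chunks arithmetic in __mon_event_count() handles wraparound of the 24-bit MBM counter: shifting both samples up by 64 - MBM_CNTR_WIDTH before subtracting makes the difference wrap modulo 2^24. A worked example with values chosen to straddle a wrap:

    /* Worked example of the 24-bit MBM counter delta in __mon_event_count(). */
    #include <stdio.h>

    #define MBM_CNTR_WIDTH 24

    int main(void)
    {
        unsigned long long prev_msr = 0xfffff0;  /* previous sample, near the 24-bit limit */
        unsigned long long tval     = 0x000010;  /* current sample, after the counter wrapped */
        unsigned long long shift    = 64 - MBM_CNTR_WIDTH;
        unsigned long long chunks;

        chunks = (tval << shift) - (prev_msr << shift);
        chunks >>= shift;

        /* Prints 0x20: 0x10 to wrap from 0xfffff0 back to zero, plus 0x10 counted since */
        printf("chunks = 0x%llx\n", chunks);
        return 0;
    }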
| 272 | /* | ||
| 273 | * This is called via IPI to read the CQM/MBM counters | ||
| 274 | * on a domain. | ||
| 275 | */ | ||
| 276 | void mon_event_count(void *info) | ||
| 277 | { | ||
| 278 | struct rdtgroup *rdtgrp, *entry; | ||
| 279 | struct rmid_read *rr = info; | ||
| 280 | struct list_head *head; | ||
| 281 | |||
| 282 | rdtgrp = rr->rgrp; | ||
| 283 | |||
| 284 | if (__mon_event_count(rdtgrp->mon.rmid, rr)) | ||
| 285 | return; | ||
| 286 | |||
| 287 | /* | ||
| 288 | * For Ctrl groups read data from child monitor groups. | ||
| 289 | */ | ||
| 290 | head = &rdtgrp->mon.crdtgrp_list; | ||
| 291 | |||
| 292 | if (rdtgrp->type == RDTCTRL_GROUP) { | ||
| 293 | list_for_each_entry(entry, head, mon.crdtgrp_list) { | ||
| 294 | if (__mon_event_count(entry->mon.rmid, rr)) | ||
| 295 | return; | ||
| 296 | } | ||
| 297 | } | ||
| 298 | } | ||
| 299 | |||
| 300 | static void mbm_update(struct rdt_domain *d, int rmid) | ||
| 301 | { | ||
| 302 | struct rmid_read rr; | ||
| 303 | |||
| 304 | rr.first = false; | ||
| 305 | rr.d = d; | ||
| 306 | |||
| 307 | /* | ||
| 308 | * This is protected against concurrent reads from user space, | ||
| 309 | * as both the user and the overflow worker hold the global mutex. | ||
| 310 | */ | ||
| 311 | if (is_mbm_total_enabled()) { | ||
| 312 | rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; | ||
| 313 | __mon_event_count(rmid, &rr); | ||
| 314 | } | ||
| 315 | if (is_mbm_local_enabled()) { | ||
| 316 | rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; | ||
| 317 | __mon_event_count(rmid, &rr); | ||
| 318 | } | ||
| 319 | } | ||
| 320 | |||
| 321 | /* | ||
| 322 | * Handler to scan the limbo list and move back to the free list | ||
| 323 | * any RMIDs whose occupancy has dropped below intel_cqm_threshold. | ||
| 324 | */ | ||
| 325 | void cqm_handle_limbo(struct work_struct *work) | ||
| 326 | { | ||
| 327 | unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); | ||
| 328 | int cpu = smp_processor_id(); | ||
| 329 | struct rdt_resource *r; | ||
| 330 | struct rdt_domain *d; | ||
| 331 | |||
| 332 | mutex_lock(&rdtgroup_mutex); | ||
| 333 | |||
| 334 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | ||
| 335 | d = get_domain_from_cpu(cpu, r); | ||
| 336 | |||
| 337 | if (!d) { | ||
| 338 | pr_warn_once("Failure to get domain for limbo worker\n"); | ||
| 339 | goto out_unlock; | ||
| 340 | } | ||
| 341 | |||
| 342 | __check_limbo(d, false); | ||
| 343 | |||
| 344 | if (has_busy_rmid(r, d)) | ||
| 345 | schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); | ||
| 346 | |||
| 347 | out_unlock: | ||
| 348 | mutex_unlock(&rdtgroup_mutex); | ||
| 349 | } | ||
| 350 | |||
| 351 | void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) | ||
| 352 | { | ||
| 353 | unsigned long delay = msecs_to_jiffies(delay_ms); | ||
| 354 | struct rdt_resource *r; | ||
| 355 | int cpu; | ||
| 356 | |||
| 357 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | ||
| 358 | |||
| 359 | cpu = cpumask_any(&dom->cpu_mask); | ||
| 360 | dom->cqm_work_cpu = cpu; | ||
| 361 | |||
| 362 | schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); | ||
| 363 | } | ||
| 364 | |||
| 365 | void mbm_handle_overflow(struct work_struct *work) | ||
| 366 | { | ||
| 367 | unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); | ||
| 368 | struct rdtgroup *prgrp, *crgrp; | ||
| 369 | int cpu = smp_processor_id(); | ||
| 370 | struct list_head *head; | ||
| 371 | struct rdt_domain *d; | ||
| 372 | |||
| 373 | mutex_lock(&rdtgroup_mutex); | ||
| 374 | |||
| 375 | if (!static_branch_likely(&rdt_enable_key)) | ||
| 376 | goto out_unlock; | ||
| 377 | |||
| 378 | d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); | ||
| 379 | if (!d) | ||
| 380 | goto out_unlock; | ||
| 381 | |||
| 382 | list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { | ||
| 383 | mbm_update(d, prgrp->mon.rmid); | ||
| 384 | |||
| 385 | head = &prgrp->mon.crdtgrp_list; | ||
| 386 | list_for_each_entry(crgrp, head, mon.crdtgrp_list) | ||
| 387 | mbm_update(d, crgrp->mon.rmid); | ||
| 388 | } | ||
| 389 | |||
| 390 | schedule_delayed_work_on(cpu, &d->mbm_over, delay); | ||
| 391 | |||
| 392 | out_unlock: | ||
| 393 | mutex_unlock(&rdtgroup_mutex); | ||
| 394 | } | ||
| 395 | |||
| 396 | void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) | ||
| 397 | { | ||
| 398 | unsigned long delay = msecs_to_jiffies(delay_ms); | ||
| 399 | int cpu; | ||
| 400 | |||
| 401 | if (!static_branch_likely(&rdt_enable_key)) | ||
| 402 | return; | ||
| 403 | cpu = cpumask_any(&dom->cpu_mask); | ||
| 404 | dom->mbm_work_cpu = cpu; | ||
| 405 | schedule_delayed_work_on(cpu, &dom->mbm_over, delay); | ||
| 406 | } | ||
| 407 | |||
| 408 | static int dom_data_init(struct rdt_resource *r) | ||
| 409 | { | ||
| 410 | struct rmid_entry *entry = NULL; | ||
| 411 | int i, nr_rmids; | ||
| 412 | |||
| 413 | nr_rmids = r->num_rmid; | ||
| 414 | rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); | ||
| 415 | if (!rmid_ptrs) | ||
| 416 | return -ENOMEM; | ||
| 417 | |||
| 418 | for (i = 0; i < nr_rmids; i++) { | ||
| 419 | entry = &rmid_ptrs[i]; | ||
| 420 | INIT_LIST_HEAD(&entry->list); | ||
| 421 | |||
| 422 | entry->rmid = i; | ||
| 423 | list_add_tail(&entry->list, &rmid_free_lru); | ||
| 424 | } | ||
| 425 | |||
| 426 | /* | ||
| 427 | * RMID 0 is special and is always allocated. It's used for all | ||
| 428 | * tasks that are not monitored. | ||
| 429 | */ | ||
| 430 | entry = __rmid_entry(0); | ||
| 431 | list_del(&entry->list); | ||
| 432 | |||
| 433 | return 0; | ||
| 434 | } | ||
| 435 | |||
| 436 | static struct mon_evt llc_occupancy_event = { | ||
| 437 | .name = "llc_occupancy", | ||
| 438 | .evtid = QOS_L3_OCCUP_EVENT_ID, | ||
| 439 | }; | ||
| 440 | |||
| 441 | static struct mon_evt mbm_total_event = { | ||
| 442 | .name = "mbm_total_bytes", | ||
| 443 | .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, | ||
| 444 | }; | ||
| 445 | |||
| 446 | static struct mon_evt mbm_local_event = { | ||
| 447 | .name = "mbm_local_bytes", | ||
| 448 | .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, | ||
| 449 | }; | ||
| 450 | |||
| 451 | /* | ||
| 452 | * Initialize the event list for the resource. | ||
| 453 | * | ||
| 454 | * Note that MBM events are also part of the RDT_RESOURCE_L3 resource | ||
| 455 | * because, per the SDM, the total and local memory bandwidth | ||
| 456 | * are enumerated as part of L3 monitoring. | ||
| 457 | */ | ||
| 458 | static void l3_mon_evt_init(struct rdt_resource *r) | ||
| 459 | { | ||
| 460 | INIT_LIST_HEAD(&r->evt_list); | ||
| 461 | |||
| 462 | if (is_llc_occupancy_enabled()) | ||
| 463 | list_add_tail(&llc_occupancy_event.list, &r->evt_list); | ||
| 464 | if (is_mbm_total_enabled()) | ||
| 465 | list_add_tail(&mbm_total_event.list, &r->evt_list); | ||
| 466 | if (is_mbm_local_enabled()) | ||
| 467 | list_add_tail(&mbm_local_event.list, &r->evt_list); | ||
| 468 | } | ||
| 469 | |||
| 470 | int rdt_get_mon_l3_config(struct rdt_resource *r) | ||
| 471 | { | ||
| 472 | int ret; | ||
| 473 | |||
| 474 | r->mon_scale = boot_cpu_data.x86_cache_occ_scale; | ||
| 475 | r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; | ||
| 476 | |||
| 477 | /* | ||
| 478 | * A reasonable upper limit on the max threshold is the number | ||
| 479 | * of lines tagged per RMID if all RMIDs have the same number of | ||
| 480 | * lines tagged in the LLC. | ||
| 481 | * | ||
| 482 | * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. | ||
| 483 | */ | ||
| 484 | intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid; | ||
| 485 | |||
| 486 | /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ | ||
| 487 | intel_cqm_threshold /= r->mon_scale; | ||
| 488 | |||
| 489 | ret = dom_data_init(r); | ||
| 490 | if (ret) | ||
| 491 | return ret; | ||
| 492 | |||
| 493 | l3_mon_evt_init(r); | ||
| 494 | |||
| 495 | r->mon_capable = true; | ||
| 496 | r->mon_enabled = true; | ||
| 497 | |||
| 498 | return 0; | ||
| 499 | } | ||
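rdt_get_mon_l3_config() sets the default occupancy threshold to the LLC size split evenly across all RMIDs, then converts bytes into the hardware's counter units via mon_scale. A worked version of that arithmetic, reusing the 35MB/56-RMID figures from the comment and assuming a 64-byte occupancy scale (the real scale is enumerated by CPUID and varies between parts):

#include <stdio.h>

/*
 * Worked example of the default intel_cqm_threshold computation.
 * The cache size and RMID count come from the comment above; the
 * occupancy scale factor is an assumption for illustration only.
 */
int main(void)
{
	unsigned int cache_size_kb = 35 * 1024;	/* 35MB LLC */
	unsigned int num_rmid = 56;
	unsigned int mon_scale = 64;		/* assumed bytes per counter unit */
	unsigned int threshold;

	threshold = cache_size_kb * 1024 / num_rmid;	/* bytes per RMID: 655360 */
	threshold /= mon_scale;				/* hardware units: 10240 */

	printf("threshold = %u units (%u bytes)\n",
	       threshold, threshold * mon_scale);
	return 0;
}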
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 9257bd9dc664..a869d4a073c5 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | |||
| @@ -32,17 +32,25 @@ | |||
| 32 | 32 | ||
| 33 | #include <uapi/linux/magic.h> | 33 | #include <uapi/linux/magic.h> |
| 34 | 34 | ||
| 35 | #include <asm/intel_rdt.h> | 35 | #include <asm/intel_rdt_sched.h> |
| 36 | #include <asm/intel_rdt_common.h> | 36 | #include "intel_rdt.h" |
| 37 | 37 | ||
| 38 | DEFINE_STATIC_KEY_FALSE(rdt_enable_key); | 38 | DEFINE_STATIC_KEY_FALSE(rdt_enable_key); |
| 39 | struct kernfs_root *rdt_root; | 39 | DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); |
| 40 | DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); | ||
| 41 | static struct kernfs_root *rdt_root; | ||
| 40 | struct rdtgroup rdtgroup_default; | 42 | struct rdtgroup rdtgroup_default; |
| 41 | LIST_HEAD(rdt_all_groups); | 43 | LIST_HEAD(rdt_all_groups); |
| 42 | 44 | ||
| 43 | /* Kernel fs node for "info" directory under root */ | 45 | /* Kernel fs node for "info" directory under root */ |
| 44 | static struct kernfs_node *kn_info; | 46 | static struct kernfs_node *kn_info; |
| 45 | 47 | ||
| 48 | /* Kernel fs node for "mon_groups" directory under root */ | ||
| 49 | static struct kernfs_node *kn_mongrp; | ||
| 50 | |||
| 51 | /* Kernel fs node for "mon_data" directory under root */ | ||
| 52 | static struct kernfs_node *kn_mondata; | ||
| 53 | |||
| 46 | /* | 54 | /* |
| 47 | * Trivial allocator for CLOSIDs. Since h/w only supports a small number, | 55 | * Trivial allocator for CLOSIDs. Since h/w only supports a small number, |
| 48 | * we can keep a bitmap of free CLOSIDs in a single integer. | 56 | * we can keep a bitmap of free CLOSIDs in a single integer. |
| @@ -66,7 +74,7 @@ static void closid_init(void) | |||
| 66 | int rdt_min_closid = 32; | 74 | int rdt_min_closid = 32; |
| 67 | 75 | ||
| 68 | /* Compute rdt_min_closid across all resources */ | 76 | /* Compute rdt_min_closid across all resources */ |
| 69 | for_each_enabled_rdt_resource(r) | 77 | for_each_alloc_enabled_rdt_resource(r) |
| 70 | rdt_min_closid = min(rdt_min_closid, r->num_closid); | 78 | rdt_min_closid = min(rdt_min_closid, r->num_closid); |
| 71 | 79 | ||
| 72 | closid_free_map = BIT_MASK(rdt_min_closid) - 1; | 80 | closid_free_map = BIT_MASK(rdt_min_closid) - 1; |
| @@ -75,9 +83,9 @@ static void closid_init(void) | |||
| 75 | closid_free_map &= ~1; | 83 | closid_free_map &= ~1; |
| 76 | } | 84 | } |
| 77 | 85 | ||
| 78 | int closid_alloc(void) | 86 | static int closid_alloc(void) |
| 79 | { | 87 | { |
| 80 | int closid = ffs(closid_free_map); | 88 | u32 closid = ffs(closid_free_map); |
| 81 | 89 | ||
| 82 | if (closid == 0) | 90 | if (closid == 0) |
| 83 | return -ENOSPC; | 91 | return -ENOSPC; |
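closid_init()/closid_alloc() implement the "bitmap in a single integer" allocator described in the comment: each free CLOSID is a set bit, ffs() picks the lowest one (1-based, with 0 meaning the map is empty), and CLOSID 0 stays reserved for the default group. A stand-alone sketch of the same idea, with -1 standing in for -ENOSPC:

#include <stdio.h>
#include <strings.h>	/* ffs() */

static unsigned int closid_free_map;

static void closid_init(int num_closid)
{
	/* One bit per CLOSID; reserve CLOSID 0 for the default group. */
	closid_free_map = (1U << num_closid) - 1;
	closid_free_map &= ~1U;
}

static int closid_alloc(void)
{
	int closid = ffs(closid_free_map);	/* lowest free ID, 1-based */

	if (closid == 0)
		return -1;			/* no free CLOSIDs left */
	closid--;
	closid_free_map &= ~(1U << closid);	/* mark it used */
	return closid;
}

int main(void)
{
	closid_init(4);
	/* Prints 1, 2, 3, then -1 once the map is exhausted. */
	for (int i = 0; i < 4; i++)
		printf("%d\n", closid_alloc());
	return 0;
}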
| @@ -125,28 +133,6 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) | |||
| 125 | return 0; | 133 | return 0; |
| 126 | } | 134 | } |
| 127 | 135 | ||
| 128 | static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts, | ||
| 129 | int len) | ||
| 130 | { | ||
| 131 | struct rftype *rft; | ||
| 132 | int ret; | ||
| 133 | |||
| 134 | lockdep_assert_held(&rdtgroup_mutex); | ||
| 135 | |||
| 136 | for (rft = rfts; rft < rfts + len; rft++) { | ||
| 137 | ret = rdtgroup_add_file(kn, rft); | ||
| 138 | if (ret) | ||
| 139 | goto error; | ||
| 140 | } | ||
| 141 | |||
| 142 | return 0; | ||
| 143 | error: | ||
| 144 | pr_warn("Failed to add %s, err=%d\n", rft->name, ret); | ||
| 145 | while (--rft >= rfts) | ||
| 146 | kernfs_remove_by_name(kn, rft->name); | ||
| 147 | return ret; | ||
| 148 | } | ||
| 149 | |||
| 150 | static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) | 136 | static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) |
| 151 | { | 137 | { |
| 152 | struct kernfs_open_file *of = m->private; | 138 | struct kernfs_open_file *of = m->private; |
| @@ -174,6 +160,11 @@ static struct kernfs_ops rdtgroup_kf_single_ops = { | |||
| 174 | .seq_show = rdtgroup_seqfile_show, | 160 | .seq_show = rdtgroup_seqfile_show, |
| 175 | }; | 161 | }; |
| 176 | 162 | ||
| 163 | static struct kernfs_ops kf_mondata_ops = { | ||
| 164 | .atomic_write_len = PAGE_SIZE, | ||
| 165 | .seq_show = rdtgroup_mondata_show, | ||
| 166 | }; | ||
| 167 | |||
| 177 | static bool is_cpu_list(struct kernfs_open_file *of) | 168 | static bool is_cpu_list(struct kernfs_open_file *of) |
| 178 | { | 169 | { |
| 179 | struct rftype *rft = of->kn->priv; | 170 | struct rftype *rft = of->kn->priv; |
| @@ -203,13 +194,18 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, | |||
| 203 | /* | 194 | /* |
| 204 | * This is safe against intel_rdt_sched_in() called from __switch_to() | 195 | * This is safe against intel_rdt_sched_in() called from __switch_to() |
| 205 | * because __switch_to() is executed with interrupts disabled. A local call | 196 | * because __switch_to() is executed with interrupts disabled. A local call |
| 206 | * from rdt_update_closid() is protected against __switch_to() because | 197 | * from update_closid_rmid() is protected against __switch_to() because
| 207 | * preemption is disabled. | 198 | * preemption is disabled. |
| 208 | */ | 199 | */ |
| 209 | static void rdt_update_cpu_closid(void *closid) | 200 | static void update_cpu_closid_rmid(void *info) |
| 210 | { | 201 | { |
| 211 | if (closid) | 202 | struct rdtgroup *r = info; |
| 212 | this_cpu_write(cpu_closid, *(int *)closid); | 203 | |
| 204 | if (r) { | ||
| 205 | this_cpu_write(pqr_state.default_closid, r->closid); | ||
| 206 | this_cpu_write(pqr_state.default_rmid, r->mon.rmid); | ||
| 207 | } | ||
| 208 | |||
| 213 | /* | 209 | /* |
| 214 | * We cannot unconditionally write the MSR because the current | 210 | * We cannot unconditionally write the MSR because the current |
| 215 | * executing task might have its own closid selected. Just reuse | 211 | * executing task might have its own closid selected. Just reuse |
| @@ -221,28 +217,128 @@ static void rdt_update_cpu_closid(void *closid) | |||
| 221 | /* | 217 | /* |
| 222 | * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, | 218 | * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, |
| 223 | * | 219 | * |
| 224 | * Per task closids must have been set up before calling this function. | 220 | * Per task closids/rmids must have been set up before calling this function. |
| 225 | * | ||
| 226 | * The per cpu closids are updated with the smp function call, when @closid | ||
| 227 | * is not NULL. If @closid is NULL then all affected percpu closids must | ||
| 228 | * have been set up before calling this function. | ||
| 229 | */ | 221 | */ |
| 230 | static void | 222 | static void |
| 231 | rdt_update_closid(const struct cpumask *cpu_mask, int *closid) | 223 | update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) |
| 232 | { | 224 | { |
| 233 | int cpu = get_cpu(); | 225 | int cpu = get_cpu(); |
| 234 | 226 | ||
| 235 | if (cpumask_test_cpu(cpu, cpu_mask)) | 227 | if (cpumask_test_cpu(cpu, cpu_mask)) |
| 236 | rdt_update_cpu_closid(closid); | 228 | update_cpu_closid_rmid(r); |
| 237 | smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); | 229 | smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1); |
| 238 | put_cpu(); | 230 | put_cpu(); |
| 239 | } | 231 | } |
| 240 | 232 | ||
| 233 | static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, | ||
| 234 | cpumask_var_t tmpmask) | ||
| 235 | { | ||
| 236 | struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; | ||
| 237 | struct list_head *head; | ||
| 238 | |||
| 239 | /* Check whether cpus belong to parent ctrl group */ | ||
| 240 | cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); | ||
| 241 | if (cpumask_weight(tmpmask)) | ||
| 242 | return -EINVAL; | ||
| 243 | |||
| 244 | /* Check whether cpus are dropped from this group */ | ||
| 245 | cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); | ||
| 246 | if (cpumask_weight(tmpmask)) { | ||
| 247 | /* Give any dropped cpus to parent rdtgroup */ | ||
| 248 | cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); | ||
| 249 | update_closid_rmid(tmpmask, prgrp); | ||
| 250 | } | ||
| 251 | |||
| 252 | /* | ||
| 253 | * If we added cpus, remove them from previous group that owned them | ||
| 254 | * and update per-cpu rmid | ||
| 255 | */ | ||
| 256 | cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); | ||
| 257 | if (cpumask_weight(tmpmask)) { | ||
| 258 | head = &prgrp->mon.crdtgrp_list; | ||
| 259 | list_for_each_entry(crgrp, head, mon.crdtgrp_list) { | ||
| 260 | if (crgrp == rdtgrp) | ||
| 261 | continue; | ||
| 262 | cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, | ||
| 263 | tmpmask); | ||
| 264 | } | ||
| 265 | update_closid_rmid(tmpmask, rdtgrp); | ||
| 266 | } | ||
| 267 | |||
| 268 | /* Done pushing/pulling - update this group with new mask */ | ||
| 269 | cpumask_copy(&rdtgrp->cpu_mask, newmask); | ||
| 270 | |||
| 271 | return 0; | ||
| 272 | } | ||
| 273 | |||
| 274 | static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) | ||
| 275 | { | ||
| 276 | struct rdtgroup *crgrp; | ||
| 277 | |||
| 278 | cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); | ||
| 279 | /* update the child mon group masks as well */ | ||
| 280 | list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) | ||
| 281 | cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); | ||
| 282 | } | ||
| 283 | |||
| 284 | static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, | ||
| 285 | cpumask_var_t tmpmask, cpumask_var_t tmpmask1) | ||
| 286 | { | ||
| 287 | struct rdtgroup *r, *crgrp; | ||
| 288 | struct list_head *head; | ||
| 289 | |||
| 290 | /* Check whether cpus are dropped from this group */ | ||
| 291 | cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); | ||
| 292 | if (cpumask_weight(tmpmask)) { | ||
| 293 | /* Can't drop from default group */ | ||
| 294 | if (rdtgrp == &rdtgroup_default) | ||
| 295 | return -EINVAL; | ||
| 296 | |||
| 297 | /* Give any dropped cpus to rdtgroup_default */ | ||
| 298 | cpumask_or(&rdtgroup_default.cpu_mask, | ||
| 299 | &rdtgroup_default.cpu_mask, tmpmask); | ||
| 300 | update_closid_rmid(tmpmask, &rdtgroup_default); | ||
| 301 | } | ||
| 302 | |||
| 303 | /* | ||
| 304 | * If we added cpus, remove them from previous group and | ||
| 305 | * the prev group's child groups that owned them | ||
| 306 | * and update per-cpu closid/rmid. | ||
| 307 | */ | ||
| 308 | cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); | ||
| 309 | if (cpumask_weight(tmpmask)) { | ||
| 310 | list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { | ||
| 311 | if (r == rdtgrp) | ||
| 312 | continue; | ||
| 313 | cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); | ||
| 314 | if (cpumask_weight(tmpmask1)) | ||
| 315 | cpumask_rdtgrp_clear(r, tmpmask1); | ||
| 316 | } | ||
| 317 | update_closid_rmid(tmpmask, rdtgrp); | ||
| 318 | } | ||
| 319 | |||
| 320 | /* Done pushing/pulling - update this group with new mask */ | ||
| 321 | cpumask_copy(&rdtgrp->cpu_mask, newmask); | ||
| 322 | |||
| 323 | /* | ||
| 324 | * Clear child mon group masks since there is a new parent mask | ||
| 325 | * now and update the rmid for the cpus the child lost. | ||
| 326 | */ | ||
| 327 | head = &rdtgrp->mon.crdtgrp_list; | ||
| 328 | list_for_each_entry(crgrp, head, mon.crdtgrp_list) { | ||
| 329 | cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); | ||
| 330 | update_closid_rmid(tmpmask, rdtgrp); | ||
| 331 | cpumask_clear(&crgrp->cpu_mask); | ||
| 332 | } | ||
| 333 | |||
| 334 | return 0; | ||
| 335 | } | ||
| 336 | |||
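Both write paths above reduce to the same two set operations: CPUs present in the old mask but not the new one are pushed back to the parent (or default) group, and CPUs present in the new mask but not the old one are pulled away from whichever group currently owns them. A plain-integer sketch of that arithmetic (illustrative only; the real code operates on struct cpumask with cpumask_andnot()):

#include <stdio.h>

int main(void)
{
	unsigned int old_mask = 0x0f;	/* group currently owns CPUs 0-3 */
	unsigned int new_mask = 0x3c;	/* user writes CPUs 2-5 */

	unsigned int dropped = old_mask & ~new_mask;	/* CPUs 0,1 -> default group */
	unsigned int added   = new_mask & ~old_mask;	/* CPUs 4,5 <- their old owners */

	printf("dropped=%#x added=%#x\n", dropped, added);	/* 0x3 and 0x30 */
	return 0;
}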
| 241 | static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, | 337 | static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, |
| 242 | char *buf, size_t nbytes, loff_t off) | 338 | char *buf, size_t nbytes, loff_t off) |
| 243 | { | 339 | { |
| 244 | cpumask_var_t tmpmask, newmask; | 340 | cpumask_var_t tmpmask, newmask, tmpmask1; |
| 245 | struct rdtgroup *rdtgrp, *r; | 341 | struct rdtgroup *rdtgrp; |
| 246 | int ret; | 342 | int ret; |
| 247 | 343 | ||
| 248 | if (!buf) | 344 | if (!buf) |
| @@ -254,6 +350,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, | |||
| 254 | free_cpumask_var(tmpmask); | 350 | free_cpumask_var(tmpmask); |
| 255 | return -ENOMEM; | 351 | return -ENOMEM; |
| 256 | } | 352 | } |
| 353 | if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { | ||
| 354 | free_cpumask_var(tmpmask); | ||
| 355 | free_cpumask_var(newmask); | ||
| 356 | return -ENOMEM; | ||
| 357 | } | ||
| 257 | 358 | ||
| 258 | rdtgrp = rdtgroup_kn_lock_live(of->kn); | 359 | rdtgrp = rdtgroup_kn_lock_live(of->kn); |
| 259 | if (!rdtgrp) { | 360 | if (!rdtgrp) { |
| @@ -276,41 +377,18 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, | |||
| 276 | goto unlock; | 377 | goto unlock; |
| 277 | } | 378 | } |
| 278 | 379 | ||
| 279 | /* Check whether cpus are dropped from this group */ | 380 | if (rdtgrp->type == RDTCTRL_GROUP) |
| 280 | cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); | 381 | ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); |
| 281 | if (cpumask_weight(tmpmask)) { | 382 | else if (rdtgrp->type == RDTMON_GROUP) |
| 282 | /* Can't drop from default group */ | 383 | ret = cpus_mon_write(rdtgrp, newmask, tmpmask); |
| 283 | if (rdtgrp == &rdtgroup_default) { | 384 | else |
| 284 | ret = -EINVAL; | 385 | ret = -EINVAL; |
| 285 | goto unlock; | ||
| 286 | } | ||
| 287 | /* Give any dropped cpus to rdtgroup_default */ | ||
| 288 | cpumask_or(&rdtgroup_default.cpu_mask, | ||
| 289 | &rdtgroup_default.cpu_mask, tmpmask); | ||
| 290 | rdt_update_closid(tmpmask, &rdtgroup_default.closid); | ||
| 291 | } | ||
| 292 | |||
| 293 | /* | ||
| 294 | * If we added cpus, remove them from previous group that owned them | ||
| 295 | * and update per-cpu closid | ||
| 296 | */ | ||
| 297 | cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); | ||
| 298 | if (cpumask_weight(tmpmask)) { | ||
| 299 | list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { | ||
| 300 | if (r == rdtgrp) | ||
| 301 | continue; | ||
| 302 | cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); | ||
| 303 | } | ||
| 304 | rdt_update_closid(tmpmask, &rdtgrp->closid); | ||
| 305 | } | ||
| 306 | |||
| 307 | /* Done pushing/pulling - update this group with new mask */ | ||
| 308 | cpumask_copy(&rdtgrp->cpu_mask, newmask); | ||
| 309 | 386 | ||
| 310 | unlock: | 387 | unlock: |
| 311 | rdtgroup_kn_unlock(of->kn); | 388 | rdtgroup_kn_unlock(of->kn); |
| 312 | free_cpumask_var(tmpmask); | 389 | free_cpumask_var(tmpmask); |
| 313 | free_cpumask_var(newmask); | 390 | free_cpumask_var(newmask); |
| 391 | free_cpumask_var(tmpmask1); | ||
| 314 | 392 | ||
| 315 | return ret ?: nbytes; | 393 | return ret ?: nbytes; |
| 316 | } | 394 | } |
| @@ -336,6 +414,7 @@ static void move_myself(struct callback_head *head) | |||
| 336 | if (atomic_dec_and_test(&rdtgrp->waitcount) && | 414 | if (atomic_dec_and_test(&rdtgrp->waitcount) && |
| 337 | (rdtgrp->flags & RDT_DELETED)) { | 415 | (rdtgrp->flags & RDT_DELETED)) { |
| 338 | current->closid = 0; | 416 | current->closid = 0; |
| 417 | current->rmid = 0; | ||
| 339 | kfree(rdtgrp); | 418 | kfree(rdtgrp); |
| 340 | } | 419 | } |
| 341 | 420 | ||
| @@ -374,7 +453,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk, | |||
| 374 | atomic_dec(&rdtgrp->waitcount); | 453 | atomic_dec(&rdtgrp->waitcount); |
| 375 | kfree(callback); | 454 | kfree(callback); |
| 376 | } else { | 455 | } else { |
| 377 | tsk->closid = rdtgrp->closid; | 456 | /* |
| 457 | * For ctrl_mon groups move both closid and rmid. | ||
| 458 | * For monitor groups, can move the tasks only from | ||
| 459 | * their parent CTRL group. | ||
| 460 | */ | ||
| 461 | if (rdtgrp->type == RDTCTRL_GROUP) { | ||
| 462 | tsk->closid = rdtgrp->closid; | ||
| 463 | tsk->rmid = rdtgrp->mon.rmid; | ||
| 464 | } else if (rdtgrp->type == RDTMON_GROUP) { | ||
| 465 | if (rdtgrp->mon.parent->closid == tsk->closid) | ||
| 466 | tsk->rmid = rdtgrp->mon.rmid; | ||
| 467 | else | ||
| 468 | ret = -EINVAL; | ||
| 469 | } | ||
| 378 | } | 470 | } |
| 379 | return ret; | 471 | return ret; |
| 380 | } | 472 | } |
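The rules encoded in __rdtgroup_move_task(): moving a task into a ctrl_mon group rewrites both its closid and rmid, while moving it into a monitor group rewrites only the rmid, and only if the task already belongs to that group's parent ctrl_mon group (same closid). A compact restatement with simplified, made-up types (not the kernel's structures):

#include <stdio.h>

struct task { int closid, rmid; };
struct group { int type, closid, rmid, parent_closid; };

enum { CTRL_GROUP, MON_GROUP };

/* Returns 0 on success, -1 if the move is not allowed. */
static int move_task(struct task *t, const struct group *g)
{
	if (g->type == CTRL_GROUP) {
		t->closid = g->closid;	/* ctrl_mon: both IDs follow the group */
		t->rmid = g->rmid;
		return 0;
	}
	/* mon group: only tasks of the parent ctrl_mon group may join */
	if (t->closid != g->parent_closid)
		return -1;
	t->rmid = g->rmid;
	return 0;
}

int main(void)
{
	struct task t = { .closid = 2, .rmid = 5 };
	struct group mon = { .type = MON_GROUP, .rmid = 9, .parent_closid = 3 };

	printf("%d\n", move_task(&t, &mon));	/* -1: wrong parent group */
	t.closid = 3;
	printf("%d\n", move_task(&t, &mon));	/* 0: rmid becomes 9 */
	return 0;
}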
| @@ -454,7 +546,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) | |||
| 454 | 546 | ||
| 455 | rcu_read_lock(); | 547 | rcu_read_lock(); |
| 456 | for_each_process_thread(p, t) { | 548 | for_each_process_thread(p, t) { |
| 457 | if (t->closid == r->closid) | 549 | if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || |
| 550 | (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) | ||
| 458 | seq_printf(s, "%d\n", t->pid); | 551 | seq_printf(s, "%d\n", t->pid); |
| 459 | } | 552 | } |
| 460 | rcu_read_unlock(); | 553 | rcu_read_unlock(); |
| @@ -476,39 +569,6 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, | |||
| 476 | return ret; | 569 | return ret; |
| 477 | } | 570 | } |
| 478 | 571 | ||
| 479 | /* Files in each rdtgroup */ | ||
| 480 | static struct rftype rdtgroup_base_files[] = { | ||
| 481 | { | ||
| 482 | .name = "cpus", | ||
| 483 | .mode = 0644, | ||
| 484 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 485 | .write = rdtgroup_cpus_write, | ||
| 486 | .seq_show = rdtgroup_cpus_show, | ||
| 487 | }, | ||
| 488 | { | ||
| 489 | .name = "cpus_list", | ||
| 490 | .mode = 0644, | ||
| 491 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 492 | .write = rdtgroup_cpus_write, | ||
| 493 | .seq_show = rdtgroup_cpus_show, | ||
| 494 | .flags = RFTYPE_FLAGS_CPUS_LIST, | ||
| 495 | }, | ||
| 496 | { | ||
| 497 | .name = "tasks", | ||
| 498 | .mode = 0644, | ||
| 499 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 500 | .write = rdtgroup_tasks_write, | ||
| 501 | .seq_show = rdtgroup_tasks_show, | ||
| 502 | }, | ||
| 503 | { | ||
| 504 | .name = "schemata", | ||
| 505 | .mode = 0644, | ||
| 506 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 507 | .write = rdtgroup_schemata_write, | ||
| 508 | .seq_show = rdtgroup_schemata_show, | ||
| 509 | }, | ||
| 510 | }; | ||
| 511 | |||
| 512 | static int rdt_num_closids_show(struct kernfs_open_file *of, | 572 | static int rdt_num_closids_show(struct kernfs_open_file *of, |
| 513 | struct seq_file *seq, void *v) | 573 | struct seq_file *seq, void *v) |
| 514 | { | 574 | { |
| @@ -536,6 +596,15 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, | |||
| 536 | return 0; | 596 | return 0; |
| 537 | } | 597 | } |
| 538 | 598 | ||
| 599 | static int rdt_shareable_bits_show(struct kernfs_open_file *of, | ||
| 600 | struct seq_file *seq, void *v) | ||
| 601 | { | ||
| 602 | struct rdt_resource *r = of->kn->parent->priv; | ||
| 603 | |||
| 604 | seq_printf(seq, "%x\n", r->cache.shareable_bits); | ||
| 605 | return 0; | ||
| 606 | } | ||
| 607 | |||
| 539 | static int rdt_min_bw_show(struct kernfs_open_file *of, | 608 | static int rdt_min_bw_show(struct kernfs_open_file *of, |
| 540 | struct seq_file *seq, void *v) | 609 | struct seq_file *seq, void *v) |
| 541 | { | 610 | { |
| @@ -545,6 +614,28 @@ static int rdt_min_bw_show(struct kernfs_open_file *of, | |||
| 545 | return 0; | 614 | return 0; |
| 546 | } | 615 | } |
| 547 | 616 | ||
| 617 | static int rdt_num_rmids_show(struct kernfs_open_file *of, | ||
| 618 | struct seq_file *seq, void *v) | ||
| 619 | { | ||
| 620 | struct rdt_resource *r = of->kn->parent->priv; | ||
| 621 | |||
| 622 | seq_printf(seq, "%d\n", r->num_rmid); | ||
| 623 | |||
| 624 | return 0; | ||
| 625 | } | ||
| 626 | |||
| 627 | static int rdt_mon_features_show(struct kernfs_open_file *of, | ||
| 628 | struct seq_file *seq, void *v) | ||
| 629 | { | ||
| 630 | struct rdt_resource *r = of->kn->parent->priv; | ||
| 631 | struct mon_evt *mevt; | ||
| 632 | |||
| 633 | list_for_each_entry(mevt, &r->evt_list, list) | ||
| 634 | seq_printf(seq, "%s\n", mevt->name); | ||
| 635 | |||
| 636 | return 0; | ||
| 637 | } | ||
| 638 | |||
| 548 | static int rdt_bw_gran_show(struct kernfs_open_file *of, | 639 | static int rdt_bw_gran_show(struct kernfs_open_file *of, |
| 549 | struct seq_file *seq, void *v) | 640 | struct seq_file *seq, void *v) |
| 550 | { | 641 | { |
| @@ -563,74 +654,200 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, | |||
| 563 | return 0; | 654 | return 0; |
| 564 | } | 655 | } |
| 565 | 656 | ||
| 657 | static int max_threshold_occ_show(struct kernfs_open_file *of, | ||
| 658 | struct seq_file *seq, void *v) | ||
| 659 | { | ||
| 660 | struct rdt_resource *r = of->kn->parent->priv; | ||
| 661 | |||
| 662 | seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale); | ||
| 663 | |||
| 664 | return 0; | ||
| 665 | } | ||
| 666 | |||
| 667 | static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, | ||
| 668 | char *buf, size_t nbytes, loff_t off) | ||
| 669 | { | ||
| 670 | struct rdt_resource *r = of->kn->parent->priv; | ||
| 671 | unsigned int bytes; | ||
| 672 | int ret; | ||
| 673 | |||
| 674 | ret = kstrtouint(buf, 0, &bytes); | ||
| 675 | if (ret) | ||
| 676 | return ret; | ||
| 677 | |||
| 678 | if (bytes > (boot_cpu_data.x86_cache_size * 1024)) | ||
| 679 | return -EINVAL; | ||
| 680 | |||
| 681 | intel_cqm_threshold = bytes / r->mon_scale; | ||
| 682 | |||
| 683 | return nbytes; | ||
| 684 | } | ||
| 685 | |||
| 566 | /* rdtgroup information files for one cache resource. */ | 686 | /* rdtgroup information files for one cache resource. */ |
| 567 | static struct rftype res_cache_info_files[] = { | 687 | static struct rftype res_common_files[] = { |
| 568 | { | 688 | { |
| 569 | .name = "num_closids", | 689 | .name = "num_closids", |
| 570 | .mode = 0444, | 690 | .mode = 0444, |
| 571 | .kf_ops = &rdtgroup_kf_single_ops, | 691 | .kf_ops = &rdtgroup_kf_single_ops, |
| 572 | .seq_show = rdt_num_closids_show, | 692 | .seq_show = rdt_num_closids_show, |
| 693 | .fflags = RF_CTRL_INFO, | ||
| 694 | }, | ||
| 695 | { | ||
| 696 | .name = "mon_features", | ||
| 697 | .mode = 0444, | ||
| 698 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 699 | .seq_show = rdt_mon_features_show, | ||
| 700 | .fflags = RF_MON_INFO, | ||
| 701 | }, | ||
| 702 | { | ||
| 703 | .name = "num_rmids", | ||
| 704 | .mode = 0444, | ||
| 705 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 706 | .seq_show = rdt_num_rmids_show, | ||
| 707 | .fflags = RF_MON_INFO, | ||
| 573 | }, | 708 | }, |
| 574 | { | 709 | { |
| 575 | .name = "cbm_mask", | 710 | .name = "cbm_mask", |
| 576 | .mode = 0444, | 711 | .mode = 0444, |
| 577 | .kf_ops = &rdtgroup_kf_single_ops, | 712 | .kf_ops = &rdtgroup_kf_single_ops, |
| 578 | .seq_show = rdt_default_ctrl_show, | 713 | .seq_show = rdt_default_ctrl_show, |
| 714 | .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, | ||
| 579 | }, | 715 | }, |
| 580 | { | 716 | { |
| 581 | .name = "min_cbm_bits", | 717 | .name = "min_cbm_bits", |
| 582 | .mode = 0444, | 718 | .mode = 0444, |
| 583 | .kf_ops = &rdtgroup_kf_single_ops, | 719 | .kf_ops = &rdtgroup_kf_single_ops, |
| 584 | .seq_show = rdt_min_cbm_bits_show, | 720 | .seq_show = rdt_min_cbm_bits_show, |
| 721 | .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, | ||
| 585 | }, | 722 | }, |
| 586 | }; | ||
| 587 | |||
| 588 | /* rdtgroup information files for memory bandwidth. */ | ||
| 589 | static struct rftype res_mba_info_files[] = { | ||
| 590 | { | 723 | { |
| 591 | .name = "num_closids", | 724 | .name = "shareable_bits", |
| 592 | .mode = 0444, | 725 | .mode = 0444, |
| 593 | .kf_ops = &rdtgroup_kf_single_ops, | 726 | .kf_ops = &rdtgroup_kf_single_ops, |
| 594 | .seq_show = rdt_num_closids_show, | 727 | .seq_show = rdt_shareable_bits_show, |
| 728 | .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, | ||
| 595 | }, | 729 | }, |
| 596 | { | 730 | { |
| 597 | .name = "min_bandwidth", | 731 | .name = "min_bandwidth", |
| 598 | .mode = 0444, | 732 | .mode = 0444, |
| 599 | .kf_ops = &rdtgroup_kf_single_ops, | 733 | .kf_ops = &rdtgroup_kf_single_ops, |
| 600 | .seq_show = rdt_min_bw_show, | 734 | .seq_show = rdt_min_bw_show, |
| 735 | .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, | ||
| 601 | }, | 736 | }, |
| 602 | { | 737 | { |
| 603 | .name = "bandwidth_gran", | 738 | .name = "bandwidth_gran", |
| 604 | .mode = 0444, | 739 | .mode = 0444, |
| 605 | .kf_ops = &rdtgroup_kf_single_ops, | 740 | .kf_ops = &rdtgroup_kf_single_ops, |
| 606 | .seq_show = rdt_bw_gran_show, | 741 | .seq_show = rdt_bw_gran_show, |
| 742 | .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, | ||
| 607 | }, | 743 | }, |
| 608 | { | 744 | { |
| 609 | .name = "delay_linear", | 745 | .name = "delay_linear", |
| 610 | .mode = 0444, | 746 | .mode = 0444, |
| 611 | .kf_ops = &rdtgroup_kf_single_ops, | 747 | .kf_ops = &rdtgroup_kf_single_ops, |
| 612 | .seq_show = rdt_delay_linear_show, | 748 | .seq_show = rdt_delay_linear_show, |
| 749 | .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, | ||
| 750 | }, | ||
| 751 | { | ||
| 752 | .name = "max_threshold_occupancy", | ||
| 753 | .mode = 0644, | ||
| 754 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 755 | .write = max_threshold_occ_write, | ||
| 756 | .seq_show = max_threshold_occ_show, | ||
| 757 | .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, | ||
| 758 | }, | ||
| 759 | { | ||
| 760 | .name = "cpus", | ||
| 761 | .mode = 0644, | ||
| 762 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 763 | .write = rdtgroup_cpus_write, | ||
| 764 | .seq_show = rdtgroup_cpus_show, | ||
| 765 | .fflags = RFTYPE_BASE, | ||
| 766 | }, | ||
| 767 | { | ||
| 768 | .name = "cpus_list", | ||
| 769 | .mode = 0644, | ||
| 770 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 771 | .write = rdtgroup_cpus_write, | ||
| 772 | .seq_show = rdtgroup_cpus_show, | ||
| 773 | .flags = RFTYPE_FLAGS_CPUS_LIST, | ||
| 774 | .fflags = RFTYPE_BASE, | ||
| 775 | }, | ||
| 776 | { | ||
| 777 | .name = "tasks", | ||
| 778 | .mode = 0644, | ||
| 779 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 780 | .write = rdtgroup_tasks_write, | ||
| 781 | .seq_show = rdtgroup_tasks_show, | ||
| 782 | .fflags = RFTYPE_BASE, | ||
| 783 | }, | ||
| 784 | { | ||
| 785 | .name = "schemata", | ||
| 786 | .mode = 0644, | ||
| 787 | .kf_ops = &rdtgroup_kf_single_ops, | ||
| 788 | .write = rdtgroup_schemata_write, | ||
| 789 | .seq_show = rdtgroup_schemata_show, | ||
| 790 | .fflags = RF_CTRL_BASE, | ||
| 613 | }, | 791 | }, |
| 614 | }; | 792 | }; |
| 615 | 793 | ||
| 616 | void rdt_get_mba_infofile(struct rdt_resource *r) | 794 | static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) |
| 617 | { | 795 | { |
| 618 | r->info_files = res_mba_info_files; | 796 | struct rftype *rfts, *rft; |
| 619 | r->nr_info_files = ARRAY_SIZE(res_mba_info_files); | 797 | int ret, len; |
| 798 | |||
| 799 | rfts = res_common_files; | ||
| 800 | len = ARRAY_SIZE(res_common_files); | ||
| 801 | |||
| 802 | lockdep_assert_held(&rdtgroup_mutex); | ||
| 803 | |||
| 804 | for (rft = rfts; rft < rfts + len; rft++) { | ||
| 805 | if ((fflags & rft->fflags) == rft->fflags) { | ||
| 806 | ret = rdtgroup_add_file(kn, rft); | ||
| 807 | if (ret) | ||
| 808 | goto error; | ||
| 809 | } | ||
| 810 | } | ||
| 811 | |||
| 812 | return 0; | ||
| 813 | error: | ||
| 814 | pr_warn("Failed to add %s, err=%d\n", rft->name, ret); | ||
| 815 | while (--rft >= rfts) { | ||
| 816 | if ((fflags & rft->fflags) == rft->fflags) | ||
| 817 | kernfs_remove_by_name(kn, rft->name); | ||
| 818 | } | ||
| 819 | return ret; | ||
| 620 | } | 820 | } |
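The filter in rdtgroup_add_files() is a subset test: a file is instantiated only when every flag it declares in rft->fflags is also present in the fflags the caller passes in. A tiny illustration with made-up flag values (the real RF_*/RFTYPE_* constants are defined elsewhere in the patch):

#include <stdbool.h>
#include <stdio.h>

#define F_CTRL_INFO	0x1	/* illustrative values, not the kernel's */
#define F_MON_INFO	0x2
#define F_RES_CACHE	0x4

static bool file_wanted(unsigned long caller, unsigned long file)
{
	/* Create the file only if the caller provides every required flag. */
	return (caller & file) == file;
}

int main(void)
{
	unsigned long caller = F_CTRL_INFO | F_RES_CACHE;   /* e.g. an L3 info dir */

	printf("%d\n", file_wanted(caller, F_CTRL_INFO));		 /* 1 */
	printf("%d\n", file_wanted(caller, F_CTRL_INFO | F_RES_CACHE));  /* 1 */
	printf("%d\n", file_wanted(caller, F_MON_INFO));		 /* 0 */
	return 0;
}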
| 621 | 821 | ||
| 622 | void rdt_get_cache_infofile(struct rdt_resource *r) | 822 | static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, |
| 823 | unsigned long fflags) | ||
| 623 | { | 824 | { |
| 624 | r->info_files = res_cache_info_files; | 825 | struct kernfs_node *kn_subdir; |
| 625 | r->nr_info_files = ARRAY_SIZE(res_cache_info_files); | 826 | int ret; |
| 827 | |||
| 828 | kn_subdir = kernfs_create_dir(kn_info, name, | ||
| 829 | kn_info->mode, r); | ||
| 830 | if (IS_ERR(kn_subdir)) | ||
| 831 | return PTR_ERR(kn_subdir); | ||
| 832 | |||
| 833 | kernfs_get(kn_subdir); | ||
| 834 | ret = rdtgroup_kn_set_ugid(kn_subdir); | ||
| 835 | if (ret) | ||
| 836 | return ret; | ||
| 837 | |||
| 838 | ret = rdtgroup_add_files(kn_subdir, fflags); | ||
| 839 | if (!ret) | ||
| 840 | kernfs_activate(kn_subdir); | ||
| 841 | |||
| 842 | return ret; | ||
| 626 | } | 843 | } |
| 627 | 844 | ||
| 628 | static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) | 845 | static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) |
| 629 | { | 846 | { |
| 630 | struct kernfs_node *kn_subdir; | ||
| 631 | struct rftype *res_info_files; | ||
| 632 | struct rdt_resource *r; | 847 | struct rdt_resource *r; |
| 633 | int ret, len; | 848 | unsigned long fflags; |
| 849 | char name[32]; | ||
| 850 | int ret; | ||
| 634 | 851 | ||
| 635 | /* create the directory */ | 852 | /* create the directory */ |
| 636 | kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); | 853 | kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); |
| @@ -638,25 +855,19 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) | |||
| 638 | return PTR_ERR(kn_info); | 855 | return PTR_ERR(kn_info); |
| 639 | kernfs_get(kn_info); | 856 | kernfs_get(kn_info); |
| 640 | 857 | ||
| 641 | for_each_enabled_rdt_resource(r) { | 858 | for_each_alloc_enabled_rdt_resource(r) { |
| 642 | kn_subdir = kernfs_create_dir(kn_info, r->name, | 859 | fflags = r->fflags | RF_CTRL_INFO; |
| 643 | kn_info->mode, r); | 860 | ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); |
| 644 | if (IS_ERR(kn_subdir)) { | ||
| 645 | ret = PTR_ERR(kn_subdir); | ||
| 646 | goto out_destroy; | ||
| 647 | } | ||
| 648 | kernfs_get(kn_subdir); | ||
| 649 | ret = rdtgroup_kn_set_ugid(kn_subdir); | ||
| 650 | if (ret) | 861 | if (ret) |
| 651 | goto out_destroy; | 862 | goto out_destroy; |
| 863 | } | ||
| 652 | 864 | ||
| 653 | res_info_files = r->info_files; | 865 | for_each_mon_enabled_rdt_resource(r) { |
| 654 | len = r->nr_info_files; | 866 | fflags = r->fflags | RF_MON_INFO; |
| 655 | 867 | sprintf(name, "%s_MON", r->name); | |
| 656 | ret = rdtgroup_add_files(kn_subdir, res_info_files, len); | 868 | ret = rdtgroup_mkdir_info_resdir(r, name, fflags); |
| 657 | if (ret) | 869 | if (ret) |
| 658 | goto out_destroy; | 870 | goto out_destroy; |
| 659 | kernfs_activate(kn_subdir); | ||
| 660 | } | 871 | } |
| 661 | 872 | ||
| 662 | /* | 873 | /* |
| @@ -678,6 +889,39 @@ out_destroy: | |||
| 678 | return ret; | 889 | return ret; |
| 679 | } | 890 | } |
| 680 | 891 | ||
| 892 | static int | ||
| 893 | mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, | ||
| 894 | char *name, struct kernfs_node **dest_kn) | ||
| 895 | { | ||
| 896 | struct kernfs_node *kn; | ||
| 897 | int ret; | ||
| 898 | |||
| 899 | /* create the directory */ | ||
| 900 | kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); | ||
| 901 | if (IS_ERR(kn)) | ||
| 902 | return PTR_ERR(kn); | ||
| 903 | |||
| 904 | if (dest_kn) | ||
| 905 | *dest_kn = kn; | ||
| 906 | |||
| 907 | /* | ||
| 908 | * This extra ref will be put in kernfs_remove() and guarantees | ||
| 909 | * that @rdtgrp->kn is always accessible. | ||
| 910 | */ | ||
| 911 | kernfs_get(kn); | ||
| 912 | |||
| 913 | ret = rdtgroup_kn_set_ugid(kn); | ||
| 914 | if (ret) | ||
| 915 | goto out_destroy; | ||
| 916 | |||
| 917 | kernfs_activate(kn); | ||
| 918 | |||
| 919 | return 0; | ||
| 920 | |||
| 921 | out_destroy: | ||
| 922 | kernfs_remove(kn); | ||
| 923 | return ret; | ||
| 924 | } | ||
| 681 | static void l3_qos_cfg_update(void *arg) | 925 | static void l3_qos_cfg_update(void *arg) |
| 682 | { | 926 | { |
| 683 | bool *enable = arg; | 927 | bool *enable = arg; |
| @@ -718,14 +962,15 @@ static int cdp_enable(void) | |||
| 718 | struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; | 962 | struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; |
| 719 | int ret; | 963 | int ret; |
| 720 | 964 | ||
| 721 | if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) | 965 | if (!r_l3->alloc_capable || !r_l3data->alloc_capable || |
| 966 | !r_l3code->alloc_capable) | ||
| 722 | return -EINVAL; | 967 | return -EINVAL; |
| 723 | 968 | ||
| 724 | ret = set_l3_qos_cfg(r_l3, true); | 969 | ret = set_l3_qos_cfg(r_l3, true); |
| 725 | if (!ret) { | 970 | if (!ret) { |
| 726 | r_l3->enabled = false; | 971 | r_l3->alloc_enabled = false; |
| 727 | r_l3data->enabled = true; | 972 | r_l3data->alloc_enabled = true; |
| 728 | r_l3code->enabled = true; | 973 | r_l3code->alloc_enabled = true; |
| 729 | } | 974 | } |
| 730 | return ret; | 975 | return ret; |
| 731 | } | 976 | } |
| @@ -734,11 +979,11 @@ static void cdp_disable(void) | |||
| 734 | { | 979 | { |
| 735 | struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; | 980 | struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; |
| 736 | 981 | ||
| 737 | r->enabled = r->capable; | 982 | r->alloc_enabled = r->alloc_capable; |
| 738 | 983 | ||
| 739 | if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { | 984 | if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) { |
| 740 | rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; | 985 | rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false; |
| 741 | rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; | 986 | rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false; |
| 742 | set_l3_qos_cfg(r, false); | 987 | set_l3_qos_cfg(r, false); |
| 743 | } | 988 | } |
| 744 | } | 989 | } |
| @@ -823,10 +1068,16 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) | |||
| 823 | } | 1068 | } |
| 824 | } | 1069 | } |
| 825 | 1070 | ||
| 1071 | static int mkdir_mondata_all(struct kernfs_node *parent_kn, | ||
| 1072 | struct rdtgroup *prgrp, | ||
| 1073 | struct kernfs_node **mon_data_kn); | ||
| 1074 | |||
| 826 | static struct dentry *rdt_mount(struct file_system_type *fs_type, | 1075 | static struct dentry *rdt_mount(struct file_system_type *fs_type, |
| 827 | int flags, const char *unused_dev_name, | 1076 | int flags, const char *unused_dev_name, |
| 828 | void *data) | 1077 | void *data) |
| 829 | { | 1078 | { |
| 1079 | struct rdt_domain *dom; | ||
| 1080 | struct rdt_resource *r; | ||
| 830 | struct dentry *dentry; | 1081 | struct dentry *dentry; |
| 831 | int ret; | 1082 | int ret; |
| 832 | 1083 | ||
| @@ -853,15 +1104,54 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, | |||
| 853 | goto out_cdp; | 1104 | goto out_cdp; |
| 854 | } | 1105 | } |
| 855 | 1106 | ||
| 1107 | if (rdt_mon_capable) { | ||
| 1108 | ret = mongroup_create_dir(rdtgroup_default.kn, | ||
| 1109 | NULL, "mon_groups", | ||
| 1110 | &kn_mongrp); | ||
| 1111 | if (ret) { | ||
| 1112 | dentry = ERR_PTR(ret); | ||
| 1113 | goto out_info; | ||
| 1114 | } | ||
| 1115 | kernfs_get(kn_mongrp); | ||
| 1116 | |||
| 1117 | ret = mkdir_mondata_all(rdtgroup_default.kn, | ||
| 1118 | &rdtgroup_default, &kn_mondata); | ||
| 1119 | if (ret) { | ||
| 1120 | dentry = ERR_PTR(ret); | ||
| 1121 | goto out_mongrp; | ||
| 1122 | } | ||
| 1123 | kernfs_get(kn_mondata); | ||
| 1124 | rdtgroup_default.mon.mon_data_kn = kn_mondata; | ||
| 1125 | } | ||
| 1126 | |||
| 856 | dentry = kernfs_mount(fs_type, flags, rdt_root, | 1127 | dentry = kernfs_mount(fs_type, flags, rdt_root, |
| 857 | RDTGROUP_SUPER_MAGIC, NULL); | 1128 | RDTGROUP_SUPER_MAGIC, NULL); |
| 858 | if (IS_ERR(dentry)) | 1129 | if (IS_ERR(dentry)) |
| 859 | goto out_destroy; | 1130 | goto out_mondata; |
| 1131 | |||
| 1132 | if (rdt_alloc_capable) | ||
| 1133 | static_branch_enable(&rdt_alloc_enable_key); | ||
| 1134 | if (rdt_mon_capable) | ||
| 1135 | static_branch_enable(&rdt_mon_enable_key); | ||
| 1136 | |||
| 1137 | if (rdt_alloc_capable || rdt_mon_capable) | ||
| 1138 | static_branch_enable(&rdt_enable_key); | ||
| 1139 | |||
| 1140 | if (is_mbm_enabled()) { | ||
| 1141 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | ||
| 1142 | list_for_each_entry(dom, &r->domains, list) | ||
| 1143 | mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); | ||
| 1144 | } | ||
| 860 | 1145 | ||
| 861 | static_branch_enable(&rdt_enable_key); | ||
| 862 | goto out; | 1146 | goto out; |
| 863 | 1147 | ||
| 864 | out_destroy: | 1148 | out_mondata: |
| 1149 | if (rdt_mon_capable) | ||
| 1150 | kernfs_remove(kn_mondata); | ||
| 1151 | out_mongrp: | ||
| 1152 | if (rdt_mon_capable) | ||
| 1153 | kernfs_remove(kn_mongrp); | ||
| 1154 | out_info: | ||
| 865 | kernfs_remove(kn_info); | 1155 | kernfs_remove(kn_info); |
| 866 | out_cdp: | 1156 | out_cdp: |
| 867 | cdp_disable(); | 1157 | cdp_disable(); |
| @@ -909,6 +1199,18 @@ static int reset_all_ctrls(struct rdt_resource *r) | |||
| 909 | return 0; | 1199 | return 0; |
| 910 | } | 1200 | } |
| 911 | 1201 | ||
| 1202 | static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) | ||
| 1203 | { | ||
| 1204 | return (rdt_alloc_capable && | ||
| 1205 | (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); | ||
| 1206 | } | ||
| 1207 | |||
| 1208 | static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) | ||
| 1209 | { | ||
| 1210 | return (rdt_mon_capable && | ||
| 1211 | (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); | ||
| 1212 | } | ||
| 1213 | |||
| 912 | /* | 1214 | /* |
| 913 | * Move tasks from one to the other group. If @from is NULL, then all tasks | 1215 | * Move tasks from one to the other group. If @from is NULL, then all tasks |
| 914 | * in the systems are moved unconditionally (used for teardown). | 1216 | * in the systems are moved unconditionally (used for teardown). |
| @@ -924,8 +1226,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, | |||
| 924 | 1226 | ||
| 925 | read_lock(&tasklist_lock); | 1227 | read_lock(&tasklist_lock); |
| 926 | for_each_process_thread(p, t) { | 1228 | for_each_process_thread(p, t) { |
| 927 | if (!from || t->closid == from->closid) { | 1229 | if (!from || is_closid_match(t, from) || |
| 1230 | is_rmid_match(t, from)) { | ||
| 928 | t->closid = to->closid; | 1231 | t->closid = to->closid; |
| 1232 | t->rmid = to->mon.rmid; | ||
| 1233 | |||
| 929 | #ifdef CONFIG_SMP | 1234 | #ifdef CONFIG_SMP |
| 930 | /* | 1235 | /* |
| 931 | * This is safe on x86 w/o barriers as the ordering | 1236 | * This is safe on x86 w/o barriers as the ordering |
| @@ -944,6 +1249,19 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, | |||
| 944 | read_unlock(&tasklist_lock); | 1249 | read_unlock(&tasklist_lock); |
| 945 | } | 1250 | } |
| 946 | 1251 | ||
| 1252 | static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) | ||
| 1253 | { | ||
| 1254 | struct rdtgroup *sentry, *stmp; | ||
| 1255 | struct list_head *head; | ||
| 1256 | |||
| 1257 | head = &rdtgrp->mon.crdtgrp_list; | ||
| 1258 | list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { | ||
| 1259 | free_rmid(sentry->mon.rmid); | ||
| 1260 | list_del(&sentry->mon.crdtgrp_list); | ||
| 1261 | kfree(sentry); | ||
| 1262 | } | ||
| 1263 | } | ||
| 1264 | |||
| 947 | /* | 1265 | /* |
| 948 | * Forcibly remove all of subdirectories under root. | 1266 | * Forcibly remove all of subdirectories under root. |
| 949 | */ | 1267 | */ |
| @@ -955,6 +1273,9 @@ static void rmdir_all_sub(void) | |||
| 955 | rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); | 1273 | rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); |
| 956 | 1274 | ||
| 957 | list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { | 1275 | list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { |
| 1276 | /* Free any child rmids */ | ||
| 1277 | free_all_child_rdtgrp(rdtgrp); | ||
| 1278 | |||
| 958 | /* Remove each rdtgroup other than root */ | 1279 | /* Remove each rdtgroup other than root */ |
| 959 | if (rdtgrp == &rdtgroup_default) | 1280 | if (rdtgrp == &rdtgroup_default) |
| 960 | continue; | 1281 | continue; |
| @@ -967,16 +1288,20 @@ static void rmdir_all_sub(void) | |||
| 967 | cpumask_or(&rdtgroup_default.cpu_mask, | 1288 | cpumask_or(&rdtgroup_default.cpu_mask, |
| 968 | &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); | 1289 | &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); |
| 969 | 1290 | ||
| 1291 | free_rmid(rdtgrp->mon.rmid); | ||
| 1292 | |||
| 970 | kernfs_remove(rdtgrp->kn); | 1293 | kernfs_remove(rdtgrp->kn); |
| 971 | list_del(&rdtgrp->rdtgroup_list); | 1294 | list_del(&rdtgrp->rdtgroup_list); |
| 972 | kfree(rdtgrp); | 1295 | kfree(rdtgrp); |
| 973 | } | 1296 | } |
| 974 | /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ | 1297 | /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ |
| 975 | get_online_cpus(); | 1298 | get_online_cpus(); |
| 976 | rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); | 1299 | update_closid_rmid(cpu_online_mask, &rdtgroup_default); |
| 977 | put_online_cpus(); | 1300 | put_online_cpus(); |
| 978 | 1301 | ||
| 979 | kernfs_remove(kn_info); | 1302 | kernfs_remove(kn_info); |
| 1303 | kernfs_remove(kn_mongrp); | ||
| 1304 | kernfs_remove(kn_mondata); | ||
| 980 | } | 1305 | } |
| 981 | 1306 | ||
| 982 | static void rdt_kill_sb(struct super_block *sb) | 1307 | static void rdt_kill_sb(struct super_block *sb) |
| @@ -986,10 +1311,12 @@ static void rdt_kill_sb(struct super_block *sb) | |||
| 986 | mutex_lock(&rdtgroup_mutex); | 1311 | mutex_lock(&rdtgroup_mutex); |
| 987 | 1312 | ||
| 988 | /* Put everything back to default values. */ | 1313 |
| 989 | for_each_enabled_rdt_resource(r) | 1314 | for_each_alloc_enabled_rdt_resource(r) |
| 990 | reset_all_ctrls(r); | 1315 | reset_all_ctrls(r); |
| 991 | cdp_disable(); | 1316 | cdp_disable(); |
| 992 | rmdir_all_sub(); | 1317 | rmdir_all_sub(); |
| 1318 | static_branch_disable(&rdt_alloc_enable_key); | ||
| 1319 | static_branch_disable(&rdt_mon_enable_key); | ||
| 993 | static_branch_disable(&rdt_enable_key); | 1320 | static_branch_disable(&rdt_enable_key); |
| 994 | kernfs_kill_sb(sb); | 1321 | kernfs_kill_sb(sb); |
| 995 | mutex_unlock(&rdtgroup_mutex); | 1322 | mutex_unlock(&rdtgroup_mutex); |
| @@ -1001,46 +1328,223 @@ static struct file_system_type rdt_fs_type = { | |||
| 1001 | .kill_sb = rdt_kill_sb, | 1328 | .kill_sb = rdt_kill_sb, |
| 1002 | }; | 1329 | }; |
| 1003 | 1330 | ||
| 1004 | static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | 1331 | static int mon_addfile(struct kernfs_node *parent_kn, const char *name, |
| 1005 | umode_t mode) | 1332 | void *priv) |
| 1006 | { | 1333 | { |
| 1007 | struct rdtgroup *parent, *rdtgrp; | ||
| 1008 | struct kernfs_node *kn; | 1334 | struct kernfs_node *kn; |
| 1009 | int ret, closid; | 1335 | int ret = 0; |
| 1010 | 1336 | ||
| 1011 | /* Only allow mkdir in the root directory */ | 1337 | kn = __kernfs_create_file(parent_kn, name, 0444, 0, |
| 1012 | if (parent_kn != rdtgroup_default.kn) | 1338 | &kf_mondata_ops, priv, NULL, NULL); |
| 1013 | return -EPERM; | 1339 | if (IS_ERR(kn)) |
| 1340 | return PTR_ERR(kn); | ||
| 1014 | 1341 | ||
| 1015 | /* Do not accept '\n' to avoid unparsable situation. */ | 1342 | ret = rdtgroup_kn_set_ugid(kn); |
| 1016 | if (strchr(name, '\n')) | 1343 | if (ret) { |
| 1017 | return -EINVAL; | 1344 | kernfs_remove(kn); |
| 1345 | return ret; | ||
| 1346 | } | ||
| 1018 | 1347 | ||
| 1019 | parent = rdtgroup_kn_lock_live(parent_kn); | 1348 | return ret; |
| 1020 | if (!parent) { | 1349 | } |
| 1021 | ret = -ENODEV; | 1350 | |
| 1022 | goto out_unlock; | 1351 | /* |
| 1352 | * Remove all subdirectories of mon_data of ctrl_mon groups | ||
| 1353 | * and monitor groups with given domain id. | ||
| 1354 | */ | ||
| 1355 | void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id) | ||
| 1356 | { | ||
| 1357 | struct rdtgroup *prgrp, *crgrp; | ||
| 1358 | char name[32]; | ||
| 1359 | |||
| 1360 | if (!r->mon_enabled) | ||
| 1361 | return; | ||
| 1362 | |||
| 1363 | list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { | ||
| 1364 | sprintf(name, "mon_%s_%02d", r->name, dom_id); | ||
| 1365 | kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); | ||
| 1366 | |||
| 1367 | list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) | ||
| 1368 | kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); | ||
| 1023 | } | 1369 | } |
| 1370 | } | ||
| 1024 | 1371 | ||
| 1025 | ret = closid_alloc(); | 1372 | static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, |
| 1026 | if (ret < 0) | 1373 | struct rdt_domain *d, |
| 1374 | struct rdt_resource *r, struct rdtgroup *prgrp) | ||
| 1375 | { | ||
| 1376 | union mon_data_bits priv; | ||
| 1377 | struct kernfs_node *kn; | ||
| 1378 | struct mon_evt *mevt; | ||
| 1379 | struct rmid_read rr; | ||
| 1380 | char name[32]; | ||
| 1381 | int ret; | ||
| 1382 | |||
| 1383 | sprintf(name, "mon_%s_%02d", r->name, d->id); | ||
| 1384 | /* create the directory */ | ||
| 1385 | kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); | ||
| 1386 | if (IS_ERR(kn)) | ||
| 1387 | return PTR_ERR(kn); | ||
| 1388 | |||
| 1389 | /* | ||
| 1390 | * This extra ref will be put in kernfs_remove() and guarantees | ||
| 1391 | * that kn is always accessible. | ||
| 1392 | */ | ||
| 1393 | kernfs_get(kn); | ||
| 1394 | ret = rdtgroup_kn_set_ugid(kn); | ||
| 1395 | if (ret) | ||
| 1396 | goto out_destroy; | ||
| 1397 | |||
| 1398 | if (WARN_ON(list_empty(&r->evt_list))) { | ||
| 1399 | ret = -EPERM; | ||
| 1400 | goto out_destroy; | ||
| 1401 | } | ||
| 1402 | |||
| 1403 | priv.u.rid = r->rid; | ||
| 1404 | priv.u.domid = d->id; | ||
| 1405 | list_for_each_entry(mevt, &r->evt_list, list) { | ||
| 1406 | priv.u.evtid = mevt->evtid; | ||
| 1407 | ret = mon_addfile(kn, mevt->name, priv.priv); | ||
| 1408 | if (ret) | ||
| 1409 | goto out_destroy; | ||
| 1410 | |||
| 1411 | if (is_mbm_event(mevt->evtid)) | ||
| 1412 | mon_event_read(&rr, d, prgrp, mevt->evtid, true); | ||
| 1413 | } | ||
| 1414 | kernfs_activate(kn); | ||
| 1415 | return 0; | ||
| 1416 | |||
| 1417 | out_destroy: | ||
| 1418 | kernfs_remove(kn); | ||
| 1419 | return ret; | ||
| 1420 | } | ||
| 1421 | |||
| 1422 | /* | ||
| 1423 | * Add all subdirectories of mon_data for "ctrl_mon" groups | ||
| 1424 | * and "monitor" groups with given domain id. | ||
| 1425 | */ | ||
| 1426 | void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, | ||
| 1427 | struct rdt_domain *d) | ||
| 1428 | { | ||
| 1429 | struct kernfs_node *parent_kn; | ||
| 1430 | struct rdtgroup *prgrp, *crgrp; | ||
| 1431 | struct list_head *head; | ||
| 1432 | |||
| 1433 | if (!r->mon_enabled) | ||
| 1434 | return; | ||
| 1435 | |||
| 1436 | list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { | ||
| 1437 | parent_kn = prgrp->mon.mon_data_kn; | ||
| 1438 | mkdir_mondata_subdir(parent_kn, d, r, prgrp); | ||
| 1439 | |||
| 1440 | head = &prgrp->mon.crdtgrp_list; | ||
| 1441 | list_for_each_entry(crgrp, head, mon.crdtgrp_list) { | ||
| 1442 | parent_kn = crgrp->mon.mon_data_kn; | ||
| 1443 | mkdir_mondata_subdir(parent_kn, d, r, crgrp); | ||
| 1444 | } | ||
| 1445 | } | ||
| 1446 | } | ||
| 1447 | |||
| 1448 | static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, | ||
| 1449 | struct rdt_resource *r, | ||
| 1450 | struct rdtgroup *prgrp) | ||
| 1451 | { | ||
| 1452 | struct rdt_domain *dom; | ||
| 1453 | int ret; | ||
| 1454 | |||
| 1455 | list_for_each_entry(dom, &r->domains, list) { | ||
| 1456 | ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); | ||
| 1457 | if (ret) | ||
| 1458 | return ret; | ||
| 1459 | } | ||
| 1460 | |||
| 1461 | return 0; | ||
| 1462 | } | ||
| 1463 | |||
| 1464 | /* | ||
| 1465 | * This creates a directory mon_data which contains the monitored data. | ||
| 1466 | * | ||
| 1467 | * mon_data has one directory for each domain, named in the | ||
| 1468 | * format mon_<domain_name>_<domain_id>. For example, a mon_data | ||
| 1469 | * directory with L3 domains looks like this: | ||
| 1470 | * ./mon_data: | ||
| 1471 | * mon_L3_00 | ||
| 1472 | * mon_L3_01 | ||
| 1473 | * mon_L3_02 | ||
| 1474 | * ... | ||
| 1475 | * | ||
| 1476 | * Each domain directory has one file per event: | ||
| 1477 | * ./mon_L3_00/: | ||
| 1478 | * llc_occupancy | ||
| 1479 | * | ||
| 1480 | */ | ||
| 1481 | static int mkdir_mondata_all(struct kernfs_node *parent_kn, | ||
| 1482 | struct rdtgroup *prgrp, | ||
| 1483 | struct kernfs_node **dest_kn) | ||
| 1484 | { | ||
| 1485 | struct rdt_resource *r; | ||
| 1486 | struct kernfs_node *kn; | ||
| 1487 | int ret; | ||
| 1488 | |||
| 1489 | /* | ||
| 1490 | * Create the mon_data directory first. | ||
| 1491 | */ | ||
| 1492 | ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn); | ||
| 1493 | if (ret) | ||
| 1494 | return ret; | ||
| 1495 | |||
| 1496 | if (dest_kn) | ||
| 1497 | *dest_kn = kn; | ||
| 1498 | |||
| 1499 | /* | ||
| 1500 | * Create the subdirectories for each domain. Note that all events | ||
| 1501 | * in a domain like L3 are grouped into a resource whose domain is L3 | ||
| 1502 | */ | ||
| 1503 | for_each_mon_enabled_rdt_resource(r) { | ||
| 1504 | ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); | ||
| 1505 | if (ret) | ||
| 1506 | goto out_destroy; | ||
| 1507 | } | ||
| 1508 | |||
| 1509 | return 0; | ||
| 1510 | |||
| 1511 | out_destroy: | ||
| 1512 | kernfs_remove(kn); | ||
| 1513 | return ret; | ||
| 1514 | } | ||
| 1515 | |||
| 1516 | static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, | ||
| 1517 | struct kernfs_node *prgrp_kn, | ||
| 1518 | const char *name, umode_t mode, | ||
| 1519 | enum rdt_group_type rtype, struct rdtgroup **r) | ||
| 1520 | { | ||
| 1521 | struct rdtgroup *prdtgrp, *rdtgrp; | ||
| 1522 | struct kernfs_node *kn; | ||
| 1523 | uint files = 0; | ||
| 1524 | int ret; | ||
| 1525 | |||
| 1526 | prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); | ||
| 1527 | if (!prdtgrp) { | ||
| 1528 | ret = -ENODEV; | ||
| 1027 | goto out_unlock; | 1529 | goto out_unlock; |
| 1028 | closid = ret; | 1530 | } |
| 1029 | 1531 | ||
| 1030 | /* allocate the rdtgroup. */ | 1532 | /* allocate the rdtgroup. */ |
| 1031 | rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); | 1533 | rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); |
| 1032 | if (!rdtgrp) { | 1534 | if (!rdtgrp) { |
| 1033 | ret = -ENOSPC; | 1535 | ret = -ENOSPC; |
| 1034 | goto out_closid_free; | 1536 | goto out_unlock; |
| 1035 | } | 1537 | } |
| 1036 | rdtgrp->closid = closid; | 1538 | *r = rdtgrp; |
| 1037 | list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); | 1539 | rdtgrp->mon.parent = prdtgrp; |
| 1540 | rdtgrp->type = rtype; | ||
| 1541 | INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); | ||
| 1038 | 1542 | ||
| 1039 | /* kernfs creates the directory for rdtgrp */ | 1543 | /* kernfs creates the directory for rdtgrp */ |
| 1040 | kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); | 1544 | kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); |
| 1041 | if (IS_ERR(kn)) { | 1545 | if (IS_ERR(kn)) { |
| 1042 | ret = PTR_ERR(kn); | 1546 | ret = PTR_ERR(kn); |
| 1043 | goto out_cancel_ref; | 1547 | goto out_free_rgrp; |
| 1044 | } | 1548 | } |
| 1045 | rdtgrp->kn = kn; | 1549 | rdtgrp->kn = kn; |
| 1046 | 1550 | ||
| @@ -1056,43 +1560,211 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
| 1056 | if (ret) | 1560 | if (ret) |
| 1057 | goto out_destroy; | 1561 | goto out_destroy; |
| 1058 | 1562 | ||
| 1059 | ret = rdtgroup_add_files(kn, rdtgroup_base_files, | 1563 | files = RFTYPE_BASE | RFTYPE_CTRL; |
| 1060 | ARRAY_SIZE(rdtgroup_base_files)); | 1564 | files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); |
| 1565 | ret = rdtgroup_add_files(kn, files); | ||
| 1061 | if (ret) | 1566 | if (ret) |
| 1062 | goto out_destroy; | 1567 | goto out_destroy; |
| 1063 | 1568 | ||
| 1569 | if (rdt_mon_capable) { | ||
| 1570 | ret = alloc_rmid(); | ||
| 1571 | if (ret < 0) | ||
| 1572 | goto out_destroy; | ||
| 1573 | rdtgrp->mon.rmid = ret; | ||
| 1574 | |||
| 1575 | ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); | ||
| 1576 | if (ret) | ||
| 1577 | goto out_idfree; | ||
| 1578 | } | ||
| 1064 | kernfs_activate(kn); | 1579 | kernfs_activate(kn); |
| 1065 | 1580 | ||
| 1066 | ret = 0; | 1581 | /* |
| 1067 | goto out_unlock; | 1582 | * The caller unlocks the prgrp_kn upon success. |
| 1583 | */ | ||
| 1584 | return 0; | ||
| 1068 | 1585 | ||
| 1586 | out_idfree: | ||
| 1587 | free_rmid(rdtgrp->mon.rmid); | ||
| 1069 | out_destroy: | 1588 | out_destroy: |
| 1070 | kernfs_remove(rdtgrp->kn); | 1589 | kernfs_remove(rdtgrp->kn); |
| 1071 | out_cancel_ref: | 1590 | out_free_rgrp: |
| 1072 | list_del(&rdtgrp->rdtgroup_list); | ||
| 1073 | kfree(rdtgrp); | 1591 | kfree(rdtgrp); |
| 1074 | out_closid_free: | ||
| 1075 | closid_free(closid); | ||
| 1076 | out_unlock: | 1592 | out_unlock: |
| 1077 | rdtgroup_kn_unlock(parent_kn); | 1593 | rdtgroup_kn_unlock(prgrp_kn); |
| 1078 | return ret; | 1594 | return ret; |
| 1079 | } | 1595 | } |
| 1080 | 1596 | ||
| 1081 | static int rdtgroup_rmdir(struct kernfs_node *kn) | 1597 | static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) |
| 1598 | { | ||
| 1599 | kernfs_remove(rgrp->kn); | ||
| 1600 | free_rmid(rgrp->mon.rmid); | ||
| 1601 | kfree(rgrp); | ||
| 1602 | } | ||
| 1603 | |||
| 1604 | /* | ||
| 1605 | * Create a monitor group under the "mon_groups" directory of a control | ||
| 1606 | * and monitor group (ctrl_mon). This is a resource group | ||
| 1607 | * to monitor a subset of tasks and cpus in its parent ctrl_mon group. | ||
| 1608 | */ | ||
| 1609 | static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, | ||
| 1610 | struct kernfs_node *prgrp_kn, | ||
| 1611 | const char *name, | ||
| 1612 | umode_t mode) | ||
| 1613 | { | ||
| 1614 | struct rdtgroup *rdtgrp, *prgrp; | ||
| 1615 | int ret; | ||
| 1616 | |||
| 1617 | ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP, | ||
| 1618 | &rdtgrp); | ||
| 1619 | if (ret) | ||
| 1620 | return ret; | ||
| 1621 | |||
| 1622 | prgrp = rdtgrp->mon.parent; | ||
| 1623 | rdtgrp->closid = prgrp->closid; | ||
| 1624 | |||
| 1625 | /* | ||
| 1626 | * Add the rdtgrp to the list of rdtgrps the parent | ||
| 1627 | * ctrl_mon group has to track. | ||
| 1628 | */ | ||
| 1629 | list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); | ||
| 1630 | |||
| 1631 | rdtgroup_kn_unlock(prgrp_kn); | ||
| 1632 | return ret; | ||
| 1633 | } | ||
| 1634 | |||
| 1635 | /* | ||
| 1636 | * These are rdtgroups created under the root directory. They can be used | ||
| 1637 | * to allocate and monitor resources. | ||
| 1638 | */ | ||
| 1639 | static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, | ||
| 1640 | struct kernfs_node *prgrp_kn, | ||
| 1641 | const char *name, umode_t mode) | ||
| 1082 | { | 1642 | { |
| 1083 | int ret, cpu, closid = rdtgroup_default.closid; | ||
| 1084 | struct rdtgroup *rdtgrp; | 1643 | struct rdtgroup *rdtgrp; |
| 1085 | cpumask_var_t tmpmask; | 1644 | struct kernfs_node *kn; |
| 1645 | u32 closid; | ||
| 1646 | int ret; | ||
| 1086 | 1647 | ||
| 1087 | if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) | 1648 | ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP, |
| 1088 | return -ENOMEM; | 1649 | &rdtgrp); |
| 1650 | if (ret) | ||
| 1651 | return ret; | ||
| 1089 | 1652 | ||
| 1090 | rdtgrp = rdtgroup_kn_lock_live(kn); | 1653 | kn = rdtgrp->kn; |
| 1091 | if (!rdtgrp) { | 1654 | ret = closid_alloc(); |
| 1092 | ret = -EPERM; | 1655 | if (ret < 0) |
| 1093 | goto out; | 1656 | goto out_common_fail; |
| 1657 | closid = ret; | ||
| 1658 | |||
| 1659 | rdtgrp->closid = closid; | ||
| 1660 | list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); | ||
| 1661 | |||
| 1662 | if (rdt_mon_capable) { | ||
| 1663 | /* | ||
| 1664 | * Create an empty mon_groups directory to hold the subset | ||
| 1665 | * of tasks and cpus to monitor. | ||
| 1666 | */ | ||
| 1667 | ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); | ||
| 1668 | if (ret) | ||
| 1669 | goto out_id_free; | ||
| 1094 | } | 1670 | } |
| 1095 | 1671 | ||
| 1672 | goto out_unlock; | ||
| 1673 | |||
| 1674 | out_id_free: | ||
| 1675 | closid_free(closid); | ||
| 1676 | list_del(&rdtgrp->rdtgroup_list); | ||
| 1677 | out_common_fail: | ||
| 1678 | mkdir_rdt_prepare_clean(rdtgrp); | ||
| 1679 | out_unlock: | ||
| 1680 | rdtgroup_kn_unlock(prgrp_kn); | ||
| 1681 | return ret; | ||
| 1682 | } | ||
| 1683 | |||
| 1684 | /* | ||
| 1685 | * We allow creating mon groups only within a directory called "mon_groups" | ||
| 1686 | * which is present in every ctrl_mon group. Check if this is a valid | ||
| 1687 | * "mon_groups" directory. | ||
| 1688 | * | ||
| 1689 | * 1. The directory should be named "mon_groups". | ||
| 1690 | * 2. The mon group itself should "not" be named "mon_groups". | ||
| 1691 | * This makes sure the "mon_groups" directory always has a ctrl_mon group | ||
| 1692 | * as parent. | ||
| 1693 | */ | ||
| 1694 | static bool is_mon_groups(struct kernfs_node *kn, const char *name) | ||
| 1695 | { | ||
| 1696 | return (!strcmp(kn->name, "mon_groups") && | ||
| 1697 | strcmp(name, "mon_groups")); | ||
| 1698 | } | ||
| 1699 | |||
| 1700 | static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | ||
| 1701 | umode_t mode) | ||
| 1702 | { | ||
| 1703 | /* Do not accept '\n' to avoid an unparsable situation. */ | ||
| 1704 | if (strchr(name, '\n')) | ||
| 1705 | return -EINVAL; | ||
| 1706 | |||
| 1707 | /* | ||
| 1708 | * If the parent directory is the root directory and RDT | ||
| 1709 | * allocation is supported, add a control and monitoring | ||
| 1710 | * subdirectory | ||
| 1711 | */ | ||
| 1712 | if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) | ||
| 1713 | return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode); | ||
| 1714 | |||
| 1715 | /* | ||
| 1716 | * If RDT monitoring is supported and the parent directory is a valid | ||
| 1717 | * "mon_groups" directory, add a monitoring subdirectory. | ||
| 1718 | */ | ||
| 1719 | if (rdt_mon_capable && is_mon_groups(parent_kn, name)) | ||
| 1720 | return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode); | ||
| 1721 | |||
| 1722 | return -EPERM; | ||
| 1723 | } | ||
| 1724 | |||
| 1725 | static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, | ||
| 1726 | cpumask_var_t tmpmask) | ||
| 1727 | { | ||
| 1728 | struct rdtgroup *prdtgrp = rdtgrp->mon.parent; | ||
| 1729 | int cpu; | ||
| 1730 | |||
| 1731 | /* Give any tasks back to the parent group */ | ||
| 1732 | rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); | ||
| 1733 | |||
| 1734 | /* Update per cpu rmid of the moved CPUs first */ | ||
| 1735 | for_each_cpu(cpu, &rdtgrp->cpu_mask) | ||
| 1736 | per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid; | ||
| 1737 | /* | ||
| 1738 | * Update the MSR on moved CPUs and CPUs which have moved | ||
| 1739 | * task running on them. | ||
| 1740 | */ | ||
| 1741 | cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); | ||
| 1742 | update_closid_rmid(tmpmask, NULL); | ||
| 1743 | |||
| 1744 | rdtgrp->flags = RDT_DELETED; | ||
| 1745 | free_rmid(rdtgrp->mon.rmid); | ||
| 1746 | |||
| 1747 | /* | ||
| 1748 | * Remove the rdtgrp from the parent ctrl_mon group's list | ||
| 1749 | */ | ||
| 1750 | WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); | ||
| 1751 | list_del(&rdtgrp->mon.crdtgrp_list); | ||
| 1752 | |||
| 1753 | /* | ||
| 1754 | * one extra hold on this, will drop when we kfree(rdtgrp) | ||
| 1755 | * in rdtgroup_kn_unlock() | ||
| 1756 | */ | ||
| 1757 | kernfs_get(kn); | ||
| 1758 | kernfs_remove(rdtgrp->kn); | ||
| 1759 | |||
| 1760 | return 0; | ||
| 1761 | } | ||
| 1762 | |||
| 1763 | static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, | ||
| 1764 | cpumask_var_t tmpmask) | ||
| 1765 | { | ||
| 1766 | int cpu; | ||
| 1767 | |||
| 1096 | /* Give any tasks back to the default group */ | 1768 | /* Give any tasks back to the default group */ |
| 1097 | rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); | 1769 | rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); |
| 1098 | 1770 | ||
| @@ -1100,18 +1772,28 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) | |||
| 1100 | cpumask_or(&rdtgroup_default.cpu_mask, | 1772 | cpumask_or(&rdtgroup_default.cpu_mask, |
| 1101 | &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); | 1773 | &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); |
| 1102 | 1774 | ||
| 1103 | /* Update per cpu closid of the moved CPUs first */ | 1775 | /* Update per cpu closid and rmid of the moved CPUs first */ |
| 1104 | for_each_cpu(cpu, &rdtgrp->cpu_mask) | 1776 | for_each_cpu(cpu, &rdtgrp->cpu_mask) { |
| 1105 | per_cpu(cpu_closid, cpu) = closid; | 1777 | per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid; |
| 1778 | per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid; | ||
| 1779 | } | ||
| 1780 | |||
| 1106 | /* | 1781 | /* |
| 1107 | * Update the MSR on moved CPUs and CPUs which have moved | 1782 | * Update the MSR on moved CPUs and CPUs which have moved |
| 1108 | * task running on them. | 1783 | * task running on them. |
| 1109 | */ | 1784 | */ |
| 1110 | cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); | 1785 | cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); |
| 1111 | rdt_update_closid(tmpmask, NULL); | 1786 | update_closid_rmid(tmpmask, NULL); |
| 1112 | 1787 | ||
| 1113 | rdtgrp->flags = RDT_DELETED; | 1788 | rdtgrp->flags = RDT_DELETED; |
| 1114 | closid_free(rdtgrp->closid); | 1789 | closid_free(rdtgrp->closid); |
| 1790 | free_rmid(rdtgrp->mon.rmid); | ||
| 1791 | |||
| 1792 | /* | ||
| 1793 | * Free all the child monitor group rmids. | ||
| 1794 | */ | ||
| 1795 | free_all_child_rdtgrp(rdtgrp); | ||
| 1796 | |||
| 1115 | list_del(&rdtgrp->rdtgroup_list); | 1797 | list_del(&rdtgrp->rdtgroup_list); |
| 1116 | 1798 | ||
| 1117 | /* | 1799 | /* |
| @@ -1120,7 +1802,41 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) | |||
| 1120 | */ | 1802 | */ |
| 1121 | kernfs_get(kn); | 1803 | kernfs_get(kn); |
| 1122 | kernfs_remove(rdtgrp->kn); | 1804 | kernfs_remove(rdtgrp->kn); |
| 1123 | ret = 0; | 1805 | |
| 1806 | return 0; | ||
| 1807 | } | ||
| 1808 | |||
| 1809 | static int rdtgroup_rmdir(struct kernfs_node *kn) | ||
| 1810 | { | ||
| 1811 | struct kernfs_node *parent_kn = kn->parent; | ||
| 1812 | struct rdtgroup *rdtgrp; | ||
| 1813 | cpumask_var_t tmpmask; | ||
| 1814 | int ret = 0; | ||
| 1815 | |||
| 1816 | if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) | ||
| 1817 | return -ENOMEM; | ||
| 1818 | |||
| 1819 | rdtgrp = rdtgroup_kn_lock_live(kn); | ||
| 1820 | if (!rdtgrp) { | ||
| 1821 | ret = -EPERM; | ||
| 1822 | goto out; | ||
| 1823 | } | ||
| 1824 | |||
| 1825 | /* | ||
| 1826 | * If the rdtgroup is a ctrl_mon group and parent directory | ||
| 1827 | * is the root directory, remove the ctrl_mon group. | ||
| 1828 | * | ||
| 1829 | * If the rdtgroup is a mon group and parent directory | ||
| 1830 | * is a valid "mon_groups" directory, remove the mon group. | ||
| 1831 | */ | ||
| 1832 | if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) | ||
| 1833 | ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); | ||
| 1834 | else if (rdtgrp->type == RDTMON_GROUP && | ||
| 1835 | is_mon_groups(parent_kn, kn->name)) | ||
| 1836 | ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); | ||
| 1837 | else | ||
| 1838 | ret = -EPERM; | ||
| 1839 | |||
| 1124 | out: | 1840 | out: |
| 1125 | rdtgroup_kn_unlock(kn); | 1841 | rdtgroup_kn_unlock(kn); |
| 1126 | free_cpumask_var(tmpmask); | 1842 | free_cpumask_var(tmpmask); |
| @@ -1129,7 +1845,7 @@ out: | |||
| 1129 | 1845 | ||
| 1130 | static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) | 1846 | static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) |
| 1131 | { | 1847 | { |
| 1132 | if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) | 1848 | if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) |
| 1133 | seq_puts(seq, ",cdp"); | 1849 | seq_puts(seq, ",cdp"); |
| 1134 | return 0; | 1850 | return 0; |
| 1135 | } | 1851 | } |
| @@ -1153,10 +1869,13 @@ static int __init rdtgroup_setup_root(void) | |||
| 1153 | mutex_lock(&rdtgroup_mutex); | 1869 | mutex_lock(&rdtgroup_mutex); |
| 1154 | 1870 | ||
| 1155 | rdtgroup_default.closid = 0; | 1871 | rdtgroup_default.closid = 0; |
| 1872 | rdtgroup_default.mon.rmid = 0; | ||
| 1873 | rdtgroup_default.type = RDTCTRL_GROUP; | ||
| 1874 | INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); | ||
| 1875 | |||
| 1156 | list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); | 1876 | list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); |
| 1157 | 1877 | ||
| 1158 | ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, | 1878 | ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE); |
| 1159 | ARRAY_SIZE(rdtgroup_base_files)); | ||
| 1160 | if (ret) { | 1879 | if (ret) { |
| 1161 | kernfs_destroy_root(rdt_root); | 1880 | kernfs_destroy_root(rdt_root); |
| 1162 | goto out; | 1881 | goto out; |
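The mkdir/rmdir plumbing above is only reachable through the resctrl filesystem. As a rough illustration of the interface it implements, the sketch below creates a monitoring group under an existing ctrl_mon group's "mon_groups" directory (which dispatches to rdtgroup_mkdir_mon()), moves the calling task into it, and reads the llc_occupancy file that mkdir_mondata_all() instantiates per L3 domain. The mount point /sys/fs/resctrl, the group name "mygrp", and the domain name mon_L3_00 are assumptions for the example, not part of the patch; the actual domain directories depend on the system topology.

```c
/*
 * Illustrative userspace sketch only. The mount point, group name and
 * domain id are assumptions; adjust them to the local system.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	const char *grp = "/sys/fs/resctrl/mon_groups/mygrp"; /* hypothetical */
	char path[256];
	char buf[64];
	FILE *f;

	/* mkdir(2) inside a "mon_groups" directory reaches rdtgroup_mkdir_mon() */
	if (mkdir(grp, 0755) && errno != EEXIST) {
		perror("mkdir");
		return 1;
	}

	/* Move the current task into the new monitor group */
	snprintf(path, sizeof(path), "%s/tasks", grp);
	f = fopen(path, "w");
	if (!f) {
		perror("fopen tasks");
		return 1;
	}
	fprintf(f, "%d\n", (int)getpid());
	fclose(f);

	/* Read LLC occupancy for L3 domain 0, created by mkdir_mondata_all() */
	snprintf(path, sizeof(path), "%s/mon_data/mon_L3_00/llc_occupancy", grp);
	f = fopen(path, "r");
	if (!f) {
		perror("fopen llc_occupancy");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("llc_occupancy: %s", buf);
	fclose(f);
	return 0;
}
```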
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index efc5eeb58292..11966251cd42 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
| @@ -56,7 +56,7 @@ | |||
| 56 | #include <asm/debugreg.h> | 56 | #include <asm/debugreg.h> |
| 57 | #include <asm/switch_to.h> | 57 | #include <asm/switch_to.h> |
| 58 | #include <asm/vm86.h> | 58 | #include <asm/vm86.h> |
| 59 | #include <asm/intel_rdt.h> | 59 | #include <asm/intel_rdt_sched.h> |
| 60 | #include <asm/proto.h> | 60 | #include <asm/proto.h> |
| 61 | 61 | ||
| 62 | void __show_regs(struct pt_regs *regs, int all) | 62 | void __show_regs(struct pt_regs *regs, int all) |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c85269a76511..302e7b2572d1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
| @@ -52,7 +52,7 @@ | |||
| 52 | #include <asm/switch_to.h> | 52 | #include <asm/switch_to.h> |
| 53 | #include <asm/xen/hypervisor.h> | 53 | #include <asm/xen/hypervisor.h> |
| 54 | #include <asm/vdso.h> | 54 | #include <asm/vdso.h> |
| 55 | #include <asm/intel_rdt.h> | 55 | #include <asm/intel_rdt_sched.h> |
| 56 | #include <asm/unistd.h> | 56 | #include <asm/unistd.h> |
| 57 | #ifdef CONFIG_IA32_EMULATION | 57 | #ifdef CONFIG_IA32_EMULATION |
| 58 | /* Not included via unistd.h */ | 58 | /* Not included via unistd.h */ |
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 718ba163c1b9..8e22f24ded6a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h | |||
| @@ -139,14 +139,6 @@ struct hw_perf_event { | |||
| 139 | /* for tp_event->class */ | 139 | /* for tp_event->class */ |
| 140 | struct list_head tp_list; | 140 | struct list_head tp_list; |
| 141 | }; | 141 | }; |
| 142 | struct { /* intel_cqm */ | ||
| 143 | int cqm_state; | ||
| 144 | u32 cqm_rmid; | ||
| 145 | int is_group_event; | ||
| 146 | struct list_head cqm_events_entry; | ||
| 147 | struct list_head cqm_groups_entry; | ||
| 148 | struct list_head cqm_group_entry; | ||
| 149 | }; | ||
| 150 | struct { /* amd_power */ | 142 | struct { /* amd_power */ |
| 151 | u64 pwr_acc; | 143 | u64 pwr_acc; |
| 152 | u64 ptsc; | 144 | u64 ptsc; |
| @@ -414,11 +406,6 @@ struct pmu { | |||
| 414 | 406 | ||
| 415 | 407 | ||
| 416 | /* | 408 | /* |
| 417 | * Return the count value for a counter. | ||
| 418 | */ | ||
| 419 | u64 (*count) (struct perf_event *event); /*optional*/ | ||
| 420 | |||
| 421 | /* | ||
| 422 | * Set up pmu-private data structures for an AUX area | 409 | * Set up pmu-private data structures for an AUX area |
| 423 | */ | 410 | */ |
| 424 | void *(*setup_aux) (int cpu, void **pages, | 411 | void *(*setup_aux) (int cpu, void **pages, |
| @@ -1112,11 +1099,6 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, | |||
| 1112 | __perf_event_task_sched_out(prev, next); | 1099 | __perf_event_task_sched_out(prev, next); |
| 1113 | } | 1100 | } |
| 1114 | 1101 | ||
| 1115 | static inline u64 __perf_event_count(struct perf_event *event) | ||
| 1116 | { | ||
| 1117 | return local64_read(&event->count) + atomic64_read(&event->child_count); | ||
| 1118 | } | ||
| 1119 | |||
| 1120 | extern void perf_event_mmap(struct vm_area_struct *vma); | 1102 | extern void perf_event_mmap(struct vm_area_struct *vma); |
| 1121 | extern struct perf_guest_info_callbacks *perf_guest_cbs; | 1103 | extern struct perf_guest_info_callbacks *perf_guest_cbs; |
| 1122 | extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); | 1104 | extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 9ba42c663fba..68b38335d33c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -909,8 +909,9 @@ struct task_struct { | |||
| 909 | /* cg_list protected by css_set_lock and tsk->alloc_lock: */ | 909 | /* cg_list protected by css_set_lock and tsk->alloc_lock: */ |
| 910 | struct list_head cg_list; | 910 | struct list_head cg_list; |
| 911 | #endif | 911 | #endif |
| 912 | #ifdef CONFIG_INTEL_RDT_A | 912 | #ifdef CONFIG_INTEL_RDT |
| 913 | int closid; | 913 | u32 closid; |
| 914 | u32 rmid; | ||
| 914 | #endif | 915 | #endif |
| 915 | #ifdef CONFIG_FUTEX | 916 | #ifdef CONFIG_FUTEX |
| 916 | struct robust_list_head __user *robust_list; | 917 | struct robust_list_head __user *robust_list; |
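The new per-task closid and rmid fields in task_struct are consumed at context switch by the code behind asm/intel_rdt_sched.h: a non-zero task value overrides the per-CPU default that rdtgroup_rmdir_mon()/rdtgroup_rmdir_ctrl() reset above. The following is a standalone model of that precedence under stated assumptions; the struct and function names are placeholders, and the real code writes the IA32_PQR_ASSOC MSR rather than printing.

```c
/* Simplified, hypothetical model of CLOSID/RMID selection at sched-in. */
#include <stdint.h>
#include <stdio.h>

struct task {
	uint32_t closid;	/* 0 means "use the CPU default" */
	uint32_t rmid;
};

struct cpu_defaults {
	uint32_t default_closid;	/* from the group owning this CPU */
	uint32_t default_rmid;
};

static void sched_in(const struct task *t, const struct cpu_defaults *cpu)
{
	uint32_t closid = cpu->default_closid;
	uint32_t rmid = cpu->default_rmid;

	/* A task placed in a specific resource group overrides the CPU default */
	if (t->closid)
		closid = t->closid;
	if (t->rmid)
		rmid = t->rmid;

	printf("PQR_ASSOC <- rmid=%u closid=%u\n", rmid, closid);
}

int main(void)
{
	struct cpu_defaults cpu = { .default_closid = 1, .default_rmid = 2 };
	struct task in_group = { .closid = 3, .rmid = 7 };
	struct task in_default = { 0, 0 };

	sched_in(&in_group, &cpu);	/* uses the task's own ids */
	sched_in(&in_default, &cpu);	/* falls back to the CPU defaults */
	return 0;
}
```

Treating zero as "unassigned" works here because closid 0 and rmid 0 belong to the default group, which matches how the removal paths in the diff fall back to rdtgroup_default.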
diff --git a/kernel/events/core.c b/kernel/events/core.c index ce64f3fed5c6..294f1927f944 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -3673,10 +3673,7 @@ unlock: | |||
| 3673 | 3673 | ||
| 3674 | static inline u64 perf_event_count(struct perf_event *event) | 3674 | static inline u64 perf_event_count(struct perf_event *event) |
| 3675 | { | 3675 | { |
| 3676 | if (event->pmu->count) | 3676 | return local64_read(&event->count) + atomic64_read(&event->child_count); |
| 3677 | return event->pmu->count(event); | ||
| 3678 | |||
| 3679 | return __perf_event_count(event); | ||
| 3680 | } | 3677 | } |
| 3681 | 3678 | ||
| 3682 | /* | 3679 | /* |
| @@ -3707,15 +3704,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value) | |||
| 3707 | goto out; | 3704 | goto out; |
| 3708 | } | 3705 | } |
| 3709 | 3706 | ||
| 3710 | /* | ||
| 3711 | * It must not have a pmu::count method, those are not | ||
| 3712 | * NMI safe. | ||
| 3713 | */ | ||
| 3714 | if (event->pmu->count) { | ||
| 3715 | ret = -EOPNOTSUPP; | ||
| 3716 | goto out; | ||
| 3717 | } | ||
| 3718 | |||
| 3719 | /* If this is a per-task event, it must be for current */ | 3707 | /* If this is a per-task event, it must be for current */ |
| 3720 | if ((event->attach_state & PERF_ATTACH_TASK) && | 3708 | if ((event->attach_state & PERF_ATTACH_TASK) && |
| 3721 | event->hw.target != current) { | 3709 | event->hw.target != current) { |
