 arch/x86/kernel/cpu/intel_rdt.c         |   3 +--
 arch/x86/kernel/cpu/intel_rdt.h         |   2 ++
 arch/x86/kernel/cpu/intel_rdt_monitor.c | 126 ++++++++++++++++++++++++++++++-
 3 files changed, 128 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index ad03d975883e..24bfa63e86cf 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -33,7 +33,6 @@
 #include <asm/intel_rdt_sched.h>
 #include "intel_rdt.h"
 
-#define MAX_MBA_BW		100u
 #define MBA_IS_LINEAR		0x4
 #define MBA_MAX_MBPS		U32_MAX
 
@@ -350,7 +349,7 @@ static int get_cache_id(int cpu, int level)
  * that can be written to QOS_MSRs.
  * There are currently no SKUs which support non linear delay values.
  */
-static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
 {
 	if (r->membw.delay_linear)
 		return MAX_MBA_BW - bw;
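With delay_bw_map() made non-static (and declared in intel_rdt.h below), the monitoring code can translate a bandwidth percentage into the value programmed into the IA32_MBA_THRTL MSRs. A minimal standalone sketch of the linear mapping visible above; the wrapper and main() are illustrative only, not kernel code, and the non-linear fallback is omitted:

#include <stdio.h>

#define MAX_MBA_BW 100u		/* bandwidth values are percentages, 0..100 */

/* Linear map only: the MSR is programmed with a delay, not a bandwidth. */
static unsigned int linear_delay_bw_map(unsigned long bw_percent)
{
	return MAX_MBA_BW - bw_percent;
}

int main(void)
{
	/* Granting 70% bandwidth means programming a delay value of 30. */
	printf("bw=70%% -> delay=%u\n", linear_delay_bw_map(70));
	return 0;
}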
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 66a0ba37a8a3..39752825e376 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -28,6 +28,7 @@
 
 #define MBM_CNTR_WIDTH			24
 #define MBM_OVERFLOW_INTERVAL		1000
+#define MAX_MBA_BW			100u
 
 #define RMID_VAL_ERROR			BIT_ULL(63)
 #define RMID_VAL_UNAVAIL		BIT_ULL(62)
@@ -461,6 +462,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom,
 void mbm_handle_overflow(struct work_struct *work);
 bool is_mba_sc(struct rdt_resource *r);
 void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
 void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
 void cqm_handle_limbo(struct work_struct *work);
 bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index 7690402c42b7..b0f3aed76b75 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -329,6 +329,118 @@ void mon_event_count(void *info)
 	}
 }
 
+/*
+ * Feedback loop for MBA software controller (mba_sc)
+ *
+ * mba_sc is a feedback loop where we periodically read MBM counters and
+ * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
+ * that:
+ *
+ *   current bandwidth (cur_bw) < user-specified bandwidth (user_bw)
+ *
+ * This uses the MBM counters to measure the bandwidth and the MBA throttle
+ * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
+ * fact that resctrl rdtgroups have both monitoring and control.
+ *
+ * The frequency of the checks is 1s and we just tag along the MBM overflow
+ * timer. Having a 1s interval makes the calculation of bandwidth simpler.
+ *
+ * Although MBA's goal is to restrict the bandwidth to a maximum, there may
+ * be a need to increase the bandwidth to avoid unnecessarily restricting
+ * the L2 <-> L3 traffic.
+ *
+ * Since MBA controls the L2 external bandwidth whereas MBM measures the
+ * L3 external bandwidth, the following sequence could lead to such a
+ * situation.
+ *
+ * Consider an rdtgroup which had high L3 <-> memory traffic in its initial
+ * phases -> mba_sc kicks in and reduces the bandwidth percentage values ->
+ * but after some time the rdtgroup has mostly L2 <-> L3 traffic.
+ *
+ * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
+ * throttle MSRs already have low percentage values. To avoid
+ * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
+ */
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
+{
+	u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
+	struct mbm_state *pmbm_data, *cmbm_data;
+	u32 cur_bw, delta_bw, user_bw;
+	struct rdt_resource *r_mba;
+	struct rdt_domain *dom_mba;
+	struct list_head *head;
+	struct rdtgroup *entry;
+
+	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+	closid = rgrp->closid;
+	rmid = rgrp->mon.rmid;
+	pmbm_data = &dom_mbm->mbm_local[rmid];
+
+	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
+	if (!dom_mba) {
+		pr_warn_once("Failure to get domain for MBA update\n");
+		return;
+	}
+
+	cur_bw = pmbm_data->prev_bw;
+	user_bw = dom_mba->mbps_val[closid];
+	delta_bw = pmbm_data->delta_bw;
+	cur_msr_val = dom_mba->ctrl_val[closid];
+
+	/*
+	 * For Ctrl groups read data from child monitor groups.
+	 */
+	head = &rgrp->mon.crdtgrp_list;
+	list_for_each_entry(entry, head, mon.crdtgrp_list) {
+		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cur_bw += cmbm_data->prev_bw;
+		delta_bw += cmbm_data->delta_bw;
+	}
+
+	/*
+	 * Scale up/down the bandwidth linearly for the ctrl group. The
+	 * bandwidth step is the bandwidth granularity specified by the
+	 * hardware.
+	 *
+	 * The delta_bw is used when increasing the bandwidth so that we
+	 * don't alternately increase and decrease the control values
+	 * continuously.
+	 *
+	 * For example: consider cur_bw = 90MBps, user_bw = 100MBps. If the
+	 * bandwidth step is 20MBps (> user_bw - cur_bw), we would keep
+	 * switching between 90 and 110 continuously if we only checked
+	 * cur_bw < user_bw.
+	 */
+	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
+		new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
+	} else if (cur_msr_val < MAX_MBA_BW &&
+		   (user_bw > (cur_bw + delta_bw))) {
+		new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
+	} else {
+		return;
+	}
+
+	cur_msr = r_mba->msr_base + closid;
+	wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
+	dom_mba->ctrl_val[closid] = new_msr_val;
+
+	/*
+	 * Delta values are updated dynamically, package wise, for each
+	 * rdtgrp every time the throttle MSR changes value.
+	 *
+	 * This is because (1) the increase in bandwidth is not perfectly
+	 * linear and is only "approximately" linear even when the hardware
+	 * says it is linear, and (2) since MBA is a core-specific mechanism,
+	 * the delta values vary based on the number of cores used by the
+	 * rdtgrp.
+	 */
+	pmbm_data->delta_comp = true;
+	list_for_each_entry(entry, head, mon.crdtgrp_list) {
+		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cmbm_data->delta_comp = true;
+	}
+}
+
 static void mbm_update(struct rdt_domain *d, int rmid)
 {
 	struct rmid_read rr;
@@ -346,7 +458,16 @@ static void mbm_update(struct rdt_domain *d, int rmid)
 	}
 	if (is_mbm_local_enabled()) {
 		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
-		__mon_event_count(rmid, &rr);
+
+		/*
+		 * Call the MBA software controller only for the
+		 * control groups and when the user has enabled
+		 * the software controller explicitly.
+		 */
+		if (!is_mba_sc(NULL))
+			__mon_event_count(rmid, &rr);
+		else
+			mbm_bw_count(rmid, &rr);
 	}
 }
 
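When mba_sc is enabled, mbm_bw_count() (its body is not part of this diff) is expected to turn the MBM local counter delta into a bandwidth figure for the 1s window described in the comment block above. A rough sketch of that conversion, not the kernel helper: the function name and the assumption that the counter delta has already been scaled to bytes are both illustrative.

#include <stdio.h>
#include <stdint.h>

/*
 * Illustrative only. With a fixed 1s sampling interval, the bandwidth in
 * MBps is simply the byte delta between two reads shifted down by 20
 * (bytes -> MiB), which is why the 1s tick keeps the math simple.
 */
static uint32_t bytes_delta_to_mbps(uint64_t prev_bytes, uint64_t cur_bytes)
{
	return (uint32_t)((cur_bytes - prev_bytes) >> 20);
}

int main(void)
{
	uint64_t prev = 0, cur = 94371840;	/* 90 MiB moved in one second */

	printf("cur_bw = %u MBps\n", bytes_delta_to_mbps(prev, cur));
	return 0;
}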
@@ -417,6 +538,9 @@ void mbm_handle_overflow(struct work_struct *work)
 		head = &prgrp->mon.crdtgrp_list;
 		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
 			mbm_update(d, crgrp->mon.rmid);
+
+		if (is_mba_sc(NULL))
+			update_mba_bw(prgrp, d);
 	}
 
 	schedule_delayed_work_on(cpu, &d->mbm_over, delay);
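The comment inside update_mba_bw() explains why the increase path checks delta_bw as well as user_bw. A self-contained model of just that up/down decision shows the resulting hysteresis; the wrapper function, main() and the min_bw/bw_gran/traffic numbers are made up for illustration and are not taken from real hardware:

#include <stdio.h>

#define MAX_MBA_BW	100u	/* throttle/control values are percentages */

/*
 * Standalone model of the decision in update_mba_bw(). Returns the new
 * control value, or the current one when no change is needed.
 */
static unsigned int next_ctrl_val(unsigned int cur_msr_val, unsigned int min_bw,
				  unsigned int bw_gran, unsigned int cur_bw,
				  unsigned int user_bw, unsigned int delta_bw)
{
	if (cur_msr_val > min_bw && user_bw < cur_bw)
		return cur_msr_val - bw_gran;	/* throttle harder */
	if (cur_msr_val < MAX_MBA_BW && user_bw > cur_bw + delta_bw)
		return cur_msr_val + bw_gran;	/* relax the throttle */
	return cur_msr_val;			/* inside the hysteresis band */
}

int main(void)
{
	unsigned int v;

	/* Numbers from the comment: cur_bw 90, user_bw 100, delta_bw 20. */
	v = next_ctrl_val(50, 10, 10, 90, 100, 20);
	printf("cur_bw=90  user_bw=100 delta_bw=20 -> ctrl stays at %u\n", v);

	v = next_ctrl_val(50, 10, 10, 60, 100, 20);
	printf("cur_bw=60  user_bw=100 delta_bw=20 -> ctrl raised to %u\n", v);

	v = next_ctrl_val(50, 10, 10, 120, 100, 20);
	printf("cur_bw=120 user_bw=100             -> ctrl lowered to %u\n", v);
	return 0;
}

Without the delta_bw term, the first case would be treated as "below the user limit" and the control value would climb, overshoot, be pulled back down, and oscillate around the limit on every 1s tick.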
