author		Vikas Shivappa <vikas.shivappa@linux.intel.com>	2018-04-20 18:36:21 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2018-05-19 07:16:44 -0400
commit		de73f38f768021610bd305cf74ef3702fcf6a1eb (patch)
tree		e72107bd73f857e91041e41bebd4b446a93fde06
parent		ba0f26d8529c2dfc9aa6d9e8a338180737f8c1be (diff)
x86/intel_rdt/mba_sc: Feedback loop to dynamically update mem bandwidth
mba_sc is a feedback loop where we periodically read MBM counters and try to
restrict the bandwidth below a max value so that the following always holds:

    "current bandwidth (cur_bw) < user specified bandwidth (user_bw)"

The frequency of these checks is currently 1s and we just tag along the MBM
overflow timer to do the updates. Doing it once per second also makes the
calculation of bandwidth easy. The step of increase or decrease of bandwidth
is the minimum granularity specified by the hardware.

Although MBA's goal is to restrict the bandwidth below a maximum, there may
be a need to even increase the bandwidth. Since MBA controls the L2 external
bandwidth whereas MBM measures the L3 external bandwidth, we may end up
restricting some rdtgroups unnecessarily. This may happen in the following
sequence: an rdtgroup (set of jobs) had high "L3 <-> memory" traffic in its
initial phases -> mba_sc kicks in and reduces the bandwidth percentage
values -> but after some time it has mostly "L2 <-> L3" traffic. In this
scenario mba_sc increases the bandwidth percentage when there is less memory
traffic.

Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: ravi.v.shankar@intel.com
Cc: tony.luck@intel.com
Cc: fenghua.yu@intel.com
Cc: vikas.shivappa@intel.com
Cc: ak@linux.intel.com
Cc: hpa@zytor.com
Link: https://lkml.kernel.org/r/1524263781-14267-7-git-send-email-vikas.shivappa@linux.intel.com
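For illustration, the decision rule described above can be sketched as a
minimal, compilable C fragment. This is not kernel code: mba_sc_step,
struct mba_sample, bw_gran and min_bw are hypothetical stand-ins mirroring
the fields the patch uses in update_mba_bw() further down.

    /*
     * Standalone sketch of the mba_sc decision rule: throttle harder by one
     * hardware granularity step when the measured bandwidth exceeds the user
     * limit, and relax by one step only when there is headroom even after
     * the expected delta_bw gain.
     */
    #include <stdbool.h>

    #define MAX_MBA_BW 100u         /* throttle value is a percentage */

    struct mba_sample {
            unsigned int cur_bw;    /* bandwidth measured over the last second, MBps */
            unsigned int user_bw;   /* bandwidth requested by the user, MBps */
            unsigned int delta_bw;  /* expected gain from one +bw_gran step, MBps */
    };

    /* Returns true and fills *new_val when the throttle value should change. */
    bool mba_sc_step(const struct mba_sample *s, unsigned int cur_val,
                     unsigned int bw_gran, unsigned int min_bw,
                     unsigned int *new_val)
    {
            if (cur_val > min_bw && s->user_bw < s->cur_bw) {
                    /* Over the limit: reduce the percentage by one step. */
                    *new_val = cur_val - bw_gran;
                    return true;
            }
            if (cur_val < MAX_MBA_BW && s->user_bw > s->cur_bw + s->delta_bw) {
                    /*
                     * Room to grow even after the expected delta_bw gain:
                     * increase by one step. Checking delta_bw (instead of
                     * just cur_bw < user_bw) avoids oscillating around
                     * user_bw when a single step overshoots the limit.
                     */
                    *new_val = cur_val + bw_gran;
                    return true;
            }
            return false;   /* within band, leave the current value alone */
    }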
-rw-r--r--	arch/x86/kernel/cpu/intel_rdt.c		3
-rw-r--r--	arch/x86/kernel/cpu/intel_rdt.h		2
-rw-r--r--	arch/x86/kernel/cpu/intel_rdt_monitor.c	126
3 files changed, 128 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index ad03d975883e..24bfa63e86cf 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -33,7 +33,6 @@
 #include <asm/intel_rdt_sched.h>
 #include "intel_rdt.h"
 
-#define MAX_MBA_BW		100u
 #define MBA_IS_LINEAR		0x4
 #define MBA_MAX_MBPS		U32_MAX
 
@@ -350,7 +349,7 @@ static int get_cache_id(int cpu, int level)
  * that can be written to QOS_MSRs.
  * There are currently no SKUs which support non linear delay values.
  */
-static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
 {
 	if (r->membw.delay_linear)
 		return MAX_MBA_BW - bw;
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 66a0ba37a8a3..39752825e376 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -28,6 +28,7 @@
 
 #define MBM_CNTR_WIDTH			24
 #define MBM_OVERFLOW_INTERVAL		1000
+#define MAX_MBA_BW			100u
 
 #define RMID_VAL_ERROR			BIT_ULL(63)
 #define RMID_VAL_UNAVAIL		BIT_ULL(62)
@@ -461,6 +462,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom,
 void mbm_handle_overflow(struct work_struct *work);
 bool is_mba_sc(struct rdt_resource *r);
 void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
 void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
 void cqm_handle_limbo(struct work_struct *work);
 bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index 7690402c42b7..b0f3aed76b75 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -329,6 +329,118 @@ void mon_event_count(void *info)
 	}
 }
 
+/*
+ * Feedback loop for MBA software controller (mba_sc)
+ *
+ * mba_sc is a feedback loop where we periodically read MBM counters and
+ * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
+ * that:
+ *
+ *   current bandwidth (cur_bw) < user specified bandwidth (user_bw)
+ *
+ * This uses the MBM counters to measure the bandwidth and MBA throttle
+ * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
+ * fact that resctrl rdtgroups have both monitoring and control.
+ *
+ * The frequency of the checks is 1s and we just tag along the MBM overflow
+ * timer. Having a 1s interval makes the calculation of bandwidth simpler.
+ *
+ * Although MBA's goal is to restrict the bandwidth to a maximum, there may
+ * be a need to increase the bandwidth to avoid unnecessarily restricting
+ * the L2 <-> L3 traffic.
+ *
+ * Since MBA controls the L2 external bandwidth whereas MBM measures the
+ * L3 external bandwidth, the following sequence could lead to such a
+ * situation.
+ *
+ * Consider an rdtgroup which had high L3 <-> memory traffic in initial
+ * phases -> mba_sc kicks in and reduces the bandwidth percentage values ->
+ * but after some time the rdtgroup has mostly L2 <-> L3 traffic.
+ *
+ * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
+ * throttle MSRs already have low percentage values. To avoid
+ * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
+ */
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
+{
+	u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
+	struct mbm_state *pmbm_data, *cmbm_data;
+	u32 cur_bw, delta_bw, user_bw;
+	struct rdt_resource *r_mba;
+	struct rdt_domain *dom_mba;
+	struct list_head *head;
+	struct rdtgroup *entry;
+
+	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+	closid = rgrp->closid;
+	rmid = rgrp->mon.rmid;
+	pmbm_data = &dom_mbm->mbm_local[rmid];
+
+	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
+	if (!dom_mba) {
+		pr_warn_once("Failure to get domain for MBA update\n");
+		return;
+	}
+
+	cur_bw = pmbm_data->prev_bw;
+	user_bw = dom_mba->mbps_val[closid];
+	delta_bw = pmbm_data->delta_bw;
+	cur_msr_val = dom_mba->ctrl_val[closid];
+
+	/*
+	 * For Ctrl groups read data from child monitor groups.
+	 */
+	head = &rgrp->mon.crdtgrp_list;
+	list_for_each_entry(entry, head, mon.crdtgrp_list) {
+		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cur_bw += cmbm_data->prev_bw;
+		delta_bw += cmbm_data->delta_bw;
+	}
+
+	/*
+	 * Scale up/down the bandwidth linearly for the ctrl group. The
+	 * bandwidth step is the bandwidth granularity specified by the
+	 * hardware.
+	 *
+	 * The delta_bw is used when increasing the bandwidth so that we
+	 * don't alternately increase and decrease the control values
+	 * continuously.
+	 *
+	 * For example: consider cur_bw = 90MBps, user_bw = 100MBps and a
+	 * bandwidth step of 20MBps (> user_bw - cur_bw); we would keep
+	 * switching between 90 and 110 continuously if we only checked
+	 * cur_bw < user_bw.
+	 */
+	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
+		new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
+	} else if (cur_msr_val < MAX_MBA_BW &&
+		   (user_bw > (cur_bw + delta_bw))) {
+		new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
+	} else {
+		return;
+	}
+
+	cur_msr = r_mba->msr_base + closid;
+	wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
+	dom_mba->ctrl_val[closid] = new_msr_val;
+
+	/*
+	 * Delta values are updated dynamically, package wise, for each
+	 * rdtgrp every time the throttle MSR changes value.
+	 *
+	 * This is because (1) the increase in bandwidth is not perfectly
+	 * linear and only "approximately" linear even when the hardware
+	 * says it is linear, and (2) since MBA is a core specific
+	 * mechanism, the delta values vary based on the number of cores
+	 * used by the rdtgrp.
+	 */
+	pmbm_data->delta_comp = true;
+	list_for_each_entry(entry, head, mon.crdtgrp_list) {
+		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cmbm_data->delta_comp = true;
+	}
+}
+
 static void mbm_update(struct rdt_domain *d, int rmid)
 {
 	struct rmid_read rr;
@@ -346,7 +458,16 @@ static void mbm_update(struct rdt_domain *d, int rmid)
 	}
 	if (is_mbm_local_enabled()) {
 		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
-		__mon_event_count(rmid, &rr);
+
+		/*
+		 * Call the MBA software controller only for the
+		 * control groups and when user has enabled
+		 * the software controller explicitly.
+		 */
+		if (!is_mba_sc(NULL))
+			__mon_event_count(rmid, &rr);
+		else
+			mbm_bw_count(rmid, &rr);
 	}
 }
 
@@ -417,6 +538,9 @@ void mbm_handle_overflow(struct work_struct *work)
 		head = &prgrp->mon.crdtgrp_list;
 		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
 			mbm_update(d, crgrp->mon.rmid);
+
+		if (is_mba_sc(NULL))
+			update_mba_bw(prgrp, d);
 	}
 
 	schedule_delayed_work_on(cpu, &d->mbm_over, delay);