author     Linus Torvalds <torvalds@linux-foundation.org>   2018-06-05 00:34:39 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2018-06-05 00:34:39 -0400
commit     ab20fd0013cd086230bb39344918f5b6eb41c4ad (patch)
tree       1828996b036347cb7a49ae89c9ab4b8a55df4716
parent     ba252f16e4433e7599fec986e77722e6d0eed186 (diff)
parent     de73f38f768021610bd305cf74ef3702fcf6a1eb (diff)
Merge branch 'x86-cache-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 cache resource controller updates from Thomas Gleixner:
 "An update for the Intel Resource Director Technology (RDT) which adds
  a feedback-driven software controller to adjust the bandwidth
  allocation MSRs at runtime. This makes the allocations more accurate
  and allows the use of bandwidth values in understandable units (MB/s)
  instead of the percentage-based allocations of the original, still
  available, interface. The software controller can be enabled with a
  new mount option for the resctrl filesystem"

* 'x86-cache-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/intel_rdt/mba_sc: Feedback loop to dynamically update mem bandwidth
  x86/intel_rdt/mba_sc: Prepare for feedback loop
  x86/intel_rdt/mba_sc: Add schemata support
  x86/intel_rdt/mba_sc: Add initialization support
  x86/intel_rdt/mba_sc: Enable/disable MBA software controller
  x86/intel_rdt/mba_sc: Documentation for MBA software controller(mba_sc)
-rw-r--r--  Documentation/x86/intel_rdt_ui.txt           |  75
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt.c              |  50
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt.h              |  18
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c  |  24
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt_monitor.c      | 170
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt_rdtgroup.c     |  33
6 files changed, 337 insertions(+), 33 deletions(-)
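As a quick orientation before the diff, here is a minimal userspace usage sketch (not part of the patch), assuming a kernel with this series applied. The mount option name ("mba_MBps") and the MB:<id>=<MBps> schemata format are taken from the documentation changes below; the group name "p0" and the bandwidth values are illustrative only.

/*
 * Hypothetical usage sketch, not from the patch: mount resctrl with the
 * mba_MBps option and give one control group a bandwidth cap in MBps.
 * Error handling is minimal; "p0" and the MBps values are made up.
 */
#include <errno.h>
#include <stdio.h>
#include <sys/mount.h>
#include <sys/stat.h>

int main(void)
{
	FILE *f;

	/* "mba_MBps" enables the MBA software controller (mba_sc). */
	if (mount("resctrl", "/sys/fs/resctrl", "resctrl", 0, "mba_MBps"))
		return 1;

	/* resctrl control groups are created with mkdir. */
	if (mkdir("/sys/fs/resctrl/p0", 0755) && errno != EEXIST)
		return 1;

	/* With mba_sc enabled the MB schemata line takes MBps values. */
	f = fopen("/sys/fs/resctrl/p0/schemata", "w");
	if (!f)
		return 1;
	fprintf(f, "MB:0=1024;1=500\n");
	return fclose(f) ? 1 : 0;
}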
diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt
index 71c30984e94d..a16aa2113840 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -17,12 +17,14 @@ MBA (Memory Bandwidth Allocation) - "mba"
17 17
18To use the feature mount the file system: 18To use the feature mount the file system:
19 19
20 # mount -t resctrl resctrl [-o cdp[,cdpl2]] /sys/fs/resctrl 20 # mount -t resctrl resctrl [-o cdp[,cdpl2][,mba_MBps]] /sys/fs/resctrl
21 21
22mount options are: 22mount options are:
23 23
24"cdp": Enable code/data prioritization in L3 cache allocations. 24"cdp": Enable code/data prioritization in L3 cache allocations.
25"cdpl2": Enable code/data prioritization in L2 cache allocations. 25"cdpl2": Enable code/data prioritization in L2 cache allocations.
26"mba_MBps": Enable the MBA Software Controller(mba_sc) to specify MBA
27 bandwidth in MBps
26 28
27L2 and L3 CDP are controlled separately. 29L2 and L3 CDP are controlled separately.
28 30
@@ -270,10 +272,11 @@ and 0xA are not. On a system with a 20-bit mask each bit represents 5%
270of the capacity of the cache. You could partition the cache into four 272of the capacity of the cache. You could partition the cache into four
271equal parts with masks: 0x1f, 0x3e0, 0x7c00, 0xf8000. 273equal parts with masks: 0x1f, 0x3e0, 0x7c00, 0xf8000.
272 274
273Memory bandwidth(b/w) percentage 275Memory bandwidth Allocation and monitoring
274-------------------------------- 276------------------------------------------
275For Memory b/w resource, user controls the resource by indicating the 277
276percentage of total memory b/w. 278For Memory bandwidth resource, by default the user controls the resource
279by indicating the percentage of total memory bandwidth.
277 280
278The minimum bandwidth percentage value for each cpu model is predefined 281The minimum bandwidth percentage value for each cpu model is predefined
279and can be looked up through "info/MB/min_bandwidth". The bandwidth 282and can be looked up through "info/MB/min_bandwidth". The bandwidth
@@ -285,7 +288,47 @@ to the next control step available on the hardware.
285The bandwidth throttling is a core specific mechanism on some of Intel 288The bandwidth throttling is a core specific mechanism on some of Intel
286SKUs. Using a high bandwidth and a low bandwidth setting on two threads 289SKUs. Using a high bandwidth and a low bandwidth setting on two threads
287sharing a core will result in both threads being throttled to use the 290sharing a core will result in both threads being throttled to use the
288low bandwidth. 291low bandwidth. The fact that Memory bandwidth allocation(MBA) is a core
292specific mechanism whereas memory bandwidth monitoring(MBM) is done at
293the package level may lead to confusion when users try to apply control
294via the MBA and then monitor the bandwidth to see if the controls are
295effective. Below are such scenarios:
296
2971. User may *not* see an increase in actual bandwidth when percentage
298 values are increased:
299
300This can occur when aggregate L2 external bandwidth is more than L3
301external bandwidth. Consider an SKL SKU with 24 cores on a package and
302where L2 external is 10GBps (hence aggregate L2 external bandwidth is
303240GBps) and L3 external bandwidth is 100GBps. Now a workload with '20
304threads, having 50% bandwidth, each consuming 5GBps' consumes the max L3
305bandwidth of 100GBps although the percentage value specified is only 50%
306<< 100%. Hence increasing the bandwidth percentage will not yield any
307more bandwidth. This is because although the L2 external bandwidth still
308has capacity, the L3 external bandwidth is fully used. Also note that
309this would be dependent on number of cores the benchmark is run on.
310
3112. Same bandwidth percentage may mean different actual bandwidth
312 depending on # of threads:
313
314For the same SKU in #1, a 'single thread, with 10% bandwidth' and '4
315thread, with 10% bandwidth' can consume up to 10GBps and 40GBps although
316they have same percentage bandwidth of 10%. This is simply because as
317threads start using more cores in an rdtgroup, the actual bandwidth may
318increase or vary although the user specified bandwidth percentage is the same.
319
320In order to mitigate this and make the interface more user friendly,
321resctrl added support for specifying the bandwidth in MBps as well. The
322kernel underneath would use a software feedback mechanism or a "Software
323Controller(mba_sc)" which reads the actual bandwidth using MBM counters
324and adjusts the memory bandwidth percentages to ensure
325
326 "actual bandwidth < user specified bandwidth".
327
328By default, the schemata would take the bandwidth percentage values
329whereas the user can switch to the "MBA software controller" mode using
330a mount option 'mba_MBps'. The schemata format is specified in the below
331sections.
289 332
290L3 schemata file details (code and data prioritization disabled) 333L3 schemata file details (code and data prioritization disabled)
291---------------------------------------------------------------- 334----------------------------------------------------------------
@@ -308,13 +351,20 @@ schemata format is always:
308 351
309 L2:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... 352 L2:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
310 353
311Memory b/w Allocation details 354Memory bandwidth Allocation (default mode)
312----------------------------- 355------------------------------------------
313 356
314Memory b/w domain is L3 cache. 357Memory b/w domain is L3 cache.
315 358
316 MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;... 359 MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;...
317 360
361Memory bandwidth Allocation specified in MBps
362---------------------------------------------
363
364Memory bandwidth domain is L3 cache.
365
366 MB:<cache_id0>=bw_MBps0;<cache_id1>=bw_MBps1;...
367
318Reading/writing the schemata file 368Reading/writing the schemata file
319--------------------------------- 369---------------------------------
320Reading the schemata file will show the state of all resources 370Reading the schemata file will show the state of all resources
@@ -358,6 +408,15 @@ allocations can overlap or not. The allocations specifies the maximum
358b/w that the group may be able to use and the system admin can configure 408b/w that the group may be able to use and the system admin can configure
359the b/w accordingly. 409the b/w accordingly.
360 410
411If the MBA is specified in MBps (megabytes per second) then the user can enter the max b/w in MBps
412rather than the percentage values.
413
414# echo "L3:0=3;1=c\nMB:0=1024;1=500" > /sys/fs/resctrl/p0/schemata
415# echo "L3:0=3;1=3\nMB:0=1024;1=500" > /sys/fs/resctrl/p1/schemata
416
417In the above example the tasks in "p1" and "p0" on socket 0 would use a max b/w
418of 1024MBps whereas on socket 1 they would use 500MBps.
419
361Example 2 420Example 2
362--------- 421---------
363Again two sockets, but this time with a more realistic 20-bit mask. 422Again two sockets, but this time with a more realistic 20-bit mask.
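The documentation above states the controller's goal ("actual bandwidth < user specified bandwidth") without showing the control step. Below is an illustrative-only C sketch of the per-interval decision this implies; the function name and parameters are invented for illustration, and the real logic is update_mba_bw() in arch/x86/kernel/cpu/intel_rdt_monitor.c further down in this diff.

/*
 * Simplified sketch of one mba_sc feedback step; names are illustrative.
 * The measured MBps is compared against the user's MBps target and the
 * percentage throttle value is nudged by one hardware granularity step.
 */
unsigned int mba_sc_next_pct(unsigned int cur_pct, unsigned int cur_mbps,
			     unsigned int user_mbps, unsigned int delta_mbps,
			     unsigned int min_pct, unsigned int gran_pct)
{
	/* Over the user's limit: throttle harder. */
	if (cur_pct > min_pct && cur_mbps > user_mbps)
		return cur_pct - gran_pct;

	/* Clearly under the limit (beyond the measured delta): relax. */
	if (cur_pct < 100 && user_mbps > cur_mbps + delta_mbps)
		return cur_pct + gran_pct;

	/* Inside the hysteresis band: keep the current throttle value. */
	return cur_pct;
}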
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 589b948e6e01..24bfa63e86cf 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -33,8 +33,8 @@
33#include <asm/intel_rdt_sched.h> 33#include <asm/intel_rdt_sched.h>
34#include "intel_rdt.h" 34#include "intel_rdt.h"
35 35
36#define MAX_MBA_BW 100u
37#define MBA_IS_LINEAR 0x4 36#define MBA_IS_LINEAR 0x4
37#define MBA_MAX_MBPS U32_MAX
38 38
39/* Mutex to protect rdtgroup access. */ 39/* Mutex to protect rdtgroup access. */
40DEFINE_MUTEX(rdtgroup_mutex); 40DEFINE_MUTEX(rdtgroup_mutex);
@@ -178,7 +178,7 @@ struct rdt_resource rdt_resources_all[] = {
178 .msr_update = mba_wrmsr, 178 .msr_update = mba_wrmsr,
179 .cache_level = 3, 179 .cache_level = 3,
180 .parse_ctrlval = parse_bw, 180 .parse_ctrlval = parse_bw,
181 .format_str = "%d=%*d", 181 .format_str = "%d=%*u",
182 .fflags = RFTYPE_RES_MB, 182 .fflags = RFTYPE_RES_MB,
183 }, 183 },
184}; 184};
@@ -230,6 +230,14 @@ static inline void cache_alloc_hsw_probe(void)
230 rdt_alloc_capable = true; 230 rdt_alloc_capable = true;
231} 231}
232 232
233bool is_mba_sc(struct rdt_resource *r)
234{
235 if (!r)
236 return rdt_resources_all[RDT_RESOURCE_MBA].membw.mba_sc;
237
238 return r->membw.mba_sc;
239}
240
233/* 241/*
234 * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values 242 * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values
235 * exposed to user interface and the h/w understandable delay values. 243 * exposed to user interface and the h/w understandable delay values.
@@ -341,7 +349,7 @@ static int get_cache_id(int cpu, int level)
341 * that can be written to QOS_MSRs. 349 * that can be written to QOS_MSRs.
342 * There are currently no SKUs which support non linear delay values. 350 * There are currently no SKUs which support non linear delay values.
343 */ 351 */
344static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) 352u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
345{ 353{
346 if (r->membw.delay_linear) 354 if (r->membw.delay_linear)
347 return MAX_MBA_BW - bw; 355 return MAX_MBA_BW - bw;
@@ -431,25 +439,40 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
431 return NULL; 439 return NULL;
432} 440}
433 441
442void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
443{
444 int i;
445
446 /*
447 * Initialize the Control MSRs to having no control.
448 * For Cache Allocation: Set all bits in cbm
449 * For Memory Allocation: Set b/w requested to 100%
450 * and the bandwidth in MBps to U32_MAX
451 */
452 for (i = 0; i < r->num_closid; i++, dc++, dm++) {
453 *dc = r->default_ctrl;
454 *dm = MBA_MAX_MBPS;
455 }
456}
457
434static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) 458static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
435{ 459{
436 struct msr_param m; 460 struct msr_param m;
437 u32 *dc; 461 u32 *dc, *dm;
438 int i;
439 462
440 dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL); 463 dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL);
441 if (!dc) 464 if (!dc)
442 return -ENOMEM; 465 return -ENOMEM;
443 466
444 d->ctrl_val = dc; 467 dm = kmalloc_array(r->num_closid, sizeof(*d->mbps_val), GFP_KERNEL);
468 if (!dm) {
469 kfree(dc);
470 return -ENOMEM;
471 }
445 472
446 /* 473 d->ctrl_val = dc;
447 * Initialize the Control MSRs to having no control. 474 d->mbps_val = dm;
448 * For Cache Allocation: Set all bits in cbm 475 setup_default_ctrlval(r, dc, dm);
449 * For Memory Allocation: Set b/w requested to 100
450 */
451 for (i = 0; i < r->num_closid; i++, dc++)
452 *dc = r->default_ctrl;
453 476
454 m.low = 0; 477 m.low = 0;
455 m.high = r->num_closid; 478 m.high = r->num_closid;
@@ -588,6 +611,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
588 } 611 }
589 612
590 kfree(d->ctrl_val); 613 kfree(d->ctrl_val);
614 kfree(d->mbps_val);
591 kfree(d->rmid_busy_llc); 615 kfree(d->rmid_busy_llc);
592 kfree(d->mbm_total); 616 kfree(d->mbm_total);
593 kfree(d->mbm_local); 617 kfree(d->mbm_local);
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 3fd7a70ee04a..39752825e376 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -28,6 +28,7 @@
28 28
29#define MBM_CNTR_WIDTH 24 29#define MBM_CNTR_WIDTH 24
30#define MBM_OVERFLOW_INTERVAL 1000 30#define MBM_OVERFLOW_INTERVAL 1000
31#define MAX_MBA_BW 100u
31 32
32#define RMID_VAL_ERROR BIT_ULL(63) 33#define RMID_VAL_ERROR BIT_ULL(63)
33#define RMID_VAL_UNAVAIL BIT_ULL(62) 34#define RMID_VAL_UNAVAIL BIT_ULL(62)
@@ -180,10 +181,20 @@ struct rftype {
180 * struct mbm_state - status for each MBM counter in each domain 181 * struct mbm_state - status for each MBM counter in each domain
181 * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) 182 * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
182 * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it 183 * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it
184 * @chunks_bw Total local data moved. Used for bandwidth calculation
185 * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting
186 * @prev_bw The most recent bandwidth in MBps
187 * @delta_bw Difference between the current and previous bandwidth
188 * @delta_comp Indicates whether to compute the delta_bw
183 */ 189 */
184struct mbm_state { 190struct mbm_state {
185 u64 chunks; 191 u64 chunks;
186 u64 prev_msr; 192 u64 prev_msr;
193 u64 chunks_bw;
194 u64 prev_bw_msr;
195 u32 prev_bw;
196 u32 delta_bw;
197 bool delta_comp;
187}; 198};
188 199
189/** 200/**
@@ -202,6 +213,7 @@ struct mbm_state {
202 * @cqm_work_cpu: 213 * @cqm_work_cpu:
203 * worker cpu for CQM h/w counters 214 * worker cpu for CQM h/w counters
204 * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) 215 * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
216 * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps
205 * @new_ctrl: new ctrl value to be loaded 217 * @new_ctrl: new ctrl value to be loaded
206 * @have_new_ctrl: did user provide new_ctrl for this domain 218 * @have_new_ctrl: did user provide new_ctrl for this domain
207 */ 219 */
@@ -217,6 +229,7 @@ struct rdt_domain {
217 int mbm_work_cpu; 229 int mbm_work_cpu;
218 int cqm_work_cpu; 230 int cqm_work_cpu;
219 u32 *ctrl_val; 231 u32 *ctrl_val;
232 u32 *mbps_val;
220 u32 new_ctrl; 233 u32 new_ctrl;
221 bool have_new_ctrl; 234 bool have_new_ctrl;
222}; 235};
@@ -259,6 +272,7 @@ struct rdt_cache {
259 * @min_bw: Minimum memory bandwidth percentage user can request 272 * @min_bw: Minimum memory bandwidth percentage user can request
260 * @bw_gran: Granularity at which the memory bandwidth is allocated 273 * @bw_gran: Granularity at which the memory bandwidth is allocated
261 * @delay_linear: True if memory B/W delay is in linear scale 274 * @delay_linear: True if memory B/W delay is in linear scale
275 * @mba_sc: True if MBA software controller(mba_sc) is enabled
262 * @mb_map: Mapping of memory B/W percentage to memory B/W delay 276 * @mb_map: Mapping of memory B/W percentage to memory B/W delay
263 */ 277 */
264struct rdt_membw { 278struct rdt_membw {
@@ -266,6 +280,7 @@ struct rdt_membw {
266 u32 min_bw; 280 u32 min_bw;
267 u32 bw_gran; 281 u32 bw_gran;
268 u32 delay_linear; 282 u32 delay_linear;
283 bool mba_sc;
269 u32 *mb_map; 284 u32 *mb_map;
270}; 285};
271 286
@@ -445,6 +460,9 @@ void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
445void mbm_setup_overflow_handler(struct rdt_domain *dom, 460void mbm_setup_overflow_handler(struct rdt_domain *dom,
446 unsigned long delay_ms); 461 unsigned long delay_ms);
447void mbm_handle_overflow(struct work_struct *work); 462void mbm_handle_overflow(struct work_struct *work);
463bool is_mba_sc(struct rdt_resource *r);
464void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
465u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
448void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); 466void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
449void cqm_handle_limbo(struct work_struct *work); 467void cqm_handle_limbo(struct work_struct *work);
450bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); 468bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
index 23e1d5c249c6..116d57b248d3 100644
--- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -53,7 +53,8 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
53 return false; 53 return false;
54 } 54 }
55 55
56 if (bw < r->membw.min_bw || bw > r->default_ctrl) { 56 if ((bw < r->membw.min_bw || bw > r->default_ctrl) &&
57 !is_mba_sc(r)) {
57 rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, 58 rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw,
58 r->membw.min_bw, r->default_ctrl); 59 r->membw.min_bw, r->default_ctrl);
59 return false; 60 return false;
@@ -179,6 +180,8 @@ static int update_domains(struct rdt_resource *r, int closid)
179 struct msr_param msr_param; 180 struct msr_param msr_param;
180 cpumask_var_t cpu_mask; 181 cpumask_var_t cpu_mask;
181 struct rdt_domain *d; 182 struct rdt_domain *d;
183 bool mba_sc;
184 u32 *dc;
182 int cpu; 185 int cpu;
183 186
184 if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) 187 if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
@@ -188,13 +191,20 @@ static int update_domains(struct rdt_resource *r, int closid)
188 msr_param.high = msr_param.low + 1; 191 msr_param.high = msr_param.low + 1;
189 msr_param.res = r; 192 msr_param.res = r;
190 193
194 mba_sc = is_mba_sc(r);
191 list_for_each_entry(d, &r->domains, list) { 195 list_for_each_entry(d, &r->domains, list) {
192 if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) { 196 dc = !mba_sc ? d->ctrl_val : d->mbps_val;
197 if (d->have_new_ctrl && d->new_ctrl != dc[closid]) {
193 cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); 198 cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
194 d->ctrl_val[closid] = d->new_ctrl; 199 dc[closid] = d->new_ctrl;
195 } 200 }
196 } 201 }
197 if (cpumask_empty(cpu_mask)) 202
203 /*
204 * Avoid writing the control msr with control values when
205 * MBA software controller is enabled
206 */
207 if (cpumask_empty(cpu_mask) || mba_sc)
198 goto done; 208 goto done;
199 cpu = get_cpu(); 209 cpu = get_cpu();
200 /* Update CBM on this cpu if it's in cpu_mask. */ 210 /* Update CBM on this cpu if it's in cpu_mask. */
@@ -282,13 +292,17 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
282{ 292{
283 struct rdt_domain *dom; 293 struct rdt_domain *dom;
284 bool sep = false; 294 bool sep = false;
295 u32 ctrl_val;
285 296
286 seq_printf(s, "%*s:", max_name_width, r->name); 297 seq_printf(s, "%*s:", max_name_width, r->name);
287 list_for_each_entry(dom, &r->domains, list) { 298 list_for_each_entry(dom, &r->domains, list) {
288 if (sep) 299 if (sep)
289 seq_puts(s, ";"); 300 seq_puts(s, ";");
301
302 ctrl_val = (!is_mba_sc(r) ? dom->ctrl_val[closid] :
303 dom->mbps_val[closid]);
290 seq_printf(s, r->format_str, dom->id, max_data_width, 304 seq_printf(s, r->format_str, dom->id, max_data_width,
291 dom->ctrl_val[closid]); 305 ctrl_val);
292 sep = true; 306 sep = true;
293 } 307 }
294 seq_puts(s, "\n"); 308 seq_puts(s, "\n");
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index 681450eee428..b0f3aed76b75 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -225,10 +225,18 @@ void free_rmid(u32 rmid)
225 list_add_tail(&entry->list, &rmid_free_lru); 225 list_add_tail(&entry->list, &rmid_free_lru);
226} 226}
227 227
228static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr)
229{
230 u64 shift = 64 - MBM_CNTR_WIDTH, chunks;
231
232 chunks = (cur_msr << shift) - (prev_msr << shift);
233 return chunks >>= shift;
234}
235
228static int __mon_event_count(u32 rmid, struct rmid_read *rr) 236static int __mon_event_count(u32 rmid, struct rmid_read *rr)
229{ 237{
230 u64 chunks, shift, tval;
231 struct mbm_state *m; 238 struct mbm_state *m;
239 u64 chunks, tval;
232 240
233 tval = __rmid_read(rmid, rr->evtid); 241 tval = __rmid_read(rmid, rr->evtid);
234 if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { 242 if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
@@ -254,14 +262,12 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
254 } 262 }
255 263
256 if (rr->first) { 264 if (rr->first) {
257 m->prev_msr = tval; 265 memset(m, 0, sizeof(struct mbm_state));
258 m->chunks = 0; 266 m->prev_bw_msr = m->prev_msr = tval;
259 return 0; 267 return 0;
260 } 268 }
261 269
262 shift = 64 - MBM_CNTR_WIDTH; 270 chunks = mbm_overflow_count(m->prev_msr, tval);
263 chunks = (tval << shift) - (m->prev_msr << shift);
264 chunks >>= shift;
265 m->chunks += chunks; 271 m->chunks += chunks;
266 m->prev_msr = tval; 272 m->prev_msr = tval;
267 273
@@ -270,6 +276,32 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
270} 276}
271 277
272/* 278/*
279 * Supporting function to calculate the memory bandwidth
280 * and delta bandwidth in MBps.
281 */
282static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
283{
284 struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
285 struct mbm_state *m = &rr->d->mbm_local[rmid];
286 u64 tval, cur_bw, chunks;
287
288 tval = __rmid_read(rmid, rr->evtid);
289 if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
290 return;
291
292 chunks = mbm_overflow_count(m->prev_bw_msr, tval);
293 m->chunks_bw += chunks;
294 m->chunks = m->chunks_bw;
295 cur_bw = (chunks * r->mon_scale) >> 20;
296
297 if (m->delta_comp)
298 m->delta_bw = abs(cur_bw - m->prev_bw);
299 m->delta_comp = false;
300 m->prev_bw = cur_bw;
301 m->prev_bw_msr = tval;
302}
303
304/*
273 * This is called via IPI to read the CQM/MBM counters 305 * This is called via IPI to read the CQM/MBM counters
274 * on a domain. 306 * on a domain.
275 */ 307 */
@@ -297,6 +329,118 @@ void mon_event_count(void *info)
297 } 329 }
298} 330}
299 331
332/*
333 * Feedback loop for MBA software controller (mba_sc)
334 *
335 * mba_sc is a feedback loop where we periodically read MBM counters and
336 * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
337 * that:
338 *
339 * current bandwidth(cur_bw) < user specified bandwidth(user_bw)
340 *
341 * This uses the MBM counters to measure the bandwidth and MBA throttle
342 * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
343 * fact that resctrl rdtgroups have both monitoring and control.
344 *
345 * The frequency of the checks is 1s and we just tag along the MBM overflow
346 * timer. Having 1s interval makes the calculation of bandwidth simpler.
347 *
348 * Although MBA's goal is to restrict the bandwidth to a maximum, there may
349 * be a need to increase the bandwidth to avoid unnecessarily restricting
350 * the L2 <-> L3 traffic.
351 *
352 * Since MBA controls the L2 external bandwidth whereas MBM measures the
353 * L3 external bandwidth the following sequence could lead to such a
354 * situation.
355 *
356 * Consider an rdtgroup which had high L3 <-> memory traffic in initial
357 * phases -> mba_sc kicks in and reduces the bandwidth percentage values -> but
358 * after some time rdtgroup has mostly L2 <-> L3 traffic.
359 *
360 * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
361 * throttle MSRs already have low percentage values. To avoid
362 * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
363 */
364static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
365{
366 u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
367 struct mbm_state *pmbm_data, *cmbm_data;
368 u32 cur_bw, delta_bw, user_bw;
369 struct rdt_resource *r_mba;
370 struct rdt_domain *dom_mba;
371 struct list_head *head;
372 struct rdtgroup *entry;
373
374 r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
375 closid = rgrp->closid;
376 rmid = rgrp->mon.rmid;
377 pmbm_data = &dom_mbm->mbm_local[rmid];
378
379 dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
380 if (!dom_mba) {
381 pr_warn_once("Failure to get domain for MBA update\n");
382 return;
383 }
384
385 cur_bw = pmbm_data->prev_bw;
386 user_bw = dom_mba->mbps_val[closid];
387 delta_bw = pmbm_data->delta_bw;
388 cur_msr_val = dom_mba->ctrl_val[closid];
389
390 /*
391 * For Ctrl groups read data from child monitor groups.
392 */
393 head = &rgrp->mon.crdtgrp_list;
394 list_for_each_entry(entry, head, mon.crdtgrp_list) {
395 cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
396 cur_bw += cmbm_data->prev_bw;
397 delta_bw += cmbm_data->delta_bw;
398 }
399
400 /*
401 * Scale up/down the bandwidth linearly for the ctrl group. The
402 * bandwidth step is the bandwidth granularity specified by the
403 * hardware.
404 *
405 * The delta_bw is used when increasing the bandwidth so that we
406 * don't alternately increase and decrease the control values
407 * continuously.
408 *
409 * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if
410 * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep
411 * switching between 90 and 110 continuously if we only check
412 * cur_bw < user_bw.
413 */
414 if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
415 new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
416 } else if (cur_msr_val < MAX_MBA_BW &&
417 (user_bw > (cur_bw + delta_bw))) {
418 new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
419 } else {
420 return;
421 }
422
423 cur_msr = r_mba->msr_base + closid;
424 wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
425 dom_mba->ctrl_val[closid] = new_msr_val;
426
427 /*
428 * Delta values are updated dynamically package wise for each
429 * rdtgrp every time the throttle MSR changes value.
430 *
431 * This is because (1) the increase in bandwidth is not perfectly
432 * linear and only "approximately" linear even when the hardware
433 * says it is linear. (2) Also, since MBA is a core-specific
434 * mechanism, the delta values vary based on number of cores used
435 * by the rdtgrp.
436 */
437 pmbm_data->delta_comp = true;
438 list_for_each_entry(entry, head, mon.crdtgrp_list) {
439 cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
440 cmbm_data->delta_comp = true;
441 }
442}
443
300static void mbm_update(struct rdt_domain *d, int rmid) 444static void mbm_update(struct rdt_domain *d, int rmid)
301{ 445{
302 struct rmid_read rr; 446 struct rmid_read rr;
@@ -314,7 +458,16 @@ static void mbm_update(struct rdt_domain *d, int rmid)
314 } 458 }
315 if (is_mbm_local_enabled()) { 459 if (is_mbm_local_enabled()) {
316 rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; 460 rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
317 __mon_event_count(rmid, &rr); 461
462 /*
463 * Call the MBA software controller only for the
464 * control groups and when user has enabled
465 * the software controller explicitly.
466 */
467 if (!is_mba_sc(NULL))
468 __mon_event_count(rmid, &rr);
469 else
470 mbm_bw_count(rmid, &rr);
318 } 471 }
319} 472}
320 473
@@ -385,6 +538,9 @@ void mbm_handle_overflow(struct work_struct *work)
385 head = &prgrp->mon.crdtgrp_list; 538 head = &prgrp->mon.crdtgrp_list;
386 list_for_each_entry(crgrp, head, mon.crdtgrp_list) 539 list_for_each_entry(crgrp, head, mon.crdtgrp_list)
387 mbm_update(d, crgrp->mon.rmid); 540 mbm_update(d, crgrp->mon.rmid);
541
542 if (is_mba_sc(NULL))
543 update_mba_bw(prgrp, d);
388 } 544 }
389 545
390 schedule_delayed_work_on(cpu, &d->mbm_over, delay); 546 schedule_delayed_work_on(cpu, &d->mbm_over, delay);
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index fca759d272a1..749856a2e736 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -1005,6 +1005,11 @@ static void l2_qos_cfg_update(void *arg)
1005 wrmsrl(IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL); 1005 wrmsrl(IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
1006} 1006}
1007 1007
1008static inline bool is_mba_linear(void)
1009{
1010 return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear;
1011}
1012
1008static int set_cache_qos_cfg(int level, bool enable) 1013static int set_cache_qos_cfg(int level, bool enable)
1009{ 1014{
1010 void (*update)(void *arg); 1015 void (*update)(void *arg);
@@ -1041,6 +1046,28 @@ static int set_cache_qos_cfg(int level, bool enable)
1041 return 0; 1046 return 0;
1042} 1047}
1043 1048
1049/*
1050 * Enable or disable the MBA software controller
1051 * which lets the user specify bandwidth in MBps.
1052 * MBA software controller is supported only if
1053 * MBM is supported and MBA is in linear scale.
1054 */
1055static int set_mba_sc(bool mba_sc)
1056{
1057 struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA];
1058 struct rdt_domain *d;
1059
1060 if (!is_mbm_enabled() || !is_mba_linear() ||
1061 mba_sc == is_mba_sc(r))
1062 return -EINVAL;
1063
1064 r->membw.mba_sc = mba_sc;
1065 list_for_each_entry(d, &r->domains, list)
1066 setup_default_ctrlval(r, d->ctrl_val, d->mbps_val);
1067
1068 return 0;
1069}
1070
1044static int cdp_enable(int level, int data_type, int code_type) 1071static int cdp_enable(int level, int data_type, int code_type)
1045{ 1072{
1046 struct rdt_resource *r_ldata = &rdt_resources_all[data_type]; 1073 struct rdt_resource *r_ldata = &rdt_resources_all[data_type];
@@ -1123,6 +1150,10 @@ static int parse_rdtgroupfs_options(char *data)
1123 ret = cdpl2_enable(); 1150 ret = cdpl2_enable();
1124 if (ret) 1151 if (ret)
1125 goto out; 1152 goto out;
1153 } else if (!strcmp(token, "mba_MBps")) {
1154 ret = set_mba_sc(true);
1155 if (ret)
1156 goto out;
1126 } else { 1157 } else {
1127 ret = -EINVAL; 1158 ret = -EINVAL;
1128 goto out; 1159 goto out;
@@ -1445,6 +1476,8 @@ static void rdt_kill_sb(struct super_block *sb)
1445 cpus_read_lock(); 1476 cpus_read_lock();
1446 mutex_lock(&rdtgroup_mutex); 1477 mutex_lock(&rdtgroup_mutex);
1447 1478
1479 set_mba_sc(false);
1480
1448 /*Put everything back to default values. */ 1481 /*Put everything back to default values. */
1449 for_each_alloc_enabled_rdt_resource(r) 1482 for_each_alloc_enabled_rdt_resource(r)
1450 reset_all_ctrls(r); 1483 reset_all_ctrls(r);