author		Linus Torvalds <torvalds@linux-foundation.org>	2018-06-05 00:34:39 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-06-05 00:34:39 -0400
commit		ab20fd0013cd086230bb39344918f5b6eb41c4ad (patch)
tree		1828996b036347cb7a49ae89c9ab4b8a55df4716
parent		ba252f16e4433e7599fec986e77722e6d0eed186 (diff)
parent		de73f38f768021610bd305cf74ef3702fcf6a1eb (diff)
Merge branch 'x86-cache-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 cache resource controller updates from Thomas Gleixner:
"An update for the Intel Resource Director Technolgy (RDT) which adds a
feedback driven software controller to runtime adjust the bandwidth
allocation MSRs.
This makes the allocations more accurate and allows to use bandwidth
values in understandable units (MB/s) instead of using percentage
based allocations as the original, still available, interface.
The software controller can be enabled with a new mount option for the
resctrl filesystem"
* 'x86-cache-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/intel_rdt/mba_sc: Feedback loop to dynamically update mem bandwidth
x86/intel_rdt/mba_sc: Prepare for feedback loop
x86/intel_rdt/mba_sc: Add schemata support
x86/intel_rdt/mba_sc: Add initialization support
x86/intel_rdt/mba_sc: Enable/disable MBA software controller
x86/intel_rdt/mba_sc: Documentation for MBA software controller(mba_sc)
 Documentation/x86/intel_rdt_ui.txt          |  75 +++++++-
 arch/x86/kernel/cpu/intel_rdt.c             |  50 ++++--
 arch/x86/kernel/cpu/intel_rdt.h             |  18 ++
 arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c |  24 ++-
 arch/x86/kernel/cpu/intel_rdt_monitor.c     | 170 +++++++++++++-
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c    |  33 ++-
 6 files changed, 337 insertions(+), 33 deletions(-)
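For orientation before the diffs: the new mode is driven entirely from the
resctrl mount. A minimal session pieced together from the documentation
changes below (the group name "p0" and the 1024 MBps value are illustrative):

	# mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl
	# mkdir /sys/fs/resctrl/p0
	# echo "MB:0=1024" > /sys/fs/resctrl/p0/schemata

With the mba_MBps option the MB: values in schemata are interpreted as MBps
rather than percentages, and the kernel's feedback loop adjusts the
underlying IA32_MBA_THRTL_MSRs to keep the measured bandwidth under the
requested value.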
diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt
index 71c30984e94d..a16aa2113840 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -17,12 +17,14 @@ MBA (Memory Bandwidth Allocation) - "mba"
 
 To use the feature mount the file system:
 
- # mount -t resctrl resctrl [-o cdp[,cdpl2]] /sys/fs/resctrl
+ # mount -t resctrl resctrl [-o cdp[,cdpl2][,mba_MBps]] /sys/fs/resctrl
 
 mount options are:
 
 "cdp": Enable code/data prioritization in L3 cache allocations.
 "cdpl2": Enable code/data prioritization in L2 cache allocations.
+"mba_MBps": Enable the MBA Software Controller(mba_sc) to specify MBA
+ bandwidth in MBps
 
 L2 and L3 CDP are controlled separately.
 
@@ -270,10 +272,11 @@ and 0xA are not. On a system with a 20-bit mask each bit represents 5%
 of the capacity of the cache. You could partition the cache into four
 equal parts with masks: 0x1f, 0x3e0, 0x7c00, 0xf8000.
 
-Memory bandwidth(b/w) percentage
---------------------------------
-For Memory b/w resource, user controls the resource by indicating the
-percentage of total memory b/w.
+Memory bandwidth Allocation and monitoring
+------------------------------------------
+
+For Memory bandwidth resource, by default the user controls the resource
+by indicating the percentage of total memory bandwidth.
 
 The minimum bandwidth percentage value for each cpu model is predefined
 and can be looked up through "info/MB/min_bandwidth". The bandwidth
@@ -285,7 +288,47 @@ to the next control step available on the hardware.
 The bandwidth throttling is a core specific mechanism on some of Intel
 SKUs. Using a high bandwidth and a low bandwidth setting on two threads
 sharing a core will result in both threads being throttled to use the
-low bandwidth.
+low bandwidth. The fact that Memory bandwidth allocation(MBA) is a core
+specific mechanism whereas memory bandwidth monitoring(MBM) is done at
+the package level may lead to confusion when users try to apply control
+via the MBA and then monitor the bandwidth to see if the controls are
+effective. Below are such scenarios:
+
+1. User may *not* see increase in actual bandwidth when percentage
+   values are increased:
+
+This can occur when aggregate L2 external bandwidth is more than L3
+external bandwidth. Consider an SKL SKU with 24 cores on a package and
+where L2 external is 10GBps (hence aggregate L2 external bandwidth is
+240GBps) and L3 external bandwidth is 100GBps. Now a workload with '20
+threads, having 50% bandwidth, each consuming 5GBps' consumes the max L3
+bandwidth of 100GBps although the percentage value specified is only 50%
+<< 100%. Hence increasing the bandwidth percentage will not yield any
+more bandwidth. This is because although the L2 external bandwidth still
+has capacity, the L3 external bandwidth is fully used. Also note that
+this would be dependent on the number of cores the benchmark is run on.
+
+2. Same bandwidth percentage may mean different actual bandwidth
+   depending on # of threads:
+
+For the same SKU in #1, a 'single thread, with 10% bandwidth' and '4
+thread, with 10% bandwidth' can consume up to 10GBps and 40GBps although
+they have the same percentage bandwidth of 10%. This is simply because as
+threads start using more cores in an rdtgroup, the actual bandwidth may
+increase or vary although the user specified bandwidth percentage is the same.
+
+In order to mitigate this and make the interface more user friendly,
+resctrl added support for specifying the bandwidth in MBps as well. The
+kernel underneath would use a software feedback mechanism or a "Software
+Controller(mba_sc)" which reads the actual bandwidth using MBM counters
+and adjusts the memory bandwidth percentages to ensure
+
+	"actual bandwidth < user specified bandwidth".
+
+By default, the schemata would take the bandwidth percentage values
+whereas the user can switch to the "MBA software controller" mode using
+a mount option 'mba_MBps'. The schemata format is specified in the below
+sections.
 
 L3 schemata file details (code and data prioritization disabled)
 ----------------------------------------------------------------
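The update rule that enforces "actual bandwidth < user specified bandwidth"
is implemented in update_mba_bw() in intel_rdt_monitor.c further down in
this merge. A minimal userspace-style sketch of that decision step; the
bw_gran/min_bw/max values here are illustrative, the kernel reads the real
ones from the hardware:

	#include <stdio.h>

	/* Sketch of the mba_sc feedback step; mirrors update_mba_bw(). */
	static unsigned int mba_sc_step(unsigned int cur_pct, unsigned int cur_bw,
					unsigned int user_bw, unsigned int delta_bw)
	{
		const unsigned int bw_gran = 10, min_bw = 10, max_bw = 100;

		if (cur_pct > min_bw && user_bw < cur_bw)
			return cur_pct - bw_gran;	/* overshooting: throttle harder */
		if (cur_pct < max_bw && user_bw > cur_bw + delta_bw)
			return cur_pct + bw_gran;	/* safely under target: relax */
		return cur_pct;				/* hold inside the hysteresis band */
	}

	int main(void)
	{
		printf("%u\n", mba_sc_step(50, 1200, 1000, 400)); /* 40: throttle down */
		printf("%u\n", mba_sc_step(50,  700, 1000, 400)); /* 50: hold, 700+400 > 1000 */
		printf("%u\n", mba_sc_step(50,  700, 1000, 200)); /* 60: room to relax */
		return 0;
	}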
@@ -308,13 +351,20 @@ schemata format is always:
 
 	L2:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
 
-Memory b/w Allocation details
------------------------------
+Memory bandwidth Allocation (default mode)
+------------------------------------------
 
 Memory b/w domain is L3 cache.
 
 	MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;...
 
+Memory bandwidth Allocation specified in MBps
+---------------------------------------------
+
+Memory bandwidth domain is L3 cache.
+
+	MB:<cache_id0>=bw_MBps0;<cache_id1>=bw_MBps1;...
+
 Reading/writing the schemata file
 ---------------------------------
 Reading the schemata file will show the state of all resources
@@ -358,6 +408,15 @@ allocations can overlap or not. The allocations specifies the maximum
 b/w that the group may be able to use and the system admin can configure
 the b/w accordingly.
 
+If the MBA is specified in MBps (megabytes per second) then the user can
+enter the max b/w in MBps rather than the percentage values.
+
+# echo "L3:0=3;1=c\nMB:0=1024;1=500" > /sys/fs/resctrl/p0/schemata
+# echo "L3:0=3;1=3\nMB:0=1024;1=500" > /sys/fs/resctrl/p1/schemata
+
+In the above example the tasks in "p1" and "p0" on socket 0 would use a max
+b/w of 1024MBps whereas on socket 1 they would use 500MBps.
+
 Example 2
 ---------
 Again two sockets, but this time with a more realistic 20-bit mask.
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 589b948e6e01..24bfa63e86cf 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -33,8 +33,8 @@
 #include <asm/intel_rdt_sched.h>
 #include "intel_rdt.h"
 
-#define MAX_MBA_BW	100u
 #define MBA_IS_LINEAR	0x4
+#define MBA_MAX_MBPS	U32_MAX
 
 /* Mutex to protect rdtgroup access. */
 DEFINE_MUTEX(rdtgroup_mutex);
@@ -178,7 +178,7 @@ struct rdt_resource rdt_resources_all[] = {
 		.msr_update = mba_wrmsr,
 		.cache_level = 3,
 		.parse_ctrlval = parse_bw,
-		.format_str = "%d=%*d",
+		.format_str = "%d=%*u",
 		.fflags = RFTYPE_RES_MB,
 	},
 };
@@ -230,6 +230,14 @@ static inline void cache_alloc_hsw_probe(void)
 	rdt_alloc_capable = true;
 }
 
+bool is_mba_sc(struct rdt_resource *r)
+{
+	if (!r)
+		return rdt_resources_all[RDT_RESOURCE_MBA].membw.mba_sc;
+
+	return r->membw.mba_sc;
+}
+
 /*
  * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values
  * exposed to user interface and the h/w understandable delay values.
@@ -341,7 +349,7 @@ static int get_cache_id(int cpu, int level)
  * that can be written to QOS_MSRs.
  * There are currently no SKUs which support non linear delay values.
  */
-static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
 {
 	if (r->membw.delay_linear)
 		return MAX_MBA_BW - bw;
@@ -431,25 +439,40 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
 	return NULL;
 }
 
+void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
+{
+	int i;
+
+	/*
+	 * Initialize the Control MSRs to having no control.
+	 * For Cache Allocation: Set all bits in cbm
+	 * For Memory Allocation: Set b/w requested to 100%
+	 * and the bandwidth in MBps to U32_MAX
+	 */
+	for (i = 0; i < r->num_closid; i++, dc++, dm++) {
+		*dc = r->default_ctrl;
+		*dm = MBA_MAX_MBPS;
+	}
+}
+
 static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
 {
 	struct msr_param m;
-	u32 *dc;
-	int i;
+	u32 *dc, *dm;
 
 	dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL);
 	if (!dc)
 		return -ENOMEM;
 
-	d->ctrl_val = dc;
+	dm = kmalloc_array(r->num_closid, sizeof(*d->mbps_val), GFP_KERNEL);
+	if (!dm) {
+		kfree(dc);
+		return -ENOMEM;
+	}
 
-	/*
-	 * Initialize the Control MSRs to having no control.
-	 * For Cache Allocation: Set all bits in cbm
-	 * For Memory Allocation: Set b/w requested to 100
-	 */
-	for (i = 0; i < r->num_closid; i++, dc++)
-		*dc = r->default_ctrl;
+	d->ctrl_val = dc;
+	d->mbps_val = dm;
+	setup_default_ctrlval(r, dc, dm);
 
 	m.low = 0;
 	m.high = r->num_closid;
@@ -588,6 +611,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
 	}
 
 	kfree(d->ctrl_val);
+	kfree(d->mbps_val);
 	kfree(d->rmid_busy_llc);
 	kfree(d->mbm_total);
 	kfree(d->mbm_local);
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 3fd7a70ee04a..39752825e376 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -28,6 +28,7 @@
 
 #define MBM_CNTR_WIDTH		24
 #define MBM_OVERFLOW_INTERVAL	1000
+#define MAX_MBA_BW		100u
 
 #define RMID_VAL_ERROR		BIT_ULL(63)
 #define RMID_VAL_UNAVAIL	BIT_ULL(62)
@@ -180,10 +181,20 @@ struct rftype {
  * struct mbm_state - status for each MBM counter in each domain
  * @chunks:	Total data moved (multiply by rdt_group.mon_scale to get bytes)
  * @prev_msr:	Value of IA32_QM_CTR for this RMID last time we read it
+ * @chunks_bw:	Total local data moved. Used for bandwidth calculation
+ * @prev_bw_msr: Value of previous IA32_QM_CTR for bandwidth counting
+ * @prev_bw:	The most recent bandwidth in MBps
+ * @delta_bw:	Difference between the current and previous bandwidth
+ * @delta_comp:	Indicates whether to compute the delta_bw
  */
 struct mbm_state {
 	u64 chunks;
 	u64 prev_msr;
+	u64 chunks_bw;
+	u64 prev_bw_msr;
+	u32 prev_bw;
+	u32 delta_bw;
+	bool delta_comp;
 };
 
 /**
@@ -202,6 +213,7 @@ struct mbm_state {
  * @cqm_work_cpu:
  *		worker cpu for CQM h/w counters
  * @ctrl_val:	array of cache or mem ctrl values (indexed by CLOSID)
+ * @mbps_val:	When mba_sc is enabled, this holds the bandwidth in MBps
  * @new_ctrl:	new ctrl value to be loaded
  * @have_new_ctrl: did user provide new_ctrl for this domain
  */
@@ -217,6 +229,7 @@ struct rdt_domain {
 	int mbm_work_cpu;
 	int cqm_work_cpu;
 	u32 *ctrl_val;
+	u32 *mbps_val;
 	u32 new_ctrl;
 	bool have_new_ctrl;
 };
@@ -259,6 +272,7 @@ struct rdt_cache {
 * @min_bw:	Minimum memory bandwidth percentage user can request
 * @bw_gran:	Granularity at which the memory bandwidth is allocated
 * @delay_linear: True if memory B/W delay is in linear scale
+* @mba_sc:	True if MBA software controller(mba_sc) is enabled
 * @mb_map:	Mapping of memory B/W percentage to memory B/W delay
 */
 struct rdt_membw {
@@ -266,6 +280,7 @@ struct rdt_membw {
 	u32 min_bw;
 	u32 bw_gran;
 	u32 delay_linear;
+	bool mba_sc;
 	u32 *mb_map;
 };
 
@@ -445,6 +460,9 @@ void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
 void mbm_setup_overflow_handler(struct rdt_domain *dom,
 				unsigned long delay_ms);
 void mbm_handle_overflow(struct work_struct *work);
+bool is_mba_sc(struct rdt_resource *r);
+void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
 void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
 void cqm_handle_limbo(struct work_struct *work);
 bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
index 23e1d5c249c6..116d57b248d3 100644
--- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -53,7 +53,8 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
 		return false;
 	}
 
-	if (bw < r->membw.min_bw || bw > r->default_ctrl) {
+	if ((bw < r->membw.min_bw || bw > r->default_ctrl) &&
+	    !is_mba_sc(r)) {
 		rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw,
 				    r->membw.min_bw, r->default_ctrl);
 		return false;
@@ -179,6 +180,8 @@ static int update_domains(struct rdt_resource *r, int closid)
 	struct msr_param msr_param;
 	cpumask_var_t cpu_mask;
 	struct rdt_domain *d;
+	bool mba_sc;
+	u32 *dc;
 	int cpu;
 
 	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
@@ -188,13 +191,20 @@ static int update_domains(struct rdt_resource *r, int closid)
 	msr_param.high = msr_param.low + 1;
 	msr_param.res = r;
 
+	mba_sc = is_mba_sc(r);
 	list_for_each_entry(d, &r->domains, list) {
-		if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) {
+		dc = !mba_sc ? d->ctrl_val : d->mbps_val;
+		if (d->have_new_ctrl && d->new_ctrl != dc[closid]) {
 			cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
-			d->ctrl_val[closid] = d->new_ctrl;
+			dc[closid] = d->new_ctrl;
 		}
 	}
-	if (cpumask_empty(cpu_mask))
+
+	/*
+	 * Avoid writing the control msr with control values when
+	 * MBA software controller is enabled
+	 */
+	if (cpumask_empty(cpu_mask) || mba_sc)
 		goto done;
 	cpu = get_cpu();
 	/* Update CBM on this cpu if it's in cpu_mask. */
@@ -282,13 +292,17 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
 {
 	struct rdt_domain *dom;
 	bool sep = false;
+	u32 ctrl_val;
 
 	seq_printf(s, "%*s:", max_name_width, r->name);
 	list_for_each_entry(dom, &r->domains, list) {
 		if (sep)
 			seq_puts(s, ";");
+
+		ctrl_val = (!is_mba_sc(r) ? dom->ctrl_val[closid] :
+			    dom->mbps_val[closid]);
 		seq_printf(s, r->format_str, dom->id, max_data_width,
-			   dom->ctrl_val[closid]);
+			   ctrl_val);
 		sep = true;
 	}
 	seq_puts(s, "\n");
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index 681450eee428..b0f3aed76b75 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -225,10 +225,18 @@ void free_rmid(u32 rmid)
 	list_add_tail(&entry->list, &rmid_free_lru);
 }
 
+static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr)
+{
+	u64 shift = 64 - MBM_CNTR_WIDTH, chunks;
+
+	chunks = (cur_msr << shift) - (prev_msr << shift);
+	return chunks >>= shift;
+}
+
 static int __mon_event_count(u32 rmid, struct rmid_read *rr)
 {
-	u64 chunks, shift, tval;
 	struct mbm_state *m;
+	u64 chunks, tval;
 
 	tval = __rmid_read(rmid, rr->evtid);
 	if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
@@ -254,14 +262,12 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
 	}
 
 	if (rr->first) {
-		m->prev_msr = tval;
-		m->chunks = 0;
+		memset(m, 0, sizeof(struct mbm_state));
+		m->prev_bw_msr = m->prev_msr = tval;
 		return 0;
 	}
 
-	shift = 64 - MBM_CNTR_WIDTH;
-	chunks = (tval << shift) - (m->prev_msr << shift);
-	chunks >>= shift;
+	chunks = mbm_overflow_count(m->prev_msr, tval);
 	m->chunks += chunks;
 	m->prev_msr = tval;
 
@@ -270,6 +276,32 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
 }
 
 /*
+ * Supporting function to calculate the memory bandwidth
+ * and delta bandwidth in MBps.
+ */
+static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
+{
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+	struct mbm_state *m = &rr->d->mbm_local[rmid];
+	u64 tval, cur_bw, chunks;
+
+	tval = __rmid_read(rmid, rr->evtid);
+	if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return;
+
+	chunks = mbm_overflow_count(m->prev_bw_msr, tval);
+	m->chunks_bw += chunks;
+	m->chunks = m->chunks_bw;
+	cur_bw = (chunks * r->mon_scale) >> 20;
+
+	if (m->delta_comp)
+		m->delta_bw = abs(cur_bw - m->prev_bw);
+	m->delta_comp = false;
+	m->prev_bw = cur_bw;
+	m->prev_bw_msr = tval;
+}
+
+/*
  * This is called via IPI to read the CQM/MBM counters
  * on a domain.
  */
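The conversion in mbm_bw_count() is terse: chunks * r->mon_scale is the
number of bytes moved since the last poll, and because the poll interval is
pinned at one second (MBM_OVERFLOW_INTERVAL above is 1000 ms), bytes per
interval equals bytes per second; the >> 20 then divides by 2^20 to yield
MBps. A worked instance, assuming an illustrative mon_scale of 65536 bytes
per chunk (the real factor is enumerated by the hardware):

	chunks * mon_scale = 16384 * 65536 bytes = 1 GiB moved in the 1s window
	cur_bw = (16384 * 65536) >> 20 = 1024 MBps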
@@ -297,6 +329,118 @@ void mon_event_count(void *info)
 	}
 }
 
+/*
+ * Feedback loop for MBA software controller (mba_sc)
+ *
+ * mba_sc is a feedback loop where we periodically read MBM counters and
+ * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
+ * that:
+ *
+ *	current bandwidth(cur_bw) < user specified bandwidth(user_bw)
+ *
+ * This uses the MBM counters to measure the bandwidth and MBA throttle
+ * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
+ * fact that resctrl rdtgroups have both monitoring and control.
+ *
+ * The frequency of the checks is 1s and we just tag along the MBM overflow
+ * timer. Having 1s interval makes the calculation of bandwidth simpler.
+ *
+ * Although MBA's goal is to restrict the bandwidth to a maximum, there may
+ * be a need to increase the bandwidth to avoid unnecessarily restricting
+ * the L2 <-> L3 traffic.
+ *
+ * Since MBA controls the L2 external bandwidth whereas MBM measures the
+ * L3 external bandwidth the following sequence could lead to such a
+ * situation.
+ *
+ * Consider an rdtgroup which had high L3 <-> memory traffic in initial
+ * phases -> mba_sc kicks in and reduces bandwidth percentage values -> but
+ * after some time rdtgroup has mostly L2 <-> L3 traffic.
+ *
+ * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
+ * throttle MSRs already have low percentage values. To avoid
+ * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
+ */
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
+{
+	u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
+	struct mbm_state *pmbm_data, *cmbm_data;
+	u32 cur_bw, delta_bw, user_bw;
+	struct rdt_resource *r_mba;
+	struct rdt_domain *dom_mba;
+	struct list_head *head;
+	struct rdtgroup *entry;
+
+	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+	closid = rgrp->closid;
+	rmid = rgrp->mon.rmid;
+	pmbm_data = &dom_mbm->mbm_local[rmid];
+
+	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
+	if (!dom_mba) {
+		pr_warn_once("Failure to get domain for MBA update\n");
+		return;
+	}
+
+	cur_bw = pmbm_data->prev_bw;
+	user_bw = dom_mba->mbps_val[closid];
+	delta_bw = pmbm_data->delta_bw;
+	cur_msr_val = dom_mba->ctrl_val[closid];
+
+	/*
+	 * For Ctrl groups read data from child monitor groups.
+	 */
+	head = &rgrp->mon.crdtgrp_list;
+	list_for_each_entry(entry, head, mon.crdtgrp_list) {
+		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cur_bw += cmbm_data->prev_bw;
+		delta_bw += cmbm_data->delta_bw;
+	}
+
+	/*
+	 * Scale up/down the bandwidth linearly for the ctrl group. The
+	 * bandwidth step is the bandwidth granularity specified by the
+	 * hardware.
+	 *
+	 * The delta_bw is used when increasing the bandwidth so that we
+	 * don't alternately increase and decrease the control values
+	 * continuously.
+	 *
+	 * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if
+	 * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep
+	 * switching between 90 and 110 continuously if we only check
+	 * cur_bw < user_bw.
+	 */
+	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
+		new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
+	} else if (cur_msr_val < MAX_MBA_BW &&
+		   (user_bw > (cur_bw + delta_bw))) {
+		new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
+	} else {
+		return;
+	}
+
+	cur_msr = r_mba->msr_base + closid;
+	wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
+	dom_mba->ctrl_val[closid] = new_msr_val;
+
+	/*
+	 * Delta values are updated dynamically package wise for each
+	 * rdtgrp every time the throttle MSR changes value.
+	 *
+	 * This is because (1) the increase in bandwidth is not perfectly
+	 * linear and only "approximately" linear even when the hardware
+	 * says it is linear. (2) Also since MBA is a core specific
+	 * mechanism, the delta values vary based on number of cores used
+	 * by the rdtgrp.
+	 */
+	pmbm_data->delta_comp = true;
+	list_for_each_entry(entry, head, mon.crdtgrp_list) {
+		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cmbm_data->delta_comp = true;
+	}
+}
+
 static void mbm_update(struct rdt_domain *d, int rmid)
 {
 	struct rmid_read rr;
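Worth noting on the wrmsrl() path above: once the new percentage is chosen,
delay_bw_map() (shown earlier in intel_rdt.c) converts it to the throttle
value actually programmed into the MSR, which on linear-scale SKUs is simply

	delay = MAX_MBA_BW - new_msr_val

so, for example, stepping a group down to a 40% cap programs a delay value
of 100 - 40 = 60 into its IA32_MBA_THRTL MSR (illustrative numbers).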
@@ -314,7 +458,16 @@ static void mbm_update(struct rdt_domain *d, int rmid)
 	}
 	if (is_mbm_local_enabled()) {
 		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
-		__mon_event_count(rmid, &rr);
+
+		/*
+		 * Call the MBA software controller only for the
+		 * control groups and when user has enabled
+		 * the software controller explicitly.
+		 */
+		if (!is_mba_sc(NULL))
+			__mon_event_count(rmid, &rr);
+		else
+			mbm_bw_count(rmid, &rr);
 	}
 }
 
@@ -385,6 +538,9 @@ void mbm_handle_overflow(struct work_struct *work)
 		head = &prgrp->mon.crdtgrp_list;
 		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
 			mbm_update(d, crgrp->mon.rmid);
+
+		if (is_mba_sc(NULL))
+			update_mba_bw(prgrp, d);
 	}
 
 	schedule_delayed_work_on(cpu, &d->mbm_over, delay);
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index fca759d272a1..749856a2e736 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -1005,6 +1005,11 @@ static void l2_qos_cfg_update(void *arg)
 	wrmsrl(IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
 }
 
+static inline bool is_mba_linear(void)
+{
+	return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear;
+}
+
 static int set_cache_qos_cfg(int level, bool enable)
 {
 	void (*update)(void *arg);
@@ -1041,6 +1046,28 @@ static int set_cache_qos_cfg(int level, bool enable)
 	return 0;
 }
 
+/*
+ * Enable or disable the MBA software controller
+ * which helps user specify bandwidth in MBps.
+ * MBA software controller is supported only if
+ * MBM is supported and MBA is in linear scale.
+ */
+static int set_mba_sc(bool mba_sc)
+{
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA];
+	struct rdt_domain *d;
+
+	if (!is_mbm_enabled() || !is_mba_linear() ||
+	    mba_sc == is_mba_sc(r))
+		return -EINVAL;
+
+	r->membw.mba_sc = mba_sc;
+	list_for_each_entry(d, &r->domains, list)
+		setup_default_ctrlval(r, d->ctrl_val, d->mbps_val);
+
+	return 0;
+}
+
 static int cdp_enable(int level, int data_type, int code_type)
 {
 	struct rdt_resource *r_ldata = &rdt_resources_all[data_type];
@@ -1123,6 +1150,10 @@ static int parse_rdtgroupfs_options(char *data)
 			ret = cdpl2_enable();
 			if (ret)
 				goto out;
+		} else if (!strcmp(token, "mba_MBps")) {
+			ret = set_mba_sc(true);
+			if (ret)
+				goto out;
 		} else {
 			ret = -EINVAL;
 			goto out;
@@ -1445,6 +1476,8 @@ static void rdt_kill_sb(struct super_block *sb)
 	cpus_read_lock();
 	mutex_lock(&rdtgroup_mutex);
 
+	set_mba_sc(false);
+
 	/*Put everything back to default values. */
 	for_each_alloc_enabled_rdt_resource(r)
 		reset_all_ctrls(r);