 arch/x86/kernel/cpu/intel_rdt.c         |   3 +--
 arch/x86/kernel/cpu/intel_rdt.h         |   2 ++
 arch/x86/kernel/cpu/intel_rdt_monitor.c | 126 ++++++++++++++++++++++++++++++-
 3 files changed, 128 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index ad03d975883e..24bfa63e86cf 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -33,7 +33,6 @@
 #include <asm/intel_rdt_sched.h>
 #include "intel_rdt.h"
 
-#define MAX_MBA_BW	100u
 #define MBA_IS_LINEAR	0x4
 #define MBA_MAX_MBPS	U32_MAX
 
@@ -350,7 +349,7 @@ static int get_cache_id(int cpu, int level)
  * that can be written to QOS_MSRs.
  * There are currently no SKUs which support non linear delay values.
  */
-static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
 {
 	if (r->membw.delay_linear)
 		return MAX_MBA_BW - bw;
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 66a0ba37a8a3..39752825e376 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -28,6 +28,7 @@
 
 #define MBM_CNTR_WIDTH		24
 #define MBM_OVERFLOW_INTERVAL	1000
+#define MAX_MBA_BW		100u
 
 #define RMID_VAL_ERROR		BIT_ULL(63)
 #define RMID_VAL_UNAVAIL	BIT_ULL(62)
@@ -461,6 +462,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom,
 void mbm_handle_overflow(struct work_struct *work);
 bool is_mba_sc(struct rdt_resource *r);
 void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
 void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
 void cqm_handle_limbo(struct work_struct *work);
 bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index 7690402c42b7..b0f3aed76b75 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -329,6 +329,118 @@ void mon_event_count(void *info)
 	}
 }
 
+/*
+ * Feedback loop for MBA software controller (mba_sc)
+ *
+ * mba_sc is a feedback loop where we periodically read MBM counters and
+ * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
+ * that:
+ *
+ *   current bandwidth (cur_bw) < user-specified bandwidth (user_bw)
+ *
+ * This uses the MBM counters to measure the bandwidth and the MBA throttle
+ * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
+ * fact that resctrl rdtgroups have both monitoring and control.
+ *
+ * The frequency of the checks is 1s and we just tag along the MBM overflow
+ * timer. Having a 1s interval makes the calculation of bandwidth simpler.
+ *
+ * Although MBA's goal is to restrict the bandwidth to a maximum, there may
+ * be a need to increase the bandwidth to avoid unnecessarily restricting
+ * the L2 <-> L3 traffic.
+ *
+ * Since MBA controls the L2 external bandwidth whereas MBM measures the
+ * L3 external bandwidth, the following sequence could lead to such a
+ * situation.
+ *
+ * Consider an rdtgroup which had high L3 <-> memory traffic in its initial
+ * phases -> mba_sc kicks in and reduces the bandwidth percentage values ->
+ * but after some time the rdtgroup has mostly L2 <-> L3 traffic.
+ *
+ * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
+ * throttle MSRs already have low percentage values. To avoid unnecessarily
+ * restricting such rdtgroups, we also increase the bandwidth.
+ */
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
+{
+	u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
+	struct mbm_state *pmbm_data, *cmbm_data;
+	u32 cur_bw, delta_bw, user_bw;
+	struct rdt_resource *r_mba;
+	struct rdt_domain *dom_mba;
+	struct list_head *head;
+	struct rdtgroup *entry;
+
+	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+	closid = rgrp->closid;
+	rmid = rgrp->mon.rmid;
+	pmbm_data = &dom_mbm->mbm_local[rmid];
+
+	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
+	if (!dom_mba) {
+		pr_warn_once("Failure to get domain for MBA update\n");
+		return;
+	}
+
+	cur_bw = pmbm_data->prev_bw;
+	user_bw = dom_mba->mbps_val[closid];
+	delta_bw = pmbm_data->delta_bw;
+	cur_msr_val = dom_mba->ctrl_val[closid];
+
+	/*
+	 * For Ctrl groups read data from child monitor groups.
+	 */
+	head = &rgrp->mon.crdtgrp_list;
+	list_for_each_entry(entry, head, mon.crdtgrp_list) {
+		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cur_bw += cmbm_data->prev_bw;
+		delta_bw += cmbm_data->delta_bw;
+	}
+
+	/*
+	 * Scale up/down the bandwidth linearly for the ctrl group. The
+	 * bandwidth step is the bandwidth granularity specified by the
+	 * hardware.
+	 *
+	 * The delta_bw is used when increasing the bandwidth so that we
+	 * don't alternately increase and decrease the control values
+	 * continuously.
+	 *
+	 * For example: consider cur_bw = 90MBps, user_bw = 100MBps and a
+	 * bandwidth step of 20MBps (> user_bw - cur_bw); we would keep
+	 * switching between 90 and 110 continuously if we only checked
+	 * cur_bw < user_bw.
+	 */
+	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
+		new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
+	} else if (cur_msr_val < MAX_MBA_BW &&
+		   (user_bw > (cur_bw + delta_bw))) {
+		new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
+	} else {
+		return;
+	}
+
+	cur_msr = r_mba->msr_base + closid;
+	wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
+	dom_mba->ctrl_val[closid] = new_msr_val;
+
+	/*
+	 * Delta values are updated dynamically, package wise, for each
+	 * rdtgrp every time the throttle MSR changes value.
+	 *
+	 * This is because (1) the increase in bandwidth is not perfectly
+	 * linear and only "approximately" linear even when the hardware
+	 * says it is linear, and (2) since MBA is a core-specific
+	 * mechanism, the delta values vary based on the number of cores
+	 * used by the rdtgrp.
+	 */
+	pmbm_data->delta_comp = true;
+	list_for_each_entry(entry, head, mon.crdtgrp_list) {
+		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cmbm_data->delta_comp = true;
+	}
+}
+
 static void mbm_update(struct rdt_domain *d, int rmid)
 {
 	struct rmid_read rr;
@@ -346,7 +458,16 @@ static void mbm_update(struct rdt_domain *d, int rmid)
 	}
 	if (is_mbm_local_enabled()) {
 		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
-		__mon_event_count(rmid, &rr);
+
+		/*
+		 * Call the MBA software controller only for the
+		 * control groups and when the user has enabled
+		 * the software controller explicitly.
+		 */
+		if (!is_mba_sc(NULL))
+			__mon_event_count(rmid, &rr);
+		else
+			mbm_bw_count(rmid, &rr);
 	}
 }
 
@@ -417,6 +538,9 @@ void mbm_handle_overflow(struct work_struct *work)
 		head = &prgrp->mon.crdtgrp_list;
 		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
 			mbm_update(d, crgrp->mon.rmid);
+
+		if (is_mba_sc(NULL))
+			update_mba_bw(prgrp, d);
 	}
 
 	schedule_delayed_work_on(cpu, &d->mbm_over, delay);
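
To illustrate the step decision that update_mba_bw() makes each second, here is a minimal standalone C sketch. It is not part of the patch: plain parameters stand in for the kernel's rdt_resource/rdt_domain fields (min_bw, bw_gran, the delta_bw value and the numbers in main() are illustrative assumptions, not values read from real hardware).

/*
 * Userspace sketch of the throttle-value step logic in update_mba_bw():
 * step down when the measured bandwidth exceeds the user limit, step up
 * only when a full step's worth of headroom (delta_bw) remains, so the
 * loop does not oscillate around user_bw.
 */
#include <stdio.h>

#define MAX_MBA_BW 100u		/* throttle values are percentages */

static unsigned int next_throttle_val(unsigned int cur_msr_val,
				      unsigned int cur_bw,
				      unsigned int user_bw,
				      unsigned int delta_bw,
				      unsigned int min_bw,
				      unsigned int bw_gran)
{
	/* Over the user limit: step the throttle percentage down. */
	if (cur_msr_val > min_bw && user_bw < cur_bw)
		return cur_msr_val - bw_gran;
	/* Under the limit with enough headroom: step up. */
	if (cur_msr_val < MAX_MBA_BW && user_bw > cur_bw + delta_bw)
		return cur_msr_val + bw_gran;
	/* Otherwise leave the throttle value unchanged. */
	return cur_msr_val;
}

int main(void)
{
	/*
	 * cur_bw = 90 MBps, user_bw = 100 MBps, delta_bw = 20 MBps:
	 * stepping up would overshoot, so the value stays at 50%.
	 */
	printf("%u\n", next_throttle_val(50, 90, 100, 20, 10, 10));

	/* cur_bw = 150 MBps > user_bw = 100 MBps: step down to 40%. */
	printf("%u\n", next_throttle_val(50, 150, 100, 20, 10, 10));
	return 0;
}

In the patch itself the chosen percentage is then mapped through delay_bw_map() before being written to the IA32_MBA_THRTL MSR, and delta_comp is set so the next MBM pass recomputes delta_bw for the group.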