 arch/x86/kernel/cpu/intel_rdt.c         |   3 +--
 arch/x86/kernel/cpu/intel_rdt.h         |   2 ++
 arch/x86/kernel/cpu/intel_rdt_monitor.c | 126 ++++++++++++++++++++++++++++++-
 3 files changed, 128 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index ad03d975883e..24bfa63e86cf 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -33,7 +33,6 @@
 #include <asm/intel_rdt_sched.h>
 #include "intel_rdt.h"
 
-#define MAX_MBA_BW		100u
 #define MBA_IS_LINEAR		0x4
 #define MBA_MAX_MBPS		U32_MAX
 
@@ -350,7 +349,7 @@ static int get_cache_id(int cpu, int level)
  * that can be written to QOS_MSRs.
  * There are currently no SKUs which support non linear delay values.
  */
-static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
 {
 	if (r->membw.delay_linear)
 		return MAX_MBA_BW - bw;
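With delay_bw_map() made non-static (and declared in intel_rdt.h below), the monitoring code can translate a bandwidth percentage into the value programmed into the IA32_MBA_THRTL MSRs. A minimal standalone sketch of the linear mapping visible above; the wrapper and main() are illustrative only, not kernel code, and the non-linear fallback is omitted:

#include <stdio.h>

#define MAX_MBA_BW 100u		/* bandwidth values are percentages, 0..100 */

/* Linear map only: the MSR is programmed with a delay, not a bandwidth. */
static unsigned int linear_delay_bw_map(unsigned long bw_percent)
{
	return MAX_MBA_BW - bw_percent;
}

int main(void)
{
	/* Granting 70% bandwidth means programming a delay value of 30. */
	printf("bw=70%% -> delay=%u\n", linear_delay_bw_map(70));
	return 0;
}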
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 66a0ba37a8a3..39752825e376 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -28,6 +28,7 @@
 
 #define MBM_CNTR_WIDTH			24
 #define MBM_OVERFLOW_INTERVAL		1000
+#define MAX_MBA_BW			100u
 
 #define RMID_VAL_ERROR			BIT_ULL(63)
 #define RMID_VAL_UNAVAIL		BIT_ULL(62)
@@ -461,6 +462,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom,
 void mbm_handle_overflow(struct work_struct *work);
 bool is_mba_sc(struct rdt_resource *r);
 void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
 void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
 void cqm_handle_limbo(struct work_struct *work);
 bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index 7690402c42b7..b0f3aed76b75 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -329,6 +329,118 @@ void mon_event_count(void *info)
 	}
 }
 
+/*
+ * Feedback loop for MBA software controller (mba_sc)
+ *
+ * mba_sc is a feedback loop where we periodically read MBM counters and
+ * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
+ * that:
+ *
+ *   current bandwidth (cur_bw) < user-specified bandwidth (user_bw)
+ *
+ * This uses the MBM counters to measure the bandwidth and the MBA throttle
+ * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
+ * fact that resctrl rdtgroups have both monitoring and control.
+ *
+ * The frequency of the checks is 1s and we just tag along the MBM overflow
+ * timer. Having a 1s interval makes the calculation of bandwidth simpler.
+ *
+ * Although MBA's goal is to restrict the bandwidth to a maximum, there may
+ * be a need to increase the bandwidth to avoid unnecessarily restricting
+ * the L2 <-> L3 traffic.
+ *
+ * Since MBA controls the L2 external bandwidth whereas MBM measures the
+ * L3 external bandwidth, the following sequence could lead to such a
+ * situation.
+ *
+ * Consider an rdtgroup which had high L3 <-> memory traffic in its initial
+ * phases -> mba_sc kicks in and reduces the bandwidth percentage values ->
+ * but after some time the rdtgroup has mostly L2 <-> L3 traffic.
+ *
+ * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
+ * throttle MSRs already have low percentage values. To avoid
+ * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
+ */
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
+{
+	u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
+	struct mbm_state *pmbm_data, *cmbm_data;
+	u32 cur_bw, delta_bw, user_bw;
+	struct rdt_resource *r_mba;
+	struct rdt_domain *dom_mba;
+	struct list_head *head;
+	struct rdtgroup *entry;
+
+	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+	closid = rgrp->closid;
+	rmid = rgrp->mon.rmid;
+	pmbm_data = &dom_mbm->mbm_local[rmid];
+
+	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
+	if (!dom_mba) {
+		pr_warn_once("Failure to get domain for MBA update\n");
+		return;
+	}
+
+	cur_bw = pmbm_data->prev_bw;
+	user_bw = dom_mba->mbps_val[closid];
+	delta_bw = pmbm_data->delta_bw;
+	cur_msr_val = dom_mba->ctrl_val[closid];
+
+	/*
+	 * For Ctrl groups read data from child monitor groups.
+	 */
+	head = &rgrp->mon.crdtgrp_list;
+	list_for_each_entry(entry, head, mon.crdtgrp_list) {
+		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cur_bw += cmbm_data->prev_bw;
+		delta_bw += cmbm_data->delta_bw;
+	}
+
+	/*
+	 * Scale up/down the bandwidth linearly for the ctrl group. The
+	 * bandwidth step is the bandwidth granularity specified by the
+	 * hardware.
+	 *
+	 * The delta_bw is used when increasing the bandwidth so that we
+	 * don't alternately increase and decrease the control values
+	 * continuously.
+	 *
+	 * For example: consider cur_bw = 90MBps, user_bw = 100MBps. If the
+	 * bandwidth step is 20MBps (> user_bw - cur_bw), we would keep
+	 * switching between 90 and 110 continuously if we only checked
+	 * cur_bw < user_bw.
+	 */
+	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
+		new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
+	} else if (cur_msr_val < MAX_MBA_BW &&
+		   (user_bw > (cur_bw + delta_bw))) {
+		new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
+	} else {
+		return;
+	}
+
+	cur_msr = r_mba->msr_base + closid;
+	wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
+	dom_mba->ctrl_val[closid] = new_msr_val;
+
+	/*
+	 * Delta values are updated dynamically, package wise, for each
+	 * rdtgrp every time the throttle MSR changes value.
+	 *
+	 * This is because (1) the increase in bandwidth is not perfectly
+	 * linear and is only "approximately" linear even when the hardware
+	 * says it is linear, and (2) since MBA is a core-specific mechanism,
+	 * the delta values vary based on the number of cores used by the
+	 * rdtgrp.
+	 */
+	pmbm_data->delta_comp = true;
+	list_for_each_entry(entry, head, mon.crdtgrp_list) {
+		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cmbm_data->delta_comp = true;
+	}
+}
+
 static void mbm_update(struct rdt_domain *d, int rmid)
 {
 	struct rmid_read rr;
@@ -346,7 +458,16 @@ static void mbm_update(struct rdt_domain *d, int rmid)
 	}
 	if (is_mbm_local_enabled()) {
 		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
-		__mon_event_count(rmid, &rr);
+
+		/*
+		 * Call the MBA software controller only for the
+		 * control groups and when the user has enabled
+		 * the software controller explicitly.
+		 */
+		if (!is_mba_sc(NULL))
+			__mon_event_count(rmid, &rr);
+		else
+			mbm_bw_count(rmid, &rr);
 	}
 }
 
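When mba_sc is enabled, mbm_bw_count() (its body is not part of this diff) is expected to turn the MBM local counter delta into a bandwidth figure for the 1s window described in the comment block above. A rough sketch of that conversion, not the kernel helper: the function name and the assumption that the counter delta has already been scaled to bytes are both illustrative.

#include <stdio.h>
#include <stdint.h>

/*
 * Illustrative only. With a fixed 1s sampling interval, the bandwidth in
 * MBps is simply the byte delta between two reads shifted down by 20
 * (bytes -> MiB), which is why the 1s tick keeps the math simple.
 */
static uint32_t bytes_delta_to_mbps(uint64_t prev_bytes, uint64_t cur_bytes)
{
	return (uint32_t)((cur_bytes - prev_bytes) >> 20);
}

int main(void)
{
	uint64_t prev = 0, cur = 94371840;	/* 90 MiB moved in one second */

	printf("cur_bw = %u MBps\n", bytes_delta_to_mbps(prev, cur));
	return 0;
}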
@@ -417,6 +538,9 @@ void mbm_handle_overflow(struct work_struct *work)
 		head = &prgrp->mon.crdtgrp_list;
 		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
 			mbm_update(d, crgrp->mon.rmid);
+
+		if (is_mba_sc(NULL))
+			update_mba_bw(prgrp, d);
 	}
 
 	schedule_delayed_work_on(cpu, &d->mbm_over, delay);
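The comment inside update_mba_bw() explains why the increase path checks delta_bw as well as user_bw. A self-contained model of just that up/down decision shows the resulting hysteresis; the wrapper function, main() and the min_bw/bw_gran/traffic numbers are made up for illustration and are not taken from real hardware:

#include <stdio.h>

#define MAX_MBA_BW	100u	/* throttle/control values are percentages */

/*
 * Standalone model of the decision in update_mba_bw(). Returns the new
 * control value, or the current one when no change is needed.
 */
static unsigned int next_ctrl_val(unsigned int cur_msr_val, unsigned int min_bw,
				  unsigned int bw_gran, unsigned int cur_bw,
				  unsigned int user_bw, unsigned int delta_bw)
{
	if (cur_msr_val > min_bw && user_bw < cur_bw)
		return cur_msr_val - bw_gran;	/* throttle harder */
	if (cur_msr_val < MAX_MBA_BW && user_bw > cur_bw + delta_bw)
		return cur_msr_val + bw_gran;	/* relax the throttle */
	return cur_msr_val;			/* inside the hysteresis band */
}

int main(void)
{
	unsigned int v;

	/* Numbers from the comment: cur_bw 90, user_bw 100, delta_bw 20. */
	v = next_ctrl_val(50, 10, 10, 90, 100, 20);
	printf("cur_bw=90  user_bw=100 delta_bw=20 -> ctrl stays at %u\n", v);

	v = next_ctrl_val(50, 10, 10, 60, 100, 20);
	printf("cur_bw=60  user_bw=100 delta_bw=20 -> ctrl raised to %u\n", v);

	v = next_ctrl_val(50, 10, 10, 120, 100, 20);
	printf("cur_bw=120 user_bw=100             -> ctrl lowered to %u\n", v);
	return 0;
}

Without the delta_bw term, the first case would be treated as "below the user limit" and the control value would climb, overshoot, be pulled back down, and oscillate around the limit on every 1s tick.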
