author	Paolo Valente <paolo.valente@linaro.org>	2019-03-12 04:59:28 -0400
committer	Jens Axboe <axboe@kernel.dk>	2019-04-01 10:15:39 -0400
commit	fb53ac6cd0269987b1b77f957db453b3ec7bf7e4 (patch)
tree	f791f3b8262500b016ed9e01c9d8b3d1f657bfb4 /block/bfq-iosched.c
parent	778c02a236a8728bb992de10ed1f12c0be5b7b0e (diff)
block, bfq: do not idle for lowest-weight queues
In most cases, it is detrimental for throughput to plug I/O dispatch
when the in-service bfq_queue becomes temporarily empty (plugging is
performed to wait for the possible arrival, soon, of new I/O from the
in-service queue). There is however a case where plugging is needed
for service guarantees. If a bfq_queue, say Q, has a higher weight
than some other active bfq_queue, and is sync, i.e., contains sync
I/O, then, to guarantee that Q does receive a higher share of the
throughput than other lower-weight queues, it is necessary to plug
I/O dispatch when Q remains temporarily empty while being served.

For this reason, BFQ performs I/O plugging when some active bfq_queue
has a higher weight than some other active bfq_queue. But this is
unnecessarily overkill. In fact, if the in-service bfq_queue actually
has a weight lower than or equal to the other queues, then the queue
*must not* be guaranteed a higher share of the throughput than the
other queues. So, not plugging I/O cannot cause any harm to the
queue, and can boost throughput.

Taking advantage of this fact, this commit does not plug I/O for sync
bfq_queues with a weight lower than or equal to the weights of the
other queues. Here is an example of the resulting throughput boost
with the dbench workload, which is particularly nasty for BFQ. With
the dbench test in the Phoronix suite, BFQ reaches its lowest total
throughput with 6 clients on a filesystem with journaling, in case
the journaling daemon has a higher weight than normal processes.
Before this commit, the total throughput was ~80 MB/sec on a PLEXTOR
PX-256M5; after this commit it is ~100 MB/sec.

Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
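The heart of the change is a check of whether bfqq currently has the
lowest weight among the active queues: if bfqq's weight counter is the
leftmost (lowest-weight) node of queue_weights_tree, then not plugging
I/O dispatch cannot deprive bfqq of any share it is entitled to, so
plugging can be skipped. A rough sketch of that check is below, assuming
the surrounding BFQ definitions; bfqq_has_lowest_weight is a hypothetical
helper name, whereas the actual patch computes the same value inline (as
smallest_weight) in bfq_asymmetric_scenario():

/*
 * Illustration only: does bfqq currently have the lowest weight among
 * the active queues? If so, not plugging I/O dispatch for bfqq cannot
 * deprive it of its rightful throughput share. The patch below computes
 * this inline as smallest_weight in bfq_asymmetric_scenario().
 */
static bool bfqq_has_lowest_weight(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq)
{
	return bfqq && bfqq->weight_counter &&
	       bfqq->weight_counter ==
	       container_of(rb_first_cached(&bfqd->queue_weights_tree),
			    struct bfq_weight_counter, weights_node);
}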
Diffstat (limited to 'block/bfq-iosched.c')
-rw-r--r--	block/bfq-iosched.c	204
1 file changed, 114 insertions(+), 90 deletions(-)
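Part of the diff below also converts bfqd->queue_weights_tree from
struct rb_root to struct rb_root_cached, so that the lowest-weight
counter can be read in O(1) through rb_first_cached(). As background,
here is a minimal sketch of the insertion pattern that the patched
bfq_weights_tree_add() follows; the weight_node type and
weight_tree_insert() name are made up for illustration, only the
<linux/rbtree.h> helpers are real:

#include <linux/rbtree.h>

/* Hypothetical node type, used only to illustrate the cached-rbtree helpers. */
struct weight_node {
	struct rb_node rb;
	unsigned int weight;
};

/* Insert @wn while keeping the leftmost (lowest-weight) node cached. */
static void weight_tree_insert(struct rb_root_cached *root,
			       struct weight_node *wn)
{
	struct rb_node **new = &root->rb_root.rb_node, *parent = NULL;
	bool leftmost = true;

	while (*new) {
		struct weight_node *cur = rb_entry(*new, struct weight_node, rb);

		parent = *new;
		if (wn->weight < cur->weight) {
			new = &(*new)->rb_left;
		} else {
			new = &(*new)->rb_right;
			leftmost = false;	/* wn is not the new minimum */
		}
	}

	rb_link_node(&wn->rb, parent, new);
	rb_insert_color_cached(&wn->rb, root, leftmost);
}

Reading the minimum is then just rb_first_cached(root), and removal uses
rb_erase_cached(), matching __bfq_weights_tree_remove() in the diff.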
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index f30d1cb887d4..2eb587fe7c1a 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -629,12 +629,19 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 }
 
 /*
- * The following function returns true if every queue must receive the
- * same share of the throughput (this condition is used when deciding
- * whether idling may be disabled, see the comments in the function
- * bfq_better_to_idle()).
+ * The following function returns false either if every active queue
+ * must receive the same share of the throughput (symmetric scenario),
+ * or, as a special case, if bfqq must receive a share of the
+ * throughput lower than or equal to the share that every other active
+ * queue must receive. If bfqq does sync I/O, then these are the only
+ * two cases where bfqq happens to be guaranteed its share of the
+ * throughput even if I/O dispatching is not plugged when bfqq remains
+ * temporarily empty (for more details, see the comments in the
+ * function bfq_better_to_idle()). For this reason, the return value
+ * of this function is used to check whether I/O-dispatch plugging can
+ * be avoided.
  *
- * Such a scenario occurs when:
+ * The above first case (symmetric scenario) occurs when:
  * 1) all active queues have the same weight,
  * 2) all active queues belong to the same I/O-priority class,
  * 3) all active groups at the same level in the groups tree have the same
@@ -654,30 +661,36 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
  * support or the cgroups interface are not enabled, thus no state
  * needs to be maintained in this case.
  */
-static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
+static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
+				    struct bfq_queue *bfqq)
 {
+	bool smallest_weight = bfqq &&
+		bfqq->weight_counter &&
+		bfqq->weight_counter ==
+		container_of(
+			rb_first_cached(&bfqd->queue_weights_tree),
+			struct bfq_weight_counter,
+			weights_node);
+
 	/*
 	 * For queue weights to differ, queue_weights_tree must contain
 	 * at least two nodes.
 	 */
-	bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
-		(bfqd->queue_weights_tree.rb_node->rb_left ||
-		 bfqd->queue_weights_tree.rb_node->rb_right);
+	bool varied_queue_weights = !smallest_weight &&
+		!RB_EMPTY_ROOT(&bfqd->queue_weights_tree.rb_root) &&
+		(bfqd->queue_weights_tree.rb_root.rb_node->rb_left ||
+		 bfqd->queue_weights_tree.rb_root.rb_node->rb_right);
 
 	bool multiple_classes_busy =
 		(bfqd->busy_queues[0] && bfqd->busy_queues[1]) ||
 		(bfqd->busy_queues[0] && bfqd->busy_queues[2]) ||
 		(bfqd->busy_queues[1] && bfqd->busy_queues[2]);
 
-	/*
-	 * For queue weights to differ, queue_weights_tree must contain
-	 * at least two nodes.
-	 */
-	return !(varied_queue_weights || multiple_classes_busy
+	return varied_queue_weights || multiple_classes_busy
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	       || bfqd->num_groups_with_pending_reqs > 0
 #endif
-	);
+		;
 }
 
 /*
@@ -694,10 +707,11 @@ static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
  * should be low too.
  */
 void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			  struct rb_root *root)
+			  struct rb_root_cached *root)
 {
 	struct bfq_entity *entity = &bfqq->entity;
-	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
+	bool leftmost = true;
 
 	/*
 	 * Do not insert if the queue is already associated with a
@@ -726,8 +740,10 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		}
 		if (entity->weight < __counter->weight)
 			new = &((*new)->rb_left);
-		else
+		else {
 			new = &((*new)->rb_right);
+			leftmost = false;
+		}
 	}
 
 	bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
@@ -736,7 +752,7 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	/*
 	 * In the unlucky event of an allocation failure, we just
 	 * exit. This will cause the weight of queue to not be
-	 * considered in bfq_symmetric_scenario, which, in its turn,
+	 * considered in bfq_asymmetric_scenario, which, in its turn,
 	 * causes the scenario to be deemed wrongly symmetric in case
 	 * bfqq's weight would have been the only weight making the
 	 * scenario asymmetric. On the bright side, no unbalance will
@@ -750,7 +766,8 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 	bfqq->weight_counter->weight = entity->weight;
 	rb_link_node(&bfqq->weight_counter->weights_node, parent, new);
-	rb_insert_color(&bfqq->weight_counter->weights_node, root);
+	rb_insert_color_cached(&bfqq->weight_counter->weights_node, root,
+			       leftmost);
 
 inc_counter:
 	bfqq->weight_counter->num_active++;
@@ -765,7 +782,7 @@ inc_counter:
  */
 void __bfq_weights_tree_remove(struct bfq_data *bfqd,
 			       struct bfq_queue *bfqq,
-			       struct rb_root *root)
+			       struct rb_root_cached *root)
 {
 	if (!bfqq->weight_counter)
 		return;
@@ -774,7 +791,7 @@ void __bfq_weights_tree_remove(struct bfq_data *bfqd,
 	if (bfqq->weight_counter->num_active > 0)
 		goto reset_entity_pointer;
 
-	rb_erase(&bfqq->weight_counter->weights_node, root);
+	rb_erase_cached(&bfqq->weight_counter->weights_node, root);
 	kfree(bfqq->weight_counter);
 
 reset_entity_pointer:
@@ -889,7 +906,7 @@ static unsigned long bfq_serv_to_charge(struct request *rq,
 				       struct bfq_queue *bfqq)
 {
 	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 ||
-	    !bfq_symmetric_scenario(bfqq->bfqd))
+	    bfq_asymmetric_scenario(bfqq->bfqd, bfqq))
 		return blk_rq_sectors(rq);
 
 	return blk_rq_sectors(rq) * bfq_async_charge_factor;
@@ -2543,7 +2560,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
 	 * queue).
 	 */
 	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
-	    bfq_symmetric_scenario(bfqd))
+	    !bfq_asymmetric_scenario(bfqd, bfqq))
 		sl = min_t(u64, sl, BFQ_MIN_TT);
 	else if (bfqq->wr_coeff > 1)
 		sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC);
@@ -3500,8 +3517,9 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
 }
 
 /*
- * There is a case where idling must be performed not for
- * throughput concerns, but to preserve service guarantees.
+ * There is a case where idling does not have to be performed for
+ * throughput concerns, but to preserve the throughput share of
+ * the process associated with bfqq.
  *
  * To introduce this case, we can note that allowing the drive
  * to enqueue more than one request at a time, and hence
@@ -3517,77 +3535,83 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
  * concern about per-process throughput distribution, and
  * makes its decisions only on a per-request basis. Therefore,
  * the service distribution enforced by the drive's internal
- * scheduler is likely to coincide with the desired
- * device-throughput distribution only in a completely
- * symmetric scenario where:
- * (i) each of these processes must get the same throughput as
- *      the others;
- * (ii) the I/O of each process has the same properties, in
- *      terms of locality (sequential or random), direction
- *      (reads or writes), request sizes, greediness
- *      (from I/O-bound to sporadic), and so on.
- * In fact, in such a scenario, the drive tends to treat
- * the requests of each of these processes in about the same
- * way as the requests of the others, and thus to provide
- * each of these processes with about the same throughput
- * (which is exactly the desired throughput distribution). In
- * contrast, in any asymmetric scenario, device idling is
- * certainly needed to guarantee that bfqq receives its
- * assigned fraction of the device throughput (see [1] for
- * details).
- * The problem is that idling may significantly reduce
- * throughput with certain combinations of types of I/O and
- * devices. An important example is sync random I/O, on flash
- * storage with command queueing. So, unless bfqq falls in the
- * above cases where idling also boosts throughput, it would
- * be important to check conditions (i) and (ii) accurately,
- * so as to avoid idling when not strictly needed for service
- * guarantees.
+ * scheduler is likely to coincide with the desired throughput
+ * distribution only in a completely symmetric, or favorably
+ * skewed scenario where:
+ * (i-a) each of these processes must get the same throughput as
+ *       the others,
+ * (i-b) in case (i-a) does not hold, it holds that the process
+ *       associated with bfqq must receive a lower or equal
+ *       throughput than any of the other processes;
+ * (ii) the I/O of each process has the same properties, in
+ *      terms of locality (sequential or random), direction
+ *      (reads or writes), request sizes, greediness
+ *      (from I/O-bound to sporadic), and so on;
+
+ * In fact, in such a scenario, the drive tends to treat the requests
+ * of each process in about the same way as the requests of the
+ * others, and thus to provide each of these processes with about the
+ * same throughput. This is exactly the desired throughput
+ * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
+ * even more convenient distribution for (the process associated with)
+ * bfqq.
+ *
+ * In contrast, in any asymmetric or unfavorable scenario, device
+ * idling (I/O-dispatch plugging) is certainly needed to guarantee
+ * that bfqq receives its assigned fraction of the device throughput
+ * (see [1] for details).
+ *
+ * The problem is that idling may significantly reduce throughput with
+ * certain combinations of types of I/O and devices. An important
+ * example is sync random I/O on flash storage with command
+ * queueing. So, unless bfqq falls in cases where idling also boosts
+ * throughput, it is important to check conditions (i-a), i(-b) and
+ * (ii) accurately, so as to avoid idling when not strictly needed for
+ * service guarantees.
  *
- * Unfortunately, it is extremely difficult to thoroughly
- * check condition (ii). And, in case there are active groups,
- * it becomes very difficult to check condition (i) too. In
- * fact, if there are active groups, then, for condition (i)
- * to become false, it is enough that an active group contains
- * more active processes or sub-groups than some other active
- * group. More precisely, for condition (i) to hold because of
- * such a group, it is not even necessary that the group is
- * (still) active: it is sufficient that, even if the group
- * has become inactive, some of its descendant processes still
- * have some request already dispatched but still waiting for
- * completion. In fact, requests have still to be guaranteed
- * their share of the throughput even after being
- * dispatched. In this respect, it is easy to show that, if a
- * group frequently becomes inactive while still having
- * in-flight requests, and if, when this happens, the group is
- * not considered in the calculation of whether the scenario
- * is asymmetric, then the group may fail to be guaranteed its
- * fair share of the throughput (basically because idling may
- * not be performed for the descendant processes of the group,
- * but it had to be). We address this issue with the
- * following bi-modal behavior, implemented in the function
- * bfq_symmetric_scenario().
+ * Unfortunately, it is extremely difficult to thoroughly check
+ * condition (ii). And, in case there are active groups, it becomes
+ * very difficult to check conditions (i-a) and (i-b) too. In fact,
+ * if there are active groups, then, for conditions (i-a) or (i-b) to
+ * become false 'indirectly', it is enough that an active group
+ * contains more active processes or sub-groups than some other active
+ * group. More precisely, for conditions (i-a) or (i-b) to become
+ * false because of such a group, it is not even necessary that the
+ * group is (still) active: it is sufficient that, even if the group
+ * has become inactive, some of its descendant processes still have
+ * some request already dispatched but still waiting for
+ * completion. In fact, requests have still to be guaranteed their
+ * share of the throughput even after being dispatched. In this
+ * respect, it is easy to show that, if a group frequently becomes
+ * inactive while still having in-flight requests, and if, when this
+ * happens, the group is not considered in the calculation of whether
+ * the scenario is asymmetric, then the group may fail to be
+ * guaranteed its fair share of the throughput (basically because
+ * idling may not be performed for the descendant processes of the
+ * group, but it had to be). We address this issue with the following
+ * bi-modal behavior, implemented in the function
+ * bfq_asymmetric_scenario().
  *
  * If there are groups with requests waiting for completion
  * (as commented above, some of these groups may even be
  * already inactive), then the scenario is tagged as
  * asymmetric, conservatively, without checking any of the
- * conditions (i) and (ii). So the device is idled for bfqq.
+ * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
  * This behavior matches also the fact that groups are created
  * exactly if controlling I/O is a primary concern (to
  * preserve bandwidth and latency guarantees).
  *
- * On the opposite end, if there are no groups with requests
- * waiting for completion, then only condition (i) is actually
- * controlled, i.e., provided that condition (i) holds, idling
- * is not performed, regardless of whether condition (ii)
- * holds. In other words, only if condition (i) does not hold,
- * then idling is allowed, and the device tends to be
- * prevented from queueing many requests, possibly of several
- * processes. Since there are no groups with requests waiting
- * for completion, then, to control condition (i) it is enough
- * to check just whether all the queues with requests waiting
- * for completion also have the same weight.
+ * On the opposite end, if there are no groups with requests waiting
+ * for completion, then only conditions (i-a) and (i-b) are actually
+ * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
+ * idling is not performed, regardless of whether condition (ii)
+ * holds. In other words, only if conditions (i-a) and (i-b) do not
+ * hold, then idling is allowed, and the device tends to be prevented
+ * from queueing many requests, possibly of several processes. Since
+ * there are no groups with requests waiting for completion, then, to
+ * control conditions (i-a) and (i-b) it is enough to check just
+ * whether all the queues with requests waiting for completion also
+ * have the same weight.
  *
  * Not checking condition (ii) evidently exposes bfqq to the
  * risk of getting less throughput than its fair share.
@@ -3639,7 +3663,7 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
  * compound condition that is checked below for deciding
  * whether the scenario is asymmetric. To explain this
  * compound condition, we need to add that the function
- * bfq_symmetric_scenario checks the weights of only
+ * bfq_asymmetric_scenario checks the weights of only
  * non-weight-raised queues, for efficiency reasons (see
  * comments on bfq_weights_tree_add()). Then the fact that
  * bfqq is weight-raised is checked explicitly here. More
@@ -3667,7 +3691,7 @@ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
 	return (bfqq->wr_coeff > 1 &&
 		bfqd->wr_busy_queues <
 		bfq_tot_busy_queues(bfqd)) ||
-		!bfq_symmetric_scenario(bfqd);
+		bfq_asymmetric_scenario(bfqd, bfqq);
 }
 
 /*
@@ -5505,7 +5529,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 		     HRTIMER_MODE_REL);
 	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
 
-	bfqd->queue_weights_tree = RB_ROOT;
+	bfqd->queue_weights_tree = RB_ROOT_CACHED;
 	bfqd->num_groups_with_pending_reqs = 0;
 
 	INIT_LIST_HEAD(&bfqd->active_list);