summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2019-09-04 15:45:52 -0400
committerJens Axboe <axboe@kernel.dk>2019-09-10 14:31:39 -0400
commit36a524814ff3e5d5385f42d30152fe8c5e1fd2c1 (patch)
treed4a10798e234ebf997eeb86f2a686f403f2b996c /block
parente036c4cabaa8d24375262ced3a191819a8077b74 (diff)
blk-iocost: Account force-charged overage in absolute vtime
Currently, when a bio needs to be force-charged and there isn't enough budget, vtime is simply pushed into the future. This means that the cost of the whole bio is scaled using the current hweight and then charged immediately. Until the global vtime advances beyond this future vtime, the cgroup won't be allowed to issue normal IOs. This is incorrect and can lead to, for example, exploding vrate or extended stalls if vrate range is constrained. Consider the following scenario. 1. A cgroup with a very low hweight runs out of budget. 2. A storm of swap-out happens on it. All of them are scaled according to the current low hweight and charged to vtime pushing it to a far future. 3. All other cgroups go idle and now the above cgroup has access to the whole device. However, because vtime is already wound using the past low hweight, what its current hweight is doesn't matter until global vtime catches up to the local vtime. 4. As a result, either vrate gets ramped up extremely or the IOs stall while the underlying device is idle. This is because the hweight the overage is calculated at is different from the hweight that it's being paid at. Fix it by remembering the overage in absolute vtime and continuously paying with the actual budget according to the current hweight at each period. Note that non-forced bios which wait already remembers the cost in absolute vtime. This brings forced-bio accounting in line. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'block')
-rw-r--r--block/blk-iocost.c62
1 file changed, 55 insertions, 7 deletions
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 7af350293c2f..cffed980dfac 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -469,6 +469,7 @@ struct ioc_gq {
469 */ 469 */
470 atomic64_t vtime; 470 atomic64_t vtime;
471 atomic64_t done_vtime; 471 atomic64_t done_vtime;
472 atomic64_t abs_vdebt;
472 u64 last_vtime; 473 u64 last_vtime;
473 474
474 /* 475 /*
@@ -653,13 +654,21 @@ static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
653 654
654/* 655/*
655 * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical 656 * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
656 * weight, the more expensive each IO. 657 * weight, the more expensive each IO. Must round up.
657 */ 658 */
658static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) 659static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
659{ 660{
660 return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse); 661 return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
661} 662}
662 663
664/*
665 * The inverse of abs_cost_to_cost(). Must round up.
666 */
667static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
668{
669 return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
670}
671
663static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) 672static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
664{ 673{
665 bio->bi_iocost_cost = cost; 674 bio->bi_iocost_cost = cost;
@@ -1132,16 +1141,36 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1132 struct iocg_wake_ctx ctx = { .iocg = iocg }; 1141 struct iocg_wake_ctx ctx = { .iocg = iocg };
1133 u64 margin_ns = (u64)(ioc->period_us * 1142 u64 margin_ns = (u64)(ioc->period_us *
1134 WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; 1143 WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1135 u64 vshortage, expires, oexpires; 1144 u64 abs_vdebt, vdebt, vshortage, expires, oexpires;
1145 s64 vbudget;
1146 u32 hw_inuse;
1136 1147
1137 lockdep_assert_held(&iocg->waitq.lock); 1148 lockdep_assert_held(&iocg->waitq.lock);
1138 1149
1150 current_hweight(iocg, NULL, &hw_inuse);
1151 vbudget = now->vnow - atomic64_read(&iocg->vtime);
1152
1153 /* pay off debt */
1154 abs_vdebt = atomic64_read(&iocg->abs_vdebt);
1155 vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse);
1156 if (vdebt && vbudget > 0) {
1157 u64 delta = min_t(u64, vbudget, vdebt);
1158 u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1159 abs_vdebt);
1160
1161 atomic64_add(delta, &iocg->vtime);
1162 atomic64_add(delta, &iocg->done_vtime);
1163 atomic64_sub(abs_delta, &iocg->abs_vdebt);
1164 if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0))
1165 atomic64_set(&iocg->abs_vdebt, 0);
1166 }
1167
1139 /* 1168 /*
1140 * Wake up the ones which are due and see how much vtime we'll need 1169 * Wake up the ones which are due and see how much vtime we'll need
1141 * for the next one. 1170 * for the next one.
1142 */ 1171 */
1143 current_hweight(iocg, NULL, &ctx.hw_inuse); 1172 ctx.hw_inuse = hw_inuse;
1144 ctx.vbudget = now->vnow - atomic64_read(&iocg->vtime); 1173 ctx.vbudget = vbudget - vdebt;
1145 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); 1174 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1146 if (!waitqueue_active(&iocg->waitq)) 1175 if (!waitqueue_active(&iocg->waitq))
1147 return; 1176 return;
@@ -1187,6 +1216,11 @@ static void iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
1187 u64 vmargin = ioc->margin_us * now->vrate; 1216 u64 vmargin = ioc->margin_us * now->vrate;
1188 u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; 1217 u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1189 u64 expires, oexpires; 1218 u64 expires, oexpires;
1219 u32 hw_inuse;
1220
1221 /* debt-adjust vtime */
1222 current_hweight(iocg, NULL, &hw_inuse);
1223 vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse);
1190 1224
1191 /* clear or maintain depending on the overage */ 1225 /* clear or maintain depending on the overage */
1192 if (time_before_eq64(vtime, now->vnow)) { 1226 if (time_before_eq64(vtime, now->vnow)) {
@@ -1332,12 +1366,14 @@ static void ioc_timer_fn(struct timer_list *timer)
1332 * should have woken up in the last period and expire idle iocgs. 1366 * should have woken up in the last period and expire idle iocgs.
1333 */ 1367 */
1334 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { 1368 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1335 if (!waitqueue_active(&iocg->waitq) && !iocg_is_idle(iocg)) 1369 if (!waitqueue_active(&iocg->waitq) &&
1370 !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg))
1336 continue; 1371 continue;
1337 1372
1338 spin_lock(&iocg->waitq.lock); 1373 spin_lock(&iocg->waitq.lock);
1339 1374
1340 if (waitqueue_active(&iocg->waitq)) { 1375 if (waitqueue_active(&iocg->waitq) ||
1376 atomic64_read(&iocg->abs_vdebt)) {
1341 /* might be oversleeping vtime / hweight changes, kick */ 1377 /* might be oversleeping vtime / hweight changes, kick */
1342 iocg_kick_waitq(iocg, &now); 1378 iocg_kick_waitq(iocg, &now);
1343 iocg_kick_delay(iocg, &now, 0); 1379 iocg_kick_delay(iocg, &now, 0);
@@ -1673,13 +1709,24 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1673 * in a while which is fine. 1709 * in a while which is fine.
1674 */ 1710 */
1675 if (!waitqueue_active(&iocg->waitq) && 1711 if (!waitqueue_active(&iocg->waitq) &&
1712 !atomic64_read(&iocg->abs_vdebt) &&
1676 time_before_eq64(vtime + cost, now.vnow)) { 1713 time_before_eq64(vtime + cost, now.vnow)) {
1677 iocg_commit_bio(iocg, bio, cost); 1714 iocg_commit_bio(iocg, bio, cost);
1678 return; 1715 return;
1679 } 1716 }
1680 1717
1718 /*
1719 * We're over budget. If @bio has to be issued regardless,
1720 * remember the abs_cost instead of advancing vtime.
1721 * iocg_kick_waitq() will pay off the debt before waking more IOs.
1722 * This way, the debt is continuously paid off each period with the
1723 * actual budget available to the cgroup. If we just wound vtime,
1724 * we would incorrectly use the current hw_inuse for the entire
1725 * amount which, for example, can lead to the cgroup staying
1726 * blocked for a long time even with substantially raised hw_inuse.
1727 */
1681 if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { 1728 if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1682 iocg_commit_bio(iocg, bio, cost); 1729 atomic64_add(abs_cost, &iocg->abs_vdebt);
1683 iocg_kick_delay(iocg, &now, cost); 1730 iocg_kick_delay(iocg, &now, cost);
1684 return; 1731 return;
1685 } 1732 }
@@ -1928,6 +1975,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd)
1928 iocg->ioc = ioc; 1975 iocg->ioc = ioc;
1929 atomic64_set(&iocg->vtime, now.vnow); 1976 atomic64_set(&iocg->vtime, now.vnow);
1930 atomic64_set(&iocg->done_vtime, now.vnow); 1977 atomic64_set(&iocg->done_vtime, now.vnow);
1978 atomic64_set(&iocg->abs_vdebt, 0);
1931 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); 1979 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
1932 INIT_LIST_HEAD(&iocg->active_list); 1980 INIT_LIST_HEAD(&iocg->active_list);
1933 iocg->hweight_active = HWEIGHT_WHOLE; 1981 iocg->hweight_active = HWEIGHT_WHOLE;