path: root/block/blk-throttle.c
author     Shaohua Li <shli@fb.com>   2017-03-27 13:51:40 -0400
committer  Jens Axboe <axboe@fb.com>  2017-03-28 10:02:20 -0400
commit     7394e31fa440ab7cd20cebd233580b360a7e9ecc
tree       311eb865d3b2b01d7461bbb63398242e0e71bdfa /block/blk-throttle.c
parent     aec242468cb84b8eea7130c10530a69d2b352bff
blk-throttle: make bandwidth change smooth
When cgroups all reach their low limit, they are allowed to dispatch more IO. This can let some cgroups dispatch more IO while others cannot, and some cgroups can even end up dispatching less IO than their low limit. For example, say cg1 has a low limit of 10MB/s, cg2 has a low limit of 80MB/s, and the disk's maximum bandwidth for the workload is 120MB/s. Their bps could look something like this:

cg1/cg2 bps: T1: 10/80 -> T2: 60/60 -> T3: 10/80

At T1, all cgroups reach their low limit, so they may dispatch more IO afterwards. cg1 then dispatches more IO, leaving cg2 no room to dispatch enough IO. At T2, cg2 only dispatches 60MB/s. Since we detect that cg2 dispatches less IO than its low limit of 80MB/s, we downgrade the queue from LIMIT_MAX to LIMIT_LOW, and all cgroups are throttled back to their low limit (T3). As a result, cg2's bandwidth stays below its low limit most of the time.

The big problem here is that we don't know the maximum bandwidth of the workload, so we can't make a smart decision that avoids this situation. This patch makes cgroup bandwidth changes smooth. After the disk upgrades from LIMIT_LOW to LIMIT_MAX, cgroups are not allowed to use all bandwidth up to their max limit immediately; their bandwidth limit is increased gradually to avoid the situation above. The example above then becomes something like:

cg1/cg2 bps: 10/80 -> 15/105 -> 20/100 -> 25/95 -> 30/90 -> 35/85 -> 40/80 -> 45/75 -> 22/98

This way, cgroup bandwidth stays above the low limit most of the time. It still doesn't fully utilize disk bandwidth, but that's the price we pay for sharing.

Scale up is linear: the limit scales up by 1/2 of the .low limit every throtl_slice after the upgrade, and stops once the adjusted limit hits the .max limit.

Scale down is exponential: we cut the scale value in half whenever a cgroup doesn't hit its .low limit. If the scale reaches 0, we fully downgrade the queue to the LIMIT_LOW state.

Note this doesn't completely prevent a cgroup from running under its low limit. The only way to guarantee that a cgroup never runs under its low limit is to set a max limit: for example, if we set cg1's max limit to 40, cg2 will never run under its low limit.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
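To make the scale-up arithmetic concrete, here is a minimal userspace sketch, not kernel code: the limit values are hypothetical (the commit does not state cg1's .max limit) and adjusted_limit() is an invented name for what throtl_adjusted_limit() computes in the diff below. After n elapsed throtl_slice periods, the effective limit is low + (low / 2) * n, capped at .max.

/*
 * Minimal userspace sketch of the scale-up rule (illustration only).
 * "low" and "max" are hypothetical .low/.max limits in MB/s; the
 * in-kernel equivalent is throtl_adjusted_limit() in the diff below.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static uint64_t adjusted_limit(uint64_t low, uint64_t max,
			       unsigned int elapsed_slices)
{
	/* the limit grows by half the .low limit per throtl_slice ... */
	uint64_t adjusted = low + (low >> 1) * elapsed_slices;

	/* ... until it hits the .max limit, where scale-up stops */
	return adjusted < max ? adjusted : max;
}

int main(void)
{
	uint64_t low = 10, max = 80;	/* hypothetical cg1 limits */
	unsigned int n;

	/* prints 10, 15, 20, ..., matching cg1 in the example above */
	for (n = 0; n <= 8; n++)
		printf("slice %u: %" PRIu64 " MB/s\n", n,
		       adjusted_limit(low, max, n));
	return 0;
}

With low = 10, the printed limits climb in steps of half the low limit, which is exactly cg1's 10 -> 15 -> 20 -> ... progression in the example above.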
Diffstat (limited to 'block/blk-throttle.c')
-rw-r--r--  block/blk-throttle.c | 57
1 file changed, 54 insertions(+), 3 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 014b2e96a423..62984fc92015 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -175,6 +175,8 @@ struct throtl_data
 
 	unsigned long low_upgrade_time;
 	unsigned long low_downgrade_time;
+
+	unsigned int scale;
 };
 
 static void throtl_pending_timer_fn(unsigned long arg);
@@ -226,29 +228,70 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
 	return container_of(sq, struct throtl_data, service_queue);
 }
 
+/*
+ * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
+ * make the IO dispatch more smooth.
+ * Scale up: linearly scale up according to lapsed time since upgrade. For
+ *           every throtl_slice, the limit scales up 1/2 .low limit till the
+ *           limit hits .max limit
+ * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
+ */
+static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
+{
+	/* arbitrary value to avoid too big scale */
+	if (td->scale < 4096 && time_after_eq(jiffies,
+	    td->low_upgrade_time + td->scale * td->throtl_slice))
+		td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;
+
+	return low + (low >> 1) * td->scale;
+}
+
 static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
 {
 	struct blkcg_gq *blkg = tg_to_blkg(tg);
+	struct throtl_data *td;
 	uint64_t ret;
 
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
 		return U64_MAX;
-	ret = tg->bps[rw][tg->td->limit_index];
-	if (ret == 0 && tg->td->limit_index == LIMIT_LOW)
+
+	td = tg->td;
+	ret = tg->bps[rw][td->limit_index];
+	if (ret == 0 && td->limit_index == LIMIT_LOW)
 		return tg->bps[rw][LIMIT_MAX];
+
+	if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
+	    tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
+		uint64_t adjusted;
+
+		adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
+		ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
+	}
 	return ret;
 }
 
 static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
 {
 	struct blkcg_gq *blkg = tg_to_blkg(tg);
+	struct throtl_data *td;
 	unsigned int ret;
 
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
 		return UINT_MAX;
-	ret = tg->iops[rw][tg->td->limit_index];
+
+	td = tg->td;
+	ret = tg->iops[rw][td->limit_index];
 	if (ret == 0 && tg->td->limit_index == LIMIT_LOW)
 		return tg->iops[rw][LIMIT_MAX];
+
+	if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
+	    tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
+		uint64_t adjusted;
+
+		adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
+		if (adjusted > UINT_MAX)
+			adjusted = UINT_MAX;
+		ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
+	}
 	return ret;
 }
 
@@ -1677,6 +1720,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
 
 	td->limit_index = LIMIT_MAX;
 	td->low_upgrade_time = jiffies;
+	td->scale = 0;
 	rcu_read_lock();
 	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
 		struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -1694,6 +1738,13 @@ static void throtl_upgrade_state(struct throtl_data *td)
 
 static void throtl_downgrade_state(struct throtl_data *td, int new)
 {
+	td->scale /= 2;
+
+	if (td->scale) {
+		td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
+		return;
+	}
+
 	td->limit_index = new;
 	td->low_downgrade_time = jiffies;
 }
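On the downgrade side, note that throtl_downgrade_state() does not reset the scale to zero outright: it halves the scale and rewinds low_upgrade_time so that throtl_adjusted_limit()'s time-based recomputation yields exactly the halved scale, and only a scale of 0 drops the queue fully back to LIMIT_LOW. Below is a self-contained sketch of that interplay, using mock types and a plain integer in place of jiffies; it is illustration, not the kernel's code.

/*
 * Sketch of the upgrade/downgrade interplay.  struct mock_td and the
 * integer "now" stand in for the kernel's struct throtl_data and
 * jiffies.
 */
#include <stdio.h>

struct mock_td {
	unsigned long low_upgrade_time;	/* time of last upgrade */
	unsigned long throtl_slice;	/* slice length */
	unsigned int scale;
};

/* Mirrors throtl_adjusted_limit(): scale tracks elapsed slices. */
static unsigned long adjusted(unsigned long low, struct mock_td *td,
			      unsigned long now)
{
	if (td->scale < 4096 &&
	    now >= td->low_upgrade_time + td->scale * td->throtl_slice)
		td->scale = (now - td->low_upgrade_time) / td->throtl_slice;

	return low + (low >> 1) * td->scale;
}

/*
 * Mirrors throtl_downgrade_state(): halving the scale and rewinding
 * low_upgrade_time keeps adjusted() recomputing the same halved scale
 * from "now".  Returns 1 only on a full drop to LIMIT_LOW.
 */
static int downgrade(struct mock_td *td, unsigned long now)
{
	td->scale /= 2;
	if (td->scale) {
		td->low_upgrade_time = now - td->scale * td->throtl_slice;
		return 0;
	}
	return 1;
}

int main(void)
{
	struct mock_td td = { .low_upgrade_time = 0, .throtl_slice = 100 };
	unsigned long now = 700;	/* 7 slices after the upgrade */

	printf("limit: %lu\n", adjusted(10, &td, now));	/* 10 + 5 * 7 = 45 */
	downgrade(&td, now);				/* scale: 7 -> 3 */
	printf("limit: %lu\n", adjusted(10, &td, now));	/* 10 + 5 * 3 = 25 */
	return 0;
}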