path: root/block/blk-throttle.c
author     Shaohua Li <shli@fb.com>   2017-03-27 13:51:40 -0400
committer  Jens Axboe <axboe@fb.com>  2017-03-28 10:02:20 -0400
commit     7394e31fa440ab7cd20cebd233580b360a7e9ecc
tree       311eb865d3b2b01d7461bbb63398242e0e71bdfa /block/blk-throttle.c
parent     aec242468cb84b8eea7130c10530a69d2b352bff
blk-throttle: make bandwidth change smooth
When cgroups all reach their low limit, they are allowed to dispatch more IO. This can let some cgroups dispatch more IO while others cannot, and some cgroups can even end up dispatching less IO than their low limit. For example, say cg1 has a low limit of 10MB/s, cg2 has a low limit of 80MB/s, and the disk's maximum bandwidth for the workload is 120MB/s. Their bps could look something like this:

cg1/cg2 bps: T1: 10/80 -> T2: 60/60 -> T3: 10/80

At T1, all cgroups reach their low limit, so they may dispatch more IO afterwards. cg1 then dispatches more IO, leaving cg2 no room to dispatch enough IO. At T2, cg2 only dispatches 60MB/s. Since we detect that cg2 dispatches less IO than its low limit of 80MB/s, we downgrade the queue from LIMIT_MAX to LIMIT_LOW, and all cgroups are throttled back to their low limit (T3). As a result, cg2's bandwidth stays below its low limit most of the time.

The big problem here is that we don't know the maximum bandwidth of the workload, so we can't make a smart decision that avoids this situation. This patch makes cgroup bandwidth changes smooth. After the disk upgrades from LIMIT_LOW to LIMIT_MAX, cgroups are not allowed to use all bandwidth up to their max limit immediately; their bandwidth limit is increased gradually to avoid the situation above. The example above then becomes something like:

cg1/cg2 bps: 10/80 -> 15/105 -> 20/100 -> 25/95 -> 30/90 -> 35/85 -> 40/80 -> 45/75 -> 22/98

This way, cgroup bandwidth stays above the low limit most of the time. It still doesn't fully utilize disk bandwidth, but that's the price we pay for sharing.

Scale up is linear: the limit scales up by 1/2 of the .low limit every throtl_slice after the upgrade, and stops once the adjusted limit hits the .max limit.

Scale down is exponential: we cut the scale value in half whenever a cgroup doesn't hit its .low limit. If the scale reaches 0, we fully downgrade the queue to the LIMIT_LOW state.

Note this doesn't completely prevent a cgroup from running under its low limit. The only way to guarantee that a cgroup never runs under its low limit is to set a max limit: for example, if we set cg1's max limit to 40, cg2 will never run under its low limit.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
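To make the scale-up arithmetic concrete, here is a minimal userspace sketch, not kernel code: the limit values are hypothetical (the commit does not state cg1's .max limit) and adjusted_limit() is an invented name for what throtl_adjusted_limit() computes in the diff below. After n elapsed throtl_slice periods, the effective limit is low + (low / 2) * n, capped at .max.

/*
 * Minimal userspace sketch of the scale-up rule (illustration only).
 * "low" and "max" are hypothetical .low/.max limits in MB/s; the
 * in-kernel equivalent is throtl_adjusted_limit() in the diff below.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static uint64_t adjusted_limit(uint64_t low, uint64_t max,
			       unsigned int elapsed_slices)
{
	/* the limit grows by half the .low limit per throtl_slice ... */
	uint64_t adjusted = low + (low >> 1) * elapsed_slices;

	/* ... until it hits the .max limit, where scale-up stops */
	return adjusted < max ? adjusted : max;
}

int main(void)
{
	uint64_t low = 10, max = 80;	/* hypothetical cg1 limits */
	unsigned int n;

	/* prints 10, 15, 20, ..., matching cg1 in the example above */
	for (n = 0; n <= 8; n++)
		printf("slice %u: %" PRIu64 " MB/s\n", n,
		       adjusted_limit(low, max, n));
	return 0;
}

With low = 10, the printed limits climb in steps of half the low limit, which is exactly cg1's 10 -> 15 -> 20 -> ... progression in the example above.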
Diffstat (limited to 'block/blk-throttle.c')
-rw-r--r--  block/blk-throttle.c | 57
1 file changed, 54 insertions(+), 3 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 014b2e96a423..62984fc92015 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -175,6 +175,8 @@ struct throtl_data
 
 	unsigned long low_upgrade_time;
 	unsigned long low_downgrade_time;
+
+	unsigned int scale;
 };
 
 static void throtl_pending_timer_fn(unsigned long arg);
@@ -226,29 +228,70 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
 	return container_of(sq, struct throtl_data, service_queue);
 }
 
+/*
+ * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
+ * make the IO dispatch more smooth.
+ * Scale up: linearly scale up according to lapsed time since upgrade. For
+ *           every throtl_slice, the limit scales up 1/2 .low limit till the
+ *           limit hits .max limit
+ * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
+ */
+static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
+{
+	/* arbitrary value to avoid too big scale */
+	if (td->scale < 4096 && time_after_eq(jiffies,
+	    td->low_upgrade_time + td->scale * td->throtl_slice))
+		td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;
+
+	return low + (low >> 1) * td->scale;
+}
+
 static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
 {
 	struct blkcg_gq *blkg = tg_to_blkg(tg);
+	struct throtl_data *td;
 	uint64_t ret;
 
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
 		return U64_MAX;
-	ret = tg->bps[rw][tg->td->limit_index];
-	if (ret == 0 && tg->td->limit_index == LIMIT_LOW)
+
+	td = tg->td;
+	ret = tg->bps[rw][td->limit_index];
+	if (ret == 0 && td->limit_index == LIMIT_LOW)
 		return tg->bps[rw][LIMIT_MAX];
+
+	if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
+	    tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
+		uint64_t adjusted;
+
+		adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
+		ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
+	}
 	return ret;
 }
 
 static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
 {
 	struct blkcg_gq *blkg = tg_to_blkg(tg);
+	struct throtl_data *td;
 	unsigned int ret;
 
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
 		return UINT_MAX;
-	ret = tg->iops[rw][tg->td->limit_index];
+
+	td = tg->td;
+	ret = tg->iops[rw][td->limit_index];
 	if (ret == 0 && tg->td->limit_index == LIMIT_LOW)
 		return tg->iops[rw][LIMIT_MAX];
+
+	if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
+	    tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
+		uint64_t adjusted;
+
+		adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
+		if (adjusted > UINT_MAX)
+			adjusted = UINT_MAX;
+		ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
+	}
 	return ret;
 }
 
@@ -1677,6 +1720,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
 
 	td->limit_index = LIMIT_MAX;
 	td->low_upgrade_time = jiffies;
+	td->scale = 0;
 	rcu_read_lock();
 	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
 		struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -1694,6 +1738,13 @@ static void throtl_upgrade_state(struct throtl_data *td)
 
 static void throtl_downgrade_state(struct throtl_data *td, int new)
 {
+	td->scale /= 2;
+
+	if (td->scale) {
+		td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
+		return;
+	}
+
 	td->limit_index = new;
 	td->low_downgrade_time = jiffies;
 }
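On the downgrade side, note that throtl_downgrade_state() does not reset the scale to zero outright: it halves the scale and rewinds low_upgrade_time so that throtl_adjusted_limit()'s time-based recomputation yields exactly the halved scale, and only a scale of 0 drops the queue fully back to LIMIT_LOW. Below is a self-contained sketch of that interplay, using mock types and a plain integer in place of jiffies; it is illustration, not the kernel's code.

/*
 * Sketch of the upgrade/downgrade interplay.  struct mock_td and the
 * integer "now" stand in for the kernel's struct throtl_data and
 * jiffies.
 */
#include <stdio.h>

struct mock_td {
	unsigned long low_upgrade_time;	/* time of last upgrade */
	unsigned long throtl_slice;	/* slice length */
	unsigned int scale;
};

/* Mirrors throtl_adjusted_limit(): scale tracks elapsed slices. */
static unsigned long adjusted(unsigned long low, struct mock_td *td,
			      unsigned long now)
{
	if (td->scale < 4096 &&
	    now >= td->low_upgrade_time + td->scale * td->throtl_slice)
		td->scale = (now - td->low_upgrade_time) / td->throtl_slice;

	return low + (low >> 1) * td->scale;
}

/*
 * Mirrors throtl_downgrade_state(): halving the scale and rewinding
 * low_upgrade_time keeps adjusted() recomputing the same halved scale
 * from "now".  Returns 1 only on a full drop to LIMIT_LOW.
 */
static int downgrade(struct mock_td *td, unsigned long now)
{
	td->scale /= 2;
	if (td->scale) {
		td->low_upgrade_time = now - td->scale * td->throtl_slice;
		return 0;
	}
	return 1;
}

int main(void)
{
	struct mock_td td = { .low_upgrade_time = 0, .throtl_slice = 100 };
	unsigned long now = 700;	/* 7 slices after the upgrade */

	printf("limit: %lu\n", adjusted(10, &td, now));	/* 10 + 5 * 7 = 45 */
	downgrade(&td, now);				/* scale: 7 -> 3 */
	printf("limit: %lu\n", adjusted(10, &td, now));	/* 10 + 5 * 3 = 25 */
	return 0;
}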