author     Shaohua Li <shli@fb.com>     2017-03-27 18:19:42 -0400
committer  Jens Axboe <axboe@fb.com>    2017-03-28 10:02:20 -0400
commit     b9147dd1bae2b15d6931ecd42f8606c775fecbc9 (patch)
tree       9becbcfbf24e535538680bb53f38962808b4e28e /block/blk-throttle.c
parent     88eeca495ba7de749ff253376ec6be19bb05368d (diff)
blk-throttle: add a mechanism to estimate IO latency
The user configures a latency target, but the latency threshold for each request size isn't fixed. For an SSD, IO latency highly depends on request size. To calculate the latency threshold, we sample some data, e.g., the average latency for request sizes 4k, 8k, 16k, 32k .. 1M. The latency threshold for each request size will be the sampled latency (I'll call it the base latency) plus the latency target. For example, if the base latency for request size 4k is 80us and the user configures a latency target of 60us, the 4k latency threshold will be 80 + 60 = 140us.

To sample data, we calculate the order base 2 of the rounded-up IO sectors. If the IO size is bigger than 1M, it is accounted as 1M. Since the calculation rounds up, the base latency will be slightly smaller than the actual value. Also, if there isn't any IO dispatched for a specific IO size, we use the base latency of the next smaller IO size for it.

But we shouldn't sample data at just any time. The base latency is supposed to be the latency when the disk isn't congested, because we use the latency threshold to schedule IOs between cgroups. If the disk is congested, the latency is higher, and using it for scheduling is meaningless. Hence we only do the sampling when block throttling is at the LOW limit, with the assumption that the disk isn't congested in that state. If the assumption isn't true, e.g., the low limit is too high, the calculated latency threshold will be higher.

Hard disk is completely different: latency depends on spindle seek instead of request size. Currently this feature is SSD only; we probably could use a fixed threshold, like 4ms, for hard disk though.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
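As an illustration of the math above, here is a small userspace sketch (not kernel code) of the bucket/threshold calculation. bucket_index() mirrors the request_bucket_index() macro added by the patch; order_base_2() is reimplemented locally, and the base latencies, the 60us target, and the 16k example IO are made-up numbers:

/* Standalone sketch of the bucket/threshold math; values are illustrative. */
#include <stdio.h>

#define LATENCY_BUCKET_SIZE 9

static int order_base_2(unsigned long n)	/* ceil(log2(n)) */
{
	int order = 0;

	while ((1UL << order) < n)
		order++;
	return order;
}

static int bucket_index(unsigned long sectors)
{
	int idx = order_base_2(sectors) - 3;	/* 8 sectors (4k) -> bucket 0 */

	if (idx < 0)
		idx = 0;
	if (idx > LATENCY_BUCKET_SIZE - 1)
		idx = LATENCY_BUCKET_SIZE - 1;	/* >= 1M -> bucket 8 */
	return idx;
}

int main(void)
{
	/* hypothetical sampled base latencies (us) for 4k, 8k, ... 1M */
	unsigned long base_latency_us[LATENCY_BUCKET_SIZE] = {
		80, 90, 110, 140, 200, 300, 500, 900, 1700
	};
	unsigned long target_us = 60;		/* user-configured latency target */
	unsigned long io_bytes = 16 * 1024;	/* a 16k read */
	int idx = bucket_index(io_bytes / 512);	/* bytes -> 512-byte sectors */

	/* threshold = base latency of the bucket + latency target */
	printf("16k IO -> bucket %d, threshold %lu us\n",
	       idx, base_latency_us[idx] + target_us);
	return 0;
}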
Diffstat (limited to 'block/blk-throttle.c')
-rw-r--r--  block/blk-throttle.c  | 166
1 file changed, 161 insertions(+), 5 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 6e1c29860eec..140da29f5800 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -28,6 +28,8 @@ static int throtl_quantum = 32;
 /* default latency target is 0, eg, guarantee IO latency by default */
 #define DFL_LATENCY_TARGET (0)
 
+#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
+
 static struct blkcg_policy blkcg_policy_throtl;
 
 /* A workqueue to queue throttle related work */
@@ -165,6 +167,19 @@ struct throtl_grp {
 	unsigned long idletime_threshold; /* us */
 };
 
+/* We measure latency for request size from <= 4k to >= 1M */
+#define LATENCY_BUCKET_SIZE 9
+
+struct latency_bucket {
+	unsigned long total_latency; /* ns / 1024 */
+	int samples;
+};
+
+struct avg_latency_bucket {
+	unsigned long latency; /* ns / 1024 */
+	bool valid;
+};
+
 struct throtl_data
 {
 	/* service tree for active throtl groups */
@@ -188,6 +203,13 @@ struct throtl_data
 	unsigned long low_downgrade_time;
 
 	unsigned int scale;
+
+	struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
+	struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
+	struct latency_bucket __percpu *latency_buckets;
+	unsigned long last_calculate_time;
+
+	bool track_bio_latency;
 };
 
 static void throtl_pending_timer_fn(unsigned long arg);
@@ -306,6 +328,9 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
 	return ret;
 }
 
+#define request_bucket_index(sectors) \
+	clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
+
 /**
  * throtl_log - log debug message via blktrace
  * @sq: the service_queue being reported
@@ -1931,6 +1956,73 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
 	tg->checked_last_finish_time = last_finish_time;
 }
 
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_update_latency_buckets(struct throtl_data *td)
+{
+	struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
+	int i, cpu;
+	unsigned long last_latency = 0;
+	unsigned long latency;
+
+	if (!blk_queue_nonrot(td->queue))
+		return;
+	if (time_before(jiffies, td->last_calculate_time + HZ))
+		return;
+	td->last_calculate_time = jiffies;
+
+	memset(avg_latency, 0, sizeof(avg_latency));
+	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+		struct latency_bucket *tmp = &td->tmp_buckets[i];
+
+		for_each_possible_cpu(cpu) {
+			struct latency_bucket *bucket;
+
+			/* this isn't race free, but ok in practice */
+			bucket = per_cpu_ptr(td->latency_buckets, cpu);
+			tmp->total_latency += bucket[i].total_latency;
+			tmp->samples += bucket[i].samples;
+			bucket[i].total_latency = 0;
+			bucket[i].samples = 0;
+		}
+
+		if (tmp->samples >= 32) {
+			int samples = tmp->samples;
+
+			latency = tmp->total_latency;
+
+			tmp->total_latency = 0;
+			tmp->samples = 0;
+			latency /= samples;
+			if (latency == 0)
+				continue;
+			avg_latency[i].latency = latency;
+		}
+	}
+
+	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+		if (!avg_latency[i].latency) {
+			if (td->avg_buckets[i].latency < last_latency)
+				td->avg_buckets[i].latency = last_latency;
+			continue;
+		}
+
+		if (!td->avg_buckets[i].valid)
+			latency = avg_latency[i].latency;
+		else
+			latency = (td->avg_buckets[i].latency * 7 +
+				avg_latency[i].latency) >> 3;
+
+		td->avg_buckets[i].latency = max(latency, last_latency);
+		td->avg_buckets[i].valid = true;
+		last_latency = td->avg_buckets[i].latency;
+	}
+}
+#else
+static inline void throtl_update_latency_buckets(struct throtl_data *td)
+{
+}
+#endif
+
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		    struct bio *bio)
 {
@@ -1939,6 +2031,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio);
 	bool throttled = false;
+	struct throtl_data *td = tg->td;
 	int ret;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
@@ -1949,6 +2042,8 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
 	spin_lock_irq(q->queue_lock);
 
+	throtl_update_latency_buckets(td);
+
 	if (unlikely(blk_queue_bypass(q)))
 		goto out_unlock;
 
@@ -1956,6 +2051,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	if (ret == 0 || ret == -EBUSY)
 		bio->bi_cg_private = tg;
+	blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
 #endif
 	blk_throtl_update_idletime(tg);
 
@@ -1974,8 +2070,8 @@ again:
 		/* if above limits, break to queue */
 		if (!tg_may_dispatch(tg, bio, NULL)) {
 			tg->last_low_overflow_time[rw] = jiffies;
-			if (throtl_can_upgrade(tg->td, tg)) {
-				throtl_upgrade_state(tg->td);
+			if (throtl_can_upgrade(td, tg)) {
+				throtl_upgrade_state(td);
 				goto again;
 			}
 			break;
@@ -2019,7 +2115,7 @@ again:
 
 	tg->last_low_overflow_time[rw] = jiffies;
 
-	tg->td->nr_queued[rw]++;
+	td->nr_queued[rw]++;
 	throtl_add_bio_tg(bio, qn, tg);
 	throttled = true;
 
@@ -2044,20 +2140,67 @@ out:
 	 */
 	if (!throttled)
 		bio_clear_flag(bio, BIO_THROTTLED);
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+	if (throttled || !td->track_bio_latency)
+		bio->bi_issue_stat.stat |= SKIP_LATENCY;
+#endif
 	return throttled;
 }
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_track_latency(struct throtl_data *td, sector_t size,
+	int op, unsigned long time)
+{
+	struct latency_bucket *latency;
+	int index;
+
+	if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+	    !blk_queue_nonrot(td->queue))
+		return;
+
+	index = request_bucket_index(size);
+
+	latency = get_cpu_ptr(td->latency_buckets);
+	latency[index].total_latency += time;
+	latency[index].samples++;
+	put_cpu_ptr(td->latency_buckets);
+}
+
+void blk_throtl_stat_add(struct request *rq, u64 time_ns)
+{
+	struct request_queue *q = rq->q;
+	struct throtl_data *td = q->td;
+
+	throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
+		req_op(rq), time_ns >> 10);
+}
+
 void blk_throtl_bio_endio(struct bio *bio)
 {
 	struct throtl_grp *tg;
+	u64 finish_time_ns;
+	unsigned long finish_time;
+	unsigned long start_time;
+	unsigned long lat;
 
 	tg = bio->bi_cg_private;
 	if (!tg)
 		return;
 	bio->bi_cg_private = NULL;
 
-	tg->last_finish_time = ktime_get_ns() >> 10;
+	finish_time_ns = ktime_get_ns();
+	tg->last_finish_time = finish_time_ns >> 10;
+
+	start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
+	finish_time = __blk_stat_time(finish_time_ns) >> 10;
+	/* this is only for bio based driver */
+	if (start_time && finish_time > start_time &&
+	    !(bio->bi_issue_stat.stat & SKIP_LATENCY)) {
+		lat = finish_time - start_time;
+		throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
+			bio_op(bio), lat);
+	}
 }
 #endif
 
@@ -2133,6 +2276,12 @@ int blk_throtl_init(struct request_queue *q)
 	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
 	if (!td)
 		return -ENOMEM;
+	td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+		LATENCY_BUCKET_SIZE, __alignof__(u64));
+	if (!td->latency_buckets) {
+		kfree(td);
+		return -ENOMEM;
+	}
 
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 	throtl_service_queue_init(&td->service_queue);
@@ -2147,8 +2296,10 @@ int blk_throtl_init(struct request_queue *q)
 
 	/* activate policy */
 	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
-	if (ret)
+	if (ret) {
+		free_percpu(td->latency_buckets);
 		kfree(td);
+	}
 	return ret;
 }
 
@@ -2157,6 +2308,7 @@ void blk_throtl_exit(struct request_queue *q)
 	BUG_ON(!q->td);
 	throtl_shutdown_wq(q);
 	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+	free_percpu(q->td->latency_buckets);
 	kfree(q->td);
 }
 
@@ -2181,6 +2333,10 @@ void blk_throtl_register_queue(struct request_queue *q)
 		td->throtl_slice = DFL_THROTL_SLICE_HD;
 #endif
 
+	td->track_bio_latency = !q->mq_ops && !q->request_fn;
+	if (!td->track_bio_latency)
+		blk_stat_enable_accounting(q);
+
 	/*
 	 * some tg are created before queue is fully initialized, eg, nonrot
 	 * isn't initialized yet