author     Shaohua Li <shli@fb.com>    2017-03-27 18:19:42 -0400
committer  Jens Axboe <axboe@fb.com>   2017-03-28 10:02:20 -0400
commit     b9147dd1bae2b15d6931ecd42f8606c775fecbc9 (patch)
tree       9becbcfbf24e535538680bb53f38962808b4e28e
parent     88eeca495ba7de749ff253376ec6be19bb05368d (diff)
blk-throttle: add a mechanism to estimate IO latency
The user configures a latency target, but the latency threshold for each request size isn't fixed. For an SSD, IO latency depends heavily on request size. To calculate the latency threshold, we sample some data, e.g., the average latency for request sizes 4k, 8k, 16k, 32k .. 1M. The latency threshold for each request size is then the sampled latency (I'll call it the base latency) plus the latency target. For example, if the base latency for 4k requests is 80us and the user configures a latency target of 60us, the 4k latency threshold will be 80 + 60 = 140us.

To sample data, we calculate the order base 2 of the rounded-up IO sectors. If the IO size is bigger than 1M, it is accounted as 1M. Since the calculation rounds up, the base latency will be slightly smaller than the actual value. Also, if there isn't any IO dispatched for a specific IO size, we use the base latency of the next smaller IO size for it.

But we shouldn't sample data at just any time. The base latency is supposed to be the latency when the disk isn't congested, because we use the latency threshold to schedule IOs between cgroups. If the disk is congested, the latency is higher, and using it for scheduling is meaningless. Hence we only do the sampling when block throttling is in the LOW limit, with the assumption that the disk isn't congested in that state. If the assumption isn't true, e.g., the low limit is too high, the calculated latency threshold will be higher.

Hard disks are completely different: latency depends on spindle seeks instead of request size. Currently this feature is SSD only; we could probably use a fixed threshold like 4ms for hard disks.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
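To make the bucket mapping and the threshold arithmetic above concrete, here is a minimal userspace C sketch (not part of the patch). It mirrors the request_bucket_index() macro added below, where bucket 0 holds IOs of <= 4k (8 sectors) and bucket 8 holds IOs of >= 1M; the base latencies and the standalone order_base_2() helper are illustrative assumptions, not values from the patch.

/* Userspace sketch of the size-bucket mapping and threshold arithmetic. */
#include <stdio.h>

#define LATENCY_BUCKET_SIZE 9           /* 4k, 8k, ... 1M, in powers of two */

/* log2 of the value rounded up to the next power of two */
static int order_base_2(unsigned long sectors)
{
        int order = 0;

        while ((1UL << order) < sectors)
                order++;
        return order;
}

/* same mapping as the request_bucket_index() macro in the patch */
static int request_bucket_index(unsigned long sectors)
{
        int idx = order_base_2(sectors) - 3;    /* bucket 0 == 8 sectors == 4k */

        if (idx < 0)
                idx = 0;
        if (idx > LATENCY_BUCKET_SIZE - 1)
                idx = LATENCY_BUCKET_SIZE - 1;
        return idx;
}

int main(void)
{
        /* hypothetical sampled base latencies per bucket, in us */
        unsigned long base_latency[LATENCY_BUCKET_SIZE] = {
                80, 90, 110, 150, 220, 350, 600, 1100, 2100 };
        unsigned long latency_target = 60;      /* user-configured target, us */
        unsigned long bytes = 6 * 1024;         /* a 6k read rounds up to 8k */
        int idx = request_bucket_index(bytes >> 9);

        printf("6k IO lands in bucket %d, threshold %lu us\n",
               idx, base_latency[idx] + latency_target);
        return 0;
}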
-rw-r--r--   block/blk-stat.c             15
-rw-r--r--   block/blk-stat.h              3
-rw-r--r--   block/blk-throttle.c        166
-rw-r--r--   block/blk.h                   2
-rw-r--r--   include/linux/blk_types.h     9
5 files changed, 185 insertions, 10 deletions
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 188b535cf4d6..e77ec52f5bb5 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -9,12 +9,14 @@
 
 #include "blk-stat.h"
 #include "blk-mq.h"
+#include "blk.h"
 
 #define BLK_RQ_STAT_BATCH        64
 
 struct blk_queue_stats {
         struct list_head callbacks;
         spinlock_t lock;
+        bool enable_accounting;
 };
 
 unsigned int blk_stat_rq_ddir(const struct request *rq)
@@ -96,6 +98,8 @@ void blk_stat_add(struct request *rq)
 
         value = now - blk_stat_time(&rq->issue_stat);
 
+        blk_throtl_stat_add(rq, value);
+
         rcu_read_lock();
         list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
                 if (blk_stat_is_active(cb)) {
@@ -190,7 +194,7 @@ void blk_stat_remove_callback(struct request_queue *q,
 {
         spin_lock(&q->stats->lock);
         list_del_rcu(&cb->list);
-        if (list_empty(&q->stats->callbacks))
+        if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting)
                 clear_bit(QUEUE_FLAG_STATS, &q->queue_flags);
         spin_unlock(&q->stats->lock);
 
@@ -215,6 +219,14 @@ void blk_stat_free_callback(struct blk_stat_callback *cb)
 }
 EXPORT_SYMBOL_GPL(blk_stat_free_callback);
 
+void blk_stat_enable_accounting(struct request_queue *q)
+{
+        spin_lock(&q->stats->lock);
+        q->stats->enable_accounting = true;
+        set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+        spin_unlock(&q->stats->lock);
+}
+
 struct blk_queue_stats *blk_alloc_queue_stats(void)
 {
         struct blk_queue_stats *stats;
@@ -225,6 +237,7 @@ struct blk_queue_stats *blk_alloc_queue_stats(void)
 
         INIT_LIST_HEAD(&stats->callbacks);
         spin_lock_init(&stats->lock);
+        stats->enable_accounting = false;
 
         return stats;
 }
diff --git a/block/blk-stat.h b/block/blk-stat.h
index ee47f816d5bd..53f08a63bf15 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -108,6 +108,9 @@ static inline void blk_stat_set_issue(struct blk_issue_stat *stat,
                 (((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT);
 }
 
+/* record time/size info in request but not add a callback */
+void blk_stat_enable_accounting(struct request_queue *q);
+
 /*
  * blk_stat_rq_ddir() - Bucket callback function for the request data direction.
  * @rq: Request.
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 6e1c29860eec..140da29f5800 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -28,6 +28,8 @@ static int throtl_quantum = 32;
 /* default latency target is 0, eg, guarantee IO latency by default */
 #define DFL_LATENCY_TARGET (0)
 
+#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
+
 static struct blkcg_policy blkcg_policy_throtl;
 
 /* A workqueue to queue throttle related work */
@@ -165,6 +167,19 @@ struct throtl_grp {
         unsigned long idletime_threshold; /* us */
 };
 
+/* We measure latency for request size from <= 4k to >= 1M */
+#define LATENCY_BUCKET_SIZE 9
+
+struct latency_bucket {
+        unsigned long total_latency; /* ns / 1024 */
+        int samples;
+};
+
+struct avg_latency_bucket {
+        unsigned long latency; /* ns / 1024 */
+        bool valid;
+};
+
 struct throtl_data
 {
         /* service tree for active throtl groups */
@@ -188,6 +203,13 @@ struct throtl_data
         unsigned long low_downgrade_time;
 
         unsigned int scale;
+
+        struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
+        struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
+        struct latency_bucket __percpu *latency_buckets;
+        unsigned long last_calculate_time;
+
+        bool track_bio_latency;
 };
 
 static void throtl_pending_timer_fn(unsigned long arg);
@@ -306,6 +328,9 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
         return ret;
 }
 
+#define request_bucket_index(sectors) \
+        clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
+
 /**
  * throtl_log - log debug message via blktrace
  * @sq: the service_queue being reported
@@ -1931,6 +1956,73 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
         tg->checked_last_finish_time = last_finish_time;
 }
 
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_update_latency_buckets(struct throtl_data *td)
+{
+        struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
+        int i, cpu;
+        unsigned long last_latency = 0;
+        unsigned long latency;
+
+        if (!blk_queue_nonrot(td->queue))
+                return;
+        if (time_before(jiffies, td->last_calculate_time + HZ))
+                return;
+        td->last_calculate_time = jiffies;
+
+        memset(avg_latency, 0, sizeof(avg_latency));
+        for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+                struct latency_bucket *tmp = &td->tmp_buckets[i];
+
+                for_each_possible_cpu(cpu) {
+                        struct latency_bucket *bucket;
+
+                        /* this isn't race free, but ok in practice */
+                        bucket = per_cpu_ptr(td->latency_buckets, cpu);
+                        tmp->total_latency += bucket[i].total_latency;
+                        tmp->samples += bucket[i].samples;
+                        bucket[i].total_latency = 0;
+                        bucket[i].samples = 0;
+                }
+
+                if (tmp->samples >= 32) {
+                        int samples = tmp->samples;
+
+                        latency = tmp->total_latency;
+
+                        tmp->total_latency = 0;
+                        tmp->samples = 0;
+                        latency /= samples;
+                        if (latency == 0)
+                                continue;
+                        avg_latency[i].latency = latency;
+                }
+        }
+
+        for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+                if (!avg_latency[i].latency) {
+                        if (td->avg_buckets[i].latency < last_latency)
+                                td->avg_buckets[i].latency = last_latency;
+                        continue;
+                }
+
+                if (!td->avg_buckets[i].valid)
+                        latency = avg_latency[i].latency;
+                else
+                        latency = (td->avg_buckets[i].latency * 7 +
+                                avg_latency[i].latency) >> 3;
+
+                td->avg_buckets[i].latency = max(latency, last_latency);
+                td->avg_buckets[i].valid = true;
+                last_latency = td->avg_buckets[i].latency;
+        }
+}
+#else
+static inline void throtl_update_latency_buckets(struct throtl_data *td)
+{
+}
+#endif
+
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
                     struct bio *bio)
 {
@@ -1939,6 +2031,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
         struct throtl_service_queue *sq;
         bool rw = bio_data_dir(bio);
         bool throttled = false;
+        struct throtl_data *td = tg->td;
         int ret;
 
         WARN_ON_ONCE(!rcu_read_lock_held());
@@ -1949,6 +2042,8 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
         spin_lock_irq(q->queue_lock);
 
+        throtl_update_latency_buckets(td);
+
         if (unlikely(blk_queue_bypass(q)))
                 goto out_unlock;
 
@@ -1956,6 +2051,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
         if (ret == 0 || ret == -EBUSY)
                 bio->bi_cg_private = tg;
+        blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
 #endif
         blk_throtl_update_idletime(tg);
 
@@ -1974,8 +2070,8 @@ again:
                 /* if above limits, break to queue */
                 if (!tg_may_dispatch(tg, bio, NULL)) {
                         tg->last_low_overflow_time[rw] = jiffies;
-                        if (throtl_can_upgrade(tg->td, tg)) {
-                                throtl_upgrade_state(tg->td);
+                        if (throtl_can_upgrade(td, tg)) {
+                                throtl_upgrade_state(td);
                                 goto again;
                         }
                         break;
@@ -2019,7 +2115,7 @@ again:
 
         tg->last_low_overflow_time[rw] = jiffies;
 
-        tg->td->nr_queued[rw]++;
+        td->nr_queued[rw]++;
         throtl_add_bio_tg(bio, qn, tg);
         throttled = true;
 
@@ -2044,20 +2140,67 @@ out:
          */
         if (!throttled)
                 bio_clear_flag(bio, BIO_THROTTLED);
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+        if (throttled || !td->track_bio_latency)
+                bio->bi_issue_stat.stat |= SKIP_LATENCY;
+#endif
         return throttled;
 }
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_track_latency(struct throtl_data *td, sector_t size,
+        int op, unsigned long time)
+{
+        struct latency_bucket *latency;
+        int index;
+
+        if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+                !blk_queue_nonrot(td->queue))
+                return;
+
+        index = request_bucket_index(size);
+
+        latency = get_cpu_ptr(td->latency_buckets);
+        latency[index].total_latency += time;
+        latency[index].samples++;
+        put_cpu_ptr(td->latency_buckets);
+}
+
+void blk_throtl_stat_add(struct request *rq, u64 time_ns)
+{
+        struct request_queue *q = rq->q;
+        struct throtl_data *td = q->td;
+
+        throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
+                req_op(rq), time_ns >> 10);
+}
+
 void blk_throtl_bio_endio(struct bio *bio)
 {
         struct throtl_grp *tg;
+        u64 finish_time_ns;
+        unsigned long finish_time;
+        unsigned long start_time;
+        unsigned long lat;
 
         tg = bio->bi_cg_private;
         if (!tg)
                 return;
         bio->bi_cg_private = NULL;
 
-        tg->last_finish_time = ktime_get_ns() >> 10;
+        finish_time_ns = ktime_get_ns();
+        tg->last_finish_time = finish_time_ns >> 10;
+
+        start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
+        finish_time = __blk_stat_time(finish_time_ns) >> 10;
+        /* this is only for bio based driver */
+        if (start_time && finish_time > start_time &&
+            !(bio->bi_issue_stat.stat & SKIP_LATENCY)) {
+                lat = finish_time - start_time;
+                throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
+                        bio_op(bio), lat);
+        }
 }
 #endif
 
@@ -2133,6 +2276,12 @@ int blk_throtl_init(struct request_queue *q)
         td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
         if (!td)
                 return -ENOMEM;
+        td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+                LATENCY_BUCKET_SIZE, __alignof__(u64));
+        if (!td->latency_buckets) {
+                kfree(td);
+                return -ENOMEM;
+        }
 
         INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
         throtl_service_queue_init(&td->service_queue);
@@ -2147,8 +2296,10 @@ int blk_throtl_init(struct request_queue *q)
 
         /* activate policy */
         ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
-        if (ret)
+        if (ret) {
+                free_percpu(td->latency_buckets);
                 kfree(td);
+        }
         return ret;
 }
 
@@ -2157,6 +2308,7 @@ void blk_throtl_exit(struct request_queue *q)
         BUG_ON(!q->td);
         throtl_shutdown_wq(q);
         blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+        free_percpu(q->td->latency_buckets);
         kfree(q->td);
 }
 
@@ -2181,6 +2333,10 @@ void blk_throtl_register_queue(struct request_queue *q)
                 td->throtl_slice = DFL_THROTL_SLICE_HD;
 #endif
 
+        td->track_bio_latency = !q->mq_ops && !q->request_fn;
+        if (!td->track_bio_latency)
+                blk_stat_enable_accounting(q);
+
         /*
          * some tg are created before queue is fully initialized, eg, nonrot
          * isn't initialized yet
diff --git a/block/blk.h b/block/blk.h
index 3ac833ec2adb..07d375183f31 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -331,8 +331,10 @@ extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
 extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
         const char *page, size_t count);
 extern void blk_throtl_bio_endio(struct bio *bio);
+extern void blk_throtl_stat_add(struct request *rq, u64 time);
 #else
 static inline void blk_throtl_bio_endio(struct bio *bio) { }
+static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
 #endif
 
 #endif /* BLK_INTERNAL_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 3ad567347671..67bcf8a5326e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -17,6 +17,10 @@ struct io_context;
 struct cgroup_subsys_state;
 typedef void (bio_end_io_t) (struct bio *);
 
+struct blk_issue_stat {
+        u64 stat;
+};
+
 /*
  * main unit of I/O for the block layer and lower layers (ie drivers and
  * stacking drivers)
@@ -60,6 +64,7 @@ struct bio {
         struct cgroup_subsys_state *bi_css;
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
         void *bi_cg_private;
+        struct blk_issue_stat bi_issue_stat;
 #endif
 #endif
         union {
@@ -286,10 +291,6 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
         return (cookie & BLK_QC_T_INTERNAL) != 0;
 }
 
-struct blk_issue_stat {
-        u64 stat;
-};
-
 struct blk_rq_stat {
         s64 mean;
         u64 min;