author    Omar Sandoval <osandov@fb.com>    2018-09-27 18:55:54 -0400
committer Jens Axboe <axboe@kernel.dk>      2018-09-27 19:34:57 -0400
commit    6e25cb01ea206362616a2be469d4f3635f58ca63 (patch)
tree      4910f9e2fcfe20267e6337394080cc489cbdd150 /block
parent    fa2a1f609e6491383ab63ff6329e0aaa2db2b9f7 (diff)
kyber: implement improved heuristics
Kyber's current heuristics have a few flaws:

- It's based on the mean latency, but p99 latency tends to be more meaningful
  to anyone who cares about latency. The mean can also be skewed by rare
  outliers that the scheduler can't do anything about.
- The statistics calculations are purely time-based with a short window. This
  works for steady, high load, but is more sensitive to outliers with bursty
  workloads.
- It only considers the latency once an I/O has been submitted to the device,
  but the user cares about the time spent in the kernel, as well.

These are shortcomings of the generic blk-stat code which doesn't quite fit the
ideal use case for Kyber. So, this replaces the statistics with a histogram
used to calculate percentiles of total latency and I/O latency, which we then
use to adjust depths in a slightly more intelligent manner:

- Sync and async writes are now the same domain.
- Discards are a separate domain.
- Domain queue depths are scaled by the ratio of the p99 total latency to the
  target latency (e.g., if the p99 latency is double the target latency, we
  will double the queue depth; if the p99 latency is half of the target
  latency, we can halve the queue depth).
- We use the I/O latency to determine whether we should scale queue depths
  down: we will only scale down if any domain's I/O latency exceeds the target
  latency, which is an indicator of congestion in the device.

These new heuristics are just as scalable as the heuristics they replace.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
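To make the histogram concrete, here is a minimal userspace sketch of the
bucketing rule used when a completed request is recorded, modeled on the
add_latency_sample() helper added by the patch below: the bucket width is a
quarter of the target latency, the first four buckets cover latencies at or
below the target, and everything beyond 1 3/4 of the target collapses into the
last bucket. The function and constant names here are illustrative, not part
of the patch.

	/* Standalone sketch of the latency bucketing; no kernel headers. */
	#include <stdint.h>
	#include <stdio.h>

	#define LATENCY_SHIFT   2                    /* bucket width = target / 4 */
	#define LATENCY_BUCKETS (2 << LATENCY_SHIFT) /* 4 "good" + 4 "bad" buckets */

	static unsigned int latency_bucket(uint64_t target_ns, uint64_t latency_ns)
	{
		uint64_t width = target_ns >> LATENCY_SHIFT;
		uint64_t bucket;

		if (latency_ns == 0)
			return 0;
		if (width == 0)
			width = 1;
		bucket = (latency_ns - 1) / width;    /* <= N * width lands in bucket N - 1 */
		if (bucket > LATENCY_BUCKETS - 1)
			bucket = LATENCY_BUCKETS - 1; /* > 1 3/4 * target: last bucket */
		return (unsigned int)bucket;
	}

	int main(void)
	{
		uint64_t target = 2000000;            /* 2 ms read target, in ns */

		/* 1.9 ms is <= target, so bucket 3; 4.5 ms is > 1.75 * target, so bucket 7. */
		printf("%u %u\n", latency_bucket(target, 1900000),
		       latency_bucket(target, 4500000));
		return 0;
	}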
Diffstat (limited to 'block')
-rw-r--r--  block/kyber-iosched.c | 497
1 file changed, 279 insertions(+), 218 deletions(-)
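Before the diff itself, a condensed sketch of how the aggregated histogram
drives the depth adjustment, modeled on calculate_percentile() and the depth
math in kyber_timer_fn() in the diff that follows. The 500-sample/one-second
warm-up, the per-cpu flushing, and the p90 I/O-latency congestion check are
omitted, and the function names here are illustrative.

	#include <stdio.h>

	#define LATENCY_SHIFT   2
	#define LATENCY_BUCKETS (2 << LATENCY_SHIFT)

	/* Smallest bucket index covering the requested percentile, or -1 if empty. */
	static int histogram_percentile(const unsigned int buckets[LATENCY_BUCKETS],
					unsigned int percentile)
	{
		unsigned int bucket, samples = 0, percentile_samples;

		for (bucket = 0; bucket < LATENCY_BUCKETS; bucket++)
			samples += buckets[bucket];
		if (!samples)
			return -1;

		percentile_samples = (samples * percentile + 99) / 100;  /* round up */
		for (bucket = 0; bucket < LATENCY_BUCKETS - 1; bucket++) {
			if (buckets[bucket] >= percentile_samples)
				break;
			percentile_samples -= buckets[bucket];
		}
		return bucket;
	}

	/* Scale the domain depth linearly with the p99 latency vs. the target. */
	static unsigned int scale_depth(unsigned int depth, int p99_bucket,
					unsigned int max_depth)
	{
		depth = (depth * (p99_bucket + 1)) >> LATENCY_SHIFT;
		if (depth < 1)
			depth = 1;
		if (depth > max_depth)
			depth = max_depth;
		return depth;
	}

	int main(void)
	{
		/* 100 samples, almost all at or below 3/4 of the target latency. */
		unsigned int buckets[LATENCY_BUCKETS] = { 50, 40, 9, 1, 0, 0, 0, 0 };
		int p99 = histogram_percentile(buckets, 99);

		/*
		 * p99 lands in bucket 2 (<= 3/4 * target), so under congestion a
		 * read depth of 256 would be throttled to 256 * 3 / 4 = 192.
		 */
		printf("p99 bucket = %d, new depth = %u\n",
		       p99, scale_depth(256, p99, 256));
		return 0;
	}

In the patch itself this scaling is applied per scheduling domain by
kyber_resize_domain(), which clamps the result between 1 and the kyber_depth[]
maximum shown in the diff.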
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 08eb5295c18d..adc8e6393829 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -29,13 +29,16 @@
29#include "blk-mq-debugfs.h" 29#include "blk-mq-debugfs.h"
30#include "blk-mq-sched.h" 30#include "blk-mq-sched.h"
31#include "blk-mq-tag.h" 31#include "blk-mq-tag.h"
32#include "blk-stat.h"
33 32
34/* Scheduling domains. */ 33/*
34 * Scheduling domains: the device is divided into multiple domains based on the
35 * request type.
36 */
35enum { 37enum {
36 KYBER_READ, 38 KYBER_READ,
37 KYBER_SYNC_WRITE, 39 KYBER_WRITE,
38 KYBER_OTHER, /* Async writes, discard, etc. */ 40 KYBER_DISCARD,
41 KYBER_OTHER,
39 KYBER_NUM_DOMAINS, 42 KYBER_NUM_DOMAINS,
40}; 43};
41 44
@@ -49,25 +52,82 @@ enum {
49}; 52};
50 53
51/* 54/*
52 * Initial device-wide depths for each scheduling domain. 55 * Maximum device-wide depth for each scheduling domain.
53 * 56 *
54 * Even for fast devices with lots of tags like NVMe, you can saturate 57 * Even for fast devices with lots of tags like NVMe, you can saturate the
55 * the device with only a fraction of the maximum possible queue depth. 58 * device with only a fraction of the maximum possible queue depth. So, we cap
56 * So, we cap these to a reasonable value. 59 * these to a reasonable value.
57 */ 60 */
58static const unsigned int kyber_depth[] = { 61static const unsigned int kyber_depth[] = {
59 [KYBER_READ] = 256, 62 [KYBER_READ] = 256,
60 [KYBER_SYNC_WRITE] = 128, 63 [KYBER_WRITE] = 128,
61 [KYBER_OTHER] = 64, 64 [KYBER_DISCARD] = 64,
65 [KYBER_OTHER] = 16,
62}; 66};
63 67
64/* 68/*
65 * Scheduling domain batch sizes. We favor reads. 69 * Default latency targets for each scheduling domain.
70 */
71static const u64 kyber_latency_targets[] = {
72 [KYBER_READ] = 2 * NSEC_PER_MSEC,
73 [KYBER_WRITE] = 10 * NSEC_PER_MSEC,
74 [KYBER_DISCARD] = 5 * NSEC_PER_SEC,
75};
76
77/*
78 * Batch size (number of requests we'll dispatch in a row) for each scheduling
79 * domain.
66 */ 80 */
67static const unsigned int kyber_batch_size[] = { 81static const unsigned int kyber_batch_size[] = {
68 [KYBER_READ] = 16, 82 [KYBER_READ] = 16,
69 [KYBER_SYNC_WRITE] = 8, 83 [KYBER_WRITE] = 8,
70 [KYBER_OTHER] = 8, 84 [KYBER_DISCARD] = 1,
85 [KYBER_OTHER] = 1,
86};
87
88/*
89 * Requests latencies are recorded in a histogram with buckets defined relative
90 * to the target latency:
91 *
92 * <= 1/4 * target latency
93 * <= 1/2 * target latency
94 * <= 3/4 * target latency
95 * <= target latency
96 * <= 1 1/4 * target latency
97 * <= 1 1/2 * target latency
98 * <= 1 3/4 * target latency
99 * > 1 3/4 * target latency
100 */
101enum {
102 /*
103 * The width of the latency histogram buckets is
104 * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
105 */
106 KYBER_LATENCY_SHIFT = 2,
107 /*
108 * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
109 * thus, "good".
110 */
111 KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
112 /* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
113 KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
114};
115
116/*
117 * We measure both the total latency and the I/O latency (i.e., latency after
118 * submitting to the device).
119 */
120enum {
121 KYBER_TOTAL_LATENCY,
122 KYBER_IO_LATENCY,
123};
124
125/*
126 * Per-cpu latency histograms: total latency and I/O latency for each scheduling
127 * domain except for KYBER_OTHER.
128 */
129struct kyber_cpu_latency {
130 atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
71}; 131};
72 132
73/* 133/*
@@ -84,14 +144,9 @@ struct kyber_ctx_queue {
84} ____cacheline_aligned_in_smp; 144} ____cacheline_aligned_in_smp;
85 145
86struct kyber_queue_data { 146struct kyber_queue_data {
87 struct request_queue *q;
88
89 struct blk_stat_callback *cb;
90
91 /* 147 /*
92 * The device is divided into multiple scheduling domains based on the 148 * Each scheduling domain has a limited number of in-flight requests
93 * request type. Each domain has a fixed number of in-flight requests of 149 * device-wide, limited by these tokens.
94 * that type device-wide, limited by these tokens.
95 */ 150 */
96 struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS]; 151 struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
97 152
@@ -101,8 +156,19 @@ struct kyber_queue_data {
101 */ 156 */
102 unsigned int async_depth; 157 unsigned int async_depth;
103 158
159 struct kyber_cpu_latency __percpu *cpu_latency;
160
161 /* Timer for stats aggregation and adjusting domain tokens. */
162 struct timer_list timer;
163
164 unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
165
166 unsigned long latency_timeout[KYBER_OTHER];
167
168 int domain_p99[KYBER_OTHER];
169
104 /* Target latencies in nanoseconds. */ 170 /* Target latencies in nanoseconds. */
105 u64 read_lat_nsec, write_lat_nsec; 171 u64 latency_targets[KYBER_OTHER];
106}; 172};
107 173
108struct kyber_hctx_data { 174struct kyber_hctx_data {
@@ -122,182 +188,165 @@ static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
122 188
123static unsigned int kyber_sched_domain(unsigned int op) 189static unsigned int kyber_sched_domain(unsigned int op)
124{ 190{
125 if ((op & REQ_OP_MASK) == REQ_OP_READ) 191 switch (op & REQ_OP_MASK) {
192 case REQ_OP_READ:
126 return KYBER_READ; 193 return KYBER_READ;
127 else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op)) 194 case REQ_OP_WRITE:
128 return KYBER_SYNC_WRITE; 195 return KYBER_WRITE;
129 else 196 case REQ_OP_DISCARD:
197 return KYBER_DISCARD;
198 default:
130 return KYBER_OTHER; 199 return KYBER_OTHER;
200 }
131} 201}
132 202
133enum { 203static void flush_latency_buckets(struct kyber_queue_data *kqd,
134 NONE = 0, 204 struct kyber_cpu_latency *cpu_latency,
135 GOOD = 1, 205 unsigned int sched_domain, unsigned int type)
136 GREAT = 2,
137 BAD = -1,
138 AWFUL = -2,
139};
140
141#define IS_GOOD(status) ((status) > 0)
142#define IS_BAD(status) ((status) < 0)
143
144static int kyber_lat_status(struct blk_stat_callback *cb,
145 unsigned int sched_domain, u64 target)
146{ 206{
147 u64 latency; 207 unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
148 208 atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
149 if (!cb->stat[sched_domain].nr_samples) 209 unsigned int bucket;
150 return NONE;
151 210
152 latency = cb->stat[sched_domain].mean; 211 for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
153 if (latency >= 2 * target) 212 buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
154 return AWFUL;
155 else if (latency > target)
156 return BAD;
157 else if (latency <= target / 2)
158 return GREAT;
159 else /* (latency <= target) */
160 return GOOD;
161} 213}
162 214
163/* 215/*
164 * Adjust the read or synchronous write depth given the status of reads and 216 * Calculate the histogram bucket with the given percentile rank, or -1 if there
165 * writes. The goal is that the latencies of the two domains are fair (i.e., if 217 * aren't enough samples yet.
166 * one is good, then the other is good).
167 */ 218 */
168static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd, 219static int calculate_percentile(struct kyber_queue_data *kqd,
169 unsigned int sched_domain, int this_status, 220 unsigned int sched_domain, unsigned int type,
170 int other_status) 221 unsigned int percentile)
171{ 222{
172 unsigned int orig_depth, depth; 223 unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
224 unsigned int bucket, samples = 0, percentile_samples;
225
226 for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
227 samples += buckets[bucket];
228
229 if (!samples)
230 return -1;
173 231
174 /* 232 /*
175 * If this domain had no samples, or reads and writes are both good or 233 * We do the calculation once we have 500 samples or one second passes
176 * both bad, don't adjust the depth. 234 * since the first sample was recorded, whichever comes first.
177 */ 235 */
178 if (this_status == NONE || 236 if (!kqd->latency_timeout[sched_domain])
179 (IS_GOOD(this_status) && IS_GOOD(other_status)) || 237 kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
180 (IS_BAD(this_status) && IS_BAD(other_status))) 238 if (samples < 500 &&
181 return; 239 time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
182 240 return -1;
183 orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth; 241 }
242 kqd->latency_timeout[sched_domain] = 0;
184 243
185 if (other_status == NONE) { 244 percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
186 depth++; 245 for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
187 } else { 246 if (buckets[bucket] >= percentile_samples)
188 switch (this_status) {
189 case GOOD:
190 if (other_status == AWFUL)
191 depth -= max(depth / 4, 1U);
192 else
193 depth -= max(depth / 8, 1U);
194 break;
195 case GREAT:
196 if (other_status == AWFUL)
197 depth /= 2;
198 else
199 depth -= max(depth / 4, 1U);
200 break; 247 break;
201 case BAD: 248 percentile_samples -= buckets[bucket];
202 depth++;
203 break;
204 case AWFUL:
205 if (other_status == GREAT)
206 depth += 2;
207 else
208 depth++;
209 break;
210 }
211 } 249 }
250 memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));
212 251
252 return bucket;
253}
254
255static void kyber_resize_domain(struct kyber_queue_data *kqd,
256 unsigned int sched_domain, unsigned int depth)
257{
213 depth = clamp(depth, 1U, kyber_depth[sched_domain]); 258 depth = clamp(depth, 1U, kyber_depth[sched_domain]);
214 if (depth != orig_depth) 259 if (depth != kqd->domain_tokens[sched_domain].sb.depth)
215 sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth); 260 sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
216} 261}
217 262
218/* 263static void kyber_timer_fn(struct timer_list *t)
219 * Adjust the depth of other requests given the status of reads and synchronous 264{
220 * writes. As long as either domain is doing fine, we don't throttle, but if 265 struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
221 * both domains are doing badly, we throttle heavily. 266 unsigned int sched_domain;
222 */ 267 int cpu;
223static void kyber_adjust_other_depth(struct kyber_queue_data *kqd, 268 bool bad = false;
224 int read_status, int write_status, 269
225 bool have_samples) 270 /* Sum all of the per-cpu latency histograms. */
226{ 271 for_each_online_cpu(cpu) {
227 unsigned int orig_depth, depth; 272 struct kyber_cpu_latency *cpu_latency;
228 int status; 273
229 274 cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
230 orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth; 275 for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
231 276 flush_latency_buckets(kqd, cpu_latency, sched_domain,
232 if (read_status == NONE && write_status == NONE) { 277 KYBER_TOTAL_LATENCY);
233 depth += 2; 278 flush_latency_buckets(kqd, cpu_latency, sched_domain,
234 } else if (have_samples) { 279 KYBER_IO_LATENCY);
235 if (read_status == NONE)
236 status = write_status;
237 else if (write_status == NONE)
238 status = read_status;
239 else
240 status = max(read_status, write_status);
241 switch (status) {
242 case GREAT:
243 depth += 2;
244 break;
245 case GOOD:
246 depth++;
247 break;
248 case BAD:
249 depth -= max(depth / 4, 1U);
250 break;
251 case AWFUL:
252 depth /= 2;
253 break;
254 } 280 }
255 } 281 }
256 282
257 depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]); 283 /*
258 if (depth != orig_depth) 284 * Check if any domains have a high I/O latency, which might indicate
259 sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth); 285 * congestion in the device. Note that we use the p90; we don't want to
260} 286 * be too sensitive to outliers here.
261 287 */
262/* 288 for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
263 * Apply heuristics for limiting queue depths based on gathered latency 289 int p90;
264 * statistics.
265 */
266static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
267{
268 struct kyber_queue_data *kqd = cb->data;
269 int read_status, write_status;
270
271 read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
272 write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
273 290
274 kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status); 291 p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
275 kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status); 292 90);
276 kyber_adjust_other_depth(kqd, read_status, write_status, 293 if (p90 >= KYBER_GOOD_BUCKETS)
277 cb->stat[KYBER_OTHER].nr_samples != 0); 294 bad = true;
295 }
278 296
279 /* 297 /*
280 * Continue monitoring latencies if we aren't hitting the targets or 298 * Adjust the scheduling domain depths. If we determined that there was
281 * we're still throttling other requests. 299 * congestion, we throttle all domains with good latencies. Either way,
300 * we ease up on throttling domains with bad latencies.
282 */ 301 */
283 if (!blk_stat_is_active(kqd->cb) && 302 for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
284 ((IS_BAD(read_status) || IS_BAD(write_status) || 303 unsigned int orig_depth, depth;
285 kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER]))) 304 int p99;
286 blk_stat_activate_msecs(kqd->cb, 100); 305
306 p99 = calculate_percentile(kqd, sched_domain,
307 KYBER_TOTAL_LATENCY, 99);
308 /*
309 * This is kind of subtle: different domains will not
310 * necessarily have enough samples to calculate the latency
311 * percentiles during the same window, so we have to remember
312 * the p99 for the next time we observe congestion; once we do,
313 * we don't want to throttle again until we get more data, so we
314 * reset it to -1.
315 */
316 if (bad) {
317 if (p99 < 0)
318 p99 = kqd->domain_p99[sched_domain];
319 kqd->domain_p99[sched_domain] = -1;
320 } else if (p99 >= 0) {
321 kqd->domain_p99[sched_domain] = p99;
322 }
323 if (p99 < 0)
324 continue;
325
326 /*
327 * If this domain has bad latency, throttle less. Otherwise,
328 * throttle more iff we determined that there is congestion.
329 *
330 * The new depth is scaled linearly with the p99 latency vs the
331 * latency target. E.g., if the p99 is 3/4 of the target, then
332 * we throttle down to 3/4 of the current depth, and if the p99
333 * is 2x the target, then we double the depth.
334 */
335 if (bad || p99 >= KYBER_GOOD_BUCKETS) {
336 orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
337 depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
338 kyber_resize_domain(kqd, sched_domain, depth);
339 }
340 }
287} 341}
288 342
289static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd) 343static unsigned int kyber_sched_tags_shift(struct request_queue *q)
290{ 344{
291 /* 345 /*
292 * All of the hardware queues have the same depth, so we can just grab 346 * All of the hardware queues have the same depth, so we can just grab
293 * the shift of the first one. 347 * the shift of the first one.
294 */ 348 */
295 return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift; 349 return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
296}
297
298static int kyber_bucket_fn(const struct request *rq)
299{
300 return kyber_sched_domain(rq->cmd_flags);
301} 350}
302 351
303static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) 352static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
@@ -307,16 +356,17 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
307 int ret = -ENOMEM; 356 int ret = -ENOMEM;
308 int i; 357 int i;
309 358
310 kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node); 359 kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
311 if (!kqd) 360 if (!kqd)
312 goto err; 361 goto err;
313 kqd->q = q;
314 362
315 kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn, 363 kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
316 KYBER_NUM_DOMAINS, kqd); 364 GFP_KERNEL | __GFP_ZERO);
317 if (!kqd->cb) 365 if (!kqd->cpu_latency)
318 goto err_kqd; 366 goto err_kqd;
319 367
368 timer_setup(&kqd->timer, kyber_timer_fn, 0);
369
320 for (i = 0; i < KYBER_NUM_DOMAINS; i++) { 370 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
321 WARN_ON(!kyber_depth[i]); 371 WARN_ON(!kyber_depth[i]);
322 WARN_ON(!kyber_batch_size[i]); 372 WARN_ON(!kyber_batch_size[i]);
@@ -326,20 +376,22 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
326 if (ret) { 376 if (ret) {
327 while (--i >= 0) 377 while (--i >= 0)
328 sbitmap_queue_free(&kqd->domain_tokens[i]); 378 sbitmap_queue_free(&kqd->domain_tokens[i]);
329 goto err_cb; 379 goto err_buckets;
330 } 380 }
331 } 381 }
332 382
333 shift = kyber_sched_tags_shift(kqd); 383 for (i = 0; i < KYBER_OTHER; i++) {
334 kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; 384 kqd->domain_p99[i] = -1;
385 kqd->latency_targets[i] = kyber_latency_targets[i];
386 }
335 387
336 kqd->read_lat_nsec = 2000000ULL; 388 shift = kyber_sched_tags_shift(q);
337 kqd->write_lat_nsec = 10000000ULL; 389 kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
338 390
339 return kqd; 391 return kqd;
340 392
341err_cb: 393err_buckets:
342 blk_stat_free_callback(kqd->cb); 394 free_percpu(kqd->cpu_latency);
343err_kqd: 395err_kqd:
344 kfree(kqd); 396 kfree(kqd);
345err: 397err:
@@ -361,25 +413,24 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
361 return PTR_ERR(kqd); 413 return PTR_ERR(kqd);
362 } 414 }
363 415
416 blk_stat_enable_accounting(q);
417
364 eq->elevator_data = kqd; 418 eq->elevator_data = kqd;
365 q->elevator = eq; 419 q->elevator = eq;
366 420
367 blk_stat_add_callback(q, kqd->cb);
368
369 return 0; 421 return 0;
370} 422}
371 423
372static void kyber_exit_sched(struct elevator_queue *e) 424static void kyber_exit_sched(struct elevator_queue *e)
373{ 425{
374 struct kyber_queue_data *kqd = e->elevator_data; 426 struct kyber_queue_data *kqd = e->elevator_data;
375 struct request_queue *q = kqd->q;
376 int i; 427 int i;
377 428
378 blk_stat_remove_callback(q, kqd->cb); 429 del_timer_sync(&kqd->timer);
379 430
380 for (i = 0; i < KYBER_NUM_DOMAINS; i++) 431 for (i = 0; i < KYBER_NUM_DOMAINS; i++)
381 sbitmap_queue_free(&kqd->domain_tokens[i]); 432 sbitmap_queue_free(&kqd->domain_tokens[i]);
382 blk_stat_free_callback(kqd->cb); 433 free_percpu(kqd->cpu_latency);
383 kfree(kqd); 434 kfree(kqd);
384} 435}
385 436
@@ -547,40 +598,44 @@ static void kyber_finish_request(struct request *rq)
547 rq_clear_domain_token(kqd, rq); 598 rq_clear_domain_token(kqd, rq);
548} 599}
549 600
550static void kyber_completed_request(struct request *rq, u64 now) 601static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
602 unsigned int sched_domain, unsigned int type,
603 u64 target, u64 latency)
551{ 604{
552 struct request_queue *q = rq->q; 605 unsigned int bucket;
553 struct kyber_queue_data *kqd = q->elevator->elevator_data; 606 u64 divisor;
554 unsigned int sched_domain;
555 u64 latency, target;
556 607
557 /* 608 if (latency > 0) {
558 * Check if this request met our latency goal. If not, quickly gather 609 divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
559 * some statistics and start throttling. 610 bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
560 */ 611 KYBER_LATENCY_BUCKETS - 1);
561 sched_domain = kyber_sched_domain(rq->cmd_flags); 612 } else {
562 switch (sched_domain) { 613 bucket = 0;
563 case KYBER_READ:
564 target = kqd->read_lat_nsec;
565 break;
566 case KYBER_SYNC_WRITE:
567 target = kqd->write_lat_nsec;
568 break;
569 default:
570 return;
571 } 614 }
572 615
573 /* If we are already monitoring latencies, don't check again. */ 616 atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
574 if (blk_stat_is_active(kqd->cb)) 617}
575 return;
576 618
577 if (now < rq->io_start_time_ns) 619static void kyber_completed_request(struct request *rq, u64 now)
620{
621 struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
622 struct kyber_cpu_latency *cpu_latency;
623 unsigned int sched_domain;
624 u64 target;
625
626 sched_domain = kyber_sched_domain(rq->cmd_flags);
627 if (sched_domain == KYBER_OTHER)
578 return; 628 return;
579 629
580 latency = now - rq->io_start_time_ns; 630 cpu_latency = get_cpu_ptr(kqd->cpu_latency);
631 target = kqd->latency_targets[sched_domain];
632 add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
633 target, now - rq->start_time_ns);
634 add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
635 now - rq->io_start_time_ns);
636 put_cpu_ptr(kqd->cpu_latency);
581 637
582 if (latency > target) 638 timer_reduce(&kqd->timer, jiffies + HZ / 10);
583 blk_stat_activate_msecs(kqd->cb, 10);
584} 639}
585 640
586struct flush_kcq_data { 641struct flush_kcq_data {
@@ -778,17 +833,17 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
778 return false; 833 return false;
779} 834}
780 835
781#define KYBER_LAT_SHOW_STORE(op) \ 836#define KYBER_LAT_SHOW_STORE(domain, name) \
782static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \ 837static ssize_t kyber_##name##_lat_show(struct elevator_queue *e, \
783 char *page) \ 838 char *page) \
784{ \ 839{ \
785 struct kyber_queue_data *kqd = e->elevator_data; \ 840 struct kyber_queue_data *kqd = e->elevator_data; \
786 \ 841 \
787 return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \ 842 return sprintf(page, "%llu\n", kqd->latency_targets[domain]); \
788} \ 843} \
789 \ 844 \
790static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \ 845static ssize_t kyber_##name##_lat_store(struct elevator_queue *e, \
791 const char *page, size_t count) \ 846 const char *page, size_t count) \
792{ \ 847{ \
793 struct kyber_queue_data *kqd = e->elevator_data; \ 848 struct kyber_queue_data *kqd = e->elevator_data; \
794 unsigned long long nsec; \ 849 unsigned long long nsec; \
@@ -798,12 +853,12 @@ static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
798 if (ret) \ 853 if (ret) \
799 return ret; \ 854 return ret; \
800 \ 855 \
801 kqd->op##_lat_nsec = nsec; \ 856 kqd->latency_targets[domain] = nsec; \
802 \ 857 \
803 return count; \ 858 return count; \
804} 859}
805KYBER_LAT_SHOW_STORE(read); 860KYBER_LAT_SHOW_STORE(KYBER_READ, read);
806KYBER_LAT_SHOW_STORE(write); 861KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
807#undef KYBER_LAT_SHOW_STORE 862#undef KYBER_LAT_SHOW_STORE
808 863
809#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store) 864#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
@@ -870,7 +925,8 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
870 return 0; \ 925 return 0; \
871} 926}
872KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read) 927KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
873KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write) 928KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
929KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
874KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other) 930KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
875#undef KYBER_DEBUGFS_DOMAIN_ATTRS 931#undef KYBER_DEBUGFS_DOMAIN_ATTRS
876 932
@@ -892,8 +948,11 @@ static int kyber_cur_domain_show(void *data, struct seq_file *m)
892 case KYBER_READ: 948 case KYBER_READ:
893 seq_puts(m, "READ\n"); 949 seq_puts(m, "READ\n");
894 break; 950 break;
895 case KYBER_SYNC_WRITE: 951 case KYBER_WRITE:
896 seq_puts(m, "SYNC_WRITE\n"); 952 seq_puts(m, "WRITE\n");
953 break;
954 case KYBER_DISCARD:
955 seq_puts(m, "DISCARD\n");
897 break; 956 break;
898 case KYBER_OTHER: 957 case KYBER_OTHER:
899 seq_puts(m, "OTHER\n"); 958 seq_puts(m, "OTHER\n");
@@ -918,7 +977,8 @@ static int kyber_batching_show(void *data, struct seq_file *m)
918 {#name "_tokens", 0400, kyber_##name##_tokens_show} 977 {#name "_tokens", 0400, kyber_##name##_tokens_show}
919static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = { 978static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
920 KYBER_QUEUE_DOMAIN_ATTRS(read), 979 KYBER_QUEUE_DOMAIN_ATTRS(read),
921 KYBER_QUEUE_DOMAIN_ATTRS(sync_write), 980 KYBER_QUEUE_DOMAIN_ATTRS(write),
981 KYBER_QUEUE_DOMAIN_ATTRS(discard),
922 KYBER_QUEUE_DOMAIN_ATTRS(other), 982 KYBER_QUEUE_DOMAIN_ATTRS(other),
923 {"async_depth", 0400, kyber_async_depth_show}, 983 {"async_depth", 0400, kyber_async_depth_show},
924 {}, 984 {},
@@ -930,7 +990,8 @@ static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
930 {#name "_waiting", 0400, kyber_##name##_waiting_show} 990 {#name "_waiting", 0400, kyber_##name##_waiting_show}
931static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = { 991static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
932 KYBER_HCTX_DOMAIN_ATTRS(read), 992 KYBER_HCTX_DOMAIN_ATTRS(read),
933 KYBER_HCTX_DOMAIN_ATTRS(sync_write), 993 KYBER_HCTX_DOMAIN_ATTRS(write),
994 KYBER_HCTX_DOMAIN_ATTRS(discard),
934 KYBER_HCTX_DOMAIN_ATTRS(other), 995 KYBER_HCTX_DOMAIN_ATTRS(other),
935 {"cur_domain", 0400, kyber_cur_domain_show}, 996 {"cur_domain", 0400, kyber_cur_domain_show},
936 {"batching", 0400, kyber_batching_show}, 997 {"batching", 0400, kyber_batching_show},