Diffstat (limited to 'block/kyber-iosched.c')
 -rw-r--r--  block/kyber-iosched.c  497
 1 file changed, 279 insertions(+), 218 deletions(-)
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 08eb5295c18d..adc8e6393829 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -29,13 +29,16 @@
 #include "blk-mq-debugfs.h"
 #include "blk-mq-sched.h"
 #include "blk-mq-tag.h"
-#include "blk-stat.h"
 
-/* Scheduling domains. */
+/*
+ * Scheduling domains: the device is divided into multiple domains based on the
+ * request type.
+ */
 enum {
 	KYBER_READ,
-	KYBER_SYNC_WRITE,
-	KYBER_OTHER, /* Async writes, discard, etc. */
+	KYBER_WRITE,
+	KYBER_DISCARD,
+	KYBER_OTHER,
 	KYBER_NUM_DOMAINS,
 };
 
@@ -49,25 +52,82 @@ enum {
 };
 
 /*
- * Initial device-wide depths for each scheduling domain.
+ * Maximum device-wide depth for each scheduling domain.
  *
- * Even for fast devices with lots of tags like NVMe, you can saturate
- * the device with only a fraction of the maximum possible queue depth.
- * So, we cap these to a reasonable value.
+ * Even for fast devices with lots of tags like NVMe, you can saturate the
+ * device with only a fraction of the maximum possible queue depth. So, we cap
+ * these to a reasonable value.
  */
 static const unsigned int kyber_depth[] = {
 	[KYBER_READ] = 256,
-	[KYBER_SYNC_WRITE] = 128,
-	[KYBER_OTHER] = 64,
+	[KYBER_WRITE] = 128,
+	[KYBER_DISCARD] = 64,
+	[KYBER_OTHER] = 16,
 };
 
 /*
- * Scheduling domain batch sizes. We favor reads.
+ * Default latency targets for each scheduling domain.
+ */
+static const u64 kyber_latency_targets[] = {
+	[KYBER_READ] = 2 * NSEC_PER_MSEC,
+	[KYBER_WRITE] = 10 * NSEC_PER_MSEC,
+	[KYBER_DISCARD] = 5 * NSEC_PER_SEC,
+};
+
+/*
+ * Batch size (number of requests we'll dispatch in a row) for each scheduling
+ * domain.
  */
 static const unsigned int kyber_batch_size[] = {
 	[KYBER_READ] = 16,
-	[KYBER_SYNC_WRITE] = 8,
-	[KYBER_OTHER] = 8,
+	[KYBER_WRITE] = 8,
+	[KYBER_DISCARD] = 1,
+	[KYBER_OTHER] = 1,
+};
+
+/*
+ * Requests latencies are recorded in a histogram with buckets defined relative
+ * to the target latency:
+ *
+ * <= 1/4 * target latency
+ * <= 1/2 * target latency
+ * <= 3/4 * target latency
+ * <= target latency
+ * <= 1 1/4 * target latency
+ * <= 1 1/2 * target latency
+ * <= 1 3/4 * target latency
+ * > 1 3/4 * target latency
+ */
+enum {
+	/*
+	 * The width of the latency histogram buckets is
+	 * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
+	 */
+	KYBER_LATENCY_SHIFT = 2,
+	/*
+	 * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
+	 * thus, "good".
+	 */
+	KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
+	/* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
+	KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
+};
+
+/*
+ * We measure both the total latency and the I/O latency (i.e., latency after
+ * submitting to the device).
+ */
+enum {
+	KYBER_TOTAL_LATENCY,
+	KYBER_IO_LATENCY,
+};
+
+/*
+ * Per-cpu latency histograms: total latency and I/O latency for each scheduling
+ * domain except for KYBER_OTHER.
+ */
+struct kyber_cpu_latency {
+	atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
 };
 
 /*
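[Illustrative aside, not part of the patch: a minimal user-space sketch of how a completion latency maps onto the buckets described above, using the same shift arithmetic as the new add_latency_sample() later in this diff. The helper name latency_to_bucket() and the stand-alone constants are ours, for illustration only.]

#include <stdio.h>
#include <stdint.h>

#define LATENCY_SHIFT   2			/* mirrors KYBER_LATENCY_SHIFT */
#define LATENCY_BUCKETS (2 << LATENCY_SHIFT)	/* 4 "good" + 4 "bad" buckets */

/* Map a latency sample to a bucket index: the bucket width is target / 4,
 * and everything past 1 3/4 * target collapses into the last bucket.
 * (Hypothetical stand-alone helper, not the kernel's code.)
 */
static unsigned int latency_to_bucket(uint64_t latency_ns, uint64_t target_ns)
{
	uint64_t width = target_ns >> LATENCY_SHIFT;
	uint64_t b;

	if (latency_ns == 0)
		return 0;
	if (width == 0)
		width = 1;
	b = (latency_ns - 1) / width;
	return b < LATENCY_BUCKETS - 1 ? (unsigned int)b : LATENCY_BUCKETS - 1;
}

int main(void)
{
	/* Assume the default 2 ms read target: bucket width is 500 us. */
	uint64_t target = 2 * 1000 * 1000;

	printf("%u\n", latency_to_bucket(400 * 1000, target));     /* 0: <= 1/4 target */
	printf("%u\n", latency_to_bucket(1900 * 1000, target));    /* 3: <= target, "good" */
	printf("%u\n", latency_to_bucket(2100 * 1000, target));    /* 4: first "bad" bucket */
	printf("%u\n", latency_to_bucket(9 * 1000 * 1000, target)); /* 7: > 1 3/4 target */
	return 0;
}

[With the default 2 ms read target the four "good" buckets cover up to 2 ms and everything past 3.5 ms lands in the last "bad" bucket.]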
@@ -84,14 +144,9 @@ struct kyber_ctx_queue {
 } ____cacheline_aligned_in_smp;
 
 struct kyber_queue_data {
-	struct request_queue *q;
-
-	struct blk_stat_callback *cb;
-
 	/*
-	 * The device is divided into multiple scheduling domains based on the
-	 * request type. Each domain has a fixed number of in-flight requests of
-	 * that type device-wide, limited by these tokens.
+	 * Each scheduling domain has a limited number of in-flight requests
+	 * device-wide, limited by these tokens.
 	 */
 	struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
 
@@ -101,8 +156,19 @@ struct kyber_queue_data {
 	 */
 	unsigned int async_depth;
 
+	struct kyber_cpu_latency __percpu *cpu_latency;
+
+	/* Timer for stats aggregation and adjusting domain tokens. */
+	struct timer_list timer;
+
+	unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
+
+	unsigned long latency_timeout[KYBER_OTHER];
+
+	int domain_p99[KYBER_OTHER];
+
 	/* Target latencies in nanoseconds. */
-	u64 read_lat_nsec, write_lat_nsec;
+	u64 latency_targets[KYBER_OTHER];
 };
 
 struct kyber_hctx_data {
@@ -122,182 +188,165 @@ static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 
 static unsigned int kyber_sched_domain(unsigned int op)
 {
-	if ((op & REQ_OP_MASK) == REQ_OP_READ)
+	switch (op & REQ_OP_MASK) {
+	case REQ_OP_READ:
 		return KYBER_READ;
-	else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
-		return KYBER_SYNC_WRITE;
-	else
+	case REQ_OP_WRITE:
+		return KYBER_WRITE;
+	case REQ_OP_DISCARD:
+		return KYBER_DISCARD;
+	default:
 		return KYBER_OTHER;
+	}
 }
 
-enum {
-	NONE = 0,
-	GOOD = 1,
-	GREAT = 2,
-	BAD = -1,
-	AWFUL = -2,
-};
-
-#define IS_GOOD(status) ((status) > 0)
-#define IS_BAD(status) ((status) < 0)
-
-static int kyber_lat_status(struct blk_stat_callback *cb,
-			    unsigned int sched_domain, u64 target)
+static void flush_latency_buckets(struct kyber_queue_data *kqd,
+				  struct kyber_cpu_latency *cpu_latency,
+				  unsigned int sched_domain, unsigned int type)
 {
-	u64 latency;
-
-	if (!cb->stat[sched_domain].nr_samples)
-		return NONE;
+	unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
+	atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
+	unsigned int bucket;
 
-	latency = cb->stat[sched_domain].mean;
-	if (latency >= 2 * target)
-		return AWFUL;
-	else if (latency > target)
-		return BAD;
-	else if (latency <= target / 2)
-		return GREAT;
-	else /* (latency <= target) */
-		return GOOD;
+	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
+		buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
 }
 
 /*
- * Adjust the read or synchronous write depth given the status of reads and
- * writes. The goal is that the latencies of the two domains are fair (i.e., if
- * one is good, then the other is good).
+ * Calculate the histogram bucket with the given percentile rank, or -1 if there
+ * aren't enough samples yet.
  */
-static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
-				  unsigned int sched_domain, int this_status,
-				  int other_status)
+static int calculate_percentile(struct kyber_queue_data *kqd,
+				unsigned int sched_domain, unsigned int type,
+				unsigned int percentile)
 {
-	unsigned int orig_depth, depth;
+	unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
+	unsigned int bucket, samples = 0, percentile_samples;
+
+	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
+		samples += buckets[bucket];
+
+	if (!samples)
+		return -1;
 
 	/*
-	 * If this domain had no samples, or reads and writes are both good or
-	 * both bad, don't adjust the depth.
+	 * We do the calculation once we have 500 samples or one second passes
+	 * since the first sample was recorded, whichever comes first.
 	 */
-	if (this_status == NONE ||
-	    (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
-	    (IS_BAD(this_status) && IS_BAD(other_status)))
-		return;
-
-	orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
+	if (!kqd->latency_timeout[sched_domain])
+		kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
+	if (samples < 500 &&
+	    time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
+		return -1;
+	}
+	kqd->latency_timeout[sched_domain] = 0;
 
-	if (other_status == NONE) {
-		depth++;
-	} else {
-		switch (this_status) {
-		case GOOD:
-			if (other_status == AWFUL)
-				depth -= max(depth / 4, 1U);
-			else
-				depth -= max(depth / 8, 1U);
-			break;
-		case GREAT:
-			if (other_status == AWFUL)
-				depth /= 2;
-			else
-				depth -= max(depth / 4, 1U);
+	percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
+	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
+		if (buckets[bucket] >= percentile_samples)
 			break;
-		case BAD:
-			depth++;
-			break;
-		case AWFUL:
-			if (other_status == GREAT)
-				depth += 2;
-			else
-				depth++;
-			break;
-		}
+		percentile_samples -= buckets[bucket];
 	}
+	memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));
 
+	return bucket;
+}
+
+static void kyber_resize_domain(struct kyber_queue_data *kqd,
+				unsigned int sched_domain, unsigned int depth)
+{
 	depth = clamp(depth, 1U, kyber_depth[sched_domain]);
-	if (depth != orig_depth)
+	if (depth != kqd->domain_tokens[sched_domain].sb.depth)
 		sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
 }
 
-/*
- * Adjust the depth of other requests given the status of reads and synchronous
- * writes. As long as either domain is doing fine, we don't throttle, but if
- * both domains are doing badly, we throttle heavily.
- */
-static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
-				     int read_status, int write_status,
-				     bool have_samples)
-{
-	unsigned int orig_depth, depth;
-	int status;
-
-	orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
-
-	if (read_status == NONE && write_status == NONE) {
-		depth += 2;
-	} else if (have_samples) {
-		if (read_status == NONE)
-			status = write_status;
-		else if (write_status == NONE)
-			status = read_status;
-		else
-			status = max(read_status, write_status);
-		switch (status) {
-		case GREAT:
-			depth += 2;
-			break;
-		case GOOD:
-			depth++;
-			break;
-		case BAD:
-			depth -= max(depth / 4, 1U);
-			break;
-		case AWFUL:
-			depth /= 2;
-			break;
+static void kyber_timer_fn(struct timer_list *t)
+{
+	struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
+	unsigned int sched_domain;
+	int cpu;
+	bool bad = false;
+
+	/* Sum all of the per-cpu latency histograms. */
+	for_each_online_cpu(cpu) {
+		struct kyber_cpu_latency *cpu_latency;
+
+		cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
+		for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+			flush_latency_buckets(kqd, cpu_latency, sched_domain,
+					      KYBER_TOTAL_LATENCY);
+			flush_latency_buckets(kqd, cpu_latency, sched_domain,
+					      KYBER_IO_LATENCY);
 		}
 	}
 
-	depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
-	if (depth != orig_depth)
-		sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
-}
-
-/*
- * Apply heuristics for limiting queue depths based on gathered latency
- * statistics.
- */
-static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
-{
-	struct kyber_queue_data *kqd = cb->data;
-	int read_status, write_status;
-
-	read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
-	write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
+	/*
+	 * Check if any domains have a high I/O latency, which might indicate
+	 * congestion in the device. Note that we use the p90; we don't want to
+	 * be too sensitive to outliers here.
+	 */
+	for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+		int p90;
 
-	kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
-	kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
-	kyber_adjust_other_depth(kqd, read_status, write_status,
-				 cb->stat[KYBER_OTHER].nr_samples != 0);
+		p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
+					   90);
+		if (p90 >= KYBER_GOOD_BUCKETS)
+			bad = true;
+	}
 
 	/*
-	 * Continue monitoring latencies if we aren't hitting the targets or
-	 * we're still throttling other requests.
+	 * Adjust the scheduling domain depths. If we determined that there was
+	 * congestion, we throttle all domains with good latencies. Either way,
+	 * we ease up on throttling domains with bad latencies.
 	 */
-	if (!blk_stat_is_active(kqd->cb) &&
-	    ((IS_BAD(read_status) || IS_BAD(write_status) ||
-	      kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
-		blk_stat_activate_msecs(kqd->cb, 100);
+	for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+		unsigned int orig_depth, depth;
+		int p99;
+
+		p99 = calculate_percentile(kqd, sched_domain,
+					   KYBER_TOTAL_LATENCY, 99);
+		/*
+		 * This is kind of subtle: different domains will not
+		 * necessarily have enough samples to calculate the latency
+		 * percentiles during the same window, so we have to remember
+		 * the p99 for the next time we observe congestion; once we do,
+		 * we don't want to throttle again until we get more data, so we
+		 * reset it to -1.
+		 */
+		if (bad) {
+			if (p99 < 0)
+				p99 = kqd->domain_p99[sched_domain];
+			kqd->domain_p99[sched_domain] = -1;
+		} else if (p99 >= 0) {
+			kqd->domain_p99[sched_domain] = p99;
+		}
+		if (p99 < 0)
+			continue;
+
+		/*
+		 * If this domain has bad latency, throttle less. Otherwise,
+		 * throttle more iff we determined that there is congestion.
+		 *
+		 * The new depth is scaled linearly with the p99 latency vs the
+		 * latency target. E.g., if the p99 is 3/4 of the target, then
+		 * we throttle down to 3/4 of the current depth, and if the p99
+		 * is 2x the target, then we double the depth.
+		 */
+		if (bad || p99 >= KYBER_GOOD_BUCKETS) {
+			orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
+			depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
+			kyber_resize_domain(kqd, sched_domain, depth);
+		}
+	}
 }
 
-static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
+static unsigned int kyber_sched_tags_shift(struct request_queue *q)
 {
 	/*
 	 * All of the hardware queues have the same depth, so we can just grab
 	 * the shift of the first one.
 	 */
-	return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
-}
-
-static int kyber_bucket_fn(const struct request *rq)
-{
-	return kyber_sched_domain(rq->cmd_flags);
+	return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
 }
 
 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
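[Illustrative aside, not part of the patch: a self-contained sketch of the percentile walk and the depth-scaling rule used by kyber_timer_fn() above, with the 500-sample/one-second gating omitted. The histogram contents and the starting depth are made up.]

#include <stdio.h>

#define LATENCY_SHIFT   2
#define LATENCY_BUCKETS (2 << LATENCY_SHIFT)
#define GOOD_BUCKETS    (1 << LATENCY_SHIFT)

/* Walk the histogram until the requested rank is covered, mirroring
 * calculate_percentile() above (hypothetical stand-alone helper). */
static int percentile_bucket(const unsigned int buckets[LATENCY_BUCKETS],
			     unsigned int percentile)
{
	unsigned int samples = 0, rank, b;

	for (b = 0; b < LATENCY_BUCKETS; b++)
		samples += buckets[b];
	if (!samples)
		return -1;

	rank = (samples * percentile + 99) / 100;	/* DIV_ROUND_UP */
	for (b = 0; b < LATENCY_BUCKETS - 1; b++) {
		if (buckets[b] >= rank)
			break;
		rank -= buckets[b];
	}
	return (int)b;
}

int main(void)
{
	/* Hypothetical histogram: most samples near the target, a long tail. */
	unsigned int hist[LATENCY_BUCKETS] = { 50, 200, 300, 250, 100, 60, 30, 10 };
	int p99 = percentile_bucket(hist, 99);
	unsigned int depth = 64;

	/* Same scaling rule as kyber_timer_fn(): depth * (p99 + 1) / 4. */
	unsigned int new_depth = (depth * (unsigned int)(p99 + 1)) >> LATENCY_SHIFT;

	printf("p99 bucket %d -> depth %u -> %u (%s target)\n", p99, depth,
	       new_depth, p99 >= GOOD_BUCKETS ? "missed" : "met");
	return 0;
}

[Here the p99 lands in bucket 6 (between 1 1/2 and 1 3/4 of the target), so the depth is scaled by (6 + 1) / 4 and throttling is relaxed from 64 to 112 tokens; a p99 in bucket 2 would instead, when congestion is detected, shrink the depth to 3/4 of its current value.]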
@@ -307,16 +356,17 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 	int ret = -ENOMEM;
 	int i;
 
-	kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
+	kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
 	if (!kqd)
 		goto err;
-	kqd->q = q;
 
-	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
-					  KYBER_NUM_DOMAINS, kqd);
-	if (!kqd->cb)
+	kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
+					    GFP_KERNEL | __GFP_ZERO);
+	if (!kqd->cpu_latency)
 		goto err_kqd;
 
+	timer_setup(&kqd->timer, kyber_timer_fn, 0);
+
 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 		WARN_ON(!kyber_depth[i]);
 		WARN_ON(!kyber_batch_size[i]);
@@ -326,20 +376,22 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 		if (ret) {
 			while (--i >= 0)
 				sbitmap_queue_free(&kqd->domain_tokens[i]);
-			goto err_cb;
+			goto err_buckets;
 		}
 	}
 
-	shift = kyber_sched_tags_shift(kqd);
-	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+	for (i = 0; i < KYBER_OTHER; i++) {
+		kqd->domain_p99[i] = -1;
+		kqd->latency_targets[i] = kyber_latency_targets[i];
+	}
 
-	kqd->read_lat_nsec = 2000000ULL;
-	kqd->write_lat_nsec = 10000000ULL;
+	shift = kyber_sched_tags_shift(q);
+	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
 
 	return kqd;
 
-err_cb:
-	blk_stat_free_callback(kqd->cb);
+err_buckets:
+	free_percpu(kqd->cpu_latency);
 err_kqd:
 	kfree(kqd);
 err:
@@ -361,25 +413,24 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
 		return PTR_ERR(kqd);
 	}
 
+	blk_stat_enable_accounting(q);
+
 	eq->elevator_data = kqd;
 	q->elevator = eq;
 
-	blk_stat_add_callback(q, kqd->cb);
-
 	return 0;
 }
 
 static void kyber_exit_sched(struct elevator_queue *e)
 {
 	struct kyber_queue_data *kqd = e->elevator_data;
-	struct request_queue *q = kqd->q;
 	int i;
 
-	blk_stat_remove_callback(q, kqd->cb);
+	del_timer_sync(&kqd->timer);
 
 	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
 		sbitmap_queue_free(&kqd->domain_tokens[i]);
-	blk_stat_free_callback(kqd->cb);
+	free_percpu(kqd->cpu_latency);
 	kfree(kqd);
 }
 
@@ -547,40 +598,44 @@ static void kyber_finish_request(struct request *rq)
 	rq_clear_domain_token(kqd, rq);
 }
 
-static void kyber_completed_request(struct request *rq, u64 now)
+static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
+			       unsigned int sched_domain, unsigned int type,
+			       u64 target, u64 latency)
 {
-	struct request_queue *q = rq->q;
-	struct kyber_queue_data *kqd = q->elevator->elevator_data;
-	unsigned int sched_domain;
-	u64 latency, target;
+	unsigned int bucket;
+	u64 divisor;
 
-	/*
-	 * Check if this request met our latency goal. If not, quickly gather
-	 * some statistics and start throttling.
-	 */
-	sched_domain = kyber_sched_domain(rq->cmd_flags);
-	switch (sched_domain) {
-	case KYBER_READ:
-		target = kqd->read_lat_nsec;
-		break;
-	case KYBER_SYNC_WRITE:
-		target = kqd->write_lat_nsec;
-		break;
-	default:
-		return;
+	if (latency > 0) {
+		divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
+		bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
+			       KYBER_LATENCY_BUCKETS - 1);
+	} else {
+		bucket = 0;
 	}
 
-	/* If we are already monitoring latencies, don't check again. */
-	if (blk_stat_is_active(kqd->cb))
-		return;
+	atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
+}
 
-	if (now < rq->io_start_time_ns)
+static void kyber_completed_request(struct request *rq, u64 now)
+{
+	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
+	struct kyber_cpu_latency *cpu_latency;
+	unsigned int sched_domain;
+	u64 target;
+
+	sched_domain = kyber_sched_domain(rq->cmd_flags);
+	if (sched_domain == KYBER_OTHER)
 		return;
 
-	latency = now - rq->io_start_time_ns;
+	cpu_latency = get_cpu_ptr(kqd->cpu_latency);
+	target = kqd->latency_targets[sched_domain];
+	add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
+			   target, now - rq->start_time_ns);
+	add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
+			   now - rq->io_start_time_ns);
+	put_cpu_ptr(kqd->cpu_latency);
 
-	if (latency > target)
-		blk_stat_activate_msecs(kqd->cb, 10);
+	timer_reduce(&kqd->timer, jiffies + HZ / 10);
 }
 
 struct flush_kcq_data {
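[Illustrative aside, not part of the patch: the completion hook above records two samples per request, and the only difference between them is which timestamp is subtracted from the completion time. A toy example with made-up values; the struct and numbers are ours, only the field names mirror struct request.]

#include <stdint.h>
#include <stdio.h>

/* Toy request with the two timestamps kyber_completed_request() reads. */
struct toy_request {
	uint64_t start_time_ns;	   /* when the request entered the scheduler */
	uint64_t io_start_time_ns; /* when it was issued to the device */
};

int main(void)
{
	struct toy_request rq = {
		.start_time_ns    = 1000000,   /* t = 1.0 ms */
		.io_start_time_ns = 1600000,   /* t = 1.6 ms */
	};
	uint64_t now = 3000000;                /* completion at t = 3.0 ms */

	/* KYBER_TOTAL_LATENCY: scheduler queueing plus device time. */
	uint64_t total = now - rq.start_time_ns;	/* 2.0 ms */
	/* KYBER_IO_LATENCY: device time only, used for congestion detection. */
	uint64_t io = now - rq.io_start_time_ns;	/* 1.4 ms */

	printf("total %llu ns, io %llu ns\n",
	       (unsigned long long)total, (unsigned long long)io);
	return 0;
}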
@@ -778,17 +833,17 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
 	return false;
 }
 
-#define KYBER_LAT_SHOW_STORE(op)					\
-static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,		\
-				     char *page)			\
+#define KYBER_LAT_SHOW_STORE(domain, name)				\
+static ssize_t kyber_##name##_lat_show(struct elevator_queue *e,	\
+				       char *page)			\
 {									\
 	struct kyber_queue_data *kqd = e->elevator_data;		\
 									\
-	return sprintf(page, "%llu\n", kqd->op##_lat_nsec);		\
+	return sprintf(page, "%llu\n", kqd->latency_targets[domain]);	\
 }									\
 									\
-static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,	\
-				      const char *page, size_t count)	\
+static ssize_t kyber_##name##_lat_store(struct elevator_queue *e,	\
+					const char *page, size_t count)	\
 {									\
 	struct kyber_queue_data *kqd = e->elevator_data;		\
 	unsigned long long nsec;					\
@@ -798,12 +853,12 @@ static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
 	if (ret)							\
 		return ret;						\
 									\
-	kqd->op##_lat_nsec = nsec;					\
+	kqd->latency_targets[domain] = nsec;				\
 									\
 	return count;							\
 }
-KYBER_LAT_SHOW_STORE(read);
-KYBER_LAT_SHOW_STORE(write);
+KYBER_LAT_SHOW_STORE(KYBER_READ, read);
+KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
 #undef KYBER_LAT_SHOW_STORE
 
 #define KYBER_LAT_ATTR(op)	__ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
@@ -870,7 +925,8 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
 	return 0;							\
 }
 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
-KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
+KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
+KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
 #undef KYBER_DEBUGFS_DOMAIN_ATTRS
 
@@ -892,8 +948,11 @@ static int kyber_cur_domain_show(void *data, struct seq_file *m)
 	case KYBER_READ:
 		seq_puts(m, "READ\n");
 		break;
-	case KYBER_SYNC_WRITE:
-		seq_puts(m, "SYNC_WRITE\n");
+	case KYBER_WRITE:
+		seq_puts(m, "WRITE\n");
+		break;
+	case KYBER_DISCARD:
+		seq_puts(m, "DISCARD\n");
 		break;
 	case KYBER_OTHER:
 		seq_puts(m, "OTHER\n");
@@ -918,7 +977,8 @@ static int kyber_batching_show(void *data, struct seq_file *m)
 	{#name "_tokens", 0400, kyber_##name##_tokens_show}
 static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
 	KYBER_QUEUE_DOMAIN_ATTRS(read),
-	KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
+	KYBER_QUEUE_DOMAIN_ATTRS(write),
+	KYBER_QUEUE_DOMAIN_ATTRS(discard),
 	KYBER_QUEUE_DOMAIN_ATTRS(other),
 	{"async_depth", 0400, kyber_async_depth_show},
 	{},
@@ -930,7 +990,8 @@ static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
 	{#name "_waiting", 0400, kyber_##name##_waiting_show}
 static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
 	KYBER_HCTX_DOMAIN_ATTRS(read),
-	KYBER_HCTX_DOMAIN_ATTRS(sync_write),
+	KYBER_HCTX_DOMAIN_ATTRS(write),
+	KYBER_HCTX_DOMAIN_ATTRS(discard),
 	KYBER_HCTX_DOMAIN_ATTRS(other),
 	{"cur_domain", 0400, kyber_cur_domain_show},
 	{"batching", 0400, kyber_batching_show},