Diffstat (limited to 'block/kyber-iosched.c')
-rw-r--r--  block/kyber-iosched.c  497
1 file changed, 279 insertions(+), 218 deletions(-)
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 08eb5295c18d..adc8e6393829 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -29,13 +29,16 @@
 #include "blk-mq-debugfs.h"
 #include "blk-mq-sched.h"
 #include "blk-mq-tag.h"
-#include "blk-stat.h"
 
-/* Scheduling domains. */
+/*
+ * Scheduling domains: the device is divided into multiple domains based on the
+ * request type.
+ */
 enum {
         KYBER_READ,
-        KYBER_SYNC_WRITE,
-        KYBER_OTHER, /* Async writes, discard, etc. */
+        KYBER_WRITE,
+        KYBER_DISCARD,
+        KYBER_OTHER,
         KYBER_NUM_DOMAINS,
 };
 
@@ -49,25 +52,82 @@ enum {
 };
 
 /*
- * Initial device-wide depths for each scheduling domain.
+ * Maximum device-wide depth for each scheduling domain.
  *
- * Even for fast devices with lots of tags like NVMe, you can saturate
- * the device with only a fraction of the maximum possible queue depth.
- * So, we cap these to a reasonable value.
+ * Even for fast devices with lots of tags like NVMe, you can saturate the
+ * device with only a fraction of the maximum possible queue depth. So, we cap
+ * these to a reasonable value.
  */
 static const unsigned int kyber_depth[] = {
         [KYBER_READ] = 256,
-        [KYBER_SYNC_WRITE] = 128,
-        [KYBER_OTHER] = 64,
+        [KYBER_WRITE] = 128,
+        [KYBER_DISCARD] = 64,
+        [KYBER_OTHER] = 16,
 };
 
 /*
- * Scheduling domain batch sizes. We favor reads.
+ * Default latency targets for each scheduling domain.
+ */
+static const u64 kyber_latency_targets[] = {
+        [KYBER_READ] = 2 * NSEC_PER_MSEC,
+        [KYBER_WRITE] = 10 * NSEC_PER_MSEC,
+        [KYBER_DISCARD] = 5 * NSEC_PER_SEC,
+};
+
+/*
+ * Batch size (number of requests we'll dispatch in a row) for each scheduling
+ * domain.
  */
 static const unsigned int kyber_batch_size[] = {
         [KYBER_READ] = 16,
-        [KYBER_SYNC_WRITE] = 8,
-        [KYBER_OTHER] = 8,
+        [KYBER_WRITE] = 8,
+        [KYBER_DISCARD] = 1,
+        [KYBER_OTHER] = 1,
+};
+
+/*
+ * Requests latencies are recorded in a histogram with buckets defined relative
+ * to the target latency:
+ *
+ * <= 1/4 * target latency
+ * <= 1/2 * target latency
+ * <= 3/4 * target latency
+ * <= target latency
+ * <= 1 1/4 * target latency
+ * <= 1 1/2 * target latency
+ * <= 1 3/4 * target latency
+ * > 1 3/4 * target latency
+ */
+enum {
+        /*
+         * The width of the latency histogram buckets is
+         * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
+         */
+        KYBER_LATENCY_SHIFT = 2,
+        /*
+         * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
+         * thus, "good".
+         */
+        KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
+        /* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
+        KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
+};
+
+/*
+ * We measure both the total latency and the I/O latency (i.e., latency after
+ * submitting to the device).
+ */
+enum {
+        KYBER_TOTAL_LATENCY,
+        KYBER_IO_LATENCY,
+};
+
+/*
+ * Per-cpu latency histograms: total latency and I/O latency for each scheduling
+ * domain except for KYBER_OTHER.
+ */
+struct kyber_cpu_latency {
+        atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
 };
 
 /*
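Note on the bucket layout added above: each domain's latency histogram has eight buckets of width target / (1 << KYBER_LATENCY_SHIFT), and the first four ("good") cover latencies at or below the target. A minimal userspace sketch of that mapping, assuming the same constants; latency_to_bucket() and the main() harness are illustrative and not part of the kernel source:

#include <stdint.h>
#include <stdio.h>

#define KYBER_LATENCY_SHIFT   2
#define KYBER_LATENCY_BUCKETS (2 << KYBER_LATENCY_SHIFT)

/* Bucket width is target / 4; bucket 3 means "<= target", bucket 7 catches everything beyond 1 3/4 * target. */
static unsigned int latency_to_bucket(uint64_t target, uint64_t latency)
{
        uint64_t divisor = target >> KYBER_LATENCY_SHIFT;
        uint64_t bucket;

        if (latency == 0)
                return 0;
        if (divisor == 0)
                divisor = 1;
        bucket = (latency - 1) / divisor;
        return bucket < KYBER_LATENCY_BUCKETS - 1 ? bucket : KYBER_LATENCY_BUCKETS - 1;
}

int main(void)
{
        uint64_t target = 2 * 1000 * 1000;      /* 2 ms read target, in nanoseconds */

        /* 1.5 ms lands in bucket 2 (<= 3/4 * target); 5 ms saturates in bucket 7. */
        printf("%u %u\n", latency_to_bucket(target, 1500000),
               latency_to_bucket(target, 5000000));
        return 0;
}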
@@ -84,14 +144,9 @@ struct kyber_ctx_queue {
 } ____cacheline_aligned_in_smp;
 
 struct kyber_queue_data {
-        struct request_queue *q;
-
-        struct blk_stat_callback *cb;
-
         /*
-         * The device is divided into multiple scheduling domains based on the
-         * request type. Each domain has a fixed number of in-flight requests of
-         * that type device-wide, limited by these tokens.
+         * Each scheduling domain has a limited number of in-flight requests
+         * device-wide, limited by these tokens.
          */
         struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
 
@@ -101,8 +156,19 @@ struct kyber_queue_data {
          */
         unsigned int async_depth;
 
+        struct kyber_cpu_latency __percpu *cpu_latency;
+
+        /* Timer for stats aggregation and adjusting domain tokens. */
+        struct timer_list timer;
+
+        unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
+
+        unsigned long latency_timeout[KYBER_OTHER];
+
+        int domain_p99[KYBER_OTHER];
+
         /* Target latencies in nanoseconds. */
-        u64 read_lat_nsec, write_lat_nsec;
+        u64 latency_targets[KYBER_OTHER];
 };
 
 struct kyber_hctx_data {
@@ -122,182 +188,165 @@ static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 
 static unsigned int kyber_sched_domain(unsigned int op)
 {
-        if ((op & REQ_OP_MASK) == REQ_OP_READ)
+        switch (op & REQ_OP_MASK) {
+        case REQ_OP_READ:
                 return KYBER_READ;
-        else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
-                return KYBER_SYNC_WRITE;
-        else
+        case REQ_OP_WRITE:
+                return KYBER_WRITE;
+        case REQ_OP_DISCARD:
+                return KYBER_DISCARD;
+        default:
                 return KYBER_OTHER;
+        }
 }
 
-enum {
-        NONE = 0,
-        GOOD = 1,
-        GREAT = 2,
-        BAD = -1,
-        AWFUL = -2,
-};
-
-#define IS_GOOD(status) ((status) > 0)
-#define IS_BAD(status) ((status) < 0)
-
-static int kyber_lat_status(struct blk_stat_callback *cb,
-                            unsigned int sched_domain, u64 target)
+static void flush_latency_buckets(struct kyber_queue_data *kqd,
+                                  struct kyber_cpu_latency *cpu_latency,
+                                  unsigned int sched_domain, unsigned int type)
 {
-        u64 latency;
-
-        if (!cb->stat[sched_domain].nr_samples)
-                return NONE;
+        unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
+        atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
+        unsigned int bucket;
 
-        latency = cb->stat[sched_domain].mean;
-        if (latency >= 2 * target)
-                return AWFUL;
-        else if (latency > target)
-                return BAD;
-        else if (latency <= target / 2)
-                return GREAT;
-        else /* (latency <= target) */
-                return GOOD;
+        for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
+                buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
 }
 
 /*
- * Adjust the read or synchronous write depth given the status of reads and
- * writes. The goal is that the latencies of the two domains are fair (i.e., if
- * one is good, then the other is good).
+ * Calculate the histogram bucket with the given percentile rank, or -1 if there
+ * aren't enough samples yet.
  */
-static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
-                                  unsigned int sched_domain, int this_status,
-                                  int other_status)
+static int calculate_percentile(struct kyber_queue_data *kqd,
+                                unsigned int sched_domain, unsigned int type,
+                                unsigned int percentile)
 {
-        unsigned int orig_depth, depth;
+        unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
+        unsigned int bucket, samples = 0, percentile_samples;
+
+        for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
+                samples += buckets[bucket];
+
+        if (!samples)
+                return -1;
 
         /*
-         * If this domain had no samples, or reads and writes are both good or
-         * both bad, don't adjust the depth.
+         * We do the calculation once we have 500 samples or one second passes
+         * since the first sample was recorded, whichever comes first.
          */
-        if (this_status == NONE ||
-            (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
-            (IS_BAD(this_status) && IS_BAD(other_status)))
-                return;
-
-        orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
+        if (!kqd->latency_timeout[sched_domain])
+                kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
+        if (samples < 500 &&
+            time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
+                return -1;
+        }
+        kqd->latency_timeout[sched_domain] = 0;
 
-        if (other_status == NONE) {
-                depth++;
-        } else {
-                switch (this_status) {
-                case GOOD:
-                        if (other_status == AWFUL)
-                                depth -= max(depth / 4, 1U);
-                        else
-                                depth -= max(depth / 8, 1U);
-                        break;
-                case GREAT:
-                        if (other_status == AWFUL)
-                                depth /= 2;
-                        else
-                                depth -= max(depth / 4, 1U);
+        percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
+        for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
+                if (buckets[bucket] >= percentile_samples)
                         break;
-                case BAD:
-                        depth++;
-                        break;
-                case AWFUL:
-                        if (other_status == GREAT)
-                                depth += 2;
-                        else
-                                depth++;
-                        break;
-                }
+                percentile_samples -= buckets[bucket];
         }
+        memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));
 
+        return bucket;
+}
+
+static void kyber_resize_domain(struct kyber_queue_data *kqd,
+                                unsigned int sched_domain, unsigned int depth)
+{
         depth = clamp(depth, 1U, kyber_depth[sched_domain]);
-        if (depth != orig_depth)
+        if (depth != kqd->domain_tokens[sched_domain].sb.depth)
                 sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
 }
 
-/*
- * Adjust the depth of other requests given the status of reads and synchronous
- * writes. As long as either domain is doing fine, we don't throttle, but if
- * both domains are doing badly, we throttle heavily.
- */
-static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
-                                     int read_status, int write_status,
-                                     bool have_samples)
-{
-        unsigned int orig_depth, depth;
-        int status;
-
-        orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
-
-        if (read_status == NONE && write_status == NONE) {
-                depth += 2;
-        } else if (have_samples) {
-                if (read_status == NONE)
-                        status = write_status;
-                else if (write_status == NONE)
-                        status = read_status;
-                else
-                        status = max(read_status, write_status);
-                switch (status) {
-                case GREAT:
-                        depth += 2;
-                        break;
-                case GOOD:
-                        depth++;
-                        break;
-                case BAD:
-                        depth -= max(depth / 4, 1U);
-                        break;
-                case AWFUL:
-                        depth /= 2;
-                        break;
+static void kyber_timer_fn(struct timer_list *t)
+{
+        struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
+        unsigned int sched_domain;
+        int cpu;
+        bool bad = false;
+
+        /* Sum all of the per-cpu latency histograms. */
+        for_each_online_cpu(cpu) {
+                struct kyber_cpu_latency *cpu_latency;
+
+                cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
+                for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+                        flush_latency_buckets(kqd, cpu_latency, sched_domain,
+                                              KYBER_TOTAL_LATENCY);
+                        flush_latency_buckets(kqd, cpu_latency, sched_domain,
+                                              KYBER_IO_LATENCY);
                 }
         }
 
-        depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
-        if (depth != orig_depth)
-                sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
-}
-
-/*
- * Apply heuristics for limiting queue depths based on gathered latency
- * statistics.
- */
-static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
-{
-        struct kyber_queue_data *kqd = cb->data;
-        int read_status, write_status;
-
-        read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
-        write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
+        /*
+         * Check if any domains have a high I/O latency, which might indicate
+         * congestion in the device. Note that we use the p90; we don't want to
+         * be too sensitive to outliers here.
+         */
+        for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+                int p90;
 
-        kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
-        kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
-        kyber_adjust_other_depth(kqd, read_status, write_status,
-                                 cb->stat[KYBER_OTHER].nr_samples != 0);
+                p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
+                                           90);
+                if (p90 >= KYBER_GOOD_BUCKETS)
+                        bad = true;
+        }
 
         /*
-         * Continue monitoring latencies if we aren't hitting the targets or
-         * we're still throttling other requests.
+         * Adjust the scheduling domain depths. If we determined that there was
+         * congestion, we throttle all domains with good latencies. Either way,
+         * we ease up on throttling domains with bad latencies.
          */
-        if (!blk_stat_is_active(kqd->cb) &&
-            ((IS_BAD(read_status) || IS_BAD(write_status) ||
-              kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
-                blk_stat_activate_msecs(kqd->cb, 100);
+        for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+                unsigned int orig_depth, depth;
+                int p99;
+
+                p99 = calculate_percentile(kqd, sched_domain,
+                                           KYBER_TOTAL_LATENCY, 99);
+                /*
+                 * This is kind of subtle: different domains will not
+                 * necessarily have enough samples to calculate the latency
+                 * percentiles during the same window, so we have to remember
+                 * the p99 for the next time we observe congestion; once we do,
+                 * we don't want to throttle again until we get more data, so we
+                 * reset it to -1.
+                 */
+                if (bad) {
+                        if (p99 < 0)
+                                p99 = kqd->domain_p99[sched_domain];
+                        kqd->domain_p99[sched_domain] = -1;
+                } else if (p99 >= 0) {
+                        kqd->domain_p99[sched_domain] = p99;
+                }
+                if (p99 < 0)
+                        continue;
+
+                /*
+                 * If this domain has bad latency, throttle less. Otherwise,
+                 * throttle more iff we determined that there is congestion.
+                 *
+                 * The new depth is scaled linearly with the p99 latency vs the
+                 * latency target. E.g., if the p99 is 3/4 of the target, then
+                 * we throttle down to 3/4 of the current depth, and if the p99
+                 * is 2x the target, then we double the depth.
+                 */
+                if (bad || p99 >= KYBER_GOOD_BUCKETS) {
+                        orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
+                        depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
+                        kyber_resize_domain(kqd, sched_domain, depth);
+                }
+        }
 }
 
-static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
+static unsigned int kyber_sched_tags_shift(struct request_queue *q)
 {
         /*
          * All of the hardware queues have the same depth, so we can just grab
          * the shift of the first one.
          */
-        return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
-}
-
-static int kyber_bucket_fn(const struct request *rq)
-{
-        return kyber_sched_domain(rq->cmd_flags);
+        return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
 }
 
 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
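Note on the heuristic in kyber_timer_fn() above: a p90 of any domain's I/O latency landing past the target (in a "bad" bucket) marks the device as congested, and each domain's token depth is then rescaled in proportion to its p99 of total latency, where bucket 3 corresponds to exactly the target. A self-contained sketch of that arithmetic with a made-up histogram; percentile_bucket() is illustrative, and the kernel additionally clamps the result via kyber_resize_domain():

#include <stdio.h>

#define KYBER_LATENCY_SHIFT   2
#define KYBER_GOOD_BUCKETS    (1 << KYBER_LATENCY_SHIFT)
#define KYBER_LATENCY_BUCKETS (2 << KYBER_LATENCY_SHIFT)

/* Walk the histogram until the requested rank falls inside a bucket (cf. calculate_percentile()). */
static int percentile_bucket(const unsigned int buckets[KYBER_LATENCY_BUCKETS],
                             unsigned int percentile)
{
        unsigned int samples = 0, rank, i;

        for (i = 0; i < KYBER_LATENCY_BUCKETS; i++)
                samples += buckets[i];
        if (!samples)
                return -1;

        rank = (samples * percentile + 99) / 100;       /* DIV_ROUND_UP */
        for (i = 0; i < KYBER_LATENCY_BUCKETS - 1; i++) {
                if (buckets[i] >= rank)
                        break;
                rank -= buckets[i];
        }
        return i;
}

int main(void)
{
        /* 600 samples with the p99 in bucket 2, i.e. <= 3/4 of the target latency. */
        unsigned int hist[KYBER_LATENCY_BUCKETS] = { 300, 250, 44, 3, 2, 1, 0, 0 };
        unsigned int depth = 256;
        int bad = 1;    /* some domain's p90 I/O latency exceeded its target */
        int p99 = percentile_bucket(hist, 99);

        /* Depth scales linearly with p99 vs. the target: bucket 2 -> 3/4 of the depth. */
        if (p99 >= 0 && (bad || p99 >= KYBER_GOOD_BUCKETS))
                depth = (depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
        printf("p99 bucket %d, new depth %u\n", p99, depth);    /* p99 bucket 2, new depth 192 */
        return 0;
}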
@@ -307,16 +356,17 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
         int ret = -ENOMEM;
         int i;
 
-        kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
+        kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
         if (!kqd)
                 goto err;
-        kqd->q = q;
 
-        kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
-                                          KYBER_NUM_DOMAINS, kqd);
-        if (!kqd->cb)
+        kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
+                                            GFP_KERNEL | __GFP_ZERO);
+        if (!kqd->cpu_latency)
                 goto err_kqd;
 
+        timer_setup(&kqd->timer, kyber_timer_fn, 0);
+
         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                 WARN_ON(!kyber_depth[i]);
                 WARN_ON(!kyber_batch_size[i]);
@@ -326,20 +376,22 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
                 if (ret) {
                         while (--i >= 0)
                                 sbitmap_queue_free(&kqd->domain_tokens[i]);
-                        goto err_cb;
+                        goto err_buckets;
                 }
         }
 
-        shift = kyber_sched_tags_shift(kqd);
-        kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+        for (i = 0; i < KYBER_OTHER; i++) {
+                kqd->domain_p99[i] = -1;
+                kqd->latency_targets[i] = kyber_latency_targets[i];
+        }
 
-        kqd->read_lat_nsec = 2000000ULL;
-        kqd->write_lat_nsec = 10000000ULL;
+        shift = kyber_sched_tags_shift(q);
+        kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
 
         return kqd;
 
-err_cb:
-        blk_stat_free_callback(kqd->cb);
+err_buckets:
+        free_percpu(kqd->cpu_latency);
 err_kqd:
         kfree(kqd);
 err:
@@ -361,25 +413,24 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
                 return PTR_ERR(kqd);
         }
 
+        blk_stat_enable_accounting(q);
+
         eq->elevator_data = kqd;
         q->elevator = eq;
 
-        blk_stat_add_callback(q, kqd->cb);
-
         return 0;
 }
 
 static void kyber_exit_sched(struct elevator_queue *e)
 {
         struct kyber_queue_data *kqd = e->elevator_data;
-        struct request_queue *q = kqd->q;
         int i;
 
-        blk_stat_remove_callback(q, kqd->cb);
+        del_timer_sync(&kqd->timer);
 
         for (i = 0; i < KYBER_NUM_DOMAINS; i++)
                 sbitmap_queue_free(&kqd->domain_tokens[i]);
-        blk_stat_free_callback(kqd->cb);
+        free_percpu(kqd->cpu_latency);
         kfree(kqd);
 }
 
@@ -547,40 +598,44 @@ static void kyber_finish_request(struct request *rq)
         rq_clear_domain_token(kqd, rq);
 }
 
-static void kyber_completed_request(struct request *rq, u64 now)
+static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
+                               unsigned int sched_domain, unsigned int type,
+                               u64 target, u64 latency)
 {
-        struct request_queue *q = rq->q;
-        struct kyber_queue_data *kqd = q->elevator->elevator_data;
-        unsigned int sched_domain;
-        u64 latency, target;
+        unsigned int bucket;
+        u64 divisor;
 
-        /*
-         * Check if this request met our latency goal. If not, quickly gather
-         * some statistics and start throttling.
-         */
-        sched_domain = kyber_sched_domain(rq->cmd_flags);
-        switch (sched_domain) {
-        case KYBER_READ:
-                target = kqd->read_lat_nsec;
-                break;
-        case KYBER_SYNC_WRITE:
-                target = kqd->write_lat_nsec;
-                break;
-        default:
-                return;
+        if (latency > 0) {
+                divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
+                bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
+                               KYBER_LATENCY_BUCKETS - 1);
+        } else {
+                bucket = 0;
         }
 
-        /* If we are already monitoring latencies, don't check again. */
-        if (blk_stat_is_active(kqd->cb))
-                return;
+        atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
+}
 
-        if (now < rq->io_start_time_ns)
+static void kyber_completed_request(struct request *rq, u64 now)
+{
+        struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
+        struct kyber_cpu_latency *cpu_latency;
+        unsigned int sched_domain;
+        u64 target;
+
+        sched_domain = kyber_sched_domain(rq->cmd_flags);
+        if (sched_domain == KYBER_OTHER)
                 return;
 
-        latency = now - rq->io_start_time_ns;
+        cpu_latency = get_cpu_ptr(kqd->cpu_latency);
+        target = kqd->latency_targets[sched_domain];
+        add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
+                           target, now - rq->start_time_ns);
+        add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
+                           now - rq->io_start_time_ns);
+        put_cpu_ptr(kqd->cpu_latency);
 
-        if (latency > target)
-                blk_stat_activate_msecs(kqd->cb, 10);
+        timer_reduce(&kqd->timer, jiffies + HZ / 10);
 }
 
 struct flush_kcq_data {
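Note on the completion path above: each completed read, write, or discard contributes two samples, a total latency measured from rq->start_time_ns (scheduler plus device time) and an I/O latency measured from rq->io_start_time_ns (device time only). A hedged userspace sketch of the same bookkeeping using plain counters instead of the per-cpu atomic buckets; the struct, helper names, and timestamps are made up for illustration:

#include <stdint.h>
#include <stdio.h>

enum { TOTAL_LATENCY, IO_LATENCY };

struct fake_request {
        uint64_t start_time_ns;         /* when the request entered the scheduler */
        uint64_t io_start_time_ns;      /* when it was issued to the device */
};

/* One 8-bucket histogram per (domain, latency type), mirroring kyber_cpu_latency. */
static unsigned int hist[3][2][8];

static void record(unsigned int domain, unsigned int type,
                   uint64_t target, uint64_t latency)
{
        uint64_t divisor = (target >> 2) ? (target >> 2) : 1;
        uint64_t bucket = latency ? (latency - 1) / divisor : 0;

        hist[domain][type][bucket < 7 ? bucket : 7]++;
}

static void on_complete(unsigned int domain, uint64_t target,
                        const struct fake_request *rq, uint64_t now)
{
        record(domain, TOTAL_LATENCY, target, now - rq->start_time_ns);
        record(domain, IO_LATENCY, target, now - rq->io_start_time_ns);
}

int main(void)
{
        struct fake_request rq = { .start_time_ns = 0, .io_start_time_ns = 400000 };

        /* 2 ms target: total latency 1.8 ms -> bucket 3, I/O latency 1.4 ms -> bucket 2. */
        on_complete(0, 2000000, &rq, 1800000);
        printf("%u %u\n", hist[0][TOTAL_LATENCY][3], hist[0][IO_LATENCY][2]);
        return 0;
}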
@@ -778,17 +833,17 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
         return false;
 }
 
-#define KYBER_LAT_SHOW_STORE(op) \
-static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \
+#define KYBER_LAT_SHOW_STORE(domain, name) \
+static ssize_t kyber_##name##_lat_show(struct elevator_queue *e, \
                                      char *page) \
 { \
         struct kyber_queue_data *kqd = e->elevator_data; \
 \
-        return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \
+        return sprintf(page, "%llu\n", kqd->latency_targets[domain]); \
 } \
 \
-static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
+static ssize_t kyber_##name##_lat_store(struct elevator_queue *e, \
                                       const char *page, size_t count) \
 { \
         struct kyber_queue_data *kqd = e->elevator_data; \
         unsigned long long nsec; \
@@ -798,12 +853,12 @@ static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
         if (ret) \
                 return ret; \
 \
-        kqd->op##_lat_nsec = nsec; \
+        kqd->latency_targets[domain] = nsec; \
 \
         return count; \
 }
-KYBER_LAT_SHOW_STORE(read);
-KYBER_LAT_SHOW_STORE(write);
+KYBER_LAT_SHOW_STORE(KYBER_READ, read);
+KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
 #undef KYBER_LAT_SHOW_STORE
 
 #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
@@ -870,7 +925,8 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
         return 0; \
 }
 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
-KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
+KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
+KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
 #undef KYBER_DEBUGFS_DOMAIN_ATTRS
 
@@ -892,8 +948,11 @@ static int kyber_cur_domain_show(void *data, struct seq_file *m)
         case KYBER_READ:
                 seq_puts(m, "READ\n");
                 break;
-        case KYBER_SYNC_WRITE:
-                seq_puts(m, "SYNC_WRITE\n");
+        case KYBER_WRITE:
+                seq_puts(m, "WRITE\n");
+                break;
+        case KYBER_DISCARD:
+                seq_puts(m, "DISCARD\n");
                 break;
         case KYBER_OTHER:
                 seq_puts(m, "OTHER\n");
@@ -918,7 +977,8 @@ static int kyber_batching_show(void *data, struct seq_file *m)
         {#name "_tokens", 0400, kyber_##name##_tokens_show}
 static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
         KYBER_QUEUE_DOMAIN_ATTRS(read),
-        KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
+        KYBER_QUEUE_DOMAIN_ATTRS(write),
+        KYBER_QUEUE_DOMAIN_ATTRS(discard),
         KYBER_QUEUE_DOMAIN_ATTRS(other),
         {"async_depth", 0400, kyber_async_depth_show},
         {},
@@ -930,7 +990,8 @@ static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
         {#name "_waiting", 0400, kyber_##name##_waiting_show}
 static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
         KYBER_HCTX_DOMAIN_ATTRS(read),
-        KYBER_HCTX_DOMAIN_ATTRS(sync_write),
+        KYBER_HCTX_DOMAIN_ATTRS(write),
+        KYBER_HCTX_DOMAIN_ATTRS(discard),
         KYBER_HCTX_DOMAIN_ATTRS(other),
         {"cur_domain", 0400, kyber_cur_domain_show},
         {"batching", 0400, kyber_batching_show},