Diffstat (limited to 'block/bfq-iosched.c')
-rw-r--r--  block/bfq-iosched.c | 497
1 file changed, 372 insertions, 125 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 1edac72ab51d..61d880b90882 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -407,19 +407,37 @@ struct bfq_data {
         /* on-disk position of the last served request */
         sector_t last_position;
 
+        /* time of last request completion (ns) */
+        u64 last_completion;
+
+        /* time of first rq dispatch in current observation interval (ns) */
+        u64 first_dispatch;
+        /* time of last rq dispatch in current observation interval (ns) */
+        u64 last_dispatch;
+
         /* beginning of the last budget */
         ktime_t last_budget_start;
         /* beginning of the last idle slice */
         ktime_t last_idling_start;
-        /* number of samples used to calculate @peak_rate */
+
+        /* number of samples in current observation interval */
         int peak_rate_samples;
+        /* num of samples of seq dispatches in current observation interval */
+        u32 sequential_samples;
+        /* total num of sectors transferred in current observation interval */
+        u64 tot_sectors_dispatched;
+        /* max rq size seen during current observation interval (sectors) */
+        u32 last_rq_max_size;
+        /* time elapsed from first dispatch in current observ. interval (us) */
+        u64 delta_from_first;
         /*
-         * Peak read/write rate, observed during the service of a
-         * budget [BFQ_RATE_SHIFT * sectors/usec]. The value is
-         * left-shifted by BFQ_RATE_SHIFT to increase precision in
+         * Current estimate of the device peak rate, measured in
+         * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
+         * BFQ_RATE_SHIFT is performed to increase precision in
          * fixed-point calculations.
          */
-        u64 peak_rate;
+        u32 peak_rate;
+
         /* maximum budget allotted to a bfq_queue before rescheduling */
         int bfq_max_budget;
 
@@ -740,7 +758,7 @@ static const int bfq_timeout = HZ / 8;
 
 static struct kmem_cache *bfq_pool;
 
-/* Below this threshold (in ms), we consider thinktime immediate. */
+/* Below this threshold (in ns), we consider thinktime immediate. */
 #define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
 
 /* hw_tag detection: parallel requests threshold and min samples needed. */
@@ -752,8 +770,12 @@ static struct kmem_cache *bfq_pool;
 #define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
 #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
 
-/* Min samples used for peak rate estimation (for autotuning). */
-#define BFQ_PEAK_RATE_SAMPLES 32
+/* Min number of samples required to perform peak-rate update */
+#define BFQ_RATE_MIN_SAMPLES 32
+/* Min observation time interval required to perform a peak-rate update (ns) */
+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
+/* Target observation time interval for a peak-rate update (ns) */
+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
 
 /* Shift used for peak rate fixed precision calculations. */
 #define BFQ_RATE_SHIFT 16
@@ -3837,15 +3859,20 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
         return NULL;
 }
 
+static sector_t get_sdist(sector_t last_pos, struct request *rq)
+{
+        if (last_pos)
+                return abs(blk_rq_pos(rq) - last_pos);
+
+        return 0;
+}
+
 #if 0 /* Still not clear if we can do without next two functions */
 static void bfq_activate_request(struct request_queue *q, struct request *rq)
 {
         struct bfq_data *bfqd = q->elevator->elevator_data;
 
         bfqd->rq_in_driver++;
-        bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
-        bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
-                (unsigned long long)bfqd->last_position);
 }
 
 static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
@@ -4124,6 +4151,227 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd)
 }
 
 /*
+ * In autotuning mode, max_budget is dynamically recomputed as the
+ * amount of sectors transferred in timeout at the estimated peak
+ * rate. This enables BFQ to utilize a full timeslice with a full
+ * budget, even if the in-service queue is served at peak rate. And
+ * this maximises throughput with sequential workloads.
+ */
+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
+{
+        return (u64)bfqd->peak_rate * USEC_PER_MSEC *
+               jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
+}
+
+static void bfq_reset_rate_computation(struct bfq_data *bfqd,
+                                       struct request *rq)
+{
+        if (rq != NULL) { /* new rq dispatch now, reset accordingly */
+                bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
+                bfqd->peak_rate_samples = 1;
+                bfqd->sequential_samples = 0;
+                bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
+                        blk_rq_sectors(rq);
+        } else /* no new rq dispatched, just reset the number of samples */
+                bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
+
+        bfq_log(bfqd,
+                "reset_rate_computation at end, sample %u/%u tot_sects %llu",
+                bfqd->peak_rate_samples, bfqd->sequential_samples,
+                bfqd->tot_sectors_dispatched);
+}
+
+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
+{
+        u32 rate, weight, divisor;
+
+        /*
+         * For the convergence property to hold (see comments on
+         * bfq_update_peak_rate()) and for the assessment to be
+         * reliable, a minimum number of samples must be present, and
+         * a minimum amount of time must have elapsed. If not so, do
+         * not compute new rate. Just reset parameters, to get ready
+         * for a new evaluation attempt.
+         */
+        if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
+            bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
+                goto reset_computation;
+
+        /*
+         * If a new request completion has occurred after last
+         * dispatch, then, to approximate the rate at which requests
+         * have been served by the device, it is more precise to
+         * extend the observation interval to the last completion.
+         */
+        bfqd->delta_from_first =
+                max_t(u64, bfqd->delta_from_first,
+                      bfqd->last_completion - bfqd->first_dispatch);
+
+        /*
+         * Rate computed in sects/usec, and not sects/nsec, for
+         * precision issues.
+         */
+        rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
+                        div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
+
+        /*
+         * Peak rate not updated if:
+         * - the percentage of sequential dispatches is below 3/4 of the
+         *   total, and rate is below the current estimated peak rate
+         * - rate is unreasonably high (> 20M sectors/sec)
+         */
+        if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
+             rate <= bfqd->peak_rate) ||
+                rate > 20<<BFQ_RATE_SHIFT)
+                goto reset_computation;
+
+        /*
+         * We have to update the peak rate, at last! To this purpose,
+         * we use a low-pass filter. We compute the smoothing constant
+         * of the filter as a function of the 'weight' of the new
+         * measured rate.
+         *
+         * As can be seen in next formulas, we define this weight as a
+         * quantity proportional to how sequential the workload is,
+         * and to how long the observation time interval is.
+         *
+         * The weight runs from 0 to 8. The maximum value of the
+         * weight, 8, yields the minimum value for the smoothing
+         * constant. At this minimum value for the smoothing constant,
+         * the measured rate contributes for half of the next value of
+         * the estimated peak rate.
+         *
+         * So, the first step is to compute the weight as a function
+         * of how sequential the workload is. Note that the weight
+         * cannot reach 9, because bfqd->sequential_samples cannot
+         * become equal to bfqd->peak_rate_samples, which, in its
+         * turn, holds true because bfqd->sequential_samples is not
+         * incremented for the first sample.
+         */
+        weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
+
+        /*
+         * Second step: further refine the weight as a function of the
+         * duration of the observation interval.
+         */
+        weight = min_t(u32, 8,
+                       div_u64(weight * bfqd->delta_from_first,
+                               BFQ_RATE_REF_INTERVAL));
+
+        /*
+         * Divisor ranging from 10, for minimum weight, to 2, for
+         * maximum weight.
+         */
+        divisor = 10 - weight;
+
+        /*
+         * Finally, update peak rate:
+         *
+         * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
+         */
+        bfqd->peak_rate *= divisor-1;
+        bfqd->peak_rate /= divisor;
+        rate /= divisor; /* smoothing constant alpha = 1/divisor */
+
+        bfqd->peak_rate += rate;
+        if (bfqd->bfq_user_max_budget == 0)
+                bfqd->bfq_max_budget =
+                        bfq_calc_max_budget(bfqd);
+
+reset_computation:
+        bfq_reset_rate_computation(bfqd, rq);
+}
+
+/*
+ * Update the read/write peak rate (the main quantity used for
+ * auto-tuning, see update_thr_responsiveness_params()).
+ *
+ * It is not trivial to estimate the peak rate (correctly): because of
+ * the presence of sw and hw queues between the scheduler and the
+ * device components that finally serve I/O requests, it is hard to
+ * say exactly when a given dispatched request is served inside the
+ * device, and for how long. As a consequence, it is hard to know
+ * precisely at what rate a given set of requests is actually served
+ * by the device.
+ *
+ * On the opposite end, the dispatch time of any request is trivially
+ * available, and, from this piece of information, the "dispatch rate"
+ * of requests can be immediately computed. So, the idea in the next
+ * function is to use what is known, namely request dispatch times
+ * (plus, when useful, request completion times), to estimate what is
+ * unknown, namely in-device request service rate.
+ *
+ * The main issue is that, because of the above facts, the rate at
+ * which a certain set of requests is dispatched over a certain time
+ * interval can vary greatly with respect to the rate at which the
+ * same requests are then served. But, since the size of any
+ * intermediate queue is limited, and the service scheme is lossless
+ * (no request is silently dropped), the following obvious convergence
+ * property holds: the number of requests dispatched MUST become
+ * closer and closer to the number of requests completed as the
+ * observation interval grows. This is the key property used in
+ * the next function to estimate the peak service rate as a function
+ * of the observed dispatch rate. The function assumes to be invoked
+ * on every request dispatch.
+ */
+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
+{
+        u64 now_ns = ktime_get_ns();
+
+        if (bfqd->peak_rate_samples == 0) { /* first dispatch */
+                bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
+                        bfqd->peak_rate_samples);
+                bfq_reset_rate_computation(bfqd, rq);
+                goto update_last_values; /* will add one sample */
+        }
+
+        /*
+         * Device idle for very long: the observation interval lasting
+         * up to this dispatch cannot be a valid observation interval
+         * for computing a new peak rate (similarly to the late-
+         * completion event in bfq_completed_request()). Go to
+         * update_rate_and_reset to have the following three steps
+         * taken:
+         * - close the observation interval at the last (previous)
+         *   request dispatch or completion
+         * - compute rate, if possible, for that observation interval
+         * - start a new observation interval with this dispatch
+         */
+        if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
+            bfqd->rq_in_driver == 0)
+                goto update_rate_and_reset;
+
+        /* Update sampling information */
+        bfqd->peak_rate_samples++;
+
+        if ((bfqd->rq_in_driver > 0 ||
+                now_ns - bfqd->last_completion < BFQ_MIN_TT)
+             && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
+                bfqd->sequential_samples++;
+
+        bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
+
+        /* Reset max observed rq size every 32 dispatches */
+        if (likely(bfqd->peak_rate_samples % 32))
+                bfqd->last_rq_max_size =
+                        max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
+        else
+                bfqd->last_rq_max_size = blk_rq_sectors(rq);
+
+        bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
+
+        /* Target observation interval not yet reached, go on sampling */
+        if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
+                goto update_last_values;
+
+update_rate_and_reset:
+        bfq_update_rate_reset(bfqd, rq);
+update_last_values:
+        bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
+        bfqd->last_dispatch = now_ns;
+}
+
+/*
  * Remove request from internal lists.
  */
 static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
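For illustration only (not part of the patch): a minimal standalone sketch, with made-up numbers, of the fixed-point arithmetic used by the new bfq_calc_max_budget() and by the peak-rate low-pass filter above. Only BFQ_RATE_SHIFT is taken from the patch; everything else is an assumed example.

/* Illustrative sketch, not part of the patch: peak_rate stores
 * sectors/usec left-shifted by BFQ_RATE_SHIFT, as in the code above.
 * All numeric values are made-up examples.
 */
#include <stdio.h>
#include <stdint.h>

#define BFQ_RATE_SHIFT 16

int main(void)
{
        /* Assume ~1 sector/usec (~512 MB/s), stored in fixed point. */
        uint64_t peak_rate = 1ULL << BFQ_RATE_SHIFT;
        unsigned int timeout_ms = 125;  /* bfq_timeout = HZ/8, i.e. 125 ms */

        /* max_budget = sectors transferable in one timeout at peak rate */
        uint64_t max_budget =
                (peak_rate * 1000 * timeout_ms) >> BFQ_RATE_SHIFT;
        printf("max_budget = %llu sectors\n",
               (unsigned long long)max_budget);  /* 125000 sectors, ~64 MB */

        /* One low-pass filter step for a mostly sequential workload
         * observed over a full 1 s interval: weight = 9*seq/total,
         * refined by the interval ratio and capped at 8; divisor = 10 - weight.
         */
        unsigned int seq_samples = 56, total_samples = 64;
        unsigned int weight = 9 * seq_samples / total_samples;  /* 7 */
        unsigned int divisor = 10 - weight;                     /* 3 */
        uint64_t measured_rate = 2ULL << BFQ_RATE_SHIFT;  /* 2 sectors/usec */

        peak_rate = peak_rate * (divisor - 1) / divisor
                  + measured_rate / divisor;
        printf("new peak_rate = %.2f sectors/usec\n",
               (double)peak_rate / (1 << BFQ_RATE_SHIFT));  /* ~1.33 */
        return 0;
}

With these example values, a full timeout at peak rate corresponds to 125000 sectors (about 64 MB), and the mostly sequential one-second observation lets the new measurement contribute one third of the updated estimate.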
@@ -4143,6 +4391,7 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
          * happens to be taken into account.
          */
         bfqq->dispatched++;
+        bfq_update_peak_rate(q->elevator->elevator_data, rq);
 
         bfq_remove_request(q, rq);
 }
@@ -4323,110 +4572,92 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
                         bfqq->entity.budget);
 }
 
-static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
-{
-        unsigned long max_budget;
-
-        /*
-         * The max_budget calculated when autotuning is equal to the
-         * amount of sectors transferred in timeout at the estimated
-         * peak rate. To get this value, peak_rate is, first,
-         * multiplied by 1000, because timeout is measured in ms,
-         * while peak_rate is measured in sectors/usecs. Then the
-         * result of this multiplication is right-shifted by
-         * BFQ_RATE_SHIFT, because peak_rate is equal to the value of
-         * the peak rate left-shifted by BFQ_RATE_SHIFT.
-         */
-        max_budget = (unsigned long)(peak_rate * 1000 *
-                                     timeout >> BFQ_RATE_SHIFT);
-
-        return max_budget;
-}
-
 /*
- * In addition to updating the peak rate, checks whether the process
- * is "slow", and returns 1 if so. This slow flag is used, in addition
- * to the budget timeout, to reduce the amount of service provided to
- * seeky processes, and hence reduce their chances to lower the
- * throughput. See the code for more details.
+ * Return true if the process associated with bfqq is "slow". The slow
+ * flag is used, in addition to the budget timeout, to reduce the
+ * amount of service provided to seeky processes, and thus reduce
+ * their chances to lower the throughput. More details in the comments
+ * on the function bfq_bfqq_expire().
+ *
+ * An important observation is in order: as discussed in the comments
+ * on the function bfq_update_peak_rate(), with devices with internal
+ * queues, it is hard if ever possible to know when and for how long
+ * an I/O request is processed by the device (apart from the trivial
+ * I/O pattern where a new request is dispatched only after the
+ * previous one has been completed). This makes it hard to evaluate
+ * the real rate at which the I/O requests of each bfq_queue are
+ * served. In fact, for an I/O scheduler like BFQ, serving a
+ * bfq_queue means just dispatching its requests during its service
+ * slot (i.e., until the budget of the queue is exhausted, or the
+ * queue remains idle, or, finally, a timeout fires). But, during the
+ * service slot of a bfq_queue, around 100 ms at most, the device may
+ * be even still processing requests of bfq_queues served in previous
+ * service slots. On the opposite end, the requests of the in-service
+ * bfq_queue may be completed after the service slot of the queue
+ * finishes.
+ *
+ * Anyway, unless more sophisticated solutions are used
+ * (where possible), the sum of the sizes of the requests dispatched
+ * during the service slot of a bfq_queue is probably the only
+ * approximation available for the service received by the bfq_queue
+ * during its service slot. And this sum is the quantity used in this
+ * function to evaluate the I/O speed of a process.
  */
-static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-                                 bool compensate)
+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                             bool compensate, enum bfqq_expiration reason,
+                             unsigned long *delta_ms)
 {
-        u64 bw, usecs, expected, timeout;
-        ktime_t delta;
-        int update = 0;
+        ktime_t delta_ktime;
+        u32 delta_usecs;
+        bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */
 
-        if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
+        if (!bfq_bfqq_sync(bfqq))
                 return false;
 
         if (compensate)
-                delta = bfqd->last_idling_start;
+                delta_ktime = bfqd->last_idling_start;
         else
-                delta = ktime_get();
-        delta = ktime_sub(delta, bfqd->last_budget_start);
-        usecs = ktime_to_us(delta);
+                delta_ktime = ktime_get();
+        delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
+        delta_usecs = ktime_to_us(delta_ktime);
 
         /* don't use too short time intervals */
-        if (usecs < 1000)
-                return false;
-
-        /*
-         * Calculate the bandwidth for the last slice. We use a 64 bit
-         * value to store the peak rate, in sectors per usec in fixed
-         * point math. We do so to have enough precision in the estimate
-         * and to avoid overflows.
-         */
-        bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
-        do_div(bw, (unsigned long)usecs);
+        if (delta_usecs < 1000) {
+                if (blk_queue_nonrot(bfqd->queue))
+                        /*
+                         * give same worst-case guarantees as idling
+                         * for seeky
+                         */
+                        *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
+                else /* charge at least one seek */
+                        *delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
+
+                return slow;
+        }
 
-        timeout = jiffies_to_msecs(bfqd->bfq_timeout);
+        *delta_ms = delta_usecs / USEC_PER_MSEC;
 
         /*
-         * Use only long (> 20ms) intervals to filter out spikes for
-         * the peak rate estimation.
+         * Use only long (> 20ms) intervals to filter out excessive
+         * spikes in service rate estimation.
          */
-        if (usecs > 20000) {
-                if (bw > bfqd->peak_rate) {
-                        bfqd->peak_rate = bw;
-                        update = 1;
-                        bfq_log(bfqd, "new peak_rate=%llu", bw);
-                }
-
-                update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
-
-                if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
-                        bfqd->peak_rate_samples++;
-
-                if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
-                    update && bfqd->bfq_user_max_budget == 0) {
-                        bfqd->bfq_max_budget =
-                                bfq_calc_max_budget(bfqd->peak_rate,
-                                                    timeout);
-                        bfq_log(bfqd, "new max_budget=%d",
-                                bfqd->bfq_max_budget);
-                }
+        if (delta_usecs > 20000) {
+                /*
+                 * Caveat for rotational devices: processes doing I/O
+                 * in the slower disk zones tend to be slow(er) even
+                 * if not seeky. In this respect, the estimated peak
+                 * rate is likely to be an average over the disk
+                 * surface. Accordingly, to not be too harsh with
+                 * unlucky processes, a process is deemed slow only if
+                 * its rate has been lower than half of the estimated
+                 * peak rate.
+                 */
+                slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
         }
 
-        /*
-         * A process is considered ``slow'' (i.e., seeky, so that we
-         * cannot treat it fairly in the service domain, as it would
-         * slow down too much the other processes) if, when a slice
-         * ends for whatever reason, it has received service at a
-         * rate that would not be high enough to complete the budget
-         * before the budget timeout expiration.
-         */
-        expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
+        bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
 
-        /*
-         * Caveat: processes doing IO in the slower disk zones will
-         * tend to be slow(er) even if not seeky. And the estimated
-         * peak rate will actually be an average over the disk
-         * surface. Hence, to not be too harsh with unlucky processes,
-         * we keep a budget/3 margin of safety before declaring a
-         * process slow.
-         */
-        return expected > (4 * bfqq->entity.budget) / 3;
+        return slow;
 }
 
 /*
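For illustration only (not part of the patch): a minimal sketch, with assumed numbers, of the new slowness test above. For service slots longer than 20 ms, a queue is deemed slow when the service it received (sum of dispatched request sizes, in sectors) stayed below half of the autotuned max budget; the 125000-sector budget is reused from the earlier illustration.

/* Illustrative sketch, not part of the patch; all values are made up.
 * In the real code, intervals shorter than 1 ms fall back to the
 * queue's seekiness instead of this rate-based test.
 */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        unsigned int delta_usecs = 80000;  /* 80 ms service slot */
        int service = 40000;               /* sectors dispatched in the slot */
        int bfq_max_budget = 125000;       /* from the autotuning example */

        bool slow = false;
        if (delta_usecs > 20000)
                slow = service < bfq_max_budget / 2;  /* 40000 < 62500 -> slow */

        printf("slow=%d\n", slow);
        return 0;
}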
@@ -4474,13 +4705,14 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
                             enum bfqq_expiration reason)
 {
         bool slow;
+        unsigned long delta = 0;
+        struct bfq_entity *entity = &bfqq->entity;
         int ref;
 
         /*
-         * Update device peak rate for autotuning and check whether the
-         * process is slow (see bfq_update_peak_rate).
+         * Check whether the process is slow (see bfq_bfqq_is_slow).
          */
-        slow = bfq_update_peak_rate(bfqd, bfqq, compensate);
+        slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
 
         /*
          * As above explained, 'punish' slow (i.e., seeky), timed-out
@@ -4490,7 +4722,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
                 bfq_bfqq_charge_full_budget(bfqq);
 
         if (reason == BFQQE_TOO_IDLE &&
-            bfqq->entity.service <= 2 * bfqq->entity.budget / 10)
+            entity->service <= 2 * entity->budget / 10)
                 bfq_clear_bfqq_IO_bound(bfqq);
 
         bfq_log_bfqq(bfqd, bfqq,
@@ -5130,17 +5362,9 @@ static void
 bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                        struct request *rq)
 {
-        sector_t sdist = 0;
-
-        if (bfqq->last_request_pos) {
-                if (bfqq->last_request_pos < blk_rq_pos(rq))
-                        sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
-                else
-                        sdist = bfqq->last_request_pos - blk_rq_pos(rq);
-        }
-
         bfqq->seek_history <<= 1;
-        bfqq->seek_history |= sdist > BFQQ_SEEK_THR &&
+        bfqq->seek_history |=
+                get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
                 (!blk_queue_nonrot(bfqd->queue) ||
                  blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
 }
@@ -5336,12 +5560,45 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
 
 static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 {
+        u64 now_ns;
+        u32 delta_us;
+
         bfq_update_hw_tag(bfqd);
 
         bfqd->rq_in_driver--;
         bfqq->dispatched--;
 
-        bfqq->ttime.last_end_request = ktime_get_ns();
+        now_ns = ktime_get_ns();
+
+        bfqq->ttime.last_end_request = now_ns;
+
+        /*
+         * Using us instead of ns, to get a reasonable precision in
+         * computing rate in next check.
+         */
+        delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
+
+        /*
+         * If the request took rather long to complete, and, according
+         * to the maximum request size recorded, this completion latency
+         * implies that the request was certainly served at a very low
+         * rate (less than 1M sectors/sec), then the whole observation
+         * interval that lasts up to this time instant cannot be a
+         * valid time interval for computing a new peak rate. Invoke
+         * bfq_update_rate_reset to have the following three steps
+         * taken:
+         * - close the observation interval at the last (previous)
+         *   request dispatch or completion
+         * - compute rate, if possible, for that observation interval
+         * - reset to zero samples, which will trigger a proper
+         *   re-initialization of the observation interval on next
+         *   dispatch
+         */
+        if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
+           (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
+                        1UL<<(BFQ_RATE_SHIFT - 10))
+                bfq_update_rate_reset(bfqd, NULL);
+        bfqd->last_completion = now_ns;
 
         /*
          * If this is the in-service queue, check if it needs to be expired,
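For illustration only (not part of the patch): a minimal sketch of the late-completion threshold used above. With BFQ_RATE_SHIFT = 16, the comparison reduces to size/delta_us < 1/1024 sectors/usec, i.e. just under one million sectors per second; the request size and completion gap below are assumed examples.

/* Illustrative check of the low-rate threshold, not part of the patch. */
#include <stdio.h>

#define BFQ_RATE_SHIFT 16

int main(void)
{
        unsigned long max_size = 256;     /* sectors, assumed 128 KiB request */
        unsigned long delta_us = 500000;  /* assumed 0.5 s completion gap */

        unsigned long lhs = (max_size << BFQ_RATE_SHIFT) / delta_us;
        unsigned long rhs = 1UL << (BFQ_RATE_SHIFT - 10);

        /* 256 sectors in 0.5 s is ~512 sectors/s, far below ~1M sectors/s,
         * so the observation interval would be closed and reset.
         */
        printf("lhs=%lu rhs=%lu -> reset=%d\n", lhs, rhs, lhs < rhs);
        return 0;
}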
@@ -5799,16 +6056,6 @@ USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
                     UINT_MAX);
 #undef USEC_STORE_FUNCTION
 
-static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
-{
-        u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout);
-
-        if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
-                return bfq_calc_max_budget(bfqd->peak_rate, timeout);
-        else
-                return bfq_default_max_budget;
-}
-
 static ssize_t bfq_max_budget_store(struct elevator_queue *e,
                                     const char *page, size_t count)
 {
@@ -5817,7 +6064,7 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e,
         int ret = bfq_var_store(&__data, (page), count);
 
         if (__data == 0)
-                bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
+                bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
         else {
                 if (__data > INT_MAX)
                         __data = INT_MAX;
@@ -5847,7 +6094,7 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
 
         bfqd->bfq_timeout = msecs_to_jiffies(__data);
         if (bfqd->bfq_user_max_budget == 0)
-                bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
+                bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
 
         return ret;
 }