author	Paolo Valente <paolo.valente@linaro.org>	2017-04-12 12:23:10 -0400
committer	Jens Axboe <axboe@fb.com>	2017-04-19 10:30:26 -0400
commit	ab0e43e9cea047873599bc8041cd6278781fd4e0 (patch)
tree	fc7eb653bc41f6c04a7a09fd516f1ee81b12a0d4 /block/bfq-iosched.c
parent	54b604567fbfa1a35a44c2ac4a35c959d277adc2 (diff)
block, bfq: modify the peak-rate estimator
Unless the maximum budget B_max that BFQ can assign to a queue is set explicitly by the user, BFQ automatically updates B_max. In particular, BFQ dynamically sets B_max to the number of sectors that can be read, at the current estimated peak rate, during the maximum time, T_max, allowed before a budget timeout occurs. In formulas, if we denote as R_est the estimated peak rate, then B_max = T_max * R_est. Hence, the higher R_est is with respect to the actual device peak rate, the higher the probability that processes incur budget timeouts unjustly. In addition, a too-high value of B_max unnecessarily increases the deviation from an ideal, smooth service.

Unfortunately, it is not trivial to estimate the peak rate correctly: because of the presence of software and hardware queues between the scheduler and the device components that finally serve I/O requests, it is hard to say exactly when a given dispatched request is served inside the device, and for how long. As a consequence, it is hard to know precisely at what rate a given set of requests is actually served by the device.

On the opposite end, the dispatch time of any request is trivially available, and, from this piece of information, the "dispatch rate" of requests can be immediately computed. So the idea behind the new estimator is to use what is known, namely request dispatch times (plus, when useful, request completion times), to estimate what is unknown, namely the in-device request service rate.

The main issue is that, because of the above facts, the rate at which a certain set of requests is dispatched over a certain time interval can vary greatly with respect to the rate at which the same requests are then served. But, since the size of any intermediate queue is limited, and the service scheme is lossless (no request is silently dropped), the following obvious convergence property holds: the number of requests dispatched MUST become closer and closer to the number of requests completed as the observation interval grows. This is the key property used in this new version of the peak-rate estimator.

Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
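To make B_max = T_max * R_est and the new low-pass update concrete, here is a minimal userspace sketch, not the kernel code itself, that mirrors the fixed-point arithmetic of bfq_calc_max_budget() and of the peak_rate smoothing step added by this patch. The numeric values (a ~100 MB/s device, the default 125 ms timeout) and the helper names are assumptions chosen only for illustration.

	#include <stdio.h>
	#include <stdint.h>

	#define BFQ_RATE_SHIFT 16	/* same fixed-point shift as in the patch */

	/*
	 * B_max = R_est * T_max: peak_rate is in (sectors/usec) << BFQ_RATE_SHIFT,
	 * timeout_ms is in milliseconds, so multiply by 1000 to get usecs and
	 * shift the fixed point back out at the end.
	 */
	static uint64_t calc_max_budget(uint64_t peak_rate, uint64_t timeout_ms)
	{
		return (peak_rate * 1000 * timeout_ms) >> BFQ_RATE_SHIFT;
	}

	/* Low-pass update: peak_rate * (divisor-1)/divisor + rate/divisor */
	static uint64_t lowpass(uint64_t peak_rate, uint64_t rate, uint32_t divisor)
	{
		return peak_rate * (divisor - 1) / divisor + rate / divisor;
	}

	int main(void)
	{
		/* ~100 MB/s with 512 B sectors is ~0.195 sectors/usec */
		uint64_t peak_rate =
			((100ULL * 1000 * 1000 / 512) << BFQ_RATE_SHIFT) / 1000000;

		/* default bfq_timeout is HZ/8, i.e. 125 ms */
		printf("max_budget = %llu sectors (~12.5 MB)\n",
		       (unsigned long long)calc_max_budget(peak_rate, 125));

		/* a higher measured rate folded in with maximum weight (divisor 2) */
		uint64_t rate = peak_rate * 3 / 2;
		printf("new peak_rate = %llu (still << %d)\n",
		       (unsigned long long)lowpass(peak_rate, rate, 2),
		       BFQ_RATE_SHIFT);
		return 0;
	}

With these assumed values the budget comes out at roughly 24,000 sectors, i.e. the amount of data a 100 MB/s device can transfer in 125 ms, which is exactly what the autotuning aims for.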
Diffstat (limited to 'block/bfq-iosched.c')
-rw-r--r--	block/bfq-iosched.c	| 497
1 files changed, 372 insertions, 125 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 1edac72ab51d..61d880b90882 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -407,19 +407,37 @@ struct bfq_data {
407 /* on-disk position of the last served request */ 407 /* on-disk position of the last served request */
408 sector_t last_position; 408 sector_t last_position;
409 409
410 /* time of last request completion (ns) */
411 u64 last_completion;
412
413 /* time of first rq dispatch in current observation interval (ns) */
414 u64 first_dispatch;
415 /* time of last rq dispatch in current observation interval (ns) */
416 u64 last_dispatch;
417
410 /* beginning of the last budget */ 418 /* beginning of the last budget */
411 ktime_t last_budget_start; 419 ktime_t last_budget_start;
412 /* beginning of the last idle slice */ 420 /* beginning of the last idle slice */
413 ktime_t last_idling_start; 421 ktime_t last_idling_start;
414 /* number of samples used to calculate @peak_rate */ 422
423 /* number of samples in current observation interval */
415 int peak_rate_samples; 424 int peak_rate_samples;
425 /* num of samples of seq dispatches in current observation interval */
426 u32 sequential_samples;
427 /* total num of sectors transferred in current observation interval */
428 u64 tot_sectors_dispatched;
429 /* max rq size seen during current observation interval (sectors) */
430 u32 last_rq_max_size;
431 /* time elapsed from first dispatch in current observ. interval (us) */
432 u64 delta_from_first;
416 /* 433 /*
417 * Peak read/write rate, observed during the service of a 434 * Current estimate of the device peak rate, measured in
418 * budget [BFQ_RATE_SHIFT * sectors/usec]. The value is 435 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
419 * left-shifted by BFQ_RATE_SHIFT to increase precision in 436 * BFQ_RATE_SHIFT is performed to increase precision in
420 * fixed-point calculations. 437 * fixed-point calculations.
421 */ 438 */
422 u64 peak_rate; 439 u32 peak_rate;
440
423 /* maximum budget allotted to a bfq_queue before rescheduling */ 441 /* maximum budget allotted to a bfq_queue before rescheduling */
424 int bfq_max_budget; 442 int bfq_max_budget;
425 443
@@ -740,7 +758,7 @@ static const int bfq_timeout = HZ / 8;
740 758
741static struct kmem_cache *bfq_pool; 759static struct kmem_cache *bfq_pool;
742 760
743/* Below this threshold (in ms), we consider thinktime immediate. */ 761/* Below this threshold (in ns), we consider thinktime immediate. */
744#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) 762#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
745 763
746/* hw_tag detection: parallel requests threshold and min samples needed. */ 764/* hw_tag detection: parallel requests threshold and min samples needed. */
@@ -752,8 +770,12 @@ static struct kmem_cache *bfq_pool;
752#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) 770#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
753#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) 771#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
754 772
755/* Min samples used for peak rate estimation (for autotuning). */ 773/* Min number of samples required to perform peak-rate update */
756#define BFQ_PEAK_RATE_SAMPLES 32 774#define BFQ_RATE_MIN_SAMPLES 32
775/* Min observation time interval required to perform a peak-rate update (ns) */
776#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
777/* Target observation time interval for a peak-rate update (ns) */
778#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
757 779
758/* Shift used for peak rate fixed precision calculations. */ 780/* Shift used for peak rate fixed precision calculations. */
759#define BFQ_RATE_SHIFT 16 781#define BFQ_RATE_SHIFT 16
@@ -3837,15 +3859,20 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
3837 return NULL; 3859 return NULL;
3838} 3860}
3839 3861
3862static sector_t get_sdist(sector_t last_pos, struct request *rq)
3863{
3864 if (last_pos)
3865 return abs(blk_rq_pos(rq) - last_pos);
3866
3867 return 0;
3868}
3869
3840#if 0 /* Still not clear if we can do without next two functions */ 3870#if 0 /* Still not clear if we can do without next two functions */
3841static void bfq_activate_request(struct request_queue *q, struct request *rq) 3871static void bfq_activate_request(struct request_queue *q, struct request *rq)
3842{ 3872{
3843 struct bfq_data *bfqd = q->elevator->elevator_data; 3873 struct bfq_data *bfqd = q->elevator->elevator_data;
3844 3874
3845 bfqd->rq_in_driver++; 3875 bfqd->rq_in_driver++;
3846 bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
3847 bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
3848 (unsigned long long)bfqd->last_position);
3849} 3876}
3850 3877
3851static void bfq_deactivate_request(struct request_queue *q, struct request *rq) 3878static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
@@ -4124,6 +4151,227 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd)
4124} 4151}
4125 4152
4126/* 4153/*
4154 * In autotuning mode, max_budget is dynamically recomputed as the
4155 * amount of sectors transferred in timeout at the estimated peak
4156 * rate. This enables BFQ to utilize a full timeslice with a full
4157 * budget, even if the in-service queue is served at peak rate. And
4158 * this maximises throughput with sequential workloads.
4159 */
4160static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
4161{
4162 return (u64)bfqd->peak_rate * USEC_PER_MSEC *
4163 jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
4164}
4165
4166static void bfq_reset_rate_computation(struct bfq_data *bfqd,
4167 struct request *rq)
4168{
4169 if (rq != NULL) { /* new rq dispatch now, reset accordingly */
4170 bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
4171 bfqd->peak_rate_samples = 1;
4172 bfqd->sequential_samples = 0;
4173 bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
4174 blk_rq_sectors(rq);
4175 } else /* no new rq dispatched, just reset the number of samples */
4176 bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
4177
4178 bfq_log(bfqd,
4179 "reset_rate_computation at end, sample %u/%u tot_sects %llu",
4180 bfqd->peak_rate_samples, bfqd->sequential_samples,
4181 bfqd->tot_sectors_dispatched);
4182}
4183
4184static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
4185{
4186 u32 rate, weight, divisor;
4187
4188 /*
4189 * For the convergence property to hold (see comments on
4190 * bfq_update_peak_rate()) and for the assessment to be
4191 * reliable, a minimum number of samples must be present, and
4192 * a minimum amount of time must have elapsed. If not so, do
4193 * not compute new rate. Just reset parameters, to get ready
4194 * for a new evaluation attempt.
4195 */
4196 if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
4197 bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
4198 goto reset_computation;
4199
4200 /*
4201 * If a new request completion has occurred after last
4202 * dispatch, then, to approximate the rate at which requests
4203 * have been served by the device, it is more precise to
4204 * extend the observation interval to the last completion.
4205 */
4206 bfqd->delta_from_first =
4207 max_t(u64, bfqd->delta_from_first,
4208 bfqd->last_completion - bfqd->first_dispatch);
4209
4210 /*
4211 * Rate computed in sects/usec, and not sects/nsec, for
4212 * precision issues.
4213 */
4214 rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
4215 div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
4216
4217 /*
4218 * Peak rate not updated if:
4219 * - the percentage of sequential dispatches is below 3/4 of the
4220 * total, and rate is below the current estimated peak rate
4221 * - rate is unreasonably high (> 20M sectors/sec)
4222 */
4223 if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
4224 rate <= bfqd->peak_rate) ||
4225 rate > 20<<BFQ_RATE_SHIFT)
4226 goto reset_computation;
4227
4228 /*
4229 * We have to update the peak rate, at last! To this purpose,
4230 * we use a low-pass filter. We compute the smoothing constant
4231 * of the filter as a function of the 'weight' of the new
4232 * measured rate.
4233 *
4234 * As can be seen in next formulas, we define this weight as a
4235 * quantity proportional to how sequential the workload is,
4236 * and to how long the observation time interval is.
4237 *
4238 * The weight runs from 0 to 8. The maximum value of the
4239 * weight, 8, yields the minimum value for the smoothing
4240 * constant. At this minimum value for the smoothing constant,
4241 * the measured rate contributes for half of the next value of
4242 * the estimated peak rate.
4243 *
4244 * So, the first step is to compute the weight as a function
4245 * of how sequential the workload is. Note that the weight
4246 * cannot reach 9, because bfqd->sequential_samples cannot
4247 * become equal to bfqd->peak_rate_samples, which, in its
4248 * turn, holds true because bfqd->sequential_samples is not
4249 * incremented for the first sample.
4250 */
4251 weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
4252
4253 /*
4254 * Second step: further refine the weight as a function of the
4255 * duration of the observation interval.
4256 */
4257 weight = min_t(u32, 8,
4258 div_u64(weight * bfqd->delta_from_first,
4259 BFQ_RATE_REF_INTERVAL));
4260
4261 /*
4262 * Divisor ranging from 10, for minimum weight, to 2, for
4263 * maximum weight.
4264 */
4265 divisor = 10 - weight;
4266
4267 /*
4268 * Finally, update peak rate:
4269 *
4270 * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
4271 */
4272 bfqd->peak_rate *= divisor-1;
4273 bfqd->peak_rate /= divisor;
4274 rate /= divisor; /* smoothing constant alpha = 1/divisor */
4275
4276 bfqd->peak_rate += rate;
4277 if (bfqd->bfq_user_max_budget == 0)
4278 bfqd->bfq_max_budget =
4279 bfq_calc_max_budget(bfqd);
4280
4281reset_computation:
4282 bfq_reset_rate_computation(bfqd, rq);
4283}
4284
4285/*
4286 * Update the read/write peak rate (the main quantity used for
4287 * auto-tuning, see update_thr_responsiveness_params()).
4288 *
4289 * It is not trivial to estimate the peak rate (correctly): because of
4290 * the presence of sw and hw queues between the scheduler and the
4291 * device components that finally serve I/O requests, it is hard to
4292 * say exactly when a given dispatched request is served inside the
4293 * device, and for how long. As a consequence, it is hard to know
4294 * precisely at what rate a given set of requests is actually served
4295 * by the device.
4296 *
4297 * On the opposite end, the dispatch time of any request is trivially
4298 * available, and, from this piece of information, the "dispatch rate"
4299 * of requests can be immediately computed. So, the idea in the next
4300 * function is to use what is known, namely request dispatch times
4301 * (plus, when useful, request completion times), to estimate what is
4302 * unknown, namely in-device request service rate.
4303 *
4304 * The main issue is that, because of the above facts, the rate at
4305 * which a certain set of requests is dispatched over a certain time
4306 * interval can vary greatly with respect to the rate at which the
4307 * same requests are then served. But, since the size of any
4308 * intermediate queue is limited, and the service scheme is lossless
4309 * (no request is silently dropped), the following obvious convergence
4310 * property holds: the number of requests dispatched MUST become
4311 * closer and closer to the number of requests completed as the
4312 * observation interval grows. This is the key property used in
4313 * the next function to estimate the peak service rate as a function
4314 * of the observed dispatch rate. The function assumes to be invoked
4315 * on every request dispatch.
4316 */
4317static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
4318{
4319 u64 now_ns = ktime_get_ns();
4320
4321 if (bfqd->peak_rate_samples == 0) { /* first dispatch */
4322 bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
4323 bfqd->peak_rate_samples);
4324 bfq_reset_rate_computation(bfqd, rq);
4325 goto update_last_values; /* will add one sample */
4326 }
4327
4328 /*
4329 * Device idle for very long: the observation interval lasting
4330 * up to this dispatch cannot be a valid observation interval
4331 * for computing a new peak rate (similarly to the late-
4332 * completion event in bfq_completed_request()). Go to
4333 * update_rate_and_reset to have the following three steps
4334 * taken:
4335 * - close the observation interval at the last (previous)
4336 * request dispatch or completion
4337 * - compute rate, if possible, for that observation interval
4338 * - start a new observation interval with this dispatch
4339 */
4340 if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
4341 bfqd->rq_in_driver == 0)
4342 goto update_rate_and_reset;
4343
4344 /* Update sampling information */
4345 bfqd->peak_rate_samples++;
4346
4347 if ((bfqd->rq_in_driver > 0 ||
4348 now_ns - bfqd->last_completion < BFQ_MIN_TT)
4349 && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
4350 bfqd->sequential_samples++;
4351
4352 bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
4353
4354 /* Reset max observed rq size every 32 dispatches */
4355 if (likely(bfqd->peak_rate_samples % 32))
4356 bfqd->last_rq_max_size =
4357 max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
4358 else
4359 bfqd->last_rq_max_size = blk_rq_sectors(rq);
4360
4361 bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
4362
4363 /* Target observation interval not yet reached, go on sampling */
4364 if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
4365 goto update_last_values;
4366
4367update_rate_and_reset:
4368 bfq_update_rate_reset(bfqd, rq);
4369update_last_values:
4370 bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
4371 bfqd->last_dispatch = now_ns;
4372}
4373
4374/*
4127 * Remove request from internal lists. 4375 * Remove request from internal lists.
4128 */ 4376 */
4129static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) 4377static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
@@ -4143,6 +4391,7 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
4143 * happens to be taken into account. 4391 * happens to be taken into account.
4144 */ 4392 */
4145 bfqq->dispatched++; 4393 bfqq->dispatched++;
4394 bfq_update_peak_rate(q->elevator->elevator_data, rq);
4146 4395
4147 bfq_remove_request(q, rq); 4396 bfq_remove_request(q, rq);
4148} 4397}
@@ -4323,110 +4572,92 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
4323 bfqq->entity.budget); 4572 bfqq->entity.budget);
4324} 4573}
4325 4574
4326static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
4327{
4328 unsigned long max_budget;
4329
4330 /*
4331 * The max_budget calculated when autotuning is equal to the
4332 * amount of sectors transferred in timeout at the estimated
4333 * peak rate. To get this value, peak_rate is, first,
4334 * multiplied by 1000, because timeout is measured in ms,
4335 * while peak_rate is measured in sectors/usecs. Then the
4336 * result of this multiplication is right-shifted by
4337 * BFQ_RATE_SHIFT, because peak_rate is equal to the value of
4338 * the peak rate left-shifted by BFQ_RATE_SHIFT.
4339 */
4340 max_budget = (unsigned long)(peak_rate * 1000 *
4341 timeout >> BFQ_RATE_SHIFT);
4342
4343 return max_budget;
4344}
4345
4346/* 4575/*
4347 * In addition to updating the peak rate, checks whether the process 4576 * Return true if the process associated with bfqq is "slow". The slow
4348 * is "slow", and returns 1 if so. This slow flag is used, in addition 4577 * flag is used, in addition to the budget timeout, to reduce the
4349 * to the budget timeout, to reduce the amount of service provided to 4578 * amount of service provided to seeky processes, and thus reduce
4350 * seeky processes, and hence reduce their chances to lower the 4579 * their chances to lower the throughput. More details in the comments
4351 * throughput. See the code for more details. 4580 * on the function bfq_bfqq_expire().
4581 *
4582 * An important observation is in order: as discussed in the comments
4583 * on the function bfq_update_peak_rate(), with devices with internal
4584 * queues, it is hard if ever possible to know when and for how long
4585 * an I/O request is processed by the device (apart from the trivial
4586 * I/O pattern where a new request is dispatched only after the
4587 * previous one has been completed). This makes it hard to evaluate
4588 * the real rate at which the I/O requests of each bfq_queue are
4589 * served. In fact, for an I/O scheduler like BFQ, serving a
4590 * bfq_queue means just dispatching its requests during its service
4591 * slot (i.e., until the budget of the queue is exhausted, or the
4592 * queue remains idle, or, finally, a timeout fires). But, during the
4593 * service slot of a bfq_queue, around 100 ms at most, the device may
4594 * be even still processing requests of bfq_queues served in previous
4595 * service slots. On the opposite end, the requests of the in-service
4596 * bfq_queue may be completed after the service slot of the queue
4597 * finishes.
4598 *
4599 * Anyway, unless more sophisticated solutions are used
4600 * (where possible), the sum of the sizes of the requests dispatched
4601 * during the service slot of a bfq_queue is probably the only
4602 * approximation available for the service received by the bfq_queue
4603 * during its service slot. And this sum is the quantity used in this
4604 * function to evaluate the I/O speed of a process.
4352 */ 4605 */
4353static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, 4606static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
4354 bool compensate) 4607 bool compensate, enum bfqq_expiration reason,
4608 unsigned long *delta_ms)
4355{ 4609{
4356 u64 bw, usecs, expected, timeout; 4610 ktime_t delta_ktime;
4357 ktime_t delta; 4611 u32 delta_usecs;
4358 int update = 0; 4612 bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */
4359 4613
4360 if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) 4614 if (!bfq_bfqq_sync(bfqq))
4361 return false; 4615 return false;
4362 4616
4363 if (compensate) 4617 if (compensate)
4364 delta = bfqd->last_idling_start; 4618 delta_ktime = bfqd->last_idling_start;
4365 else 4619 else
4366 delta = ktime_get(); 4620 delta_ktime = ktime_get();
4367 delta = ktime_sub(delta, bfqd->last_budget_start); 4621 delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
4368 usecs = ktime_to_us(delta); 4622 delta_usecs = ktime_to_us(delta_ktime);
4369 4623
4370 /* don't use too short time intervals */ 4624 /* don't use too short time intervals */
4371 if (usecs < 1000) 4625 if (delta_usecs < 1000) {
4372 return false; 4626 if (blk_queue_nonrot(bfqd->queue))
4373 4627 /*
4374 /* 4628 * give same worst-case guarantees as idling
4375 * Calculate the bandwidth for the last slice. We use a 64 bit 4629 * for seeky
4376 * value to store the peak rate, in sectors per usec in fixed 4630 */
4377 * point math. We do so to have enough precision in the estimate 4631 *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
4378 * and to avoid overflows. 4632 else /* charge at least one seek */
4379 */ 4633 *delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
4380 bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; 4634
4381 do_div(bw, (unsigned long)usecs); 4635 return slow;
4636 }
4382 4637
4383 timeout = jiffies_to_msecs(bfqd->bfq_timeout); 4638 *delta_ms = delta_usecs / USEC_PER_MSEC;
4384 4639
4385 /* 4640 /*
4386 * Use only long (> 20ms) intervals to filter out spikes for 4641 * Use only long (> 20ms) intervals to filter out excessive
4387 * the peak rate estimation. 4642 * spikes in service rate estimation.
4388 */ 4643 */
4389 if (usecs > 20000) { 4644 if (delta_usecs > 20000) {
4390 if (bw > bfqd->peak_rate) { 4645 /*
4391 bfqd->peak_rate = bw; 4646 * Caveat for rotational devices: processes doing I/O
4392 update = 1; 4647 * in the slower disk zones tend to be slow(er) even
4393 bfq_log(bfqd, "new peak_rate=%llu", bw); 4648 * if not seeky. In this respect, the estimated peak
4394 } 4649 * rate is likely to be an average over the disk
4395 4650 * surface. Accordingly, to not be too harsh with
4396 update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; 4651 * unlucky processes, a process is deemed slow only if
4397 4652 * its rate has been lower than half of the estimated
4398 if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) 4653 * peak rate.
4399 bfqd->peak_rate_samples++; 4654 */
4400 4655 slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
4401 if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
4402 update && bfqd->bfq_user_max_budget == 0) {
4403 bfqd->bfq_max_budget =
4404 bfq_calc_max_budget(bfqd->peak_rate,
4405 timeout);
4406 bfq_log(bfqd, "new max_budget=%d",
4407 bfqd->bfq_max_budget);
4408 }
4409 } 4656 }
4410 4657
4411 /* 4658 bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
4412 * A process is considered ``slow'' (i.e., seeky, so that we
4413 * cannot treat it fairly in the service domain, as it would
4414 * slow down too much the other processes) if, when a slice
4415 * ends for whatever reason, it has received service at a
4416 * rate that would not be high enough to complete the budget
4417 * before the budget timeout expiration.
4418 */
4419 expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
4420 4659
4421 /* 4660 return slow;
4422 * Caveat: processes doing IO in the slower disk zones will
4423 * tend to be slow(er) even if not seeky. And the estimated
4424 * peak rate will actually be an average over the disk
4425 * surface. Hence, to not be too harsh with unlucky processes,
4426 * we keep a budget/3 margin of safety before declaring a
4427 * process slow.
4428 */
4429 return expected > (4 * bfqq->entity.budget) / 3;
4430} 4661}
4431 4662
4432/* 4663/*
@@ -4474,13 +4705,14 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
4474 enum bfqq_expiration reason) 4705 enum bfqq_expiration reason)
4475{ 4706{
4476 bool slow; 4707 bool slow;
4708 unsigned long delta = 0;
4709 struct bfq_entity *entity = &bfqq->entity;
4477 int ref; 4710 int ref;
4478 4711
4479 /* 4712 /*
4480 * Update device peak rate for autotuning and check whether the 4713 * Check whether the process is slow (see bfq_bfqq_is_slow).
4481 * process is slow (see bfq_update_peak_rate).
4482 */ 4714 */
4483 slow = bfq_update_peak_rate(bfqd, bfqq, compensate); 4715 slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
4484 4716
4485 /* 4717 /*
4486 * As above explained, 'punish' slow (i.e., seeky), timed-out 4718 * As above explained, 'punish' slow (i.e., seeky), timed-out
@@ -4490,7 +4722,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
4490 bfq_bfqq_charge_full_budget(bfqq); 4722 bfq_bfqq_charge_full_budget(bfqq);
4491 4723
4492 if (reason == BFQQE_TOO_IDLE && 4724 if (reason == BFQQE_TOO_IDLE &&
4493 bfqq->entity.service <= 2 * bfqq->entity.budget / 10) 4725 entity->service <= 2 * entity->budget / 10)
4494 bfq_clear_bfqq_IO_bound(bfqq); 4726 bfq_clear_bfqq_IO_bound(bfqq);
4495 4727
4496 bfq_log_bfqq(bfqd, bfqq, 4728 bfq_log_bfqq(bfqd, bfqq,
@@ -5130,17 +5362,9 @@ static void
5130bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, 5362bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5131 struct request *rq) 5363 struct request *rq)
5132{ 5364{
5133 sector_t sdist = 0;
5134
5135 if (bfqq->last_request_pos) {
5136 if (bfqq->last_request_pos < blk_rq_pos(rq))
5137 sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
5138 else
5139 sdist = bfqq->last_request_pos - blk_rq_pos(rq);
5140 }
5141
5142 bfqq->seek_history <<= 1; 5365 bfqq->seek_history <<= 1;
5143 bfqq->seek_history |= sdist > BFQQ_SEEK_THR && 5366 bfqq->seek_history |=
5367 get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
5144 (!blk_queue_nonrot(bfqd->queue) || 5368 (!blk_queue_nonrot(bfqd->queue) ||
5145 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); 5369 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
5146} 5370}
@@ -5336,12 +5560,45 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
5336 5560
5337static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) 5561static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
5338{ 5562{
5563 u64 now_ns;
5564 u32 delta_us;
5565
5339 bfq_update_hw_tag(bfqd); 5566 bfq_update_hw_tag(bfqd);
5340 5567
5341 bfqd->rq_in_driver--; 5568 bfqd->rq_in_driver--;
5342 bfqq->dispatched--; 5569 bfqq->dispatched--;
5343 5570
5344 bfqq->ttime.last_end_request = ktime_get_ns(); 5571 now_ns = ktime_get_ns();
5572
5573 bfqq->ttime.last_end_request = now_ns;
5574
5575 /*
5576 * Using us instead of ns, to get a reasonable precision in
5577 * computing rate in next check.
5578 */
5579 delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
5580
5581 /*
5582 * If the request took rather long to complete, and, according
5583 * to the maximum request size recorded, this completion latency
5584 * implies that the request was certainly served at a very low
5585 * rate (less than 1M sectors/sec), then the whole observation
5586 * interval that lasts up to this time instant cannot be a
5587 * valid time interval for computing a new peak rate. Invoke
5588 * bfq_update_rate_reset to have the following three steps
5589 * taken:
5590 * - close the observation interval at the last (previous)
5591 * request dispatch or completion
5592 * - compute rate, if possible, for that observation interval
5593 * - reset to zero samples, which will trigger a proper
5594 * re-initialization of the observation interval on next
5595 * dispatch
5596 */
5597 if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
5598 (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
5599 1UL<<(BFQ_RATE_SHIFT - 10))
5600 bfq_update_rate_reset(bfqd, NULL);
5601 bfqd->last_completion = now_ns;
5345 5602
5346 /* 5603 /*
5347 * If this is the in-service queue, check if it needs to be expired, 5604 * If this is the in-service queue, check if it needs to be expired,
@@ -5799,16 +6056,6 @@ USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
5799 UINT_MAX); 6056 UINT_MAX);
5800#undef USEC_STORE_FUNCTION 6057#undef USEC_STORE_FUNCTION
5801 6058
5802static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
5803{
5804 u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout);
5805
5806 if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
5807 return bfq_calc_max_budget(bfqd->peak_rate, timeout);
5808 else
5809 return bfq_default_max_budget;
5810}
5811
5812static ssize_t bfq_max_budget_store(struct elevator_queue *e, 6059static ssize_t bfq_max_budget_store(struct elevator_queue *e,
5813 const char *page, size_t count) 6060 const char *page, size_t count)
5814{ 6061{
@@ -5817,7 +6064,7 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e,
5817 int ret = bfq_var_store(&__data, (page), count); 6064 int ret = bfq_var_store(&__data, (page), count);
5818 6065
5819 if (__data == 0) 6066 if (__data == 0)
5820 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); 6067 bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
5821 else { 6068 else {
5822 if (__data > INT_MAX) 6069 if (__data > INT_MAX)
5823 __data = INT_MAX; 6070 __data = INT_MAX;
@@ -5847,7 +6094,7 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
5847 6094
5848 bfqd->bfq_timeout = msecs_to_jiffies(__data); 6095 bfqd->bfq_timeout = msecs_to_jiffies(__data);
5849 if (bfqd->bfq_user_max_budget == 0) 6096 if (bfqd->bfq_user_max_budget == 0)
5850 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); 6097 bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
5851 6098
5852 return ret; 6099 return ret;
5853} 6100}
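For the smoothing constant itself, the patch derives a divisor between 2 and 10 from how sequential the observed dispatches were and how long the observation interval lasted (see the comments in bfq_update_rate_reset() above). Below is a hedged userspace sketch of that derivation; the sample counts and interval lengths are assumptions of this example, not values taken from the patch.

	#include <stdio.h>
	#include <stdint.h>

	#define BFQ_RATE_REF_INTERVAL 1000000000ULL	/* 1 s in ns, as in the patch */

	static uint32_t smoothing_divisor(uint32_t seq_samples, uint32_t tot_samples,
					  uint64_t delta_from_first_ns)
	{
		/* weight in [0, 8], proportional to how sequential the samples are */
		uint64_t weight = (9ULL * seq_samples) / tot_samples;

		/* ...then scaled by the observation-interval length, capped at 8 */
		weight = weight * delta_from_first_ns / BFQ_RATE_REF_INTERVAL;
		if (weight > 8)
			weight = 8;

		/* 10 for minimum weight, 2 for maximum weight */
		return 10 - (uint32_t)weight;
	}

	int main(void)
	{
		/* 60 sequential samples out of 64 over a full 1 s interval */
		printf("divisor = %u\n", smoothing_divisor(60, 64, 1000000000ULL));
		/* mostly random samples over a short 0.4 s interval */
		printf("divisor = %u\n", smoothing_divisor(10, 64, 400000000ULL));
		return 0;
	}

In the first case the divisor is 2, so the new measurement contributes half of the updated estimate; in the second it is 10, so a short, seeky interval barely moves the peak rate.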