Diffstat (limited to 'block/bfq-iosched.c')
-rw-r--r--  block/bfq-iosched.c | 497
1 file changed, 372 insertions(+), 125 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 1edac72ab51d..61d880b90882 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -407,19 +407,37 @@ struct bfq_data {
407 /* on-disk position of the last served request */ 407 /* on-disk position of the last served request */
408 sector_t last_position; 408 sector_t last_position;
409 409
410 /* time of last request completion (ns) */
411 u64 last_completion;
412
413 /* time of first rq dispatch in current observation interval (ns) */
414 u64 first_dispatch;
415 /* time of last rq dispatch in current observation interval (ns) */
416 u64 last_dispatch;
417
410 /* beginning of the last budget */ 418 /* beginning of the last budget */
411 ktime_t last_budget_start; 419 ktime_t last_budget_start;
412 /* beginning of the last idle slice */ 420 /* beginning of the last idle slice */
413 ktime_t last_idling_start; 421 ktime_t last_idling_start;
414 /* number of samples used to calculate @peak_rate */ 422
423 /* number of samples in current observation interval */
415 int peak_rate_samples; 424 int peak_rate_samples;
425 /* num of samples of seq dispatches in current observation interval */
426 u32 sequential_samples;
427 /* total num of sectors transferred in current observation interval */
428 u64 tot_sectors_dispatched;
429 /* max rq size seen during current observation interval (sectors) */
430 u32 last_rq_max_size;
431 /* time elapsed from first dispatch in current observ. interval (us) */
432 u64 delta_from_first;
416 /* 433 /*
417 * Peak read/write rate, observed during the service of a 434 * Current estimate of the device peak rate, measured in
418 * budget [BFQ_RATE_SHIFT * sectors/usec]. The value is 435 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
419 * left-shifted by BFQ_RATE_SHIFT to increase precision in 436 * BFQ_RATE_SHIFT is performed to increase precision in
420 * fixed-point calculations. 437 * fixed-point calculations.
421 */ 438 */
422 u64 peak_rate; 439 u32 peak_rate;
440
423 /* maximum budget allotted to a bfq_queue before rescheduling */ 441 /* maximum budget allotted to a bfq_queue before rescheduling */
424 int bfq_max_budget; 442 int bfq_max_budget;
425 443
@@ -740,7 +758,7 @@ static const int bfq_timeout = HZ / 8;
740 758
741static struct kmem_cache *bfq_pool; 759static struct kmem_cache *bfq_pool;
742 760
743/* Below this threshold (in ms), we consider thinktime immediate. */ 761/* Below this threshold (in ns), we consider thinktime immediate. */
744#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) 762#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
745 763
746/* hw_tag detection: parallel requests threshold and min samples needed. */ 764/* hw_tag detection: parallel requests threshold and min samples needed. */
@@ -752,8 +770,12 @@ static struct kmem_cache *bfq_pool;
752#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) 770#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
753#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) 771#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
754 772
755/* Min samples used for peak rate estimation (for autotuning). */ 773/* Min number of samples required to perform peak-rate update */
756#define BFQ_PEAK_RATE_SAMPLES 32 774#define BFQ_RATE_MIN_SAMPLES 32
775/* Min observation time interval required to perform a peak-rate update (ns) */
776#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
777/* Target observation time interval for a peak-rate update (ns) */
778#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
757 779
758/* Shift used for peak rate fixed precision calculations. */ 780/* Shift used for peak rate fixed precision calculations. */
759#define BFQ_RATE_SHIFT 16 781#define BFQ_RATE_SHIFT 16
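To make the fixed-point convention concrete, here is a small userspace sketch (not part of the patch; only BFQ_RATE_SHIFT mirrors the define above, everything else is illustrative) showing what a rate stored as (sectors/usec) << BFQ_RATE_SHIFT corresponds to in plain units:

    #include <stdio.h>
    #include <stdint.h>

    #define BFQ_RATE_SHIFT 16

    int main(void)
    {
        /* ~1.5 sectors/usec, stored as 1.5 * 2^16 = 98304; without the
         * shift, integer math would round this down to 1 sector/usec. */
        uint32_t peak_rate = (3u << BFQ_RATE_SHIFT) / 2;

        /* sectors/sec: scale to a full second in 64 bits, then drop
         * the fixed-point shift. */
        uint64_t sectors_per_sec =
            ((uint64_t)peak_rate * 1000000ULL) >> BFQ_RATE_SHIFT;

        /* rough MiB/s, assuming 512-byte sectors */
        uint64_t mib_per_sec = (sectors_per_sec * 512ULL) >> 20;

        printf("peak_rate=%u -> %llu sectors/s (~%llu MiB/s)\n",
               (unsigned)peak_rate,
               (unsigned long long)sectors_per_sec,
               (unsigned long long)mib_per_sec);
        return 0;
    }

This prints 1500000 sectors/s (~732 MiB/s), i.e. a mid-range SSD; the 16-bit shift is what keeps the fractional part of such rates from being lost.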
@@ -3837,15 +3859,20 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
3837 return NULL; 3859 return NULL;
3838} 3860}
3839 3861
3862static sector_t get_sdist(sector_t last_pos, struct request *rq)
3863{
3864 if (last_pos)
3865 return abs(blk_rq_pos(rq) - last_pos);
3866
3867 return 0;
3868}
3869
3840#if 0 /* Still not clear if we can do without next two functions */ 3870#if 0 /* Still not clear if we can do without next two functions */
3841static void bfq_activate_request(struct request_queue *q, struct request *rq) 3871static void bfq_activate_request(struct request_queue *q, struct request *rq)
3842{ 3872{
3843 struct bfq_data *bfqd = q->elevator->elevator_data; 3873 struct bfq_data *bfqd = q->elevator->elevator_data;
3844 3874
3845 bfqd->rq_in_driver++; 3875 bfqd->rq_in_driver++;
3846 bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
3847 bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
3848 (unsigned long long)bfqd->last_position);
3849} 3876}
3850 3877
3851static void bfq_deactivate_request(struct request_queue *q, struct request *rq) 3878static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
@@ -4124,6 +4151,227 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd)
4124} 4151}
4125 4152
4126/* 4153/*
4154 * In autotuning mode, max_budget is dynamically recomputed as the
4155 * amount of sectors transferred in timeout at the estimated peak
4156 * rate. This enables BFQ to utilize a full timeslice with a full
4157 * budget, even if the in-service queue is served at peak rate. And
4158 * this maximises throughput with sequential workloads.
4159 */
4160static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
4161{
4162 return (u64)bfqd->peak_rate * USEC_PER_MSEC *
4163 jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
4164}
4165
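For intuition, the same computation in standalone userspace C, assuming the default 125 ms timeout (bfq_timeout = HZ / 8) and the 1.5 sectors/usec rate from the earlier sketch; the helper name and everything outside the formula are illustrative, not kernel API:

    #include <stdio.h>
    #include <stdint.h>

    #define BFQ_RATE_SHIFT  16
    #define USEC_PER_MSEC   1000ULL

    /* Sectors transferable in timeout_ms milliseconds at peak_rate,
     * where peak_rate is fixed-point (sectors/usec) << BFQ_RATE_SHIFT. */
    static unsigned long calc_max_budget(uint32_t peak_rate,
                                         unsigned int timeout_ms)
    {
        return ((uint64_t)peak_rate * USEC_PER_MSEC * timeout_ms)
                >> BFQ_RATE_SHIFT;
    }

    int main(void)
    {
        uint32_t rate = (3u << BFQ_RATE_SHIFT) / 2;  /* 1.5 sectors/usec */

        printf("max_budget = %lu sectors\n", calc_max_budget(rate, 125));
        return 0;
    }

This prints 187500 sectors (~91 MiB), i.e. exactly what a ~732 MiB/s device can move in one 125 ms timeslice.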
4166static void bfq_reset_rate_computation(struct bfq_data *bfqd,
4167 struct request *rq)
4168{
4169 if (rq != NULL) { /* new rq dispatch now, reset accordingly */
4170 bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
4171 bfqd->peak_rate_samples = 1;
4172 bfqd->sequential_samples = 0;
4173 bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
4174 blk_rq_sectors(rq);
4175 } else /* no new rq dispatched, just reset the number of samples */
4176 bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
4177
4178 bfq_log(bfqd,
4179 "reset_rate_computation at end, sample %u/%u tot_sects %llu",
4180 bfqd->peak_rate_samples, bfqd->sequential_samples,
4181 bfqd->tot_sectors_dispatched);
4182}
4183
4184static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
4185{
4186 u32 rate, weight, divisor;
4187
4188 /*
4189 * For the convergence property to hold (see comments on
4190 * bfq_update_peak_rate()) and for the assessment to be
4191 * reliable, a minimum number of samples must be present, and
4192 * a minimum amount of time must have elapsed. If not so, do
4193 * not compute new rate. Just reset parameters, to get ready
4194 * for a new evaluation attempt.
4195 */
4196 if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
4197 bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
4198 goto reset_computation;
4199
4200 /*
4201 * If a new request completion has occurred after last
4202 * dispatch, then, to approximate the rate at which requests
4203 * have been served by the device, it is more precise to
4204 * extend the observation interval to the last completion.
4205 */
4206 bfqd->delta_from_first =
4207 max_t(u64, bfqd->delta_from_first,
4208 bfqd->last_completion - bfqd->first_dispatch);
4209
4210 /*
4211 * Rate computed in sects/usec, and not sects/nsec, for
4212 * precision issues.
4213 */
4214 rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
4215 div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
4216
4217 /*
4218 * Peak rate not updated if:
4219 * - the percentage of sequential dispatches is below 3/4 of the
4220 * total, and rate is below the current estimated peak rate
4221 * - rate is unreasonably high (> 20M sectors/sec)
4222 */
4223 if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
4224 rate <= bfqd->peak_rate) ||
4225 rate > 20<<BFQ_RATE_SHIFT)
4226 goto reset_computation;
4227
4228 /*
4229 * We have to update the peak rate, at last! To this purpose,
4230 * we use a low-pass filter. We compute the smoothing constant
4231 * of the filter as a function of the 'weight' of the new
4232 * measured rate.
4233 *
4234 * As can be seen in next formulas, we define this weight as a
4235 * quantity proportional to how sequential the workload is,
4236 * and to how long the observation time interval is.
4237 *
4238 * The weight runs from 0 to 8. The maximum value of the
4239 * weight, 8, yields the minimum value for the smoothing
4240 * constant. At this minimum value for the smoothing constant,
4241 * the measured rate contributes for half of the next value of
4242 * the estimated peak rate.
4243 *
4244 * So, the first step is to compute the weight as a function
4245 * of how sequential the workload is. Note that the weight
4246 * cannot reach 9, because bfqd->sequential_samples cannot
4247 * become equal to bfqd->peak_rate_samples, which, in its
4248 * turn, holds true because bfqd->sequential_samples is not
4249 * incremented for the first sample.
4250 */
4251 weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
4252
4253 /*
4254 * Second step: further refine the weight as a function of the
4255 * duration of the observation interval.
4256 */
4257 weight = min_t(u32, 8,
4258 div_u64(weight * bfqd->delta_from_first,
4259 BFQ_RATE_REF_INTERVAL));
4260
4261 /*
4262 * Divisor ranging from 10, for minimum weight, to 2, for
4263 * maximum weight.
4264 */
4265 divisor = 10 - weight;
4266
4267 /*
4268 * Finally, update peak rate:
4269 *
4270 * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
4271 */
4272 bfqd->peak_rate *= divisor-1;
4273 bfqd->peak_rate /= divisor;
4274 rate /= divisor; /* smoothing constant alpha = 1/divisor */
4275
4276 bfqd->peak_rate += rate;
4277 if (bfqd->bfq_user_max_budget == 0)
4278 bfqd->bfq_max_budget =
4279 bfq_calc_max_budget(bfqd);
4280
4281reset_computation:
4282 bfq_reset_rate_computation(bfqd, rq);
4283}
4284
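The weight/divisor arithmetic above is an exponentially weighted moving average whose coefficient depends on how much the new sample can be trusted. A minimal userspace sketch of just that filter step (mirroring the formulas above; not the kernel function, and the names are made up):

    #include <stdio.h>
    #include <stdint.h>

    #define BFQ_RATE_REF_INTERVAL 1000000000ULL    /* 1 s, in ns */

    /* One low-pass filter step: divisor runs from 10, when the new
     * sample is least trustworthy, down to 2, when it comes from a
     * long, mostly sequential interval. */
    static uint32_t filter_step(uint32_t old_rate, uint32_t new_rate,
                                uint32_t seq_samples, uint32_t tot_samples,
                                uint64_t delta_ns)
    {
        uint64_t weight;
        uint32_t divisor;

        /* 0..8, proportional to how sequential the interval was... */
        weight = (9ULL * seq_samples) / tot_samples;
        /* ...then scaled down if the interval was shorter than 1 s. */
        weight = weight * delta_ns / BFQ_RATE_REF_INTERVAL;
        if (weight > 8)
            weight = 8;

        divisor = 10 - (uint32_t)weight;

        /* new estimate = old * (divisor - 1) / divisor + sample / divisor */
        old_rate = (uint32_t)((uint64_t)old_rate * (divisor - 1) / divisor);
        return old_rate + new_rate / divisor;
    }

    int main(void)
    {
        /* Fully sequential 1 s interval: divisor becomes 2, so the
         * estimate moves halfway from 100 toward the 200 sample. */
        printf("%u\n", filter_step(100, 200, 31, 32, 1000000000ULL));
        return 0;
    }

With weight 8 the new sample accounts for half of the estimate; with weight 0 (a short or seeky interval) it accounts for only a tenth, so a single odd measurement barely moves peak_rate.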
4285/*
4286 * Update the read/write peak rate (the main quantity used for
4287 * auto-tuning, see update_thr_responsiveness_params()).
4288 *
4289 * It is not trivial to estimate the peak rate (correctly): because of
4290 * the presence of sw and hw queues between the scheduler and the
4291 * device components that finally serve I/O requests, it is hard to
4292 * say exactly when a given dispatched request is served inside the
4293 * device, and for how long. As a consequence, it is hard to know
4294 * precisely at what rate a given set of requests is actually served
4295 * by the device.
4296 *
4297 * On the opposite end, the dispatch time of any request is trivially
4298 * available, and, from this piece of information, the "dispatch rate"
4299 * of requests can be immediately computed. So, the idea in the next
4300 * function is to use what is known, namely request dispatch times
4301 * (plus, when useful, request completion times), to estimate what is
4302 * unknown, namely in-device request service rate.
4303 *
4304 * The main issue is that, because of the above facts, the rate at
4305 * which a certain set of requests is dispatched over a certain time
4306 * interval can vary greatly with respect to the rate at which the
4307 * same requests are then served. But, since the size of any
4308 * intermediate queue is limited, and the service scheme is lossless
4309 * (no request is silently dropped), the following obvious convergence
4310 * property holds: the number of requests dispatched MUST become
4311 * closer and closer to the number of requests completed as the
4312 * observation interval grows. This is the key property used in
4313 * the next function to estimate the peak service rate as a function
4314 * of the observed dispatch rate. The function assumes to be invoked
4315 * on every request dispatch.
4316 */
4317static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
4318{
4319 u64 now_ns = ktime_get_ns();
4320
4321 if (bfqd->peak_rate_samples == 0) { /* first dispatch */
4322 bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
4323 bfqd->peak_rate_samples);
4324 bfq_reset_rate_computation(bfqd, rq);
4325 goto update_last_values; /* will add one sample */
4326 }
4327
4328 /*
4329 * Device idle for very long: the observation interval lasting
4330 * up to this dispatch cannot be a valid observation interval
4331 * for computing a new peak rate (similarly to the late-
4332 * completion event in bfq_completed_request()). Go to
4333 * update_rate_and_reset to have the following three steps
4334 * taken:
4335 * - close the observation interval at the last (previous)
4336 * request dispatch or completion
4337 * - compute rate, if possible, for that observation interval
4338 * - start a new observation interval with this dispatch
4339 */
4340 if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
4341 bfqd->rq_in_driver == 0)
4342 goto update_rate_and_reset;
4343
4344 /* Update sampling information */
4345 bfqd->peak_rate_samples++;
4346
4347 if ((bfqd->rq_in_driver > 0 ||
4348 now_ns - bfqd->last_completion < BFQ_MIN_TT)
4349 && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
4350 bfqd->sequential_samples++;
4351
4352 bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
4353
4354 /* Reset max observed rq size every 32 dispatches */
4355 if (likely(bfqd->peak_rate_samples % 32))
4356 bfqd->last_rq_max_size =
4357 max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
4358 else
4359 bfqd->last_rq_max_size = blk_rq_sectors(rq);
4360
4361 bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
4362
4363 /* Target observation interval not yet reached, go on sampling */
4364 if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
4365 goto update_last_values;
4366
4367update_rate_and_reset:
4368 bfq_update_rate_reset(bfqd, rq);
4369update_last_values:
4370 bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
4371 bfqd->last_dispatch = now_ns;
4372}
4373
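The convergence argument in the comment above can be made concrete with a toy model (purely illustrative, unrelated to the block layer): requests enter a bounded in-device queue at the dispatch rate and leave at the unknown service rate, so the dispatch counter can never run ahead of the completion counter by more than the queue depth, and the measured dispatch rate approaches the service rate as the observation window grows.

    #include <stdio.h>

    int main(void)
    {
        const double queue_depth = 32;      /* bounded in-device queue */
        const double service_rate = 1000.0; /* requests/s, unknown to the scheduler */

        for (double t = 1; t <= 64; t *= 2) {
            double completed = service_rate * t;
            /* at most queue_depth requests can be in flight */
            double dispatched = completed + queue_depth;

            printf("window %4.0f s: dispatch rate %7.1f, service rate %7.1f\n",
                   t, dispatched / t, completed / t);
        }
        return 0;
    }

At a 1 s window the dispatch rate overestimates the service rate by 3.2%; at 64 s the error has shrunk to 0.05%, which is why a minimum observation interval is required before a sample is trusted.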
4374/*
4127 * Remove request from internal lists. 4375 * Remove request from internal lists.
4128 */ 4376 */
4129static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) 4377static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
@@ -4143,6 +4391,7 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
4143 * happens to be taken into account. 4391 * happens to be taken into account.
4144 */ 4392 */
4145 bfqq->dispatched++; 4393 bfqq->dispatched++;
4394 bfq_update_peak_rate(q->elevator->elevator_data, rq);
4146 4395
4147 bfq_remove_request(q, rq); 4396 bfq_remove_request(q, rq);
4148} 4397}
@@ -4323,110 +4572,92 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
4323 bfqq->entity.budget); 4572 bfqq->entity.budget);
4324} 4573}
4325 4574
4326static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
4327{
4328 unsigned long max_budget;
4329
4330 /*
4331 * The max_budget calculated when autotuning is equal to the
4332 * amount of sectors transferred in timeout at the estimated
4333 * peak rate. To get this value, peak_rate is, first,
4334 * multiplied by 1000, because timeout is measured in ms,
4335 * while peak_rate is measured in sectors/usecs. Then the
4336 * result of this multiplication is right-shifted by
4337 * BFQ_RATE_SHIFT, because peak_rate is equal to the value of
4338 * the peak rate left-shifted by BFQ_RATE_SHIFT.
4339 */
4340 max_budget = (unsigned long)(peak_rate * 1000 *
4341 timeout >> BFQ_RATE_SHIFT);
4342
4343 return max_budget;
4344}
4345
4346/* 4575/*
4347 * In addition to updating the peak rate, checks whether the process 4576 * Return true if the process associated with bfqq is "slow". The slow
4348 * is "slow", and returns 1 if so. This slow flag is used, in addition 4577 * flag is used, in addition to the budget timeout, to reduce the
4349 * to the budget timeout, to reduce the amount of service provided to 4578 * amount of service provided to seeky processes, and thus reduce
4350 * seeky processes, and hence reduce their chances to lower the 4579 * their chances to lower the throughput. More details in the comments
4351 * throughput. See the code for more details. 4580 * on the function bfq_bfqq_expire().
4581 *
4582 * An important observation is in order: as discussed in the comments
4583 * on the function bfq_update_peak_rate(), with devices with internal
4584 * queues, it is hard if ever possible to know when and for how long
4585 * an I/O request is processed by the device (apart from the trivial
4586 * I/O pattern where a new request is dispatched only after the
4587 * previous one has been completed). This makes it hard to evaluate
4588 * the real rate at which the I/O requests of each bfq_queue are
4589 * served. In fact, for an I/O scheduler like BFQ, serving a
4590 * bfq_queue means just dispatching its requests during its service
4591 * slot (i.e., until the budget of the queue is exhausted, or the
4592 * queue remains idle, or, finally, a timeout fires). But, during the
4593 * service slot of a bfq_queue, around 100 ms at most, the device may
4594 * be even still processing requests of bfq_queues served in previous
4595 * service slots. On the opposite end, the requests of the in-service
4596 * bfq_queue may be completed after the service slot of the queue
4597 * finishes.
4598 *
4599 * Anyway, unless more sophisticated solutions are used
4600 * (where possible), the sum of the sizes of the requests dispatched
4601 * during the service slot of a bfq_queue is probably the only
4602 * approximation available for the service received by the bfq_queue
4603 * during its service slot. And this sum is the quantity used in this
4604 * function to evaluate the I/O speed of a process.
4352 */ 4605 */
4353static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, 4606static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
4354 bool compensate) 4607 bool compensate, enum bfqq_expiration reason,
4608 unsigned long *delta_ms)
4355{ 4609{
4356 u64 bw, usecs, expected, timeout; 4610 ktime_t delta_ktime;
4357 ktime_t delta; 4611 u32 delta_usecs;
4358 int update = 0; 4612 bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */
4359 4613
4360 if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) 4614 if (!bfq_bfqq_sync(bfqq))
4361 return false; 4615 return false;
4362 4616
4363 if (compensate) 4617 if (compensate)
4364 delta = bfqd->last_idling_start; 4618 delta_ktime = bfqd->last_idling_start;
4365 else 4619 else
4366 delta = ktime_get(); 4620 delta_ktime = ktime_get();
4367 delta = ktime_sub(delta, bfqd->last_budget_start); 4621 delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
4368 usecs = ktime_to_us(delta); 4622 delta_usecs = ktime_to_us(delta_ktime);
4369 4623
4370 /* don't use too short time intervals */ 4624 /* don't use too short time intervals */
4371 if (usecs < 1000) 4625 if (delta_usecs < 1000) {
4372 return false; 4626 if (blk_queue_nonrot(bfqd->queue))
4373 4627 /*
4374 /* 4628 * give same worst-case guarantees as idling
4375 * Calculate the bandwidth for the last slice. We use a 64 bit 4629 * for seeky
4376 * value to store the peak rate, in sectors per usec in fixed 4630 */
4377 * point math. We do so to have enough precision in the estimate 4631 *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
4378 * and to avoid overflows. 4632 else /* charge at least one seek */
4379 */ 4633 *delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
4380 bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; 4634
4381 do_div(bw, (unsigned long)usecs); 4635 return slow;
4636 }
4382 4637
4383 timeout = jiffies_to_msecs(bfqd->bfq_timeout); 4638 *delta_ms = delta_usecs / USEC_PER_MSEC;
4384 4639
4385 /* 4640 /*
4386 * Use only long (> 20ms) intervals to filter out spikes for 4641 * Use only long (> 20ms) intervals to filter out excessive
4387 * the peak rate estimation. 4642 * spikes in service rate estimation.
4388 */ 4643 */
4389 if (usecs > 20000) { 4644 if (delta_usecs > 20000) {
4390 if (bw > bfqd->peak_rate) { 4645 /*
4391 bfqd->peak_rate = bw; 4646 * Caveat for rotational devices: processes doing I/O
4392 update = 1; 4647 * in the slower disk zones tend to be slow(er) even
4393 bfq_log(bfqd, "new peak_rate=%llu", bw); 4648 * if not seeky. In this respect, the estimated peak
4394 } 4649 * rate is likely to be an average over the disk
4395 4650 * surface. Accordingly, to not be too harsh with
4396 update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; 4651 * unlucky processes, a process is deemed slow only if
4397 4652 * its rate has been lower than half of the estimated
4398 if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) 4653 * peak rate.
4399 bfqd->peak_rate_samples++; 4654 */
4400 4655 slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
4401 if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
4402 update && bfqd->bfq_user_max_budget == 0) {
4403 bfqd->bfq_max_budget =
4404 bfq_calc_max_budget(bfqd->peak_rate,
4405 timeout);
4406 bfq_log(bfqd, "new max_budget=%d",
4407 bfqd->bfq_max_budget);
4408 }
4409 } 4656 }
4410 4657
4411 /* 4658 bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
4412 * A process is considered ``slow'' (i.e., seeky, so that we
4413 * cannot treat it fairly in the service domain, as it would
4414 * slow down too much the other processes) if, when a slice
4415 * ends for whatever reason, it has received service at a
4416 * rate that would not be high enough to complete the budget
4417 * before the budget timeout expiration.
4418 */
4419 expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
4420 4659
4421 /* 4660 return slow;
4422 * Caveat: processes doing IO in the slower disk zones will
4423 * tend to be slow(er) even if not seeky. And the estimated
4424 * peak rate will actually be an average over the disk
4425 * surface. Hence, to not be too harsh with unlucky processes,
4426 * we keep a budget/3 margin of safety before declaring a
4427 * process slow.
4428 */
4429 return expected > (4 * bfqq->entity.budget) / 3;
4430} 4661}
4431 4662
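Stripped of the kernel plumbing, the decision above reduces to a small predicate. The sketch below is illustrative only (it omits the sub-millisecond charging of an idle/seek penalty) and reuses the 187500-sector max budget from the earlier example:

    #include <stdbool.h>
    #include <stdio.h>

    /* Slow if, over an interval long enough to be trusted (> 20 ms),
     * the queue received less than half a max budget of service. */
    static bool queue_is_slow(unsigned int delta_usecs, int service_sectors,
                              int max_budget_sectors, bool seeky)
    {
        bool slow = seeky;  /* too short an interval: fall back on seekyness */

        if (delta_usecs > 20000)
            slow = service_sectors < max_budget_sectors / 2;

        return slow;
    }

    int main(void)
    {
        /* 40000 sectors in 50 ms is slow, 120000 sectors is not. */
        printf("%d %d\n",
               queue_is_slow(50000, 40000, 187500, false),
               queue_is_slow(50000, 120000, 187500, false));
        return 0;
    }

The half-budget margin is the "don't be too harsh with unlucky processes" safety described in the comment above.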
4432/* 4663/*
@@ -4474,13 +4705,14 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
4474 enum bfqq_expiration reason) 4705 enum bfqq_expiration reason)
4475{ 4706{
4476 bool slow; 4707 bool slow;
4708 unsigned long delta = 0;
4709 struct bfq_entity *entity = &bfqq->entity;
4477 int ref; 4710 int ref;
4478 4711
4479 /* 4712 /*
4480 * Update device peak rate for autotuning and check whether the 4713 * Check whether the process is slow (see bfq_bfqq_is_slow).
4481 * process is slow (see bfq_update_peak_rate).
4482 */ 4714 */
4483 slow = bfq_update_peak_rate(bfqd, bfqq, compensate); 4715 slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
4484 4716
4485 /* 4717 /*
4486 * As above explained, 'punish' slow (i.e., seeky), timed-out 4718 * As above explained, 'punish' slow (i.e., seeky), timed-out
@@ -4490,7 +4722,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
4490 bfq_bfqq_charge_full_budget(bfqq); 4722 bfq_bfqq_charge_full_budget(bfqq);
4491 4723
4492 if (reason == BFQQE_TOO_IDLE && 4724 if (reason == BFQQE_TOO_IDLE &&
4493 bfqq->entity.service <= 2 * bfqq->entity.budget / 10) 4725 entity->service <= 2 * entity->budget / 10)
4494 bfq_clear_bfqq_IO_bound(bfqq); 4726 bfq_clear_bfqq_IO_bound(bfqq);
4495 4727
4496 bfq_log_bfqq(bfqd, bfqq, 4728 bfq_log_bfqq(bfqd, bfqq,
@@ -5130,17 +5362,9 @@ static void
5130bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, 5362bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5131 struct request *rq) 5363 struct request *rq)
5132{ 5364{
5133 sector_t sdist = 0;
5134
5135 if (bfqq->last_request_pos) {
5136 if (bfqq->last_request_pos < blk_rq_pos(rq))
5137 sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
5138 else
5139 sdist = bfqq->last_request_pos - blk_rq_pos(rq);
5140 }
5141
5142 bfqq->seek_history <<= 1; 5365 bfqq->seek_history <<= 1;
5143 bfqq->seek_history |= sdist > BFQQ_SEEK_THR && 5366 bfqq->seek_history |=
5367 get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
5144 (!blk_queue_nonrot(bfqd->queue) || 5368 (!blk_queue_nonrot(bfqd->queue) ||
5145 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); 5369 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
5146} 5370}
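How the 32-bit history drives the BFQQ_SEEKY classification can be seen in a standalone sketch (GCC/Clang C; the distance threshold is a placeholder, since BFQQ_SEEK_THR is defined outside this hunk, and the non-rotational refinement is dropped):

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdint.h>

    #define SEEK_THR 8192ULL    /* placeholder, not the real BFQQ_SEEK_THR */

    static uint32_t seek_history;

    /* Shift in one bit per request, set when the request starts far
     * from where the previous one ended. */
    static void update_seek_history(uint64_t last_pos, uint64_t new_pos)
    {
        uint64_t sdist = new_pos > last_pos ? new_pos - last_pos
                                            : last_pos - new_pos;

        seek_history <<= 1;
        seek_history |= sdist > SEEK_THR;
    }

    /* BFQQ_SEEKY: more than 32/8 = 4 of the last 32 requests were far. */
    static bool is_seeky(void)
    {
        return __builtin_popcount(seek_history) > 32 / 8;
    }

    int main(void)
    {
        uint64_t pos = 1000000;     /* arbitrary starting sector */
        int i;

        for (i = 0; i < 5; i++) {   /* five large jumps in a row */
            update_seek_history(pos, pos + 100000);
            pos += 100000;
        }
        printf("seeky: %d\n", is_seeky());  /* prints 1 */
        return 0;
    }

Four distant requests out of the last 32 are tolerated; the fifth tips the queue into the seeky class, which in turn feeds the slowness fallback in bfq_bfqq_is_slow().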
@@ -5336,12 +5560,45 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
5336 5560
5337static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) 5561static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
5338{ 5562{
5563 u64 now_ns;
5564 u32 delta_us;
5565
5339 bfq_update_hw_tag(bfqd); 5566 bfq_update_hw_tag(bfqd);
5340 5567
5341 bfqd->rq_in_driver--; 5568 bfqd->rq_in_driver--;
5342 bfqq->dispatched--; 5569 bfqq->dispatched--;
5343 5570
5344 bfqq->ttime.last_end_request = ktime_get_ns(); 5571 now_ns = ktime_get_ns();
5572
5573 bfqq->ttime.last_end_request = now_ns;
5574
5575 /*
5576 * Using us instead of ns, to get a reasonable precision in
5577 * computing rate in next check.
5578 */
5579 delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
5580
5581 /*
5582 * If the request took rather long to complete, and, according
5583 * to the maximum request size recorded, this completion latency
5584 * implies that the request was certainly served at a very low
5585 * rate (less than 1M sectors/sec), then the whole observation
5586 * interval that lasts up to this time instant cannot be a
5587 * valid time interval for computing a new peak rate. Invoke
5588 * bfq_update_rate_reset to have the following three steps
5589 * taken:
5590 * - close the observation interval at the last (previous)
5591 * request dispatch or completion
5592 * - compute rate, if possible, for that observation interval
5593 * - reset to zero samples, which will trigger a proper
5594 * re-initialization of the observation interval on next
5595 * dispatch
5596 */
5597 if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
5598 (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
5599 1UL<<(BFQ_RATE_SHIFT - 10))
5600 bfq_update_rate_reset(bfqd, NULL);
5601 bfqd->last_completion = now_ns;
5345 5602
5346 /* 5603 /*
5347 * If this is the in-service queue, check if it needs to be expired, 5604 * If this is the in-service queue, check if it needs to be expired,
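The fixed-point comparison above boils down to: the gap since the previous completion exceeded roughly 1024 microseconds per sector of the largest request seen in the interval. A standalone sketch of the check (the constants mirror defines earlier in the file; the helper name is made up):

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdint.h>

    #define BFQ_RATE_SHIFT  16
    #define BFQ_MIN_TT      (2ULL * 1000 * 1000)  /* 2 ms, in ns */
    #define NSEC_PER_USEC   1000ULL

    static bool completion_too_slow(uint32_t last_rq_max_size,
                                    uint64_t delta_us)
    {
        return delta_us > BFQ_MIN_TT / NSEC_PER_USEC &&
               ((uint64_t)last_rq_max_size << BFQ_RATE_SHIFT) / delta_us <
                        1UL << (BFQ_RATE_SHIFT - 10);
    }

    int main(void)
    {
        /* For a 256-sector (128 KiB) request the check fires once the
         * completion gap exceeds about 256 * 1024 us ~= 262 ms. */
        printf("%d %d\n",
               completion_too_slow(256, 200000),   /* 0: still plausible */
               completion_too_slow(256, 300000));  /* 1: interval discarded */
        return 0;
    }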
@@ -5799,16 +6056,6 @@ USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
5799 UINT_MAX); 6056 UINT_MAX);
5800#undef USEC_STORE_FUNCTION 6057#undef USEC_STORE_FUNCTION
5801 6058
5802static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
5803{
5804 u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout);
5805
5806 if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
5807 return bfq_calc_max_budget(bfqd->peak_rate, timeout);
5808 else
5809 return bfq_default_max_budget;
5810}
5811
5812static ssize_t bfq_max_budget_store(struct elevator_queue *e, 6059static ssize_t bfq_max_budget_store(struct elevator_queue *e,
5813 const char *page, size_t count) 6060 const char *page, size_t count)
5814{ 6061{
@@ -5817,7 +6064,7 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e,
5817 int ret = bfq_var_store(&__data, (page), count); 6064 int ret = bfq_var_store(&__data, (page), count);
5818 6065
5819 if (__data == 0) 6066 if (__data == 0)
5820 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); 6067 bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
5821 else { 6068 else {
5822 if (__data > INT_MAX) 6069 if (__data > INT_MAX)
5823 __data = INT_MAX; 6070 __data = INT_MAX;
@@ -5847,7 +6094,7 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
5847 6094
5848 bfqd->bfq_timeout = msecs_to_jiffies(__data); 6095 bfqd->bfq_timeout = msecs_to_jiffies(__data);
5849 if (bfqd->bfq_user_max_budget == 0) 6096 if (bfqd->bfq_user_max_budget == 0)
5850 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); 6097 bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
5851 6098
5852 return ret; 6099 return ret;
5853} 6100}