path: root/block
author    Josef Bacik <josef@toxicpanda.com>    2018-09-28 13:45:42 -0400
committer Jens Axboe <axboe@kernel.dk>          2018-09-28 13:47:31 -0400
commit    1fa2840e56f9032e14a75fcf67edfe0f21102e4b (patch)
tree      8a4fa641b207b2dd864cd62302ccb91f582ec74e /block
parent    22ed8a93adc7a9cbb2c0a0fc1d7f10068a1f84c1 (diff)
blk-iolatency: use a percentile approach for ssd's
We use an average latency approach for determining if we're missing our latency target. This works well for rotational storage, where latencies are generally consistent, but SSDs and other low-latency devices show spikier behavior, which means we often won't throttle misbehaving groups because a lot of IO completes drastically faster than our latency target. Instead, keep track of how many IOs miss our target and how many IOs are done in our time window. If the p(90) latency is above our target then we know we need to throttle. With this change in place we see the same throttling behavior with our testcase on SSDs as we see with rotational drives.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
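For illustration only (not part of the patch): the per-window check described above boils down to treating a window as over target once at least 10% of its IOs, and at least one IO, missed the latency target, i.e. the p(90) latency exceeds the target. A minimal userspace sketch of that test, with hypothetical names (window_ok, total, missed), might look like the following; the patch itself implements this in latency_sum_ok() below using div64_u64() and max().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative sketch of the percentile check: the window is "ok" unless
 * the number of IOs that missed the latency target reaches 10% of the IOs
 * completed in that window (at least one).
 */
static bool window_ok(uint64_t total, uint64_t missed)
{
        uint64_t thresh = total / 10;

        if (thresh < 1)
                thresh = 1;
        return missed < thresh;
}

int main(void)
{
        /* 200 IOs in the window, 25 slower than the target -> not ok, throttle. */
        printf("ok=%d\n", window_ok(200, 25));
        return 0;
}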
Diffstat (limited to 'block')
-rw-r--r--   block/blk-iolatency.c   179
1 file changed, 145 insertions(+), 34 deletions(-)
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index e7be77b0ce8b..fd246805b0be 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -115,9 +115,21 @@ struct child_latency_info {
 	atomic_t scale_cookie;
 };
 
+struct percentile_stats {
+	u64 total;
+	u64 missed;
+};
+
+struct latency_stat {
+	union {
+		struct percentile_stats ps;
+		struct blk_rq_stat rqs;
+	};
+};
+
 struct iolatency_grp {
 	struct blkg_policy_data pd;
-	struct blk_rq_stat __percpu *stats;
+	struct latency_stat __percpu *stats;
 	struct blk_iolatency *blkiolat;
 	struct rq_depth rq_depth;
 	struct rq_wait rq_wait;
@@ -132,6 +144,7 @@ struct iolatency_grp {
 	/* Our current number of IO's for the last summation. */
 	u64 nr_samples;
 
+	bool ssd;
 	struct child_latency_info child_lat;
 };
 
@@ -172,6 +185,80 @@ static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
 	return pd_to_blkg(&iolat->pd);
 }
 
+static inline void latency_stat_init(struct iolatency_grp *iolat,
+				     struct latency_stat *stat)
+{
+	if (iolat->ssd) {
+		stat->ps.total = 0;
+		stat->ps.missed = 0;
+	} else
+		blk_rq_stat_init(&stat->rqs);
+}
+
+static inline void latency_stat_sum(struct iolatency_grp *iolat,
+				    struct latency_stat *sum,
+				    struct latency_stat *stat)
+{
+	if (iolat->ssd) {
+		sum->ps.total += stat->ps.total;
+		sum->ps.missed += stat->ps.missed;
+	} else
+		blk_rq_stat_sum(&sum->rqs, &stat->rqs);
+}
+
+static inline void latency_stat_record_time(struct iolatency_grp *iolat,
+					    u64 req_time)
+{
+	struct latency_stat *stat = get_cpu_ptr(iolat->stats);
+	if (iolat->ssd) {
+		if (req_time >= iolat->min_lat_nsec)
+			stat->ps.missed++;
+		stat->ps.total++;
+	} else
+		blk_rq_stat_add(&stat->rqs, req_time);
+	put_cpu_ptr(stat);
+}
+
+static inline bool latency_sum_ok(struct iolatency_grp *iolat,
+				  struct latency_stat *stat)
+{
+	if (iolat->ssd) {
+		u64 thresh = div64_u64(stat->ps.total, 10);
+		thresh = max(thresh, 1ULL);
+		return stat->ps.missed < thresh;
+	}
+	return stat->rqs.mean <= iolat->min_lat_nsec;
+}
+
+static inline u64 latency_stat_samples(struct iolatency_grp *iolat,
+				       struct latency_stat *stat)
+{
+	if (iolat->ssd)
+		return stat->ps.total;
+	return stat->rqs.nr_samples;
+}
+
+static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
+					      struct latency_stat *stat)
+{
+	int exp_idx;
+
+	if (iolat->ssd)
+		return;
+
+	/*
+	 * CALC_LOAD takes in a number stored in fixed point representation.
+	 * Because we are using this for IO time in ns, the values stored
+	 * are significantly larger than the FIXED_1 denominator (2048).
+	 * Therefore, rounding errors in the calculation are negligible and
+	 * can be ignored.
+	 */
+	exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
+			div64_u64(iolat->cur_win_nsec,
+				  BLKIOLATENCY_EXP_BUCKET_SIZE));
+	CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat->rqs.mean);
+}
+
 static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
 				       wait_queue_entry_t *wait,
 				       bool first_block)
@@ -418,7 +505,6 @@ static void iolatency_record_time(struct iolatency_grp *iolat,
 				  struct bio_issue *issue, u64 now,
 				  bool issue_as_root)
 {
-	struct blk_rq_stat *rq_stat;
 	u64 start = bio_issue_time(issue);
 	u64 req_time;
 
@@ -444,9 +530,7 @@ static void iolatency_record_time(struct iolatency_grp *iolat,
 		return;
 	}
 
-	rq_stat = get_cpu_ptr(iolat->stats);
-	blk_rq_stat_add(rq_stat, req_time);
-	put_cpu_ptr(rq_stat);
+	latency_stat_record_time(iolat, req_time);
 }
 
 #define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
@@ -457,17 +541,17 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
 	struct blkcg_gq *blkg = lat_to_blkg(iolat);
 	struct iolatency_grp *parent;
 	struct child_latency_info *lat_info;
-	struct blk_rq_stat stat;
+	struct latency_stat stat;
 	unsigned long flags;
-	int cpu, exp_idx;
+	int cpu;
 
-	blk_rq_stat_init(&stat);
+	latency_stat_init(iolat, &stat);
 	preempt_disable();
 	for_each_online_cpu(cpu) {
-		struct blk_rq_stat *s;
+		struct latency_stat *s;
 		s = per_cpu_ptr(iolat->stats, cpu);
-		blk_rq_stat_sum(&stat, s);
-		blk_rq_stat_init(s);
+		latency_stat_sum(iolat, &stat, s);
+		latency_stat_init(iolat, s);
 	}
 	preempt_enable();
 
@@ -477,41 +561,33 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
 
 	lat_info = &parent->child_lat;
 
-	/*
-	 * CALC_LOAD takes in a number stored in fixed point representation.
-	 * Because we are using this for IO time in ns, the values stored
-	 * are significantly larger than the FIXED_1 denominator (2048).
-	 * Therefore, rounding errors in the calculation are negligible and
-	 * can be ignored.
-	 */
-	exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
-			div64_u64(iolat->cur_win_nsec,
-				  BLKIOLATENCY_EXP_BUCKET_SIZE));
-	CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean);
+	iolat_update_total_lat_avg(iolat, &stat);
 
 	/* Everything is ok and we don't need to adjust the scale. */
-	if (stat.mean <= iolat->min_lat_nsec &&
+	if (latency_sum_ok(iolat, &stat) &&
 	    atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
 		return;
 
 	/* Somebody beat us to the punch, just bail. */
 	spin_lock_irqsave(&lat_info->lock, flags);
 	lat_info->nr_samples -= iolat->nr_samples;
-	lat_info->nr_samples += stat.nr_samples;
-	iolat->nr_samples = stat.nr_samples;
+	lat_info->nr_samples += latency_stat_samples(iolat, &stat);
+	iolat->nr_samples = latency_stat_samples(iolat, &stat);
 
 	if ((lat_info->last_scale_event >= now ||
 	    now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) &&
 	    lat_info->scale_lat <= iolat->min_lat_nsec)
 		goto out;
 
-	if (stat.mean <= iolat->min_lat_nsec &&
-	    stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) {
+	if (latency_sum_ok(iolat, &stat)) {
+		if (latency_stat_samples(iolat, &stat) <
+		    BLKIOLATENCY_MIN_GOOD_SAMPLES)
+			goto out;
 		if (lat_info->scale_grp == iolat) {
 			lat_info->last_scale_event = now;
 			scale_cookie_change(iolat->blkiolat, lat_info, true);
 		}
-	} else if (stat.mean > iolat->min_lat_nsec) {
+	} else {
 		lat_info->last_scale_event = now;
 		if (!lat_info->scale_grp ||
 		    lat_info->scale_lat > iolat->min_lat_nsec) {
@@ -808,13 +884,43 @@ static int iolatency_print_limit(struct seq_file *sf, void *v)
 	return 0;
 }
 
+static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf,
+				 size_t size)
+{
+	struct latency_stat stat;
+	int cpu;
+
+	latency_stat_init(iolat, &stat);
+	preempt_disable();
+	for_each_online_cpu(cpu) {
+		struct latency_stat *s;
+		s = per_cpu_ptr(iolat->stats, cpu);
+		latency_stat_sum(iolat, &stat, s);
+	}
+	preempt_enable();
+
+	if (iolat->rq_depth.max_depth == UINT_MAX)
+		return scnprintf(buf, size, " missed=%llu total=%llu depth=max",
+				 (unsigned long long)stat.ps.missed,
+				 (unsigned long long)stat.ps.total);
+	return scnprintf(buf, size, " missed=%llu total=%llu depth=%u",
+			 (unsigned long long)stat.ps.missed,
+			 (unsigned long long)stat.ps.total,
+			 iolat->rq_depth.max_depth);
+}
+
 static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
 				size_t size)
 {
 	struct iolatency_grp *iolat = pd_to_lat(pd);
-	unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
-	unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
+	unsigned long long avg_lat;
+	unsigned long long cur_win;
+
+	if (iolat->ssd)
+		return iolatency_ssd_stat(iolat, buf, size);
 
+	avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
+	cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
 	if (iolat->rq_depth.max_depth == UINT_MAX)
 		return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
 				 avg_lat, cur_win);
@@ -831,8 +937,8 @@ static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
 	iolat = kzalloc_node(sizeof(*iolat), gfp, node);
 	if (!iolat)
 		return NULL;
-	iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat),
-					  __alignof__(struct blk_rq_stat), gfp);
+	iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
+					  __alignof__(struct latency_stat), gfp);
 	if (!iolat->stats) {
 		kfree(iolat);
 		return NULL;
@@ -849,10 +955,15 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
 	u64 now = ktime_to_ns(ktime_get());
 	int cpu;
 
+	if (blk_queue_nonrot(blkg->q))
+		iolat->ssd = true;
+	else
+		iolat->ssd = false;
+
 	for_each_possible_cpu(cpu) {
-		struct blk_rq_stat *stat;
+		struct latency_stat *stat;
 		stat = per_cpu_ptr(iolat->stats, cpu);
-		blk_rq_stat_init(stat);
+		latency_stat_init(iolat, stat);
 	}
 
 	rq_wait_init(&iolat->rq_wait);