about summary refs log tree commit diff stats
path: root/block/blk-throttle.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/blk-throttle.c')
-rw-r--r-- block/blk-throttle.c 164
1 files changed, 127 insertions, 37 deletions
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 4b492011e0d..af53f37c1b1 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -59,8 +59,13 @@ struct throtl_grp {
59 /* bytes per second rate limits */ 59 /* bytes per second rate limits */
60 uint64_t bps[2]; 60 uint64_t bps[2];
61 61
62 /* IOPS limits */
63 unsigned int iops[2];
64
62 /* Number of bytes dispatched in current slice */ 65 /* Number of bytes dispatched in current slice */
63 uint64_t bytes_disp[2]; 66 uint64_t bytes_disp[2];
67 /* Number of bio's dispatched in current slice */
68 unsigned int io_disp[2];
64 69
65 /* When did we start a new slice */ 70 /* When did we start a new slice */
66 unsigned long slice_start[2]; 71 unsigned long slice_start[2];
@@ -194,6 +199,8 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
194 199
195 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); 200 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
196 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); 201 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
202 tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
203 tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
197 204
198 hlist_add_head(&tg->tg_node, &td->tg_list); 205 hlist_add_head(&tg->tg_node, &td->tg_list);
199 td->nr_undestroyed_grps++; 206 td->nr_undestroyed_grps++;
@@ -335,6 +342,7 @@ static inline void
335throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) 342throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
336{ 343{
337 tg->bytes_disp[rw] = 0; 344 tg->bytes_disp[rw] = 0;
345 tg->io_disp[rw] = 0;
338 tg->slice_start[rw] = jiffies; 346 tg->slice_start[rw] = jiffies;
339 tg->slice_end[rw] = jiffies + throtl_slice; 347 tg->slice_end[rw] = jiffies + throtl_slice;
340 throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", 348 throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
@@ -365,7 +373,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
365static inline void 373static inline void
366throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) 374throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
367{ 375{
368 unsigned long nr_slices, bytes_trim, time_elapsed; 376 unsigned long nr_slices, bytes_trim, time_elapsed, io_trim;
369 377
370 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); 378 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
371 379
@@ -385,8 +393,9 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
385 return; 393 return;
386 394
387 bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ; 395 bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ;
396 io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
388 397
389 if (!bytes_trim) 398 if (!bytes_trim && !io_trim)
390 return; 399 return;
391 400
392 if (tg->bytes_disp[rw] >= bytes_trim) 401 if (tg->bytes_disp[rw] >= bytes_trim)
@@ -394,51 +403,62 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
394 else 403 else
395 tg->bytes_disp[rw] = 0; 404 tg->bytes_disp[rw] = 0;
396 405
406 if (tg->io_disp[rw] >= io_trim)
407 tg->io_disp[rw] -= io_trim;
408 else
409 tg->io_disp[rw] = 0;
410
397 tg->slice_start[rw] += nr_slices * throtl_slice; 411 tg->slice_start[rw] += nr_slices * throtl_slice;
398 412
399 throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu" 413 throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu io=%lu"
400 " start=%lu end=%lu jiffies=%lu", 414 " start=%lu end=%lu jiffies=%lu",
401 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, 415 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
402 tg->slice_start[rw], tg->slice_end[rw], jiffies); 416 tg->slice_start[rw], tg->slice_end[rw], jiffies);
403} 417}
404 418
405/* 419static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
406 * Returns whether one can dispatch a bio or not. Also returns approx number 420 struct bio *bio, unsigned long *wait)
407 * of jiffies to wait before this bio is with-in IO rate and can be dispatched
408 */
409static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
410 struct bio *bio, unsigned long *wait)
411{ 421{
412 bool rw = bio_data_dir(bio); 422 bool rw = bio_data_dir(bio);
413 u64 bytes_allowed, extra_bytes; 423 unsigned int io_allowed;
414 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 424 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
415 425
416 /* 426 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
417 * Currently whole state machine of group depends on first bio
418 * queued in the group bio list. So one should not be calling
419 * this function with a different bio if there are other bios
420 * queued.
421 */
422 BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
423 427
424 /* If tg->bps = -1, then BW is unlimited */ 428 /* Slice has just started. Consider one slice interval */
425 if (tg->bps[rw] == -1) { 429 if (!jiffy_elapsed)
430 jiffy_elapsed_rnd = throtl_slice;
431
432 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
433
434 io_allowed = (tg->iops[rw] * jiffies_to_msecs(jiffy_elapsed_rnd))
435 / MSEC_PER_SEC;
436
437 if (tg->io_disp[rw] + 1 <= io_allowed) {
426 if (wait) 438 if (wait)
427 *wait = 0; 439 *wait = 0;
428 return 1; 440 return 1;
429 } 441 }
430 442
431 /* 443 /* Calc approx time to dispatch */
432 * If previous slice expired, start a new one otherwise renew/extend 444 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
433 * existing slice to make sure it is at least throtl_slice interval 445
434 * long since now. 446 if (jiffy_wait > jiffy_elapsed)
435 */ 447 jiffy_wait = jiffy_wait - jiffy_elapsed;
436 if (throtl_slice_used(td, tg, rw)) 448 else
437 throtl_start_new_slice(td, tg, rw); 449 jiffy_wait = 1;
438 else { 450
439 if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) 451 if (wait)
440 throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); 452 *wait = jiffy_wait;
441 } 453 return 0;
454}
455
456static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
457 struct bio *bio, unsigned long *wait)
458{
459 bool rw = bio_data_dir(bio);
460 u64 bytes_allowed, extra_bytes;
461 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
442 462
443 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 463 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
444 464
@@ -469,12 +489,62 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
469 * up we did. Add that time also. 489 * up we did. Add that time also.
470 */ 490 */
471 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); 491 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
472
473 if (wait) 492 if (wait)
474 *wait = jiffy_wait; 493 *wait = jiffy_wait;
494 return 0;
495}
496
497/*
498 * Returns whether one can dispatch a bio or not. Also returns approx number
499 * of jiffies to wait before this bio is with-in IO rate and can be dispatched
500 */
501static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
502 struct bio *bio, unsigned long *wait)
503{
504 bool rw = bio_data_dir(bio);
505 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
506
507 /*
508 * Currently whole state machine of group depends on first bio
509 * queued in the group bio list. So one should not be calling
510 * this function with a different bio if there are other bios
511 * queued.
512 */
513 BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
475 514
476 if (time_before(tg->slice_end[rw], jiffies + jiffy_wait)) 515 /* If tg->bps = -1, then BW is unlimited */
477 throtl_extend_slice(td, tg, rw, jiffies + jiffy_wait); 516 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
517 if (wait)
518 *wait = 0;
519 return 1;
520 }
521
522 /*
523 * If previous slice expired, start a new one otherwise renew/extend
524 * existing slice to make sure it is at least throtl_slice interval
525 * long since now.
526 */
527 if (throtl_slice_used(td, tg, rw))
528 throtl_start_new_slice(td, tg, rw);
529 else {
530 if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
531 throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
532 }
533
534 if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
535 && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
536 if (wait)
537 *wait = 0;
538 return 1;
539 }
540
541 max_wait = max(bps_wait, iops_wait);
542
543 if (wait)
544 *wait = max_wait;
545
546 if (time_before(tg->slice_end[rw], jiffies + max_wait))
547 throtl_extend_slice(td, tg, rw, jiffies + max_wait);
478 548
479 return 0; 549 return 0;
480} 550}
@@ -486,13 +556,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
486 556
487 /* Charge the bio to the group */ 557 /* Charge the bio to the group */
488 tg->bytes_disp[rw] += bio->bi_size; 558 tg->bytes_disp[rw] += bio->bi_size;
559 tg->io_disp[rw]++;
489 560
490 /* 561 /*
491 * TODO: This will take blkg->stats_lock. Figure out a way 562 * TODO: This will take blkg->stats_lock. Figure out a way
492 * to avoid this cost. 563 * to avoid this cost.
493 */ 564 */
494 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); 565 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
495
496} 566}
497 567
498static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, 568static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -763,6 +833,18 @@ static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg,
763 tg_of_blkg(blkg)->bps[WRITE] = write_bps; 833 tg_of_blkg(blkg)->bps[WRITE] = write_bps;
764} 834}
765 835
836static void throtl_update_blkio_group_read_iops (struct blkio_group *blkg,
837 unsigned int read_iops)
838{
839 tg_of_blkg(blkg)->iops[READ] = read_iops;
840}
841
842static void throtl_update_blkio_group_write_iops (struct blkio_group *blkg,
843 unsigned int write_iops)
844{
845 tg_of_blkg(blkg)->iops[WRITE] = write_iops;
846}
847
766void throtl_shutdown_timer_wq(struct request_queue *q) 848void throtl_shutdown_timer_wq(struct request_queue *q)
767{ 849{
768 struct throtl_data *td = q->td; 850 struct throtl_data *td = q->td;
@@ -777,7 +859,12 @@ static struct blkio_policy_type blkio_policy_throtl = {
777 throtl_update_blkio_group_read_bps, 859 throtl_update_blkio_group_read_bps,
778 .blkio_update_group_write_bps_fn = 860 .blkio_update_group_write_bps_fn =
779 throtl_update_blkio_group_write_bps, 861 throtl_update_blkio_group_write_bps,
862 .blkio_update_group_read_iops_fn =
863 throtl_update_blkio_group_read_iops,
864 .blkio_update_group_write_iops_fn =
865 throtl_update_blkio_group_write_iops,
780 }, 866 },
867 .plid = BLKIO_POLICY_THROTL,
781}; 868};
782 869
783int blk_throtl_bio(struct request_queue *q, struct bio **biop) 870int blk_throtl_bio(struct request_queue *q, struct bio **biop)
@@ -811,9 +898,11 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
811 } 898 }
812 899
813queue_bio: 900queue_bio:
814 throtl_log_tg(td, tg, "[%c] bio. disp=%u sz=%u bps=%llu" 901 throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu"
815 " queued=%d/%d", rw == READ ? 'R' : 'W', 902 " iodisp=%u iops=%u queued=%d/%d",
903 rw == READ ? 'R' : 'W',
816 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], 904 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
905 tg->io_disp[rw], tg->iops[rw],
817 tg->nr_queued[READ], tg->nr_queued[WRITE]); 906 tg->nr_queued[READ], tg->nr_queued[WRITE]);
818 907
819 throtl_add_bio_tg(q->td, tg, bio); 908 throtl_add_bio_tg(q->td, tg, bio);
@@ -850,6 +939,7 @@ int blk_throtl_init(struct request_queue *q)
850 939
851 /* Practically unlimited BW */ 940 /* Practically unlimited BW */
852 tg->bps[0] = tg->bps[1] = -1; 941 tg->bps[0] = tg->bps[1] = -1;
942 tg->iops[0] = tg->iops[1] = -1;
853 atomic_set(&tg->ref, 1); 943 atomic_set(&tg->ref, 1);
854 944
855 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); 945 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);