author		Vivek Goyal <vgoyal@redhat.com>		2010-09-15 17:06:37 -0400
committer	Jens Axboe <jaxboe@fusionio.com>	2010-09-16 02:44:00 -0400
commit		8e89d13f4ede2467629a971618537430fafaaea3
tree		90a79a2997f597715e3ad7edeea507fdb6223882
parent		7702e8f45b0a3bb262b9366c60beb5445758d94c
blkio: Implementation of IOPS limit logic
o Core logic of the IOPS throttling implementation: a group can now be limited by IO operations per second in addition to bytes per second.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
 block/blk-throttle.c | 164 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 127 insertions(+), 37 deletions(-)
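For orientation before the diff: the new tg_with_in_iops_limit() admits a bio when the group has dispatched fewer IOs than its iops limit allows for the elapsed (rounded-up) slice time, and otherwise reports roughly how many jiffies the bio must wait. The standalone userspace sketch below reproduces that arithmetic with made-up numbers; HZ, the slice length, and the sample limit are assumptions for illustration, not values taken from the patch.

/* Userspace sketch of the check done by tg_with_in_iops_limit().
 * Assumed for illustration only: HZ=1000, a 100-jiffy slice, a limit of
 * 100 IOPS, 12 bios already dispatched, 30 jiffies into the slice.
 */
#include <stdio.h>

#define HZ		1000UL	/* jiffies per second (assumed) */
#define THROTL_SLICE	100UL	/* slice length in jiffies (assumed) */

static unsigned long roundup_to(unsigned long v, unsigned long m)
{
	return ((v + m - 1) / m) * m;
}

int main(void)
{
	unsigned long iops = 100;		/* group IOPS limit */
	unsigned long io_disp = 12;		/* bios dispatched this slice */
	unsigned long jiffy_elapsed = 30;	/* time since slice start */
	unsigned long jiffy_elapsed_rnd, io_allowed, jiffy_wait;

	/* Round the elapsed time up to a whole number of slices. */
	jiffy_elapsed_rnd = jiffy_elapsed ? jiffy_elapsed : THROTL_SLICE;
	jiffy_elapsed_rnd = roundup_to(jiffy_elapsed_rnd, THROTL_SLICE);

	/* Allowance: limit * elapsed time in seconds (computed via ms). */
	io_allowed = (iops * (jiffy_elapsed_rnd * 1000 / HZ)) / 1000;

	if (io_disp + 1 <= io_allowed) {
		printf("dispatch now: %lu allowed, %lu used\n",
		       io_allowed, io_disp);
		return 0;
	}

	/* Otherwise estimate when the (io_disp + 1)-th IO fits the rate. */
	jiffy_wait = ((io_disp + 1) * HZ) / iops + 1;
	jiffy_wait = jiffy_wait > jiffy_elapsed ? jiffy_wait - jiffy_elapsed : 1;
	printf("throttle: wait ~%lu jiffies\n", jiffy_wait);
	return 0;
}

With these inputs the rounded-up slice allows 10 IOs, so the 13th bio is held for about 101 jiffies, matching the "Calc approx time to dispatch" step in the patch.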
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 4b492011e0de..af53f37c1b13 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -59,8 +59,13 @@ struct throtl_grp {
 	/* bytes per second rate limits */
 	uint64_t bps[2];
 
+	/* IOPS limits */
+	unsigned int iops[2];
+
 	/* Number of bytes disptached in current slice */
 	uint64_t bytes_disp[2];
+	/* Number of bio's dispatched in current slice */
+	unsigned int io_disp[2];
 
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
@@ -194,6 +199,8 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
 
 	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
 	tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
+	tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
+	tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
 
 	hlist_add_head(&tg->tg_node, &td->tg_list);
 	td->nr_undestroyed_grps++;
@@ -335,6 +342,7 @@ static inline void
 throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 {
 	tg->bytes_disp[rw] = 0;
+	tg->io_disp[rw] = 0;
 	tg->slice_start[rw] = jiffies;
 	tg->slice_end[rw] = jiffies + throtl_slice;
 	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
@@ -365,7 +373,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 static inline void
 throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 {
-	unsigned long nr_slices, bytes_trim, time_elapsed;
+	unsigned long nr_slices, bytes_trim, time_elapsed, io_trim;
 
 	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
 
@@ -385,8 +393,9 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 		return;
 
 	bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ;
+	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
 
-	if (!bytes_trim)
+	if (!bytes_trim && !io_trim)
 		return;
 
 	if (tg->bytes_disp[rw] >= bytes_trim)
@@ -394,51 +403,62 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 	else
 		tg->bytes_disp[rw] = 0;
 
+	if (tg->io_disp[rw] >= io_trim)
+		tg->io_disp[rw] -= io_trim;
+	else
+		tg->io_disp[rw] = 0;
+
 	tg->slice_start[rw] += nr_slices * throtl_slice;
 
-	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu"
+	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu io=%lu"
 			" start=%lu end=%lu jiffies=%lu",
-			rw == READ ? 'R' : 'W', nr_slices, bytes_trim,
+			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
 			tg->slice_start[rw], tg->slice_end[rw], jiffies);
 }
 
-/*
- * Returns whether one can dispatch a bio or not. Also returns approx number
- * of jiffies to wait before this bio is with-in IO rate and can be dispatched
- */
-static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
-		struct bio *bio, unsigned long *wait)
+static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
+		struct bio *bio, unsigned long *wait)
 {
 	bool rw = bio_data_dir(bio);
-	u64 bytes_allowed, extra_bytes;
+	unsigned int io_allowed;
 	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 
-	/*
-	 * Currently whole state machine of group depends on first bio
-	 * queued in the group bio list. So one should not be calling
-	 * this function with a different bio if there are other bios
-	 * queued.
-	 */
-	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
+	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
 
-	/* If tg->bps = -1, then BW is unlimited */
-	if (tg->bps[rw] == -1) {
+	/* Slice has just started. Consider one slice interval */
+	if (!jiffy_elapsed)
+		jiffy_elapsed_rnd = throtl_slice;
+
+	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+
+	io_allowed = (tg->iops[rw] * jiffies_to_msecs(jiffy_elapsed_rnd))
+				/ MSEC_PER_SEC;
+
+	if (tg->io_disp[rw] + 1 <= io_allowed) {
 		if (wait)
 			*wait = 0;
 		return 1;
 	}
 
-	/*
-	 * If previous slice expired, start a new one otherwise renew/extend
-	 * existing slice to make sure it is at least throtl_slice interval
-	 * long since now.
-	 */
-	if (throtl_slice_used(td, tg, rw))
-		throtl_start_new_slice(td, tg, rw);
-	else {
-		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
-			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
-	}
+	/* Calc approx time to dispatch */
+	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
+
+	if (jiffy_wait > jiffy_elapsed)
+		jiffy_wait = jiffy_wait - jiffy_elapsed;
+	else
+		jiffy_wait = 1;
+
+	if (wait)
+		*wait = jiffy_wait;
+	return 0;
+}
+
+static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
+		struct bio *bio, unsigned long *wait)
+{
+	bool rw = bio_data_dir(bio);
+	u64 bytes_allowed, extra_bytes;
+	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 
 	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
 
@@ -469,12 +489,62 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
 	 * up we did. Add that time also.
 	 */
 	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
-
 	if (wait)
 		*wait = jiffy_wait;
+	return 0;
+}
+
+/*
+ * Returns whether one can dispatch a bio or not. Also returns approx number
+ * of jiffies to wait before this bio is with-in IO rate and can be dispatched
+ */
+static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
+		struct bio *bio, unsigned long *wait)
+{
+	bool rw = bio_data_dir(bio);
+	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
+
+	/*
+	 * Currently whole state machine of group depends on first bio
+	 * queued in the group bio list. So one should not be calling
+	 * this function with a different bio if there are other bios
+	 * queued.
+	 */
+	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
 
-	if (time_before(tg->slice_end[rw], jiffies + jiffy_wait))
-		throtl_extend_slice(td, tg, rw, jiffies + jiffy_wait);
+	/* If tg->bps = -1, then BW is unlimited */
+	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
+		if (wait)
+			*wait = 0;
+		return 1;
+	}
+
+	/*
+	 * If previous slice expired, start a new one otherwise renew/extend
+	 * existing slice to make sure it is at least throtl_slice interval
+	 * long since now.
+	 */
+	if (throtl_slice_used(td, tg, rw))
+		throtl_start_new_slice(td, tg, rw);
+	else {
+		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
+			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
+	}
+
+	if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
+		&& tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
+		if (wait)
+			*wait = 0;
+		return 1;
+	}
+
+	max_wait = max(bps_wait, iops_wait);
+
+	if (wait)
+		*wait = max_wait;
+
+	if (time_before(tg->slice_end[rw], jiffies + max_wait))
+		throtl_extend_slice(td, tg, rw, jiffies + max_wait);
 
 	return 0;
 }
@@ -486,13 +556,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 
 	/* Charge the bio to the group */
 	tg->bytes_disp[rw] += bio->bi_size;
+	tg->io_disp[rw]++;
 
 	/*
 	 * TODO: This will take blkg->stats_lock. Figure out a way
 	 * to avoid this cost.
 	 */
 	blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
-
 }
 
 static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -763,6 +833,18 @@ static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg,
 	tg_of_blkg(blkg)->bps[WRITE] = write_bps;
 }
 
+static void throtl_update_blkio_group_read_iops (struct blkio_group *blkg,
+			unsigned int read_iops)
+{
+	tg_of_blkg(blkg)->iops[READ] = read_iops;
+}
+
+static void throtl_update_blkio_group_write_iops (struct blkio_group *blkg,
+			unsigned int write_iops)
+{
+	tg_of_blkg(blkg)->iops[WRITE] = write_iops;
+}
+
 void throtl_shutdown_timer_wq(struct request_queue *q)
 {
 	struct throtl_data *td = q->td;
@@ -777,7 +859,12 @@ static struct blkio_policy_type blkio_policy_throtl = {
 					throtl_update_blkio_group_read_bps,
 		.blkio_update_group_write_bps_fn =
 					throtl_update_blkio_group_write_bps,
+		.blkio_update_group_read_iops_fn =
+					throtl_update_blkio_group_read_iops,
+		.blkio_update_group_write_iops_fn =
+					throtl_update_blkio_group_write_iops,
 	},
+	.plid = BLKIO_POLICY_THROTL,
 };
 
 int blk_throtl_bio(struct request_queue *q, struct bio **biop)
@@ -811,9 +898,11 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 	}
 
 queue_bio:
-	throtl_log_tg(td, tg, "[%c] bio. disp=%u sz=%u bps=%llu"
-			" queued=%d/%d", rw == READ ? 'R' : 'W',
+	throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu"
+			" iodisp=%u iops=%u queued=%d/%d",
+			rw == READ ? 'R' : 'W',
 			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
+			tg->io_disp[rw], tg->iops[rw],
 			tg->nr_queued[READ], tg->nr_queued[WRITE]);
 
 	throtl_add_bio_tg(q->td, tg, bio);
@@ -850,6 +939,7 @@ int blk_throtl_init(struct request_queue *q)
 
 	/* Practically unlimited BW */
 	tg->bps[0] = tg->bps[1] = -1;
+	tg->iops[0] = tg->iops[1] = -1;
 	atomic_set(&tg->ref, 1);
 
 	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);