diff options
-rw-r--r-- | mm/page-writeback.c | 191 |
1 files changed, 190 insertions, 1 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0e6dd5c2ed31..c16ddd8f5cb6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -46,6 +46,8 @@ | |||
46 | */ | 46 | */ |
47 | #define BANDWIDTH_INTERVAL max(HZ/5, 1) | 47 | #define BANDWIDTH_INTERVAL max(HZ/5, 1) |
48 | 48 | ||
49 | #define RATELIMIT_CALC_SHIFT 10 | ||
50 | |||
49 | /* | 51 | /* |
50 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited | 52 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited |
51 | * will look to see if it needs to force writeback or throttling. | 53 | * will look to see if it needs to force writeback or throttling. |
@@ -411,6 +413,12 @@ unsigned long determine_dirtyable_memory(void) | |||
411 | return x + 1; /* Ensure that we never return 0 */ | 413 | return x + 1; /* Ensure that we never return 0 */ |
412 | } | 414 | } |
413 | 415 | ||
/*
 * Midpoint between the dirty threshold and the background threshold.
 * balance_dirty_pages() lets a task go on without throttling while the
 * number of dirty pages is at or below this value.
 */
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
					   unsigned long bg_thresh)
{
	unsigned long sum = thresh + bg_thresh;

	return sum >> 1;
}
421 | |||
414 | static unsigned long hard_dirty_limit(unsigned long thresh) | 422 | static unsigned long hard_dirty_limit(unsigned long thresh) |
415 | { | 423 | { |
416 | return max(thresh, global_dirty_limit); | 424 | return max(thresh, global_dirty_limit); |
@@ -495,6 +503,184 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) | |||
495 | return bdi_dirty; | 503 | return bdi_dirty; |
496 | } | 504 | } |
497 | 505 | ||
506 | /* | ||
507 | * Dirty position control. | ||
508 | * | ||
509 | * (o) global/bdi setpoints | ||
510 | * | ||
511 | * We want the dirty pages to be balanced around the global/bdi setpoints. | ||
512 | * When the number of dirty pages is higher/lower than the setpoint, the | ||
513 | * dirty position control ratio (and hence task dirty ratelimit) will be | ||
514 | * decreased/increased to bring the dirty pages back to the setpoint. | ||
515 | * | ||
516 | * pos_ratio = 1 << RATELIMIT_CALC_SHIFT | ||
517 | * | ||
518 | * if (dirty < setpoint) scale up pos_ratio | ||
519 | * if (dirty > setpoint) scale down pos_ratio | ||
520 | * | ||
521 | * if (bdi_dirty < bdi_setpoint) scale up pos_ratio | ||
522 | * if (bdi_dirty > bdi_setpoint) scale down pos_ratio | ||
523 | * | ||
524 | * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT | ||
525 | * | ||
526 | * (o) global control line | ||
527 | * | ||
528 | *     ^ pos_ratio | ||
529 | *     | | ||
530 | *     |            |<===== global dirty control scope ======>| | ||
531 | * 2.0 .............* | ||
532 | *     |            .* | ||
533 | *     |            . * | ||
534 | *     |            .   * | ||
535 | *     |            .     * | ||
536 | *     |            .        * | ||
537 | *     |            .            * | ||
538 | * 1.0 ................................* | ||
539 | *     |            .                  .     * | ||
540 | *     |            .                  .          * | ||
541 | *     |            .                  .              * | ||
542 | *     |            .                  .                 * | ||
543 | *     |            .                  .                    * | ||
544 | *   0 +------------.------------------.----------------------*-------------> | ||
545 | *           freerun^          setpoint^                 limit^   dirty pages | ||
546 | * | ||
547 | * (o) bdi control line | ||
548 | * | ||
549 | *     ^ pos_ratio | ||
550 | *     | | ||
551 | *     |            * | ||
552 | *     |              * | ||
553 | *     |                * | ||
554 | *     |                  * | ||
555 | *     |                    * |<=========== span ============>| | ||
556 | * 1.0 .......................* | ||
557 | *     |                      . * | ||
558 | *     |                      .   * | ||
559 | *     |                      .     * | ||
560 | *     |                      .       * | ||
561 | *     |                      .         * | ||
562 | *     |                      .           * | ||
563 | *     |                      .             * | ||
564 | *     |                      .               * | ||
565 | *     |                      .                 * | ||
566 | *     |                      .                   * | ||
567 | *     |                      .                     * | ||
568 | * 1/4 ...............................................* * * * * * * * * * * * | ||
569 | *     |                      .                         . | ||
570 | *     |                      .                           . | ||
571 | *     |                      .                             . | ||
572 | *   0 +----------------------.-------------------------------.-------------> | ||
573 | *                bdi_setpoint^                    x_intercept^ | ||
574 | * | ||
575 | * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can | ||
576 | * be smoothly throttled down to normal if it starts high in situations like | ||
577 | * - start writing to a slow SD card and a fast disk at the same time. The SD | ||
578 | * card's bdi_dirty may rush to many times higher than bdi_setpoint. | ||
579 | * - the bdi dirty thresh drops quickly due to change of JBOD workload | ||
580 | */ | ||
581 | static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, | ||
582 | unsigned long thresh, | ||
583 | unsigned long bg_thresh, | ||
584 | unsigned long dirty, | ||
585 | unsigned long bdi_thresh, | ||
586 | unsigned long bdi_dirty) | ||
587 | { | ||
588 | unsigned long write_bw = bdi->avg_write_bandwidth; | ||
589 | unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); | ||
590 | unsigned long limit = hard_dirty_limit(thresh); | ||
591 | unsigned long x_intercept; | ||
592 | unsigned long setpoint; /* dirty pages' target balance point */ | ||
593 | unsigned long bdi_setpoint; | ||
594 | unsigned long span; | ||
595 | long long pos_ratio; /* for scaling up/down the rate limit */ | ||
596 | long x; | ||
597 | |||
598 | if (unlikely(dirty >= limit)) | ||
599 | return 0; | ||
600 | |||
601 | /* | ||
602 | * global setpoint | ||
603 | * | ||
604 | * setpoint - dirty 3 | ||
605 | * f(dirty) := 1.0 + (----------------) | ||
606 | * limit - setpoint | ||
607 | * | ||
608 | * it's a 3rd order polynomial that subjects to | ||
609 | * | ||
610 | * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast | ||
611 | * (2) f(setpoint) = 1.0 => the balance point | ||
612 | * (3) f(limit) = 0 => the hard limit | ||
613 | * (4) df/dx <= 0 => negative feedback control | ||
614 | * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) | ||
615 | * => fast response on large errors; small oscillation near setpoint | ||
616 | */ | ||
617 | setpoint = (freerun + limit) / 2; | ||
618 | x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT, | ||
619 | limit - setpoint + 1); | ||
620 | pos_ratio = x; | ||
621 | pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; | ||
622 | pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; | ||
623 | pos_ratio += 1 << RATELIMIT_CALC_SHIFT; | ||
624 | |||
625 | /* | ||
626 | * We have computed basic pos_ratio above based on global situation. If | ||
627 | * the bdi is over/under its share of dirty pages, we want to scale | ||
628 | * pos_ratio further down/up. That is done by the following mechanism. | ||
629 | */ | ||
630 | |||
631 | /* | ||
632 | * bdi setpoint | ||
633 | * | ||
634 | * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint) | ||
635 | * | ||
636 | * x_intercept - bdi_dirty | ||
637 | * := -------------------------- | ||
638 | * x_intercept - bdi_setpoint | ||
639 | * | ||
640 | * The main bdi control line is a linear function that subjects to | ||
641 | * | ||
642 | * (1) f(bdi_setpoint) = 1.0 | ||
643 | * (2) k = - 1 / (8 * write_bw) (in single bdi case) | ||
644 | * or equally: x_intercept = bdi_setpoint + 8 * write_bw | ||
645 | * | ||
646 | * For single bdi case, the dirty pages are observed to fluctuate | ||
647 | * regularly within range | ||
648 | * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2] | ||
649 | * for various filesystems, where (2) can yield in a reasonable 12.5% | ||
650 | * fluctuation range for pos_ratio. | ||
651 | * | ||
652 | * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its | ||
653 | * own size, so move the slope over accordingly and choose a slope that | ||
654 | * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh. | ||
655 | */ | ||
656 | if (unlikely(bdi_thresh > thresh)) | ||
657 | bdi_thresh = thresh; | ||
658 | /* | ||
659 | * scale global setpoint to bdi's: | ||
660 | * bdi_setpoint = setpoint * bdi_thresh / thresh | ||
661 | */ | ||
662 | x = div_u64((u64)bdi_thresh << 16, thresh + 1); | ||
663 | bdi_setpoint = setpoint * (u64)x >> 16; | ||
664 | /* | ||
665 | * Use span=(8*write_bw) in single bdi case as indicated by | ||
666 | * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case. | ||
667 | * | ||
668 | * bdi_thresh thresh - bdi_thresh | ||
669 | * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh | ||
670 | * thresh thresh | ||
671 | */ | ||
672 | span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16; | ||
673 | x_intercept = bdi_setpoint + span; | ||
674 | |||
675 | if (bdi_dirty < x_intercept - span / 4) { | ||
676 | pos_ratio *= x_intercept - bdi_dirty; | ||
677 | do_div(pos_ratio, x_intercept - bdi_setpoint + 1); | ||
678 | } else | ||
679 | pos_ratio /= 4; | ||
680 | |||
681 | return pos_ratio; | ||
682 | } | ||
683 | |||
498 | static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, | 684 | static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, |
499 | unsigned long elapsed, | 685 | unsigned long elapsed, |
500 | unsigned long written) | 686 | unsigned long written) |
@@ -655,6 +841,7 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
655 | unsigned long nr_reclaimable, bdi_nr_reclaimable; | 841 | unsigned long nr_reclaimable, bdi_nr_reclaimable; |
656 | unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ | 842 | unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ |
657 | unsigned long bdi_dirty; | 843 | unsigned long bdi_dirty; |
844 | unsigned long freerun; | ||
658 | unsigned long background_thresh; | 845 | unsigned long background_thresh; |
659 | unsigned long dirty_thresh; | 846 | unsigned long dirty_thresh; |
660 | unsigned long bdi_thresh; | 847 | unsigned long bdi_thresh; |
@@ -679,7 +866,9 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
679 | * catch-up. This avoids (excessively) small writeouts | 866 | * catch-up. This avoids (excessively) small writeouts |
680 | * when the bdi limits are ramping up. | 867 | * when the bdi limits are ramping up. |
681 | */ | 868 | */ |
682 | if (nr_dirty <= (background_thresh + dirty_thresh) / 2) | 869 | freerun = dirty_freerun_ceiling(dirty_thresh, |
870 | background_thresh); | ||
871 | if (nr_dirty <= freerun) | ||
683 | break; | 872 | break; |
684 | 873 | ||
685 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 874 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |