aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--mm/page-writeback.c191
1 files changed, 190 insertions, 1 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0e6dd5c2ed31..c16ddd8f5cb6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -46,6 +46,8 @@
46 */ 46 */
47#define BANDWIDTH_INTERVAL max(HZ/5, 1) 47#define BANDWIDTH_INTERVAL max(HZ/5, 1)
48 48
49#define RATELIMIT_CALC_SHIFT 10
50
49/* 51/*
50 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 52 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
51 * will look to see if it needs to force writeback or throttling. 53 * will look to see if it needs to force writeback or throttling.
@@ -411,6 +413,12 @@ unsigned long determine_dirtyable_memory(void)
411 return x + 1; /* Ensure that we never return 0 */ 413 return x + 1; /* Ensure that we never return 0 */
412} 414}
413 415
/*
 * Upper bound of the "freerun" range: the midpoint between the background
 * threshold and the dirty threshold.
 *
 * @thresh:    dirty threshold, in pages
 * @bg_thresh: background dirty threshold, in pages
 *
 * Returns the (rounded down) average of the two thresholds.
 */
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
					   unsigned long bg_thresh)
{
	unsigned long sum = thresh + bg_thresh;

	return sum / 2;
}
421
414static unsigned long hard_dirty_limit(unsigned long thresh) 422static unsigned long hard_dirty_limit(unsigned long thresh)
415{ 423{
416 return max(thresh, global_dirty_limit); 424 return max(thresh, global_dirty_limit);
@@ -495,6 +503,184 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
495 return bdi_dirty; 503 return bdi_dirty;
496} 504}
497 505
506/*
507 * Dirty position control.
508 *
509 * (o) global/bdi setpoints
510 *
511 * We want the dirty pages be balanced around the global/bdi setpoints.
512 * When the number of dirty pages is higher/lower than the setpoint, the
513 * dirty position control ratio (and hence task dirty ratelimit) will be
514 * decreased/increased to bring the dirty pages back to the setpoint.
515 *
516 * pos_ratio = 1 << RATELIMIT_CALC_SHIFT
517 *
518 * if (dirty < setpoint) scale up pos_ratio
519 * if (dirty > setpoint) scale down pos_ratio
520 *
521 * if (bdi_dirty < bdi_setpoint) scale up pos_ratio
522 * if (bdi_dirty > bdi_setpoint) scale down pos_ratio
523 *
524 * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
525 *
526 * (o) global control line
527 *
528 * ^ pos_ratio
529 * |
530 * | |<===== global dirty control scope ======>|
531 * 2.0 .............*
532 * | .*
533 * | . *
534 * | . *
535 * | . *
536 * | . *
537 * | . *
538 * 1.0 ................................*
539 * | . . *
540 * | . . *
541 * | . . *
542 * | . . *
543 * | . . *
544 * 0 +------------.------------------.----------------------*------------->
545 * freerun^ setpoint^ limit^ dirty pages
546 *
547 * (o) bdi control line
548 *
549 * ^ pos_ratio
550 * |
551 * | *
552 * | *
553 * | *
554 * | *
555 * | * |<=========== span ============>|
556 * 1.0 .......................*
557 * | . *
558 * | . *
559 * | . *
560 * | . *
561 * | . *
562 * | . *
563 * | . *
564 * | . *
565 * | . *
566 * | . *
567 * | . *
568 * 1/4 ...............................................* * * * * * * * * * * *
569 * | . .
570 * | . .
571 * | . .
572 * 0 +----------------------.-------------------------------.------------->
573 * bdi_setpoint^ x_intercept^
574 *
575 * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
576 * be smoothly throttled down to normal if it starts high in situations like
577 * - start writing to a slow SD card and a fast disk at the same time. The SD
578 * card's bdi_dirty may rush to many times higher than bdi_setpoint.
579 * - the bdi dirty thresh drops quickly due to change of JBOD workload
580 */
581static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
582 unsigned long thresh,
583 unsigned long bg_thresh,
584 unsigned long dirty,
585 unsigned long bdi_thresh,
586 unsigned long bdi_dirty)
587{
588 unsigned long write_bw = bdi->avg_write_bandwidth;
589 unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
590 unsigned long limit = hard_dirty_limit(thresh);
591 unsigned long x_intercept;
592 unsigned long setpoint; /* dirty pages' target balance point */
593 unsigned long bdi_setpoint;
594 unsigned long span;
595 long long pos_ratio; /* for scaling up/down the rate limit */
596 long x;
597
598 if (unlikely(dirty >= limit))
599 return 0;
600
601 /*
602 * global setpoint
603 *
604 * setpoint - dirty 3
605 * f(dirty) := 1.0 + (----------------)
606 * limit - setpoint
607 *
608 * it's a 3rd order polynomial that subjects to
609 *
610 * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast
611 * (2) f(setpoint) = 1.0 => the balance point
612 * (3) f(limit) = 0 => the hard limit
613 * (4) df/dx <= 0 => negative feedback control
614 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
615 * => fast response on large errors; small oscillation near setpoint
616 */
617 setpoint = (freerun + limit) / 2;
618 x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
619 limit - setpoint + 1);
620 pos_ratio = x;
621 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
622 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
623 pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
624
625 /*
626 * We have computed basic pos_ratio above based on global situation. If
627 * the bdi is over/under its share of dirty pages, we want to scale
628 * pos_ratio further down/up. That is done by the following mechanism.
629 */
630
631 /*
632 * bdi setpoint
633 *
634 * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
635 *
636 * x_intercept - bdi_dirty
637 * := --------------------------
638 * x_intercept - bdi_setpoint
639 *
640 * The main bdi control line is a linear function that subjects to
641 *
642 * (1) f(bdi_setpoint) = 1.0
643 * (2) k = - 1 / (8 * write_bw) (in single bdi case)
644 * or equally: x_intercept = bdi_setpoint + 8 * write_bw
645 *
646 * For single bdi case, the dirty pages are observed to fluctuate
647 * regularly within range
648 * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
649 * for various filesystems, where (2) can yield in a reasonable 12.5%
650 * fluctuation range for pos_ratio.
651 *
652 * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
653 * own size, so move the slope over accordingly and choose a slope that
654 * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
655 */
656 if (unlikely(bdi_thresh > thresh))
657 bdi_thresh = thresh;
658 /*
659 * scale global setpoint to bdi's:
660 * bdi_setpoint = setpoint * bdi_thresh / thresh
661 */
662 x = div_u64((u64)bdi_thresh << 16, thresh + 1);
663 bdi_setpoint = setpoint * (u64)x >> 16;
664 /*
665 * Use span=(8*write_bw) in single bdi case as indicated by
666 * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
667 *
668 * bdi_thresh thresh - bdi_thresh
669 * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
670 * thresh thresh
671 */
672 span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
673 x_intercept = bdi_setpoint + span;
674
675 if (bdi_dirty < x_intercept - span / 4) {
676 pos_ratio *= x_intercept - bdi_dirty;
677 do_div(pos_ratio, x_intercept - bdi_setpoint + 1);
678 } else
679 pos_ratio /= 4;
680
681 return pos_ratio;
682}
683
498static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, 684static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
499 unsigned long elapsed, 685 unsigned long elapsed,
500 unsigned long written) 686 unsigned long written)
@@ -655,6 +841,7 @@ static void balance_dirty_pages(struct address_space *mapping,
655 unsigned long nr_reclaimable, bdi_nr_reclaimable; 841 unsigned long nr_reclaimable, bdi_nr_reclaimable;
656 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ 842 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
657 unsigned long bdi_dirty; 843 unsigned long bdi_dirty;
844 unsigned long freerun;
658 unsigned long background_thresh; 845 unsigned long background_thresh;
659 unsigned long dirty_thresh; 846 unsigned long dirty_thresh;
660 unsigned long bdi_thresh; 847 unsigned long bdi_thresh;
@@ -679,7 +866,9 @@ static void balance_dirty_pages(struct address_space *mapping,
679 * catch-up. This avoids (excessively) small writeouts 866 * catch-up. This avoids (excessively) small writeouts
680 * when the bdi limits are ramping up. 867 * when the bdi limits are ramping up.
681 */ 868 */
682 if (nr_dirty <= (background_thresh + dirty_thresh) / 2) 869 freerun = dirty_freerun_ceiling(dirty_thresh,
870 background_thresh);
871 if (nr_dirty <= freerun)
683 break; 872 break;
684 873
685 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 874 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);