author      Tejun Heo <tj@kernel.org>    2015-05-22 17:13:28 -0400
committer   Jens Axboe <axboe@fb.com>    2015-06-02 10:33:34 -0400
commit      a88a341a73be4ef035ca26170c849f002797da27 (patch)
tree        3b9ba08daa3fdfb00d02ba21dfbce702a03826bc /mm/page-writeback.c
parent      93f78d882865cb90020d0f80a9523c99cf46924c (diff)
writeback: move bandwidth related fields from backing_dev_info into bdi_writeback
Currently, a bdi (backing_dev_info) embeds a single wb (bdi_writeback)
and the role of the separation is unclear. For cgroup support for
writeback IOs, a bdi will be updated to host multiple wb's where each
wb serves writeback IOs of a different cgroup on the bdi. To achieve
that, a wb should carry all the state necessary for servicing writeback
IOs for a cgroup independently.
This patch moves bandwidth related fields from backing_dev_info into
bdi_writeback.
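As a rough orientation, the moved state now lives in each bdi_writeback.
The sketch below is illustrative only: the field names follow the list
that comes next, while the types and comments are approximations rather
than a verbatim copy of include/linux/backing-dev.h at this point in the
series.

    struct bdi_writeback {
            /* ... existing fields: bdi pointer, stat[] counters, lists ... */

            unsigned long bw_time_stamp;            /* last bandwidth update time */
            unsigned long dirtied_stamp;            /* pages dirtied at that time */
            unsigned long written_stamp;            /* pages written at that time */
            unsigned long write_bandwidth;          /* estimated write bandwidth */
            unsigned long avg_write_bandwidth;      /* smoothed write bandwidth */

            unsigned long dirty_ratelimit;          /* per-wb base throttle rate */
            unsigned long balanced_dirty_ratelimit;

            struct fprop_local_percpu completions;  /* writeout completion events */
            int dirty_exceeded;

            /* ... */
    };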
* The moved fields are: bw_time_stamp, dirtied_stamp, written_stamp,
write_bandwidth, avg_write_bandwidth, dirty_ratelimit,
balanced_dirty_ratelimit, completions and dirty_exceeded.
* writeback_chunk_size() and over_bground_thresh() now take @wb
instead of @bdi.
* bdi_writeout_fraction(bdi, ...) -> wb_writeout_fraction(wb, ...)
bdi_dirty_limit(bdi, ...) -> wb_dirty_limit(wb, ...)
bdi_position_ratio(bdi, ...) -> wb_position_ratio(wb, ...)
bdi_update_write_bandwidth(bdi, ...) -> wb_update_write_bandwidth(wb, ...)
[__]bdi_update_bandwidth(bdi, ...) -> [__]wb_update_bandwidth(wb, ...)
bdi_{max|min}_pause(bdi, ...) -> wb_{max|min}_pause(wb, ...)
bdi_dirty_limits(bdi, ...) -> wb_dirty_limits(wb, ...)
(a condensed call-site sketch of these conversions follows this list)
* Init/exits of the relocated fields are moved to bdi_wb_init/exit()
respectively. Note that explicit zeroing is dropped in the process
as wb's are cleared in their entirety anyway.
* As there's still only one bdi_writeback per backing_dev_info, all
uses of the moved bdi fields are mechanically replaced with their
bdi->wb counterparts, introducing no behavior changes.
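As a quick illustration of the renames and of the one-wb-per-bdi setup,
here is a condensed sketch of the resulting call-site pattern in
balance_dirty_pages(), simplified from the diff below rather than quoted
verbatim:

    struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
    struct bdi_writeback *wb = &bdi->wb;    /* still the only wb on this bdi */

    wb_dirty_limits(wb, dirty_thresh, background_thresh,
                    &bdi_dirty, &bdi_thresh, NULL);
    pos_ratio = wb_position_ratio(wb, dirty_thresh, background_thresh,
                                  nr_dirty, bdi_thresh, bdi_dirty);
    max_pause = wb_max_pause(wb, bdi_dirty);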
v2: Typo in description fixed as suggested by Jan.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r--   mm/page-writeback.c   262
1 file changed, 132 insertions, 130 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index dc673a035413..cd39ee91b7bb 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -399,7 +399,7 @@ static unsigned long wp_next_time(unsigned long cur_time)
 static inline void __wb_writeout_inc(struct bdi_writeback *wb)
 {
         __inc_wb_stat(wb, WB_WRITTEN);
-        __fprop_inc_percpu_max(&writeout_completions, &wb->bdi->completions,
+        __fprop_inc_percpu_max(&writeout_completions, &wb->completions,
                                wb->bdi->max_prop_frac);
         /* First event after period switching was turned off? */
         if (!unlikely(writeout_period_time)) {
@@ -427,10 +427,10 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc);
 /*
  * Obtain an accurate fraction of the BDI's portion.
  */
-static void bdi_writeout_fraction(struct backing_dev_info *bdi,
+static void wb_writeout_fraction(struct bdi_writeback *wb,
                                  long *numerator, long *denominator)
 {
-        fprop_fraction_percpu(&writeout_completions, &bdi->completions,
+        fprop_fraction_percpu(&writeout_completions, &wb->completions,
                               numerator, denominator);
 }

@@ -516,11 +516,11 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
 }

 /**
- * bdi_dirty_limit - @bdi's share of dirty throttling threshold
- * @bdi: the backing_dev_info to query
+ * wb_dirty_limit - @wb's share of dirty throttling threshold
+ * @wb: bdi_writeback to query
  * @dirty: global dirty limit in pages
  *
- * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+ * Returns @wb's dirty limit in pages. The term "dirty" in the context of
  * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
  *
  * Note that balance_dirty_pages() will only seriously take it as a hard limit
@@ -528,34 +528,35 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
  * control. For example, when the device is completely stalled due to some error
  * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
  * In the other normal situations, it acts more gently by throttling the tasks
- * more (rather than completely block them) when the bdi dirty pages go high.
+ * more (rather than completely block them) when the wb dirty pages go high.
  *
  * It allocates high/low dirty limits to fast/slow devices, in order to prevent
  * - starving fast devices
  * - piling up dirty pages (that will take long time to sync) on slow devices
  *
- * The bdi's share of dirty limit will be adapting to its throughput and
+ * The wb's share of dirty limit will be adapting to its throughput and
  * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
  */
-unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty)
 {
-        u64 bdi_dirty;
+        struct backing_dev_info *bdi = wb->bdi;
+        u64 wb_dirty;
         long numerator, denominator;

         /*
          * Calculate this BDI's share of the dirty ratio.
          */
-        bdi_writeout_fraction(bdi, &numerator, &denominator);
+        wb_writeout_fraction(wb, &numerator, &denominator);

-        bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
-        bdi_dirty *= numerator;
-        do_div(bdi_dirty, denominator);
+        wb_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
+        wb_dirty *= numerator;
+        do_div(wb_dirty, denominator);

-        bdi_dirty += (dirty * bdi->min_ratio) / 100;
-        if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
-                bdi_dirty = dirty * bdi->max_ratio / 100;
+        wb_dirty += (dirty * bdi->min_ratio) / 100;
+        if (wb_dirty > (dirty * bdi->max_ratio) / 100)
+                wb_dirty = dirty * bdi->max_ratio / 100;

-        return bdi_dirty;
+        return wb_dirty;
 }

 /*
@@ -664,14 +665,14 @@ static long long pos_ratio_polynom(unsigned long setpoint,
  * card's bdi_dirty may rush to many times higher than bdi_setpoint.
  * - the bdi dirty thresh drops quickly due to change of JBOD workload
  */
-static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
+static unsigned long wb_position_ratio(struct bdi_writeback *wb,
                                        unsigned long thresh,
                                        unsigned long bg_thresh,
                                        unsigned long dirty,
                                        unsigned long bdi_thresh,
                                        unsigned long bdi_dirty)
 {
-        unsigned long write_bw = bdi->avg_write_bandwidth;
+        unsigned long write_bw = wb->avg_write_bandwidth;
         unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
         unsigned long limit = hard_dirty_limit(thresh);
         unsigned long x_intercept;
@@ -702,12 +703,12 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
          * consume arbitrary amount of RAM because it is accounted in
          * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
          *
-         * Here, in bdi_position_ratio(), we calculate pos_ratio based on
+         * Here, in wb_position_ratio(), we calculate pos_ratio based on
          * two values: bdi_dirty and bdi_thresh. Let's consider an example:
          * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
          * limits are set by default to 10% and 20% (background and throttle).
          * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
-         * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
+         * wb_dirty_limit(wb, bg_thresh) is about ~4K pages. bdi_setpoint is
          * about ~6K pages (as the average of background and throttle bdi
          * limits). The 3rd order polynomial will provide positive feedback if
          * bdi_dirty is under bdi_setpoint and vice versa.
@@ -717,7 +718,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
          * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
          * in the example above).
          */
-        if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+        if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
                 long long bdi_pos_ratio;
                 unsigned long bdi_bg_thresh;

@@ -842,13 +843,13 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
         return pos_ratio;
 }

-static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
+static void wb_update_write_bandwidth(struct bdi_writeback *wb,
                                       unsigned long elapsed,
                                       unsigned long written)
 {
         const unsigned long period = roundup_pow_of_two(3 * HZ);
-        unsigned long avg = bdi->avg_write_bandwidth;
-        unsigned long old = bdi->write_bandwidth;
+        unsigned long avg = wb->avg_write_bandwidth;
+        unsigned long old = wb->write_bandwidth;
         u64 bw;

         /*
@@ -861,14 +862,14 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
          * @written may have decreased due to account_page_redirty().
          * Avoid underflowing @bw calculation.
          */
-        bw = written - min(written, bdi->written_stamp);
+        bw = written - min(written, wb->written_stamp);
         bw *= HZ;
         if (unlikely(elapsed > period)) {
                 do_div(bw, elapsed);
                 avg = bw;
                 goto out;
         }
-        bw += (u64)bdi->write_bandwidth * (period - elapsed);
+        bw += (u64)wb->write_bandwidth * (period - elapsed);
         bw >>= ilog2(period);

         /*
@@ -881,8 +882,8 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
                 avg += (old - avg) >> 3;

 out:
-        bdi->write_bandwidth = bw;
-        bdi->avg_write_bandwidth = avg;
+        wb->write_bandwidth = bw;
+        wb->avg_write_bandwidth = avg;
 }

 /*
@@ -947,20 +948,20 @@ static void global_update_bandwidth(unsigned long thresh,
  * Normal bdi tasks will be curbed at or below it in long term.
  * Obviously it should be around (write_bw / N) when there are N dd tasks.
  */
-static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
+static void wb_update_dirty_ratelimit(struct bdi_writeback *wb,
                                       unsigned long thresh,
                                       unsigned long bg_thresh,
                                       unsigned long dirty,
                                       unsigned long bdi_thresh,
                                       unsigned long bdi_dirty,
                                       unsigned long dirtied,
                                       unsigned long elapsed)
 {
         unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
         unsigned long limit = hard_dirty_limit(thresh);
         unsigned long setpoint = (freerun + limit) / 2;
-        unsigned long write_bw = bdi->avg_write_bandwidth;
-        unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
+        unsigned long write_bw = wb->avg_write_bandwidth;
+        unsigned long dirty_ratelimit = wb->dirty_ratelimit;
         unsigned long dirty_rate;
         unsigned long task_ratelimit;
         unsigned long balanced_dirty_ratelimit;
@@ -972,10 +973,10 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
          * The dirty rate will match the writeout rate in long term, except
          * when dirty pages are truncated by userspace or re-dirtied by FS.
          */
-        dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
+        dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;

-        pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
+        pos_ratio = wb_position_ratio(wb, thresh, bg_thresh, dirty,
                                       bdi_thresh, bdi_dirty);
         /*
          * task_ratelimit reflects each dd's dirty rate for the past 200ms.
          */
@@ -1059,31 +1060,31 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,

         /*
          * For strictlimit case, calculations above were based on bdi counters
-         * and limits (starting from pos_ratio = bdi_position_ratio() and up to
+         * and limits (starting from pos_ratio = wb_position_ratio() and up to
          * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
          * Hence, to calculate "step" properly, we have to use bdi_dirty as
          * "dirty" and bdi_setpoint as "setpoint".
          *
          * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
          * it's possible that bdi_thresh is close to zero due to inactivity
-         * of backing device (see the implementation of bdi_dirty_limit()).
+         * of backing device (see the implementation of wb_dirty_limit()).
          */
-        if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+        if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
                 dirty = bdi_dirty;
                 if (bdi_dirty < 8)
                         setpoint = bdi_dirty + 1;
                 else
                         setpoint = (bdi_thresh +
-                                    bdi_dirty_limit(bdi, bg_thresh)) / 2;
+                                    wb_dirty_limit(wb, bg_thresh)) / 2;
         }

         if (dirty < setpoint) {
-                x = min3(bdi->balanced_dirty_ratelimit,
+                x = min3(wb->balanced_dirty_ratelimit,
                          balanced_dirty_ratelimit, task_ratelimit);
                 if (dirty_ratelimit < x)
                         step = x - dirty_ratelimit;
         } else {
-                x = max3(bdi->balanced_dirty_ratelimit,
+                x = max3(wb->balanced_dirty_ratelimit,
                          balanced_dirty_ratelimit, task_ratelimit);
                 if (dirty_ratelimit > x)
                         step = dirty_ratelimit - x;
@@ -1105,22 +1106,22 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
         else
                 dirty_ratelimit -= step;

-        bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
-        bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
+        wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+        wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;

-        trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
+        trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
 }

-void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+void __wb_update_bandwidth(struct bdi_writeback *wb,
                            unsigned long thresh,
                            unsigned long bg_thresh,
                            unsigned long dirty,
                            unsigned long bdi_thresh,
                            unsigned long bdi_dirty,
                            unsigned long start_time)
 {
         unsigned long now = jiffies;
-        unsigned long elapsed = now - bdi->bw_time_stamp;
+        unsigned long elapsed = now - wb->bw_time_stamp;
         unsigned long dirtied;
         unsigned long written;

@@ -1130,44 +1131,44 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
         if (elapsed < BANDWIDTH_INTERVAL)
                 return;

-        dirtied = percpu_counter_read(&bdi->wb.stat[WB_DIRTIED]);
-        written = percpu_counter_read(&bdi->wb.stat[WB_WRITTEN]);
+        dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
+        written = percpu_counter_read(&wb->stat[WB_WRITTEN]);

         /*
          * Skip quiet periods when disk bandwidth is under-utilized.
          * (at least 1s idle time between two flusher runs)
          */
-        if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+        if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
                 goto snapshot;

         if (thresh) {
                 global_update_bandwidth(thresh, dirty, now);
-                bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
+                wb_update_dirty_ratelimit(wb, thresh, bg_thresh, dirty,
                                           bdi_thresh, bdi_dirty,
                                           dirtied, elapsed);
         }
-        bdi_update_write_bandwidth(bdi, elapsed, written);
+        wb_update_write_bandwidth(wb, elapsed, written);

 snapshot:
-        bdi->dirtied_stamp = dirtied;
-        bdi->written_stamp = written;
-        bdi->bw_time_stamp = now;
+        wb->dirtied_stamp = dirtied;
+        wb->written_stamp = written;
+        wb->bw_time_stamp = now;
 }

-static void bdi_update_bandwidth(struct backing_dev_info *bdi,
+static void wb_update_bandwidth(struct bdi_writeback *wb,
                                 unsigned long thresh,
                                 unsigned long bg_thresh,
                                 unsigned long dirty,
                                 unsigned long bdi_thresh,
                                 unsigned long bdi_dirty,
                                 unsigned long start_time)
 {
-        if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
+        if (time_is_after_eq_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL))
                 return;
-        spin_lock(&bdi->wb.list_lock);
-        __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
+        spin_lock(&wb->list_lock);
+        __wb_update_bandwidth(wb, thresh, bg_thresh, dirty,
                               bdi_thresh, bdi_dirty, start_time);
-        spin_unlock(&bdi->wb.list_lock);
+        spin_unlock(&wb->list_lock);
 }

 /*
@@ -1187,10 +1188,10 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
         return 1;
 }

-static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
+static unsigned long wb_max_pause(struct bdi_writeback *wb,
                                   unsigned long bdi_dirty)
 {
-        unsigned long bw = bdi->avg_write_bandwidth;
+        unsigned long bw = wb->avg_write_bandwidth;
         unsigned long t;

         /*
@@ -1206,14 +1207,14 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
         return min_t(unsigned long, t, MAX_PAUSE);
 }

-static long bdi_min_pause(struct backing_dev_info *bdi,
+static long wb_min_pause(struct bdi_writeback *wb,
                          long max_pause,
                          unsigned long task_ratelimit,
                          unsigned long dirty_ratelimit,
                          int *nr_dirtied_pause)
 {
-        long hi = ilog2(bdi->avg_write_bandwidth);
-        long lo = ilog2(bdi->dirty_ratelimit);
+        long hi = ilog2(wb->avg_write_bandwidth);
+        long lo = ilog2(wb->dirty_ratelimit);
         long t;         /* target pause */
         long pause;     /* estimated next pause */
         int pages;      /* target nr_dirtied_pause */
@@ -1281,14 +1282,13 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
         return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
 }

-static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
+static inline void wb_dirty_limits(struct bdi_writeback *wb,
                                    unsigned long dirty_thresh,
                                    unsigned long background_thresh,
                                    unsigned long *bdi_dirty,
                                    unsigned long *bdi_thresh,
                                    unsigned long *bdi_bg_thresh)
 {
-        struct bdi_writeback *wb = &bdi->wb;
         unsigned long wb_reclaimable;

         /*
@@ -1301,10 +1301,10 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
          * In this case we don't want to hard throttle the USB key
          * dirtiers for 100 seconds until bdi_dirty drops under
          * bdi_thresh. Instead the auxiliary bdi control line in
-         * bdi_position_ratio() will let the dirtier task progress
+         * wb_position_ratio() will let the dirtier task progress
          * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
          */
-        *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+        *bdi_thresh = wb_dirty_limit(wb, dirty_thresh);

         if (bdi_bg_thresh)
                 *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh *
@@ -1354,6 +1354,7 @@ static void balance_dirty_pages(struct address_space *mapping,
         unsigned long dirty_ratelimit;
         unsigned long pos_ratio;
         struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+        struct bdi_writeback *wb = &bdi->wb;
         bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
         unsigned long start_time = jiffies;

@@ -1378,8 +1379,8 @@ static void balance_dirty_pages(struct address_space *mapping,
                 global_dirty_limits(&background_thresh, &dirty_thresh);

                 if (unlikely(strictlimit)) {
-                        bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
+                        wb_dirty_limits(wb, dirty_thresh, background_thresh,
                                         &bdi_dirty, &bdi_thresh, &bg_thresh);

                         dirty = bdi_dirty;
                         thresh = bdi_thresh;
@@ -1410,28 +1411,28 @@ static void balance_dirty_pages(struct address_space *mapping,
                         bdi_start_background_writeback(bdi);

                 if (!strictlimit)
-                        bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
+                        wb_dirty_limits(wb, dirty_thresh, background_thresh,
                                         &bdi_dirty, &bdi_thresh, NULL);

                 dirty_exceeded = (bdi_dirty > bdi_thresh) &&
                                  ((nr_dirty > dirty_thresh) || strictlimit);
-                if (dirty_exceeded && !bdi->dirty_exceeded)
-                        bdi->dirty_exceeded = 1;
+                if (dirty_exceeded && !wb->dirty_exceeded)
+                        wb->dirty_exceeded = 1;

-                bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
+                wb_update_bandwidth(wb, dirty_thresh, background_thresh,
                                     nr_dirty, bdi_thresh, bdi_dirty,
                                     start_time);

-                dirty_ratelimit = bdi->dirty_ratelimit;
-                pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
+                dirty_ratelimit = wb->dirty_ratelimit;
+                pos_ratio = wb_position_ratio(wb, dirty_thresh,
                                               background_thresh, nr_dirty,
                                               bdi_thresh, bdi_dirty);
                 task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
                                  RATELIMIT_CALC_SHIFT;
-                max_pause = bdi_max_pause(bdi, bdi_dirty);
-                min_pause = bdi_min_pause(bdi, max_pause,
+                max_pause = wb_max_pause(wb, bdi_dirty);
+                min_pause = wb_min_pause(wb, max_pause,
                                          task_ratelimit, dirty_ratelimit,
                                          &nr_dirtied_pause);

                 if (unlikely(task_ratelimit == 0)) {
                         period = max_pause;
@@ -1515,15 +1516,15 @@ pause:
                  * more page. However bdi_dirty has accounting errors. So use
                  * the larger and more IO friendly wb_stat_error.
                  */
-                if (bdi_dirty <= wb_stat_error(&bdi->wb))
+                if (bdi_dirty <= wb_stat_error(wb))
                         break;

                 if (fatal_signal_pending(current))
                         break;
         }

-        if (!dirty_exceeded && bdi->dirty_exceeded)
-                bdi->dirty_exceeded = 0;
+        if (!dirty_exceeded && wb->dirty_exceeded)
+                wb->dirty_exceeded = 0;

         if (writeback_in_progress(bdi))
                 return;
@@ -1577,6 +1578,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
 void balance_dirty_pages_ratelimited(struct address_space *mapping)
 {
         struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+        struct bdi_writeback *wb = &bdi->wb;
         int ratelimit;
         int *p;

@@ -1584,7 +1586,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
                 return;

         ratelimit = current->nr_dirtied_pause;
-        if (bdi->dirty_exceeded)
+        if (wb->dirty_exceeded)
                 ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

         preempt_disable();