author		Wu Fengguang <fengguang.wu@intel.com>	2011-11-30 12:08:55 -0500
committer	Wu Fengguang <fengguang.wu@intel.com>	2011-12-18 01:20:28 -0500
commit		7ccb9ad5364d6ac0c803096c67e76a7545cf7a77 (patch)
tree		53894333454bca278f20f9c5841dd1b45c384721 /mm/page-writeback.c
parent		83712358ba0a1497ce59a4f84ce4dd0f803fe6fc (diff)
writeback: max, min and target dirty pause time
Control the pause time and the call intervals to balance_dirty_pages()
with three parameters (see the sketch after this list):
1) max_pause, limited by bdi_dirty and MAX_PAUSE
2) the target pause time, which grows with the number of dd tasks
   and is normally limited to max_pause/2
3) the minimal pause, set to half the target pause;
   it is used to skip short sleeps and accumulate them into bigger ones
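
For concreteness, a minimal userspace sketch of how the three levels
nest. HZ, the bandwidth and the dirty page count here are illustrative
assumptions, not kernel state; the constants and formulas mirror this
patch:

#include <stdio.h>

#define HZ        1000		/* assumed jiffies per second */
#define MAX_PAUSE (HZ / 5)	/* 200ms cap (kernel: max(HZ/5, 1)) */

int main(void)
{
	long bdi_dirty = 4000;	/* dirty pages on the bdi, illustrative */
	long bw = 25600;	/* write bandwidth in pages/s, illustrative */

	/*
	 * 1) max_pause: limited by bdi_dirty (8 is the safety ratio) and
	 * MAX_PAUSE; roundup_pow_of_two(1 + HZ / 8) == 128 for HZ == 1000.
	 */
	long max_pause = bdi_dirty / (1 + bw / 128) + 1;
	if (max_pause > MAX_PAUSE)
		max_pause = MAX_PAUSE;

	/* 2) target pause: 10ms on 1 dd, grows with tasks, capped at max_pause/2 */
	long target = HZ / 100;
	if (target > 1 + max_pause / 2)
		target = 1 + max_pause / 2;

	/* 3) minimal pause: half the target, so short sleeps get merged */
	long min_pause = 1 + target / 2;

	printf("max=%ld target=%ld min=%ld (jiffies)\n",
	       max_pause, target, min_pause);
	return 0;
}

With these inputs it prints max=20 target=10 min=6, i.e. a small dirty
pool shrinks max_pause well below the 200ms cap.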
The typical behaviors after this patch:
- if task_ratelimit ever falls far below dirty_ratelimit, the pause time
  will remain constant at max_pause and nr_dirtied_pause will fluctuate
  with task_ratelimit
- in the normal case, nr_dirtied_pause will remain stable (keeping pace
  with dirty_ratelimit) and the pause time will fluctuate with
  task_ratelimit
In summary, one of the two has to fluctuate with task_ratelimit, because
task_ratelimit = nr_dirtied_pause / pause
We normally prefer a stable nr_dirtied_pause, until the pause reaches
max_pause.
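To put numbers on that identity (pure arithmetic, with illustrative
rates): at task_ratelimit = 1000 pages/s and HZ = 1000, a stable
nr_dirtied_pause of 10 pages means a 10ms pause; if task_ratelimit drops
to 500 pages/s, the same 10 pages yield a 20ms pause, and only when the
estimated pause would exceed max_pause does nr_dirtied_pause itself get
lowered (rule 1 in the new bdi_min_pause() comment below).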
The notable behavior changes are:
- in stable workloads, there will no longer be the sudden big trajectory
  switching of nr_dirtied_pause that Peter was concerned about. It will
  be as smooth as dirty_ratelimit and change proportionally with it (as
  always, assuming bdi bandwidth does not fluctuate across 2^N
  boundaries, otherwise nr_dirtied_pause will show up in 2+ parallel
  trajectories)
- in the rare case that something keeps task_ratelimit far below
  dirty_ratelimit, the smoothness can no longer be retained and
  nr_dirtied_pause will be "dancing" with task_ratelimit. This fixes a
  (not that destructive, but still bad) bug in which dirty_ratelimit
  gets brought down undesirably
  <= balanced_dirty_ratelimit is underestimated
  <= weakly executed task_ratelimit
  <= pause goes too large and gets trimmed down to max_pause
  <= nr_dirtied_pause (based on dirty_ratelimit) is set too large
  <= dirty_ratelimit being much larger than task_ratelimit
- introduce min_pause to avoid small pause sleeps
- when pause is trimmed down to max_pause, try to compensate for it at
  the next pause time (see the worked example after this list)
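Worked example of the compensation (illustrative numbers): if the
computed pause is 300ms against a 200ms max_pause, the task sleeps only
200ms, but `now` is advanced by the 100ms remainder (itself capped at
another max_pause) before dirty_paused_when is set, so the next pause
comes out roughly 100ms longer and the trimmed time is not lost.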
The "refactor" type of changes are:
The max_pause equation is slightly transformed to make it slightly more
efficient.
We now scale target_pause by (N * 10ms) on 2^N concurrent tasks, which
is effectively equal to the original scaling max_pause by (N * 20ms)
because the original code does implicit target_pause ~= max_pause / 2.
Based on the same implicit ratio, target_pause starts with 10ms on 1 dd.
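Worked example of the scaling (illustrative magnitudes): ilog2() of the
bdi write bandwidth minus ilog2() of dirty_ratelimit approximates log2
of the number of dirtiers, so with hi - lo = 3 (about 8 concurrent
tasks) and HZ = 1000, the target grows by
(hi - lo) * (10 * HZ) / 1024 ~= 29 jiffies, i.e. roughly 3 * 10ms on
top of the 10ms base, for a ~40ms target pause.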
CC: Jan Kara <jack@suse.cz>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Diffstat (limited to 'mm/page-writeback.c')
 mm/page-writeback.c | 125 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 81 insertions(+), 44 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 491932155825..5830991f261a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -962,40 +962,81 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
 	return 1;
 }
 
-static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
-				   unsigned long bdi_dirty)
+static long bdi_max_pause(struct backing_dev_info *bdi,
+			  unsigned long bdi_dirty)
 {
-	unsigned long bw = bdi->avg_write_bandwidth;
-	unsigned long hi = ilog2(bw);
-	unsigned long lo = ilog2(bdi->dirty_ratelimit);
-	unsigned long t;
+	long bw = bdi->avg_write_bandwidth;
+	long t;
 
-	/* target for 20ms max pause on 1-dd case */
-	t = HZ / 50;
+	/*
+	 * Limit pause time for small memory systems. If sleeping for too long
+	 * time, a small pool of dirty/writeback pages may go empty and disk go
+	 * idle.
+	 *
+	 * 8 serves as the safety ratio.
+	 */
+	t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
+	t++;
+
+	return min_t(long, t, MAX_PAUSE);
+}
+
+static long bdi_min_pause(struct backing_dev_info *bdi,
+			  long max_pause,
+			  unsigned long task_ratelimit,
+			  unsigned long dirty_ratelimit,
+			  int *nr_dirtied_pause)
+{
+	long hi = ilog2(bdi->avg_write_bandwidth);
+	long lo = ilog2(bdi->dirty_ratelimit);
+	long t;		/* target pause */
+	long pause;	/* estimated next pause */
+	int pages;	/* target nr_dirtied_pause */
+
+	/* target for 10ms pause on 1-dd case */
+	t = max(1, HZ / 100);
 
 	/*
 	 * Scale up pause time for concurrent dirtiers in order to reduce CPU
 	 * overheads.
 	 *
-	 * (N * 20ms) on 2^N concurrent tasks.
+	 * (N * 10ms) on 2^N concurrent tasks.
 	 */
 	if (hi > lo)
-		t += (hi - lo) * (20 * HZ) / 1024;
+		t += (hi - lo) * (10 * HZ) / 1024;
 
 	/*
-	 * Limit pause time for small memory systems. If sleeping for too long
-	 * time, a small pool of dirty/writeback pages may go empty and disk go
-	 * idle.
+	 * This is a bit convoluted. We try to base the next nr_dirtied_pause
+	 * on the much more stable dirty_ratelimit. However the next pause time
+	 * will be computed based on task_ratelimit and the two rate limits may
+	 * depart considerably at some time. Especially if task_ratelimit goes
+	 * below dirty_ratelimit/2 and the target pause is max_pause, the next
+	 * pause time will be max_pause*2 _trimmed down_ to max_pause. As a
+	 * result task_ratelimit won't be executed faithfully, which could
+	 * eventually bring down dirty_ratelimit.
 	 *
-	 * 8 serves as the safety ratio.
+	 * We apply two rules to fix it up:
+	 * 1) try to estimate the next pause time and if necessary, use a lower
+	 *    nr_dirtied_pause so as not to exceed max_pause. When this happens,
+	 *    nr_dirtied_pause will be "dancing" with task_ratelimit.
+	 * 2) limit the target pause time to max_pause/2, so that the normal
+	 *    small fluctuations of task_ratelimit won't trigger rule (1) and
+	 *    nr_dirtied_pause will remain as stable as dirty_ratelimit.
 	 */
-	t = min(t, bdi_dirty * HZ / (8 * bw + 1));
+	t = min(t, 1 + max_pause / 2);
+	pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
+
+	pause = HZ * pages / (task_ratelimit + 1);
+	if (pause > max_pause) {
+		t = max_pause;
+		pages = task_ratelimit * t / roundup_pow_of_two(HZ);
+	}
 
+	*nr_dirtied_pause = pages;
 	/*
-	 * The pause time will be settled within range (max_pause/4, max_pause).
-	 * Apply a minimal value of 4 to get a non-zero max_pause/4.
+	 * The minimal pause time will normally be half the target pause time.
 	 */
-	return clamp_val(t, 4, MAX_PAUSE);
+	return 1 + t / 2;
 }
 
 /*
@@ -1017,11 +1058,13 @@ static void balance_dirty_pages(struct address_space *mapping,
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
 	long period;
-	long pause = 0;
-	long uninitialized_var(max_pause);
+	long pause;
+	long max_pause;
+	long min_pause;
+	int nr_dirtied_pause;
 	bool dirty_exceeded = false;
 	unsigned long task_ratelimit;
-	unsigned long uninitialized_var(dirty_ratelimit);
+	unsigned long dirty_ratelimit;
 	unsigned long pos_ratio;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	unsigned long start_time = jiffies;
@@ -1051,6 +1094,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (nr_dirty <= freerun) {
 			current->dirty_paused_when = now;
 			current->nr_dirtied = 0;
+			current->nr_dirtied_pause =
+				dirty_poll_interval(nr_dirty, dirty_thresh);
 			break;
 		}
 
@@ -1101,14 +1146,17 @@ static void balance_dirty_pages(struct address_space *mapping,
 				    nr_dirty, bdi_thresh, bdi_dirty,
 				    start_time);
 
-		max_pause = bdi_max_pause(bdi, bdi_dirty);
-
 		dirty_ratelimit = bdi->dirty_ratelimit;
 		pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
 					       background_thresh, nr_dirty,
 					       bdi_thresh, bdi_dirty);
 		task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
 							RATELIMIT_CALC_SHIFT;
+		max_pause = bdi_max_pause(bdi, bdi_dirty);
+		min_pause = bdi_min_pause(bdi, max_pause,
+					  task_ratelimit, dirty_ratelimit,
+					  &nr_dirtied_pause);
+
 		if (unlikely(task_ratelimit == 0)) {
 			period = max_pause;
 			pause = max_pause;
@@ -1125,7 +1173,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * future periods by updating the virtual time; otherwise just
 		 * do a reset, as it may be a light dirtier.
 		 */
-		if (unlikely(pause <= 0)) {
+		if (pause < min_pause) {
 			trace_balance_dirty_pages(bdi,
 						  dirty_thresh,
 						  background_thresh,
@@ -1136,7 +1184,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 						  task_ratelimit,
 						  pages_dirtied,
 						  period,
-						  pause,
+						  min(pause, 0L),
 						  start_time);
 			if (pause < -HZ) {
 				current->dirty_paused_when = now;
@@ -1144,11 +1192,15 @@ static void balance_dirty_pages(struct address_space *mapping,
 			} else if (period) {
 				current->dirty_paused_when += period;
 				current->nr_dirtied = 0;
-			}
-			pause = 1; /* avoid resetting nr_dirtied_pause below */
+			} else if (current->nr_dirtied_pause <= pages_dirtied)
+				current->nr_dirtied_pause += pages_dirtied;
 			break;
 		}
-		pause = min(pause, max_pause);
+		if (unlikely(pause > max_pause)) {
+			/* for occasional dropped task_ratelimit */
+			now += min(pause - max_pause, max_pause);
+			pause = max_pause;
+		}
 
 pause:
 		trace_balance_dirty_pages(bdi,
@@ -1168,6 +1220,7 @@ pause:
 
 		current->dirty_paused_when = now + pause;
 		current->nr_dirtied = 0;
+		current->nr_dirtied_pause = nr_dirtied_pause;
 
 		/*
 		 * This is typically equal to (nr_dirty < dirty_thresh) and can
@@ -1196,22 +1249,6 @@ pause:
 	if (!dirty_exceeded && bdi->dirty_exceeded)
 		bdi->dirty_exceeded = 0;
 
-	if (pause == 0) { /* in freerun area */
-		current->nr_dirtied_pause =
-			dirty_poll_interval(nr_dirty, dirty_thresh);
-	} else if (period <= max_pause / 4 &&
-		   pages_dirtied >= current->nr_dirtied_pause) {
-		current->nr_dirtied_pause = clamp_val(
-			dirty_ratelimit * (max_pause / 2) / HZ,
-			pages_dirtied + pages_dirtied / 8,
-			pages_dirtied * 4);
-	} else if (pause >= max_pause) {
-		current->nr_dirtied_pause = 1 | clamp_val(
-			dirty_ratelimit * (max_pause / 2) / HZ,
-			pages_dirtied / 4,
-			pages_dirtied - pages_dirtied / 8);
-	}
-
 	if (writeback_in_progress(bdi))
 		return;
 
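
To experiment with the new pause policy outside the kernel, here is a
userspace transcription of the patch's bdi_min_pause(); the struct
backing_dev_info fields become plain parameters, HZ and all inputs are
illustrative assumptions, and ilog2_l()/rpo2() are stand-ins for the
kernel's ilog2()/roundup_pow_of_two():

#include <stdio.h>

#define HZ 1000				/* assumed jiffies per second */

/* stand-in for the kernel's ilog2() */
static long ilog2_l(unsigned long v)
{
	long r = -1;

	while (v) {
		v >>= 1;
		r++;
	}
	return r;
}

/* stand-in for the kernel's roundup_pow_of_two() */
static unsigned long rpo2(unsigned long v)
{
	unsigned long p = 1;

	while (p < v)
		p <<= 1;
	return p;
}

/* userspace transcription of this patch's bdi_min_pause() */
static long min_pause(unsigned long write_bw, long max_pause,
		      unsigned long task_ratelimit,
		      unsigned long dirty_ratelimit, int *nr_dirtied_pause)
{
	long hi = ilog2_l(write_bw);
	long lo = ilog2_l(dirty_ratelimit);
	long t = HZ / 100 > 1 ? HZ / 100 : 1;	/* 10ms target on 1 dd */
	long pause;
	int pages;

	if (hi > lo)				/* (N * 10ms) on 2^N tasks */
		t += (hi - lo) * (10 * HZ) / 1024;

	if (t > 1 + max_pause / 2)		/* rule (2): cap the target */
		t = 1 + max_pause / 2;
	pages = dirty_ratelimit * t / rpo2(HZ);

	pause = HZ * pages / (task_ratelimit + 1);
	if (pause > max_pause) {		/* rule (1): stay under max_pause */
		t = max_pause;
		pages = task_ratelimit * t / rpo2(HZ);
	}

	*nr_dirtied_pause = pages;
	return 1 + t / 2;			/* min pause: half the target */
}

int main(void)
{
	int pages;
	/* illustrative inputs: ~100MB/s in pages/s, healthy rate limits */
	long mp = min_pause(25600, 200, 3000, 3200, &pages);

	printf("min_pause=%ld jiffies, nr_dirtied_pause=%d pages\n", mp, pages);
	return 0;
}

With these inputs it yields a 39-jiffy target (so min_pause = 20) and
nr_dirtied_pause = 121 pages; pushing task_ratelimit far below
dirty_ratelimit makes the estimated pause exceed max_pause, so pages is
recomputed from task_ratelimit, reproducing the "dancing" case described
in the commit message above.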