author		Wu Fengguang <fengguang.wu@intel.com>	2011-11-30 12:08:55 -0500
committer	Wu Fengguang <fengguang.wu@intel.com>	2011-12-18 01:20:28 -0500
commit		7ccb9ad5364d6ac0c803096c67e76a7545cf7a77 (patch)
tree		53894333454bca278f20f9c5841dd1b45c384721 /mm/page-writeback.c
parent		83712358ba0a1497ce59a4f84ce4dd0f803fe6fc (diff)
writeback: max, min and target dirty pause time
Control the pause time and the call intervals to balance_dirty_pages()
with three parameters:

1) max_pause, limited by bdi_dirty and MAX_PAUSE

2) the target pause time, which grows with the number of dd tasks
   and is normally limited by max_pause/2

3) the minimal pause, set to half the target pause
   and used to skip short sleeps and accumulate them into bigger ones

The typical behaviors after the patch:

- if task_ratelimit ever falls far below dirty_ratelimit, the pause time
  will remain constant at max_pause and nr_dirtied_pause will fluctuate
  with task_ratelimit

- in the normal case, nr_dirtied_pause will remain stable (keeping pace
  with dirty_ratelimit) and the pause time will fluctuate with
  task_ratelimit

In summary, one of the two has to fluctuate with task_ratelimit, because

	task_ratelimit = nr_dirtied_pause / pause

We normally prefer a stable nr_dirtied_pause, until reaching max_pause.

The notable behavior changes are:

- in stable workloads, there will no longer be the sudden big trajectory
  switching of nr_dirtied_pause that concerned Peter. It will be as
  smooth as dirty_ratelimit and change proportionally with it (as always,
  assuming bdi bandwidth does not fluctuate across 2^N lines, otherwise
  nr_dirtied_pause will show up in 2+ parallel trajectories)

- in the rare case that something keeps task_ratelimit far below
  dirty_ratelimit, the smoothness can no longer be retained and
  nr_dirtied_pause will be "dancing" with task_ratelimit

This fixes a (not that destructive, but still not good) bug:

	dirty_ratelimit gets brought down undesirably
	<= balanced_dirty_ratelimit is underestimated
	<= weakly executed task_ratelimit
	<= pause goes too large and gets trimmed down to max_pause
	<= nr_dirtied_pause (based on dirty_ratelimit) is set too large
	<= dirty_ratelimit being much larger than task_ratelimit

The "fix" type of changes are:

- introduce min_pause to avoid small pause sleeps

- when pause is trimmed down to max_pause, try to compensate for it at
  the next pause time

The "refactor" type of changes are:

The max_pause equation is slightly transformed to make it more
efficient.

We now scale target_pause by (N * 10ms) on 2^N concurrent tasks, which
is effectively equal to the original scaling of max_pause by (N * 20ms),
because the original code implicitly sets target_pause ~= max_pause / 2.
Based on the same implicit ratio, target_pause starts with 10ms on 1 dd.

CC: Jan Kara <jack@suse.cz>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
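To make the new arithmetic concrete, here is a small userspace model of
the max_pause and target-pause computations (a sketch only: HZ,
MAX_PAUSE and the sample bandwidth/dirty-page numbers below are
illustrative assumptions, not part of the patch):

#include <stdio.h>

#define HZ		1000		/* assumed jiffies per second */
#define MAX_PAUSE	(HZ / 5)	/* 200ms cap, as in the kernel */

/* stand-in for the kernel's roundup_pow_of_two() */
static long rounduppow2(long n)
{
	long r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

int main(void)
{
	long bw = 15360;	/* ~60MB/s in 4k pages/s (assumed) */
	long bdi_dirty = 7680;	/* ~30MB of dirty pages (assumed) */
	long hi = 13;		/* ilog2(bw) */
	long lo = 10;		/* ilog2(dirty_ratelimit), here ~8 dd's */
	long t, max_pause, target;

	/* max_pause: at most ~1/8 of the time needed to drain bdi_dirty */
	t = bdi_dirty / (1 + bw / rounduppow2(1 + HZ / 8)) + 1;
	max_pause = t < MAX_PAUSE ? t : MAX_PAUSE;

	/* target pause: 10ms for 1 dd, +10ms per doubling of dirtiers */
	target = HZ / 100 + (hi - lo) * (10 * HZ) / 1024;

	printf("max_pause=%ld target=%ld min=%ld (jiffies, 1ms each)\n",
	       max_pause, target, 1 + target / 2);
	/* prints: max_pause=64 target=39 min=20 */
	return 0;
}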
Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r--	mm/page-writeback.c	125
1 file changed, 81 insertions(+), 44 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 491932155825..5830991f261a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -962,40 +962,81 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
 	return 1;
 }
 
-static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
-				   unsigned long bdi_dirty)
+static long bdi_max_pause(struct backing_dev_info *bdi,
+			  unsigned long bdi_dirty)
+{
+	long bw = bdi->avg_write_bandwidth;
+	long t;
+
+	/*
+	 * Limit pause time for small memory systems. If sleeping for too long
+	 * time, a small pool of dirty/writeback pages may go empty and disk go
+	 * idle.
+	 *
+	 * 8 serves as the safety ratio.
+	 */
+	t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
+	t++;
+
+	return min_t(long, t, MAX_PAUSE);
+}
+
+static long bdi_min_pause(struct backing_dev_info *bdi,
+			  long max_pause,
+			  unsigned long task_ratelimit,
+			  unsigned long dirty_ratelimit,
+			  int *nr_dirtied_pause)
 {
-	unsigned long bw = bdi->avg_write_bandwidth;
-	unsigned long hi = ilog2(bw);
-	unsigned long lo = ilog2(bdi->dirty_ratelimit);
-	unsigned long t;
+	long hi = ilog2(bdi->avg_write_bandwidth);
+	long lo = ilog2(bdi->dirty_ratelimit);
+	long t;		/* target pause */
+	long pause;	/* estimated next pause */
+	int pages;	/* target nr_dirtied_pause */
 
-	/* target for 20ms max pause on 1-dd case */
-	t = HZ / 50;
+	/* target for 10ms pause on 1-dd case */
+	t = max(1, HZ / 100);
 
 	/*
 	 * Scale up pause time for concurrent dirtiers in order to reduce CPU
 	 * overheads.
 	 *
-	 * (N * 20ms) on 2^N concurrent tasks.
+	 * (N * 10ms) on 2^N concurrent tasks.
 	 */
 	if (hi > lo)
-		t += (hi - lo) * (20 * HZ) / 1024;
+		t += (hi - lo) * (10 * HZ) / 1024;
 
 	/*
-	 * Limit pause time for small memory systems. If sleeping for too long
-	 * time, a small pool of dirty/writeback pages may go empty and disk go
-	 * idle.
+	 * This is a bit convoluted. We try to base the next nr_dirtied_pause
+	 * on the much more stable dirty_ratelimit. However the next pause time
+	 * will be computed based on task_ratelimit and the two rate limits may
+	 * depart considerably at some time. Especially if task_ratelimit goes
+	 * below dirty_ratelimit/2 and the target pause is max_pause, the next
+	 * pause time will be max_pause*2 _trimmed down_ to max_pause. As a
+	 * result task_ratelimit won't be executed faithfully, which could
+	 * eventually bring down dirty_ratelimit.
 	 *
-	 * 8 serves as the safety ratio.
+	 * We apply two rules to fix it up:
+	 * 1) try to estimate the next pause time and if necessary, use a lower
+	 *    nr_dirtied_pause so as not to exceed max_pause. When this happens,
+	 *    nr_dirtied_pause will be "dancing" with task_ratelimit.
+	 * 2) limit the target pause time to max_pause/2, so that the normal
+	 *    small fluctuations of task_ratelimit won't trigger rule (1) and
+	 *    nr_dirtied_pause will remain as stable as dirty_ratelimit.
 	 */
-	t = min(t, bdi_dirty * HZ / (8 * bw + 1));
+	t = min(t, 1 + max_pause / 2);
+	pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
+
+	pause = HZ * pages / (task_ratelimit + 1);
+	if (pause > max_pause) {
+		t = max_pause;
+		pages = task_ratelimit * t / roundup_pow_of_two(HZ);
+	}
 
+	*nr_dirtied_pause = pages;
 	/*
-	 * The pause time will be settled within range (max_pause/4, max_pause).
-	 * Apply a minimal value of 4 to get a non-zero max_pause/4.
+	 * The minimal pause time will normally be half the target pause time.
 	 */
-	return clamp_val(t, 4, MAX_PAUSE);
+	return 1 + t / 2;
 }
 
 /*
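(Illustration, not part of the patch: a userspace model of the
bdi_min_pause() logic above, showing the two rules with concrete
numbers. HZ=1000 is assumed, roundup_pow_of_two(HZ) is hard-coded as
1024, and the sample ratelimits are made up for the walkthrough.)

#define HZ	1000	/* assumed jiffies per second */

static long min_pause_model(long max_pause, long task_ratelimit,
			    long dirty_ratelimit, int *nr_dirtied_pause)
{
	long t = HZ / 100;		/* 10ms target pause on 1 dd */
	long pages, pause;

	if (t > 1 + max_pause / 2)	/* rule (2): cap target at ~max_pause/2 */
		t = 1 + max_pause / 2;
	pages = dirty_ratelimit * t / 1024;	/* base pages on the stable rate */

	pause = HZ * pages / (task_ratelimit + 1);
	if (pause > max_pause) {	/* rule (1): use fewer pages instead */
		t = max_pause;
		pages = task_ratelimit * t / 1024;
	}
	*nr_dirtied_pause = pages;
	return 1 + t / 2;
}

/*
 * With max_pause=200 jiffies and dirty_ratelimit=1024 pages/s:
 *   task_ratelimit=1000: pages=10, est. pause=10000/1001=9   (pages stable)
 *   task_ratelimit=40:   est. pause=10000/41=243 > 200, so rule (1)
 *                        recomputes pages=40*200/1024=7 ("dancing")
 */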
@@ -1017,11 +1058,13 @@ static void balance_dirty_pages(struct address_space *mapping,
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
 	long period;
-	long pause = 0;
-	long uninitialized_var(max_pause);
+	long pause;
+	long max_pause;
+	long min_pause;
+	int nr_dirtied_pause;
 	bool dirty_exceeded = false;
 	unsigned long task_ratelimit;
-	unsigned long uninitialized_var(dirty_ratelimit);
+	unsigned long dirty_ratelimit;
 	unsigned long pos_ratio;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	unsigned long start_time = jiffies;
@@ -1051,6 +1094,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (nr_dirty <= freerun) {
 			current->dirty_paused_when = now;
 			current->nr_dirtied = 0;
+			current->nr_dirtied_pause =
+				dirty_poll_interval(nr_dirty, dirty_thresh);
 			break;
 		}
 
@@ -1101,14 +1146,17 @@ static void balance_dirty_pages(struct address_space *mapping,
 				    nr_dirty, bdi_thresh, bdi_dirty,
 				    start_time);
 
-		max_pause = bdi_max_pause(bdi, bdi_dirty);
-
 		dirty_ratelimit = bdi->dirty_ratelimit;
 		pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
 					       background_thresh, nr_dirty,
 					       bdi_thresh, bdi_dirty);
 		task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
 							RATELIMIT_CALC_SHIFT;
+		max_pause = bdi_max_pause(bdi, bdi_dirty);
+		min_pause = bdi_min_pause(bdi, max_pause,
+					  task_ratelimit, dirty_ratelimit,
+					  &nr_dirtied_pause);
+
 		if (unlikely(task_ratelimit == 0)) {
 			period = max_pause;
 			pause = max_pause;
@@ -1125,7 +1173,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * future periods by updating the virtual time; otherwise just
 		 * do a reset, as it may be a light dirtier.
 		 */
-		if (unlikely(pause <= 0)) {
+		if (pause < min_pause) {
 			trace_balance_dirty_pages(bdi,
 						  dirty_thresh,
 						  background_thresh,
@@ -1136,7 +1184,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 						  task_ratelimit,
 						  pages_dirtied,
 						  period,
-						  pause,
+						  min(pause, 0L),
 						  start_time);
 			if (pause < -HZ) {
 				current->dirty_paused_when = now;
@@ -1144,11 +1192,15 @@ static void balance_dirty_pages(struct address_space *mapping,
 			} else if (period) {
 				current->dirty_paused_when += period;
 				current->nr_dirtied = 0;
-			}
-			pause = 1; /* avoid resetting nr_dirtied_pause below */
+			} else if (current->nr_dirtied_pause <= pages_dirtied)
+				current->nr_dirtied_pause += pages_dirtied;
 			break;
 		}
-		pause = min(pause, max_pause);
+		if (unlikely(pause > max_pause)) {
+			/* for occasional dropped task_ratelimit */
+			now += min(pause - max_pause, max_pause);
+			pause = max_pause;
+		}
 
 pause:
 		trace_balance_dirty_pages(bdi,
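(Illustration, not part of the patch: the trimming branch above, with
assumed numbers, showing the "compensate at the next pause time"
behavior from the changelog.)

/* Suppose task_ratelimit dropped, so the computed pause (350 jiffies)
 * exceeds max_pause (200 jiffies); all values here are assumed. */
long now = 0, pause = 350, max_pause = 200;

if (pause > max_pause) {
	/* defer the surplus instead of dropping it */
	now += pause - max_pause < max_pause ?
	       pause - max_pause : max_pause;	/* now advances by 150 */
	pause = max_pause;			/* actually sleep only 200 */
}
/* dirty_paused_when = now + pause = 350: the 150 un-slept jiffies are
 * still charged to the task, which shortens its next pause. */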
@@ -1168,6 +1220,7 @@ pause:
 
 		current->dirty_paused_when = now + pause;
 		current->nr_dirtied = 0;
+		current->nr_dirtied_pause = nr_dirtied_pause;
 
 		/*
 		 * This is typically equal to (nr_dirty < dirty_thresh) and can
@@ -1196,22 +1249,6 @@ pause:
 		if (!dirty_exceeded && bdi->dirty_exceeded)
 			bdi->dirty_exceeded = 0;
 
-		if (pause == 0) { /* in freerun area */
-			current->nr_dirtied_pause =
-				dirty_poll_interval(nr_dirty, dirty_thresh);
-		} else if (period <= max_pause / 4 &&
-			   pages_dirtied >= current->nr_dirtied_pause) {
-			current->nr_dirtied_pause = clamp_val(
-				dirty_ratelimit * (max_pause / 2) / HZ,
-				pages_dirtied + pages_dirtied / 8,
-				pages_dirtied * 4);
-		} else if (pause >= max_pause) {
-			current->nr_dirtied_pause = 1 | clamp_val(
-				dirty_ratelimit * (max_pause / 2) / HZ,
-				pages_dirtied / 4,
-				pages_dirtied - pages_dirtied / 8);
-		}
-
 		if (writeback_in_progress(bdi))
 			return;
 