about summary refs log tree commit diff stats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/page-writeback.c 246
1 files changed, 191 insertions, 55 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5cdd4f2b0c9..363ba7082ef 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -42,6 +42,12 @@
42#define MAX_PAUSE max(HZ/5, 1) 42#define MAX_PAUSE max(HZ/5, 1)
43 43
44/* 44/*
45 * Try to keep balance_dirty_pages() call intervals higher than this many pages
46 * by raising pause time to max_pause when falls below it.
47 */
48#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))
49
50/*
45 * Estimate write bandwidth at 200ms intervals. 51 * Estimate write bandwidth at 200ms intervals.
46 */ 52 */
47#define BANDWIDTH_INTERVAL max(HZ/5, 1) 53#define BANDWIDTH_INTERVAL max(HZ/5, 1)
@@ -898,6 +904,11 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
898 */ 904 */
899 balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, 905 balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
900 dirty_rate | 1); 906 dirty_rate | 1);
907 /*
908 * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
909 */
910 if (unlikely(balanced_dirty_ratelimit > write_bw))
911 balanced_dirty_ratelimit = write_bw;
901 912
902 /* 913 /*
903 * We could safely do this and return immediately: 914 * We could safely do this and return immediately:
@@ -1044,40 +1055,98 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
1044 return 1; 1055 return 1;
1045} 1056}
1046 1057
1047static unsigned long bdi_max_pause(struct backing_dev_info *bdi, 1058static long bdi_max_pause(struct backing_dev_info *bdi,
1048 unsigned long bdi_dirty) 1059 unsigned long bdi_dirty)
1060{
1061 long bw = bdi->avg_write_bandwidth;
1062 long t;
1063
1064 /*
1065 * Limit pause time for small memory systems. If sleeping for too long
1066 * time, a small pool of dirty/writeback pages may go empty and disk go
1067 * idle.
1068 *
1069 * 8 serves as the safety ratio.
1070 */
1071 t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
1072 t++;
1073
1074 return min_t(long, t, MAX_PAUSE);
1075}
1076
1077static long bdi_min_pause(struct backing_dev_info *bdi,
1078 long max_pause,
1079 unsigned long task_ratelimit,
1080 unsigned long dirty_ratelimit,
1081 int *nr_dirtied_pause)
1049{ 1082{
1050 unsigned long bw = bdi->avg_write_bandwidth; 1083 long hi = ilog2(bdi->avg_write_bandwidth);
1051 unsigned long hi = ilog2(bw); 1084 long lo = ilog2(bdi->dirty_ratelimit);
1052 unsigned long lo = ilog2(bdi->dirty_ratelimit); 1085 long t; /* target pause */
1053 unsigned long t; 1086 long pause; /* estimated next pause */
1087 int pages; /* target nr_dirtied_pause */
1054 1088
1055 /* target for 20ms max pause on 1-dd case */ 1089 /* target for 10ms pause on 1-dd case */
1056 t = HZ / 50; 1090 t = max(1, HZ / 100);
1057 1091
1058 /* 1092 /*
1059 * Scale up pause time for concurrent dirtiers in order to reduce CPU 1093 * Scale up pause time for concurrent dirtiers in order to reduce CPU
1060 * overheads. 1094 * overheads.
1061 * 1095 *
1062 * (N * 20ms) on 2^N concurrent tasks. 1096 * (N * 10ms) on 2^N concurrent tasks.
1063 */ 1097 */
1064 if (hi > lo) 1098 if (hi > lo)
1065 t += (hi - lo) * (20 * HZ) / 1024; 1099 t += (hi - lo) * (10 * HZ) / 1024;
1066 1100
1067 /* 1101 /*
1068 * Limit pause time for small memory systems. If sleeping for too long 1102 * This is a bit convoluted. We try to base the next nr_dirtied_pause
1069 * time, a small pool of dirty/writeback pages may go empty and disk go 1103 * on the much more stable dirty_ratelimit. However the next pause time
1070 * idle. 1104 * will be computed based on task_ratelimit and the two rate limits may
1105 * depart considerably at some time. Especially if task_ratelimit goes
1106 * below dirty_ratelimit/2 and the target pause is max_pause, the next
1107 * pause time will be max_pause*2 _trimmed down_ to max_pause. As a
1108 * result task_ratelimit won't be executed faithfully, which could
1109 * eventually bring down dirty_ratelimit.
1071 * 1110 *
1072 * 8 serves as the safety ratio. 1111 * We apply two rules to fix it up:
1112 * 1) try to estimate the next pause time and if necessary, use a lower
1113 * nr_dirtied_pause so as not to exceed max_pause. When this happens,
1114 * nr_dirtied_pause will be "dancing" with task_ratelimit.
1115 * 2) limit the target pause time to max_pause/2, so that the normal
1116 * small fluctuations of task_ratelimit won't trigger rule (1) and
1117 * nr_dirtied_pause will remain as stable as dirty_ratelimit.
1073 */ 1118 */
1074 t = min(t, bdi_dirty * HZ / (8 * bw + 1)); 1119 t = min(t, 1 + max_pause / 2);
1120 pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
1075 1121
1076 /* 1122 /*
1077 * The pause time will be settled within range (max_pause/4, max_pause). 1123 * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
1078 * Apply a minimal value of 4 to get a non-zero max_pause/4. 1124 * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
1125 * When the 16 consecutive reads are often interrupted by some dirty
1126 * throttling pause during the async writes, cfq will go into idles
1127 * (deadline is fine). So push nr_dirtied_pause as high as possible
1128 * until reaches DIRTY_POLL_THRESH=32 pages.
1079 */ 1129 */
1080 return clamp_val(t, 4, MAX_PAUSE); 1130 if (pages < DIRTY_POLL_THRESH) {
1131 t = max_pause;
1132 pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
1133 if (pages > DIRTY_POLL_THRESH) {
1134 pages = DIRTY_POLL_THRESH;
1135 t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
1136 }
1137 }
1138
1139 pause = HZ * pages / (task_ratelimit + 1);
1140 if (pause > max_pause) {
1141 t = max_pause;
1142 pages = task_ratelimit * t / roundup_pow_of_two(HZ);
1143 }
1144
1145 *nr_dirtied_pause = pages;
1146 /*
1147 * The minimal pause time will normally be half the target pause time.
1148 */
1149 return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
1081} 1150}
1082 1151
1083/* 1152/*
@@ -1098,16 +1167,21 @@ static void balance_dirty_pages(struct address_space *mapping,
1098 unsigned long background_thresh; 1167 unsigned long background_thresh;
1099 unsigned long dirty_thresh; 1168 unsigned long dirty_thresh;
1100 unsigned long bdi_thresh; 1169 unsigned long bdi_thresh;
1101 long pause = 0; 1170 long period;
1102 long uninitialized_var(max_pause); 1171 long pause;
1172 long max_pause;
1173 long min_pause;
1174 int nr_dirtied_pause;
1103 bool dirty_exceeded = false; 1175 bool dirty_exceeded = false;
1104 unsigned long task_ratelimit; 1176 unsigned long task_ratelimit;
1105 unsigned long uninitialized_var(dirty_ratelimit); 1177 unsigned long dirty_ratelimit;
1106 unsigned long pos_ratio; 1178 unsigned long pos_ratio;
1107 struct backing_dev_info *bdi = mapping->backing_dev_info; 1179 struct backing_dev_info *bdi = mapping->backing_dev_info;
1108 unsigned long start_time = jiffies; 1180 unsigned long start_time = jiffies;
1109 1181
1110 for (;;) { 1182 for (;;) {
1183 unsigned long now = jiffies;
1184
1111 /* 1185 /*
1112 * Unstable writes are a feature of certain networked 1186 * Unstable writes are a feature of certain networked
1113 * filesystems (i.e. NFS) in which data may have been 1187 * filesystems (i.e. NFS) in which data may have been
@@ -1127,8 +1201,13 @@ static void balance_dirty_pages(struct address_space *mapping,
1127 */ 1201 */
1128 freerun = dirty_freerun_ceiling(dirty_thresh, 1202 freerun = dirty_freerun_ceiling(dirty_thresh,
1129 background_thresh); 1203 background_thresh);
1130 if (nr_dirty <= freerun) 1204 if (nr_dirty <= freerun) {
1205 current->dirty_paused_when = now;
1206 current->nr_dirtied = 0;
1207 current->nr_dirtied_pause =
1208 dirty_poll_interval(nr_dirty, dirty_thresh);
1131 break; 1209 break;
1210 }
1132 1211
1133 if (unlikely(!writeback_in_progress(bdi))) 1212 if (unlikely(!writeback_in_progress(bdi)))
1134 bdi_start_background_writeback(bdi); 1213 bdi_start_background_writeback(bdi);
@@ -1168,7 +1247,7 @@ static void balance_dirty_pages(struct address_space *mapping,
1168 bdi_stat(bdi, BDI_WRITEBACK); 1247 bdi_stat(bdi, BDI_WRITEBACK);
1169 } 1248 }
1170 1249
1171 dirty_exceeded = (bdi_dirty > bdi_thresh) || 1250 dirty_exceeded = (bdi_dirty > bdi_thresh) &&
1172 (nr_dirty > dirty_thresh); 1251 (nr_dirty > dirty_thresh);
1173 if (dirty_exceeded && !bdi->dirty_exceeded) 1252 if (dirty_exceeded && !bdi->dirty_exceeded)
1174 bdi->dirty_exceeded = 1; 1253 bdi->dirty_exceeded = 1;
@@ -1177,20 +1256,34 @@ static void balance_dirty_pages(struct address_space *mapping,
1177 nr_dirty, bdi_thresh, bdi_dirty, 1256 nr_dirty, bdi_thresh, bdi_dirty,
1178 start_time); 1257 start_time);
1179 1258
1180 max_pause = bdi_max_pause(bdi, bdi_dirty);
1181
1182 dirty_ratelimit = bdi->dirty_ratelimit; 1259 dirty_ratelimit = bdi->dirty_ratelimit;
1183 pos_ratio = bdi_position_ratio(bdi, dirty_thresh, 1260 pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
1184 background_thresh, nr_dirty, 1261 background_thresh, nr_dirty,
1185 bdi_thresh, bdi_dirty); 1262 bdi_thresh, bdi_dirty);
1186 task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> 1263 task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
1187 RATELIMIT_CALC_SHIFT; 1264 RATELIMIT_CALC_SHIFT;
1265 max_pause = bdi_max_pause(bdi, bdi_dirty);
1266 min_pause = bdi_min_pause(bdi, max_pause,
1267 task_ratelimit, dirty_ratelimit,
1268 &nr_dirtied_pause);
1269
1188 if (unlikely(task_ratelimit == 0)) { 1270 if (unlikely(task_ratelimit == 0)) {
1271 period = max_pause;
1189 pause = max_pause; 1272 pause = max_pause;
1190 goto pause; 1273 goto pause;
1191 } 1274 }
1192 pause = HZ * pages_dirtied / task_ratelimit; 1275 period = HZ * pages_dirtied / task_ratelimit;
1193 if (unlikely(pause <= 0)) { 1276 pause = period;
1277 if (current->dirty_paused_when)
1278 pause -= now - current->dirty_paused_when;
1279 /*
1280 * For less than 1s think time (ext3/4 may block the dirtier
1281 * for up to 800ms from time to time on 1-HDD; so does xfs,
1282 * however at much less frequency), try to compensate it in
1283 * future periods by updating the virtual time; otherwise just
1284 * do a reset, as it may be a light dirtier.
1285 */
1286 if (pause < min_pause) {
1194 trace_balance_dirty_pages(bdi, 1287 trace_balance_dirty_pages(bdi,
1195 dirty_thresh, 1288 dirty_thresh,
1196 background_thresh, 1289 background_thresh,
@@ -1200,12 +1293,24 @@ static void balance_dirty_pages(struct address_space *mapping,
1200 dirty_ratelimit, 1293 dirty_ratelimit,
1201 task_ratelimit, 1294 task_ratelimit,
1202 pages_dirtied, 1295 pages_dirtied,
1203 pause, 1296 period,
1297 min(pause, 0L),
1204 start_time); 1298 start_time);
1205 pause = 1; /* avoid resetting nr_dirtied_pause below */ 1299 if (pause < -HZ) {
1300 current->dirty_paused_when = now;
1301 current->nr_dirtied = 0;
1302 } else if (period) {
1303 current->dirty_paused_when += period;
1304 current->nr_dirtied = 0;
1305 } else if (current->nr_dirtied_pause <= pages_dirtied)
1306 current->nr_dirtied_pause += pages_dirtied;
1206 break; 1307 break;
1207 } 1308 }
1208 pause = min(pause, max_pause); 1309 if (unlikely(pause > max_pause)) {
1310 /* for occasional dropped task_ratelimit */
1311 now += min(pause - max_pause, max_pause);
1312 pause = max_pause;
1313 }
1209 1314
1210pause: 1315pause:
1211 trace_balance_dirty_pages(bdi, 1316 trace_balance_dirty_pages(bdi,
@@ -1217,11 +1322,16 @@ pause:
1217 dirty_ratelimit, 1322 dirty_ratelimit,
1218 task_ratelimit, 1323 task_ratelimit,
1219 pages_dirtied, 1324 pages_dirtied,
1325 period,
1220 pause, 1326 pause,
1221 start_time); 1327 start_time);
1222 __set_current_state(TASK_KILLABLE); 1328 __set_current_state(TASK_KILLABLE);
1223 io_schedule_timeout(pause); 1329 io_schedule_timeout(pause);
1224 1330
1331 current->dirty_paused_when = now + pause;
1332 current->nr_dirtied = 0;
1333 current->nr_dirtied_pause = nr_dirtied_pause;
1334
1225 /* 1335 /*
1226 * This is typically equal to (nr_dirty < dirty_thresh) and can 1336 * This is typically equal to (nr_dirty < dirty_thresh) and can
1227 * also keep "1000+ dd on a slow USB stick" under control. 1337 * also keep "1000+ dd on a slow USB stick" under control.
@@ -1249,23 +1359,6 @@ pause:
1249 if (!dirty_exceeded && bdi->dirty_exceeded) 1359 if (!dirty_exceeded && bdi->dirty_exceeded)
1250 bdi->dirty_exceeded = 0; 1360 bdi->dirty_exceeded = 0;
1251 1361
1252 current->nr_dirtied = 0;
1253 if (pause == 0) { /* in freerun area */
1254 current->nr_dirtied_pause =
1255 dirty_poll_interval(nr_dirty, dirty_thresh);
1256 } else if (pause <= max_pause / 4 &&
1257 pages_dirtied >= current->nr_dirtied_pause) {
1258 current->nr_dirtied_pause = clamp_val(
1259 dirty_ratelimit * (max_pause / 2) / HZ,
1260 pages_dirtied + pages_dirtied / 8,
1261 pages_dirtied * 4);
1262 } else if (pause >= max_pause) {
1263 current->nr_dirtied_pause = 1 | clamp_val(
1264 dirty_ratelimit * (max_pause / 2) / HZ,
1265 pages_dirtied / 4,
1266 pages_dirtied - pages_dirtied / 8);
1267 }
1268
1269 if (writeback_in_progress(bdi)) 1362 if (writeback_in_progress(bdi))
1270 return; 1363 return;
1271 1364
@@ -1296,6 +1389,22 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
1296 1389
1297static DEFINE_PER_CPU(int, bdp_ratelimits); 1390static DEFINE_PER_CPU(int, bdp_ratelimits);
1298 1391
1392/*
1393 * Normal tasks are throttled by
1394 * loop {
1395 * dirty tsk->nr_dirtied_pause pages;
1396 * take a snap in balance_dirty_pages();
1397 * }
1398 * However there is a worst case. If every task exit immediately when dirtied
1399 * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
1400 * called to throttle the page dirties. The solution is to save the not yet
1401 * throttled page dirties in dirty_throttle_leaks on task exit and charge them
1402 * randomly into the running tasks. This works well for the above worst case,
1403 * as the new task will pick up and accumulate the old task's leaked dirty
1404 * count and eventually get throttled.
1405 */
1406DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1407
1299/** 1408/**
1300 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 1409 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
1301 * @mapping: address_space which was dirtied 1410 * @mapping: address_space which was dirtied
@@ -1324,8 +1433,6 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1324 if (bdi->dirty_exceeded) 1433 if (bdi->dirty_exceeded)
1325 ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); 1434 ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
1326 1435
1327 current->nr_dirtied += nr_pages_dirtied;
1328
1329 preempt_disable(); 1436 preempt_disable();
1330 /* 1437 /*
1331 * This prevents one CPU to accumulate too many dirtied pages without 1438 * This prevents one CPU to accumulate too many dirtied pages without
@@ -1336,12 +1443,20 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1336 p = &__get_cpu_var(bdp_ratelimits); 1443 p = &__get_cpu_var(bdp_ratelimits);
1337 if (unlikely(current->nr_dirtied >= ratelimit)) 1444 if (unlikely(current->nr_dirtied >= ratelimit))
1338 *p = 0; 1445 *p = 0;
1339 else { 1446 else if (unlikely(*p >= ratelimit_pages)) {
1340 *p += nr_pages_dirtied; 1447 *p = 0;
1341 if (unlikely(*p >= ratelimit_pages)) { 1448 ratelimit = 0;
1342 *p = 0; 1449 }
1343 ratelimit = 0; 1450 /*
1344 } 1451 * Pick up the dirtied pages by the exited tasks. This avoids lots of
1452 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
1453 * the dirty throttling and livelock other long-run dirtiers.
1454 */
1455 p = &__get_cpu_var(dirty_throttle_leaks);
1456 if (*p > 0 && current->nr_dirtied < ratelimit) {
1457 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
1458 *p -= nr_pages_dirtied;
1459 current->nr_dirtied += nr_pages_dirtied;
1345 } 1460 }
1346 preempt_enable(); 1461 preempt_enable();
1347 1462
@@ -1823,6 +1938,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1823 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 1938 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1824 __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); 1939 __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
1825 task_io_account_write(PAGE_CACHE_SIZE); 1940 task_io_account_write(PAGE_CACHE_SIZE);
1941 current->nr_dirtied++;
1942 this_cpu_inc(bdp_ratelimits);
1826 } 1943 }
1827} 1944}
1828EXPORT_SYMBOL(account_page_dirtied); 1945EXPORT_SYMBOL(account_page_dirtied);
@@ -1883,6 +2000,24 @@ int __set_page_dirty_nobuffers(struct page *page)
1883EXPORT_SYMBOL(__set_page_dirty_nobuffers); 2000EXPORT_SYMBOL(__set_page_dirty_nobuffers);
1884 2001
1885/* 2002/*
2003 * Call this whenever redirtying a page, to de-account the dirty counters
2004 * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
2005 * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to
2006 * systematic errors in balanced_dirty_ratelimit and the dirty pages position
2007 * control.
2008 */
2009void account_page_redirty(struct page *page)
2010{
2011 struct address_space *mapping = page->mapping;
2012 if (mapping && mapping_cap_account_dirty(mapping)) {
2013 current->nr_dirtied--;
2014 dec_zone_page_state(page, NR_DIRTIED);
2015 dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
2016 }
2017}
2018EXPORT_SYMBOL(account_page_redirty);
2019
2020/*
1886 * When a writepage implementation decides that it doesn't want to write this 2021 * When a writepage implementation decides that it doesn't want to write this
1887 * page for some reason, it should redirty the locked page via 2022 * page for some reason, it should redirty the locked page via
1888 * redirty_page_for_writepage() and it should then unlock the page and return 0 2023 * redirty_page_for_writepage() and it should then unlock the page and return 0
@@ -1890,6 +2025,7 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers);
1890int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) 2025int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
1891{ 2026{
1892 wbc->pages_skipped++; 2027 wbc->pages_skipped++;
2028 account_page_redirty(page);
1893 return __set_page_dirty_nobuffers(page); 2029 return __set_page_dirty_nobuffers(page);
1894} 2030}
1895EXPORT_SYMBOL(redirty_page_for_writepage); 2031EXPORT_SYMBOL(redirty_page_for_writepage);