path: root/mm/page-writeback.c
author     Maxim Patlasov <mpatlasov@parallels.com>    2013-09-11 17:22:46 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2013-09-11 18:58:04 -0400
commit     5a53748568f79641eaf40e41081a2f4987f005c2 (patch)
tree       929e07be4f378f96398110dce35a64b61e1505d7 /mm/page-writeback.c
parent     4c3bffc272755c98728c2b58b1a8148cf9e9fd1f (diff)
mm/page-writeback.c: add strictlimit feature
The feature prevents mistrusted filesystems (i.e. FUSE mounts created by unprivileged users) from growing a large number of dirty pages before throttling. For such filesystems balance_dirty_pages always checks bdi counters against bdi limits: even if the global "nr_dirty" is under "freerun", it is not allowed to skip the bdi checks. The only use case for now is fuse: it sets bdi max_ratio to 1% by default, and system administrators are supposed to expect that this limit won't be exceeded.

The feature is on if a BDI is marked by the BDI_CAP_STRICTLIMIT flag. A filesystem may set the flag when it initializes its BDI.

The problematic scenario comes from the fact that nobody pays attention to the NR_WRITEBACK_TEMP counter (i.e. the number of pages under fuse writeback). The implementation of fuse writeback releases the original page (by calling end_page_writeback) almost immediately; a fuse request queued for real processing bears a copy of the original page. Hence, if the userspace fuse daemon doesn't finalize write requests in a timely manner, an aggressive mmap writer can pollute virtually all memory with those temporary fuse page copies. They are carefully accounted in NR_WRITEBACK_TEMP, but nobody cares.

To make further explanations shorter, let me use "NR_WRITEBACK_TEMP problem" as a shortcut for "the possibility of uncontrolled growth of the amount of RAM consumed by temporary pages allocated by kernel fuse to process writeback".

The problem was very easy to reproduce. There is a trivial example filesystem implementation in the fuse userspace distribution: fusexmp_fh.c. I added "sleep(1);" to the write methods, then recompiled and mounted it. Then I created a huge file on the mount point and ran a simple program which mmap-ed the file to a memory region, then wrote data to the region. An hour later I observed almost all RAM consumed by fuse writeback. Since then some unrelated changes in kernel fuse have made it more difficult to reproduce, but it is still possible now.

Putting this theoretical happens-in-the-lab thing aside, there is another thing that really hurts real world (FUSE) users: the write-through page cache policy FUSE currently uses. Handling write(2), kernel fuse populates the page cache and flushes user data to the server synchronously. This is excessively suboptimal. Pavel Emelyanov's patches ("writeback cache policy") solve that problem, but they also make resolving the NR_WRITEBACK_TEMP problem absolutely necessary; otherwise, simply copying a huge file to a fuse mount would result in memory starvation. Miklos, the maintainer of FUSE, believes the strictlimit feature is the way to go.

And eventually, putting FUSE topics aside, there is one more use case for the strictlimit feature. Using a slow USB stick (mass storage) in a machine with a huge amount of RAM installed is a well-known pain. Let's make some simple computations. Assuming 64GB of RAM installed, the existing implementation of balance_dirty_pages will start throttling only after 9.6GB of RAM becomes dirty (freerun == 15% of total RAM). So the command "cp 9GB_file /media/my-usb-storage/" may return in a few seconds, but a subsequent "umount /media/my-usb-storage/" will take more than two hours if the effective throughput of the storage is, say, 1MB/sec.

After inclusion of the strictlimit feature, it will be trivial to add a knob (e.g. /sys/devices/virtual/bdi/x:y/strictlimit) to enable it on demand, manually or via a udev rule.

Maybe I'm wrong, but it seems quite a natural desire to limit the amount of dirty memory for devices we do not fully trust (in the sense of sustainable throughput).

[akpm@linux-foundation.org: fix warning in page-writeback.c]
Signed-off-by: Maxim Patlasov <MPatlasov@parallels.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
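[Editorial note] Opting in is a one-flag change at BDI initialization time. A minimal sketch of a filesystem's BDI setup follows; the surrounding function and structure names are hypothetical, and only BDI_CAP_STRICTLIMIT comes from this patch series:

/*
 * Sketch only: "example_conn" and example_setup_bdi() are hypothetical.
 * BDI_CAP_STRICTLIMIT is the capability flag introduced by this patch.
 */
static int example_setup_bdi(struct example_conn *ec)
{
	int err;

	ec->bdi.name = "example_fs";
	/* throttle against bdi counters even while global dirty is in freerun */
	ec->bdi.capabilities |= BDI_CAP_STRICTLIMIT;

	err = bdi_init(&ec->bdi);
	if (err)
		return err;

	return bdi_register(&ec->bdi, NULL, "example_fs-%d", ec->id);
}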
Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r--    mm/page-writeback.c    263
1 file changed, 202 insertions(+), 61 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3750431b3cd8..6c7b0187be8e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -585,6 +585,37 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
 }
 
 /*
+ *                           setpoint - dirty 3
+ *        f(dirty) := 1.0 + (----------------)
+ *                           limit - setpoint
+ *
+ * it's a 3rd order polynomial that subjects to
+ *
+ * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
+ * (2) f(setpoint) = 1.0 => the balance point
+ * (3) f(limit)    = 0   => the hard limit
+ * (4) df/dx      <= 0   => negative feedback control
+ * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
+ *     => fast response on large errors; small oscillation near setpoint
+ */
+static inline long long pos_ratio_polynom(unsigned long setpoint,
+					  unsigned long dirty,
+					  unsigned long limit)
+{
+	long long pos_ratio;
+	long x;
+
+	x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
+		    limit - setpoint + 1);
+	pos_ratio = x;
+	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
+	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
+	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
+
+	return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
+}
+
+/*
  * Dirty position control.
  *
  * (o) global/bdi setpoints
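[Editorial note] As a quick sanity check of properties (1)-(3) above, the polynomial can be exercised in plain userspace C. A sketch under stated assumptions: div_s64() becomes plain division, clamp() is open-coded, RATELIMIT_CALC_SHIFT is 10 in mm/page-writeback.c, and right-shifting negative values is assumed to be an arithmetic shift, as the kernel code itself assumes:

#include <stdio.h>

#define SHIFT 10	/* RATELIMIT_CALC_SHIFT in mm/page-writeback.c */

/* userspace re-statement of pos_ratio_polynom() for experimentation */
static long long pos_ratio_polynom(long setpoint, long dirty, long limit)
{
	long long pos_ratio;
	long long x;

	x = ((long long)(setpoint - dirty) << SHIFT) / (limit - setpoint + 1);
	pos_ratio = x;
	pos_ratio = pos_ratio * x >> SHIFT;
	pos_ratio = pos_ratio * x >> SHIFT;
	pos_ratio += 1 << SHIFT;

	if (pos_ratio < 0)
		return 0;
	if (pos_ratio > 2LL << SHIFT)
		return 2LL << SHIFT;
	return pos_ratio;
}

int main(void)
{
	long freerun = 1000, limit = 3000;
	long setpoint = (freerun + limit) / 2;	/* 2000 */

	/* prints roughly 2.00, 1.00 and 0.00 (up to integer rounding) */
	printf("f(freerun)  = %.2f\n",
	       pos_ratio_polynom(setpoint, freerun, limit) / (double)(1 << SHIFT));
	printf("f(setpoint) = %.2f\n",
	       pos_ratio_polynom(setpoint, setpoint, limit) / (double)(1 << SHIFT));
	printf("f(limit)    = %.2f\n",
	       pos_ratio_polynom(setpoint, limit, limit) / (double)(1 << SHIFT));
	return 0;
}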
@@ -682,26 +713,80 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
 	/*
 	 * global setpoint
 	 *
-	 *                           setpoint - dirty 3
-	 *        f(dirty) := 1.0 + (----------------)
-	 *                           limit - setpoint
+	 * See comment for pos_ratio_polynom().
+	 */
+	setpoint = (freerun + limit) / 2;
+	pos_ratio = pos_ratio_polynom(setpoint, dirty, limit);
+
+	/*
+	 * The strictlimit feature is a tool preventing mistrusted filesystems
+	 * from growing a large number of dirty pages before throttling. For
+	 * such filesystems balance_dirty_pages always checks bdi counters
+	 * against bdi limits. Even if global "nr_dirty" is under "freerun".
+	 * This is especially important for fuse which sets bdi->max_ratio to
+	 * 1% by default. Without the strictlimit feature, fuse writeback may
+	 * consume an arbitrary amount of RAM because it is accounted in
+	 * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
 	 *
-	 * it's a 3rd order polynomial that subjects to
+	 * Here, in bdi_position_ratio(), we calculate pos_ratio based on
+	 * two values: bdi_dirty and bdi_thresh. Let's consider an example:
+	 * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
+	 * limits are set by default to 10% and 20% (background and throttle).
+	 * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
+	 * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
+	 * about ~6K pages (as the average of background and throttle bdi
+	 * limits). The 3rd order polynomial will provide positive feedback if
+	 * bdi_dirty is under bdi_setpoint and vice versa.
 	 *
-	 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
-	 * (2) f(setpoint) = 1.0 => the balance point
-	 * (3) f(limit)    = 0   => the hard limit
-	 * (4) df/dx      <= 0   => negative feedback control
-	 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
-	 *     => fast response on large errors; small oscillation near setpoint
+	 * Note that we cannot use global counters in these calculations
+	 * because we want to throttle a process writing to a strictlimit BDI
+	 * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
+	 * in the example above).
 	 */
-	setpoint = (freerun + limit) / 2;
-	x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
-		    limit - setpoint + 1);
-	pos_ratio = x;
-	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
-	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
-	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
+	if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+		long long bdi_pos_ratio;
+		unsigned long bdi_bg_thresh;
+
+		if (bdi_dirty < 8)
+			return min_t(long long, pos_ratio * 2,
+				     2 << RATELIMIT_CALC_SHIFT);
+
+		if (bdi_dirty >= bdi_thresh)
+			return 0;
+
+		bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
+		bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
+						     bdi_bg_thresh);
+
+		if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh)
+			return 0;
+
+		bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty,
+						  bdi_thresh);
+
+		/*
+		 * Typically, for the strictlimit case, bdi_setpoint << setpoint
+		 * and pos_ratio >> bdi_pos_ratio. In other words, the global
+		 * state ("dirty") is not the limiting factor and we have to
+		 * make the decision based on bdi counters. But there is an
+		 * important case when global pos_ratio should get precedence:
+		 * global limits are exceeded (e.g. due to activities on other
+		 * BDIs) while the given strictlimit BDI is below its limit.
+		 *
+		 * "pos_ratio * bdi_pos_ratio" would work for the case above,
+		 * but it would look too non-natural for the case of all
+		 * activity in the system coming from a single strictlimit BDI
+		 * with bdi->max_ratio == 100%.
+		 *
+		 * Note that min() below somewhat changes the dynamics of the
+		 * control system. Normally, pos_ratio value can be well over 3
+		 * (when globally we are at freerun and bdi is well below bdi
+		 * setpoint). Now the maximum pos_ratio in the same situation
+		 * is 2. We might want to tweak this if we observe the control
+		 * system is too slow to adapt.
+		 */
+		return min(pos_ratio, bdi_pos_ratio);
+	}
 
 	/*
 	 * We have computed basic pos_ratio above based on global situation. If
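[Editorial note] Making the 16GB example in the new comment concrete (assuming 4KB pages, vm.dirty_ratio = 20 and vm.dirty_background_ratio = 10):

	bdi_thresh      = 1% of 20% of 16GB          = 32MB  ~ 8K pages
	bdi bg limit    = 1% of 10% of 16GB          = 16MB  ~ 4K pages
	bdi_setpoint    = (8K + 4K) / 2 pages        = 6K pages ~ 24MB  (the "~23MB")
	global freerun  = ((10% + 20%) / 2) of 16GB  = 2.4GB            (the "~2.3GB")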
@@ -994,6 +1079,27 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 	 * keep that period small to reduce time lags).
 	 */
 	step = 0;
+
+	/*
+	 * For strictlimit case, calculations above were based on bdi counters
+	 * and limits (starting from pos_ratio = bdi_position_ratio() and up to
+	 * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
+	 * Hence, to calculate "step" properly, we have to use bdi_dirty as
+	 * "dirty" and bdi_setpoint as "setpoint".
+	 *
+	 * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
+	 * it's possible that bdi_thresh is close to zero due to inactivity
+	 * of backing device (see the implementation of bdi_dirty_limit()).
+	 */
+	if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+		dirty = bdi_dirty;
+		if (bdi_dirty < 8)
+			setpoint = bdi_dirty + 1;
+		else
+			setpoint = (bdi_thresh +
+				    bdi_dirty_limit(bdi, bg_thresh)) / 2;
+	}
+
 	if (dirty < setpoint) {
 		x = min(bdi->balanced_dirty_ratelimit,
 			min(balanced_dirty_ratelimit, task_ratelimit));
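[Editorial note] To give the formula quoted in the new comment a feel with hypothetical numbers: with task_ratelimit = 10MB/s, measured write_bw = 1MB/s and dirty_rate = 2MB/s, balanced_dirty_ratelimit = 10 * 1 / 2 = 5MB/s, i.e. the ratelimit is steered down toward what the device actually sustains. In the strictlimit case this steering is driven by bdi_dirty and bdi_setpoint instead of the global counters.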
@@ -1198,6 +1304,56 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
 	return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
 }
 
+static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
+				    unsigned long dirty_thresh,
+				    unsigned long background_thresh,
+				    unsigned long *bdi_dirty,
+				    unsigned long *bdi_thresh,
+				    unsigned long *bdi_bg_thresh)
+{
+	unsigned long bdi_reclaimable;
+
+	/*
+	 * bdi_thresh is not treated as some limiting factor as
+	 * dirty_thresh, due to reasons
+	 * - in JBOD setup, bdi_thresh can fluctuate a lot
+	 * - in a system with HDD and USB key, the USB key may somehow
+	 *   go into state (bdi_dirty >> bdi_thresh) either because
+	 *   bdi_dirty starts high, or because bdi_thresh drops low.
+	 *   In this case we don't want to hard throttle the USB key
+	 *   dirtiers for 100 seconds until bdi_dirty drops under
+	 *   bdi_thresh. Instead the auxiliary bdi control line in
+	 *   bdi_position_ratio() will let the dirtier task progress
+	 *   at some rate <= (write_bw / 2) for bringing down bdi_dirty.
+	 */
+	*bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+
+	if (bdi_bg_thresh)
+		*bdi_bg_thresh = div_u64((u64)*bdi_thresh *
+					 background_thresh,
+					 dirty_thresh);
+
+	/*
+	 * In order to avoid the stacked BDI deadlock we need
+	 * to ensure we accurately count the 'dirty' pages when
+	 * the threshold is low.
+	 *
+	 * Otherwise it would be possible to get thresh+n pages
+	 * reported dirty, even though there are thresh-m pages
+	 * actually dirty; with m+n sitting in the percpu
+	 * deltas.
+	 */
+	if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
+		bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+		*bdi_dirty = bdi_reclaimable +
+			     bdi_stat_sum(bdi, BDI_WRITEBACK);
+	} else {
+		bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+		*bdi_dirty = bdi_reclaimable +
+			     bdi_stat(bdi, BDI_WRITEBACK);
+	}
+}
+
 /*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data. It looks at the number of dirty pages in the machine and will force
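[Editorial note] A usage note on the proportional scaling in bdi_dirty_limits() (numbers hypothetical): with the default 20%/10% throttle/background split, *bdi_bg_thresh = *bdi_thresh * 10 / 20. A bdi whose share works out to bdi_thresh = 8K pages thus gets bdi_bg_thresh = 4K pages, and the strictlimit freerun ceiling for that bdi becomes (8K + 4K) / 2 = 6K pages, matching the bdi_setpoint example in bdi_position_ratio().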
@@ -1209,13 +1365,9 @@ static void balance_dirty_pages(struct address_space *mapping,
 			       unsigned long pages_dirtied)
 {
 	unsigned long nr_reclaimable;	/* = file_dirty + unstable_nfs */
-	unsigned long bdi_reclaimable;
 	unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
-	unsigned long bdi_dirty;
-	unsigned long freerun;
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
-	unsigned long bdi_thresh;
 	long period;
 	long pause;
 	long max_pause;
@@ -1226,10 +1378,16 @@ static void balance_dirty_pages(struct address_space *mapping,
 	unsigned long dirty_ratelimit;
 	unsigned long pos_ratio;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
 	unsigned long start_time = jiffies;
 
 	for (;;) {
 		unsigned long now = jiffies;
+		unsigned long uninitialized_var(bdi_thresh);
+		unsigned long thresh;
+		unsigned long uninitialized_var(bdi_dirty);
+		unsigned long dirty;
+		unsigned long bg_thresh;
 
 		/*
 		 * Unstable writes are a feature of certain networked
@@ -1243,61 +1401,44 @@ static void balance_dirty_pages(struct address_space *mapping,
 
 		global_dirty_limits(&background_thresh, &dirty_thresh);
 
+		if (unlikely(strictlimit)) {
+			bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
+					 &bdi_dirty, &bdi_thresh, &bg_thresh);
+
+			dirty = bdi_dirty;
+			thresh = bdi_thresh;
+		} else {
+			dirty = nr_dirty;
+			thresh = dirty_thresh;
+			bg_thresh = background_thresh;
+		}
+
 		/*
 		 * Throttle it only when the background writeback cannot
 		 * catch-up. This avoids (excessively) small writeouts
-		 * when the bdi limits are ramping up.
+		 * when the bdi limits are ramping up in case of !strictlimit.
+		 *
+		 * In strictlimit case make decision based on the bdi counters
+		 * and limits. Small writeouts when the bdi limits are ramping
+		 * up are the price we consciously pay for strictlimit-ing.
 		 */
-		freerun = dirty_freerun_ceiling(dirty_thresh,
-						background_thresh);
-		if (nr_dirty <= freerun) {
+		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
 			current->dirty_paused_when = now;
 			current->nr_dirtied = 0;
 			current->nr_dirtied_pause =
-				dirty_poll_interval(nr_dirty, dirty_thresh);
+				dirty_poll_interval(dirty, thresh);
 			break;
 		}
 
 		if (unlikely(!writeback_in_progress(bdi)))
 			bdi_start_background_writeback(bdi);
 
-		/*
-		 * bdi_thresh is not treated as some limiting factor as
-		 * dirty_thresh, due to reasons
-		 * - in JBOD setup, bdi_thresh can fluctuate a lot
-		 * - in a system with HDD and USB key, the USB key may somehow
-		 *   go into state (bdi_dirty >> bdi_thresh) either because
-		 *   bdi_dirty starts high, or because bdi_thresh drops low.
-		 *   In this case we don't want to hard throttle the USB key
-		 *   dirtiers for 100 seconds until bdi_dirty drops under
-		 *   bdi_thresh. Instead the auxiliary bdi control line in
-		 *   bdi_position_ratio() will let the dirtier task progress
-		 *   at some rate <= (write_bw / 2) for bringing down bdi_dirty.
-		 */
-		bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
-
-		/*
-		 * In order to avoid the stacked BDI deadlock we need
-		 * to ensure we accurately count the 'dirty' pages when
-		 * the threshold is low.
-		 *
-		 * Otherwise it would be possible to get thresh+n pages
-		 * reported dirty, even though there are thresh-m pages
-		 * actually dirty; with m+n sitting in the percpu
-		 * deltas.
-		 */
-		if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
-			bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
-			bdi_dirty = bdi_reclaimable +
-				    bdi_stat_sum(bdi, BDI_WRITEBACK);
-		} else {
-			bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
-			bdi_dirty = bdi_reclaimable +
-				    bdi_stat(bdi, BDI_WRITEBACK);
-		}
+		if (!strictlimit)
+			bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
+					 &bdi_dirty, &bdi_thresh, NULL);
 
 		dirty_exceeded = (bdi_dirty > bdi_thresh) &&
-				 (nr_dirty > dirty_thresh);
+				 ((nr_dirty > dirty_thresh) || strictlimit);
 		if (dirty_exceeded && !bdi->dirty_exceeded)
 			bdi->dirty_exceeded = 1;
 