aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2012-01-10 19:59:59 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-01-10 19:59:59 -0500
commit 001a541ea9163ace5e8243ee0e907ad80a4c0ec2 (patch)
tree a76225046369c440de93739add9823f5ea060245
parent 40ba587923ae67090d9f141c1d3c951be5c1420e (diff)
parent bc31b86a5923fad5f3fbb6192f767f410241ba27 (diff)
Merge branch 'writeback-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux
* 'writeback-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux:
  writeback: move MIN_WRITEBACK_PAGES to fs-writeback.c
  writeback: balanced_rate cannot exceed write bandwidth
  writeback: do strict bdi dirty_exceeded
  writeback: avoid tiny dirty poll intervals
  writeback: max, min and target dirty pause time
  writeback: dirty ratelimit - think time compensation
  btrfs: fix dirtied pages accounting on sub-page writes
  writeback: fix dirtied pages accounting on redirty
  writeback: fix dirtied pages accounting on sub-page writes
  writeback: charge leaked page dirties to active tasks
  writeback: Include all dirty inodes in background writeback
-rw-r--r-- fs/btrfs/file.c 3
-rw-r--r-- fs/fs-writeback.c 16
-rw-r--r-- include/linux/sched.h 1
-rw-r--r-- include/linux/writeback.h 9
-rw-r--r-- include/trace/events/writeback.h 14
-rw-r--r-- kernel/exit.c 3
-rw-r--r-- kernel/fork.c 1
-rw-r--r-- mm/page-writeback.c 246
8 files changed, 227 insertions, 66 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 20375e6691c3..034d98503229 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1136,7 +1136,8 @@ again:
1136 GFP_NOFS); 1136 GFP_NOFS);
1137 } 1137 }
1138 for (i = 0; i < num_pages; i++) { 1138 for (i = 0; i < num_pages; i++) {
1139 clear_page_dirty_for_io(pages[i]); 1139 if (clear_page_dirty_for_io(pages[i]))
1140 account_page_redirty(pages[i]);
1140 set_page_extent_mapped(pages[i]); 1141 set_page_extent_mapped(pages[i]);
1141 WARN_ON(!PageLocked(pages[i])); 1142 WARN_ON(!PageLocked(pages[i]));
1142 } 1143 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e2951506434d..f855916657ba 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/pagemap.h>
23#include <linux/kthread.h> 24#include <linux/kthread.h>
24#include <linux/freezer.h> 25#include <linux/freezer.h>
25#include <linux/writeback.h> 26#include <linux/writeback.h>
@@ -29,6 +30,11 @@
29#include "internal.h" 30#include "internal.h"
30 31
31/* 32/*
33 * 4MB minimal write chunk size
34 */
35#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
36
37/*
32 * Passed into wb_writeback(), essentially a subset of writeback_control 38 * Passed into wb_writeback(), essentially a subset of writeback_control
33 */ 39 */
34struct wb_writeback_work { 40struct wb_writeback_work {
@@ -742,11 +748,17 @@ static long wb_writeback(struct bdi_writeback *wb,
742 if (work->for_background && !over_bground_thresh(wb->bdi)) 748 if (work->for_background && !over_bground_thresh(wb->bdi))
743 break; 749 break;
744 750
751 /*
752 * Kupdate and background works are special and we want to
753 * include all inodes that need writing. Livelock avoidance is
754 * handled by these works yielding to any other work so we are
755 * safe.
756 */
745 if (work->for_kupdate) { 757 if (work->for_kupdate) {
746 oldest_jif = jiffies - 758 oldest_jif = jiffies -
747 msecs_to_jiffies(dirty_expire_interval * 10); 759 msecs_to_jiffies(dirty_expire_interval * 10);
748 work->older_than_this = &oldest_jif; 760 } else if (work->for_background)
749 } 761 oldest_jif = jiffies;
750 762
751 trace_writeback_start(wb->bdi, work); 763 trace_writeback_start(wb->bdi, work);
752 if (list_empty(&wb->b_io)) 764 if (list_empty(&wb->b_io))
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f044f66018f2..21cd0303af51 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1544,6 +1544,7 @@ struct task_struct {
1544 */ 1544 */
1545 int nr_dirtied; 1545 int nr_dirtied;
1546 int nr_dirtied_pause; 1546 int nr_dirtied_pause;
1547 unsigned long dirty_paused_when; /* start of a write-and-pause period */
1547 1548
1548#ifdef CONFIG_LATENCYTOP 1549#ifdef CONFIG_LATENCYTOP
1549 int latency_record_count; 1550 int latency_record_count;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 6dff47304971..995b8bf630ac 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -7,6 +7,8 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/fs.h> 8#include <linux/fs.h>
9 9
10DECLARE_PER_CPU(int, dirty_throttle_leaks);
11
10/* 12/*
11 * The 1/4 region under the global dirty thresh is for smooth dirty throttling: 13 * The 1/4 region under the global dirty thresh is for smooth dirty throttling:
12 * 14 *
@@ -23,11 +25,6 @@
23#define DIRTY_SCOPE 8 25#define DIRTY_SCOPE 8
24#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) 26#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2)
25 27
26/*
27 * 4MB minimal write chunk size
28 */
29#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
30
31struct backing_dev_info; 28struct backing_dev_info;
32 29
33/* 30/*
@@ -194,6 +191,8 @@ void writeback_set_ratelimit(void);
194void tag_pages_for_writeback(struct address_space *mapping, 191void tag_pages_for_writeback(struct address_space *mapping,
195 pgoff_t start, pgoff_t end); 192 pgoff_t start, pgoff_t end);
196 193
194void account_page_redirty(struct page *page);
195
197/* pdflush.c */ 196/* pdflush.c */
198extern int nr_pdflush_threads; /* Global so it can be exported to sysctl 197extern int nr_pdflush_threads; /* Global so it can be exported to sysctl
199 read-only. */ 198 read-only. */
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 99d1d0decf88..8588a8918023 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -300,12 +300,13 @@ TRACE_EVENT(balance_dirty_pages,
300 unsigned long dirty_ratelimit, 300 unsigned long dirty_ratelimit,
301 unsigned long task_ratelimit, 301 unsigned long task_ratelimit,
302 unsigned long dirtied, 302 unsigned long dirtied,
303 unsigned long period,
303 long pause, 304 long pause,
304 unsigned long start_time), 305 unsigned long start_time),
305 306
306 TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, 307 TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
307 dirty_ratelimit, task_ratelimit, 308 dirty_ratelimit, task_ratelimit,
308 dirtied, pause, start_time), 309 dirtied, period, pause, start_time),
309 310
310 TP_STRUCT__entry( 311 TP_STRUCT__entry(
311 __array( char, bdi, 32) 312 __array( char, bdi, 32)
@@ -320,6 +321,8 @@ TRACE_EVENT(balance_dirty_pages,
320 __field(unsigned int, dirtied_pause) 321 __field(unsigned int, dirtied_pause)
321 __field(unsigned long, paused) 322 __field(unsigned long, paused)
322 __field( long, pause) 323 __field( long, pause)
324 __field(unsigned long, period)
325 __field( long, think)
323 ), 326 ),
324 327
325 TP_fast_assign( 328 TP_fast_assign(
@@ -336,6 +339,9 @@ TRACE_EVENT(balance_dirty_pages,
336 __entry->task_ratelimit = KBps(task_ratelimit); 339 __entry->task_ratelimit = KBps(task_ratelimit);
337 __entry->dirtied = dirtied; 340 __entry->dirtied = dirtied;
338 __entry->dirtied_pause = current->nr_dirtied_pause; 341 __entry->dirtied_pause = current->nr_dirtied_pause;
342 __entry->think = current->dirty_paused_when == 0 ? 0 :
343 (long)(jiffies - current->dirty_paused_when) * 1000/HZ;
344 __entry->period = period * 1000 / HZ;
339 __entry->pause = pause * 1000 / HZ; 345 __entry->pause = pause * 1000 / HZ;
340 __entry->paused = (jiffies - start_time) * 1000 / HZ; 346 __entry->paused = (jiffies - start_time) * 1000 / HZ;
341 ), 347 ),
@@ -346,7 +352,7 @@ TRACE_EVENT(balance_dirty_pages,
346 "bdi_setpoint=%lu bdi_dirty=%lu " 352 "bdi_setpoint=%lu bdi_dirty=%lu "
347 "dirty_ratelimit=%lu task_ratelimit=%lu " 353 "dirty_ratelimit=%lu task_ratelimit=%lu "
348 "dirtied=%u dirtied_pause=%u " 354 "dirtied=%u dirtied_pause=%u "
349 "paused=%lu pause=%ld", 355 "paused=%lu pause=%ld period=%lu think=%ld",
350 __entry->bdi, 356 __entry->bdi,
351 __entry->limit, 357 __entry->limit,
352 __entry->setpoint, 358 __entry->setpoint,
@@ -358,7 +364,9 @@ TRACE_EVENT(balance_dirty_pages,
358 __entry->dirtied, 364 __entry->dirtied,
359 __entry->dirtied_pause, 365 __entry->dirtied_pause,
360 __entry->paused, /* ms */ 366 __entry->paused, /* ms */
361 __entry->pause /* ms */ 367 __entry->pause, /* ms */
368 __entry->period, /* ms */
369 __entry->think /* ms */
362 ) 370 )
363); 371);
364 372
diff --git a/kernel/exit.c b/kernel/exit.c
index d9eab2e4b430..94ed6e20bb53 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,6 +51,7 @@
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h> 53#include <linux/oom.h>
54#include <linux/writeback.h>
54 55
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/unistd.h> 57#include <asm/unistd.h>
@@ -1035,6 +1036,8 @@ NORET_TYPE void do_exit(long code)
1035 validate_creds_for_do_exit(tsk); 1036 validate_creds_for_do_exit(tsk);
1036 1037
1037 preempt_disable(); 1038 preempt_disable();
1039 if (tsk->nr_dirtied)
1040 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
1038 exit_rcu(); 1041 exit_rcu();
1039 /* causes final put_task_struct in finish_task_switch(). */ 1042 /* causes final put_task_struct in finish_task_switch(). */
1040 tsk->state = TASK_DEAD; 1043 tsk->state = TASK_DEAD;
diff --git a/kernel/fork.c b/kernel/fork.c
index 5e1391b5ade0..443f5125f11e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1294,6 +1294,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1294 1294
1295 p->nr_dirtied = 0; 1295 p->nr_dirtied = 0;
1296 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); 1296 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1297 p->dirty_paused_when = 0;
1297 1298
1298 /* 1299 /*
1299 * Ok, make it visible to the rest of the system. 1300 * Ok, make it visible to the rest of the system.
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5cdd4f2b0c9d..363ba7082ef5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -42,6 +42,12 @@
42#define MAX_PAUSE max(HZ/5, 1) 42#define MAX_PAUSE max(HZ/5, 1)
43 43
44/* 44/*
45 * Try to keep balance_dirty_pages() call intervals higher than this many pages
46 * by raising pause time to max_pause when falls below it.
47 */
48#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))
49
50/*
45 * Estimate write bandwidth at 200ms intervals. 51 * Estimate write bandwidth at 200ms intervals.
46 */ 52 */
47#define BANDWIDTH_INTERVAL max(HZ/5, 1) 53#define BANDWIDTH_INTERVAL max(HZ/5, 1)
@@ -898,6 +904,11 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
898 */ 904 */
899 balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, 905 balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
900 dirty_rate | 1); 906 dirty_rate | 1);
907 /*
908 * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
909 */
910 if (unlikely(balanced_dirty_ratelimit > write_bw))
911 balanced_dirty_ratelimit = write_bw;
901 912
902 /* 913 /*
903 * We could safely do this and return immediately: 914 * We could safely do this and return immediately:
@@ -1044,40 +1055,98 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
1044 return 1; 1055 return 1;
1045} 1056}
1046 1057
1047static unsigned long bdi_max_pause(struct backing_dev_info *bdi, 1058static long bdi_max_pause(struct backing_dev_info *bdi,
1048 unsigned long bdi_dirty) 1059 unsigned long bdi_dirty)
1060{
1061 long bw = bdi->avg_write_bandwidth;
1062 long t;
1063
1064 /*
1065 * Limit pause time for small memory systems. If sleeping for too long
1066 * time, a small pool of dirty/writeback pages may go empty and disk go
1067 * idle.
1068 *
1069 * 8 serves as the safety ratio.
1070 */
1071 t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
1072 t++;
1073
1074 return min_t(long, t, MAX_PAUSE);
1075}
1076
1077static long bdi_min_pause(struct backing_dev_info *bdi,
1078 long max_pause,
1079 unsigned long task_ratelimit,
1080 unsigned long dirty_ratelimit,
1081 int *nr_dirtied_pause)
1049{ 1082{
1050 unsigned long bw = bdi->avg_write_bandwidth; 1083 long hi = ilog2(bdi->avg_write_bandwidth);
1051 unsigned long hi = ilog2(bw); 1084 long lo = ilog2(bdi->dirty_ratelimit);
1052 unsigned long lo = ilog2(bdi->dirty_ratelimit); 1085 long t; /* target pause */
1053 unsigned long t; 1086 long pause; /* estimated next pause */
1087 int pages; /* target nr_dirtied_pause */
1054 1088
1055 /* target for 20ms max pause on 1-dd case */ 1089 /* target for 10ms pause on 1-dd case */
1056 t = HZ / 50; 1090 t = max(1, HZ / 100);
1057 1091
1058 /* 1092 /*
1059 * Scale up pause time for concurrent dirtiers in order to reduce CPU 1093 * Scale up pause time for concurrent dirtiers in order to reduce CPU
1060 * overheads. 1094 * overheads.
1061 * 1095 *
1062 * (N * 20ms) on 2^N concurrent tasks. 1096 * (N * 10ms) on 2^N concurrent tasks.
1063 */ 1097 */
1064 if (hi > lo) 1098 if (hi > lo)
1065 t += (hi - lo) * (20 * HZ) / 1024; 1099 t += (hi - lo) * (10 * HZ) / 1024;
1066 1100
1067 /* 1101 /*
1068 * Limit pause time for small memory systems. If sleeping for too long 1102 * This is a bit convoluted. We try to base the next nr_dirtied_pause
1069 * time, a small pool of dirty/writeback pages may go empty and disk go 1103 * on the much more stable dirty_ratelimit. However the next pause time
1070 * idle. 1104 * will be computed based on task_ratelimit and the two rate limits may
1105 * depart considerably at some time. Especially if task_ratelimit goes
1106 * below dirty_ratelimit/2 and the target pause is max_pause, the next
1107 * pause time will be max_pause*2 _trimmed down_ to max_pause. As a
1108 * result task_ratelimit won't be executed faithfully, which could
1109 * eventually bring down dirty_ratelimit.
1071 * 1110 *
1072 * 8 serves as the safety ratio. 1111 * We apply two rules to fix it up:
1112 * 1) try to estimate the next pause time and if necessary, use a lower
1113 * nr_dirtied_pause so as not to exceed max_pause. When this happens,
1114 * nr_dirtied_pause will be "dancing" with task_ratelimit.
1115 * 2) limit the target pause time to max_pause/2, so that the normal
1116 * small fluctuations of task_ratelimit won't trigger rule (1) and
1117 * nr_dirtied_pause will remain as stable as dirty_ratelimit.
1073 */ 1118 */
1074 t = min(t, bdi_dirty * HZ / (8 * bw + 1)); 1119 t = min(t, 1 + max_pause / 2);
1120 pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
1075 1121
1076 /* 1122 /*
1077 * The pause time will be settled within range (max_pause/4, max_pause). 1123 * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
1078 * Apply a minimal value of 4 to get a non-zero max_pause/4. 1124 * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
1125 * When the 16 consecutive reads are often interrupted by some dirty
1126 * throttling pause during the async writes, cfq will go into idles
1127 * (deadline is fine). So push nr_dirtied_pause as high as possible
1128 * until reaches DIRTY_POLL_THRESH=32 pages.
1079 */ 1129 */
1080 return clamp_val(t, 4, MAX_PAUSE); 1130 if (pages < DIRTY_POLL_THRESH) {
1131 t = max_pause;
1132 pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
1133 if (pages > DIRTY_POLL_THRESH) {
1134 pages = DIRTY_POLL_THRESH;
1135 t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
1136 }
1137 }
1138
1139 pause = HZ * pages / (task_ratelimit + 1);
1140 if (pause > max_pause) {
1141 t = max_pause;
1142 pages = task_ratelimit * t / roundup_pow_of_two(HZ);
1143 }
1144
1145 *nr_dirtied_pause = pages;
1146 /*
1147 * The minimal pause time will normally be half the target pause time.
1148 */
1149 return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
1081} 1150}
1082 1151
1083/* 1152/*
@@ -1098,16 +1167,21 @@ static void balance_dirty_pages(struct address_space *mapping,
1098 unsigned long background_thresh; 1167 unsigned long background_thresh;
1099 unsigned long dirty_thresh; 1168 unsigned long dirty_thresh;
1100 unsigned long bdi_thresh; 1169 unsigned long bdi_thresh;
1101 long pause = 0; 1170 long period;
1102 long uninitialized_var(max_pause); 1171 long pause;
1172 long max_pause;
1173 long min_pause;
1174 int nr_dirtied_pause;
1103 bool dirty_exceeded = false; 1175 bool dirty_exceeded = false;
1104 unsigned long task_ratelimit; 1176 unsigned long task_ratelimit;
1105 unsigned long uninitialized_var(dirty_ratelimit); 1177 unsigned long dirty_ratelimit;
1106 unsigned long pos_ratio; 1178 unsigned long pos_ratio;
1107 struct backing_dev_info *bdi = mapping->backing_dev_info; 1179 struct backing_dev_info *bdi = mapping->backing_dev_info;
1108 unsigned long start_time = jiffies; 1180 unsigned long start_time = jiffies;
1109 1181
1110 for (;;) { 1182 for (;;) {
1183 unsigned long now = jiffies;
1184
1111 /* 1185 /*
1112 * Unstable writes are a feature of certain networked 1186 * Unstable writes are a feature of certain networked
1113 * filesystems (i.e. NFS) in which data may have been 1187 * filesystems (i.e. NFS) in which data may have been
@@ -1127,8 +1201,13 @@ static void balance_dirty_pages(struct address_space *mapping,
1127 */ 1201 */
1128 freerun = dirty_freerun_ceiling(dirty_thresh, 1202 freerun = dirty_freerun_ceiling(dirty_thresh,
1129 background_thresh); 1203 background_thresh);
1130 if (nr_dirty <= freerun) 1204 if (nr_dirty <= freerun) {
1205 current->dirty_paused_when = now;
1206 current->nr_dirtied = 0;
1207 current->nr_dirtied_pause =
1208 dirty_poll_interval(nr_dirty, dirty_thresh);
1131 break; 1209 break;
1210 }
1132 1211
1133 if (unlikely(!writeback_in_progress(bdi))) 1212 if (unlikely(!writeback_in_progress(bdi)))
1134 bdi_start_background_writeback(bdi); 1213 bdi_start_background_writeback(bdi);
@@ -1168,7 +1247,7 @@ static void balance_dirty_pages(struct address_space *mapping,
1168 bdi_stat(bdi, BDI_WRITEBACK); 1247 bdi_stat(bdi, BDI_WRITEBACK);
1169 } 1248 }
1170 1249
1171 dirty_exceeded = (bdi_dirty > bdi_thresh) || 1250 dirty_exceeded = (bdi_dirty > bdi_thresh) &&
1172 (nr_dirty > dirty_thresh); 1251 (nr_dirty > dirty_thresh);
1173 if (dirty_exceeded && !bdi->dirty_exceeded) 1252 if (dirty_exceeded && !bdi->dirty_exceeded)
1174 bdi->dirty_exceeded = 1; 1253 bdi->dirty_exceeded = 1;
@@ -1177,20 +1256,34 @@ static void balance_dirty_pages(struct address_space *mapping,
1177 nr_dirty, bdi_thresh, bdi_dirty, 1256 nr_dirty, bdi_thresh, bdi_dirty,
1178 start_time); 1257 start_time);
1179 1258
1180 max_pause = bdi_max_pause(bdi, bdi_dirty);
1181
1182 dirty_ratelimit = bdi->dirty_ratelimit; 1259 dirty_ratelimit = bdi->dirty_ratelimit;
1183 pos_ratio = bdi_position_ratio(bdi, dirty_thresh, 1260 pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
1184 background_thresh, nr_dirty, 1261 background_thresh, nr_dirty,
1185 bdi_thresh, bdi_dirty); 1262 bdi_thresh, bdi_dirty);
1186 task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> 1263 task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
1187 RATELIMIT_CALC_SHIFT; 1264 RATELIMIT_CALC_SHIFT;
1265 max_pause = bdi_max_pause(bdi, bdi_dirty);
1266 min_pause = bdi_min_pause(bdi, max_pause,
1267 task_ratelimit, dirty_ratelimit,
1268 &nr_dirtied_pause);
1269
1188 if (unlikely(task_ratelimit == 0)) { 1270 if (unlikely(task_ratelimit == 0)) {
1271 period = max_pause;
1189 pause = max_pause; 1272 pause = max_pause;
1190 goto pause; 1273 goto pause;
1191 } 1274 }
1192 pause = HZ * pages_dirtied / task_ratelimit; 1275 period = HZ * pages_dirtied / task_ratelimit;
1193 if (unlikely(pause <= 0)) { 1276 pause = period;
1277 if (current->dirty_paused_when)
1278 pause -= now - current->dirty_paused_when;
1279 /*
1280 * For less than 1s think time (ext3/4 may block the dirtier
1281 * for up to 800ms from time to time on 1-HDD; so does xfs,
1282 * however at much less frequency), try to compensate it in
1283 * future periods by updating the virtual time; otherwise just
1284 * do a reset, as it may be a light dirtier.
1285 */
1286 if (pause < min_pause) {
1194 trace_balance_dirty_pages(bdi, 1287 trace_balance_dirty_pages(bdi,
1195 dirty_thresh, 1288 dirty_thresh,
1196 background_thresh, 1289 background_thresh,
@@ -1200,12 +1293,24 @@ static void balance_dirty_pages(struct address_space *mapping,
1200 dirty_ratelimit, 1293 dirty_ratelimit,
1201 task_ratelimit, 1294 task_ratelimit,
1202 pages_dirtied, 1295 pages_dirtied,
1203 pause, 1296 period,
1297 min(pause, 0L),
1204 start_time); 1298 start_time);
1205 pause = 1; /* avoid resetting nr_dirtied_pause below */ 1299 if (pause < -HZ) {
1300 current->dirty_paused_when = now;
1301 current->nr_dirtied = 0;
1302 } else if (period) {
1303 current->dirty_paused_when += period;
1304 current->nr_dirtied = 0;
1305 } else if (current->nr_dirtied_pause <= pages_dirtied)
1306 current->nr_dirtied_pause += pages_dirtied;
1206 break; 1307 break;
1207 } 1308 }
1208 pause = min(pause, max_pause); 1309 if (unlikely(pause > max_pause)) {
1310 /* for occasional dropped task_ratelimit */
1311 now += min(pause - max_pause, max_pause);
1312 pause = max_pause;
1313 }
1209 1314
1210pause: 1315pause:
1211 trace_balance_dirty_pages(bdi, 1316 trace_balance_dirty_pages(bdi,
@@ -1217,11 +1322,16 @@ pause:
1217 dirty_ratelimit, 1322 dirty_ratelimit,
1218 task_ratelimit, 1323 task_ratelimit,
1219 pages_dirtied, 1324 pages_dirtied,
1325 period,
1220 pause, 1326 pause,
1221 start_time); 1327 start_time);
1222 __set_current_state(TASK_KILLABLE); 1328 __set_current_state(TASK_KILLABLE);
1223 io_schedule_timeout(pause); 1329 io_schedule_timeout(pause);
1224 1330
1331 current->dirty_paused_when = now + pause;
1332 current->nr_dirtied = 0;
1333 current->nr_dirtied_pause = nr_dirtied_pause;
1334
1225 /* 1335 /*
1226 * This is typically equal to (nr_dirty < dirty_thresh) and can 1336 * This is typically equal to (nr_dirty < dirty_thresh) and can
1227 * also keep "1000+ dd on a slow USB stick" under control. 1337 * also keep "1000+ dd on a slow USB stick" under control.
@@ -1249,23 +1359,6 @@ pause:
1249 if (!dirty_exceeded && bdi->dirty_exceeded) 1359 if (!dirty_exceeded && bdi->dirty_exceeded)
1250 bdi->dirty_exceeded = 0; 1360 bdi->dirty_exceeded = 0;
1251 1361
1252 current->nr_dirtied = 0;
1253 if (pause == 0) { /* in freerun area */
1254 current->nr_dirtied_pause =
1255 dirty_poll_interval(nr_dirty, dirty_thresh);
1256 } else if (pause <= max_pause / 4 &&
1257 pages_dirtied >= current->nr_dirtied_pause) {
1258 current->nr_dirtied_pause = clamp_val(
1259 dirty_ratelimit * (max_pause / 2) / HZ,
1260 pages_dirtied + pages_dirtied / 8,
1261 pages_dirtied * 4);
1262 } else if (pause >= max_pause) {
1263 current->nr_dirtied_pause = 1 | clamp_val(
1264 dirty_ratelimit * (max_pause / 2) / HZ,
1265 pages_dirtied / 4,
1266 pages_dirtied - pages_dirtied / 8);
1267 }
1268
1269 if (writeback_in_progress(bdi)) 1362 if (writeback_in_progress(bdi))
1270 return; 1363 return;
1271 1364
@@ -1296,6 +1389,22 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
1296 1389
1297static DEFINE_PER_CPU(int, bdp_ratelimits); 1390static DEFINE_PER_CPU(int, bdp_ratelimits);
1298 1391
1392/*
1393 * Normal tasks are throttled by
1394 * loop {
1395 * dirty tsk->nr_dirtied_pause pages;
1396 * take a snap in balance_dirty_pages();
1397 * }
1398 * However there is a worst case. If every task exit immediately when dirtied
1399 * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
1400 * called to throttle the page dirties. The solution is to save the not yet
1401 * throttled page dirties in dirty_throttle_leaks on task exit and charge them
1402 * randomly into the running tasks. This works well for the above worst case,
1403 * as the new task will pick up and accumulate the old task's leaked dirty
1404 * count and eventually get throttled.
1405 */
1406DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1407
1299/** 1408/**
1300 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 1409 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
1301 * @mapping: address_space which was dirtied 1410 * @mapping: address_space which was dirtied
@@ -1324,8 +1433,6 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1324 if (bdi->dirty_exceeded) 1433 if (bdi->dirty_exceeded)
1325 ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); 1434 ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
1326 1435
1327 current->nr_dirtied += nr_pages_dirtied;
1328
1329 preempt_disable(); 1436 preempt_disable();
1330 /* 1437 /*
1331 * This prevents one CPU to accumulate too many dirtied pages without 1438 * This prevents one CPU to accumulate too many dirtied pages without
@@ -1336,12 +1443,20 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1336 p = &__get_cpu_var(bdp_ratelimits); 1443 p = &__get_cpu_var(bdp_ratelimits);
1337 if (unlikely(current->nr_dirtied >= ratelimit)) 1444 if (unlikely(current->nr_dirtied >= ratelimit))
1338 *p = 0; 1445 *p = 0;
1339 else { 1446 else if (unlikely(*p >= ratelimit_pages)) {
1340 *p += nr_pages_dirtied; 1447 *p = 0;
1341 if (unlikely(*p >= ratelimit_pages)) { 1448 ratelimit = 0;
1342 *p = 0; 1449 }
1343 ratelimit = 0; 1450 /*
1344 } 1451 * Pick up the dirtied pages by the exited tasks. This avoids lots of
1452 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
1453 * the dirty throttling and livelock other long-run dirtiers.
1454 */
1455 p = &__get_cpu_var(dirty_throttle_leaks);
1456 if (*p > 0 && current->nr_dirtied < ratelimit) {
1457 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
1458 *p -= nr_pages_dirtied;
1459 current->nr_dirtied += nr_pages_dirtied;
1345 } 1460 }
1346 preempt_enable(); 1461 preempt_enable();
1347 1462
@@ -1823,6 +1938,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1823 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 1938 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1824 __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); 1939 __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
1825 task_io_account_write(PAGE_CACHE_SIZE); 1940 task_io_account_write(PAGE_CACHE_SIZE);
1941 current->nr_dirtied++;
1942 this_cpu_inc(bdp_ratelimits);
1826 } 1943 }
1827} 1944}
1828EXPORT_SYMBOL(account_page_dirtied); 1945EXPORT_SYMBOL(account_page_dirtied);
@@ -1883,6 +2000,24 @@ int __set_page_dirty_nobuffers(struct page *page)
1883EXPORT_SYMBOL(__set_page_dirty_nobuffers); 2000EXPORT_SYMBOL(__set_page_dirty_nobuffers);
1884 2001
1885/* 2002/*
2003 * Call this whenever redirtying a page, to de-account the dirty counters
2004 * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
2005 * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to
2006 * systematic errors in balanced_dirty_ratelimit and the dirty pages position
2007 * control.
2008 */
2009void account_page_redirty(struct page *page)
2010{
2011 struct address_space *mapping = page->mapping;
2012 if (mapping && mapping_cap_account_dirty(mapping)) {
2013 current->nr_dirtied--;
2014 dec_zone_page_state(page, NR_DIRTIED);
2015 dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
2016 }
2017}
2018EXPORT_SYMBOL(account_page_redirty);
2019
2020/*
1886 * When a writepage implementation decides that it doesn't want to write this 2021 * When a writepage implementation decides that it doesn't want to write this
1887 * page for some reason, it should redirty the locked page via 2022 * page for some reason, it should redirty the locked page via
1888 * redirty_page_for_writepage() and it should then unlock the page and return 0 2023 * redirty_page_for_writepage() and it should then unlock the page and return 0
@@ -1890,6 +2025,7 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers);
1890int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) 2025int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
1891{ 2026{
1892 wbc->pages_skipped++; 2027 wbc->pages_skipped++;
2028 account_page_redirty(page);
1893 return __set_page_dirty_nobuffers(page); 2029 return __set_page_dirty_nobuffers(page);
1894} 2030}
1895EXPORT_SYMBOL(redirty_page_for_writepage); 2031EXPORT_SYMBOL(redirty_page_for_writepage);