author		Linus Torvalds <torvalds@linux-foundation.org>	2012-01-10 19:59:59 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-01-10 19:59:59 -0500
commit		001a541ea9163ace5e8243ee0e907ad80a4c0ec2
tree		a76225046369c440de93739add9823f5ea060245
parent		40ba587923ae67090d9f141c1d3c951be5c1420e
parent		bc31b86a5923fad5f3fbb6192f767f410241ba27
Merge branch 'writeback-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux
* 'writeback-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux:
writeback: move MIN_WRITEBACK_PAGES to fs-writeback.c
writeback: balanced_rate cannot exceed write bandwidth
writeback: do strict bdi dirty_exceeded
writeback: avoid tiny dirty poll intervals
writeback: max, min and target dirty pause time
writeback: dirty ratelimit - think time compensation
btrfs: fix dirtied pages accounting on sub-page writes
writeback: fix dirtied pages accounting on redirty
writeback: fix dirtied pages accounting on sub-page writes
writeback: charge leaked page dirties to active tasks
writeback: Include all dirty inodes in background writeback
-rw-r--r--	fs/btrfs/file.c                  |   3
-rw-r--r--	fs/fs-writeback.c                |  16
-rw-r--r--	include/linux/sched.h            |   1
-rw-r--r--	include/linux/writeback.h        |   9
-rw-r--r--	include/trace/events/writeback.h |  14
-rw-r--r--	kernel/exit.c                    |   3
-rw-r--r--	kernel/fork.c                    |   1
-rw-r--r--	mm/page-writeback.c              | 246
8 files changed, 227 insertions, 66 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 20375e6691c3..034d98503229 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1136,7 +1136,8 @@ again:
 					     GFP_NOFS);
 	}
 	for (i = 0; i < num_pages; i++) {
-		clear_page_dirty_for_io(pages[i]);
+		if (clear_page_dirty_for_io(pages[i]))
+			account_page_redirty(pages[i]);
 		set_page_extent_mapped(pages[i]);
 		WARN_ON(!PageLocked(pages[i]));
 	}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e2951506434d..f855916657ba 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -20,6 +20,7 @@
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/pagemap.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/writeback.h>
@@ -29,6 +30,11 @@
 #include "internal.h"
 
 /*
+ * 4MB minimal write chunk size
+ */
+#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))
+
+/*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
 struct wb_writeback_work {
@@ -742,11 +748,17 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (work->for_background && !over_bground_thresh(wb->bdi))
 			break;
 
+		/*
+		 * Kupdate and background works are special and we want to
+		 * include all inodes that need writing. Livelock avoidance is
+		 * handled by these works yielding to any other work so we are
+		 * safe.
+		 */
 		if (work->for_kupdate) {
 			oldest_jif = jiffies -
 				msecs_to_jiffies(dirty_expire_interval * 10);
-			work->older_than_this = &oldest_jif;
-		}
+		} else if (work->for_background)
+			oldest_jif = jiffies;
 
 		trace_writeback_start(wb->bdi, work);
 		if (list_empty(&wb->b_io))
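As a quick sanity check on the relocated macro (assuming the common PAGE_CACHE_SHIFT of 12, i.e. 4 KB pages):

	MIN_WRITEBACK_PAGES = 4096UL >> (12 - 10) = 1024 pages = 1024 * 4 KB = 4 MB

which is exactly the "4MB minimal write chunk size" the comment promises.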
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f044f66018f2..21cd0303af51 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1544,6 +1544,7 @@ struct task_struct {
 	 */
 	int nr_dirtied;
 	int nr_dirtied_pause;
+	unsigned long dirty_paused_when; /* start of a write-and-pause period */
 
 #ifdef CONFIG_LATENCYTOP
 	int latency_record_count;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 6dff47304971..995b8bf630ac 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -7,6 +7,8 @@
 #include <linux/sched.h>
 #include <linux/fs.h>
 
+DECLARE_PER_CPU(int, dirty_throttle_leaks);
+
 /*
  * The 1/4 region under the global dirty thresh is for smooth dirty throttling:
  *
@@ -23,11 +25,6 @@
 #define DIRTY_SCOPE		8
 #define DIRTY_FULL_SCOPE	(DIRTY_SCOPE / 2)
 
-/*
- * 4MB minimal write chunk size
- */
-#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))
-
 struct backing_dev_info;
 
 /*
@@ -194,6 +191,8 @@ void writeback_set_ratelimit(void);
 void tag_pages_for_writeback(struct address_space *mapping,
 			     pgoff_t start, pgoff_t end);
 
+void account_page_redirty(struct page *page);
+
 /* pdflush.c */
 extern int nr_pdflush_threads;	/* Global so it can be exported to sysctl
 				   read-only. */
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 99d1d0decf88..8588a8918023 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -300,12 +300,13 @@ TRACE_EVENT(balance_dirty_pages,
 		 unsigned long dirty_ratelimit,
 		 unsigned long task_ratelimit,
 		 unsigned long dirtied,
+		 unsigned long period,
 		 long pause,
 		 unsigned long start_time),
 
 	TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
 		dirty_ratelimit, task_ratelimit,
-		dirtied, pause, start_time),
+		dirtied, period, pause, start_time),
 
 	TP_STRUCT__entry(
 		__array( char, bdi, 32)
@@ -320,6 +321,8 @@ TRACE_EVENT(balance_dirty_pages,
 		__field(unsigned int, dirtied_pause)
 		__field(unsigned long, paused)
 		__field( long, pause)
+		__field(unsigned long, period)
+		__field( long, think)
 	),
 
 	TP_fast_assign(
@@ -336,6 +339,9 @@ TRACE_EVENT(balance_dirty_pages,
 		__entry->task_ratelimit = KBps(task_ratelimit);
 		__entry->dirtied = dirtied;
 		__entry->dirtied_pause = current->nr_dirtied_pause;
+		__entry->think = current->dirty_paused_when == 0 ? 0 :
+			 (long)(jiffies - current->dirty_paused_when) * 1000/HZ;
+		__entry->period = period * 1000 / HZ;
 		__entry->pause = pause * 1000 / HZ;
 		__entry->paused = (jiffies - start_time) * 1000 / HZ;
 	),
@@ -346,7 +352,7 @@ TRACE_EVENT(balance_dirty_pages,
 		  "bdi_setpoint=%lu bdi_dirty=%lu "
 		  "dirty_ratelimit=%lu task_ratelimit=%lu "
 		  "dirtied=%u dirtied_pause=%u "
-		  "paused=%lu pause=%ld",
+		  "paused=%lu pause=%ld period=%lu think=%ld",
 		  __entry->bdi,
 		  __entry->limit,
 		  __entry->setpoint,
@@ -358,7 +364,9 @@ TRACE_EVENT(balance_dirty_pages,
 		  __entry->dirtied,
 		  __entry->dirtied_pause,
 		  __entry->paused,	/* ms */
-		  __entry->pause	/* ms */
+		  __entry->pause,	/* ms */
+		  __entry->period,	/* ms */
+		  __entry->think	/* ms */
 	)
 );
 
diff --git a/kernel/exit.c b/kernel/exit.c
index d9eab2e4b430..94ed6e20bb53 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,6 +51,7 @@
 #include <trace/events/sched.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/oom.h>
+#include <linux/writeback.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1035,6 +1036,8 @@ NORET_TYPE void do_exit(long code)
 	validate_creds_for_do_exit(tsk);
 
 	preempt_disable();
+	if (tsk->nr_dirtied)
+		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 	exit_rcu();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
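To illustrate the intent of the do_exit() hook above with a hypothetical scenario (numbers assume 4 KB pages, so a fresh task quota of 32 pages): a short-lived task that dirties, say, 31 pages and exits never reaches balance_dirty_pages(), so those 31 pages would escape throttling entirely; with this change they are folded into the per-CPU dirty_throttle_leaks counter here and later charged to whichever task next dirties pages on that CPU (see the mm/page-writeback.c hunks below).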
diff --git a/kernel/fork.c b/kernel/fork.c
index 5e1391b5ade0..443f5125f11e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1294,6 +1294,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->nr_dirtied = 0;
 	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
+	p->dirty_paused_when = 0;
 
 	/*
 	 * Ok, make it visible to the rest of the system.
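For reference (assuming PAGE_SHIFT == 12): 128 >> (12 - 10) = 32, so a freshly forked task may dirty 32 pages (128 KB) before its first call into balance_dirty_pages(). This is the same 32-page figure that mm/page-writeback.c below adopts as DIRTY_POLL_THRESH.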
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5cdd4f2b0c9d..363ba7082ef5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -42,6 +42,12 @@
 #define MAX_PAUSE		max(HZ/5, 1)
 
 /*
+ * Try to keep balance_dirty_pages() call intervals higher than this many pages
+ * by raising pause time to max_pause when falls below it.
+ */
+#define DIRTY_POLL_THRESH	(128 >> (PAGE_SHIFT - 10))
+
+/*
  * Estimate write bandwidth at 200ms intervals.
  */
 #define BANDWIDTH_INTERVAL	max(HZ/5, 1)
@@ -898,6 +904,11 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 	 */
 	balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
 					   dirty_rate | 1);
+	/*
+	 * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
+	 */
+	if (unlikely(balanced_dirty_ratelimit > write_bw))
+		balanced_dirty_ratelimit = write_bw;
 
 	/*
 	 * We could safely do this and return immediately:
@@ -1044,40 +1055,98 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
 	return 1;
 }
 
-static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
-				   unsigned long bdi_dirty)
+static long bdi_max_pause(struct backing_dev_info *bdi,
+			  unsigned long bdi_dirty)
+{
+	long bw = bdi->avg_write_bandwidth;
+	long t;
+
+	/*
+	 * Limit pause time for small memory systems. If sleeping for too long
+	 * time, a small pool of dirty/writeback pages may go empty and disk go
+	 * idle.
+	 *
+	 * 8 serves as the safety ratio.
+	 */
+	t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
+	t++;
+
+	return min_t(long, t, MAX_PAUSE);
+}
+
+static long bdi_min_pause(struct backing_dev_info *bdi,
+			  long max_pause,
+			  unsigned long task_ratelimit,
+			  unsigned long dirty_ratelimit,
+			  int *nr_dirtied_pause)
 {
-	unsigned long bw = bdi->avg_write_bandwidth;
-	unsigned long hi = ilog2(bw);
-	unsigned long lo = ilog2(bdi->dirty_ratelimit);
-	unsigned long t;
+	long hi = ilog2(bdi->avg_write_bandwidth);
+	long lo = ilog2(bdi->dirty_ratelimit);
+	long t;		/* target pause */
+	long pause;	/* estimated next pause */
+	int pages;	/* target nr_dirtied_pause */
 
-	/* target for 20ms max pause on 1-dd case */
-	t = HZ / 50;
+	/* target for 10ms pause on 1-dd case */
+	t = max(1, HZ / 100);
 
 	/*
 	 * Scale up pause time for concurrent dirtiers in order to reduce CPU
 	 * overheads.
 	 *
-	 * (N * 20ms) on 2^N concurrent tasks.
+	 * (N * 10ms) on 2^N concurrent tasks.
 	 */
 	if (hi > lo)
-		t += (hi - lo) * (20 * HZ) / 1024;
+		t += (hi - lo) * (10 * HZ) / 1024;
 
 	/*
-	 * Limit pause time for small memory systems. If sleeping for too long
-	 * time, a small pool of dirty/writeback pages may go empty and disk go
-	 * idle.
+	 * This is a bit convoluted. We try to base the next nr_dirtied_pause
+	 * on the much more stable dirty_ratelimit. However the next pause time
+	 * will be computed based on task_ratelimit and the two rate limits may
+	 * depart considerably at some time. Especially if task_ratelimit goes
+	 * below dirty_ratelimit/2 and the target pause is max_pause, the next
+	 * pause time will be max_pause*2 _trimmed down_ to max_pause. As a
+	 * result task_ratelimit won't be executed faithfully, which could
+	 * eventually bring down dirty_ratelimit.
 	 *
-	 * 8 serves as the safety ratio.
+	 * We apply two rules to fix it up:
+	 * 1) try to estimate the next pause time and if necessary, use a lower
+	 *    nr_dirtied_pause so as not to exceed max_pause. When this happens,
+	 *    nr_dirtied_pause will be "dancing" with task_ratelimit.
+	 * 2) limit the target pause time to max_pause/2, so that the normal
+	 *    small fluctuations of task_ratelimit won't trigger rule (1) and
+	 *    nr_dirtied_pause will remain as stable as dirty_ratelimit.
 	 */
-	t = min(t, bdi_dirty * HZ / (8 * bw + 1));
+	t = min(t, 1 + max_pause / 2);
+	pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
 
 	/*
-	 * The pause time will be settled within range (max_pause/4, max_pause).
-	 * Apply a minimal value of 4 to get a non-zero max_pause/4.
+	 * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
+	 * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
+	 * When the 16 consecutive reads are often interrupted by some dirty
+	 * throttling pause during the async writes, cfq will go into idles
+	 * (deadline is fine). So push nr_dirtied_pause as high as possible
+	 * until reaches DIRTY_POLL_THRESH=32 pages.
 	 */
-	return clamp_val(t, 4, MAX_PAUSE);
+	if (pages < DIRTY_POLL_THRESH) {
+		t = max_pause;
+		pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
+		if (pages > DIRTY_POLL_THRESH) {
+			pages = DIRTY_POLL_THRESH;
+			t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
+		}
+	}
+
+	pause = HZ * pages / (task_ratelimit + 1);
+	if (pause > max_pause) {
+		t = max_pause;
+		pages = task_ratelimit * t / roundup_pow_of_two(HZ);
+	}
+
+	*nr_dirtied_pause = pages;
+	/*
+	 * The minimal pause time will normally be half the target pause time.
+	 */
+	return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
 }
 
 /*
@@ -1098,16 +1167,21 @@ static void balance_dirty_pages(struct address_space *mapping,
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
-	long pause = 0;
-	long uninitialized_var(max_pause);
+	long period;
+	long pause;
+	long max_pause;
+	long min_pause;
+	int nr_dirtied_pause;
 	bool dirty_exceeded = false;
 	unsigned long task_ratelimit;
-	unsigned long uninitialized_var(dirty_ratelimit);
+	unsigned long dirty_ratelimit;
 	unsigned long pos_ratio;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	unsigned long start_time = jiffies;
 
 	for (;;) {
+		unsigned long now = jiffies;
+
 		/*
 		 * Unstable writes are a feature of certain networked
 		 * filesystems (i.e. NFS) in which data may have been
@@ -1127,8 +1201,13 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 */
 		freerun = dirty_freerun_ceiling(dirty_thresh,
 						background_thresh);
-		if (nr_dirty <= freerun)
+		if (nr_dirty <= freerun) {
+			current->dirty_paused_when = now;
+			current->nr_dirtied = 0;
+			current->nr_dirtied_pause =
+				dirty_poll_interval(nr_dirty, dirty_thresh);
 			break;
+		}
 
 		if (unlikely(!writeback_in_progress(bdi)))
 			bdi_start_background_writeback(bdi);
@@ -1168,7 +1247,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 				    bdi_stat(bdi, BDI_WRITEBACK);
 		}
 
-		dirty_exceeded = (bdi_dirty > bdi_thresh) ||
+		dirty_exceeded = (bdi_dirty > bdi_thresh) &&
 				  (nr_dirty > dirty_thresh);
 		if (dirty_exceeded && !bdi->dirty_exceeded)
 			bdi->dirty_exceeded = 1;
@@ -1177,20 +1256,34 @@ static void balance_dirty_pages(struct address_space *mapping,
 					     nr_dirty, bdi_thresh, bdi_dirty,
 					     start_time);
 
-		max_pause = bdi_max_pause(bdi, bdi_dirty);
-
 		dirty_ratelimit = bdi->dirty_ratelimit;
 		pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
 					       background_thresh, nr_dirty,
 					       bdi_thresh, bdi_dirty);
 		task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
 							RATELIMIT_CALC_SHIFT;
+		max_pause = bdi_max_pause(bdi, bdi_dirty);
+		min_pause = bdi_min_pause(bdi, max_pause,
+					  task_ratelimit, dirty_ratelimit,
+					  &nr_dirtied_pause);
+
 		if (unlikely(task_ratelimit == 0)) {
+			period = max_pause;
 			pause = max_pause;
 			goto pause;
 		}
-		pause = HZ * pages_dirtied / task_ratelimit;
-		if (unlikely(pause <= 0)) {
+		period = HZ * pages_dirtied / task_ratelimit;
+		pause = period;
+		if (current->dirty_paused_when)
+			pause -= now - current->dirty_paused_when;
+		/*
+		 * For less than 1s think time (ext3/4 may block the dirtier
+		 * for up to 800ms from time to time on 1-HDD; so does xfs,
+		 * however at much less frequency), try to compensate it in
+		 * future periods by updating the virtual time; otherwise just
+		 * do a reset, as it may be a light dirtier.
+		 */
+		if (pause < min_pause) {
 			trace_balance_dirty_pages(bdi,
 						  dirty_thresh,
 						  background_thresh,
@@ -1200,12 +1293,24 @@ static void balance_dirty_pages(struct address_space *mapping,
 						  dirty_ratelimit,
 						  task_ratelimit,
 						  pages_dirtied,
-						  pause,
+						  period,
+						  min(pause, 0L),
 						  start_time);
-			pause = 1; /* avoid resetting nr_dirtied_pause below */
+			if (pause < -HZ) {
+				current->dirty_paused_when = now;
+				current->nr_dirtied = 0;
+			} else if (period) {
+				current->dirty_paused_when += period;
+				current->nr_dirtied = 0;
+			} else if (current->nr_dirtied_pause <= pages_dirtied)
+				current->nr_dirtied_pause += pages_dirtied;
 			break;
 		}
-		pause = min(pause, max_pause);
+		if (unlikely(pause > max_pause)) {
+			/* for occasional dropped task_ratelimit */
+			now += min(pause - max_pause, max_pause);
+			pause = max_pause;
+		}
 
 pause:
 		trace_balance_dirty_pages(bdi,
@@ -1217,11 +1322,16 @@ pause:
 					  dirty_ratelimit,
 					  task_ratelimit,
 					  pages_dirtied,
+					  period,
 					  pause,
 					  start_time);
 		__set_current_state(TASK_KILLABLE);
 		io_schedule_timeout(pause);
 
+		current->dirty_paused_when = now + pause;
+		current->nr_dirtied = 0;
+		current->nr_dirtied_pause = nr_dirtied_pause;
+
 		/*
 		 * This is typically equal to (nr_dirty < dirty_thresh) and can
 		 * also keep "1000+ dd on a slow USB stick" under control.
@@ -1249,23 +1359,6 @@ pause:
 	if (!dirty_exceeded && bdi->dirty_exceeded)
 		bdi->dirty_exceeded = 0;
 
-	current->nr_dirtied = 0;
-	if (pause == 0) { /* in freerun area */
-		current->nr_dirtied_pause =
-			dirty_poll_interval(nr_dirty, dirty_thresh);
-	} else if (pause <= max_pause / 4 &&
-		   pages_dirtied >= current->nr_dirtied_pause) {
-		current->nr_dirtied_pause = clamp_val(
-			dirty_ratelimit * (max_pause / 2) / HZ,
-			pages_dirtied + pages_dirtied / 8,
-			pages_dirtied * 4);
-	} else if (pause >= max_pause) {
-		current->nr_dirtied_pause = 1 | clamp_val(
-			dirty_ratelimit * (max_pause / 2) / HZ,
-			pages_dirtied / 4,
-			pages_dirtied - pages_dirtied / 8);
-	}
-
 	if (writeback_in_progress(bdi))
 		return;
 
@@ -1296,6 +1389,22 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
 
 static DEFINE_PER_CPU(int, bdp_ratelimits);
 
+/*
+ * Normal tasks are throttled by
+ *	loop {
+ *		dirty tsk->nr_dirtied_pause pages;
+ *		take a snap in balance_dirty_pages();
+ *	}
+ * However there is a worst case. If every task exit immediately when dirtied
+ * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
+ * called to throttle the page dirties. The solution is to save the not yet
+ * throttled page dirties in dirty_throttle_leaks on task exit and charge them
+ * randomly into the running tasks. This works well for the above worst case,
+ * as the new task will pick up and accumulate the old task's leaked dirty
+ * count and eventually get throttled.
+ */
+DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
+
 /**
  * balance_dirty_pages_ratelimited_nr - balance dirty memory state
  * @mapping: address_space which was dirtied
@@ -1324,8 +1433,6 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 	if (bdi->dirty_exceeded)
 		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
 
-	current->nr_dirtied += nr_pages_dirtied;
-
 	preempt_disable();
 	/*
 	 * This prevents one CPU to accumulate too many dirtied pages without
@@ -1336,12 +1443,20 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 	p = &__get_cpu_var(bdp_ratelimits);
 	if (unlikely(current->nr_dirtied >= ratelimit))
 		*p = 0;
-	else {
-		*p += nr_pages_dirtied;
-		if (unlikely(*p >= ratelimit_pages)) {
-			*p = 0;
-			ratelimit = 0;
-		}
+	else if (unlikely(*p >= ratelimit_pages)) {
+		*p = 0;
+		ratelimit = 0;
+	}
+	/*
+	 * Pick up the dirtied pages by the exited tasks. This avoids lots of
+	 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
+	 * the dirty throttling and livelock other long-run dirtiers.
+	 */
+	p = &__get_cpu_var(dirty_throttle_leaks);
+	if (*p > 0 && current->nr_dirtied < ratelimit) {
+		nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
+		*p -= nr_pages_dirtied;
+		current->nr_dirtied += nr_pages_dirtied;
 	}
 	preempt_enable();
 
@@ -1823,6 +1938,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
 		__inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
 		task_io_account_write(PAGE_CACHE_SIZE);
+		current->nr_dirtied++;
+		this_cpu_inc(bdp_ratelimits);
 	}
 }
 EXPORT_SYMBOL(account_page_dirtied);
@@ -1883,6 +2000,24 @@ int __set_page_dirty_nobuffers(struct page *page)
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
 
 /*
+ * Call this whenever redirtying a page, to de-account the dirty counters
+ * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
+ * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to
+ * systematic errors in balanced_dirty_ratelimit and the dirty pages position
+ * control.
+ */
+void account_page_redirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	if (mapping && mapping_cap_account_dirty(mapping)) {
+		current->nr_dirtied--;
+		dec_zone_page_state(page, NR_DIRTIED);
+		dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+	}
+}
+EXPORT_SYMBOL(account_page_redirty);
+
+/*
  * When a writepage implementation decides that it doesn't want to write this
  * page for some reason, it should redirty the locked page via
  * redirty_page_for_writepage() and it should then unlock the page and return 0
@@ -1890,6 +2025,7 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers);
 int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
 {
 	wbc->pages_skipped++;
+	account_page_redirty(page);
 	return __set_page_dirty_nobuffers(page);
 }
 EXPORT_SYMBOL(redirty_page_for_writepage);
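A rough worked example of the new pause sizing in bdi_min_pause(), using assumed numbers (HZ = 1000, a single dirtier so hi <= lo, dirty_ratelimit = task_ratelimit = 4096 pages/s, max_pause = 200 ms):

	t         = max(1, HZ / 100)       = 10 ms	(target pause)
	pages     = 4096 * 10 / 1024       = 40 pages	(>= DIRTY_POLL_THRESH, no bump needed)
	pause     = 1000 * 40 / (4096 + 1) ~= 9 ms	(<= max_pause, no trim needed)
	min_pause = 1 + t / 2              = 6 ms,	nr_dirtied_pause = 40

So a lone dirtier settles into roughly 10 ms pauses every ~40 dirtied pages, in place of the old fixed 20 ms target.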