aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- include/trace/events/writeback.h 24
-rw-r--r-- mm/page-writeback.c 161
2 files changed, 56 insertions, 129 deletions
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 5f172703eb4f..178c23508d3d 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -104,30 +104,6 @@ DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
104DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); 104DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
105DEFINE_WRITEBACK_EVENT(writeback_thread_start); 105DEFINE_WRITEBACK_EVENT(writeback_thread_start);
106DEFINE_WRITEBACK_EVENT(writeback_thread_stop); 106DEFINE_WRITEBACK_EVENT(writeback_thread_stop);
107DEFINE_WRITEBACK_EVENT(balance_dirty_start);
108DEFINE_WRITEBACK_EVENT(balance_dirty_wait);
109
110TRACE_EVENT(balance_dirty_written,
111
112 TP_PROTO(struct backing_dev_info *bdi, int written),
113
114 TP_ARGS(bdi, written),
115
116 TP_STRUCT__entry(
117 __array(char, name, 32)
118 __field(int, written)
119 ),
120
121 TP_fast_assign(
122 strncpy(__entry->name, dev_name(bdi->dev), 32);
123 __entry->written = written;
124 ),
125
126 TP_printk("bdi %s written %d",
127 __entry->name,
128 __entry->written
129 )
130);
131 107
132DECLARE_EVENT_CLASS(wbc_class, 108DECLARE_EVENT_CLASS(wbc_class,
133 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), 109 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index daff320d263f..f32f25092c66 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -250,50 +250,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
250 numerator, denominator); 250 numerator, denominator);
251} 251}
252 252
253static inline void task_dirties_fraction(struct task_struct *tsk,
254 long *numerator, long *denominator)
255{
256 prop_fraction_single(&vm_dirties, &tsk->dirties,
257 numerator, denominator);
258}
259
260/*
261 * task_dirty_limit - scale down dirty throttling threshold for one task
262 *
263 * task specific dirty limit:
264 *
265 * dirty -= (dirty/8) * p_{t}
266 *
267 * To protect light/slow dirtying tasks from heavier/fast ones, we start
268 * throttling individual tasks before reaching the bdi dirty limit.
269 * Relatively low thresholds will be allocated to heavy dirtiers. So when
270 * dirty pages grow large, heavy dirtiers will be throttled first, which will
271 * effectively curb the growth of dirty pages. Light dirtiers with high enough
272 * dirty threshold may never get throttled.
273 */
274#define TASK_LIMIT_FRACTION 8
275static unsigned long task_dirty_limit(struct task_struct *tsk,
276 unsigned long bdi_dirty)
277{
278 long numerator, denominator;
279 unsigned long dirty = bdi_dirty;
280 u64 inv = dirty / TASK_LIMIT_FRACTION;
281
282 task_dirties_fraction(tsk, &numerator, &denominator);
283 inv *= numerator;
284 do_div(inv, denominator);
285
286 dirty -= inv;
287
288 return max(dirty, bdi_dirty/2);
289}
290
291/* Minimum limit for any task */
292static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
293{
294 return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
295}
296
297/* 253/*
298 * 254 *
299 */ 255 */
@@ -986,30 +942,36 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
986/* 942/*
987 * balance_dirty_pages() must be called by processes which are generating dirty 943 * balance_dirty_pages() must be called by processes which are generating dirty
988 * data. It looks at the number of dirty pages in the machine and will force 944 * data. It looks at the number of dirty pages in the machine and will force
989 * the caller to perform writeback if the system is over `vm_dirty_ratio'. 945 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
990 * If we're over `background_thresh' then the writeback threads are woken to 946 * If we're over `background_thresh' then the writeback threads are woken to
991 * perform some writeout. 947 * perform some writeout.
992 */ 948 */
993static void balance_dirty_pages(struct address_space *mapping, 949static void balance_dirty_pages(struct address_space *mapping,
994 unsigned long write_chunk) 950 unsigned long pages_dirtied)
995{ 951{
996 unsigned long nr_reclaimable, bdi_nr_reclaimable; 952 unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
953 unsigned long bdi_reclaimable;
997 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ 954 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
998 unsigned long bdi_dirty; 955 unsigned long bdi_dirty;
999 unsigned long freerun; 956 unsigned long freerun;
1000 unsigned long background_thresh; 957 unsigned long background_thresh;
1001 unsigned long dirty_thresh; 958 unsigned long dirty_thresh;
1002 unsigned long bdi_thresh; 959 unsigned long bdi_thresh;
1003 unsigned long task_bdi_thresh; 960 long pause = 0;
1004 unsigned long min_task_bdi_thresh;
1005 unsigned long pages_written = 0;
1006 unsigned long pause = 1;
1007 bool dirty_exceeded = false; 961 bool dirty_exceeded = false;
1008 bool clear_dirty_exceeded = true; 962 unsigned long task_ratelimit;
963 unsigned long dirty_ratelimit;
964 unsigned long pos_ratio;
1009 struct backing_dev_info *bdi = mapping->backing_dev_info; 965 struct backing_dev_info *bdi = mapping->backing_dev_info;
1010 unsigned long start_time = jiffies; 966 unsigned long start_time = jiffies;
1011 967
1012 for (;;) { 968 for (;;) {
969 /*
970 * Unstable writes are a feature of certain networked
971 * filesystems (e.g. NFS) in which data may have been
972 * written to the server's write cache, but has not yet
973 * been flushed to permanent storage.
974 */
1013 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 975 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
1014 global_page_state(NR_UNSTABLE_NFS); 976 global_page_state(NR_UNSTABLE_NFS);
1015 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); 977 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
@@ -1026,9 +988,23 @@ static void balance_dirty_pages(struct address_space *mapping,
1026 if (nr_dirty <= freerun) 988 if (nr_dirty <= freerun)
1027 break; 989 break;
1028 990
991 if (unlikely(!writeback_in_progress(bdi)))
992 bdi_start_background_writeback(bdi);
993
994 /*
995 * bdi_thresh is not treated as a limiting factor the way
996 * dirty_thresh is, for the following reasons:
997 * - in JBOD setup, bdi_thresh can fluctuate a lot
998 * - in a system with HDD and USB key, the USB key may somehow
999 * go into state (bdi_dirty >> bdi_thresh) either because
1000 * bdi_dirty starts high, or because bdi_thresh drops low.
1001 * In this case we don't want to hard throttle the USB key
1002 * dirtiers for 100 seconds until bdi_dirty drops under
1003 * bdi_thresh. Instead the auxiliary bdi control line in
1004 * bdi_position_ratio() will let the dirtier task progress
1005 * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
1006 */
1029 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 1007 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
1030 min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
1031 task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
1032 1008
1033 /* 1009 /*
1034 * In order to avoid the stacked BDI deadlock we need 1010 * In order to avoid the stacked BDI deadlock we need
@@ -1040,57 +1016,41 @@ static void balance_dirty_pages(struct address_space *mapping,
1040 * actually dirty; with m+n sitting in the percpu 1016 * actually dirty; with m+n sitting in the percpu
1041 * deltas. 1017 * deltas.
1042 */ 1018 */
1043 if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) { 1019 if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
1044 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 1020 bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
1045 bdi_dirty = bdi_nr_reclaimable + 1021 bdi_dirty = bdi_reclaimable +
1046 bdi_stat_sum(bdi, BDI_WRITEBACK); 1022 bdi_stat_sum(bdi, BDI_WRITEBACK);
1047 } else { 1023 } else {
1048 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 1024 bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
1049 bdi_dirty = bdi_nr_reclaimable + 1025 bdi_dirty = bdi_reclaimable +
1050 bdi_stat(bdi, BDI_WRITEBACK); 1026 bdi_stat(bdi, BDI_WRITEBACK);
1051 } 1027 }
1052 1028
1053 /* 1029 dirty_exceeded = (bdi_dirty > bdi_thresh) ||
1054 * The bdi thresh is somehow "soft" limit derived from the
1055 * global "hard" limit. The former helps to prevent heavy IO
1056 * bdi or process from holding back light ones; The latter is
1057 * the last resort safeguard.
1058 */
1059 dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
1060 (nr_dirty > dirty_thresh); 1030 (nr_dirty > dirty_thresh);
1061 clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) && 1031 if (dirty_exceeded && !bdi->dirty_exceeded)
1062 (nr_dirty <= dirty_thresh);
1063
1064 if (!dirty_exceeded)
1065 break;
1066
1067 if (!bdi->dirty_exceeded)
1068 bdi->dirty_exceeded = 1; 1032 bdi->dirty_exceeded = 1;
1069 1033
1070 bdi_update_bandwidth(bdi, dirty_thresh, background_thresh, 1034 bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
1071 nr_dirty, bdi_thresh, bdi_dirty, 1035 nr_dirty, bdi_thresh, bdi_dirty,
1072 start_time); 1036 start_time);
1073 1037
1074 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 1038 dirty_ratelimit = bdi->dirty_ratelimit;
1075 * Unstable writes are a feature of certain networked 1039 pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
1076 * filesystems (i.e. NFS) in which data may have been 1040 background_thresh, nr_dirty,
1077 * written to the server's write cache, but has not yet 1041 bdi_thresh, bdi_dirty);
1078 * been flushed to permanent storage. 1042 if (unlikely(pos_ratio == 0)) {
1079 * Only move pages to writeback if this bdi is over its 1043 pause = MAX_PAUSE;
1080 * threshold otherwise wait until the disk writes catch 1044 goto pause;
1081 * up.
1082 */
1083 trace_balance_dirty_start(bdi);
1084 if (bdi_nr_reclaimable > task_bdi_thresh) {
1085 pages_written += writeback_inodes_wb(&bdi->wb,
1086 write_chunk);
1087 trace_balance_dirty_written(bdi, pages_written);
1088 if (pages_written >= write_chunk)
1089 break; /* We've done our duty */
1090 } 1045 }
1046 task_ratelimit = (u64)dirty_ratelimit *
1047 pos_ratio >> RATELIMIT_CALC_SHIFT;
1048 pause = (HZ * pages_dirtied) / (task_ratelimit | 1);
1049 pause = min_t(long, pause, MAX_PAUSE);
1050
1051pause:
1091 __set_current_state(TASK_UNINTERRUPTIBLE); 1052 __set_current_state(TASK_UNINTERRUPTIBLE);
1092 io_schedule_timeout(pause); 1053 io_schedule_timeout(pause);
1093 trace_balance_dirty_wait(bdi);
1094 1054
1095 dirty_thresh = hard_dirty_limit(dirty_thresh); 1055 dirty_thresh = hard_dirty_limit(dirty_thresh);
1096 /* 1056 /*
@@ -1099,22 +1059,11 @@ static void balance_dirty_pages(struct address_space *mapping,
1099 * 200ms is typically more than enough to curb heavy dirtiers; 1059 * 200ms is typically more than enough to curb heavy dirtiers;
1100 * (b) the pause time limit makes the dirtiers more responsive. 1060 * (b) the pause time limit makes the dirtiers more responsive.
1101 */ 1061 */
1102 if (nr_dirty < dirty_thresh && 1062 if (nr_dirty < dirty_thresh)
1103 bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 &&
1104 time_after(jiffies, start_time + MAX_PAUSE))
1105 break; 1063 break;
1106
1107 /*
1108 * Increase the delay for each loop, up to our previous
1109 * default of taking a 100ms nap.
1110 */
1111 pause <<= 1;
1112 if (pause > HZ / 10)
1113 pause = HZ / 10;
1114 } 1064 }
1115 1065
1116 /* Clear dirty_exceeded flag only when no task can exceed the limit */ 1066 if (!dirty_exceeded && bdi->dirty_exceeded)
1117 if (clear_dirty_exceeded && bdi->dirty_exceeded)
1118 bdi->dirty_exceeded = 0; 1067 bdi->dirty_exceeded = 0;
1119 1068
1120 current->nr_dirtied = 0; 1069 current->nr_dirtied = 0;
@@ -1131,8 +1080,10 @@ static void balance_dirty_pages(struct address_space *mapping,
1131 * In normal mode, we start background writeout at the lower 1080 * In normal mode, we start background writeout at the lower
1132 * background_thresh, to keep the amount of dirty memory low. 1081 * background_thresh, to keep the amount of dirty memory low.
1133 */ 1082 */
1134 if ((laptop_mode && pages_written) || 1083 if (laptop_mode)
1135 (!laptop_mode && (nr_reclaimable > background_thresh))) 1084 return;
1085
1086 if (nr_reclaimable > background_thresh)
1136 bdi_start_background_writeback(bdi); 1087 bdi_start_background_writeback(bdi);
1137} 1088}
1138 1089