aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorWu Fengguang <fengguang.wu@intel.com>2011-06-11 21:25:42 -0400
committerWu Fengguang <fengguang.wu@intel.com>2011-12-18 01:20:27 -0500
commit83712358ba0a1497ce59a4f84ce4dd0f803fe6fc (patch)
treed17ab27a7bff50616e3b63ad137c004d9ccfbcb0
parent32c7f202a4801252a0f3578807b75a961f792870 (diff)
writeback: dirty ratelimit - think time compensation
Compensate the task's think time when computing the final pause time, so that ->dirty_ratelimit can be executed accurately. think time := time spent outside of balance_dirty_pages() In the rare case that the task slept longer than the 200ms period time (resulting in a negative pause time), the sleep time will be compensated in the following periods, too, if it's less than 1 second. Accumulated errors are carefully avoided as long as the max pause area is not hit. Pseudo code: period = pages_dirtied / task_ratelimit; think = jiffies - dirty_paused_when; pause = period - think; 1) normal case: period > think pause = period - think dirty_paused_when = jiffies + pause nr_dirtied = 0 period time |===============================>| think time pause time |===============>|==============>| ------|----------------|---------------|------------------------ dirty_paused_when jiffies 2) no pause case: period <= think don't pause; reduce future pause time by: dirty_paused_when += period nr_dirtied = 0 period time |===============================>| think time |===================================================>| ------|--------------------------------+-------------------|---- dirty_paused_when jiffies Acked-by: Jan Kara <jack@suse.cz> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
-rw-r--r--include/linux/sched.h1
-rw-r--r--include/trace/events/writeback.h14
-rw-r--r--kernel/fork.c1
-rw-r--r--mm/page-writeback.c36
4 files changed, 45 insertions, 7 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c4f3e9b9bc5..984c3b295978 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1527,6 +1527,7 @@ struct task_struct {
1527 */ 1527 */
1528 int nr_dirtied; 1528 int nr_dirtied;
1529 int nr_dirtied_pause; 1529 int nr_dirtied_pause;
1530 unsigned long dirty_paused_when; /* start of a write-and-pause period */
1530 1531
1531#ifdef CONFIG_LATENCYTOP 1532#ifdef CONFIG_LATENCYTOP
1532 int latency_record_count; 1533 int latency_record_count;
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 99d1d0decf88..8588a8918023 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -300,12 +300,13 @@ TRACE_EVENT(balance_dirty_pages,
300 unsigned long dirty_ratelimit, 300 unsigned long dirty_ratelimit,
301 unsigned long task_ratelimit, 301 unsigned long task_ratelimit,
302 unsigned long dirtied, 302 unsigned long dirtied,
303 unsigned long period,
303 long pause, 304 long pause,
304 unsigned long start_time), 305 unsigned long start_time),
305 306
306 TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, 307 TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
307 dirty_ratelimit, task_ratelimit, 308 dirty_ratelimit, task_ratelimit,
308 dirtied, pause, start_time), 309 dirtied, period, pause, start_time),
309 310
310 TP_STRUCT__entry( 311 TP_STRUCT__entry(
311 __array( char, bdi, 32) 312 __array( char, bdi, 32)
@@ -320,6 +321,8 @@ TRACE_EVENT(balance_dirty_pages,
320 __field(unsigned int, dirtied_pause) 321 __field(unsigned int, dirtied_pause)
321 __field(unsigned long, paused) 322 __field(unsigned long, paused)
322 __field( long, pause) 323 __field( long, pause)
324 __field(unsigned long, period)
325 __field( long, think)
323 ), 326 ),
324 327
325 TP_fast_assign( 328 TP_fast_assign(
@@ -336,6 +339,9 @@ TRACE_EVENT(balance_dirty_pages,
336 __entry->task_ratelimit = KBps(task_ratelimit); 339 __entry->task_ratelimit = KBps(task_ratelimit);
337 __entry->dirtied = dirtied; 340 __entry->dirtied = dirtied;
338 __entry->dirtied_pause = current->nr_dirtied_pause; 341 __entry->dirtied_pause = current->nr_dirtied_pause;
342 __entry->think = current->dirty_paused_when == 0 ? 0 :
343 (long)(jiffies - current->dirty_paused_when) * 1000/HZ;
344 __entry->period = period * 1000 / HZ;
339 __entry->pause = pause * 1000 / HZ; 345 __entry->pause = pause * 1000 / HZ;
340 __entry->paused = (jiffies - start_time) * 1000 / HZ; 346 __entry->paused = (jiffies - start_time) * 1000 / HZ;
341 ), 347 ),
@@ -346,7 +352,7 @@ TRACE_EVENT(balance_dirty_pages,
346 "bdi_setpoint=%lu bdi_dirty=%lu " 352 "bdi_setpoint=%lu bdi_dirty=%lu "
347 "dirty_ratelimit=%lu task_ratelimit=%lu " 353 "dirty_ratelimit=%lu task_ratelimit=%lu "
348 "dirtied=%u dirtied_pause=%u " 354 "dirtied=%u dirtied_pause=%u "
349 "paused=%lu pause=%ld", 355 "paused=%lu pause=%ld period=%lu think=%ld",
350 __entry->bdi, 356 __entry->bdi,
351 __entry->limit, 357 __entry->limit,
352 __entry->setpoint, 358 __entry->setpoint,
@@ -358,7 +364,9 @@ TRACE_EVENT(balance_dirty_pages,
358 __entry->dirtied, 364 __entry->dirtied,
359 __entry->dirtied_pause, 365 __entry->dirtied_pause,
360 __entry->paused, /* ms */ 366 __entry->paused, /* ms */
361 __entry->pause /* ms */ 367 __entry->pause, /* ms */
368 __entry->period, /* ms */
369 __entry->think /* ms */
362 ) 370 )
363); 371);
364 372
diff --git a/kernel/fork.c b/kernel/fork.c
index da4a6a10d088..f8668cf6a32d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1296,6 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1296 1296
1297 p->nr_dirtied = 0; 1297 p->nr_dirtied = 0;
1298 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); 1298 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1299 p->dirty_paused_when = 0;
1299 1300
1300 /* 1301 /*
1301 * Ok, make it visible to the rest of the system. 1302 * Ok, make it visible to the rest of the system.
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 96b3e7aa705c..491932155825 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1016,6 +1016,7 @@ static void balance_dirty_pages(struct address_space *mapping,
1016 unsigned long background_thresh; 1016 unsigned long background_thresh;
1017 unsigned long dirty_thresh; 1017 unsigned long dirty_thresh;
1018 unsigned long bdi_thresh; 1018 unsigned long bdi_thresh;
1019 long period;
1019 long pause = 0; 1020 long pause = 0;
1020 long uninitialized_var(max_pause); 1021 long uninitialized_var(max_pause);
1021 bool dirty_exceeded = false; 1022 bool dirty_exceeded = false;
@@ -1026,6 +1027,8 @@ static void balance_dirty_pages(struct address_space *mapping,
1026 unsigned long start_time = jiffies; 1027 unsigned long start_time = jiffies;
1027 1028
1028 for (;;) { 1029 for (;;) {
1030 unsigned long now = jiffies;
1031
1029 /* 1032 /*
1030 * Unstable writes are a feature of certain networked 1033 * Unstable writes are a feature of certain networked
1031 * filesystems (i.e. NFS) in which data may have been 1034 * filesystems (i.e. NFS) in which data may have been
@@ -1045,8 +1048,11 @@ static void balance_dirty_pages(struct address_space *mapping,
1045 */ 1048 */
1046 freerun = dirty_freerun_ceiling(dirty_thresh, 1049 freerun = dirty_freerun_ceiling(dirty_thresh,
1047 background_thresh); 1050 background_thresh);
1048 if (nr_dirty <= freerun) 1051 if (nr_dirty <= freerun) {
1052 current->dirty_paused_when = now;
1053 current->nr_dirtied = 0;
1049 break; 1054 break;
1055 }
1050 1056
1051 if (unlikely(!writeback_in_progress(bdi))) 1057 if (unlikely(!writeback_in_progress(bdi)))
1052 bdi_start_background_writeback(bdi); 1058 bdi_start_background_writeback(bdi);
@@ -1104,10 +1110,21 @@ static void balance_dirty_pages(struct address_space *mapping,
1104 task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> 1110 task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
1105 RATELIMIT_CALC_SHIFT; 1111 RATELIMIT_CALC_SHIFT;
1106 if (unlikely(task_ratelimit == 0)) { 1112 if (unlikely(task_ratelimit == 0)) {
1113 period = max_pause;
1107 pause = max_pause; 1114 pause = max_pause;
1108 goto pause; 1115 goto pause;
1109 } 1116 }
1110 pause = HZ * pages_dirtied / task_ratelimit; 1117 period = HZ * pages_dirtied / task_ratelimit;
1118 pause = period;
1119 if (current->dirty_paused_when)
1120 pause -= now - current->dirty_paused_when;
1121 /*
1122 * For less than 1s think time (ext3/4 may block the dirtier
1123 * for up to 800ms from time to time on 1-HDD; so does xfs,
1124 * however at much less frequency), try to compensate it in
1125 * future periods by updating the virtual time; otherwise just
1126 * do a reset, as it may be a light dirtier.
1127 */
1111 if (unlikely(pause <= 0)) { 1128 if (unlikely(pause <= 0)) {
1112 trace_balance_dirty_pages(bdi, 1129 trace_balance_dirty_pages(bdi,
1113 dirty_thresh, 1130 dirty_thresh,
@@ -1118,8 +1135,16 @@ static void balance_dirty_pages(struct address_space *mapping,
1118 dirty_ratelimit, 1135 dirty_ratelimit,
1119 task_ratelimit, 1136 task_ratelimit,
1120 pages_dirtied, 1137 pages_dirtied,
1138 period,
1121 pause, 1139 pause,
1122 start_time); 1140 start_time);
1141 if (pause < -HZ) {
1142 current->dirty_paused_when = now;
1143 current->nr_dirtied = 0;
1144 } else if (period) {
1145 current->dirty_paused_when += period;
1146 current->nr_dirtied = 0;
1147 }
1123 pause = 1; /* avoid resetting nr_dirtied_pause below */ 1148 pause = 1; /* avoid resetting nr_dirtied_pause below */
1124 break; 1149 break;
1125 } 1150 }
@@ -1135,11 +1160,15 @@ pause:
1135 dirty_ratelimit, 1160 dirty_ratelimit,
1136 task_ratelimit, 1161 task_ratelimit,
1137 pages_dirtied, 1162 pages_dirtied,
1163 period,
1138 pause, 1164 pause,
1139 start_time); 1165 start_time);
1140 __set_current_state(TASK_KILLABLE); 1166 __set_current_state(TASK_KILLABLE);
1141 io_schedule_timeout(pause); 1167 io_schedule_timeout(pause);
1142 1168
1169 current->dirty_paused_when = now + pause;
1170 current->nr_dirtied = 0;
1171
1143 /* 1172 /*
1144 * This is typically equal to (nr_dirty < dirty_thresh) and can 1173 * This is typically equal to (nr_dirty < dirty_thresh) and can
1145 * also keep "1000+ dd on a slow USB stick" under control. 1174 * also keep "1000+ dd on a slow USB stick" under control.
@@ -1167,11 +1196,10 @@ pause:
1167 if (!dirty_exceeded && bdi->dirty_exceeded) 1196 if (!dirty_exceeded && bdi->dirty_exceeded)
1168 bdi->dirty_exceeded = 0; 1197 bdi->dirty_exceeded = 0;
1169 1198
1170 current->nr_dirtied = 0;
1171 if (pause == 0) { /* in freerun area */ 1199 if (pause == 0) { /* in freerun area */
1172 current->nr_dirtied_pause = 1200 current->nr_dirtied_pause =
1173 dirty_poll_interval(nr_dirty, dirty_thresh); 1201 dirty_poll_interval(nr_dirty, dirty_thresh);
1174 } else if (pause <= max_pause / 4 && 1202 } else if (period <= max_pause / 4 &&
1175 pages_dirtied >= current->nr_dirtied_pause) { 1203 pages_dirtied >= current->nr_dirtied_pause) {
1176 current->nr_dirtied_pause = clamp_val( 1204 current->nr_dirtied_pause = clamp_val(
1177 dirty_ratelimit * (max_pause / 2) / HZ, 1205 dirty_ratelimit * (max_pause / 2) / HZ,