path: root/mm/page-writeback.c
author		Wu Fengguang <fengguang.wu@intel.com>	2011-06-11 20:10:12 -0400
committer	Wu Fengguang <fengguang.wu@intel.com>	2011-10-03 09:08:57 -0400
commit		9d823e8f6b1b7b39f952d7d1795f29162143a433 (patch)
tree		2ef4c0d29353452dd2f894e7dbd240a31bdd0a02 /mm/page-writeback.c
parent		7381131cbcf7e15d201a0ffd782a4698efe4e740 (diff)
writeback: per task dirty rate limit
Add two fields to task_struct:

1) account dirtied pages in the individual tasks, for accuracy
2) per-task balance_dirty_pages() call intervals, for flexibility

The balance_dirty_pages() call interval (i.e. nr_dirtied_pause) will scale
near-sqrt to the safety gap between the dirty pages and the threshold.

The main problem of a per-task nr_dirtied is that if 1k+ tasks start dirtying
pages at exactly the same time, each task will be assigned a large initial
nr_dirtied_pause, so the dirty threshold will be exceeded long before each
task reaches its nr_dirtied_pause and hence calls balance_dirty_pages().

The solution is to watch the number of pages dirtied on each CPU in between
the calls into balance_dirty_pages(). If it exceeds ratelimit_pages (3% of
the dirty threshold), force a call to balance_dirty_pages() for a chance to
set bdi->dirty_exceeded. In normal situations this safeguarding condition is
not expected to trigger at all.

On the sqrt in dirty_poll_interval(): it will serve as an initial guess when
dirty pages are still in the freerun area. When dirty pages are floating
inside the dirty control scope [freerun, limit], a followup patch will use a
refined dirty poll interval to get the desired pause time.

    thresh-dirty (MB)    sqrt
                    1      16
                    2      22
                    4      32
                    8      45
                   16      64
                   32      90
                   64     128
                  128     181
                  256     256
                  512     362
                 1024     512

The above table means that, given a 1MB (or 1GB) gap and the dd tasks polling
balance_dirty_pages() on every 16 (or 512) pages, the dirty limit won't be
exceeded as long as there are fewer than 16 (or 512) concurrent dd's.

So sqrt naturally leads to lower overhead and allows more concurrent tasks on
large memory servers, which have large (thresh-freerun) gaps.

peter: keep the per-CPU ratelimit for safeguarding the 1k+ tasks case

CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Andrea Righi <andrea@betterlinux.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
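For reference, the scaling above can be reproduced outside the kernel. The
following user-space sketch (an illustration only, not part of the patch;
it assumes 4KB pages) recomputes the table, printing both the exact sqrt of
the gap in pages and the value dirty_poll_interval() actually returns, which
is that sqrt rounded down to a power of two:

/* build: cc -O2 poll_interval.c -lm */
#include <math.h>
#include <stdio.h>

/* floor(log2(x)) for x > 0, standing in for the kernel's ilog2() */
static unsigned long ilog2_ul(unsigned long x)
{
	unsigned long r = 0;

	while (x >>= 1)
		r++;
	return r;
}

/* Same formula as the dirty_poll_interval() added by this patch. */
static unsigned long dirty_poll_interval(unsigned long dirty, unsigned long thresh)
{
	if (thresh > dirty)
		return 1UL << (ilog2_ul(thresh - dirty) >> 1);
	return 1;
}

int main(void)
{
	unsigned long mb;

	/* Gaps of 1MB..1024MB expressed in 4KB pages. */
	for (mb = 1; mb <= 1024; mb *= 2) {
		unsigned long gap = mb << (20 - 12);

		printf("%5luMB gap: sqrt=%4lu  poll every %4lu dirtied pages\n",
		       mb, (unsigned long)sqrt((double)gap),
		       dirty_poll_interval(0, gap));
	}
	return 0;
}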
Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r--	mm/page-writeback.c | 89
1 file changed, 50 insertions(+), 39 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d4a6e91bd9e5..daff320d263f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -54,20 +54,6 @@
  */
 static long ratelimit_pages = 32;
 
-/*
- * When balance_dirty_pages decides that the caller needs to perform some
- * non-background writeback, this is how many pages it will attempt to write.
- * It should be somewhat larger than dirtied pages to ensure that reasonably
- * large amounts of I/O are submitted.
- */
-static inline long sync_writeback_pages(unsigned long dirtied)
-{
-	if (dirtied < ratelimit_pages)
-		dirtied = ratelimit_pages;
-
-	return dirtied + dirtied / 2;
-}
-
 /* The following parameters are exported via /proc/sys/vm */
 
 /*
@@ -169,6 +155,8 @@ static void update_completion_period(void)
 	int shift = calc_period_shift();
 	prop_change_shift(&vm_completions, shift);
 	prop_change_shift(&vm_dirties, shift);
+
+	writeback_set_ratelimit();
 }
 
 int dirty_background_ratio_handler(struct ctl_table *table, int write,
@@ -979,6 +967,23 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
 }
 
 /*
+ * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
+ * will look to see if it needs to start dirty throttling.
+ *
+ * If dirty_poll_interval is too low, big NUMA machines will call the expensive
+ * global_page_state() too often. So scale it near-sqrt to the safety margin
+ * (the number of pages we may dirty without exceeding the dirty limits).
+ */
+static unsigned long dirty_poll_interval(unsigned long dirty,
+					 unsigned long thresh)
+{
+	if (thresh > dirty)
+		return 1UL << (ilog2(thresh - dirty) >> 1);
+
+	return 1;
+}
+
+/*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data.  It looks at the number of dirty pages in the machine and will force
  * the caller to perform writeback if the system is over `vm_dirty_ratio'.
@@ -1112,6 +1117,9 @@ static void balance_dirty_pages(struct address_space *mapping,
 	if (clear_dirty_exceeded && bdi->dirty_exceeded)
 		bdi->dirty_exceeded = 0;
 
+	current->nr_dirtied = 0;
+	current->nr_dirtied_pause = dirty_poll_interval(nr_dirty, dirty_thresh);
+
 	if (writeback_in_progress(bdi))
 		return;
 
@@ -1138,7 +1146,7 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
 	}
 }
 
-static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
+static DEFINE_PER_CPU(int, bdp_ratelimits);
 
 /**
  * balance_dirty_pages_ratelimited_nr - balance dirty memory state
@@ -1158,31 +1166,39 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 					unsigned long nr_pages_dirtied)
 {
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
-	unsigned long ratelimit;
-	unsigned long *p;
+	int ratelimit;
+	int *p;
 
 	if (!bdi_cap_account_dirty(bdi))
 		return;
 
-	ratelimit = ratelimit_pages;
-	if (mapping->backing_dev_info->dirty_exceeded)
-		ratelimit = 8;
+	ratelimit = current->nr_dirtied_pause;
+	if (bdi->dirty_exceeded)
+		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
+
+	current->nr_dirtied += nr_pages_dirtied;
 
+	preempt_disable();
 	/*
-	 * Check the rate limiting. Also, we do not want to throttle real-time
-	 * tasks in balance_dirty_pages(). Period.
+	 * This prevents one CPU to accumulate too many dirtied pages without
+	 * calling into balance_dirty_pages(), which can happen when there are
+	 * 1000+ tasks, all of them start dirtying pages at exactly the same
+	 * time, hence all honoured too large initial task->nr_dirtied_pause.
 	 */
-	preempt_disable();
 	p = &__get_cpu_var(bdp_ratelimits);
-	*p += nr_pages_dirtied;
-	if (unlikely(*p >= ratelimit)) {
-		ratelimit = sync_writeback_pages(*p);
+	if (unlikely(current->nr_dirtied >= ratelimit))
 		*p = 0;
-		preempt_enable();
-		balance_dirty_pages(mapping, ratelimit);
-		return;
+	else {
+		*p += nr_pages_dirtied;
+		if (unlikely(*p >= ratelimit_pages)) {
+			*p = 0;
+			ratelimit = 0;
+		}
 	}
 	preempt_enable();
+
+	if (unlikely(current->nr_dirtied >= ratelimit))
+		balance_dirty_pages(mapping, current->nr_dirtied);
 }
 EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
 
@@ -1277,22 +1293,17 @@ void laptop_sync_completion(void)
  *
  * Here we set ratelimit_pages to a level which ensures that when all CPUs are
  * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
- * thresholds before writeback cuts in.
- *
- * But the limit should not be set too high.  Because it also controls the
- * amount of memory which the balance_dirty_pages() caller has to write back.
- * If this is too large then the caller will block on the IO queue all the
- * time.  So limit it to four megabytes - the balance_dirty_pages() caller
- * will write six megabyte chunks, max.
+ * thresholds.
  */
 
 void writeback_set_ratelimit(void)
 {
-	ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+	global_dirty_limits(&background_thresh, &dirty_thresh);
+	ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
 	if (ratelimit_pages < 16)
 		ratelimit_pages = 16;
-	if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
-		ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
 }
 
 static int __cpuinit
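To make the restructured ratelimit check above easier to follow, here is a
small user-space model of the decision it implements (a sketch only, not
kernel code: preemption handling, the bdi lookup and the real
balance_dirty_pages() are elided, ratelimit_pages is an assumed constant, the
clamp to 8 pages assumes 4KB pages, and the names merely mirror the patch):

#include <stdio.h>

/* Stand-ins for the two fields this patch adds to task_struct. */
struct task_model {
	int nr_dirtied;		/* pages dirtied since the last balance call */
	int nr_dirtied_pause;	/* per-task poll interval */
};

static int bdp_ratelimits;			/* models one CPU's bdp_ratelimits counter */
static const int ratelimit_pages = 1024;	/* assumed 3%-of-threshold safeguard */

/* Returns 1 when balance_dirty_pages() would be entered. */
static int would_balance(struct task_model *tsk, int nr_pages_dirtied,
			 int dirty_exceeded)
{
	int ratelimit = tsk->nr_dirtied_pause;

	if (dirty_exceeded && ratelimit > 8)
		ratelimit = 8;		/* 32KB worth of 4KB pages, as in the patch */

	tsk->nr_dirtied += nr_pages_dirtied;

	if (tsk->nr_dirtied >= ratelimit) {
		bdp_ratelimits = 0;	/* this task is about to balance anyway */
	} else {
		/* Safeguard: many tasks on one CPU, none at its pause yet. */
		bdp_ratelimits += nr_pages_dirtied;
		if (bdp_ratelimits >= ratelimit_pages) {
			bdp_ratelimits = 0;
			ratelimit = 0;	/* force the balance call */
		}
	}
	return tsk->nr_dirtied >= ratelimit;
}

int main(void)
{
	struct task_model tsk = { .nr_dirtied = 0, .nr_dirtied_pause = 256 };
	int i;

	/* One task dirtying 64 pages per call; it balances on every 4th call. */
	for (i = 0; i < 8; i++) {
		int balance = would_balance(&tsk, 64, 0);

		printf("nr_dirtied=%3d -> balance=%d\n", tsk.nr_dirtied, balance);
		if (balance)
			tsk.nr_dirtied = 0;	/* balance_dirty_pages() resets it */
	}
	return 0;
}

In the real code the same two checks on current->nr_dirtied bracket the
preempt_disable()/preempt_enable() region, so the per-CPU counter is only
touched with preemption disabled.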