-rw-r--r--  fs/block_dev.c                     |  16
-rw-r--r--  fs/btrfs/extent_io.c               |   2
-rw-r--r--  fs/ext4/inode.c                    |   4
-rw-r--r--  fs/fs-writeback.c                  | 373
-rw-r--r--  fs/inode.c                         |   5
-rw-r--r--  fs/nfs/write.c                     |   3
-rw-r--r--  include/linux/backing-dev.h        |   8
-rw-r--r--  include/linux/writeback.h          |  55
-rw-r--r--  include/trace/events/btrfs.h       |   6
-rw-r--r--  include/trace/events/ext4.h        |   6
-rw-r--r--  include/trace/events/writeback.h   | 183
-rw-r--r--  mm/backing-dev.c                   |  82
-rw-r--r--  mm/filemap.c                       |   6
-rw-r--r--  mm/page-writeback.c                | 280
-rw-r--r--  mm/rmap.c                          |   4
15 files changed, 747 insertions(+), 286 deletions(-)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c62fb84944d5..f55aad4d1611 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode)
44{ 44{
45 return &BDEV_I(inode)->bdev; 45 return &BDEV_I(inode)->bdev;
46} 46}
47
48EXPORT_SYMBOL(I_BDEV); 47EXPORT_SYMBOL(I_BDEV);
49 48
50/* 49/*
51 * move the inode from it's current bdi to the a new bdi. if the inode is dirty 50 * Move the inode from its current bdi to a new bdi. If the inode is dirty we
52 * we need to move it onto the dirty list of @dst so that the inode is always 51 * need to move it onto the dirty list of @dst so that the inode is always on
53 * on the right list. 52 * the right list.
54 */ 53 */
55static void bdev_inode_switch_bdi(struct inode *inode, 54static void bdev_inode_switch_bdi(struct inode *inode,
56 struct backing_dev_info *dst) 55 struct backing_dev_info *dst)
57{ 56{
58 spin_lock(&inode_wb_list_lock); 57 struct backing_dev_info *old = inode->i_data.backing_dev_info;
58
59 if (unlikely(dst == old)) /* deadlock avoidance */
60 return;
61 bdi_lock_two(&old->wb, &dst->wb);
59 spin_lock(&inode->i_lock); 62 spin_lock(&inode->i_lock);
60 inode->i_data.backing_dev_info = dst; 63 inode->i_data.backing_dev_info = dst;
61 if (inode->i_state & I_DIRTY) 64 if (inode->i_state & I_DIRTY)
62 list_move(&inode->i_wb_list, &dst->wb.b_dirty); 65 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
63 spin_unlock(&inode->i_lock); 66 spin_unlock(&inode->i_lock);
64 spin_unlock(&inode_wb_list_lock); 67 spin_unlock(&old->wb.list_lock);
68 spin_unlock(&dst->wb.list_lock);
65} 69}
66 70
67static sector_t max_block(struct block_device *bdev) 71static sector_t max_block(struct block_device *bdev)
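
[Editor's note] The bdev_inode_switch_bdi() hunk above drops the global inode_wb_list_lock in favour of the two per-bdi list_locks, taken together through bdi_lock_two() and skipped entirely when the inode already sits on the destination bdi. Below is a minimal userspace sketch of the same pattern, assuming pthread mutexes stand in for the kernel spinlocks and ignoring spin_lock_nested(); the struct and function names are invented for illustration.

/* Illustrative userspace model of the lock-ordering pattern used by
 * bdev_inode_switch_bdi()/bdi_lock_two(); not kernel code. */
#include <pthread.h>
#include <stdio.h>

struct wb_list {                      /* stand-in for bdi_writeback */
	pthread_mutex_t list_lock;
	int nr_items;
};

/* Always lock the lower-addressed structure first so that two threads
 * switching in opposite directions cannot deadlock (ABBA). */
static void wb_lock_two(struct wb_list *a, struct wb_list *b)
{
	if (a < b) {
		pthread_mutex_lock(&a->list_lock);
		pthread_mutex_lock(&b->list_lock);
	} else {
		pthread_mutex_lock(&b->list_lock);
		pthread_mutex_lock(&a->list_lock);
	}
}

static void wb_switch(struct wb_list *old, struct wb_list *dst)
{
	if (old == dst)               /* same list: nothing to move, and      */
		return;               /* locking it twice would self-deadlock */
	wb_lock_two(old, dst);
	old->nr_items--;              /* "move" one item between the lists */
	dst->nr_items++;
	pthread_mutex_unlock(&old->list_lock);
	pthread_mutex_unlock(&dst->list_lock);
}

int main(void)
{
	struct wb_list a = { PTHREAD_MUTEX_INITIALIZER, 1 };
	struct wb_list b = { PTHREAD_MUTEX_INITIALIZER, 0 };

	wb_switch(&a, &b);
	printf("a=%d b=%d\n", a.nr_items, b.nr_items);   /* prints a=0 b=1 */
	return 0;
}

Ordering by address gives every pair of locks a single global acquisition order, which is what makes concurrent a-to-b and b-to-a switches safe.
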
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7055d11c1efd..561262d35689 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2551,7 +2551,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2551 }; 2551 };
2552 struct writeback_control wbc_writepages = { 2552 struct writeback_control wbc_writepages = {
2553 .sync_mode = wbc->sync_mode, 2553 .sync_mode = wbc->sync_mode,
2554 .older_than_this = NULL,
2555 .nr_to_write = 64, 2554 .nr_to_write = 64,
2556 .range_start = page_offset(page) + PAGE_CACHE_SIZE, 2555 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2557 .range_end = (loff_t)-1, 2556 .range_end = (loff_t)-1,
@@ -2584,7 +2583,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2584 }; 2583 };
2585 struct writeback_control wbc_writepages = { 2584 struct writeback_control wbc_writepages = {
2586 .sync_mode = mode, 2585 .sync_mode = mode,
2587 .older_than_this = NULL,
2588 .nr_to_write = nr_pages * 2, 2586 .nr_to_write = nr_pages * 2,
2589 .range_start = start, 2587 .range_start = start,
2590 .range_end = end + 1, 2588 .range_end = end + 1,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 678cde834f19..3e5191f9f398 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2741,7 +2741,7 @@ static int write_cache_pages_da(struct address_space *mapping,
2741 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2741 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2742 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2742 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2743 2743
2744 if (wbc->sync_mode == WB_SYNC_ALL) 2744 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2745 tag = PAGECACHE_TAG_TOWRITE; 2745 tag = PAGECACHE_TAG_TOWRITE;
2746 else 2746 else
2747 tag = PAGECACHE_TAG_DIRTY; 2747 tag = PAGECACHE_TAG_DIRTY;
@@ -2973,7 +2973,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2973 } 2973 }
2974 2974
2975retry: 2975retry:
2976 if (wbc->sync_mode == WB_SYNC_ALL) 2976 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2977 tag_pages_for_writeback(mapping, index, end); 2977 tag_pages_for_writeback(mapping, index, end);
2978 2978
2979 while (!ret && wbc->nr_to_write > 0) { 2979 while (!ret && wbc->nr_to_write > 0) {
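
[Editor's note] The WB_SYNC_ALL || tagged_writepages checks above select PAGECACHE_TAG_TOWRITE so a sync-style pass works on a snapshot: pages that are dirty when the pass starts get tagged, and only tagged pages are written, so pages dirtied during the pass cannot keep it alive forever. Here is a minimal sketch of that tag-then-write idea over a plain flag array; the names are invented, and the real work is done by tag_pages_for_writeback() and write_cache_pages() on the page-cache radix tree.

/* Minimal model of tag-then-write livelock avoidance (illustrative only). */
#include <stdbool.h>
#include <stdio.h>

#define NR_PAGES 8

static bool dirty[NR_PAGES];
static bool towrite[NR_PAGES];

/* Phase 1: quickly tag everything that is dirty right now. */
static void tag_dirty_pages(void)
{
	for (int i = 0; i < NR_PAGES; i++)
		towrite[i] = dirty[i];
}

/* Phase 2: write only the tagged pages.  A page dirtied after tagging
 * (here page 0, re-dirtied mid-pass) is left for the next pass instead
 * of growing this one without bound. */
static int write_tagged_pages(void)
{
	int written = 0;

	for (int i = 0; i < NR_PAGES; i++) {
		if (!towrite[i])
			continue;
		towrite[i] = false;
		dirty[i] = false;
		written++;
		if (i == 2)
			dirty[0] = true;   /* simulate concurrent re-dirtying */
	}
	return written;
}

int main(void)
{
	dirty[0] = dirty[2] = dirty[5] = true;
	tag_dirty_pages();
	printf("wrote %d pages, page 0 dirty again: %d\n",
	       write_tagged_pages(), dirty[0]);   /* wrote 3 pages, ... 1 */
	return 0;
}
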
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index b8c507ca42f7..1599aa985fe2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -35,7 +35,9 @@
35struct wb_writeback_work { 35struct wb_writeback_work {
36 long nr_pages; 36 long nr_pages;
37 struct super_block *sb; 37 struct super_block *sb;
38 unsigned long *older_than_this;
38 enum writeback_sync_modes sync_mode; 39 enum writeback_sync_modes sync_mode;
40 unsigned int tagged_writepages:1;
39 unsigned int for_kupdate:1; 41 unsigned int for_kupdate:1;
40 unsigned int range_cyclic:1; 42 unsigned int range_cyclic:1;
41 unsigned int for_background:1; 43 unsigned int for_background:1;
@@ -180,12 +182,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
180 */ 182 */
181void inode_wb_list_del(struct inode *inode) 183void inode_wb_list_del(struct inode *inode)
182{ 184{
183 spin_lock(&inode_wb_list_lock); 185 struct backing_dev_info *bdi = inode_to_bdi(inode);
186
187 spin_lock(&bdi->wb.list_lock);
184 list_del_init(&inode->i_wb_list); 188 list_del_init(&inode->i_wb_list);
185 spin_unlock(&inode_wb_list_lock); 189 spin_unlock(&bdi->wb.list_lock);
186} 190}
187 191
188
189/* 192/*
190 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 193 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
191 * furthest end of its superblock's dirty-inode list. 194 * furthest end of its superblock's dirty-inode list.
@@ -195,11 +198,9 @@ void inode_wb_list_del(struct inode *inode)
195 * the case then the inode must have been redirtied while it was being written 198 * the case then the inode must have been redirtied while it was being written
196 * out and we don't reset its dirtied_when. 199 * out and we don't reset its dirtied_when.
197 */ 200 */
198static void redirty_tail(struct inode *inode) 201static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
199{ 202{
200 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 203 assert_spin_locked(&wb->list_lock);
201
202 assert_spin_locked(&inode_wb_list_lock);
203 if (!list_empty(&wb->b_dirty)) { 204 if (!list_empty(&wb->b_dirty)) {
204 struct inode *tail; 205 struct inode *tail;
205 206
@@ -213,11 +214,9 @@ static void redirty_tail(struct inode *inode)
213/* 214/*
214 * requeue inode for re-scanning after bdi->b_io list is exhausted. 215 * requeue inode for re-scanning after bdi->b_io list is exhausted.
215 */ 216 */
216static void requeue_io(struct inode *inode) 217static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
217{ 218{
218 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 219 assert_spin_locked(&wb->list_lock);
219
220 assert_spin_locked(&inode_wb_list_lock);
221 list_move(&inode->i_wb_list, &wb->b_more_io); 220 list_move(&inode->i_wb_list, &wb->b_more_io);
222} 221}
223 222
@@ -225,7 +224,7 @@ static void inode_sync_complete(struct inode *inode)
225{ 224{
226 /* 225 /*
227 * Prevent speculative execution through 226 * Prevent speculative execution through
228 * spin_unlock(&inode_wb_list_lock); 227 * spin_unlock(&wb->list_lock);
229 */ 228 */
230 229
231 smp_mb(); 230 smp_mb();
@@ -250,15 +249,16 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
250/* 249/*
251 * Move expired dirty inodes from @delaying_queue to @dispatch_queue. 250 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
252 */ 251 */
253static void move_expired_inodes(struct list_head *delaying_queue, 252static int move_expired_inodes(struct list_head *delaying_queue,
254 struct list_head *dispatch_queue, 253 struct list_head *dispatch_queue,
255 unsigned long *older_than_this) 254 unsigned long *older_than_this)
256{ 255{
257 LIST_HEAD(tmp); 256 LIST_HEAD(tmp);
258 struct list_head *pos, *node; 257 struct list_head *pos, *node;
259 struct super_block *sb = NULL; 258 struct super_block *sb = NULL;
260 struct inode *inode; 259 struct inode *inode;
261 int do_sb_sort = 0; 260 int do_sb_sort = 0;
261 int moved = 0;
262 262
263 while (!list_empty(delaying_queue)) { 263 while (!list_empty(delaying_queue)) {
264 inode = wb_inode(delaying_queue->prev); 264 inode = wb_inode(delaying_queue->prev);
@@ -269,12 +269,13 @@ static void move_expired_inodes(struct list_head *delaying_queue,
269 do_sb_sort = 1; 269 do_sb_sort = 1;
270 sb = inode->i_sb; 270 sb = inode->i_sb;
271 list_move(&inode->i_wb_list, &tmp); 271 list_move(&inode->i_wb_list, &tmp);
272 moved++;
272 } 273 }
273 274
274 /* just one sb in list, splice to dispatch_queue and we're done */ 275 /* just one sb in list, splice to dispatch_queue and we're done */
275 if (!do_sb_sort) { 276 if (!do_sb_sort) {
276 list_splice(&tmp, dispatch_queue); 277 list_splice(&tmp, dispatch_queue);
277 return; 278 goto out;
278 } 279 }
279 280
280 /* Move inodes from one superblock together */ 281 /* Move inodes from one superblock together */
@@ -286,6 +287,8 @@ static void move_expired_inodes(struct list_head *delaying_queue,
286 list_move(&inode->i_wb_list, dispatch_queue); 287 list_move(&inode->i_wb_list, dispatch_queue);
287 } 288 }
288 } 289 }
290out:
291 return moved;
289} 292}
290 293
291/* 294/*
@@ -301,9 +304,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
301 */ 304 */
302static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 305static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
303{ 306{
304 assert_spin_locked(&inode_wb_list_lock); 307 int moved;
308 assert_spin_locked(&wb->list_lock);
305 list_splice_init(&wb->b_more_io, &wb->b_io); 309 list_splice_init(&wb->b_more_io, &wb->b_io);
306 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 310 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
311 trace_writeback_queue_io(wb, older_than_this, moved);
307} 312}
308 313
309static int write_inode(struct inode *inode, struct writeback_control *wbc) 314static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -316,7 +321,8 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
316/* 321/*
317 * Wait for writeback on an inode to complete. 322 * Wait for writeback on an inode to complete.
318 */ 323 */
319static void inode_wait_for_writeback(struct inode *inode) 324static void inode_wait_for_writeback(struct inode *inode,
325 struct bdi_writeback *wb)
320{ 326{
321 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); 327 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
322 wait_queue_head_t *wqh; 328 wait_queue_head_t *wqh;
@@ -324,15 +330,15 @@ static void inode_wait_for_writeback(struct inode *inode)
324 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 330 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
325 while (inode->i_state & I_SYNC) { 331 while (inode->i_state & I_SYNC) {
326 spin_unlock(&inode->i_lock); 332 spin_unlock(&inode->i_lock);
327 spin_unlock(&inode_wb_list_lock); 333 spin_unlock(&wb->list_lock);
328 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 334 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
329 spin_lock(&inode_wb_list_lock); 335 spin_lock(&wb->list_lock);
330 spin_lock(&inode->i_lock); 336 spin_lock(&inode->i_lock);
331 } 337 }
332} 338}
333 339
334/* 340/*
335 * Write out an inode's dirty pages. Called under inode_wb_list_lock and 341 * Write out an inode's dirty pages. Called under wb->list_lock and
336 * inode->i_lock. Either the caller has an active reference on the inode or 342 * inode->i_lock. Either the caller has an active reference on the inode or
337 * the inode has I_WILL_FREE set. 343 * the inode has I_WILL_FREE set.
338 * 344 *
@@ -343,13 +349,15 @@ static void inode_wait_for_writeback(struct inode *inode)
343 * livelocks, etc. 349 * livelocks, etc.
344 */ 350 */
345static int 351static int
346writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 352writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
353 struct writeback_control *wbc)
347{ 354{
348 struct address_space *mapping = inode->i_mapping; 355 struct address_space *mapping = inode->i_mapping;
356 long nr_to_write = wbc->nr_to_write;
349 unsigned dirty; 357 unsigned dirty;
350 int ret; 358 int ret;
351 359
352 assert_spin_locked(&inode_wb_list_lock); 360 assert_spin_locked(&wb->list_lock);
353 assert_spin_locked(&inode->i_lock); 361 assert_spin_locked(&inode->i_lock);
354 362
355 if (!atomic_read(&inode->i_count)) 363 if (!atomic_read(&inode->i_count))
@@ -367,14 +375,16 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
367 * completed a full scan of b_io. 375 * completed a full scan of b_io.
368 */ 376 */
369 if (wbc->sync_mode != WB_SYNC_ALL) { 377 if (wbc->sync_mode != WB_SYNC_ALL) {
370 requeue_io(inode); 378 requeue_io(inode, wb);
379 trace_writeback_single_inode_requeue(inode, wbc,
380 nr_to_write);
371 return 0; 381 return 0;
372 } 382 }
373 383
374 /* 384 /*
375 * It's a data-integrity sync. We must wait. 385 * It's a data-integrity sync. We must wait.
376 */ 386 */
377 inode_wait_for_writeback(inode); 387 inode_wait_for_writeback(inode, wb);
378 } 388 }
379 389
380 BUG_ON(inode->i_state & I_SYNC); 390 BUG_ON(inode->i_state & I_SYNC);
@@ -383,7 +393,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
383 inode->i_state |= I_SYNC; 393 inode->i_state |= I_SYNC;
384 inode->i_state &= ~I_DIRTY_PAGES; 394 inode->i_state &= ~I_DIRTY_PAGES;
385 spin_unlock(&inode->i_lock); 395 spin_unlock(&inode->i_lock);
386 spin_unlock(&inode_wb_list_lock); 396 spin_unlock(&wb->list_lock);
387 397
388 ret = do_writepages(mapping, wbc); 398 ret = do_writepages(mapping, wbc);
389 399
@@ -414,10 +424,19 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
414 ret = err; 424 ret = err;
415 } 425 }
416 426
417 spin_lock(&inode_wb_list_lock); 427 spin_lock(&wb->list_lock);
418 spin_lock(&inode->i_lock); 428 spin_lock(&inode->i_lock);
419 inode->i_state &= ~I_SYNC; 429 inode->i_state &= ~I_SYNC;
420 if (!(inode->i_state & I_FREEING)) { 430 if (!(inode->i_state & I_FREEING)) {
431 /*
432 * Sync livelock prevention. Each inode is tagged and synced in
433 * one shot. If still dirty, it will be redirty_tail()'ed below.
434 * Update the dirty time to prevent enqueue and sync it again.
435 */
436 if ((inode->i_state & I_DIRTY) &&
437 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
438 inode->dirtied_when = jiffies;
439
421 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 440 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
422 /* 441 /*
423 * We didn't write back all the pages. nfs_writepages() 442 * We didn't write back all the pages. nfs_writepages()
@@ -428,7 +447,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
428 /* 447 /*
429 * slice used up: queue for next turn 448 * slice used up: queue for next turn
430 */ 449 */
431 requeue_io(inode); 450 requeue_io(inode, wb);
432 } else { 451 } else {
433 /* 452 /*
434 * Writeback blocked by something other than 453 * Writeback blocked by something other than
@@ -437,7 +456,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
437 * retrying writeback of the dirty page/inode 456 * retrying writeback of the dirty page/inode
438 * that cannot be performed immediately. 457 * that cannot be performed immediately.
439 */ 458 */
440 redirty_tail(inode); 459 redirty_tail(inode, wb);
441 } 460 }
442 } else if (inode->i_state & I_DIRTY) { 461 } else if (inode->i_state & I_DIRTY) {
443 /* 462 /*
@@ -446,7 +465,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
446 * submission or metadata updates after data IO 465 * submission or metadata updates after data IO
447 * completion. 466 * completion.
448 */ 467 */
449 redirty_tail(inode); 468 redirty_tail(inode, wb);
450 } else { 469 } else {
451 /* 470 /*
452 * The inode is clean. At this point we either have 471 * The inode is clean. At this point we either have
@@ -457,9 +476,41 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
457 } 476 }
458 } 477 }
459 inode_sync_complete(inode); 478 inode_sync_complete(inode);
479 trace_writeback_single_inode(inode, wbc, nr_to_write);
460 return ret; 480 return ret;
461} 481}
462 482
483static long writeback_chunk_size(struct backing_dev_info *bdi,
484 struct wb_writeback_work *work)
485{
486 long pages;
487
488 /*
489 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
490 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
491 * here avoids calling into writeback_inodes_wb() more than once.
492 *
493 * The intended call sequence for WB_SYNC_ALL writeback is:
494 *
495 * wb_writeback()
496 * writeback_sb_inodes() <== called only once
497 * write_cache_pages() <== called once for each inode
498 * (quickly) tag currently dirty pages
499 * (maybe slowly) sync all tagged pages
500 */
501 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
502 pages = LONG_MAX;
503 else {
504 pages = min(bdi->avg_write_bandwidth / 2,
505 global_dirty_limit / DIRTY_SCOPE);
506 pages = min(pages, work->nr_pages);
507 pages = round_down(pages + MIN_WRITEBACK_PAGES,
508 MIN_WRITEBACK_PAGES);
509 }
510
511 return pages;
512}
513
463/* 514/*
464 * Write a portion of b_io inodes which belong to @sb. 515 * Write a portion of b_io inodes which belong to @sb.
465 * 516 *
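
[Editor's note] writeback_chunk_size() above bounds each WB_SYNC_NONE pass by half a second's worth of the bdi's estimated bandwidth (avg_write_bandwidth is kept in pages per second, as the debugfs kBps conversion below shows) and by 1/8 of the global dirty limit, then rounds to a multiple of MIN_WRITEBACK_PAGES without ever going below it. A small standalone sketch of that arithmetic, assuming 4 KB pages and hypothetical bandwidth and limit figures:

/* Worked example of the writeback_chunk_size() arithmetic for WB_SYNC_NONE
 * work (illustrative; the input numbers are hypothetical). */
#include <stdio.h>

#define MIN_WRITEBACK_PAGES 1024UL   /* 4 MB with 4 KB pages, as in the patch */
#define DIRTY_SCOPE 8

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long round_down_ul(unsigned long x, unsigned long m)
{
	return x - (x % m);
}

static unsigned long chunk_size(unsigned long avg_write_bandwidth, /* pages/s */
				unsigned long global_dirty_limit,   /* pages   */
				unsigned long work_nr_pages)
{
	unsigned long pages;

	pages = min_ul(avg_write_bandwidth / 2, global_dirty_limit / DIRTY_SCOPE);
	pages = min_ul(pages, work_nr_pages);
	/* round_down(pages + MIN, MIN): a multiple of MIN, and never zero */
	return round_down_ul(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES);
}

int main(void)
{
	/* ~100 MB/s disk => 25600 pages/s, 1 GB dirty limit => 262144 pages */
	printf("%lu\n", chunk_size(25600, 262144, 1 << 20));  /* 12800 -> 13312 */
	printf("%lu\n", chunk_size(200, 262144, 1 << 20));    /* tiny bw -> 1024 */
	return 0;
}
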
@@ -467,24 +518,36 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
467 * inodes. Otherwise write only ones which go sequentially 518 * inodes. Otherwise write only ones which go sequentially
468 * in reverse order. 519 * in reverse order.
469 * 520 *
470 * Return 1, if the caller writeback routine should be 521 * Return the number of pages and/or inodes written.
471 * interrupted. Otherwise return 0.
472 */ 522 */
473static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, 523static long writeback_sb_inodes(struct super_block *sb,
474 struct writeback_control *wbc, bool only_this_sb) 524 struct bdi_writeback *wb,
525 struct wb_writeback_work *work)
475{ 526{
527 struct writeback_control wbc = {
528 .sync_mode = work->sync_mode,
529 .tagged_writepages = work->tagged_writepages,
530 .for_kupdate = work->for_kupdate,
531 .for_background = work->for_background,
532 .range_cyclic = work->range_cyclic,
533 .range_start = 0,
534 .range_end = LLONG_MAX,
535 };
536 unsigned long start_time = jiffies;
537 long write_chunk;
538 long wrote = 0; /* count both pages and inodes */
539
476 while (!list_empty(&wb->b_io)) { 540 while (!list_empty(&wb->b_io)) {
477 long pages_skipped;
478 struct inode *inode = wb_inode(wb->b_io.prev); 541 struct inode *inode = wb_inode(wb->b_io.prev);
479 542
480 if (inode->i_sb != sb) { 543 if (inode->i_sb != sb) {
481 if (only_this_sb) { 544 if (work->sb) {
482 /* 545 /*
483 * We only want to write back data for this 546 * We only want to write back data for this
484 * superblock, move all inodes not belonging 547 * superblock, move all inodes not belonging
485 * to it back onto the dirty list. 548 * to it back onto the dirty list.
486 */ 549 */
487 redirty_tail(inode); 550 redirty_tail(inode, wb);
488 continue; 551 continue;
489 } 552 }
490 553
@@ -493,7 +556,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
493 * Bounce back to the caller to unpin this and 556 * Bounce back to the caller to unpin this and
494 * pin the next superblock. 557 * pin the next superblock.
495 */ 558 */
496 return 0; 559 break;
497 } 560 }
498 561
499 /* 562 /*
@@ -504,95 +567,91 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
504 spin_lock(&inode->i_lock); 567 spin_lock(&inode->i_lock);
505 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 568 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
506 spin_unlock(&inode->i_lock); 569 spin_unlock(&inode->i_lock);
507 requeue_io(inode); 570 redirty_tail(inode, wb);
508 continue; 571 continue;
509 } 572 }
510
511 /*
512 * Was this inode dirtied after sync_sb_inodes was called?
513 * This keeps sync from extra jobs and livelock.
514 */
515 if (inode_dirtied_after(inode, wbc->wb_start)) {
516 spin_unlock(&inode->i_lock);
517 return 1;
518 }
519
520 __iget(inode); 573 __iget(inode);
574 write_chunk = writeback_chunk_size(wb->bdi, work);
575 wbc.nr_to_write = write_chunk;
576 wbc.pages_skipped = 0;
521 577
522 pages_skipped = wbc->pages_skipped; 578 writeback_single_inode(inode, wb, &wbc);
523 writeback_single_inode(inode, wbc); 579
524 if (wbc->pages_skipped != pages_skipped) { 580 work->nr_pages -= write_chunk - wbc.nr_to_write;
581 wrote += write_chunk - wbc.nr_to_write;
582 if (!(inode->i_state & I_DIRTY))
583 wrote++;
584 if (wbc.pages_skipped) {
525 /* 585 /*
526 * writeback is not making progress due to locked 586 * writeback is not making progress due to locked
527 * buffers. Skip this inode for now. 587 * buffers. Skip this inode for now.
528 */ 588 */
529 redirty_tail(inode); 589 redirty_tail(inode, wb);
530 } 590 }
531 spin_unlock(&inode->i_lock); 591 spin_unlock(&inode->i_lock);
532 spin_unlock(&inode_wb_list_lock); 592 spin_unlock(&wb->list_lock);
533 iput(inode); 593 iput(inode);
534 cond_resched(); 594 cond_resched();
535 spin_lock(&inode_wb_list_lock); 595 spin_lock(&wb->list_lock);
536 if (wbc->nr_to_write <= 0) { 596 /*
537 wbc->more_io = 1; 597 * bail out to wb_writeback() often enough to check
538 return 1; 598 * background threshold and other termination conditions.
599 */
600 if (wrote) {
601 if (time_is_before_jiffies(start_time + HZ / 10UL))
602 break;
603 if (work->nr_pages <= 0)
604 break;
539 } 605 }
540 if (!list_empty(&wb->b_more_io))
541 wbc->more_io = 1;
542 } 606 }
543 /* b_io is empty */ 607 return wrote;
544 return 1;
545} 608}
546 609
547void writeback_inodes_wb(struct bdi_writeback *wb, 610static long __writeback_inodes_wb(struct bdi_writeback *wb,
548 struct writeback_control *wbc) 611 struct wb_writeback_work *work)
549{ 612{
550 int ret = 0; 613 unsigned long start_time = jiffies;
551 614 long wrote = 0;
552 if (!wbc->wb_start)
553 wbc->wb_start = jiffies; /* livelock avoidance */
554 spin_lock(&inode_wb_list_lock);
555 if (!wbc->for_kupdate || list_empty(&wb->b_io))
556 queue_io(wb, wbc->older_than_this);
557 615
558 while (!list_empty(&wb->b_io)) { 616 while (!list_empty(&wb->b_io)) {
559 struct inode *inode = wb_inode(wb->b_io.prev); 617 struct inode *inode = wb_inode(wb->b_io.prev);
560 struct super_block *sb = inode->i_sb; 618 struct super_block *sb = inode->i_sb;
561 619
562 if (!grab_super_passive(sb)) { 620 if (!grab_super_passive(sb)) {
563 requeue_io(inode); 621 requeue_io(inode, wb);
564 continue; 622 continue;
565 } 623 }
566 ret = writeback_sb_inodes(sb, wb, wbc, false); 624 wrote += writeback_sb_inodes(sb, wb, work);
567 drop_super(sb); 625 drop_super(sb);
568 626
569 if (ret) 627 /* refer to the same tests at the end of writeback_sb_inodes */
570 break; 628 if (wrote) {
629 if (time_is_before_jiffies(start_time + HZ / 10UL))
630 break;
631 if (work->nr_pages <= 0)
632 break;
633 }
571 } 634 }
572 spin_unlock(&inode_wb_list_lock);
573 /* Leave any unwritten inodes on b_io */ 635 /* Leave any unwritten inodes on b_io */
636 return wrote;
574} 637}
575 638
576static void __writeback_inodes_sb(struct super_block *sb, 639long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
577 struct bdi_writeback *wb, struct writeback_control *wbc)
578{ 640{
579 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 641 struct wb_writeback_work work = {
642 .nr_pages = nr_pages,
643 .sync_mode = WB_SYNC_NONE,
644 .range_cyclic = 1,
645 };
580 646
581 spin_lock(&inode_wb_list_lock); 647 spin_lock(&wb->list_lock);
582 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 648 if (list_empty(&wb->b_io))
583 queue_io(wb, wbc->older_than_this); 649 queue_io(wb, NULL);
584 writeback_sb_inodes(sb, wb, wbc, true); 650 __writeback_inodes_wb(wb, &work);
585 spin_unlock(&inode_wb_list_lock); 651 spin_unlock(&wb->list_lock);
586}
587 652
588/* 653 return nr_pages - work.nr_pages;
589 * The maximum number of pages to writeout in a single bdi flush/kupdate 654}
590 * operation. We do this so we don't hold I_SYNC against an inode for
591 * enormous amounts of time, which would block a userspace task which has
592 * been forced to throttle against that inode. Also, the code reevaluates
593 * the dirty each time it has written this many pages.
594 */
595#define MAX_WRITEBACK_PAGES 1024
596 655
597static inline bool over_bground_thresh(void) 656static inline bool over_bground_thresh(void)
598{ 657{
@@ -605,6 +664,16 @@ static inline bool over_bground_thresh(void)
605} 664}
606 665
607/* 666/*
667 * Called under wb->list_lock. If there are multiple wb per bdi,
668 * only the flusher working on the first wb should do it.
669 */
670static void wb_update_bandwidth(struct bdi_writeback *wb,
671 unsigned long start_time)
672{
673 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time);
674}
675
676/*
608 * Explicit flushing or periodic writeback of "old" data. 677 * Explicit flushing or periodic writeback of "old" data.
609 * 678 *
610 * Define "old": the first time one of an inode's pages is dirtied, we mark the 679 * Define "old": the first time one of an inode's pages is dirtied, we mark the
@@ -622,47 +691,16 @@ static inline bool over_bground_thresh(void)
622static long wb_writeback(struct bdi_writeback *wb, 691static long wb_writeback(struct bdi_writeback *wb,
623 struct wb_writeback_work *work) 692 struct wb_writeback_work *work)
624{ 693{
625 struct writeback_control wbc = { 694 unsigned long wb_start = jiffies;
626 .sync_mode = work->sync_mode, 695 long nr_pages = work->nr_pages;
627 .older_than_this = NULL,
628 .for_kupdate = work->for_kupdate,
629 .for_background = work->for_background,
630 .range_cyclic = work->range_cyclic,
631 };
632 unsigned long oldest_jif; 696 unsigned long oldest_jif;
633 long wrote = 0;
634 long write_chunk;
635 struct inode *inode; 697 struct inode *inode;
698 long progress;
636 699
637 if (wbc.for_kupdate) { 700 oldest_jif = jiffies;
638 wbc.older_than_this = &oldest_jif; 701 work->older_than_this = &oldest_jif;
639 oldest_jif = jiffies -
640 msecs_to_jiffies(dirty_expire_interval * 10);
641 }
642 if (!wbc.range_cyclic) {
643 wbc.range_start = 0;
644 wbc.range_end = LLONG_MAX;
645 }
646 702
647 /* 703 spin_lock(&wb->list_lock);
648 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
649 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
650 * here avoids calling into writeback_inodes_wb() more than once.
651 *
652 * The intended call sequence for WB_SYNC_ALL writeback is:
653 *
654 * wb_writeback()
655 * __writeback_inodes_sb() <== called only once
656 * write_cache_pages() <== called once for each inode
657 * (quickly) tag currently dirty pages
658 * (maybe slowly) sync all tagged pages
659 */
660 if (wbc.sync_mode == WB_SYNC_NONE)
661 write_chunk = MAX_WRITEBACK_PAGES;
662 else
663 write_chunk = LONG_MAX;
664
665 wbc.wb_start = jiffies; /* livelock avoidance */
666 for (;;) { 704 for (;;) {
667 /* 705 /*
668 * Stop writeback when nr_pages has been consumed 706 * Stop writeback when nr_pages has been consumed
@@ -687,52 +725,54 @@ static long wb_writeback(struct bdi_writeback *wb,
687 if (work->for_background && !over_bground_thresh()) 725 if (work->for_background && !over_bground_thresh())
688 break; 726 break;
689 727
690 wbc.more_io = 0; 728 if (work->for_kupdate) {
691 wbc.nr_to_write = write_chunk; 729 oldest_jif = jiffies -
692 wbc.pages_skipped = 0; 730 msecs_to_jiffies(dirty_expire_interval * 10);
731 work->older_than_this = &oldest_jif;
732 }
693 733
694 trace_wbc_writeback_start(&wbc, wb->bdi); 734 trace_writeback_start(wb->bdi, work);
735 if (list_empty(&wb->b_io))
736 queue_io(wb, work->older_than_this);
695 if (work->sb) 737 if (work->sb)
696 __writeback_inodes_sb(work->sb, wb, &wbc); 738 progress = writeback_sb_inodes(work->sb, wb, work);
697 else 739 else
698 writeback_inodes_wb(wb, &wbc); 740 progress = __writeback_inodes_wb(wb, work);
699 trace_wbc_writeback_written(&wbc, wb->bdi); 741 trace_writeback_written(wb->bdi, work);
700 742
701 work->nr_pages -= write_chunk - wbc.nr_to_write; 743 wb_update_bandwidth(wb, wb_start);
702 wrote += write_chunk - wbc.nr_to_write;
703 744
704 /* 745 /*
705 * If we consumed everything, see if we have more 746 * Did we write something? Try for more
747 *
748 * Dirty inodes are moved to b_io for writeback in batches.
749 * The completion of the current batch does not necessarily
750 * mean the overall work is done. So we keep looping as long
751 * as made some progress on cleaning pages or inodes.
706 */ 752 */
707 if (wbc.nr_to_write <= 0) 753 if (progress)
708 continue; 754 continue;
709 /* 755 /*
710 * Didn't write everything and we don't have more IO, bail 756 * No more inodes for IO, bail
711 */ 757 */
712 if (!wbc.more_io) 758 if (list_empty(&wb->b_more_io))
713 break; 759 break;
714 /* 760 /*
715 * Did we write something? Try for more
716 */
717 if (wbc.nr_to_write < write_chunk)
718 continue;
719 /*
720 * Nothing written. Wait for some inode to 761 * Nothing written. Wait for some inode to
721 * become available for writeback. Otherwise 762 * become available for writeback. Otherwise
722 * we'll just busyloop. 763 * we'll just busyloop.
723 */ 764 */
724 spin_lock(&inode_wb_list_lock);
725 if (!list_empty(&wb->b_more_io)) { 765 if (!list_empty(&wb->b_more_io)) {
766 trace_writeback_wait(wb->bdi, work);
726 inode = wb_inode(wb->b_more_io.prev); 767 inode = wb_inode(wb->b_more_io.prev);
727 trace_wbc_writeback_wait(&wbc, wb->bdi);
728 spin_lock(&inode->i_lock); 768 spin_lock(&inode->i_lock);
729 inode_wait_for_writeback(inode); 769 inode_wait_for_writeback(inode, wb);
730 spin_unlock(&inode->i_lock); 770 spin_unlock(&inode->i_lock);
731 } 771 }
732 spin_unlock(&inode_wb_list_lock);
733 } 772 }
773 spin_unlock(&wb->list_lock);
734 774
735 return wrote; 775 return nr_pages - work->nr_pages;
736} 776}
737 777
738/* 778/*
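
[Editor's note] The rewritten loop above drops the fixed write_chunk accounting: wb_writeback() now queues expired inodes itself, keeps iterating while the batch writers report progress (pages written or inodes cleaned), and only sleeps on an inode under writeback when a round makes no progress at all. A stripped-down control-flow model, assuming a stand-in do_one_batch() in place of writeback_sb_inodes():

/* Control-flow model of the progress-driven wb_writeback() loop
 * (illustrative; do_one_batch() stands in for writeback_sb_inodes()). */
#include <stdio.h>

struct work { long nr_pages; };

/* Pretend each batch cleans some pages; return the amount as "progress". */
static long do_one_batch(struct work *w, int round)
{
	long wrote = round < 3 ? 400 : 0;   /* progress dries up after 3 rounds */

	w->nr_pages -= wrote;
	return wrote;
}

static long wb_writeback_model(struct work *w)
{
	long nr_pages = w->nr_pages;
	int round = 0;

	for (;;) {
		long progress;

		if (w->nr_pages <= 0)       /* budget consumed */
			break;
		progress = do_one_batch(w, round++);
		if (progress)               /* cleaned something: try for more */
			continue;
		/* No progress: stop.  The real code additionally waits on an
		 * inode from b_more_io here instead of busy-looping. */
		break;
	}
	return nr_pages - w->nr_pages;      /* pages actually written */
}

int main(void)
{
	struct work w = { .nr_pages = 1200 };

	printf("wrote %ld pages\n", wb_writeback_model(&w));  /* 1200 */
	return 0;
}
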
@@ -1063,10 +1103,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1063 } 1103 }
1064 1104
1065 spin_unlock(&inode->i_lock); 1105 spin_unlock(&inode->i_lock);
1066 spin_lock(&inode_wb_list_lock); 1106 spin_lock(&bdi->wb.list_lock);
1067 inode->dirtied_when = jiffies; 1107 inode->dirtied_when = jiffies;
1068 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1108 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1069 spin_unlock(&inode_wb_list_lock); 1109 spin_unlock(&bdi->wb.list_lock);
1070 1110
1071 if (wakeup_bdi) 1111 if (wakeup_bdi)
1072 bdi_wakeup_thread_delayed(bdi); 1112 bdi_wakeup_thread_delayed(bdi);
@@ -1162,10 +1202,11 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
1162{ 1202{
1163 DECLARE_COMPLETION_ONSTACK(done); 1203 DECLARE_COMPLETION_ONSTACK(done);
1164 struct wb_writeback_work work = { 1204 struct wb_writeback_work work = {
1165 .sb = sb, 1205 .sb = sb,
1166 .sync_mode = WB_SYNC_NONE, 1206 .sync_mode = WB_SYNC_NONE,
1167 .done = &done, 1207 .tagged_writepages = 1,
1168 .nr_pages = nr, 1208 .done = &done,
1209 .nr_pages = nr,
1169 }; 1210 };
1170 1211
1171 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1212 WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@ -1267,6 +1308,7 @@ EXPORT_SYMBOL(sync_inodes_sb);
1267 */ 1308 */
1268int write_inode_now(struct inode *inode, int sync) 1309int write_inode_now(struct inode *inode, int sync)
1269{ 1310{
1311 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1270 int ret; 1312 int ret;
1271 struct writeback_control wbc = { 1313 struct writeback_control wbc = {
1272 .nr_to_write = LONG_MAX, 1314 .nr_to_write = LONG_MAX,
@@ -1279,11 +1321,11 @@ int write_inode_now(struct inode *inode, int sync)
1279 wbc.nr_to_write = 0; 1321 wbc.nr_to_write = 0;
1280 1322
1281 might_sleep(); 1323 might_sleep();
1282 spin_lock(&inode_wb_list_lock); 1324 spin_lock(&wb->list_lock);
1283 spin_lock(&inode->i_lock); 1325 spin_lock(&inode->i_lock);
1284 ret = writeback_single_inode(inode, &wbc); 1326 ret = writeback_single_inode(inode, wb, &wbc);
1285 spin_unlock(&inode->i_lock); 1327 spin_unlock(&inode->i_lock);
1286 spin_unlock(&inode_wb_list_lock); 1328 spin_unlock(&wb->list_lock);
1287 if (sync) 1329 if (sync)
1288 inode_sync_wait(inode); 1330 inode_sync_wait(inode);
1289 return ret; 1331 return ret;
@@ -1303,13 +1345,14 @@ EXPORT_SYMBOL(write_inode_now);
1303 */ 1345 */
1304int sync_inode(struct inode *inode, struct writeback_control *wbc) 1346int sync_inode(struct inode *inode, struct writeback_control *wbc)
1305{ 1347{
1348 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1306 int ret; 1349 int ret;
1307 1350
1308 spin_lock(&inode_wb_list_lock); 1351 spin_lock(&wb->list_lock);
1309 spin_lock(&inode->i_lock); 1352 spin_lock(&inode->i_lock);
1310 ret = writeback_single_inode(inode, wbc); 1353 ret = writeback_single_inode(inode, wb, wbc);
1311 spin_unlock(&inode->i_lock); 1354 spin_unlock(&inode->i_lock);
1312 spin_unlock(&inode_wb_list_lock); 1355 spin_unlock(&wb->list_lock);
1313 return ret; 1356 return ret;
1314} 1357}
1315EXPORT_SYMBOL(sync_inode); 1358EXPORT_SYMBOL(sync_inode);
diff --git a/fs/inode.c b/fs/inode.c
index 96c77b81167c..a48fa5355fb4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -37,7 +37,7 @@
37 * inode->i_sb->s_inode_lru, inode->i_lru 37 * inode->i_sb->s_inode_lru, inode->i_lru
38 * inode_sb_list_lock protects: 38 * inode_sb_list_lock protects:
39 * sb->s_inodes, inode->i_sb_list 39 * sb->s_inodes, inode->i_sb_list
40 * inode_wb_list_lock protects: 40 * bdi->wb.list_lock protects:
41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list 41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
42 * inode_hash_lock protects: 42 * inode_hash_lock protects:
43 * inode_hashtable, inode->i_hash 43 * inode_hashtable, inode->i_hash
@@ -48,7 +48,7 @@
48 * inode->i_lock 48 * inode->i_lock
49 * inode->i_sb->s_inode_lru_lock 49 * inode->i_sb->s_inode_lru_lock
50 * 50 *
51 * inode_wb_list_lock 51 * bdi->wb.list_lock
52 * inode->i_lock 52 * inode->i_lock
53 * 53 *
54 * inode_hash_lock 54 * inode_hash_lock
@@ -65,7 +65,6 @@ static struct hlist_head *inode_hashtable __read_mostly;
65static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); 65static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
66 66
67__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); 67__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
68__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
69 68
70/* 69/*
71 * Empty aops. Can be used for the cases where the user does not 70 * Empty aops. Can be used for the cases where the user does not
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 08579312c57b..00e37501fa3b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1566,8 +1566,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1566 int status; 1566 int status;
1567 bool sync = true; 1567 bool sync = true;
1568 1568
1569 if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || 1569 if (wbc->sync_mode == WB_SYNC_NONE)
1570 wbc->for_background)
1571 sync = false; 1570 sync = false;
1572 1571
1573 status = pnfs_layoutcommit_inode(inode, sync); 1572 status = pnfs_layoutcommit_inode(inode, sync);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 96f4094b706d..a008982e7c08 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -40,6 +40,7 @@ typedef int (congested_fn)(void *, int);
40enum bdi_stat_item { 40enum bdi_stat_item {
41 BDI_RECLAIMABLE, 41 BDI_RECLAIMABLE,
42 BDI_WRITEBACK, 42 BDI_WRITEBACK,
43 BDI_WRITTEN,
43 NR_BDI_STAT_ITEMS 44 NR_BDI_STAT_ITEMS
44}; 45};
45 46
@@ -57,6 +58,7 @@ struct bdi_writeback {
57 struct list_head b_dirty; /* dirty inodes */ 58 struct list_head b_dirty; /* dirty inodes */
58 struct list_head b_io; /* parked for writeback */ 59 struct list_head b_io; /* parked for writeback */
59 struct list_head b_more_io; /* parked for more writeback */ 60 struct list_head b_more_io; /* parked for more writeback */
61 spinlock_t list_lock; /* protects the b_* lists */
60}; 62};
61 63
62struct backing_dev_info { 64struct backing_dev_info {
@@ -71,6 +73,11 @@ struct backing_dev_info {
71 73
72 struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; 74 struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
73 75
76 unsigned long bw_time_stamp; /* last time write bw is updated */
77 unsigned long written_stamp; /* pages written at bw_time_stamp */
78 unsigned long write_bandwidth; /* the estimated write bandwidth */
79 unsigned long avg_write_bandwidth; /* further smoothed write bw */
80
74 struct prop_local_percpu completions; 81 struct prop_local_percpu completions;
75 int dirty_exceeded; 82 int dirty_exceeded;
76 83
@@ -106,6 +113,7 @@ int bdi_writeback_thread(void *data);
106int bdi_has_dirty_io(struct backing_dev_info *bdi); 113int bdi_has_dirty_io(struct backing_dev_info *bdi);
107void bdi_arm_supers_timer(void); 114void bdi_arm_supers_timer(void);
108void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); 115void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
116void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);
109 117
110extern spinlock_t bdi_lock; 118extern spinlock_t bdi_lock;
111extern struct list_head bdi_list; 119extern struct list_head bdi_list;
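
[Editor's note] The new BDI_WRITTEN counter and the bw_time_stamp / written_stamp / write_bandwidth / avg_write_bandwidth fields above are the state behind __bdi_update_bandwidth(), whose implementation lives in mm/page-writeback.c and is not among the hunks shown here. Purely as an illustration of what that state enables, and not the kernel's actual formula, a period estimate can be taken from the two deltas and then smoothed:

/* Hypothetical illustration of deriving a smoothed write-bandwidth estimate
 * from a "pages written" counter and a timestamp; this is NOT the formula
 * used by __bdi_update_bandwidth(), only the general idea. */
#include <stdio.h>

#define HZ 100

struct bdi_model {
	unsigned long bw_time_stamp;       /* jiffies of last update   */
	unsigned long written_stamp;       /* BDI_WRITTEN at that time */
	unsigned long write_bandwidth;     /* pages per second, raw    */
	unsigned long avg_write_bandwidth; /* smoothed estimate        */
};

static void update_bandwidth(struct bdi_model *bdi,
			     unsigned long now, unsigned long written)
{
	unsigned long elapsed = now - bdi->bw_time_stamp;
	unsigned long bw;

	if (elapsed < HZ / 10)             /* too soon, sample would be noisy */
		return;
	bw = (written - bdi->written_stamp) * HZ / elapsed;
	bdi->write_bandwidth = bw;
	/* simple 7/8 old + 1/8 new smoothing (an illustrative choice) */
	bdi->avg_write_bandwidth = (bdi->avg_write_bandwidth * 7 + bw) / 8;
	bdi->written_stamp = written;
	bdi->bw_time_stamp = now;
}

int main(void)
{
	struct bdi_model bdi = { 0, 0, 0, 20000 };

	update_bandwidth(&bdi, 100, 30000);   /* 30000 pages in 1 second */
	printf("raw=%lu avg=%lu\n", bdi.write_bandwidth, bdi.avg_write_bandwidth);
	return 0;                             /* raw=30000 avg=21250 */
}
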
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 17e7ccc322a5..f1bfa12ea246 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -7,9 +7,39 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/fs.h> 8#include <linux/fs.h>
9 9
10struct backing_dev_info; 10/*
11 * The 1/4 region under the global dirty thresh is for smooth dirty throttling:
12 *
13 * (thresh - thresh/DIRTY_FULL_SCOPE, thresh)
14 *
15 * The 1/16 region above the global dirty limit will be put to maximum pauses:
16 *
17 * (limit, limit + limit/DIRTY_MAXPAUSE_AREA)
18 *
19 * The 1/16 region above the max-pause region, dirty exceeded bdi's will be put
20 * to loops:
21 *
22 * (limit + limit/DIRTY_MAXPAUSE_AREA, limit + limit/DIRTY_PASSGOOD_AREA)
23 *
24 * Further beyond, all dirtier tasks will enter a loop waiting (possibly long
25 * time) for the dirty pages to drop, unless written enough pages.
26 *
27 * The global dirty threshold is normally equal to the global dirty limit,
28 * except when the system suddenly allocates a lot of anonymous memory and
29 * knocks down the global dirty threshold quickly, in which case the global
30 * dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
31 */
32#define DIRTY_SCOPE 8
33#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2)
34#define DIRTY_MAXPAUSE_AREA 16
35#define DIRTY_PASSGOOD_AREA 8
11 36
12extern spinlock_t inode_wb_list_lock; 37/*
38 * 4MB minimal write chunk size
39 */
40#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
41
42struct backing_dev_info;
13 43
14/* 44/*
15 * fs/fs-writeback.c 45 * fs/fs-writeback.c
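
[Editor's note] The region comment above is easier to picture with the constants plugged in: DIRTY_FULL_SCOPE = 4 gives the quarter-wide throttle band below the threshold, and DIRTY_MAXPAUSE_AREA / DIRTY_PASSGOOD_AREA carve two 1/16-wide bands above the limit. A worked example for a hypothetical limit of 1048576 pages (4 GB of 4 KB pages):

/* Worked example of the dirty-region boundaries described above,
 * for a hypothetical thresh == limit of 1048576 pages. */
#include <stdio.h>

#define DIRTY_SCOPE		8
#define DIRTY_FULL_SCOPE	(DIRTY_SCOPE / 2)	/* 4 */
#define DIRTY_MAXPAUSE_AREA	16
#define DIRTY_PASSGOOD_AREA	8

int main(void)
{
	unsigned long thresh = 1048576, limit = thresh;

	printf("throttle region: (%lu, %lu)\n",
	       thresh - thresh / DIRTY_FULL_SCOPE, thresh);	/* (786432, 1048576) */
	printf("max-pause region: (%lu, %lu)\n",
	       limit, limit + limit / DIRTY_MAXPAUSE_AREA);	/* (1048576, 1114112) */
	printf("pass-good region: (%lu, %lu)\n",
	       limit + limit / DIRTY_MAXPAUSE_AREA,
	       limit + limit / DIRTY_PASSGOOD_AREA);		/* (1114112, 1179648) */
	return 0;
}
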
@@ -26,11 +56,6 @@ enum writeback_sync_modes {
26 */ 56 */
27struct writeback_control { 57struct writeback_control {
28 enum writeback_sync_modes sync_mode; 58 enum writeback_sync_modes sync_mode;
29 unsigned long *older_than_this; /* If !NULL, only write back inodes
30 older than this */
31 unsigned long wb_start; /* Time writeback_inodes_wb was
32 called. This is needed to avoid
33 extra jobs and livelock */
34 long nr_to_write; /* Write this many pages, and decrement 59 long nr_to_write; /* Write this many pages, and decrement
35 this for each page written */ 60 this for each page written */
36 long pages_skipped; /* Pages which were not written */ 61 long pages_skipped; /* Pages which were not written */
@@ -43,13 +68,11 @@ struct writeback_control {
43 loff_t range_start; 68 loff_t range_start;
44 loff_t range_end; 69 loff_t range_end;
45 70
46 unsigned nonblocking:1; /* Don't get stuck on request queues */
47 unsigned encountered_congestion:1; /* An output: a queue is full */
48 unsigned for_kupdate:1; /* A kupdate writeback */ 71 unsigned for_kupdate:1; /* A kupdate writeback */
49 unsigned for_background:1; /* A background writeback */ 72 unsigned for_background:1; /* A background writeback */
73 unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */
50 unsigned for_reclaim:1; /* Invoked from the page allocator */ 74 unsigned for_reclaim:1; /* Invoked from the page allocator */
51 unsigned range_cyclic:1; /* range_start is cyclic */ 75 unsigned range_cyclic:1; /* range_start is cyclic */
52 unsigned more_io:1; /* more io to be dispatched */
53}; 76};
54 77
55/* 78/*
@@ -62,8 +85,7 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr);
62int writeback_inodes_sb_if_idle(struct super_block *); 85int writeback_inodes_sb_if_idle(struct super_block *);
63int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); 86int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr);
64void sync_inodes_sb(struct super_block *); 87void sync_inodes_sb(struct super_block *);
65void writeback_inodes_wb(struct bdi_writeback *wb, 88long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages);
66 struct writeback_control *wbc);
67long wb_do_writeback(struct bdi_writeback *wb, int force_wait); 89long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
68void wakeup_flusher_threads(long nr_pages); 90void wakeup_flusher_threads(long nr_pages);
69 91
@@ -94,6 +116,8 @@ static inline void laptop_sync_completion(void) { }
94#endif 116#endif
95void throttle_vm_writeout(gfp_t gfp_mask); 117void throttle_vm_writeout(gfp_t gfp_mask);
96 118
119extern unsigned long global_dirty_limit;
120
97/* These are exported to sysctl. */ 121/* These are exported to sysctl. */
98extern int dirty_background_ratio; 122extern int dirty_background_ratio;
99extern unsigned long dirty_background_bytes; 123extern unsigned long dirty_background_bytes;
@@ -128,6 +152,13 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
128unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, 152unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
129 unsigned long dirty); 153 unsigned long dirty);
130 154
155void __bdi_update_bandwidth(struct backing_dev_info *bdi,
156 unsigned long thresh,
157 unsigned long dirty,
158 unsigned long bdi_thresh,
159 unsigned long bdi_dirty,
160 unsigned long start_time);
161
131void page_writeback_init(void); 162void page_writeback_init(void);
132void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 163void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
133 unsigned long nr_pages_dirtied); 164 unsigned long nr_pages_dirtied);
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 4114129f0794..b31702ac15be 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -284,7 +284,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
284 __field( long, pages_skipped ) 284 __field( long, pages_skipped )
285 __field( loff_t, range_start ) 285 __field( loff_t, range_start )
286 __field( loff_t, range_end ) 286 __field( loff_t, range_end )
287 __field( char, nonblocking )
288 __field( char, for_kupdate ) 287 __field( char, for_kupdate )
289 __field( char, for_reclaim ) 288 __field( char, for_reclaim )
290 __field( char, range_cyclic ) 289 __field( char, range_cyclic )
@@ -299,7 +298,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
299 __entry->pages_skipped = wbc->pages_skipped; 298 __entry->pages_skipped = wbc->pages_skipped;
300 __entry->range_start = wbc->range_start; 299 __entry->range_start = wbc->range_start;
301 __entry->range_end = wbc->range_end; 300 __entry->range_end = wbc->range_end;
302 __entry->nonblocking = wbc->nonblocking;
303 __entry->for_kupdate = wbc->for_kupdate; 301 __entry->for_kupdate = wbc->for_kupdate;
304 __entry->for_reclaim = wbc->for_reclaim; 302 __entry->for_reclaim = wbc->for_reclaim;
305 __entry->range_cyclic = wbc->range_cyclic; 303 __entry->range_cyclic = wbc->range_cyclic;
@@ -310,13 +308,13 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
310 308
311 TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, " 309 TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, "
312 "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, " 310 "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, "
313 "range_end = %llu, nonblocking = %d, for_kupdate = %d, " 311 "range_end = %llu, for_kupdate = %d, "
314 "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu", 312 "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu",
315 show_root_type(__entry->root_objectid), 313 show_root_type(__entry->root_objectid),
316 (unsigned long)__entry->ino, __entry->index, 314 (unsigned long)__entry->ino, __entry->index,
317 __entry->nr_to_write, __entry->pages_skipped, 315 __entry->nr_to_write, __entry->pages_skipped,
318 __entry->range_start, __entry->range_end, 316 __entry->range_start, __entry->range_end,
319 __entry->nonblocking, __entry->for_kupdate, 317 __entry->for_kupdate,
320 __entry->for_reclaim, __entry->range_cyclic, 318 __entry->for_reclaim, __entry->range_cyclic,
321 (unsigned long)__entry->writeback_index) 319 (unsigned long)__entry->writeback_index)
322); 320);
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 5ce2b2f5f524..6363193a3418 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -380,7 +380,6 @@ TRACE_EVENT(ext4_da_writepages_result,
380 __field( int, pages_written ) 380 __field( int, pages_written )
381 __field( long, pages_skipped ) 381 __field( long, pages_skipped )
382 __field( int, sync_mode ) 382 __field( int, sync_mode )
383 __field( char, more_io )
384 __field( pgoff_t, writeback_index ) 383 __field( pgoff_t, writeback_index )
385 ), 384 ),
386 385
@@ -391,16 +390,15 @@ TRACE_EVENT(ext4_da_writepages_result,
391 __entry->pages_written = pages_written; 390 __entry->pages_written = pages_written;
392 __entry->pages_skipped = wbc->pages_skipped; 391 __entry->pages_skipped = wbc->pages_skipped;
393 __entry->sync_mode = wbc->sync_mode; 392 __entry->sync_mode = wbc->sync_mode;
394 __entry->more_io = wbc->more_io;
395 __entry->writeback_index = inode->i_mapping->writeback_index; 393 __entry->writeback_index = inode->i_mapping->writeback_index;
396 ), 394 ),
397 395
398 TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " 396 TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
399 " more_io %d sync_mode %d writeback_index %lu", 397 "sync_mode %d writeback_index %lu",
400 MAJOR(__entry->dev), MINOR(__entry->dev), 398 MAJOR(__entry->dev), MINOR(__entry->dev),
401 (unsigned long) __entry->ino, __entry->ret, 399 (unsigned long) __entry->ino, __entry->ret,
402 __entry->pages_written, __entry->pages_skipped, 400 __entry->pages_written, __entry->pages_skipped,
403 __entry->more_io, __entry->sync_mode, 401 __entry->sync_mode,
404 (unsigned long) __entry->writeback_index) 402 (unsigned long) __entry->writeback_index)
405); 403);
406 404
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 4e249b927eaa..6bca4cc0063c 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -8,6 +8,19 @@
8#include <linux/device.h> 8#include <linux/device.h>
9#include <linux/writeback.h> 9#include <linux/writeback.h>
10 10
11#define show_inode_state(state) \
12 __print_flags(state, "|", \
13 {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \
14 {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \
15 {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \
16 {I_NEW, "I_NEW"}, \
17 {I_WILL_FREE, "I_WILL_FREE"}, \
18 {I_FREEING, "I_FREEING"}, \
19 {I_CLEAR, "I_CLEAR"}, \
20 {I_SYNC, "I_SYNC"}, \
21 {I_REFERENCED, "I_REFERENCED"} \
22 )
23
11struct wb_writeback_work; 24struct wb_writeback_work;
12 25
13DECLARE_EVENT_CLASS(writeback_work_class, 26DECLARE_EVENT_CLASS(writeback_work_class,
@@ -49,6 +62,9 @@ DEFINE_EVENT(writeback_work_class, name, \
49DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); 62DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread);
50DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); 63DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
51DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); 64DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
65DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
66DEFINE_WRITEBACK_WORK_EVENT(writeback_written);
67DEFINE_WRITEBACK_WORK_EVENT(writeback_wait);
52 68
53TRACE_EVENT(writeback_pages_written, 69TRACE_EVENT(writeback_pages_written,
54 TP_PROTO(long pages_written), 70 TP_PROTO(long pages_written),
@@ -88,6 +104,30 @@ DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
88DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); 104DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
89DEFINE_WRITEBACK_EVENT(writeback_thread_start); 105DEFINE_WRITEBACK_EVENT(writeback_thread_start);
90DEFINE_WRITEBACK_EVENT(writeback_thread_stop); 106DEFINE_WRITEBACK_EVENT(writeback_thread_stop);
107DEFINE_WRITEBACK_EVENT(balance_dirty_start);
108DEFINE_WRITEBACK_EVENT(balance_dirty_wait);
109
110TRACE_EVENT(balance_dirty_written,
111
112 TP_PROTO(struct backing_dev_info *bdi, int written),
113
114 TP_ARGS(bdi, written),
115
116 TP_STRUCT__entry(
117 __array(char, name, 32)
118 __field(int, written)
119 ),
120
121 TP_fast_assign(
122 strncpy(__entry->name, dev_name(bdi->dev), 32);
123 __entry->written = written;
124 ),
125
126 TP_printk("bdi %s written %d",
127 __entry->name,
128 __entry->written
129 )
130);
91 131
92DECLARE_EVENT_CLASS(wbc_class, 132DECLARE_EVENT_CLASS(wbc_class,
93 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), 133 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
@@ -101,8 +141,6 @@ DECLARE_EVENT_CLASS(wbc_class,
101 __field(int, for_background) 141 __field(int, for_background)
102 __field(int, for_reclaim) 142 __field(int, for_reclaim)
103 __field(int, range_cyclic) 143 __field(int, range_cyclic)
104 __field(int, more_io)
105 __field(unsigned long, older_than_this)
106 __field(long, range_start) 144 __field(long, range_start)
107 __field(long, range_end) 145 __field(long, range_end)
108 ), 146 ),
@@ -116,15 +154,12 @@ DECLARE_EVENT_CLASS(wbc_class,
116 __entry->for_background = wbc->for_background; 154 __entry->for_background = wbc->for_background;
117 __entry->for_reclaim = wbc->for_reclaim; 155 __entry->for_reclaim = wbc->for_reclaim;
118 __entry->range_cyclic = wbc->range_cyclic; 156 __entry->range_cyclic = wbc->range_cyclic;
119 __entry->more_io = wbc->more_io;
120 __entry->older_than_this = wbc->older_than_this ?
121 *wbc->older_than_this : 0;
122 __entry->range_start = (long)wbc->range_start; 157 __entry->range_start = (long)wbc->range_start;
123 __entry->range_end = (long)wbc->range_end; 158 __entry->range_end = (long)wbc->range_end;
124 ), 159 ),
125 160
126 TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " 161 TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
127 "bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx " 162 "bgrd=%d reclm=%d cyclic=%d "
128 "start=0x%lx end=0x%lx", 163 "start=0x%lx end=0x%lx",
129 __entry->name, 164 __entry->name,
130 __entry->nr_to_write, 165 __entry->nr_to_write,
@@ -134,8 +169,6 @@ DECLARE_EVENT_CLASS(wbc_class,
134 __entry->for_background, 169 __entry->for_background,
135 __entry->for_reclaim, 170 __entry->for_reclaim,
136 __entry->range_cyclic, 171 __entry->range_cyclic,
137 __entry->more_io,
138 __entry->older_than_this,
139 __entry->range_start, 172 __entry->range_start,
140 __entry->range_end) 173 __entry->range_end)
141) 174)
@@ -144,14 +177,79 @@ DECLARE_EVENT_CLASS(wbc_class,
144DEFINE_EVENT(wbc_class, name, \ 177DEFINE_EVENT(wbc_class, name, \
145 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ 178 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \
146 TP_ARGS(wbc, bdi)) 179 TP_ARGS(wbc, bdi))
147DEFINE_WBC_EVENT(wbc_writeback_start);
148DEFINE_WBC_EVENT(wbc_writeback_written);
149DEFINE_WBC_EVENT(wbc_writeback_wait);
150DEFINE_WBC_EVENT(wbc_balance_dirty_start);
151DEFINE_WBC_EVENT(wbc_balance_dirty_written);
152DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
153DEFINE_WBC_EVENT(wbc_writepage); 180DEFINE_WBC_EVENT(wbc_writepage);
154 181
182TRACE_EVENT(writeback_queue_io,
183 TP_PROTO(struct bdi_writeback *wb,
184 unsigned long *older_than_this,
185 int moved),
186 TP_ARGS(wb, older_than_this, moved),
187 TP_STRUCT__entry(
188 __array(char, name, 32)
189 __field(unsigned long, older)
190 __field(long, age)
191 __field(int, moved)
192 ),
193 TP_fast_assign(
194 strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
195 __entry->older = older_than_this ? *older_than_this : 0;
196 __entry->age = older_than_this ?
197 (jiffies - *older_than_this) * 1000 / HZ : -1;
198 __entry->moved = moved;
199 ),
200 TP_printk("bdi %s: older=%lu age=%ld enqueue=%d",
201 __entry->name,
202 __entry->older, /* older_than_this in jiffies */
203 __entry->age, /* older_than_this in relative milliseconds */
204 __entry->moved)
205);
206
207TRACE_EVENT(global_dirty_state,
208
209 TP_PROTO(unsigned long background_thresh,
210 unsigned long dirty_thresh
211 ),
212
213 TP_ARGS(background_thresh,
214 dirty_thresh
215 ),
216
217 TP_STRUCT__entry(
218 __field(unsigned long, nr_dirty)
219 __field(unsigned long, nr_writeback)
220 __field(unsigned long, nr_unstable)
221 __field(unsigned long, background_thresh)
222 __field(unsigned long, dirty_thresh)
223 __field(unsigned long, dirty_limit)
224 __field(unsigned long, nr_dirtied)
225 __field(unsigned long, nr_written)
226 ),
227
228 TP_fast_assign(
229 __entry->nr_dirty = global_page_state(NR_FILE_DIRTY);
230 __entry->nr_writeback = global_page_state(NR_WRITEBACK);
231 __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS);
232 __entry->nr_dirtied = global_page_state(NR_DIRTIED);
233 __entry->nr_written = global_page_state(NR_WRITTEN);
234 __entry->background_thresh = background_thresh;
235 __entry->dirty_thresh = dirty_thresh;
236 __entry->dirty_limit = global_dirty_limit;
237 ),
238
239 TP_printk("dirty=%lu writeback=%lu unstable=%lu "
240 "bg_thresh=%lu thresh=%lu limit=%lu "
241 "dirtied=%lu written=%lu",
242 __entry->nr_dirty,
243 __entry->nr_writeback,
244 __entry->nr_unstable,
245 __entry->background_thresh,
246 __entry->dirty_thresh,
247 __entry->dirty_limit,
248 __entry->nr_dirtied,
249 __entry->nr_written
250 )
251);
252
155DECLARE_EVENT_CLASS(writeback_congest_waited_template, 253DECLARE_EVENT_CLASS(writeback_congest_waited_template,
156 254
157 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), 255 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
@@ -187,6 +285,63 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
187 TP_ARGS(usec_timeout, usec_delayed) 285 TP_ARGS(usec_timeout, usec_delayed)
188); 286);
189 287
288DECLARE_EVENT_CLASS(writeback_single_inode_template,
289
290 TP_PROTO(struct inode *inode,
291 struct writeback_control *wbc,
292 unsigned long nr_to_write
293 ),
294
295 TP_ARGS(inode, wbc, nr_to_write),
296
297 TP_STRUCT__entry(
298 __array(char, name, 32)
299 __field(unsigned long, ino)
300 __field(unsigned long, state)
301 __field(unsigned long, age)
302 __field(unsigned long, writeback_index)
303 __field(long, nr_to_write)
304 __field(unsigned long, wrote)
305 ),
306
307 TP_fast_assign(
308 strncpy(__entry->name,
309 dev_name(inode->i_mapping->backing_dev_info->dev), 32);
310 __entry->ino = inode->i_ino;
311 __entry->state = inode->i_state;
312 __entry->age = (jiffies - inode->dirtied_when) *
313 1000 / HZ;
314 __entry->writeback_index = inode->i_mapping->writeback_index;
315 __entry->nr_to_write = nr_to_write;
316 __entry->wrote = nr_to_write - wbc->nr_to_write;
317 ),
318
319 TP_printk("bdi %s: ino=%lu state=%s age=%lu "
320 "index=%lu to_write=%ld wrote=%lu",
321 __entry->name,
322 __entry->ino,
323 show_inode_state(__entry->state),
324 __entry->age,
325 __entry->writeback_index,
326 __entry->nr_to_write,
327 __entry->wrote
328 )
329);
330
331DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_requeue,
332 TP_PROTO(struct inode *inode,
333 struct writeback_control *wbc,
334 unsigned long nr_to_write),
335 TP_ARGS(inode, wbc, nr_to_write)
336);
337
338DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
339 TP_PROTO(struct inode *inode,
340 struct writeback_control *wbc,
341 unsigned long nr_to_write),
342 TP_ARGS(inode, wbc, nr_to_write)
343);
344
190#endif /* _TRACE_WRITEBACK_H */ 345#endif /* _TRACE_WRITEBACK_H */
191 346
192/* This part must be outside protection */ 347/* This part must be outside protection */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8290b1e88257..d6edf8d14f9c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer;
45static int bdi_sync_supers(void *); 45static int bdi_sync_supers(void *);
46static void sync_supers_timer_fn(unsigned long); 46static void sync_supers_timer_fn(unsigned long);
47 47
48void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
49{
50 if (wb1 < wb2) {
51 spin_lock(&wb1->list_lock);
52 spin_lock_nested(&wb2->list_lock, 1);
53 } else {
54 spin_lock(&wb2->list_lock);
55 spin_lock_nested(&wb1->list_lock, 1);
56 }
57}
58
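
(Not part of the patch.) bdi_lock_two() above takes the two list_locks in address order, so concurrent callers passing the same pair in opposite argument order cannot ABBA-deadlock; spin_lock_nested() only informs lockdep about the second acquisition. A minimal userspace sketch of the same ordering rule, using pthread mutexes and illustrative names:

#include <pthread.h>
#include <stdio.h>

/* Illustrative stand-in for struct bdi_writeback. */
struct wb { pthread_mutex_t list_lock; };

/*
 * Lock two writeback structures in a globally consistent order (by
 * address), so two threads locking the same pair with swapped
 * arguments cannot deadlock. Mirrors the idea of bdi_lock_two().
 */
static void wb_lock_two(struct wb *wb1, struct wb *wb2)
{
	if (wb1 < wb2) {
		pthread_mutex_lock(&wb1->list_lock);
		pthread_mutex_lock(&wb2->list_lock);
	} else {
		pthread_mutex_lock(&wb2->list_lock);
		pthread_mutex_lock(&wb1->list_lock);
	}
}

static void wb_unlock_two(struct wb *wb1, struct wb *wb2)
{
	pthread_mutex_unlock(&wb1->list_lock);
	pthread_mutex_unlock(&wb2->list_lock);
}

int main(void)
{
	struct wb a = { PTHREAD_MUTEX_INITIALIZER };
	struct wb b = { PTHREAD_MUTEX_INITIALIZER };

	wb_lock_two(&a, &b);	/* same acquisition order as wb_lock_two(&b, &a) */
	printf("locked both in address order\n");
	wb_unlock_two(&a, &b);
	return 0;
}
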
48#ifdef CONFIG_DEBUG_FS 59#ifdef CONFIG_DEBUG_FS
49#include <linux/debugfs.h> 60#include <linux/debugfs.h>
50#include <linux/seq_file.h> 61#include <linux/seq_file.h>
@@ -67,34 +78,42 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
67 struct inode *inode; 78 struct inode *inode;
68 79
69 nr_dirty = nr_io = nr_more_io = 0; 80 nr_dirty = nr_io = nr_more_io = 0;
70 spin_lock(&inode_wb_list_lock); 81 spin_lock(&wb->list_lock);
71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 82 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
72 nr_dirty++; 83 nr_dirty++;
73 list_for_each_entry(inode, &wb->b_io, i_wb_list) 84 list_for_each_entry(inode, &wb->b_io, i_wb_list)
74 nr_io++; 85 nr_io++;
75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list) 86 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
76 nr_more_io++; 87 nr_more_io++;
77 spin_unlock(&inode_wb_list_lock); 88 spin_unlock(&wb->list_lock);
78 89
79 global_dirty_limits(&background_thresh, &dirty_thresh); 90 global_dirty_limits(&background_thresh, &dirty_thresh);
80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 91 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
81 92
82#define K(x) ((x) << (PAGE_SHIFT - 10)) 93#define K(x) ((x) << (PAGE_SHIFT - 10))
83 seq_printf(m, 94 seq_printf(m,
84 "BdiWriteback: %8lu kB\n" 95 "BdiWriteback: %10lu kB\n"
85 "BdiReclaimable: %8lu kB\n" 96 "BdiReclaimable: %10lu kB\n"
86 "BdiDirtyThresh: %8lu kB\n" 97 "BdiDirtyThresh: %10lu kB\n"
87 "DirtyThresh: %8lu kB\n" 98 "DirtyThresh: %10lu kB\n"
88 "BackgroundThresh: %8lu kB\n" 99 "BackgroundThresh: %10lu kB\n"
89 "b_dirty: %8lu\n" 100 "BdiWritten: %10lu kB\n"
90 "b_io: %8lu\n" 101 "BdiWriteBandwidth: %10lu kBps\n"
91 "b_more_io: %8lu\n" 102 "b_dirty: %10lu\n"
92 "bdi_list: %8u\n" 103 "b_io: %10lu\n"
93 "state: %8lx\n", 104 "b_more_io: %10lu\n"
105 "bdi_list: %10u\n"
106 "state: %10lx\n",
94 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 107 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
95 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 108 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
96 K(bdi_thresh), K(dirty_thresh), 109 K(bdi_thresh),
97 K(background_thresh), nr_dirty, nr_io, nr_more_io, 110 K(dirty_thresh),
111 K(background_thresh),
112 (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
113 (unsigned long) K(bdi->write_bandwidth),
114 nr_dirty,
115 nr_io,
116 nr_more_io,
98 !list_empty(&bdi->bdi_list), bdi->state); 117 !list_empty(&bdi->bdi_list), bdi->state);
99#undef K 118#undef K
100 119
@@ -249,18 +268,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
249 return wb_has_dirty_io(&bdi->wb); 268 return wb_has_dirty_io(&bdi->wb);
250} 269}
251 270
252static void bdi_flush_io(struct backing_dev_info *bdi)
253{
254 struct writeback_control wbc = {
255 .sync_mode = WB_SYNC_NONE,
256 .older_than_this = NULL,
257 .range_cyclic = 1,
258 .nr_to_write = 1024,
259 };
260
261 writeback_inodes_wb(&bdi->wb, &wbc);
262}
263
264/* 271/*
265 * kupdated() used to do this. We cannot do it from the bdi_forker_thread() 272 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
266 * or we risk deadlocking on ->s_umount. The longer term solution would be 273 * or we risk deadlocking on ->s_umount. The longer term solution would be
@@ -446,9 +453,10 @@ static int bdi_forker_thread(void *ptr)
446 if (IS_ERR(task)) { 453 if (IS_ERR(task)) {
447 /* 454 /*
448 * If thread creation fails, force writeout of 455 * If thread creation fails, force writeout of
449 * the bdi from the thread. 456 * the bdi from the thread. Hopefully 1024 is
457 * large enough for efficient IO.
450 */ 458 */
451 bdi_flush_io(bdi); 459 writeback_inodes_wb(&bdi->wb, 1024);
452 } else { 460 } else {
453 /* 461 /*
454 * The spinlock makes sure we do not lose 462 * The spinlock makes sure we do not lose
@@ -629,9 +637,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
629 INIT_LIST_HEAD(&wb->b_dirty); 637 INIT_LIST_HEAD(&wb->b_dirty);
630 INIT_LIST_HEAD(&wb->b_io); 638 INIT_LIST_HEAD(&wb->b_io);
631 INIT_LIST_HEAD(&wb->b_more_io); 639 INIT_LIST_HEAD(&wb->b_more_io);
640 spin_lock_init(&wb->list_lock);
632 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); 641 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
633} 642}
634 643
644/*
645 * Initial write bandwidth: 100 MB/s
646 */
647#define INIT_BW (100 << (20 - PAGE_SHIFT))
648
635int bdi_init(struct backing_dev_info *bdi) 649int bdi_init(struct backing_dev_info *bdi)
636{ 650{
637 int i, err; 651 int i, err;
@@ -654,6 +668,13 @@ int bdi_init(struct backing_dev_info *bdi)
654 } 668 }
655 669
656 bdi->dirty_exceeded = 0; 670 bdi->dirty_exceeded = 0;
671
672 bdi->bw_time_stamp = jiffies;
673 bdi->written_stamp = 0;
674
675 bdi->write_bandwidth = INIT_BW;
676 bdi->avg_write_bandwidth = INIT_BW;
677
657 err = prop_local_init_percpu(&bdi->completions); 678 err = prop_local_init_percpu(&bdi->completions);
658 679
659 if (err) { 680 if (err) {
@@ -677,11 +698,12 @@ void bdi_destroy(struct backing_dev_info *bdi)
677 if (bdi_has_dirty_io(bdi)) { 698 if (bdi_has_dirty_io(bdi)) {
678 struct bdi_writeback *dst = &default_backing_dev_info.wb; 699 struct bdi_writeback *dst = &default_backing_dev_info.wb;
679 700
680 spin_lock(&inode_wb_list_lock); 701 bdi_lock_two(&bdi->wb, dst);
681 list_splice(&bdi->wb.b_dirty, &dst->b_dirty); 702 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
682 list_splice(&bdi->wb.b_io, &dst->b_io); 703 list_splice(&bdi->wb.b_io, &dst->b_io);
683 list_splice(&bdi->wb.b_more_io, &dst->b_more_io); 704 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
684 spin_unlock(&inode_wb_list_lock); 705 spin_unlock(&bdi->wb.list_lock);
706 spin_unlock(&dst->list_lock);
685 } 707 }
686 708
687 bdi_unregister(bdi); 709 bdi_unregister(bdi);
diff --git a/mm/filemap.c b/mm/filemap.c
index 10a171113273..867d40222ec7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -78,7 +78,7 @@
78 * ->i_mutex (generic_file_buffered_write) 78 * ->i_mutex (generic_file_buffered_write)
79 * ->mmap_sem (fault_in_pages_readable->do_page_fault) 79 * ->mmap_sem (fault_in_pages_readable->do_page_fault)
80 * 80 *
81 * inode_wb_list_lock 81 * bdi->wb.list_lock
82 * sb_lock (fs/fs-writeback.c) 82 * sb_lock (fs/fs-writeback.c)
83 * ->mapping->tree_lock (__sync_single_inode) 83 * ->mapping->tree_lock (__sync_single_inode)
84 * 84 *
@@ -96,9 +96,9 @@
96 * ->zone.lru_lock (check_pte_range->isolate_lru_page) 96 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
97 * ->private_lock (page_remove_rmap->set_page_dirty) 97 * ->private_lock (page_remove_rmap->set_page_dirty)
98 * ->tree_lock (page_remove_rmap->set_page_dirty) 98 * ->tree_lock (page_remove_rmap->set_page_dirty)
99 * inode_wb_list_lock (page_remove_rmap->set_page_dirty) 99 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
100 * ->inode->i_lock (page_remove_rmap->set_page_dirty) 100 * ->inode->i_lock (page_remove_rmap->set_page_dirty)
101 * inode_wb_list_lock (zap_pte_range->set_page_dirty) 101 * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
102 * ->inode->i_lock (zap_pte_range->set_page_dirty) 102 * ->inode->i_lock (zap_pte_range->set_page_dirty)
103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
104 * 104 *
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d8767b381b9c..d1960744f881 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -37,6 +37,16 @@
37#include <trace/events/writeback.h> 37#include <trace/events/writeback.h>
38 38
39/* 39/*
40 * Sleep at most 200ms at a time in balance_dirty_pages().
41 */
42#define MAX_PAUSE max(HZ/5, 1)
43
44/*
45 * Estimate write bandwidth at 200ms intervals.
46 */
47#define BANDWIDTH_INTERVAL max(HZ/5, 1)
48
49/*
40 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 50 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
41 * will look to see if it needs to force writeback or throttling. 51 * will look to see if it needs to force writeback or throttling.
42 */ 52 */
@@ -111,6 +121,7 @@ EXPORT_SYMBOL(laptop_mode);
111 121
112/* End of sysctl-exported parameters */ 122/* End of sysctl-exported parameters */
113 123
124unsigned long global_dirty_limit;
114 125
115/* 126/*
116 * Scale the writeback cache size proportional to the relative writeout speeds. 127 * Scale the writeback cache size proportional to the relative writeout speeds.
@@ -219,6 +230,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
219 */ 230 */
220static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 231static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
221{ 232{
233 __inc_bdi_stat(bdi, BDI_WRITTEN);
222 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 234 __prop_inc_percpu_max(&vm_completions, &bdi->completions,
223 bdi->max_prop_frac); 235 bdi->max_prop_frac);
224} 236}
@@ -244,13 +256,8 @@ void task_dirty_inc(struct task_struct *tsk)
244static void bdi_writeout_fraction(struct backing_dev_info *bdi, 256static void bdi_writeout_fraction(struct backing_dev_info *bdi,
245 long *numerator, long *denominator) 257 long *numerator, long *denominator)
246{ 258{
247 if (bdi_cap_writeback_dirty(bdi)) { 259 prop_fraction_percpu(&vm_completions, &bdi->completions,
248 prop_fraction_percpu(&vm_completions, &bdi->completions,
249 numerator, denominator); 260 numerator, denominator);
250 } else {
251 *numerator = 0;
252 *denominator = 1;
253 }
254} 261}
255 262
256static inline void task_dirties_fraction(struct task_struct *tsk, 263static inline void task_dirties_fraction(struct task_struct *tsk,
@@ -274,12 +281,13 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
274 * effectively curb the growth of dirty pages. Light dirtiers with high enough 281 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 * dirty threshold may never get throttled. 282 * dirty threshold may never get throttled.
276 */ 283 */
284#define TASK_LIMIT_FRACTION 8
277static unsigned long task_dirty_limit(struct task_struct *tsk, 285static unsigned long task_dirty_limit(struct task_struct *tsk,
278 unsigned long bdi_dirty) 286 unsigned long bdi_dirty)
279{ 287{
280 long numerator, denominator; 288 long numerator, denominator;
281 unsigned long dirty = bdi_dirty; 289 unsigned long dirty = bdi_dirty;
282 u64 inv = dirty >> 3; 290 u64 inv = dirty / TASK_LIMIT_FRACTION;
283 291
284 task_dirties_fraction(tsk, &numerator, &denominator); 292 task_dirties_fraction(tsk, &numerator, &denominator);
285 inv *= numerator; 293 inv *= numerator;
@@ -290,6 +298,12 @@ static unsigned long task_dirty_limit(struct task_struct *tsk,
290 return max(dirty, bdi_dirty/2); 298 return max(dirty, bdi_dirty/2);
291} 299}
292 300
301/* Minimum limit for any task */
302static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
303{
304 return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
305}
306
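
A quick worked check of the arithmetic (a sketch, not kernel code): with TASK_LIMIT_FRACTION = 8, task_dirty_limit() can subtract at most bdi_dirty/8 from a task's threshold, so task_min_dirty_limit() is exactly the lowest threshold any task can end up with:

#include <stdio.h>

#define TASK_LIMIT_FRACTION 8

/* Lower bound used for clearing dirty_exceeded: bdi_dirty - bdi_dirty/8. */
static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
{
	return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
}

int main(void)
{
	unsigned long bdi_dirty = 80000;	/* pages, example value */

	/*
	 * A task whose dirtying fraction is ~100% gets the full
	 * bdi_dirty/8 subtracted, i.e. it is throttled right at the
	 * point where task_min_dirty_limit() sits.
	 */
	printf("bdi_dirty=%lu min_task_limit=%lu max_reduction=%lu\n",
	       bdi_dirty, task_min_dirty_limit(bdi_dirty),
	       bdi_dirty / TASK_LIMIT_FRACTION);
	return 0;
}
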
293/* 307/*
294 * 308 *
295 */ 309 */
@@ -397,6 +411,11 @@ unsigned long determine_dirtyable_memory(void)
397 return x + 1; /* Ensure that we never return 0 */ 411 return x + 1; /* Ensure that we never return 0 */
398} 412}
399 413
414static unsigned long hard_dirty_limit(unsigned long thresh)
415{
416 return max(thresh, global_dirty_limit);
417}
418
400/* 419/*
401 * global_dirty_limits - background-writeback and dirty-throttling thresholds 420 * global_dirty_limits - background-writeback and dirty-throttling thresholds
402 * 421 *
@@ -435,12 +454,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
435 } 454 }
436 *pbackground = background; 455 *pbackground = background;
437 *pdirty = dirty; 456 *pdirty = dirty;
457 trace_global_dirty_state(background, dirty);
438} 458}
439 459
440/* 460/**
441 * bdi_dirty_limit - @bdi's share of dirty throttling threshold 461 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
462 * @bdi: the backing_dev_info to query
463 * @dirty: global dirty limit in pages
442 * 464 *
443 * Allocate high/low dirty limits to fast/slow devices, in order to prevent 465 * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
466 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
467 * Note that the "limit" in the name is not treated as a hard limit by
468 * balance_dirty_pages().
469 *
470 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
444 * - starving fast devices 471 * - starving fast devices
445 * - piling up dirty pages (that will take long time to sync) on slow devices 472 * - piling up dirty pages (that will take long time to sync) on slow devices
446 * 473 *
@@ -468,6 +495,153 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
468 return bdi_dirty; 495 return bdi_dirty;
469} 496}
470 497
498static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
499 unsigned long elapsed,
500 unsigned long written)
501{
502 const unsigned long period = roundup_pow_of_two(3 * HZ);
503 unsigned long avg = bdi->avg_write_bandwidth;
504 unsigned long old = bdi->write_bandwidth;
505 u64 bw;
506
507 /*
508 * bw = written * HZ / elapsed
509 *
510 * bw * elapsed + write_bandwidth * (period - elapsed)
511 * write_bandwidth = ---------------------------------------------------
512 * period
513 */
514 bw = written - bdi->written_stamp;
515 bw *= HZ;
516 if (unlikely(elapsed > period)) {
517 do_div(bw, elapsed);
518 avg = bw;
519 goto out;
520 }
521 bw += (u64)bdi->write_bandwidth * (period - elapsed);
522 bw >>= ilog2(period);
523
524 /*
525 * one more level of smoothing, for filtering out sudden spikes
526 */
527 if (avg > old && old >= (unsigned long)bw)
528 avg -= (avg - old) >> 3;
529
530 if (avg < old && old <= (unsigned long)bw)
531 avg += (old - avg) >> 3;
532
533out:
534 bdi->write_bandwidth = bw;
535 bdi->avg_write_bandwidth = avg;
536}
537
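
To make the weighting above concrete, here is a self-contained userspace sketch of one update step (assumptions for the sketch only: HZ=1000 and 4 KB pages, so the period roundup_pow_of_two(3*HZ) is 4096 and INIT_BW is 25600 pages/s; the helper and the sample numbers are illustrative, not kernel API):

#include <stdio.h>
#include <stdint.h>

#define HZ		1000UL	/* assumed jiffies rate for this sketch */
#define PERIOD		4096UL	/* roundup_pow_of_two(3 * HZ) for HZ=1000 */
#define PERIOD_SHIFT	12	/* ilog2(PERIOD) */

/*
 * One step of the estimator above, in userspace arithmetic: scale the
 * pages written in this interval to pages/second, then blend with the
 * previous estimate, weighted by how much of the ~3s period the
 * interval covers; avg gets a second level of smoothing.
 */
static void update_write_bandwidth(unsigned long *write_bw,
				   unsigned long *avg_bw,
				   unsigned long elapsed,	/* jiffies */
				   unsigned long written)	/* pages in interval */
{
	unsigned long old = *write_bw;
	unsigned long avg = *avg_bw;
	uint64_t bw = (uint64_t)written * HZ;

	if (elapsed > PERIOD) {
		bw /= elapsed;
		avg = (unsigned long)bw;
	} else {
		bw += (uint64_t)old * (PERIOD - elapsed);
		bw >>= PERIOD_SHIFT;

		/* Nudge avg 1/8 of the way toward old only when old lies
		 * between avg and the new sample, filtering sudden spikes. */
		if (avg > old && old >= (unsigned long)bw)
			avg -= (avg - old) >> 3;
		if (avg < old && old <= (unsigned long)bw)
			avg += (old - avg) >> 3;
	}

	*write_bw = (unsigned long)bw;
	*avg_bw = avg;
}

int main(void)
{
	/* INIT_BW from this patch: 100 MB/s = 25600 4k-pages/s. */
	unsigned long write_bw = 25600, avg_bw = 25600;

	/* One 200ms interval in which 2000 pages (~8 MB) completed. */
	update_write_bandwidth(&write_bw, &avg_bw, HZ / 5, 2000);
	printf("write_bw=%lu avg=%lu (pages/s)\n", write_bw, avg_bw);
	return 0;
}
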
538/*
539 * The global dirtyable memory and dirty threshold could be suddenly knocked
540 * down by a large amount (eg. on the startup of KVM in a swapless system).
541 * This may throw the system into deep dirty exceeded state and throttle
542 * heavy/light dirtiers alike. To retain good responsiveness, maintain
543 * global_dirty_limit so that it tracks down slowly toward the lowered
544 * dirty threshold.
545 */
546static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
547{
548 unsigned long limit = global_dirty_limit;
549
550 /*
551 * Follow up in one step.
552 */
553 if (limit < thresh) {
554 limit = thresh;
555 goto update;
556 }
557
558 /*
559 * Follow down slowly. Use the higher one as the target, because thresh
560 * may drop below dirty. This is exactly the reason to introduce
561 * global_dirty_limit which is guaranteed to lie above the dirty pages.
562 */
563 thresh = max(thresh, dirty);
564 if (limit > thresh) {
565 limit -= (limit - thresh) >> 5;
566 goto update;
567 }
568 return;
569update:
570 global_dirty_limit = limit;
571}
572
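
The >> 5 step above makes global_dirty_limit decay exponentially: each update while the threshold stays low removes about 1/32 of the remaining gap. A purely illustrative simulation of the "follow down slowly" branch:

#include <stdio.h>

/* Mirror of update_dirty_limit()'s "follow down slowly" branch. */
static unsigned long follow_down(unsigned long limit, unsigned long target)
{
	if (limit > target)
		limit -= (limit - target) >> 5;
	return limit;
}

int main(void)
{
	unsigned long limit = 200000;	/* pages, before the sudden drop */
	unsigned long target = 50000;	/* max(thresh, dirty) after the drop */
	int step;

	for (step = 1; step <= 128; step++) {
		limit = follow_down(limit, target);
		if (step % 32 == 0)	/* ~1/e of the gap gone every 32 updates */
			printf("after %3d updates: limit=%lu\n", step, limit);
	}
	return 0;
}
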
573static void global_update_bandwidth(unsigned long thresh,
574 unsigned long dirty,
575 unsigned long now)
576{
577 static DEFINE_SPINLOCK(dirty_lock);
578 static unsigned long update_time;
579
580 /*
581 * check locklessly first to optimize away locking for the most time
582 */
583 if (time_before(now, update_time + BANDWIDTH_INTERVAL))
584 return;
585
586 spin_lock(&dirty_lock);
587 if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
588 update_dirty_limit(thresh, dirty);
589 update_time = now;
590 }
591 spin_unlock(&dirty_lock);
592}
593
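
global_update_bandwidth() is the usual check-then-lock-then-recheck pattern: most callers pay only an unlocked time comparison, and the caller that does take the lock rechecks so the update happens at most once per interval. A userspace sketch of the pattern (illustrative names, wall-clock milliseconds instead of jiffies; a real multi-threaded program would want an atomic for the unlocked read):

#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define INTERVAL_MS 200

static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;
static long last_update_ms;

static long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000L + ts.tv_nsec / 1000000L;
}

/* Returns 1 if this caller performed the rate-limited update. */
static int maybe_update(void)
{
	long now = now_ms();
	int updated = 0;

	/* Cheap unlocked check filters out most callers. */
	if (now < last_update_ms + INTERVAL_MS)
		return 0;

	pthread_mutex_lock(&update_lock);
	/* Recheck under the lock: another caller may have just updated. */
	if (now >= last_update_ms + INTERVAL_MS) {
		/* ... the expensive update would go here ... */
		last_update_ms = now;
		updated = 1;
	}
	pthread_mutex_unlock(&update_lock);
	return updated;
}

int main(void)
{
	printf("first call updates: %d\n", maybe_update());
	printf("immediate second call updates: %d\n", maybe_update());
	return 0;
}
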
594void __bdi_update_bandwidth(struct backing_dev_info *bdi,
595 unsigned long thresh,
596 unsigned long dirty,
597 unsigned long bdi_thresh,
598 unsigned long bdi_dirty,
599 unsigned long start_time)
600{
601 unsigned long now = jiffies;
602 unsigned long elapsed = now - bdi->bw_time_stamp;
603 unsigned long written;
604
605 /*
606 * rate-limit, only update once every 200ms.
607 */
608 if (elapsed < BANDWIDTH_INTERVAL)
609 return;
610
611 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
612
613 /*
614 * Skip quiet periods when disk bandwidth is under-utilized.
615 * (at least 1s idle time between two flusher runs)
616 */
617 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
618 goto snapshot;
619
620 if (thresh)
621 global_update_bandwidth(thresh, dirty, now);
622
623 bdi_update_write_bandwidth(bdi, elapsed, written);
624
625snapshot:
626 bdi->written_stamp = written;
627 bdi->bw_time_stamp = now;
628}
629
630static void bdi_update_bandwidth(struct backing_dev_info *bdi,
631 unsigned long thresh,
632 unsigned long dirty,
633 unsigned long bdi_thresh,
634 unsigned long bdi_dirty,
635 unsigned long start_time)
636{
637 if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
638 return;
639 spin_lock(&bdi->wb.list_lock);
640 __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
641 start_time);
642 spin_unlock(&bdi->wb.list_lock);
643}
644
471/* 645/*
472 * balance_dirty_pages() must be called by processes which are generating dirty 646 * balance_dirty_pages() must be called by processes which are generating dirty
473 * data. It looks at the number of dirty pages in the machine and will force 647 * data. It looks at the number of dirty pages in the machine and will force
@@ -478,27 +652,25 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
478static void balance_dirty_pages(struct address_space *mapping, 652static void balance_dirty_pages(struct address_space *mapping,
479 unsigned long write_chunk) 653 unsigned long write_chunk)
480{ 654{
481 long nr_reclaimable, bdi_nr_reclaimable; 655 unsigned long nr_reclaimable, bdi_nr_reclaimable;
482 long nr_writeback, bdi_nr_writeback; 656 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
657 unsigned long bdi_dirty;
483 unsigned long background_thresh; 658 unsigned long background_thresh;
484 unsigned long dirty_thresh; 659 unsigned long dirty_thresh;
485 unsigned long bdi_thresh; 660 unsigned long bdi_thresh;
661 unsigned long task_bdi_thresh;
662 unsigned long min_task_bdi_thresh;
486 unsigned long pages_written = 0; 663 unsigned long pages_written = 0;
487 unsigned long pause = 1; 664 unsigned long pause = 1;
488 bool dirty_exceeded = false; 665 bool dirty_exceeded = false;
666 bool clear_dirty_exceeded = true;
489 struct backing_dev_info *bdi = mapping->backing_dev_info; 667 struct backing_dev_info *bdi = mapping->backing_dev_info;
668 unsigned long start_time = jiffies;
490 669
491 for (;;) { 670 for (;;) {
492 struct writeback_control wbc = {
493 .sync_mode = WB_SYNC_NONE,
494 .older_than_this = NULL,
495 .nr_to_write = write_chunk,
496 .range_cyclic = 1,
497 };
498
499 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 671 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
500 global_page_state(NR_UNSTABLE_NFS); 672 global_page_state(NR_UNSTABLE_NFS);
501 nr_writeback = global_page_state(NR_WRITEBACK); 673 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
502 674
503 global_dirty_limits(&background_thresh, &dirty_thresh); 675 global_dirty_limits(&background_thresh, &dirty_thresh);
504 676
@@ -507,12 +679,12 @@ static void balance_dirty_pages(struct address_space *mapping,
507 * catch-up. This avoids (excessively) small writeouts 679 * catch-up. This avoids (excessively) small writeouts
508 * when the bdi limits are ramping up. 680 * when the bdi limits are ramping up.
509 */ 681 */
510 if (nr_reclaimable + nr_writeback <= 682 if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
511 (background_thresh + dirty_thresh) / 2)
512 break; 683 break;
513 684
514 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 685 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
515 bdi_thresh = task_dirty_limit(current, bdi_thresh); 686 min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
687 task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
516 688
517 /* 689 /*
518 * In order to avoid the stacked BDI deadlock we need 690 * In order to avoid the stacked BDI deadlock we need
@@ -524,12 +696,14 @@ static void balance_dirty_pages(struct address_space *mapping,
524 * actually dirty; with m+n sitting in the percpu 696 * actually dirty; with m+n sitting in the percpu
525 * deltas. 697 * deltas.
526 */ 698 */
527 if (bdi_thresh < 2*bdi_stat_error(bdi)) { 699 if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
528 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 700 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
529 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); 701 bdi_dirty = bdi_nr_reclaimable +
702 bdi_stat_sum(bdi, BDI_WRITEBACK);
530 } else { 703 } else {
531 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 704 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
532 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); 705 bdi_dirty = bdi_nr_reclaimable +
706 bdi_stat(bdi, BDI_WRITEBACK);
533 } 707 }
534 708
535 /* 709 /*
@@ -538,9 +712,10 @@ static void balance_dirty_pages(struct address_space *mapping,
538 * bdi or process from holding back light ones; The latter is 712 * bdi or process from holding back light ones; The latter is
539 * the last resort safeguard. 713 * the last resort safeguard.
540 */ 714 */
541 dirty_exceeded = 715 dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
542 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) 716 (nr_dirty > dirty_thresh);
543 || (nr_reclaimable + nr_writeback > dirty_thresh); 717 clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
718 (nr_dirty <= dirty_thresh);
544 719
545 if (!dirty_exceeded) 720 if (!dirty_exceeded)
546 break; 721 break;
@@ -548,6 +723,9 @@ static void balance_dirty_pages(struct address_space *mapping,
548 if (!bdi->dirty_exceeded) 723 if (!bdi->dirty_exceeded)
549 bdi->dirty_exceeded = 1; 724 bdi->dirty_exceeded = 1;
550 725
726 bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
727 bdi_thresh, bdi_dirty, start_time);
728
551 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 729 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
552 * Unstable writes are a feature of certain networked 730 * Unstable writes are a feature of certain networked
553 * filesystems (i.e. NFS) in which data may have been 731 * filesystems (i.e. NFS) in which data may have been
@@ -557,17 +735,40 @@ static void balance_dirty_pages(struct address_space *mapping,
557 * threshold otherwise wait until the disk writes catch 735 * threshold otherwise wait until the disk writes catch
558 * up. 736 * up.
559 */ 737 */
560 trace_wbc_balance_dirty_start(&wbc, bdi); 738 trace_balance_dirty_start(bdi);
561 if (bdi_nr_reclaimable > bdi_thresh) { 739 if (bdi_nr_reclaimable > task_bdi_thresh) {
562 writeback_inodes_wb(&bdi->wb, &wbc); 740 pages_written += writeback_inodes_wb(&bdi->wb,
563 pages_written += write_chunk - wbc.nr_to_write; 741 write_chunk);
564 trace_wbc_balance_dirty_written(&wbc, bdi); 742 trace_balance_dirty_written(bdi, pages_written);
565 if (pages_written >= write_chunk) 743 if (pages_written >= write_chunk)
566 break; /* We've done our duty */ 744 break; /* We've done our duty */
567 } 745 }
568 trace_wbc_balance_dirty_wait(&wbc, bdi);
569 __set_current_state(TASK_UNINTERRUPTIBLE); 746 __set_current_state(TASK_UNINTERRUPTIBLE);
570 io_schedule_timeout(pause); 747 io_schedule_timeout(pause);
748 trace_balance_dirty_wait(bdi);
749
750 dirty_thresh = hard_dirty_limit(dirty_thresh);
751 /*
752 * max-pause area. If dirty exceeded but still within this
753 * area, no need to sleep for more than 200ms: (a) 8 pages per
754 * 200ms is typically more than enough to curb heavy dirtiers;
755 * (b) the pause time limit makes the dirtiers more responsive.
756 */
757 if (nr_dirty < dirty_thresh +
758 dirty_thresh / DIRTY_MAXPAUSE_AREA &&
759 time_after(jiffies, start_time + MAX_PAUSE))
760 break;
761 /*
762 * pass-good area. When some bdi gets blocked (eg. NFS server
763 * not responding), or write bandwidth dropped dramatically due
764 * to concurrent reads, or dirty threshold suddenly dropped and
765 * the dirty pages cannot be brought down anytime soon (eg. on
766 * slow USB stick), at least let go of the good bdi's.
767 */
768 if (nr_dirty < dirty_thresh +
769 dirty_thresh / DIRTY_PASSGOOD_AREA &&
770 bdi_dirty < bdi_thresh)
771 break;
571 772
572 /* 773 /*
573 * Increase the delay for each loop, up to our previous 774 * Increase the delay for each loop, up to our previous
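
The two early-break tests above rely on DIRTY_MAXPAUSE_AREA and DIRTY_PASSGOOD_AREA, which are not defined in this hunk (they come from elsewhere in the series). A hedged sketch of the decision logic, with placeholder values of 16 and 8 that may not match the real definitions:

#include <stdio.h>

/*
 * Placeholder values: the real constants are defined elsewhere in this
 * patch series, not in this hunk, and may differ.
 */
#define DIRTY_MAXPAUSE_AREA	16
#define DIRTY_PASSGOOD_AREA	8

/* 0 = keep looping, 1 = max-pause break, 2 = pass-good break */
static int early_break(unsigned long nr_dirty, unsigned long dirty_thresh,
		       unsigned long bdi_dirty, unsigned long bdi_thresh,
		       int paused_200ms)
{
	/* Within limit + limit/DIRTY_MAXPAUSE_AREA: cap the total pause. */
	if (nr_dirty < dirty_thresh + dirty_thresh / DIRTY_MAXPAUSE_AREA &&
	    paused_200ms)
		return 1;

	/* Within limit + limit/DIRTY_PASSGOOD_AREA: let clean bdi's go. */
	if (nr_dirty < dirty_thresh + dirty_thresh / DIRTY_PASSGOOD_AREA &&
	    bdi_dirty < bdi_thresh)
		return 2;

	return 0;
}

int main(void)
{
	unsigned long thresh = 100000;	/* pages, example value */

	printf("%d\n", early_break(thresh + 5000, thresh, 1000, 2000, 1));
	printf("%d\n", early_break(thresh + 10000, thresh, 1000, 2000, 0));
	return 0;
}
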
@@ -578,7 +779,8 @@ static void balance_dirty_pages(struct address_space *mapping,
578 pause = HZ / 10; 779 pause = HZ / 10;
579 } 780 }
580 781
581 if (!dirty_exceeded && bdi->dirty_exceeded) 782 /* Clear dirty_exceeded flag only when no task can exceed the limit */
783 if (clear_dirty_exceeded && bdi->dirty_exceeded)
582 bdi->dirty_exceeded = 0; 784 bdi->dirty_exceeded = 0;
583 785
584 if (writeback_in_progress(bdi)) 786 if (writeback_in_progress(bdi))
@@ -626,9 +828,13 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
626void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 828void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
627 unsigned long nr_pages_dirtied) 829 unsigned long nr_pages_dirtied)
628{ 830{
831 struct backing_dev_info *bdi = mapping->backing_dev_info;
629 unsigned long ratelimit; 832 unsigned long ratelimit;
630 unsigned long *p; 833 unsigned long *p;
631 834
835 if (!bdi_cap_account_dirty(bdi))
836 return;
837
632 ratelimit = ratelimit_pages; 838 ratelimit = ratelimit_pages;
633 if (mapping->backing_dev_info->dirty_exceeded) 839 if (mapping->backing_dev_info->dirty_exceeded)
634 ratelimit = 8; 840 ratelimit = 8;
@@ -892,12 +1098,12 @@ int write_cache_pages(struct address_space *mapping,
892 range_whole = 1; 1098 range_whole = 1;
893 cycled = 1; /* ignore range_cyclic tests */ 1099 cycled = 1; /* ignore range_cyclic tests */
894 } 1100 }
895 if (wbc->sync_mode == WB_SYNC_ALL) 1101 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
896 tag = PAGECACHE_TAG_TOWRITE; 1102 tag = PAGECACHE_TAG_TOWRITE;
897 else 1103 else
898 tag = PAGECACHE_TAG_DIRTY; 1104 tag = PAGECACHE_TAG_DIRTY;
899retry: 1105retry:
900 if (wbc->sync_mode == WB_SYNC_ALL) 1106 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
901 tag_pages_for_writeback(mapping, index, end); 1107 tag_pages_for_writeback(mapping, index, end);
902 done_index = index; 1108 done_index = index;
903 while (!done && (index <= end)) { 1109 while (!done && (index <= end)) {
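
The hunk above extends the PAGECACHE_TAG_TOWRITE path to the new wbc->tagged_writepages flag. The point of the tag is livelock avoidance: snapshot the dirty set before writing, so pages re-dirtied during the walk are left for the next pass. A toy userspace model of that idea (no kernel API, everything illustrative):

#include <stdio.h>

#define NR_PAGES 8

/*
 * Toy model of tag-based writeback: page state is just two flags, and
 * "writeback" only visits pages that carried the TOWRITE tag when the
 * pass started.
 */
static int dirty[NR_PAGES];
static int towrite[NR_PAGES];

static void tag_pages_for_writeback(void)
{
	int i;

	for (i = 0; i < NR_PAGES; i++)
		towrite[i] = dirty[i];
}

static void write_tagged_pages(void)
{
	int i;

	for (i = 0; i < NR_PAGES; i++) {
		if (!towrite[i])
			continue;
		towrite[i] = 0;
		dirty[i] = 0;
		printf("wrote page %d\n", i);

		/*
		 * A concurrent dirtier re-dirties page 0 during the walk;
		 * it is NOT picked up in this pass because it carries no
		 * TOWRITE tag, so the pass terminates.
		 */
		dirty[0] = 1;
	}
}

int main(void)
{
	dirty[1] = dirty[3] = dirty[5] = 1;
	tag_pages_for_writeback();
	write_tagged_pages();
	printf("page 0 dirty again, left for next pass: %d\n", dirty[0]);
	return 0;
}
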
diff --git a/mm/rmap.c b/mm/rmap.c
index 9701574bb67a..8005080fb9e3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -31,11 +31,11 @@
31 * mmlist_lock (in mmput, drain_mmlist and others) 31 * mmlist_lock (in mmput, drain_mmlist and others)
32 * mapping->private_lock (in __set_page_dirty_buffers) 32 * mapping->private_lock (in __set_page_dirty_buffers)
33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty) 33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
34 * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) 34 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
35 * sb_lock (within inode_lock in fs/fs-writeback.c) 35 * sb_lock (within inode_lock in fs/fs-writeback.c)
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_wb_list_lock in __sync_single_inode) 38 * within bdi.wb->list_lock in __sync_single_inode)
39 * 39 *
40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) 40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
41 * ->tasklist_lock 41 * ->tasklist_lock