From a5989bdc981ec85e0734ac22519cc0b780813d7b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 16 Sep 2009 19:22:48 +0200 Subject: fs: Fix busyloop in wb_writeback() If all inodes are under writeback (e.g. in case when there's only one inode with dirty pages), wb_writeback() with WB_SYNC_NONE work basically degrades to busylooping until I_SYNC flags of the inode is cleared. Fix the problem by waiting on I_SYNC flags of an inode on b_more_io list in case we failed to write anything. Tested-by: Wu Fengguang Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8e1e5e19d21e..c59d6737036c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -706,6 +706,7 @@ static long wb_writeback(struct bdi_writeback *wb, }; unsigned long oldest_jif; long wrote = 0; + struct inode *inode; if (wbc.for_kupdate) { wbc.older_than_this = &oldest_jif; @@ -747,8 +748,24 @@ static long wb_writeback(struct bdi_writeback *wb, * If we ran out of stuff to write, bail unless more_io got set */ if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { - if (wbc.more_io && !wbc.for_kupdate) + if (wbc.more_io && !wbc.for_kupdate) { + if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) + continue; + /* + * Nothing written. Wait for some inode to + * become available for writeback. Otherwise + * we'll just busyloop. 
+ */ + spin_lock(&inode_lock); + if (!list_empty(&wb->b_more_io)) { + inode = list_entry( + wb->b_more_io.prev, + struct inode, i_list); + inode_wait_for_writeback(inode); + } + spin_unlock(&inode_lock); continue; + } break; } } -- cgit v1.2.2 From d3ddec7635b6fb37cb49e3553bdeea59642be653 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 23 Sep 2009 20:33:40 +0800 Subject: writeback: stop background writeback when below background threshold Treat bdi_start_writeback(0) as a special request to do background write, and stop such work when we are below the background dirty threshold. Also simplify the (nr_pages <= 0) checks. Since we already pass in nr_pages=LONG_MAX for WB_SYNC_ALL and background writes, we don't need to worry about it being decreased to zero. Reported-by: Richard Kennedy CC: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Wu Fengguang Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c59d6737036c..476be9b10881 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -41,8 +41,9 @@ struct wb_writeback_args { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; - int for_kupdate; - int range_cyclic; + int for_kupdate:1; + int range_cyclic:1; + int for_background:1; }; /* @@ -257,6 +258,15 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) .range_cyclic = 1, }; + /* + * We treat @nr_pages=0 as the special case to do background writeback, + * ie. to sync pages until the background dirty threshold is reached. 
+ */ + if (!nr_pages) { + args.nr_pages = LONG_MAX; + args.for_background = 1; + } + bdi_alloc_queue_work(bdi, &args); } @@ -720,20 +730,16 @@ static long wb_writeback(struct bdi_writeback *wb, for (;;) { /* - * Don't flush anything for non-integrity writeback where - * no nr_pages was given + * Stop writeback when nr_pages has been consumed */ - if (!args->for_kupdate && args->nr_pages <= 0 && - args->sync_mode == WB_SYNC_NONE) + if (args->nr_pages <= 0) break; /* - * If no specific pages were given and this is just a - * periodic background writeout and we are below the - * background dirty threshold, don't do anything + * For background writeout, stop when we are below the + * background dirty threshold */ - if (args->for_kupdate && args->nr_pages <= 0 && - !over_bground_thresh()) + if (args->for_background && !over_bground_thresh()) break; wbc.more_io = 0; -- cgit v1.2.2 From 7fbdea32328312c65870c397a0a436c3226c8631 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 23 Sep 2009 20:33:41 +0800 Subject: writeback: kupdate writeback shall not stop when more io is possible Fix the kupdate case, which disregards wbc.more_io and stops writeback prematurely even when there are more inodes to be synced. wbc.more_io should always be respected. Also remove the pages_skipped check. It will be set when some page(s) of some inode(s) cannot be written for now. Such inodes will be delayed for a while. This variable has nothing to do with whether there are other writeable inodes. 
CC: Jan Kara CC: Dave Chinner CC: Peter Zijlstra Signed-off-by: Wu Fengguang Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 476be9b10881..551684de1392 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -753,8 +753,8 @@ static long wb_writeback(struct bdi_writeback *wb, /* * If we ran out of stuff to write, bail unless more_io got set */ - if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { - if (wbc.more_io && !wbc.for_kupdate) { + if (wbc.nr_to_write > 0) { + if (wbc.more_io) { if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) continue; /* -- cgit v1.2.2 From ae1b7f7d4b9ea587fda95c38301f4e72e8146634 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 23 Sep 2009 20:33:42 +0800 Subject: writeback: cleanup writeback_single_inode() Make the if-else straight in writeback_single_inode(). No behavior change. Cc: Jan Kara Cc: Michael Rubin Cc: Peter Zijlstra Signed-off-by: Fengguang Wu Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 551684de1392..916e83489caa 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -449,8 +449,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & (I_FREEING | I_CLEAR))) { - if (!(inode->i_state & I_DIRTY) && - mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + if (inode->i_state & I_DIRTY) { + /* + * Someone redirtied the inode while were writing back + * the pages. + */ + redirty_tail(inode); + } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* * We didn't write back all the pages. nfs_writepages() * sometimes bales out without doing anything. 
Redirty @@ -494,12 +499,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) inode->i_state |= I_DIRTY_PAGES; redirty_tail(inode); } - } else if (inode->i_state & I_DIRTY) { - /* - * Someone redirtied the inode while were writing back - * the pages. - */ - redirty_tail(inode); } else if (atomic_read(&inode->i_count)) { /* * The inode is clean, inuse -- cgit v1.2.2 From 71fd05a887e0f3f6bfff76ff81b33776177d0606 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 23 Sep 2009 19:32:26 +0200 Subject: writeback: improve readability of the wb_writeback() continue/break logic And throw some comments in there, too. Reviewed-by: Wu Fengguang Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 916e83489caa..15e375bf93e6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -750,29 +750,32 @@ static long wb_writeback(struct bdi_writeback *wb, wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; /* - * If we ran out of stuff to write, bail unless more_io got set + * If we consumed everything, see if we have more */ - if (wbc.nr_to_write > 0) { - if (wbc.more_io) { - if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) - continue; - /* - * Nothing written. Wait for some inode to - * become available for writeback. Otherwise - * we'll just busyloop. - */ - spin_lock(&inode_lock); - if (!list_empty(&wb->b_more_io)) { - inode = list_entry( - wb->b_more_io.prev, - struct inode, i_list); - inode_wait_for_writeback(inode); - } - spin_unlock(&inode_lock); - continue; - } + if (wbc.nr_to_write <= 0) + continue; + /* + * Didn't write everything and we don't have more IO, bail + */ + if (!wbc.more_io) break; + /* + * Did we write something? Try for more + */ + if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) + continue; + /* + * Nothing written. 
Wait for some inode to + * become available for writeback. Otherwise + * we'll just busyloop. + */ + spin_lock(&inode_lock); + if (!list_empty(&wb->b_more_io)) { + inode = list_entry(wb->b_more_io.prev, + struct inode, i_list); + inode_wait_for_writeback(inode); } + spin_unlock(&inode_lock); } return wrote; -- cgit v1.2.2 From 5b0830cb9085f4b69f9d57d7f3aaff322ffbec26 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 23 Sep 2009 19:37:09 +0200 Subject: writeback: get rid to incorrect references to pdflush in comments Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 15e375bf93e6..15944f754e15 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -320,7 +320,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) * For inodes being constantly redirtied, dirtied_when can get stuck. * It _appears_ to be in the future, but is actually in distant past. * This test is necessary to prevent such wrapped-around relative times - * from permanently stopping the whole pdflush writeback. + * from permanently stopping the whole bdi writeback. */ ret = ret && time_before_eq(inode->dirtied_when, jiffies); #endif @@ -1085,9 +1085,6 @@ EXPORT_SYMBOL(__mark_inode_dirty); * If older_than_this is non-NULL, then only write out inodes which * had their first dirtying at a time earlier than *older_than_this. * - * If we're a pdlfush thread, then implement pdflush collision avoidance - * against the entire list. - * * If `bdi' is non-zero then we're being asked to writeback a specific queue. * This function assumes that the blockdev superblock's inodes are backed by * a variety of queues, so all inodes are searched. 
For other superblocks, -- cgit v1.2.2 From 5c03449d34debca0deab58046377e1175c1bcd7e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 24 Sep 2009 14:42:33 +0200 Subject: writeback: move inodes from one super_block together __mark_inode_dirty adds inode to wb dirty list in random order. If a disk has several partitions, writeback might keep spindle moving between partitions. To reduce the move, better write big chunk of one partition and then move to another. Inodes from one fs usually are in one partition, so ideally moving inodes from one fs together should reduce spindle move. This patch tries to address this. Before per-bdi writeback is added, the behavior is write inodes from one fs first and then another, so the patch restores previous behavior. The loop in the patch is a bit ugly, should we add a dirty list for each superblock in bdi_writeback? Test in a two partition disk with attached fio script shows about 3% ~ 6% improvement. Signed-off-by: Shaohua Li Reviewed-by: Wu Fengguang Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 15944f754e15..b27406d51bc7 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -334,13 +334,28 @@ static void move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, unsigned long *older_than_this) { + LIST_HEAD(tmp); + struct list_head *pos, *node; + struct super_block *sb; + struct inode *inode; + while (!list_empty(delaying_queue)) { - struct inode *inode = list_entry(delaying_queue->prev, - struct inode, i_list); + inode = list_entry(delaying_queue->prev, struct inode, i_list); if (older_than_this && inode_dirtied_after(inode, *older_than_this)) break; - list_move(&inode->i_list, dispatch_queue); + list_move(&inode->i_list, &tmp); + } + + /* Move inodes from one superblock together */ + while (!list_empty(&tmp)) { + inode = 
list_entry(tmp.prev, struct inode, i_list); + sb = inode->i_sb; + list_for_each_prev_safe(pos, node, &tmp) { + inode = list_entry(pos, struct inode, i_list); + if (inode->i_sb == sb) + list_move(&inode->i_list, dispatch_queue); + } } } -- cgit v1.2.2 From cf137307cd9827495b65e7d74ea2b610daa9898b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Sep 2009 15:12:57 +0200 Subject: writeback: don't resort for a single super_block in move_expired_inodes() If we only moved inodes from a single super_block to the temporary list, there's no point in doing a resort for multiple super_blocks. Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b27406d51bc7..225c7316344e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -336,17 +336,27 @@ static void move_expired_inodes(struct list_head *delaying_queue, { LIST_HEAD(tmp); struct list_head *pos, *node; - struct super_block *sb; + struct super_block *sb = NULL; struct inode *inode; + int do_sb_sort = 0; while (!list_empty(delaying_queue)) { inode = list_entry(delaying_queue->prev, struct inode, i_list); if (older_than_this && inode_dirtied_after(inode, *older_than_this)) break; + if (sb && sb != inode->i_sb) + do_sb_sort = 1; + sb = inode->i_sb; list_move(&inode->i_list, &tmp); } + /* just one sb in list, splice to dispatch_queue and we're done */ + if (!do_sb_sort) { + list_splice(&tmp, dispatch_queue); + return; + } + /* Move inodes from one superblock together */ while (!list_empty(&tmp)) { inode = list_entry(tmp.prev, struct inode, i_list); -- cgit v1.2.2 From 9ecc2738ac2371f88dff5d48914b4e35c45203cd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Sep 2009 15:25:11 +0200 Subject: writeback: make the super_block pinning more efficient Currently we pin the inode->i_sb for every single inode. This increases cache traffic on sb->s_umount sem. 
Let's instead cache the inode sb pin state and keep the super_block pinned for as long as we keep writing out inodes from the same super_block. Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 225c7316344e..c6bf775e641a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -540,6 +540,17 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) return ret; } +static void unpin_sb_for_writeback(struct super_block **psb) +{ + struct super_block *sb = *psb; + + if (sb) { + up_read(&sb->s_umount); + put_super(sb); + *psb = NULL; + } +} + /* * For WB_SYNC_NONE writeback, the caller does not have the sb pinned * before calling writeback. So make sure that we do pin it, so it doesn't @@ -549,10 +560,19 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * 1 if we failed. */ static int pin_sb_for_writeback(struct writeback_control *wbc, - struct inode *inode) + struct inode *inode, struct super_block **psb) { struct super_block *sb = inode->i_sb; + /* + * If this sb is already pinned, nothing more to do. 
If not and + * *psb is non-NULL, unpin the old one first + */ + if (sb == *psb) + return 0; + else if (*psb) + unpin_sb_for_writeback(psb); + /* * Caller must already hold the ref for this */ @@ -566,7 +586,7 @@ static int pin_sb_for_writeback(struct writeback_control *wbc, if (down_read_trylock(&sb->s_umount)) { if (sb->s_root) { spin_unlock(&sb_lock); - return 0; + goto pinned; } /* * umounted, drop rwsem again and fall through to failure @@ -577,24 +597,15 @@ static int pin_sb_for_writeback(struct writeback_control *wbc, sb->s_count--; spin_unlock(&sb_lock); return 1; -} - -static void unpin_sb_for_writeback(struct writeback_control *wbc, - struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - - if (wbc->sync_mode == WB_SYNC_ALL) - return; - - up_read(&sb->s_umount); - put_super(sb); +pinned: + *psb = sb; + return 0; } static void writeback_inodes_wb(struct bdi_writeback *wb, struct writeback_control *wbc) { - struct super_block *sb = wbc->sb; + struct super_block *sb = wbc->sb, *pin_sb = NULL; const int is_blkdev_sb = sb_is_blkdev_sb(sb); const unsigned long start = jiffies; /* livelock avoidance */ @@ -653,7 +664,7 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, if (inode_dirtied_after(inode, start)) break; - if (pin_sb_for_writeback(wbc, inode)) { + if (pin_sb_for_writeback(wbc, inode, &pin_sb)) { requeue_io(inode); continue; } @@ -662,7 +673,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, __iget(inode); pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); - unpin_sb_for_writeback(wbc, inode); if (wbc->pages_skipped != pages_skipped) { /* * writeback is not making progress due to locked @@ -682,6 +692,8 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, wbc->more_io = 1; } + unpin_sb_for_writeback(&pin_sb); + spin_unlock(&inode_lock); /* Leave any unwritten inodes on b_io */ } -- cgit v1.2.2 From b3af9468aebf5fcb573d0a116b31d2be1d43c0e9 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 
25 Sep 2009 06:04:10 +0200 Subject: writeback: don't delay inodes redirtied by a fast dirtier Debug traces show that in per-bdi writeback, the inode under writeback almost always gets redirtied by a busy dirtier. We used to call redirty_tail() in this case, which could delay the inode for up to 30s. This is unacceptable because it now happens so frequently for plain cp/dd, that the accumulated delays could make writeback of big files very slow. So let's distinguish between data redirty and metadata only redirty. The first one is caused by a busy dirtier, while the latter one could happen in XFS, NFS, etc. when they are doing delalloc or updating isize. The inode being busy dirtied will now be requeued for next io, while the inode being redirtied by fs will continue to be delayed to avoid repeated IO. CC: Jan Kara CC: Theodore Ts'o CC: Dave Chinner CC: Chris Mason CC: Christoph Hellwig Signed-off-by: Wu Fengguang Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c6bf775e641a..52aa54540079 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -474,10 +474,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & (I_FREEING | I_CLEAR))) { - if (inode->i_state & I_DIRTY) { + if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { /* - * Someone redirtied the inode while were writing back - * the pages. + * More pages get dirtied by a fast dirtier. + */ + goto select_queue; + } else if (inode->i_state & I_DIRTY) { + /* + * At least XFS will redirty the inode during the + * writeback (delalloc) and on io completion (isize). 
*/ redirty_tail(inode); } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { @@ -502,6 +507,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * soon as the queue becomes uncongested. */ inode->i_state |= I_DIRTY_PAGES; +select_queue: if (wbc->nr_to_write <= 0) { /* * slice used up: queue for next turn -- cgit v1.2.2 From 56a131dcf7ed36c3c6e36bea448b674ea85ed5bb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 25 Sep 2009 17:15:03 +0200 Subject: writeback: writeback_inodes_sb() should use bdi_start_writeback() Pointless to iterate other devices looking for a super, when we have a bdi mapping. Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 52aa54540079..fb61178c86e3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1206,7 +1206,7 @@ void writeback_inodes_sb(struct super_block *sb) nr_to_write = nr_dirty + nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused); - bdi_writeback_all(sb, nr_to_write); + bdi_start_writeback(sb->s_bdi, nr_to_write); } EXPORT_SYMBOL(writeback_inodes_sb); -- cgit v1.2.2