From 69b62d01ec44fe0d505d89917392347732135a4d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 17 May 2010 12:51:03 +0200 Subject: writeback: disable periodic old data writeback for !dirty_writeback_centisecs Prior to 2.6.32, setting /proc/sys/vm/dirty_writeback_centisecs disabled periodic dirty writeback from kupdate. This got broken and now causes excessive sys CPU usage if set to zero, as we'll keep beating on schedule(). Cc: stable@kernel.org Reported-by: Justin Maggard Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4b37f7cea4dd..760dc8d0b4ff 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -852,6 +852,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) unsigned long expired; long nr_pages; + /* + * When set to zero, disable periodic writeback + */ + if (!dirty_writeback_interval) + return 0; + expired = wb->last_old_flush + msecs_to_jiffies(dirty_writeback_interval * 10); if (time_before(jiffies, expired)) @@ -947,8 +953,12 @@ int bdi_writeback_task(struct bdi_writeback *wb) break; } - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); - schedule_timeout_interruptible(wait_jiffies); + if (dirty_writeback_interval) { + wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); + schedule_timeout_interruptible(wait_jiffies); + } else + schedule(); + try_to_freeze(); } -- cgit v1.2.2 From e913fc825dc685a444cb4c1d0f9d32f372f59861 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 17 May 2010 12:55:07 +0200 Subject: writeback: fix WB_SYNC_NONE writeback from umount When umount calls sync_filesystem(), we first do a WB_SYNC_NONE writeback to kick off writeback of pending dirty inodes, then follow that up with a WB_SYNC_ALL to wait for it. Since umount already holds the sb s_umount mutex, WB_SYNC_NONE ends up doing nothing and all writeback happens as WB_SYNC_ALL. This can greatly slow down umount, since WB_SYNC_ALL writeback is a data integrity operation and thus a bigger hammer than simple WB_SYNC_NONE. For barrier aware file systems it's a lot slower. Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 48 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 11 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 760dc8d0b4ff..67db89786e7d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -45,6 +45,7 @@ struct wb_writeback_args { int for_kupdate:1; int range_cyclic:1; int for_background:1; + int sb_pinned:1; }; /* @@ -230,6 +231,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, .sync_mode = WB_SYNC_ALL, .nr_pages = LONG_MAX, .range_cyclic = 0, + /* + * Setting sb_pinned is not necessary for WB_SYNC_ALL, but + * lets make it explicitly clear. + */ + .sb_pinned = 1, }; struct bdi_work work; @@ -245,21 +251,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, * @bdi: the backing device to write from * @sb: write inodes from this super_block * @nr_pages: the number of pages to write + * @sb_locked: caller already holds sb umount sem. * * Description: * This does WB_SYNC_NONE opportunistic writeback. The IO is only * started when this function returns, we make no guarentees on - * completion. Caller need not hold sb s_umount semaphore. + * completion. Caller specifies whether sb umount sem is held already or not. * */ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, - long nr_pages) + long nr_pages, int sb_locked) { struct wb_writeback_args args = { .sb = sb, .sync_mode = WB_SYNC_NONE, .nr_pages = nr_pages, .range_cyclic = 1, + .sb_pinned = sb_locked, }; /* @@ -577,7 +585,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, /* * Caller must already hold the ref for this */ - if (wbc->sync_mode == WB_SYNC_ALL) { + if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) { WARN_ON(!rwsem_is_locked(&sb->s_umount)); return SB_NOT_PINNED; } @@ -751,6 +759,7 @@ static long wb_writeback(struct bdi_writeback *wb, .for_kupdate = args->for_kupdate, .for_background = args->for_background, .range_cyclic = args->range_cyclic, + .sb_pinned = args->sb_pinned, }; unsigned long oldest_jif; long wrote = 0; @@ -1193,6 +1202,18 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); } +static void __writeback_inodes_sb(struct super_block *sb, int sb_locked) +{ + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + long nr_to_write; + + nr_to_write = nr_dirty + nr_unstable + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + + bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked); +} + /** * writeback_inodes_sb - writeback dirty inodes from given super_block * @sb: the superblock @@ -1204,17 +1225,22 @@ static void wait_sb_inodes(struct super_block *sb) */ void writeback_inodes_sb(struct super_block *sb) { - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - long nr_to_write; - - nr_to_write = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - - bdi_start_writeback(sb->s_bdi, sb, nr_to_write); + __writeback_inodes_sb(sb, 0); } EXPORT_SYMBOL(writeback_inodes_sb); +/** + * writeback_inodes_sb_locked - writeback dirty inodes from given super_block + * @sb: the superblock + * + * Like writeback_inodes_sb(), except the caller already holds the + * sb umount sem. + */ +void writeback_inodes_sb_locked(struct super_block *sb) +{ + __writeback_inodes_sb(sb, 1); +} + /** * writeback_inodes_sb_if_idle - start writeback if none underway * @sb: the superblock -- cgit v1.2.2 From 5547e8aac6f71505d621a612de2fca0dd988b439 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 7 May 2010 13:35:44 +0400 Subject: writeback: Update dirty flags in two steps Filesystems with delalloc support may dirty inode during writepages. As result inode will have dirty metadata flags even after write_inode. In fact we have two dedicated functions for proper data and metadata writeback. It is reasonable to separate flags updates in two stages. https://bugzilla.kernel.org/show_bug.cgi?id=15906 Signed-off-by: Dmitry Monakhov Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 67db89786e7d..0f629571234f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -460,11 +460,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) BUG_ON(inode->i_state & I_SYNC); - /* Set I_SYNC, reset I_DIRTY */ - dirty = inode->i_state & I_DIRTY; + /* Set I_SYNC, reset I_DIRTY_PAGES */ inode->i_state |= I_SYNC; - inode->i_state &= ~I_DIRTY; - + inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode_lock); ret = do_writepages(mapping, wbc); @@ -480,6 +478,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } + /* + * Some filesystems may redirty the inode during the writeback + * due to delalloc, clear dirty metadata flags right before + * write_inode() + */ + spin_lock(&inode_lock); + dirty = inode->i_state & I_DIRTY; + inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); + spin_unlock(&inode_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { int err = write_inode(inode, wbc); -- cgit v1.2.2 From 7c8a3554c683f512dbcee26faedb42e4c05f12fa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 May 2010 14:29:29 +0200 Subject: writeback: ensure that WB_SYNC_NONE writeback with sb pinned is sync Even if the writeout itself isn't a data integrity operation, we need to ensure that the caller doesn't drop the sb umount sem before we have actually done the writeback. This is a fixup for commit e913fc82. Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0f629571234f..76f546d56a64 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -193,7 +193,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work) } static void bdi_alloc_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_args *args) + struct wb_writeback_args *args, + int wait) { struct bdi_work *work; @@ -205,6 +206,8 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi, if (work) { bdi_work_init(work, args); bdi_queue_work(bdi, work); + if (wait) + bdi_wait_on_work_clear(work); } else { struct bdi_writeback *wb = &bdi->wb; @@ -279,7 +282,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, args.for_background = 1; } - bdi_alloc_queue_work(bdi, &args); + bdi_alloc_queue_work(bdi, &args, sb_locked); } /* @@ -909,6 +912,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) while ((work = get_next_work_item(bdi, wb)) != NULL) { struct wb_writeback_args args = work->args; + int post_clear; /* * Override sync mode, in case we must wait for completion @@ -916,11 +920,13 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) if (force_wait) work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; + post_clear = WB_SYNC_ALL || args.sb_pinned; + /* * If this isn't a data integrity operation, just notify * that we have seen this work and we are now starting it. */ - if (args.sync_mode == WB_SYNC_NONE) + if (!post_clear) wb_clear_pending(wb, work); wrote += wb_writeback(wb, &args); @@ -929,7 +935,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * This is a data integrity writeback, so only do the * notification when we have completed the work. */ - if (args.sync_mode == WB_SYNC_ALL) + if (post_clear) wb_clear_pending(wb, work); } @@ -1000,7 +1006,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages) if (!bdi_has_dirty_io(bdi)) continue; - bdi_alloc_queue_work(bdi, &args); + bdi_alloc_queue_work(bdi, &args, 0); } rcu_read_unlock(); -- cgit v1.2.2 From f9eadbbd424c083b8005c7b738f644611b9ef489 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 May 2010 14:31:45 +0200 Subject: writeback: bdi_writeback_task() must set task state before calling schedule() Calling schedule without setting the task state to non-running will return immediately, so ensure that we set it properly and check our sleep conditions after doing so. This is a fixup for commit 69b62d01. Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 76f546d56a64..437a7431b4ea 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -978,8 +978,13 @@ int bdi_writeback_task(struct bdi_writeback *wb) if (dirty_writeback_interval) { wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); schedule_timeout_interruptible(wait_jiffies); - } else - schedule(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty_careful(&wb->bdi->work_list) && + !kthread_should_stop()) + schedule(); + __set_current_state(TASK_RUNNING); + } try_to_freeze(); } -- cgit v1.2.2 From 52957fe1c709d5ca3732456d73f4e4d95492c72c Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Thu, 1 Apr 2010 20:36:30 -0500 Subject: fs-writeback.c: bitfields should be unsigned This fixes sparse noise: error: dubious one-bit signed bitfield Signed-off-by: H Hartley Sweeten Cc: Alexander Viro Signed-off-by: Al Viro --- fs/fs-writeback.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4b37f7cea4dd..24e85ce11891 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -42,9 +42,9 @@ struct wb_writeback_args { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; - int for_kupdate:1; - int range_cyclic:1; - int for_background:1; + unsigned int for_kupdate:1; + unsigned int range_cyclic:1; + unsigned int for_background:1; }; /* -- cgit v1.2.2