Btrfs: Avoid writeback stalls

While building large bios in writepages, btrfs may end up waiting for other page writeback to finish if WB_SYNC_ALL is used. While it is waiting, the bio it is building has a number of pages with the writeback bit set and they aren't getting to the disk any time soon. This patch lowers writeback latencies in general by sending down the bio being built before waiting for other pages. The bio submission code tries to limit the total number of async bios in flight by waiting when we're over a certain number of async bios. But the waits are happening while writepages is building bios, and this can easily lead to stalls and other problems for people calling wait_on_page_writeback. The current fix is to let the congestion tests take care of waiting. sync() and others make sure to drain the current async requests to make sure that everything that was pending when the sync was started really gets to disk. The code would drain pending requests both before and after submitting a new request. But if one of the requests is waiting for page writeback to finish, the draining waits might block that page writeback. This changes the draining code to only wait after submitting the bio being processed. Signed-off-by: Chris Mason <chris.mason@oracle.com>
author: Chris Mason <chris.mason@oracle.com> 2008-11-19 12:44:22 -0500
committer: Chris Mason <chris.mason@oracle.com> 2008-11-19 12:44:22 -0500
commit: d2c3f4f695edac4d75c1b3eb01a1d16072de63bb (patch)
tree: 14a8dd519d067adbe16e8adb7342343529eb5c75
parent: 105d931d482b7d1b1b2dd4b0ea30365db8630b9f (diff)
3 files changed, 23 insertions, 20 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0e8d31274c92..8d03e4a3c4e9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -538,15 +538,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
        async->work.flags = 0;
        async->bio_flags = bio_flags;
-        while(atomic_read(&fs_info->async_submit_draining) &&
-              atomic_read(&fs_info->nr_async_submits)) {
-                wait_event(fs_info->async_submit_wait,
-                           (atomic_read(&fs_info->nr_async_submits) == 0));
-        }
        atomic_inc(&fs_info->nr_async_submits);
        btrfs_queue_worker(&fs_info->workers, &async->work);
+#if 0
        if (atomic_read(&fs_info->nr_async_submits) > limit) {
                wait_event_timeout(fs_info->async_submit_wait,
                           (atomic_read(&fs_info->nr_async_submits) < limit),
@@ -556,7 +550,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
                           (atomic_read(&fs_info->nr_async_bios) < limit),
                           HZ/10);
        }
+#endif
        while(atomic_read(&fs_info->async_submit_draining) &&
              atomic_read(&fs_info->nr_async_submits)) {
                wait_event(fs_info->async_submit_wait,
@@ -1765,11 +1759,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        ret = btrfs_cleanup_reloc_trees(tree_root);
        BUG_ON(ret);
+read_fs_root:
        location.objectid = BTRFS_FS_TREE_OBJECTID;
        location.type = BTRFS_ROOT_ITEM_KEY;
        location.offset = (u64)-1;
-read_fs_root:
        fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
        if (!fs_info->fs_root)
                goto fail_cleaner;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 54d013c3bb88..a0f3804efe4f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2398,7 +2398,8 @@ update_nr_written:
 int extent_write_cache_pages(struct extent_io_tree *tree,
                             struct address_space *mapping,
                             struct writeback_control *wbc,
-                             writepage_t writepage, void *data)
+                             writepage_t writepage, void *data,
+                             void (*flush_fn)(void *))
 {
        struct backing_dev_info *bdi = mapping->backing_dev_info;
        int ret = 0;
@@ -2460,8 +2461,10 @@ retry:
                                continue;
                        }
-                        if (wbc->sync_mode != WB_SYNC_NONE)
+                        if (wbc->sync_mode != WB_SYNC_NONE) {
+                                flush_fn(data);
                                wait_on_page_writeback(page);
+                        }
                        if (PageWriteback(page) ||
                            !clear_page_dirty_for_io(page)) {
@@ -2498,6 +2501,15 @@ retry:
 }
 EXPORT_SYMBOL(extent_write_cache_pages);
+static noinline void flush_write_bio(void *data)
+{
+        struct extent_page_data *epd = data;
+        if (epd->bio) {
+                submit_one_bio(WRITE, epd->bio, 0, 0);
+                epd->bio = NULL;
+        }
+}
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
                          get_extent_t *get_extent,
                          struct writeback_control *wbc)
@@ -2523,7 +2535,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
        ret = __extent_writepage(page, wbc, &epd);
        extent_write_cache_pages(tree, mapping, &wbc_writepages,
-                                 __extent_writepage, &epd);
+                                 __extent_writepage, &epd, flush_write_bio);
        if (epd.bio) {
                submit_one_bio(WRITE, epd.bio, 0, 0);
        }
@@ -2592,7 +2604,8 @@ int extent_writepages(struct extent_io_tree *tree,
        };
        ret = extent_write_cache_pages(tree, mapping, wbc,
-                                       __extent_writepage, &epd);
+                                       __extent_writepage, &epd,
+                                       flush_write_bio);
        if (epd.bio) {
                submit_one_bio(WRITE, epd.bio, 0, 0);
        }
@@ -3087,6 +3100,9 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
+                if (!set && !PageDirty(page))
+                        continue;
                lock_page(page);
                if (i == 0)
                        set_page_extent_head(page, eb->len);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6854bf41856a..806caacff86c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -883,13 +883,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                async_cow->work.ordered_free = async_cow_free;
                async_cow->work.flags = 0;
-                while(atomic_read(&root->fs_info->async_submit_draining) &&
-                      atomic_read(&root->fs_info->async_delalloc_pages)) {
-                        wait_event(root->fs_info->async_submit_wait,
-                             (atomic_read(&root->fs_info->async_delalloc_pages)
-                              == 0));
-                }
                nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
                        PAGE_CACHE_SHIFT;
                atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
author	Chris Mason <chris.mason@oracle.com>	2008-11-19 12:44:22 -0500
committer	Chris Mason <chris.mason@oracle.com>	2008-11-19 12:44:22 -0500
commit	d2c3f4f695edac4d75c1b3eb01a1d16072de63bb (patch)
tree	14a8dd519d067adbe16e8adb7342343529eb5c75
parent	105d931d482b7d1b1b2dd4b0ea30365db8630b9f (diff)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0e8d31274c92..8d03e4a3c4e9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c
@@ -538,15 +538,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
538	async->work.flags = 0;	538	async->work.flags = 0;
539	async->bio_flags = bio_flags;	539	async->bio_flags = bio_flags;
540		540
541	while(atomic_read(&fs_info->async_submit_draining) &&
542	atomic_read(&fs_info->nr_async_submits)) {
543	wait_event(fs_info->async_submit_wait,
544	(atomic_read(&fs_info->nr_async_submits) == 0));
545	}
546
547	atomic_inc(&fs_info->nr_async_submits);	541	atomic_inc(&fs_info->nr_async_submits);
548	btrfs_queue_worker(&fs_info->workers, &async->work);	542	btrfs_queue_worker(&fs_info->workers, &async->work);
549		543	#if 0
550	if (atomic_read(&fs_info->nr_async_submits) > limit) {	544	if (atomic_read(&fs_info->nr_async_submits) > limit) {
551	wait_event_timeout(fs_info->async_submit_wait,	545	wait_event_timeout(fs_info->async_submit_wait,
552	(atomic_read(&fs_info->nr_async_submits) < limit),	546	(atomic_read(&fs_info->nr_async_submits) < limit),
@@ -556,7 +550,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
556	(atomic_read(&fs_info->nr_async_bios) < limit),	550	(atomic_read(&fs_info->nr_async_bios) < limit),
557	HZ/10);	551	HZ/10);
558	}	552	}
559		553	#endif
560	while(atomic_read(&fs_info->async_submit_draining) &&	554	while(atomic_read(&fs_info->async_submit_draining) &&
561	atomic_read(&fs_info->nr_async_submits)) {	555	atomic_read(&fs_info->nr_async_submits)) {
562	wait_event(fs_info->async_submit_wait,	556	wait_event(fs_info->async_submit_wait,
@@ -1765,11 +1759,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1765	ret = btrfs_cleanup_reloc_trees(tree_root);	1759	ret = btrfs_cleanup_reloc_trees(tree_root);
1766	BUG_ON(ret);	1760	BUG_ON(ret);
1767		1761
		1762	read_fs_root:
1768	location.objectid = BTRFS_FS_TREE_OBJECTID;	1763	location.objectid = BTRFS_FS_TREE_OBJECTID;
1769	location.type = BTRFS_ROOT_ITEM_KEY;	1764	location.type = BTRFS_ROOT_ITEM_KEY;
1770	location.offset = (u64)-1;	1765	location.offset = (u64)-1;
1771		1766
1772	read_fs_root:
1773	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);	1767	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
1774	if (!fs_info->fs_root)	1768	if (!fs_info->fs_root)
1775	goto fail_cleaner;	1769	goto fail_cleaner;


diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 54d013c3bb88..a0f3804efe4f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c
@@ -2398,7 +2398,8 @@ update_nr_written:
2398	int extent_write_cache_pages(struct extent_io_tree *tree,	2398	int extent_write_cache_pages(struct extent_io_tree *tree,
2399	struct address_space *mapping,	2399	struct address_space *mapping,
2400	struct writeback_control *wbc,	2400	struct writeback_control *wbc,
2401	writepage_t writepage, void *data)	2401	writepage_t writepage, void *data,
		2402	void (*flush_fn)(void *))
2402	{	2403	{
2403	struct backing_dev_info *bdi = mapping->backing_dev_info;	2404	struct backing_dev_info *bdi = mapping->backing_dev_info;
2404	int ret = 0;	2405	int ret = 0;
@@ -2460,8 +2461,10 @@ retry:
2460	continue;	2461	continue;
2461	}	2462	}
2462		2463
2463	if (wbc->sync_mode != WB_SYNC_NONE)	2464	if (wbc->sync_mode != WB_SYNC_NONE) {
		2465	flush_fn(data);
2464	wait_on_page_writeback(page);	2466	wait_on_page_writeback(page);
		2467	}
2465		2468
2466	if (PageWriteback(page) ||	2469	if (PageWriteback(page) ||
2467	!clear_page_dirty_for_io(page)) {	2470	!clear_page_dirty_for_io(page)) {
@@ -2498,6 +2501,15 @@ retry:
2498	}	2501	}
2499	EXPORT_SYMBOL(extent_write_cache_pages);	2502	EXPORT_SYMBOL(extent_write_cache_pages);
2500		2503
		2504	static noinline void flush_write_bio(void *data)
		2505	{
		2506	struct extent_page_data *epd = data;
		2507	if (epd->bio) {
		2508	submit_one_bio(WRITE, epd->bio, 0, 0);
		2509	epd->bio = NULL;
		2510	}
		2511	}
		2512
2501	int extent_write_full_page(struct extent_io_tree *tree, struct page *page,	2513	int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2502	get_extent_t *get_extent,	2514	get_extent_t *get_extent,
2503	struct writeback_control *wbc)	2515	struct writeback_control *wbc)
@@ -2523,7 +2535,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2523	ret = __extent_writepage(page, wbc, &epd);	2535	ret = __extent_writepage(page, wbc, &epd);
2524		2536
2525	extent_write_cache_pages(tree, mapping, &wbc_writepages,	2537	extent_write_cache_pages(tree, mapping, &wbc_writepages,
2526	__extent_writepage, &epd);	2538	__extent_writepage, &epd, flush_write_bio);
2527	if (epd.bio) {	2539	if (epd.bio) {
2528	submit_one_bio(WRITE, epd.bio, 0, 0);	2540	submit_one_bio(WRITE, epd.bio, 0, 0);
2529	}	2541	}
@@ -2592,7 +2604,8 @@ int extent_writepages(struct extent_io_tree *tree,
2592	};	2604	};
2593		2605
2594	ret = extent_write_cache_pages(tree, mapping, wbc,	2606	ret = extent_write_cache_pages(tree, mapping, wbc,
2595	__extent_writepage, &epd);	2607	__extent_writepage, &epd,
		2608	flush_write_bio);
2596	if (epd.bio) {	2609	if (epd.bio) {
2597	submit_one_bio(WRITE, epd.bio, 0, 0);	2610	submit_one_bio(WRITE, epd.bio, 0, 0);
2598	}	2611	}
@@ -3087,6 +3100,9 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3087		3100
3088	for (i = 0; i < num_pages; i++) {	3101	for (i = 0; i < num_pages; i++) {
3089	page = extent_buffer_page(eb, i);	3102	page = extent_buffer_page(eb, i);
		3103	if (!set && !PageDirty(page))
		3104	continue;
		3105
3090	lock_page(page);	3106	lock_page(page);
3091	if (i == 0)	3107	if (i == 0)
3092	set_page_extent_head(page, eb->len);	3108	set_page_extent_head(page, eb->len);


diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6854bf41856a..806caacff86c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c
@@ -883,13 +883,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
883	async_cow->work.ordered_free = async_cow_free;	883	async_cow->work.ordered_free = async_cow_free;
884	async_cow->work.flags = 0;	884	async_cow->work.flags = 0;
885		885
886	while(atomic_read(&root->fs_info->async_submit_draining) &&
887	atomic_read(&root->fs_info->async_delalloc_pages)) {
888	wait_event(root->fs_info->async_submit_wait,
889	(atomic_read(&root->fs_info->async_delalloc_pages)
890	== 0));
891	}
892
893	nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>	886	nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
894	PAGE_CACHE_SHIFT;	887	PAGE_CACHE_SHIFT;
895	atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);	888	atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);