path: root/fs/btrfs/ordered-data.c
author	Chris Mason <chris.mason@oracle.com>	2008-07-22 11:18:09 -0400
committer	Chris Mason <chris.mason@oracle.com>	2008-09-25 11:04:05 -0400
commit	f421950f86bf96a11fef932e167ab2e70d4c43a0 (patch)
tree	a2b62b942b023e37b6aae39891c2b314d8d8a3fb /fs/btrfs/ordered-data.c
parent	a61e6f29dc7c9d56a776a518eed92bbc61848263 (diff)
Btrfs: Fix some data=ordered related data corruptions
Stress testing was showing data checksum errors, most of which were caused by a lookup bug in the extent_map tree. The tree was caching the last pointer returned, and searches would check the last pointer first.

But search callers also expect the search to return the very first matching extent in the range, which wasn't always true with the last-pointer usage.

For now, the code to cache the last return value is just removed. It is easy to fix, but I think lookups are rare enough that it isn't required anymore.

This commit also replaces do_sync_mapping_range with a local copy of the related functions.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
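To make the lookup bug concrete, here is a minimal user-space sketch. It is not the actual extent_map code; the structures, ranges, and the lookup() helper below are all hypothetical. It shows how consulting a cached last pointer first can return a matching extent that is not the first one in the searched range:

/*
 * Hypothetical sketch of the cached-last-pointer bug class described
 * above; none of these names exist in btrfs.
 */
#include <stdio.h>
#include <stddef.h>

struct extent { unsigned long start, end; };

/* Two adjacent extents covering [0,4095] and [4096,8191]. */
static struct extent extents[] = {
	{ 0, 4095 },
	{ 4096, 8191 },
};
static struct extent *cached;	/* last extent returned by lookup() */

static struct extent *lookup(unsigned long start, unsigned long end)
{
	size_t i;

	/* Buggy fast path: any overlap with the cached entry wins,
	 * even when an earlier extent also overlaps the range. */
	if (cached && cached->start <= end && cached->end >= start)
		return cached;

	for (i = 0; i < sizeof(extents) / sizeof(extents[0]); i++) {
		if (extents[i].start <= end && extents[i].end >= start)
			return cached = &extents[i];
	}
	return NULL;
}

int main(void)
{
	/* Warm the cache with the second extent... */
	lookup(5000, 5000);

	/* ...then search a range whose *first* match is extents[0]. */
	struct extent *em = lookup(0, 8191);

	/* Prints 4096, not the expected 0: a caller assuming the first
	 * matching extent would operate on the wrong data. */
	printf("got extent starting at %lu\n", em->start);
	return 0;
}

Dropping the cached-pointer fast path, as this commit does for the extent_map tree, restores the first-match guarantee at the cost of a full tree search on every lookup.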
Diffstat (limited to 'fs/btrfs/ordered-data.c')
-rw-r--r--	fs/btrfs/ordered-data.c	115
1 file changed, 97 insertions(+), 18 deletions(-)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 0d87795fdd8f..830dbaea6853 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -19,6 +19,8 @@
 #include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
 #include "ctree.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
@@ -307,12 +309,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
 	 * start IO on any dirty ones so the wait doesn't stall waiting
 	 * for pdflush to find them
 	 */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
-	do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
-#else
-	do_sync_mapping_range(inode->i_mapping, start, end,
-			      SYNC_FILE_RANGE_WRITE);
-#endif
+	btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
 	if (wait)
 		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
 						 &entry->flags));
@@ -327,28 +324,26 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 	u64 orig_end;
 	u64 wait_end;
 	struct btrfs_ordered_extent *ordered;
-	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
 
 	if (start + len < start) {
-		wait_end = (inode->i_size + mask) & ~mask;
-		orig_end = (u64)-1;
+		orig_end = INT_LIMIT(loff_t);
 	} else {
 		orig_end = start + len - 1;
-		wait_end = orig_end;
+		if (orig_end > INT_LIMIT(loff_t))
+			orig_end = INT_LIMIT(loff_t);
 	}
+	wait_end = orig_end;
 again:
 	/* start IO across the range first to instantiate any delalloc
 	 * extents
 	 */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
-	do_sync_file_range(file, start, wait_end, SYNC_FILE_RANGE_WRITE);
-#else
-	do_sync_mapping_range(inode->i_mapping, start, wait_end,
-			      SYNC_FILE_RANGE_WRITE);
-#endif
-	end = orig_end;
-	wait_on_extent_writeback(&BTRFS_I(inode)->io_tree, start, orig_end);
+	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
+
+	btrfs_wait_on_page_writeback_range(inode->i_mapping,
+					   start >> PAGE_CACHE_SHIFT,
+					   orig_end >> PAGE_CACHE_SHIFT);
 
+	end = orig_end;
 	while(1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, end);
 		if (!ordered) {
@@ -565,3 +560,87 @@ out:
 	return ret;
 }
 
+
+/**
+ * taken from mm/filemap.c because it isn't exported
+ *
+ * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
+ * @mapping:	address space structure to write
+ * @start:	offset in bytes where the range starts
+ * @end:	offset in bytes where the range ends (inclusive)
+ * @sync_mode:	enable synchronous operation
+ *
+ * Start writeback against all of a mapping's dirty pages that lie
+ * within the byte offsets <start, end> inclusive.
+ *
+ * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
+ * opposed to a regular memory cleansing writeback.  The difference between
+ * these two operations is that if a dirty page/buffer is encountered, it must
+ * be waited upon, and not just skipped over.
+ */
+int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
+			   loff_t end, int sync_mode)
+{
+	struct writeback_control wbc = {
+		.sync_mode = sync_mode,
+		.nr_to_write = mapping->nrpages * 2,
+		.range_start = start,
+		.range_end = end,
+		.for_writepages = 1,
+	};
+	return btrfs_writepages(mapping, &wbc);
+}
+
+/**
+ * taken from mm/filemap.c because it isn't exported
+ *
+ * wait_on_page_writeback_range - wait for writeback to complete
+ * @mapping:	target address_space
+ * @start:	beginning page index
+ * @end:	ending page index
+ *
+ * Wait for writeback to complete against pages indexed by start->end
+ * inclusive
+ */
+int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
+				       pgoff_t start, pgoff_t end)
+{
+	struct pagevec pvec;
+	int nr_pages;
+	int ret = 0;
+	pgoff_t index;
+
+	if (end < start)
+		return 0;
+
+	pagevec_init(&pvec, 0);
+	index = start;
+	while ((index <= end) &&
+	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+			PAGECACHE_TAG_WRITEBACK,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
+		unsigned i;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/* until radix tree lookup accepts end_index */
+			if (page->index > end)
+				continue;
+
+			wait_on_page_writeback(page);
+			if (PageError(page))
+				ret = -EIO;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	/* Check for outstanding write errors */
+	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+		ret = -ENOSPC;
+	if (test_and_clear_bit(AS_EIO, &mapping->flags))
+		ret = -EIO;
+
+	return ret;
+}
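Taken together, the two copied helpers reproduce the flush-then-wait behavior that the removed do_sync_mapping_range(SYNC_FILE_RANGE_WRITE) call provided. A sketch of the pairing, mirroring the btrfs_wait_ordered_range() hunk above; the wrapper name is hypothetical and not part of this patch:

/*
 * Sketch only: how the helpers above pair up.  The function name is
 * invented for illustration; the patch calls the two helpers inline.
 */
static int btrfs_flush_and_wait_range(struct address_space *mapping,
				      u64 start, u64 end)
{
	int ret;

	/* Start non-blocking writeback across the byte range so any
	 * delalloc extents are instantiated and pages enter the
	 * writeback state. */
	ret = btrfs_fdatawrite_range(mapping, start, end, WB_SYNC_NONE);
	if (ret)
		return ret;

	/* Block until every page in the range leaves writeback.  The
	 * wait helper takes page indexes, not byte offsets, hence the
	 * PAGE_CACHE_SHIFT conversions. */
	return btrfs_wait_on_page_writeback_range(mapping,
						  start >> PAGE_CACHE_SHIFT,
						  end >> PAGE_CACHE_SHIFT);
}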