author      Chris Mason <clm@fb.com>          2018-06-20 10:56:11 -0400
committer   David Sterba <dsterba@suse.com>   2018-10-15 11:23:35 -0400
commit      7703bdd8d23e6ef057af3253958a793ec6066b28
tree        7adb511b8dbb2180961ea78d64d1bf79cce8ada3
parent      818255feece6e2a432328020d78c8a81a153ce65
Btrfs: don't clean dirty pages during buffered writes
During buffered writes, we follow this basic series of steps:

again:
	lock all the pages
	wait for writeback on all the pages
	Take the extent range lock
	wait for ordered extents on the whole range
	clean all the pages

	if (copy_from_user_in_atomic() hits a fault) {
		drop our locks
		goto again;
	}

	dirty all the pages
	release all the locks

The extra waiting, cleaning and locking are there to make sure we don't
modify pages in flight to the drive, after they've been crc'd.

If some of the pages in the range were already dirty when the write
began, and we need to goto again, we create a window where a dirty page
has been cleaned and unlocked.  It may be reclaimed before we're able
to lock it again, which means we'll read the old contents off the drive
and lose any modifications that had been pending writeback.

We don't actually need to clean the pages.  All of the other locking in
place makes sure we don't start IO on the pages, so we can just leave
them dirty for the duration of the write.

Fixes: 73d59314e6ed (the original btrfs merge)
CC: stable@vger.kernel.org # v4.4+
Signed-off-by: Chris Mason <clm@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r--	fs/btrfs/file.c	29
1 file changed, 23 insertions(+), 6 deletions(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d254cf94545f..15b925142793 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -531,6 +531,14 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
 
 	end_of_last_block = start_pos + num_bytes - 1;
 
+	/*
+	 * The pages may have already been dirty, clear out old accounting so
+	 * we can set things up properly
+	 */
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
+			 EXTENT_DIRTY | EXTENT_DELALLOC |
+			 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached);
+
 	if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
 		if (start_pos >= isize &&
 		    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
@@ -1500,18 +1508,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
 		}
 		if (ordered)
 			btrfs_put_ordered_extent(ordered);
-		clear_extent_bit(&inode->io_tree, start_pos, last_pos,
-				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-				 0, 0, cached_state);
+
 		*lockstart = start_pos;
 		*lockend = last_pos;
 		ret = 1;
 	}
 
+	/*
+	 * It's possible the pages are dirty right now, but we don't want
+	 * to clean them yet because copy_from_user may catch a page fault
+	 * and we might have to fall back to one page at a time.  If that
+	 * happens, we'll unlock these pages and we'd have a window where
+	 * reclaim could sneak in and drop the once-dirty page on the floor
+	 * without writing it.
+	 *
+	 * We have the pages locked and the extent range locked, so there's
+	 * no way someone can start IO on any dirty pages in this range.
+	 *
+	 * We'll call btrfs_dirty_pages() later on, and that will flip around
+	 * delalloc bits and dirty the pages as required.
+	 */
 	for (i = 0; i < num_pages; i++) {
-		if (clear_page_dirty_for_io(pages[i]))
-			account_page_redirty(pages[i]);
 		set_page_extent_mapped(pages[i]);
 		WARN_ON(!PageLocked(pages[i]));
 	}