aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2019-02-27 08:42:30 -0500
committerDavid Sterba <dsterba@suse.com>2019-03-13 12:13:48 -0400
commit609e804d771f59dc5d45a93e5ee0053c74bbe2bf (patch)
treea6f8e898d3edb1b23349123fbb37c64085070059
parent2cc8334270e281815c3850c3adea363c51f21e0d (diff)
Btrfs: fix file corruption after snapshotting due to mix of buffered/DIO writes
When we are mixing buffered writes with direct IO writes against the same file and snapshotting is happening concurrently, we can end up with a corrupt file content in the snapshot. Example: 1) Inode/file is empty. 2) Snapshotting starts. 2) Buffered write at offset 0 length 256Kb. This updates the i_size of the inode to 256Kb, disk_i_size remains zero. This happens after the task doing the snapshot flushes all existing delalloc. 3) DIO write at offset 256Kb length 768Kb. Once the ordered extent completes it sets the inode's disk_i_size to 1Mb (256Kb + 768Kb) and updates the inode item in the fs tree with a size of 1Mb (which is the value of disk_i_size). 4) The dealloc for the range [0, 256Kb[ did not start yet. 5) The transaction used in the DIO ordered extent completion, which updated the inode item, is committed by the snapshotting task. 6) Snapshot creation completes. 7) Dealloc for the range [0, 256Kb[ is flushed. After that when reading the file from the snapshot we always get zeroes for the range [0, 256Kb[, the file has a size of 1Mb and the data written by the direct IO write is found. From an application's point of view this is a corruption, since in the source subvolume it could never read a version of the file that included the data from the direct IO write without the data from the buffered write included as well. In the snapshot's tree, file extent items are missing for the range [0, 256Kb[. The issue, obviously, does not happen when using the -o flushoncommit mount option. Fix this by flushing delalloc for all the roots that are about to be snapshotted when committing a transaction. This guarantees total ordering when updating the disk_i_size of an inode since the flush for dealloc is done when a transaction is in the TRANS_STATE_COMMIT_START state and wait is done once no more external writers exist. This is similar to what we do when using the flushoncommit mount option, but we do it only if the transaction has snapshots to create and only for the roots of the subvolumes to be snapshotted. The bulk of the dealloc is flushed in the snapshot creation ioctl, so the flush work we do inside the transaction is minimized. This issue, involving buffered and direct IO writes with snapshotting, is often triggered by fstest btrfs/078, and got reported by fsck when not using the NO_HOLES features, for example: $ cat results/btrfs/078.full (...) _check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent *** fsck.btrfs output *** [1/7] checking root items [2/7] checking extents [3/7] checking free space cache [4/7] checking fs roots root 258 inode 264 errors 100, file extent discount Found file extent holes: start: 524288, len: 65536 ERROR: errors found in fs roots Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r--fs/btrfs/transaction.c49
1 files changed, 43 insertions, 6 deletions
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index acdad6d658f5..e4e665f422fc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1886,8 +1886,10 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
1886 } 1886 }
1887} 1887}
1888 1888
1889static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) 1889static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans)
1890{ 1890{
1891 struct btrfs_fs_info *fs_info = trans->fs_info;
1892
1891 /* 1893 /*
1892 * We use writeback_inodes_sb here because if we used 1894 * We use writeback_inodes_sb here because if we used
1893 * btrfs_start_delalloc_roots we would deadlock with fs freeze. 1895 * btrfs_start_delalloc_roots we would deadlock with fs freeze.
@@ -1897,15 +1899,50 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1897 * from already being in a transaction and our join_transaction doesn't 1899 * from already being in a transaction and our join_transaction doesn't
1898 * have to re-take the fs freeze lock. 1900 * have to re-take the fs freeze lock.
1899 */ 1901 */
1900 if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) 1902 if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
1901 writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); 1903 writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
1904 } else {
1905 struct btrfs_pending_snapshot *pending;
1906 struct list_head *head = &trans->transaction->pending_snapshots;
1907
1908 /*
1909 * Flush dellaloc for any root that is going to be snapshotted.
1910 * This is done to avoid a corrupted version of files, in the
1911 * snapshots, that had both buffered and direct IO writes (even
1912 * if they were done sequentially) due to an unordered update of
1913 * the inode's size on disk.
1914 */
1915 list_for_each_entry(pending, head, list) {
1916 int ret;
1917
1918 ret = btrfs_start_delalloc_snapshot(pending->root);
1919 if (ret)
1920 return ret;
1921 }
1922 }
1902 return 0; 1923 return 0;
1903} 1924}
1904 1925
1905static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) 1926static inline void btrfs_wait_delalloc_flush(struct btrfs_trans_handle *trans)
1906{ 1927{
1907 if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) 1928 struct btrfs_fs_info *fs_info = trans->fs_info;
1929
1930 if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
1908 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); 1931 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
1932 } else {
1933 struct btrfs_pending_snapshot *pending;
1934 struct list_head *head = &trans->transaction->pending_snapshots;
1935
1936 /*
1937 * Wait for any dellaloc that we started previously for the roots
1938 * that are going to be snapshotted. This is to avoid a corrupted
1939 * version of files in the snapshots that had both buffered and
1940 * direct IO writes (even if they were done sequentially).
1941 */
1942 list_for_each_entry(pending, head, list)
1943 btrfs_wait_ordered_extents(pending->root,
1944 U64_MAX, 0, U64_MAX);
1945 }
1909} 1946}
1910 1947
1911int btrfs_commit_transaction(struct btrfs_trans_handle *trans) 1948int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
@@ -2023,7 +2060,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
2023 2060
2024 extwriter_counter_dec(cur_trans, trans->type); 2061 extwriter_counter_dec(cur_trans, trans->type);
2025 2062
2026 ret = btrfs_start_delalloc_flush(fs_info); 2063 ret = btrfs_start_delalloc_flush(trans);
2027 if (ret) 2064 if (ret)
2028 goto cleanup_transaction; 2065 goto cleanup_transaction;
2029 2066
@@ -2039,7 +2076,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
2039 if (ret) 2076 if (ret)
2040 goto cleanup_transaction; 2077 goto cleanup_transaction;
2041 2078
2042 btrfs_wait_delalloc_flush(fs_info); 2079 btrfs_wait_delalloc_flush(trans);
2043 2080
2044 btrfs_scrub_pause(fs_info); 2081 btrfs_scrub_pause(fs_info);
2045 /* 2082 /*