author		Filipe Manana <fdmanana@suse.com>	2014-09-02 06:09:58 -0400
committer	Chris Mason <clm@fb.com>		2014-09-19 09:57:50 -0400
commit		669249eea365dd32b793b58891c74281c0aac47e (patch)
tree		daa59316d2a0b1ce603bf129e850354a71ea4525 /fs/btrfs/file.c
parent		254a2d14fa77ac8a1b64f6171ec5f717f0753b53 (diff)
Btrfs: fix fsync race leading to invalid data after log replay
When the fsync callback (btrfs_sync_file) starts, it first waits for the writeback of any dirty pages to start and finish without holding the inode's mutex (to reduce contention). After this it acquires the inode's mutex and repeats that process via btrfs_wait_ordered_range only if we're doing a full sync (the BTRFS_INODE_NEEDS_FULL_SYNC flag is set on the inode).

This is not safe for a non-full sync - we need to start and wait for writeback to finish for any pages that might have been made dirty before acquiring the inode's mutex and after that first step mentioned before. Why this is needed is explained by the following comment added to btrfs_sync_file:

"Right before acquiring the inode's mutex, we might have new writes dirtying pages, which won't immediately start the respective ordered operations - that is done through the fill_delalloc callbacks invoked from the writepage and writepages address space operations. So make sure we start all ordered operations before starting to log our inode. Not doing this means that while logging the inode, writeback could start and invoke writepage/writepages, which would call the fill_delalloc callbacks (cow_file_range, submit_compressed_extents). These callbacks add first an extent map to the modified list of extents and then create the respective ordered operation, which means in tree-log.c:btrfs_log_inode() we might capture all existing ordered operations (with btrfs_get_logged_extents()) before the fill_delalloc callback adds its ordered operation, and by the time we visit the modified list of extent maps (with btrfs_log_changed_extents()), we see and process the extent map they created. We then use the extent map to construct a file extent item for logging without waiting for the respective ordered operation to finish - this file extent item points to a disk location that might not have yet been written to, containing random data - so after a crash a log replay will make our inode have file extent items that point to disk locations containing invalid data, as we returned success to userspace without waiting for the respective ordered operation to finish, because it wasn't captured by btrfs_get_logged_extents()."

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
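For illustration, here is a minimal userspace sketch (hypothetical - not a reproducer from the original report) of the kind of workload that could hit this race: one thread keeps dirtying new pages while another thread calls fsync() on the same file. The file path /mnt/btrfs/testfile is an assumption.

/*
 * Hypothetical sketch of the racy workload: a writer thread dirties
 * pages concurrently with fsync(). Before this fix, fsync() could log
 * file extent items pointing at disk locations whose data had not been
 * written yet, so a crash followed by log replay could expose invalid
 * data. The path /mnt/btrfs/testfile is an assumption.
 */
#include <fcntl.h>
#include <pthread.h>
#include <string.h>
#include <unistd.h>

static int fd;

static void *writer(void *arg)
{
	char buf[4096];
	off_t off;

	memset(buf, 0xab, sizeof(buf));
	/* Keep making new pages dirty while the other thread is in fsync(). */
	for (off = 0; off < 1024 * 4096; off += 4096)
		pwrite(fd, buf, sizeof(buf), off);
	return NULL;
}

int main(void)
{
	pthread_t t;

	fd = open("/mnt/btrfs/testfile", O_CREAT | O_WRONLY, 0644);
	if (fd < 0)
		return 1;
	pthread_create(&t, NULL, writer, NULL);
	/*
	 * The race window was between btrfs_sync_file's first writeback
	 * flush (done without i_mutex) and the logging done under i_mutex.
	 */
	fsync(fd);
	pthread_join(t, NULL);
	close(fd);
	return 0;
}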
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--	fs/btrfs/file.c	| 78
1 file changed, 63 insertions(+), 15 deletions(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d5d5060fe891..cdb71461e0fe 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1849,6 +1849,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+{
+	int ret;
+
+	atomic_inc(&BTRFS_I(inode)->sync_writers);
+	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+			     &BTRFS_I(inode)->runtime_flags))
+		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+	atomic_dec(&BTRFS_I(inode)->sync_writers);
+
+	return ret;
+}
+
 /*
  * fsync call for both files and directories.  This logs the inode into
  * the tree log instead of forcing full commits whenever possible.
@@ -1878,30 +1892,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * multi-task, and make the performance up.  See
 	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
 	 */
-	atomic_inc(&BTRFS_I(inode)->sync_writers);
-	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
-	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-			     &BTRFS_I(inode)->runtime_flags))
-		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
-	atomic_dec(&BTRFS_I(inode)->sync_writers);
+	ret = start_ordered_ops(inode, start, end);
 	if (ret)
 		return ret;
 
 	mutex_lock(&inode->i_mutex);
-
-	/*
-	 * We flush the dirty pages again to avoid some dirty pages in the
-	 * range being left.
-	 */
 	atomic_inc(&root->log_batch);
 	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			     &BTRFS_I(inode)->runtime_flags);
+	/*
+	 * We might have had more pages made dirty after calling
+	 * start_ordered_ops and before acquiring the inode's i_mutex.
+	 */
 	if (full_sync) {
+		/*
+		 * For a full sync, we need to make sure any ordered operations
+		 * start and finish before we start logging the inode, so that
+		 * all extents are persisted and the respective file extent
+		 * items are in the fs/subvol btree.
+		 */
 		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
-		if (ret) {
-			mutex_unlock(&inode->i_mutex);
-			goto out;
-		}
+	} else {
+		/*
+		 * Start any new ordered operations before starting to log the
+		 * inode. We will wait for them to finish in btrfs_sync_log().
+		 *
+		 * Right before acquiring the inode's mutex, we might have new
+		 * writes dirtying pages, which won't immediately start the
+		 * respective ordered operations - that is done through the
+		 * fill_delalloc callbacks invoked from the writepage and
+		 * writepages address space operations. So make sure we start
+		 * all ordered operations before starting to log our inode. Not
+		 * doing this means that while logging the inode, writeback
+		 * could start and invoke writepage/writepages, which would call
+		 * the fill_delalloc callbacks (cow_file_range,
+		 * submit_compressed_extents). These callbacks add first an
+		 * extent map to the modified list of extents and then create
+		 * the respective ordered operation, which means in
+		 * tree-log.c:btrfs_log_inode() we might capture all existing
+		 * ordered operations (with btrfs_get_logged_extents()) before
+		 * the fill_delalloc callback adds its ordered operation, and by
+		 * the time we visit the modified list of extent maps (with
+		 * btrfs_log_changed_extents()), we see and process the extent
+		 * map they created. We then use the extent map to construct a
+		 * file extent item for logging without waiting for the
+		 * respective ordered operation to finish - this file extent
+		 * item points to a disk location that might not have yet been
+		 * written to, containing random data - so after a crash a log
+		 * replay will make our inode have file extent items that point
+		 * to disk locations containing invalid data, as we returned
+		 * success to userspace without waiting for the respective
+		 * ordered operation to finish, because it wasn't captured by
+		 * btrfs_get_logged_extents().
+		 */
+		ret = start_ordered_ops(inode, start, end);
+	}
+	if (ret) {
+		mutex_unlock(&inode->i_mutex);
+		goto out;
 	}
 	atomic_inc(&root->log_batch);
 
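For reference, a simplified sketch of the btrfs_sync_file() ordering that results from this patch (not verbatim kernel code - error paths and the actual logging are elided; it only restates the control flow shown in the diff above):

/*
 * Simplified sketch of btrfs_sync_file() after this patch; not
 * verbatim kernel code - the rest of the function is elided.
 */
static int btrfs_sync_file_flow(struct inode *inode, loff_t start, loff_t end)
{
	int ret;

	/* 1) Flush dirty pages without i_mutex to reduce contention. */
	ret = start_ordered_ops(inode, start, end);
	if (ret)
		return ret;

	mutex_lock(&inode->i_mutex);

	if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
		     &BTRFS_I(inode)->runtime_flags)) {
		/* 2a) Full sync: start and finish all ordered operations. */
		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
	} else {
		/*
		 * 2b) Fast path: start ordered operations for any pages
		 * dirtied between step 1 and taking i_mutex, so that logging
		 * captures them; they are waited on in btrfs_sync_log().
		 */
		ret = start_ordered_ops(inode, start, end);
	}
	if (ret) {
		mutex_unlock(&inode->i_mutex);
		return ret;
	}
	/* ... log the inode and sync the log ... */
	mutex_unlock(&inode->i_mutex);
	return ret;
}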