1 files changed, 56 insertions, 31 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b78bbbac900d..30982bbd31c3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1811,22 +1811,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
        mutex_unlock(&inode->i_mutex);
        /*
-         * we want to make sure fsync finds this change
-         * but we haven't joined a transaction running right now.
-         *
-         * Later on, someone is sure to update the inode and get the
-         * real transid recorded.
-         *
-         * We set last_trans now to the fs_info generation + 1,
-         * this will either be one more than the running transaction
-         * or the generation used for the next transaction if there isn't
-         * one running right now.
-         *
         * We also have to set last_sub_trans to the current log transid,
         * otherwise subsequent syncs to a file that's been synced in this
         * transaction will appear to have already occured.
         */
-        BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
        BTRFS_I(inode)->last_sub_trans = root->log_transid;
        if (num_written > 0) {
                err = generic_write_sync(file, pos, num_written);
@@ -1959,25 +1947,37 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        atomic_inc(&root->log_batch);
        /*
-         * check the transaction that last modified this inode
+         * If the last transaction that changed this file was before the current
-         * and see if its already been committed
+         * transaction and we have the full sync flag set in our inode, we can
-         */
+         * bail out now without any syncing.
-        if (!BTRFS_I(inode)->last_trans) {
+         *
-                mutex_unlock(&inode->i_mutex);
+         * Note that we can't bail out if the full sync flag isn't set. This is
-                goto out;
+         * because when the full sync flag is set we start all ordered extents
-        }
+         * and wait for them to fully complete - when they complete they update
+         * the inode's last_trans field through:
-        /*
+         *
-         * if the last transaction that changed this file was before
+         *     btrfs_finish_ordered_io() ->
-         * the current transaction, we can bail out now without any
+         *         btrfs_update_inode_fallback() ->
-         * syncing
+         *             btrfs_update_inode() ->
+         *                 btrfs_set_inode_last_trans()
+         *
+         * So we are sure that last_trans is up to date and can do this check to
+         * bail out safely. For the fast path, when the full sync flag is not
+         * set in our inode, we can not do it because we start only our ordered
+         * extents and don't wait for them to complete (that is when
+         * btrfs_finish_ordered_io runs), so here at this point their last_trans
+         * value might be less than or equal to fs_info->last_trans_committed,
+         * and setting a speculative last_trans for an inode when a buffered
+         * write is made (such as fs_info->generation + 1 for example) would not
+         * be reliable since after setting the value and before fsync is called
+         * any number of transactions can start and commit (transaction kthread
+         * commits the current transaction periodically), and a transaction
+         * commit neither starts nor waits for ordered extents to complete.
         */
        smp_mb();
        if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
-            BTRFS_I(inode)->last_trans <=
+            (full_sync && BTRFS_I(inode)->last_trans <=
-            root->fs_info->last_trans_committed) {
+             root->fs_info->last_trans_committed)) {
-                BTRFS_I(inode)->last_trans = 0;
                /*
                 * We'v had everything committed since the last time we were
                 * modified so clear this flag in case it was set for whatever
@@ -2275,6 +2275,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        bool same_page;
        bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
        u64 ino_size;
+        bool truncated_page = false;
+        bool updated_inode = false;
        ret = btrfs_wait_ordered_range(inode, offset, len);
        if (ret)
@@ -2306,13 +2308,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
         * entire page.
         */
        if (same_page && len < PAGE_CACHE_SIZE) {
-                if (offset < ino_size)
+                if (offset < ino_size) {
+                        truncated_page = true;
                        ret = btrfs_truncate_page(inode, offset, len, 0);
+                } else {
+                        ret = 0;
+                }
                goto out_only_mutex;
        }
        /* zero back part of the first page */
        if (offset < ino_size) {
+                truncated_page = true;
                ret = btrfs_truncate_page(inode, offset, 0, 0);
                if (ret) {
                        mutex_unlock(&inode->i_mutex);
@@ -2348,6 +2355,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                if (!ret) {
                        /* zero the front end of the last page */
                        if (tail_start + tail_len < ino_size) {
+                                truncated_page = true;
                                ret = btrfs_truncate_page(inode,
                                                tail_start + tail_len, 0, 1);
                                if (ret)
@@ -2357,8 +2365,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        }
        if (lockend < lockstart) {
-                mutex_unlock(&inode->i_mutex);
+                ret = 0;
-                return 0;
+                goto out_only_mutex;
        }
        while (1) {
@@ -2506,6 +2514,7 @@ out_trans:
        trans->block_rsv = &root->fs_info->trans_block_rsv;
        ret = btrfs_update_inode(trans, root, inode);
+        updated_inode = true;
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root);
 out_free:
@@ -2515,6 +2524,22 @@ out:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                             &cached_state, GFP_NOFS);
 out_only_mutex:
+        if (!updated_inode && truncated_page && !ret && !err) {
+                /*
+                 * If we only end up zeroing part of a page, we still need to
+                 * update the inode item, so that all the time fields are
+                 * updated as well as the necessary btrfs inode in memory fields
+                 * for detecting, at fsync time, if the inode isn't yet in the
+                 * log tree or it's there but not up to date.
+                 */
+                trans = btrfs_start_transaction(root, 1);
+                if (IS_ERR(trans)) {
+                        err = PTR_ERR(trans);
+                } else {
+                        err = btrfs_update_inode(trans, root, inode);
+                        ret = btrfs_end_transaction(trans, root);
+                }
+        }
        mutex_unlock(&inode->i_mutex);
        if (ret && !err)
                err = ret;

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b78bbbac900d..30982bbd31c3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c
@@ -1811,22 +1811,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1811	mutex_unlock(&inode->i_mutex);	1811	mutex_unlock(&inode->i_mutex);
1812		1812
1813	/*	1813	/*
1814	* we want to make sure fsync finds this change
1815	* but we haven't joined a transaction running right now.
1816	*
1817	* Later on, someone is sure to update the inode and get the
1818	* real transid recorded.
1819	*
1820	* We set last_trans now to the fs_info generation + 1,
1821	* this will either be one more than the running transaction
1822	* or the generation used for the next transaction if there isn't
1823	* one running right now.
1824	*
1825	* We also have to set last_sub_trans to the current log transid,	1814	* We also have to set last_sub_trans to the current log transid,
1826	* otherwise subsequent syncs to a file that's been synced in this	1815	* otherwise subsequent syncs to a file that's been synced in this
1827	* transaction will appear to have already occured.	1816	* transaction will appear to have already occured.
1828	*/	1817	*/
1829	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1830	BTRFS_I(inode)->last_sub_trans = root->log_transid;	1818	BTRFS_I(inode)->last_sub_trans = root->log_transid;
1831	if (num_written > 0) {	1819	if (num_written > 0) {
1832	err = generic_write_sync(file, pos, num_written);	1820	err = generic_write_sync(file, pos, num_written);
@@ -1959,25 +1947,37 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1959	atomic_inc(&root->log_batch);	1947	atomic_inc(&root->log_batch);
1960		1948
1961	/*	1949	/*
1962	* check the transaction that last modified this inode	1950	* If the last transaction that changed this file was before the current
1963	* and see if its already been committed	1951	* transaction and we have the full sync flag set in our inode, we can
1964	*/	1952	* bail out now without any syncing.
1965	if (!BTRFS_I(inode)->last_trans) {	1953	*
1966	mutex_unlock(&inode->i_mutex);	1954	* Note that we can't bail out if the full sync flag isn't set. This is
1967	goto out;	1955	* because when the full sync flag is set we start all ordered extents
1968	}	1956	* and wait for them to fully complete - when they complete they update
1969		1957	* the inode's last_trans field through:
1970	/*	1958	*
1971	* if the last transaction that changed this file was before	1959	* btrfs_finish_ordered_io() ->
1972	* the current transaction, we can bail out now without any	1960	* btrfs_update_inode_fallback() ->
1973	* syncing	1961	* btrfs_update_inode() ->
		1962	* btrfs_set_inode_last_trans()
		1963	*
		1964	* So we are sure that last_trans is up to date and can do this check to
		1965	* bail out safely. For the fast path, when the full sync flag is not
		1966	* set in our inode, we can not do it because we start only our ordered
		1967	* extents and don't wait for them to complete (that is when
		1968	* btrfs_finish_ordered_io runs), so here at this point their last_trans
		1969	* value might be less than or equal to fs_info->last_trans_committed,
		1970	* and setting a speculative last_trans for an inode when a buffered
		1971	* write is made (such as fs_info->generation + 1 for example) would not
		1972	* be reliable since after setting the value and before fsync is called
		1973	* any number of transactions can start and commit (transaction kthread
		1974	* commits the current transaction periodically), and a transaction
		1975	* commit neither starts nor waits for ordered extents to complete.
1974	*/	1976	*/
1975	smp_mb();	1977	smp_mb();
1976	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||	1978	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
1977	BTRFS_I(inode)->last_trans <=	1979	(full_sync && BTRFS_I(inode)->last_trans <=
1978	root->fs_info->last_trans_committed) {	1980	root->fs_info->last_trans_committed)) {
1979	BTRFS_I(inode)->last_trans = 0;
1980
1981	/*	1981	/*
1982	* We'v had everything committed since the last time we were	1982	* We'v had everything committed since the last time we were
1983	* modified so clear this flag in case it was set for whatever	1983	* modified so clear this flag in case it was set for whatever
@@ -2275,6 +2275,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2275	bool same_page;	2275	bool same_page;
2276	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);	2276	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2277	u64 ino_size;	2277	u64 ino_size;
		2278	bool truncated_page = false;
		2279	bool updated_inode = false;
2278		2280
2279	ret = btrfs_wait_ordered_range(inode, offset, len);	2281	ret = btrfs_wait_ordered_range(inode, offset, len);
2280	if (ret)	2282	if (ret)
@@ -2306,13 +2308,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2306	* entire page.	2308	* entire page.
2307	*/	2309	*/
2308	if (same_page && len < PAGE_CACHE_SIZE) {	2310	if (same_page && len < PAGE_CACHE_SIZE) {
2309	if (offset < ino_size)	2311	if (offset < ino_size) {
		2312	truncated_page = true;
2310	ret = btrfs_truncate_page(inode, offset, len, 0);	2313	ret = btrfs_truncate_page(inode, offset, len, 0);
		2314	} else {
		2315	ret = 0;
		2316	}
2311	goto out_only_mutex;	2317	goto out_only_mutex;
2312	}	2318	}
2313		2319
2314	/* zero back part of the first page */	2320	/* zero back part of the first page */
2315	if (offset < ino_size) {	2321	if (offset < ino_size) {
		2322	truncated_page = true;
2316	ret = btrfs_truncate_page(inode, offset, 0, 0);	2323	ret = btrfs_truncate_page(inode, offset, 0, 0);
2317	if (ret) {	2324	if (ret) {
2318	mutex_unlock(&inode->i_mutex);	2325	mutex_unlock(&inode->i_mutex);
@@ -2348,6 +2355,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2348	if (!ret) {	2355	if (!ret) {
2349	/* zero the front end of the last page */	2356	/* zero the front end of the last page */
2350	if (tail_start + tail_len < ino_size) {	2357	if (tail_start + tail_len < ino_size) {
		2358	truncated_page = true;
2351	ret = btrfs_truncate_page(inode,	2359	ret = btrfs_truncate_page(inode,
2352	tail_start + tail_len, 0, 1);	2360	tail_start + tail_len, 0, 1);
2353	if (ret)	2361	if (ret)
@@ -2357,8 +2365,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2357	}	2365	}
2358		2366
2359	if (lockend < lockstart) {	2367	if (lockend < lockstart) {
2360	mutex_unlock(&inode->i_mutex);	2368	ret = 0;
2361	return 0;	2369	goto out_only_mutex;
2362	}	2370	}
2363		2371
2364	while (1) {	2372	while (1) {
@@ -2506,6 +2514,7 @@ out_trans:
2506		2514
2507	trans->block_rsv = &root->fs_info->trans_block_rsv;	2515	trans->block_rsv = &root->fs_info->trans_block_rsv;
2508	ret = btrfs_update_inode(trans, root, inode);	2516	ret = btrfs_update_inode(trans, root, inode);
		2517	updated_inode = true;
2509	btrfs_end_transaction(trans, root);	2518	btrfs_end_transaction(trans, root);
2510	btrfs_btree_balance_dirty(root);	2519	btrfs_btree_balance_dirty(root);
2511	out_free:	2520	out_free:
@@ -2515,6 +2524,22 @@ out:
2515	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,	2524	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2516	&cached_state, GFP_NOFS);	2525	&cached_state, GFP_NOFS);
2517	out_only_mutex:	2526	out_only_mutex:
		2527	if (!updated_inode && truncated_page && !ret && !err) {
		2528	/*
		2529	* If we only end up zeroing part of a page, we still need to
		2530	* update the inode item, so that all the time fields are
		2531	* updated as well as the necessary btrfs inode in memory fields
		2532	* for detecting, at fsync time, if the inode isn't yet in the
		2533	* log tree or it's there but not up to date.
		2534	*/
		2535	trans = btrfs_start_transaction(root, 1);
		2536	if (IS_ERR(trans)) {
		2537	err = PTR_ERR(trans);
		2538	} else {
		2539	err = btrfs_update_inode(trans, root, inode);
		2540	ret = btrfs_end_transaction(trans, root);
		2541	}
		2542	}
2518	mutex_unlock(&inode->i_mutex);	2543	mutex_unlock(&inode->i_mutex);
2519	if (ret && !err)	2544	if (ret && !err)
2520	err = ret;	2545	err = ret;