aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2016-02-12 06:34:23 -0500
committerChris Mason <clm@fb.com>2016-03-01 11:23:29 -0500
commit2be63d5ce929603d4e7cedabd9e992eb34a0ff95 (patch)
tree2d3077ab32bed985cc35beb05ae4bddc90ddc1d9 /fs/btrfs/tree-log.c
parent1ec9a1ae1e30c733077c0b288c4301b66b7a81f2 (diff)
Btrfs: fix file loss on log replay after renaming a file and fsync
We have two cases where we end up deleting a file at log replay time when we should not. For this to happen the file must have been renamed and a directory inode must have been fsynced/logged. Two examples that exercise these two cases are listed below.

Case 1)

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt
  $ mkdir -p /mnt/a/b
  $ mkdir /mnt/c
  $ touch /mnt/a/b/foo
  $ sync
  $ mv /mnt/a/b/foo /mnt/c/
  # Create file bar just to make sure the fsync on directory a/ does
  # something and it's not a no-op.
  $ touch /mnt/a/bar
  $ xfs_io -c "fsync" /mnt/a
  < power fail / crash >

The next time the filesystem is mounted, the log replay procedure deletes file foo.

Case 2)

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt
  $ mkdir /mnt/a
  $ mkdir /mnt/b
  $ mkdir /mnt/c
  $ touch /mnt/a/foo
  $ ln /mnt/a/foo /mnt/b/foo_link
  $ touch /mnt/b/bar
  $ sync
  $ unlink /mnt/b/foo_link
  $ mv /mnt/b/bar /mnt/c/
  $ xfs_io -c "fsync" /mnt/a/foo
  < power fail / crash >

The next time the filesystem is mounted, the log replay procedure deletes file bar.

The reason why the files are deleted is because when we log inodes other than the fsync target inode, we ignore their last_unlink_trans value and leave the log without enough information to later replay the rename operations. So we need to look at the last_unlink_trans values and fall back to a transaction commit if they are greater than the id of the last committed transaction. So fix this by looking at the last_unlink_trans values and fall back to transaction commits when needed.

Also, when logging other inodes (for case 1 we logged descendants of the fsync target inode while for case 2 we logged ascendants) we need to care about concurrent tasks updating the last_unlink_trans of inodes we are logging (which was already an existing problem in check_parent_dirs_for_sync()).
Since we can not acquire their inode mutex (vfs' struct inode ->i_mutex), as that causes deadlocks with other concurrent operations that acquire the i_mutex of 2 inodes (other fsyncs or renames for example), we need to serialize on the log_mutex of the inode we are logging. A task setting a new value for an inode's last_unlink_trans must acquire the inode's log_mutex and it must do this update before doing the actual unlink operation (which is already the case except when deleting a snapshot). Conversely the task logging the inode must first log the inode and then check the inode's last_unlink_trans value while holding its log_mutex, as if its value is not greater than the id of the last committed transaction it means it logged a safe state of the inode's items, while if its value is greater than the id of the last committed transaction it means the inode state it has logged might not be safe (the concurrent task might have just updated last_unlink_trans but hasn't yet done the unlink operation) and therefore a transaction commit must be done.

Test cases for xfstests follow in separate patches.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c67
1 files changed, 57 insertions, 10 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 43c6781af654..9f6372dd0eab 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4772,6 +4772,42 @@ out_unlock:
4772} 4772}
4773 4773
4774/* 4774/*
4775 * Check if we must fallback to a transaction commit when logging an inode.
4776 * This must be called after logging the inode and is used only in the context
4777 * when fsyncing an inode requires the need to log some other inode - in which
4778 * case we can't lock the i_mutex of each other inode we need to log as that
4779 * can lead to deadlocks with concurrent fsync against other inodes (as we can
4780 * log inodes up or down in the hierarchy) or rename operations for example. So
4781 * we take the log_mutex of the inode after we have logged it and then check for
4782 * its last_unlink_trans value - this is safe because any task setting
4783 * last_unlink_trans must take the log_mutex and it must do this before it does
4784 * the actual unlink operation, so if we do this check before a concurrent task
4785 * sets last_unlink_trans it means we've logged a consistent version/state of
4786 * all the inode items, otherwise we are not sure and must do a transaction
4787 * commit (the concurrent task might have only updated last_unlink_trans before
4788 * we logged the inode or it might have also done the unlink).
4789 */
4790static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
4791 struct inode *inode)
4792{
4793 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
4794 bool ret = false;
4795
4796 mutex_lock(&BTRFS_I(inode)->log_mutex);
4797 if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) {
4798 /*
4799 * Make sure any commits to the log are forced to be full
4800 * commits.
4801 */
4802 btrfs_set_log_full_commit(fs_info, trans);
4803 ret = true;
4804 }
4805 mutex_unlock(&BTRFS_I(inode)->log_mutex);
4806
4807 return ret;
4808}
4809
4810/*
4775 * follow the dentry parent pointers up the chain and see if any 4811 * follow the dentry parent pointers up the chain and see if any
4776 * of the directories in it require a full commit before they can 4812 * of the directories in it require a full commit before they can
4777 * be logged. Returns zero if nothing special needs to be done or 1 if 4813 * be logged. Returns zero if nothing special needs to be done or 1 if
@@ -4784,7 +4820,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
4784 u64 last_committed) 4820 u64 last_committed)
4785{ 4821{
4786 int ret = 0; 4822 int ret = 0;
4787 struct btrfs_root *root;
4788 struct dentry *old_parent = NULL; 4823 struct dentry *old_parent = NULL;
4789 struct inode *orig_inode = inode; 4824 struct inode *orig_inode = inode;
4790 4825
@@ -4816,14 +4851,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
4816 BTRFS_I(inode)->logged_trans = trans->transid; 4851 BTRFS_I(inode)->logged_trans = trans->transid;
4817 smp_mb(); 4852 smp_mb();
4818 4853
4819 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 4854 if (btrfs_must_commit_transaction(trans, inode)) {
4820 root = BTRFS_I(inode)->root;
4821
4822 /*
4823 * make sure any commits to the log are forced
4824 * to be full commits
4825 */
4826 btrfs_set_log_full_commit(root->fs_info, trans);
4827 ret = 1; 4855 ret = 1;
4828 break; 4856 break;
4829 } 4857 }
@@ -4982,6 +5010,9 @@ process_leaf:
4982 btrfs_release_path(path); 5010 btrfs_release_path(path);
4983 ret = btrfs_log_inode(trans, root, di_inode, 5011 ret = btrfs_log_inode(trans, root, di_inode,
4984 log_mode, 0, LLONG_MAX, ctx); 5012 log_mode, 0, LLONG_MAX, ctx);
5013 if (!ret &&
5014 btrfs_must_commit_transaction(trans, di_inode))
5015 ret = 1;
4985 iput(di_inode); 5016 iput(di_inode);
4986 if (ret) 5017 if (ret)
4987 goto next_dir_inode; 5018 goto next_dir_inode;
@@ -5096,6 +5127,9 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5096 5127
5097 ret = btrfs_log_inode(trans, root, dir_inode, 5128 ret = btrfs_log_inode(trans, root, dir_inode,
5098 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5129 LOG_INODE_ALL, 0, LLONG_MAX, ctx);
5130 if (!ret &&
5131 btrfs_must_commit_transaction(trans, dir_inode))
5132 ret = 1;
5099 iput(dir_inode); 5133 iput(dir_inode);
5100 if (ret) 5134 if (ret)
5101 goto out; 5135 goto out;
@@ -5447,6 +5481,9 @@ error:
5447 * They revolve around files there were unlinked from the directory, and 5481 * They revolve around files there were unlinked from the directory, and
5448 * this function updates the parent directory so that a full commit is 5482 * this function updates the parent directory so that a full commit is
5449 * properly done if it is fsync'd later after the unlinks are done. 5483 * properly done if it is fsync'd later after the unlinks are done.
5484 *
5485 * Must be called before the unlink operations (updates to the subvolume tree,
5486 * inodes, etc) are done.
5450 */ 5487 */
5451void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5488void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5452 struct inode *dir, struct inode *inode, 5489 struct inode *dir, struct inode *inode,
@@ -5462,8 +5499,11 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5462 * into the file. When the file is logged we check it and 5499 * into the file. When the file is logged we check it and
5463 * don't log the parents if the file is fully on disk. 5500 * don't log the parents if the file is fully on disk.
5464 */ 5501 */
5465 if (S_ISREG(inode->i_mode)) 5502 if (S_ISREG(inode->i_mode)) {
5503 mutex_lock(&BTRFS_I(inode)->log_mutex);
5466 BTRFS_I(inode)->last_unlink_trans = trans->transid; 5504 BTRFS_I(inode)->last_unlink_trans = trans->transid;
5505 mutex_unlock(&BTRFS_I(inode)->log_mutex);
5506 }
5467 5507
5468 /* 5508 /*
5469 * if this directory was already logged any new 5509 * if this directory was already logged any new
@@ -5494,7 +5534,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5494 return; 5534 return;
5495 5535
5496record: 5536record:
5537 mutex_lock(&BTRFS_I(dir)->log_mutex);
5497 BTRFS_I(dir)->last_unlink_trans = trans->transid; 5538 BTRFS_I(dir)->last_unlink_trans = trans->transid;
5539 mutex_unlock(&BTRFS_I(dir)->log_mutex);
5498} 5540}
5499 5541
5500/* 5542/*
@@ -5505,11 +5547,16 @@ record:
5505 * corresponding to the deleted snapshot's root, which could lead to replaying 5547 * corresponding to the deleted snapshot's root, which could lead to replaying
5506 * it after replaying the log tree of the parent directory (which would replay 5548 * it after replaying the log tree of the parent directory (which would replay
5507 * the snapshot delete operation). 5549 * the snapshot delete operation).
5550 *
5551 * Must be called before the actual snapshot destroy operation (updates to the
5552 * parent root and tree of tree roots trees, etc) are done.
5508 */ 5553 */
5509void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 5554void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
5510 struct inode *dir) 5555 struct inode *dir)
5511{ 5556{
5557 mutex_lock(&BTRFS_I(dir)->log_mutex);
5512 BTRFS_I(dir)->last_unlink_trans = trans->transid; 5558 BTRFS_I(dir)->last_unlink_trans = trans->transid;
5559 mutex_unlock(&BTRFS_I(dir)->log_mutex);
5513} 5560}
5514 5561
5515/* 5562/*