aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2015-07-15 18:26:43 -0400
committerChris Mason <clm@fb.com>2015-08-09 09:16:56 -0400
commitbb53eda9029fd52b466fa501ba4aa58e94789b18 (patch)
tree60101a8d4efa405889ee4a57e15495199481cfdd /fs/btrfs/tree-log.c
parent74d33293e467df61de1b1d8b2fbe29e550dec33b (diff)
Btrfs: fix stale directory entries after fsync log replay
We have another case where after an fsync log replay we get an inode with a wrong link count (smaller than it should be) and a number of directory entries greater than its link count. This happens when we add a new hard link to our inode A and then we fsync some other inode B that has the side effect of logging the parent directory inode too. In this case at log replay time we add the new hard link to our inode (the item with key BTRFS_INODE_REF_KEY) when processing the parent directory but we never adjust the link count of our inode A. As a result we get stale dir entries for our inode A that can never be deleted and therefore it makes it impossible to remove the parent directory (as its i_size can never decrease back to 0). A simple reproducer for fstests that triggers this issue: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { _cleanup_flakey rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter . ./common/dmflakey # real QA test starts here _need_to_be_root _supported_fs generic _supported_os Linux _require_scratch _require_dm_flakey _require_metadata_journaling $SCRATCH_DEV rm -f $seqres.full _scratch_mkfs >>$seqres.full 2>&1 _init_flakey _mount_flakey # Create our test directory and files. mkdir $SCRATCH_MNT/testdir touch $SCRATCH_MNT/testdir/foo touch $SCRATCH_MNT/testdir/bar # Make sure everything done so far is durably persisted. sync # Create one hard link for file foo and another one for file bar. After # that fsync only the file bar. ln $SCRATCH_MNT/testdir/bar $SCRATCH_MNT/testdir/bar_link ln $SCRATCH_MNT/testdir/foo $SCRATCH_MNT/testdir/foo_link $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir/bar # Silently drop all writes on scratch device to simulate power failure. 
_load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey # Allow writes again and mount the fs to trigger log/journal replay. _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # Now verify both our files have a link count of 2. echo "Link count for file foo: $(stat --format=%h $SCRATCH_MNT/testdir/foo)" echo "Link count for file bar: $(stat --format=%h $SCRATCH_MNT/testdir/bar)" # We should be able to remove all the links of our files in testdir, and # after that the parent directory should become empty and therefore # possible to remove it. rm -f $SCRATCH_MNT/testdir/* rmdir $SCRATCH_MNT/testdir _unmount_flakey # The fstests framework will call fsck against our filesystem which will verify # that all metadata is in a consistent state. status=0 exit The test fails with: -Link count for file foo: 2 +Link count for file foo: 1 Link count for file bar: 2 +rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/testdir/foo_link': Stale file handle +rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/testdir': Directory not empty (...) _check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent And fsck's output: (...) checking fs roots root 5 inode 258 errors 2001, no inode item, link count wrong unresolved ref dir 257 index 5 namelen 8 name foo_link filetype 1 errors 4, no inode ref Checking filesystem on /dev/sdc (...) So fix this by marking inodes for link count fixup at log replay time whenever a directory entry is replayed if the entry was created in the transaction where the fsync was made and if it points to a non-directory inode. This isn't a new problem/regression, the issue exists for a long time, possibly since the log tree feature was added (2008). Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c64
1 files changed, 60 insertions, 4 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c45431e69ab..cb5666e7c3f9 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1613,6 +1613,9 @@ static bool name_in_log_ref(struct btrfs_root *log_root,
1613 * not exist in the FS, it is skipped. fsyncs on directories 1613 * not exist in the FS, it is skipped. fsyncs on directories
1614 * do not force down inodes inside that directory, just changes to the 1614 * do not force down inodes inside that directory, just changes to the
1615 * names or unlinks in a directory. 1615 * names or unlinks in a directory.
1616 *
1617 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1618 * non-existing inode) and 1 if the name was replayed.
1616 */ 1619 */
1617static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1620static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1618 struct btrfs_root *root, 1621 struct btrfs_root *root,
@@ -1631,6 +1634,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1631 int exists; 1634 int exists;
1632 int ret = 0; 1635 int ret = 0;
1633 bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); 1636 bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1637 bool name_added = false;
1634 1638
1635 dir = read_one_inode(root, key->objectid); 1639 dir = read_one_inode(root, key->objectid);
1636 if (!dir) 1640 if (!dir)
@@ -1708,6 +1712,8 @@ out:
1708 } 1712 }
1709 kfree(name); 1713 kfree(name);
1710 iput(dir); 1714 iput(dir);
1715 if (!ret && name_added)
1716 ret = 1;
1711 return ret; 1717 return ret;
1712 1718
1713insert: 1719insert:
@@ -1723,6 +1729,8 @@ insert:
1723 name, name_len, log_type, &log_key); 1729 name, name_len, log_type, &log_key);
1724 if (ret && ret != -ENOENT && ret != -EEXIST) 1730 if (ret && ret != -ENOENT && ret != -EEXIST)
1725 goto out; 1731 goto out;
1732 if (!ret)
1733 name_added = true;
1726 update_size = false; 1734 update_size = false;
1727 ret = 0; 1735 ret = 0;
1728 goto out; 1736 goto out;
@@ -1740,12 +1748,13 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1740 struct extent_buffer *eb, int slot, 1748 struct extent_buffer *eb, int slot,
1741 struct btrfs_key *key) 1749 struct btrfs_key *key)
1742{ 1750{
1743 int ret; 1751 int ret = 0;
1744 u32 item_size = btrfs_item_size_nr(eb, slot); 1752 u32 item_size = btrfs_item_size_nr(eb, slot);
1745 struct btrfs_dir_item *di; 1753 struct btrfs_dir_item *di;
1746 int name_len; 1754 int name_len;
1747 unsigned long ptr; 1755 unsigned long ptr;
1748 unsigned long ptr_end; 1756 unsigned long ptr_end;
1757 struct btrfs_path *fixup_path = NULL;
1749 1758
1750 ptr = btrfs_item_ptr_offset(eb, slot); 1759 ptr = btrfs_item_ptr_offset(eb, slot);
1751 ptr_end = ptr + item_size; 1760 ptr_end = ptr + item_size;
@@ -1755,12 +1764,59 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1755 return -EIO; 1764 return -EIO;
1756 name_len = btrfs_dir_name_len(eb, di); 1765 name_len = btrfs_dir_name_len(eb, di);
1757 ret = replay_one_name(trans, root, path, eb, di, key); 1766 ret = replay_one_name(trans, root, path, eb, di, key);
1758 if (ret) 1767 if (ret < 0)
1759 return ret; 1768 break;
1760 ptr = (unsigned long)(di + 1); 1769 ptr = (unsigned long)(di + 1);
1761 ptr += name_len; 1770 ptr += name_len;
1771
1772 /*
1773 * If this entry refers to a non-directory (directories can not
1774 * have a link count > 1) and it was added in the transaction
1775 * that was not committed, make sure we fixup the link count of
1776 * the inode the entry points to. Otherwise something like
1777 * the following would result in a directory pointing to an
1778 * inode with a wrong link count that does not account for this dir
1779 * entry:
1780 *
1781 * mkdir testdir
1782 * touch testdir/foo
1783 * touch testdir/bar
1784 * sync
1785 *
1786 * ln testdir/bar testdir/bar_link
1787 * ln testdir/foo testdir/foo_link
1788 * xfs_io -c "fsync" testdir/bar
1789 *
1790 * <power failure>
1791 *
1792 * mount fs, log replay happens
1793 *
1794 * File foo would remain with a link count of 1 when it has two
1795 * entries pointing to it in the directory testdir. This would
1796 * make it impossible to ever delete the parent directory as
1797 * it would result in stale dentries that can never be deleted.
1798 */
1799 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
1800 struct btrfs_key di_key;
1801
1802 if (!fixup_path) {
1803 fixup_path = btrfs_alloc_path();
1804 if (!fixup_path) {
1805 ret = -ENOMEM;
1806 break;
1807 }
1808 }
1809
1810 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1811 ret = link_to_fixup_dir(trans, root, fixup_path,
1812 di_key.objectid);
1813 if (ret)
1814 break;
1815 }
1816 ret = 0;
1762 } 1817 }
1763 return 0; 1818 btrfs_free_path(fixup_path);
1819 return ret;
1764} 1820}
1765 1821
1766/* 1822/*