aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2015-01-13 20:52:25 -0500
committerChris Mason <clm@fb.com>2015-01-21 21:02:05 -0500
commitdf8d116ffa379f3ef09d7ff28da0f0c921cc9fa1 (patch)
tree54df64ce76a165f25a4a8826e12f8b810e626669
parent2c2c452b0cafdc27442796c526ac929443654b0b (diff)
Btrfs: fix fsync log replay for inodes with a mix of regular refs and extrefs
If we have an inode with a large number of hard links, some of which may be extrefs, turn a regular ref into an extref, fsync the inode and then replay the fsync log (after a crash/reboot), we can end up with an fsync log that makes the replay code always fail with -EOVERFLOW when processing the inode's references. This is easy to reproduce with the test case I made for xfstests. Its steps are the following: _scratch_mkfs "-O extref" >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create a test file with 3001 hard links. This number is large enough to # make btrfs start using extrefs at some point even if the fs has the maximum # possible leaf/node size (64Kb). echo "hello world" > $SCRATCH_MNT/foo for i in `seq 1 3000`; do ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_`printf "%04d" $i` done # Make sure all metadata and data are durably persisted. sync # Now remove one link, add a new one with a new name, add another new one with # the same name as the one we just removed and fsync the inode. rm -f $SCRATCH_MNT/foo_link_0001 ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3001 ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_0001 rm -f $SCRATCH_MNT/foo_link_0002 ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3002 ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3003 $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo # Simulate a crash/power loss. This makes sure the next mount # will see an fsync log and will replay that log. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # Check that the number of hard links is correct, we are able to remove all # the hard links and read the file's data. This is just to verify we don't # get stale file handle errors (due to dangling directory index entries that # point to inodes that no longer exist). 
echo "Link count: $(stat --format=%h $SCRATCH_MNT/foo)" [ -f $SCRATCH_MNT/foo ] || echo "Link foo is missing" for ((i = 1; i <= 3003; i++)); do name=foo_link_`printf "%04d" $i` if [ $i -eq 2 ]; then [ -f $SCRATCH_MNT/$name ] && echo "Link $name found" else [ -f $SCRATCH_MNT/$name ] || echo "Link $name is missing" fi done rm -f $SCRATCH_MNT/foo_link_* cat $SCRATCH_MNT/foo rm -f $SCRATCH_MNT/foo status=0 exit The fix is simply to correct the overflow condition when overwriting a reference item because it was wrong, trying to increase the item in the fs/subvol tree by an impossible amount. Also ensure that we don't insert one normal ref and one ext ref for the same dentry - this happened because processing a dir index entry from the parent in the log happened when the normal ref item was full, which made the logic insert an extref and later when the normal ref had enough room, it would be inserted again when processing the ref item from the child inode in the log. This issue has been present since the introduction of the extrefs feature (2012). A test case for xfstests follows soon. This test only passes if the previous patch titled "Btrfs: fix fsync when extend references are added to an inode" is applied too. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
-rw-r--r--fs/btrfs/inode-item.c9
-rw-r--r--fs/btrfs/tree-log.c39
2 files changed, 43 insertions, 5 deletions
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 8ffa4783cbf4..265e03c73f4d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -344,6 +344,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
344 return -ENOMEM; 344 return -ENOMEM;
345 345
346 path->leave_spinning = 1; 346 path->leave_spinning = 1;
347 path->skip_release_on_error = 1;
347 ret = btrfs_insert_empty_item(trans, root, path, &key, 348 ret = btrfs_insert_empty_item(trans, root, path, &key,
348 ins_len); 349 ins_len);
349 if (ret == -EEXIST) { 350 if (ret == -EEXIST) {
@@ -362,8 +363,12 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
362 ptr = (unsigned long)(ref + 1); 363 ptr = (unsigned long)(ref + 1);
363 ret = 0; 364 ret = 0;
364 } else if (ret < 0) { 365 } else if (ret < 0) {
365 if (ret == -EOVERFLOW) 366 if (ret == -EOVERFLOW) {
366 ret = -EMLINK; 367 if (find_name_in_backref(path, name, name_len, &ref))
368 ret = -EEXIST;
369 else
370 ret = -EMLINK;
371 }
367 goto out; 372 goto out;
368 } else { 373 } else {
369 ref = btrfs_item_ptr(path->nodes[0], path->slots[0], 374 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 533cdb02978a..a26658756537 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -453,11 +453,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
453insert: 453insert:
454 btrfs_release_path(path); 454 btrfs_release_path(path);
455 /* try to insert the key into the destination tree */ 455 /* try to insert the key into the destination tree */
456 path->skip_release_on_error = 1;
456 ret = btrfs_insert_empty_item(trans, root, path, 457 ret = btrfs_insert_empty_item(trans, root, path,
457 key, item_size); 458 key, item_size);
459 path->skip_release_on_error = 0;
458 460
459 /* make sure any existing item is the correct size */ 461 /* make sure any existing item is the correct size */
460 if (ret == -EEXIST) { 462 if (ret == -EEXIST || ret == -EOVERFLOW) {
461 u32 found_size; 463 u32 found_size;
462 found_size = btrfs_item_size_nr(path->nodes[0], 464 found_size = btrfs_item_size_nr(path->nodes[0],
463 path->slots[0]); 465 path->slots[0]);
@@ -844,7 +846,7 @@ out:
844static noinline int backref_in_log(struct btrfs_root *log, 846static noinline int backref_in_log(struct btrfs_root *log,
845 struct btrfs_key *key, 847 struct btrfs_key *key,
846 u64 ref_objectid, 848 u64 ref_objectid,
847 char *name, int namelen) 849 const char *name, int namelen)
848{ 850{
849 struct btrfs_path *path; 851 struct btrfs_path *path;
850 struct btrfs_inode_ref *ref; 852 struct btrfs_inode_ref *ref;
@@ -1556,6 +1558,30 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1556} 1558}
1557 1559
1558/* 1560/*
1561 * Return true if an inode reference exists in the log for the given name,
1562 * inode and parent inode.
1563 */
1564static bool name_in_log_ref(struct btrfs_root *log_root,
1565 const char *name, const int name_len,
1566 const u64 dirid, const u64 ino)
1567{
1568 struct btrfs_key search_key;
1569
1570 search_key.objectid = ino;
1571 search_key.type = BTRFS_INODE_REF_KEY;
1572 search_key.offset = dirid;
1573 if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1574 return true;
1575
1576 search_key.type = BTRFS_INODE_EXTREF_KEY;
1577 search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1578 if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1579 return true;
1580
1581 return false;
1582}
1583
1584/*
1559 * take a single entry in a log directory item and replay it into 1585 * take a single entry in a log directory item and replay it into
1560 * the subvolume. 1586 * the subvolume.
1561 * 1587 *
@@ -1665,10 +1691,17 @@ out:
1665 return ret; 1691 return ret;
1666 1692
1667insert: 1693insert:
1694 if (name_in_log_ref(root->log_root, name, name_len,
1695 key->objectid, log_key.objectid)) {
1696 /* The dentry will be added later. */
1697 ret = 0;
1698 update_size = false;
1699 goto out;
1700 }
1668 btrfs_release_path(path); 1701 btrfs_release_path(path);
1669 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1702 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1670 name, name_len, log_type, &log_key); 1703 name, name_len, log_type, &log_key);
1671 if (ret && ret != -ENOENT) 1704 if (ret && ret != -ENOENT && ret != -EEXIST)
1672 goto out; 1705 goto out;
1673 update_size = false; 1706 update_size = false;
1674 ret = 0; 1707 ret = 0;