aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2015-03-20 13:19:46 -0400
committerChris Mason <clm@fb.com>2015-03-26 20:56:23 -0400
commit2f2ff0ee5e4303e727cfd7abd4133d1a8ee68394 (patch)
tree53d4978c98a9aab8e6d3fc16c4d3aa2d0b570b8a
parentbf69196045a8c5c42b10493a26ed45c33014371e (diff)
Btrfs: fix metadata inconsistencies after directory fsync
We can get into inconsistency between inodes and directory entries after fsyncing a directory. The issue is that while a directory gets the new dentries persisted in the fsync log and replayed at mount time, the link count of the inode that directory entries point to doesn't get updated, staying with an incorrect link count (smaller than the correct value). This later leads to stale file handle errors when accessing (including attempt to delete) some of the links if all the other ones are removed, which also implies impossibility to delete the parent directories, since the dentries can not be removed. Another issue is that (unlike ext3/4, xfs, f2fs, reiserfs, nilfs2), when fsyncing a directory, new files aren't logged (their metadata and dentries) nor any child directories. So this patch fixes this issue too, since it has the same resolution as the incorrect inode link count issue mentioned before. This is very easy to reproduce, and the following excerpt from my test case for xfstests shows how: _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create our main test file and directory. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0 8K" $SCRATCH_MNT/foo | _filter_xfs_io mkdir $SCRATCH_MNT/mydir # Make sure all metadata and data are durably persisted. sync # Add a hard link to 'foo' inside our test directory and fsync only the # directory. The btrfs fsync implementation had a bug that caused the new # directory entry to be visible after the fsync log replay but, the inode # of our file remained with a link count of 1. ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/foo_2 # Add a few more links and new files. # This is just to verify nothing breaks or gives incorrect results after the # fsync log is replayed. ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/foo_3 $XFS_IO_PROG -f -c "pwrite -S 0xff 0 64K" $SCRATCH_MNT/hello | _filter_xfs_io ln $SCRATCH_MNT/hello $SCRATCH_MNT/mydir/hello_2 # Add some subdirectories and new files and links to them. 
This is to verify # that after fsyncing our top level directory 'mydir', all the subdirectories # and their files/links are registered in the fsync log and exist after the # fsync log is replayed. mkdir -p $SCRATCH_MNT/mydir/x/y/z ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/x/y/foo_y_link ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/x/y/z/foo_z_link touch $SCRATCH_MNT/mydir/x/y/z/qwerty # Now fsync only our top directory. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/mydir # And fsync now our new file named 'hello', just to verify later that it has # the expected content and that the previous fsync on the directory 'mydir' had # no bad influence on this fsync. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/hello # Simulate a crash/power loss. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # Verify the content of our file 'foo' remains the same as before, 8192 bytes, # all with the value 0xaa. echo "File 'foo' content after log replay:" od -t x1 $SCRATCH_MNT/foo # Remove the first name of our inode. Because of the directory fsync bug, the # inode's link count was 1 instead of 5, so removing the 'foo' name ended up # deleting the inode and the other names became stale directory entries (still # visible to applications). Attempting to remove or access the remaining # dentries pointing to that inode resulted in stale file handle errors and # made it impossible to remove the parent directories since it was impossible # for them to become empty. echo "file 'foo' link count after log replay: $(stat -c %h $SCRATCH_MNT/foo)" rm -f $SCRATCH_MNT/foo # Now verify that all files, links and directories created before fsyncing our # directory exist after the fsync log was replayed. 
[ -f $SCRATCH_MNT/mydir/foo_2 ] || echo "Link mydir/foo_2 is missing" [ -f $SCRATCH_MNT/mydir/foo_3 ] || echo "Link mydir/foo_3 is missing" [ -f $SCRATCH_MNT/hello ] || echo "File hello is missing" [ -f $SCRATCH_MNT/mydir/hello_2 ] || echo "Link mydir/hello_2 is missing" [ -f $SCRATCH_MNT/mydir/x/y/foo_y_link ] || \ echo "Link mydir/x/y/foo_y_link is missing" [ -f $SCRATCH_MNT/mydir/x/y/z/foo_z_link ] || \ echo "Link mydir/x/y/z/foo_z_link is missing" [ -f $SCRATCH_MNT/mydir/x/y/z/qwerty ] || \ echo "File mydir/x/y/z/qwerty is missing" # We expect our file here to have a size of 64Kb and all the bytes having the # value 0xff. echo "file 'hello' content after log replay:" od -t x1 $SCRATCH_MNT/hello # Now remove all files/links, under our test directory 'mydir', and verify we # can remove all the directories. rm -f $SCRATCH_MNT/mydir/x/y/z/* rmdir $SCRATCH_MNT/mydir/x/y/z rm -f $SCRATCH_MNT/mydir/x/y/* rmdir $SCRATCH_MNT/mydir/x/y rmdir $SCRATCH_MNT/mydir/x rm -f $SCRATCH_MNT/mydir/* rmdir $SCRATCH_MNT/mydir # An fsck, run by the fstests framework everytime a test finishes, also detected # the inconsistency and printed the following error message: # # root 5 inode 257 errors 2001, no inode item, link count wrong # unresolved ref dir 258 index 2 namelen 5 name foo_2 filetype 1 errors 4, no inode ref # unresolved ref dir 258 index 3 namelen 5 name foo_3 filetype 1 errors 4, no inode ref status=0 exit The expected golden output for the test is: wrote 8192/8192 bytes at offset 0 XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) wrote 65536/65536 bytes at offset 0 XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) File 'foo' content after log replay: 0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa * 0020000 file 'foo' link count after log replay: 5 file 'hello' content after log replay: 0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 0200000 Which is the output after this patch and when running the test against ext3/4, xfs, f2fs, 
reiserfs or nilfs2. Without this patch, the test's output is: wrote 8192/8192 bytes at offset 0 XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) wrote 65536/65536 bytes at offset 0 XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) File 'foo' content after log replay: 0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa * 0020000 file 'foo' link count after log replay: 1 Link mydir/foo_2 is missing Link mydir/foo_3 is missing Link mydir/x/y/foo_y_link is missing Link mydir/x/y/z/foo_z_link is missing File mydir/x/y/z/qwerty is missing file 'hello' content after log replay: 0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 0200000 rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/x/y/z': No such file or directory rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/x/y': No such file or directory rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/x': No such file or directory rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/foo_2': Stale file handle rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/foo_3': Stale file handle rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir': Directory not empty Fsck, without this fix, also complains about the wrong link count: root 5 inode 257 errors 2001, no inode item, link count wrong unresolved ref dir 258 index 2 namelen 5 name foo_2 filetype 1 errors 4, no inode ref unresolved ref dir 258 index 3 namelen 5 name foo_3 filetype 1 errors 4, no inode ref So fix this by logging the inodes that the dentries point to when fsyncing a directory. A test case for xfstests follows. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
-rw-r--r--fs/btrfs/btrfs_inode.h14
-rw-r--r--fs/btrfs/file.c2
-rw-r--r--fs/btrfs/transaction.h2
-rw-r--r--fs/btrfs/tree-log.c243
-rw-r--r--fs/btrfs/tree-log.h2
5 files changed, 253 insertions, 10 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index de5e4f2adfea..0ef5cc13fae2 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,7 +66,11 @@ struct btrfs_inode {
66 */ 66 */
67 struct btrfs_key location; 67 struct btrfs_key location;
68 68
69 /* Lock for counters */ 69 /*
70 * Lock for counters and all fields used to determine if the inode is in
71 * the log or not (last_trans, last_sub_trans, last_log_commit,
72 * logged_trans).
73 */
70 spinlock_t lock; 74 spinlock_t lock;
71 75
72 /* the extent_tree has caches of all the extent mappings to disk */ 76 /* the extent_tree has caches of all the extent mappings to disk */
@@ -250,6 +254,9 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
250 254
251static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) 255static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
252{ 256{
257 int ret = 0;
258
259 spin_lock(&BTRFS_I(inode)->lock);
253 if (BTRFS_I(inode)->logged_trans == generation && 260 if (BTRFS_I(inode)->logged_trans == generation &&
254 BTRFS_I(inode)->last_sub_trans <= 261 BTRFS_I(inode)->last_sub_trans <=
255 BTRFS_I(inode)->last_log_commit && 262 BTRFS_I(inode)->last_log_commit &&
@@ -263,9 +270,10 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
263 */ 270 */
264 smp_mb(); 271 smp_mb();
265 if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents)) 272 if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
266 return 1; 273 ret = 1;
267 } 274 }
268 return 0; 275 spin_unlock(&BTRFS_I(inode)->lock);
276 return ret;
269} 277}
270 278
271#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1 279#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 150db5e50c2d..fd105c172c8b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1811,7 +1811,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1811 * otherwise subsequent syncs to a file that's been synced in this 1811 * otherwise subsequent syncs to a file that's been synced in this
1812 * transaction will appear to have already occured. 1812 * transaction will appear to have already occured.
1813 */ 1813 */
1814 spin_lock(&BTRFS_I(inode)->lock);
1814 BTRFS_I(inode)->last_sub_trans = root->log_transid; 1815 BTRFS_I(inode)->last_sub_trans = root->log_transid;
1816 spin_unlock(&BTRFS_I(inode)->lock);
1815 if (num_written > 0) { 1817 if (num_written > 0) {
1816 err = generic_write_sync(file, pos, num_written); 1818 err = generic_write_sync(file, pos, num_written);
1817 if (err < 0) 1819 if (err < 0)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 937050a2b68e..96b189b8898a 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -136,9 +136,11 @@ struct btrfs_pending_snapshot {
136static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, 136static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
137 struct inode *inode) 137 struct inode *inode)
138{ 138{
139 spin_lock(&BTRFS_I(inode)->lock);
139 BTRFS_I(inode)->last_trans = trans->transaction->transid; 140 BTRFS_I(inode)->last_trans = trans->transaction->transid;
140 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 141 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
141 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; 142 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
143 spin_unlock(&BTRFS_I(inode)->lock);
142} 144}
143 145
144int btrfs_end_transaction(struct btrfs_trans_handle *trans, 146int btrfs_end_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6c95159302dd..016c90fc85db 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -492,11 +492,19 @@ insert:
492 492
493 if (btrfs_inode_generation(eb, src_item) == 0) { 493 if (btrfs_inode_generation(eb, src_item) == 0) {
494 struct extent_buffer *dst_eb = path->nodes[0]; 494 struct extent_buffer *dst_eb = path->nodes[0];
495 const u64 ino_size = btrfs_inode_size(eb, src_item);
495 496
497 /*
498 * For regular files an ino_size == 0 is used only when
499 * logging that an inode exists, as part of a directory
500 * fsync, and the inode wasn't fsynced before. In this
501 * case don't set the size of the inode in the fs/subvol
502 * tree, otherwise we would be throwing valid data away.
503 */
496 if (S_ISREG(btrfs_inode_mode(eb, src_item)) && 504 if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
497 S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) { 505 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
506 ino_size != 0) {
498 struct btrfs_map_token token; 507 struct btrfs_map_token token;
499 u64 ino_size = btrfs_inode_size(eb, src_item);
500 508
501 btrfs_init_map_token(&token); 509 btrfs_init_map_token(&token);
502 btrfs_set_token_inode_size(dst_eb, dst_item, 510 btrfs_set_token_inode_size(dst_eb, dst_item,
@@ -3124,6 +3132,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3124 struct btrfs_root *root, struct inode *inode, 3132 struct btrfs_root *root, struct inode *inode,
3125 struct btrfs_path *path, 3133 struct btrfs_path *path,
3126 struct btrfs_path *dst_path, int key_type, 3134 struct btrfs_path *dst_path, int key_type,
3135 struct btrfs_log_ctx *ctx,
3127 u64 min_offset, u64 *last_offset_ret) 3136 u64 min_offset, u64 *last_offset_ret)
3128{ 3137{
3129 struct btrfs_key min_key; 3138 struct btrfs_key min_key;
@@ -3208,6 +3217,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3208 src = path->nodes[0]; 3217 src = path->nodes[0];
3209 nritems = btrfs_header_nritems(src); 3218 nritems = btrfs_header_nritems(src);
3210 for (i = path->slots[0]; i < nritems; i++) { 3219 for (i = path->slots[0]; i < nritems; i++) {
3220 struct btrfs_dir_item *di;
3221
3211 btrfs_item_key_to_cpu(src, &min_key, i); 3222 btrfs_item_key_to_cpu(src, &min_key, i);
3212 3223
3213 if (min_key.objectid != ino || min_key.type != key_type) 3224 if (min_key.objectid != ino || min_key.type != key_type)
@@ -3218,6 +3229,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3218 err = ret; 3229 err = ret;
3219 goto done; 3230 goto done;
3220 } 3231 }
3232
3233 /*
3234 * We must make sure that when we log a directory entry,
3235 * the corresponding inode, after log replay, has a
3236 * matching link count. For example:
3237 *
3238 * touch foo
3239 * mkdir mydir
3240 * sync
3241 * ln foo mydir/bar
3242 * xfs_io -c "fsync" mydir
3243 * <crash>
3244 * <mount fs and log replay>
3245 *
3246 * Would result in a fsync log that when replayed, our
3247 * file inode would have a link count of 1, but we get
3248 * two directory entries pointing to the same inode.
3249 * After removing one of the names, it would not be
3250 * possible to remove the other name, which resulted
3251 * always in stale file handle errors, and would not
3252 * be possible to rmdir the parent directory, since
3253 * its i_size could never decrement to the value
3254 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3255 */
3256 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3257 btrfs_dir_item_key_to_cpu(src, di, &tmp);
3258 if (ctx &&
3259 (btrfs_dir_transid(src, di) == trans->transid ||
3260 btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3261 tmp.type != BTRFS_ROOT_ITEM_KEY)
3262 ctx->log_new_dentries = true;
3221 } 3263 }
3222 path->slots[0] = nritems; 3264 path->slots[0] = nritems;
3223 3265
@@ -3279,7 +3321,8 @@ done:
3279static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3321static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3280 struct btrfs_root *root, struct inode *inode, 3322 struct btrfs_root *root, struct inode *inode,
3281 struct btrfs_path *path, 3323 struct btrfs_path *path,
3282 struct btrfs_path *dst_path) 3324 struct btrfs_path *dst_path,
3325 struct btrfs_log_ctx *ctx)
3283{ 3326{
3284 u64 min_key; 3327 u64 min_key;
3285 u64 max_key; 3328 u64 max_key;
@@ -3291,7 +3334,7 @@ again:
3291 max_key = 0; 3334 max_key = 0;
3292 while (1) { 3335 while (1) {
3293 ret = log_dir_items(trans, root, inode, path, 3336 ret = log_dir_items(trans, root, inode, path,
3294 dst_path, key_type, min_key, 3337 dst_path, key_type, ctx, min_key,
3295 &max_key); 3338 &max_key);
3296 if (ret) 3339 if (ret)
3297 return ret; 3340 return ret;
@@ -4067,7 +4110,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
4067 if (ret < 0) { 4110 if (ret < 0) {
4068 return ret; 4111 return ret;
4069 } else if (ret > 0) { 4112 } else if (ret > 0) {
4070 *size_ret = i_size_read(inode); 4113 *size_ret = 0;
4071 } else { 4114 } else {
4072 struct btrfs_inode_item *item; 4115 struct btrfs_inode_item *item;
4073 4116
@@ -4374,15 +4417,18 @@ log_extents:
4374 } 4417 }
4375 4418
4376 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 4419 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
4377 ret = log_directory_changes(trans, root, inode, path, dst_path); 4420 ret = log_directory_changes(trans, root, inode, path, dst_path,
4421 ctx);
4378 if (ret) { 4422 if (ret) {
4379 err = ret; 4423 err = ret;
4380 goto out_unlock; 4424 goto out_unlock;
4381 } 4425 }
4382 } 4426 }
4383 4427
4428 spin_lock(&BTRFS_I(inode)->lock);
4384 BTRFS_I(inode)->logged_trans = trans->transid; 4429 BTRFS_I(inode)->logged_trans = trans->transid;
4385 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 4430 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
4431 spin_unlock(&BTRFS_I(inode)->lock);
4386out_unlock: 4432out_unlock:
4387 if (unlikely(err)) 4433 if (unlikely(err))
4388 btrfs_put_logged_extents(&logged_list); 4434 btrfs_put_logged_extents(&logged_list);
@@ -4469,6 +4515,181 @@ out:
4469 return ret; 4515 return ret;
4470} 4516}
4471 4517
4518struct btrfs_dir_list {
4519 u64 ino;
4520 struct list_head list;
4521};
4522
4523/*
4524 * Log the inodes of the new dentries of a directory. See log_dir_items() for
4525 * details about the why it is needed.
4526 * This is a recursive operation - if an existing dentry corresponds to a
4527 * directory, that directory's new entries are logged too (same behaviour as
4528 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
4529 * the dentries point to we do not lock their i_mutex, otherwise lockdep
4530 * complains about the following circular lock dependency / possible deadlock:
4531 *
4532 * CPU0 CPU1
4533 * ---- ----
4534 * lock(&type->i_mutex_dir_key#3/2);
4535 * lock(sb_internal#2);
4536 * lock(&type->i_mutex_dir_key#3/2);
4537 * lock(&sb->s_type->i_mutex_key#14);
4538 *
4539 * Where sb_internal is the lock (a counter that works as a lock) acquired by
4540 * sb_start_intwrite() in btrfs_start_transaction().
4541 * Not locking i_mutex of the inodes is still safe because:
4542 *
4543 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
4544 * that while logging the inode new references (names) are added or removed
4545 * from the inode, leaving the logged inode item with a link count that does
4546 * not match the number of logged inode reference items. This is fine because
4547 * at log replay time we compute the real number of links and correct the
4548 * link count in the inode item (see replay_one_buffer() and
4549 * link_to_fixup_dir());
4550 *
4551 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
4552 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
4553 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
4554 * has a size that doesn't match the sum of the lengths of all the logged
4555 * names. This does not result in a problem because if a dir_item key is
4556 * logged but its matching dir_index key is not logged, at log replay time we
4557 * don't use it to replay the respective name (see replay_one_name()). On the
4558 * other hand if only the dir_index key ends up being logged, the respective
4559 * name is added to the fs/subvol tree with both the dir_item and dir_index
4560 * keys created (see replay_one_name()).
4561 * The directory's inode item with a wrong i_size is not a problem as well,
4562 * since we don't use it at log replay time to set the i_size in the inode
4563 * item of the fs/subvol tree (see overwrite_item()).
4564 */
4565static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
4566 struct btrfs_root *root,
4567 struct inode *start_inode,
4568 struct btrfs_log_ctx *ctx)
4569{
4570 struct btrfs_root *log = root->log_root;
4571 struct btrfs_path *path;
4572 LIST_HEAD(dir_list);
4573 struct btrfs_dir_list *dir_elem;
4574 int ret = 0;
4575
4576 path = btrfs_alloc_path();
4577 if (!path)
4578 return -ENOMEM;
4579
4580 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
4581 if (!dir_elem) {
4582 btrfs_free_path(path);
4583 return -ENOMEM;
4584 }
4585 dir_elem->ino = btrfs_ino(start_inode);
4586 list_add_tail(&dir_elem->list, &dir_list);
4587
4588 while (!list_empty(&dir_list)) {
4589 struct extent_buffer *leaf;
4590 struct btrfs_key min_key;
4591 int nritems;
4592 int i;
4593
4594 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
4595 list);
4596 if (ret)
4597 goto next_dir_inode;
4598
4599 min_key.objectid = dir_elem->ino;
4600 min_key.type = BTRFS_DIR_ITEM_KEY;
4601 min_key.offset = 0;
4602again:
4603 btrfs_release_path(path);
4604 ret = btrfs_search_forward(log, &min_key, path, trans->transid);
4605 if (ret < 0) {
4606 goto next_dir_inode;
4607 } else if (ret > 0) {
4608 ret = 0;
4609 goto next_dir_inode;
4610 }
4611
4612process_leaf:
4613 leaf = path->nodes[0];
4614 nritems = btrfs_header_nritems(leaf);
4615 for (i = path->slots[0]; i < nritems; i++) {
4616 struct btrfs_dir_item *di;
4617 struct btrfs_key di_key;
4618 struct inode *di_inode;
4619 struct btrfs_dir_list *new_dir_elem;
4620 int log_mode = LOG_INODE_EXISTS;
4621 int type;
4622
4623 btrfs_item_key_to_cpu(leaf, &min_key, i);
4624 if (min_key.objectid != dir_elem->ino ||
4625 min_key.type != BTRFS_DIR_ITEM_KEY)
4626 goto next_dir_inode;
4627
4628 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
4629 type = btrfs_dir_type(leaf, di);
4630 if (btrfs_dir_transid(leaf, di) < trans->transid &&
4631 type != BTRFS_FT_DIR)
4632 continue;
4633 btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
4634 if (di_key.type == BTRFS_ROOT_ITEM_KEY)
4635 continue;
4636
4637 di_inode = btrfs_iget(root->fs_info->sb, &di_key,
4638 root, NULL);
4639 if (IS_ERR(di_inode)) {
4640 ret = PTR_ERR(di_inode);
4641 goto next_dir_inode;
4642 }
4643
4644 if (btrfs_inode_in_log(di_inode, trans->transid)) {
4645 iput(di_inode);
4646 continue;
4647 }
4648
4649 ctx->log_new_dentries = false;
4650 if (type == BTRFS_FT_DIR)
4651 log_mode = LOG_INODE_ALL;
4652 btrfs_release_path(path);
4653 ret = btrfs_log_inode(trans, root, di_inode,
4654 log_mode, 0, LLONG_MAX, ctx);
4655 iput(di_inode);
4656 if (ret)
4657 goto next_dir_inode;
4658 if (ctx->log_new_dentries) {
4659 new_dir_elem = kmalloc(sizeof(*new_dir_elem),
4660 GFP_NOFS);
4661 if (!new_dir_elem) {
4662 ret = -ENOMEM;
4663 goto next_dir_inode;
4664 }
4665 new_dir_elem->ino = di_key.objectid;
4666 list_add_tail(&new_dir_elem->list, &dir_list);
4667 }
4668 break;
4669 }
4670 if (i == nritems) {
4671 ret = btrfs_next_leaf(log, path);
4672 if (ret < 0) {
4673 goto next_dir_inode;
4674 } else if (ret > 0) {
4675 ret = 0;
4676 goto next_dir_inode;
4677 }
4678 goto process_leaf;
4679 }
4680 if (min_key.offset < (u64)-1) {
4681 min_key.offset++;
4682 goto again;
4683 }
4684next_dir_inode:
4685 list_del(&dir_elem->list);
4686 kfree(dir_elem);
4687 }
4688
4689 btrfs_free_path(path);
4690 return ret;
4691}
4692
4472/* 4693/*
4473 * helper function around btrfs_log_inode to make sure newly created 4694 * helper function around btrfs_log_inode to make sure newly created
4474 * parent directories also end up in the log. A minimal inode and backref 4695 * parent directories also end up in the log. A minimal inode and backref
@@ -4491,6 +4712,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4491 const struct dentry * const first_parent = parent; 4712 const struct dentry * const first_parent = parent;
4492 const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > 4713 const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
4493 last_committed); 4714 last_committed);
4715 bool log_dentries = false;
4716 struct inode *orig_inode = inode;
4494 4717
4495 sb = inode->i_sb; 4718 sb = inode->i_sb;
4496 4719
@@ -4546,6 +4769,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4546 goto end_trans; 4769 goto end_trans;
4547 } 4770 }
4548 4771
4772 if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
4773 log_dentries = true;
4774
4549 while (1) { 4775 while (1) {
4550 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 4776 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
4551 break; 4777 break;
@@ -4582,7 +4808,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4582 dput(old_parent); 4808 dput(old_parent);
4583 old_parent = parent; 4809 old_parent = parent;
4584 } 4810 }
4585 ret = 0; 4811 if (log_dentries)
4812 ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
4813 else
4814 ret = 0;
4586end_trans: 4815end_trans:
4587 dput(old_parent); 4816 dput(old_parent);
4588 if (ret < 0) { 4817 if (ret < 0) {
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 154990c26dcb..6916a781ea02 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -29,6 +29,7 @@ struct btrfs_log_ctx {
29 int log_ret; 29 int log_ret;
30 int log_transid; 30 int log_transid;
31 int io_err; 31 int io_err;
32 bool log_new_dentries;
32 struct list_head list; 33 struct list_head list;
33}; 34};
34 35
@@ -37,6 +38,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
37 ctx->log_ret = 0; 38 ctx->log_ret = 0;
38 ctx->log_transid = 0; 39 ctx->log_transid = 0;
39 ctx->io_err = 0; 40 ctx->io_err = 0;
41 ctx->log_new_dentries = false;
40 INIT_LIST_HEAD(&ctx->list); 42 INIT_LIST_HEAD(&ctx->list);
41} 43}
42 44