aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c243
1 files changed, 236 insertions, 7 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6c95159302dd..016c90fc85db 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -492,11 +492,19 @@ insert:
492 492
493 if (btrfs_inode_generation(eb, src_item) == 0) { 493 if (btrfs_inode_generation(eb, src_item) == 0) {
494 struct extent_buffer *dst_eb = path->nodes[0]; 494 struct extent_buffer *dst_eb = path->nodes[0];
495 const u64 ino_size = btrfs_inode_size(eb, src_item);
495 496
497 /*
498 * For regular files an ino_size == 0 is used only when
499 * logging that an inode exists, as part of a directory
500 * fsync, and the inode wasn't fsynced before. In this
501 * case don't set the size of the inode in the fs/subvol
502 * tree, otherwise we would be throwing valid data away.
503 */
496 if (S_ISREG(btrfs_inode_mode(eb, src_item)) && 504 if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
497 S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) { 505 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
506 ino_size != 0) {
498 struct btrfs_map_token token; 507 struct btrfs_map_token token;
499 u64 ino_size = btrfs_inode_size(eb, src_item);
500 508
501 btrfs_init_map_token(&token); 509 btrfs_init_map_token(&token);
502 btrfs_set_token_inode_size(dst_eb, dst_item, 510 btrfs_set_token_inode_size(dst_eb, dst_item,
@@ -3124,6 +3132,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3124 struct btrfs_root *root, struct inode *inode, 3132 struct btrfs_root *root, struct inode *inode,
3125 struct btrfs_path *path, 3133 struct btrfs_path *path,
3126 struct btrfs_path *dst_path, int key_type, 3134 struct btrfs_path *dst_path, int key_type,
3135 struct btrfs_log_ctx *ctx,
3127 u64 min_offset, u64 *last_offset_ret) 3136 u64 min_offset, u64 *last_offset_ret)
3128{ 3137{
3129 struct btrfs_key min_key; 3138 struct btrfs_key min_key;
@@ -3208,6 +3217,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3208 src = path->nodes[0]; 3217 src = path->nodes[0];
3209 nritems = btrfs_header_nritems(src); 3218 nritems = btrfs_header_nritems(src);
3210 for (i = path->slots[0]; i < nritems; i++) { 3219 for (i = path->slots[0]; i < nritems; i++) {
3220 struct btrfs_dir_item *di;
3221
3211 btrfs_item_key_to_cpu(src, &min_key, i); 3222 btrfs_item_key_to_cpu(src, &min_key, i);
3212 3223
3213 if (min_key.objectid != ino || min_key.type != key_type) 3224 if (min_key.objectid != ino || min_key.type != key_type)
@@ -3218,6 +3229,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3218 err = ret; 3229 err = ret;
3219 goto done; 3230 goto done;
3220 } 3231 }
3232
3233 /*
3234 * We must make sure that when we log a directory entry,
3235 * the corresponding inode, after log replay, has a
3236 * matching link count. For example:
3237 *
3238 * touch foo
3239 * mkdir mydir
3240 * sync
3241 * ln foo mydir/bar
3242 * xfs_io -c "fsync" mydir
3243 * <crash>
3244 * <mount fs and log replay>
3245 *
3246 * Would result in an fsync log that, when replayed, leaves
3247 * our file inode with a link count of 1, even though there
3248 * are two directory entries pointing to the same inode.
3249 * After removing one of the names, it would not be
3250 * possible to remove the other name, which always resulted
3251 * in stale file handle errors, and it would not be
3252 * possible to rmdir the parent directory, since
3253 * its i_size could never decrement to the value
3254 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3255 */
3256 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3257 btrfs_dir_item_key_to_cpu(src, di, &tmp);
3258 if (ctx &&
3259 (btrfs_dir_transid(src, di) == trans->transid ||
3260 btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3261 tmp.type != BTRFS_ROOT_ITEM_KEY)
3262 ctx->log_new_dentries = true;
3221 } 3263 }
3222 path->slots[0] = nritems; 3264 path->slots[0] = nritems;
3223 3265
@@ -3279,7 +3321,8 @@ done:
3279static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3321static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3280 struct btrfs_root *root, struct inode *inode, 3322 struct btrfs_root *root, struct inode *inode,
3281 struct btrfs_path *path, 3323 struct btrfs_path *path,
3282 struct btrfs_path *dst_path) 3324 struct btrfs_path *dst_path,
3325 struct btrfs_log_ctx *ctx)
3283{ 3326{
3284 u64 min_key; 3327 u64 min_key;
3285 u64 max_key; 3328 u64 max_key;
@@ -3291,7 +3334,7 @@ again:
3291 max_key = 0; 3334 max_key = 0;
3292 while (1) { 3335 while (1) {
3293 ret = log_dir_items(trans, root, inode, path, 3336 ret = log_dir_items(trans, root, inode, path,
3294 dst_path, key_type, min_key, 3337 dst_path, key_type, ctx, min_key,
3295 &max_key); 3338 &max_key);
3296 if (ret) 3339 if (ret)
3297 return ret; 3340 return ret;
@@ -4067,7 +4110,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
4067 if (ret < 0) { 4110 if (ret < 0) {
4068 return ret; 4111 return ret;
4069 } else if (ret > 0) { 4112 } else if (ret > 0) {
4070 *size_ret = i_size_read(inode); 4113 *size_ret = 0;
4071 } else { 4114 } else {
4072 struct btrfs_inode_item *item; 4115 struct btrfs_inode_item *item;
4073 4116
@@ -4374,15 +4417,18 @@ log_extents:
4374 } 4417 }
4375 4418
4376 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 4419 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
4377 ret = log_directory_changes(trans, root, inode, path, dst_path); 4420 ret = log_directory_changes(trans, root, inode, path, dst_path,
4421 ctx);
4378 if (ret) { 4422 if (ret) {
4379 err = ret; 4423 err = ret;
4380 goto out_unlock; 4424 goto out_unlock;
4381 } 4425 }
4382 } 4426 }
4383 4427
4428 spin_lock(&BTRFS_I(inode)->lock);
4384 BTRFS_I(inode)->logged_trans = trans->transid; 4429 BTRFS_I(inode)->logged_trans = trans->transid;
4385 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 4430 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
4431 spin_unlock(&BTRFS_I(inode)->lock);
4386out_unlock: 4432out_unlock:
4387 if (unlikely(err)) 4433 if (unlikely(err))
4388 btrfs_put_logged_extents(&logged_list); 4434 btrfs_put_logged_extents(&logged_list);
@@ -4469,6 +4515,181 @@ out:
4469 return ret; 4515 return ret;
4470} 4516}
4471 4517
4518struct btrfs_dir_list { /* work-queue entry used by log_new_dir_dentries() */
4519 u64 ino; /* inode number of a directory whose logged entries still have to be scanned */
4520 struct list_head list; /* links this entry into the function-local dir_list queue */
4521};
4522
4523/*
4524 * Log the inodes of the new dentries of a directory. See log_dir_items() for
4525 * details about why it is needed.
4526 * This is a recursive operation - if an existing dentry corresponds to a
4527 * directory, that directory's new entries are logged too (same behaviour as
4528 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
4529 * the dentries point to we do not lock their i_mutex, otherwise lockdep
4530 * complains about the following circular lock dependency / possible deadlock:
4531 *
4532 * CPU0 CPU1
4533 * ---- ----
4534 * lock(&type->i_mutex_dir_key#3/2);
4535 * lock(sb_internal#2);
4536 * lock(&type->i_mutex_dir_key#3/2);
4537 * lock(&sb->s_type->i_mutex_key#14);
4538 *
4539 * Where sb_internal is the lock (a counter that works as a lock) acquired by
4540 * sb_start_intwrite() in btrfs_start_transaction().
4541 * Not locking i_mutex of the inodes is still safe because:
4542 *
4543 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
4544 * that while logging the inode new references (names) are added or removed
4545 * from the inode, leaving the logged inode item with a link count that does
4546 * not match the number of logged inode reference items. This is fine because
4547 * at log replay time we compute the real number of links and correct the
4548 * link count in the inode item (see replay_one_buffer() and
4549 * link_to_fixup_dir());
4550 *
4551 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
4552 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
4553 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
4554 * has a size that doesn't match the sum of the lengths of all the logged
4555 * names. This does not result in a problem because if a dir_item key is
4556 * logged but its matching dir_index key is not logged, at log replay time we
4557 * don't use it to replay the respective name (see replay_one_name()). On the
4558 * other hand if only the dir_index key ends up being logged, the respective
4559 * name is added to the fs/subvol tree with both the dir_item and dir_index
4560 * keys created (see replay_one_name()).
4561 * The directory's inode item with a wrong i_size is not a problem as well,
4562 * since we don't use it at log replay time to set the i_size in the inode
4563 * item of the fs/subvol tree (see overwrite_item()).
 *
 * Implementation note: the recursion is carried out iteratively. Directory
 * inode numbers to visit are queued on a local list (struct btrfs_dir_list),
 * seeded with @start_inode; a directory is appended to the queue whenever
 * logging it leaves ctx->log_new_dentries set.
 *
 * @ctx is dereferenced unconditionally, so callers must pass a non-NULL
 * context.
 *
 * Returns 0 on success. On failure returns -ENOMEM (path or queue element
 * allocation) or the first non-zero value reported by btrfs_search_forward(),
 * btrfs_next_leaf(), btrfs_iget() or btrfs_log_inode(). Once an error is
 * recorded, the remaining queue entries are only freed, not processed.
4564 */
4565static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
4566 struct btrfs_root *root,
4567 struct inode *start_inode,
4568 struct btrfs_log_ctx *ctx)
4569{
4570 struct btrfs_root *log = root->log_root; /* all searches below run against the log tree */
4571 struct btrfs_path *path;
4572 LIST_HEAD(dir_list); /* FIFO of directories still to be scanned */
4573 struct btrfs_dir_list *dir_elem;
4574 int ret = 0;
4575
4576 path = btrfs_alloc_path();
4577 if (!path)
4578 return -ENOMEM;
4579
4580 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
4581 if (!dir_elem) {
4582 btrfs_free_path(path);
4583 return -ENOMEM;
4584 }
4585 dir_elem->ino = btrfs_ino(start_inode); /* seed the queue with the starting directory */
4586 list_add_tail(&dir_elem->list, &dir_list);
4587
4588 while (!list_empty(&dir_list)) {
4589 struct extent_buffer *leaf;
4590 struct btrfs_key min_key;
4591 int nritems;
4592 int i;
4593
4594 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
4595 list);
4596 if (ret) /* an earlier error is pending: only drain and free the queue */
4597 goto next_dir_inode;
4598
4599 min_key.objectid = dir_elem->ino; /* start at the first possible dir item key of this directory */
4600 min_key.type = BTRFS_DIR_ITEM_KEY;
4601 min_key.offset = 0;
4602again:
4603 btrfs_release_path(path);
4604 ret = btrfs_search_forward(log, &min_key, path, trans->transid); /* presumably limits the search to items from this transaction - confirm against btrfs_search_forward() */
4605 if (ret < 0) {
4606 goto next_dir_inode;
4607 } else if (ret > 0) { /* a positive return is treated as "nothing (more) to process", not as an error */
4608 ret = 0;
4609 goto next_dir_inode;
4610 }
4611
4612process_leaf:
4613 leaf = path->nodes[0];
4614 nritems = btrfs_header_nritems(leaf);
4615 for (i = path->slots[0]; i < nritems; i++) {
4616 struct btrfs_dir_item *di;
4617 struct btrfs_key di_key;
4618 struct inode *di_inode;
4619 struct btrfs_dir_list *new_dir_elem;
4620 int log_mode = LOG_INODE_EXISTS; /* default mode; upgraded to LOG_INODE_ALL for directories below */
4621 int type;
4622
4623 btrfs_item_key_to_cpu(leaf, &min_key, i); /* min_key also records the position used when restarting at 'again' */
4624 if (min_key.objectid != dir_elem->ino ||
4625 min_key.type != BTRFS_DIR_ITEM_KEY)
4626 goto next_dir_inode; /* item belongs to another inode or key type: this directory is done */
4627
4628 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
4629 type = btrfs_dir_type(leaf, di);
4630 if (btrfs_dir_transid(leaf, di) < trans->transid &&
4631 type != BTRFS_FT_DIR)
4632 continue; /* non-directory entry with a transid older than this transaction: skip */
4633 btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
4634 if (di_key.type == BTRFS_ROOT_ITEM_KEY)
4635 continue; /* entries whose location key is a root item are skipped */
4636
4637 di_inode = btrfs_iget(root->fs_info->sb, &di_key, /* NOTE(review): called while 'path' still references the log leaf - confirm this cannot block on tree locks */
4638 root, NULL);
4639 if (IS_ERR(di_inode)) {
4640 ret = PTR_ERR(di_inode);
4641 goto next_dir_inode;
4642 }
4643
4644 if (btrfs_inode_in_log(di_inode, trans->transid)) { /* already in the log for this transaction: drop our reference and move on */
4645 iput(di_inode);
4646 continue;
4647 }
4648
4649 ctx->log_new_dentries = false; /* cleared so the check after btrfs_log_inode() reflects only this inode (log_dir_items() sets the flag) */
4650 if (type == BTRFS_FT_DIR)
4651 log_mode = LOG_INODE_ALL;
4652 btrfs_release_path(path); /* 'leaf' must not be used past this point; the scan restarts from min_key */
4653 ret = btrfs_log_inode(trans, root, di_inode,
4654 log_mode, 0, LLONG_MAX, ctx);
4655 iput(di_inode);
4656 if (ret)
4657 goto next_dir_inode;
4658 if (ctx->log_new_dentries) { /* logging this inode flagged new dentries of its own: queue it for a later pass */
4659 new_dir_elem = kmalloc(sizeof(*new_dir_elem),
4660 GFP_NOFS);
4661 if (!new_dir_elem) {
4662 ret = -ENOMEM;
4663 goto next_dir_inode;
4664 }
4665 new_dir_elem->ino = di_key.objectid;
4666 list_add_tail(&new_dir_elem->list, &dir_list);
4667 }
4668 break; /* path was released above, so leave the leaf loop with i < nritems and re-search */
4669 }
4670 if (i == nritems) { /* the loop ran off the end of the leaf without logging anything: continue in the next leaf */
4671 ret = btrfs_next_leaf(log, path);
4672 if (ret < 0) {
4673 goto next_dir_inode;
4674 } else if (ret > 0) { /* positive return: treated as no further leaf, not an error */
4675 ret = 0;
4676 goto next_dir_inode;
4677 }
4678 goto process_leaf;
4679 }
4680 if (min_key.offset < (u64)-1) { /* step past the key just handled; the guard avoids wrapping the offset to 0 */
4681 min_key.offset++;
4682 goto again;
4683 }
4684next_dir_inode:
4685 list_del(&dir_elem->list); /* dequeue and free the directory just handled (or skipped because of an error) */
4686 kfree(dir_elem);
4687 }
4688
4689 btrfs_free_path(path);
4690 return ret;
4691}
4692
4472/* 4693/*
4473 * helper function around btrfs_log_inode to make sure newly created 4694 * helper function around btrfs_log_inode to make sure newly created
4474 * parent directories also end up in the log. A minimal inode and backref 4695 * parent directories also end up in the log. A minimal inode and backref
@@ -4491,6 +4712,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4491 const struct dentry * const first_parent = parent; 4712 const struct dentry * const first_parent = parent;
4492 const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > 4713 const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
4493 last_committed); 4714 last_committed);
4715 bool log_dentries = false;
4716 struct inode *orig_inode = inode;
4494 4717
4495 sb = inode->i_sb; 4718 sb = inode->i_sb;
4496 4719
@@ -4546,6 +4769,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4546 goto end_trans; 4769 goto end_trans;
4547 } 4770 }
4548 4771
4772 if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
4773 log_dentries = true;
4774
4549 while (1) { 4775 while (1) {
4550 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 4776 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
4551 break; 4777 break;
@@ -4582,7 +4808,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4582 dput(old_parent); 4808 dput(old_parent);
4583 old_parent = parent; 4809 old_parent = parent;
4584 } 4810 }
4585 ret = 0; 4811 if (log_dentries)
4812 ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
4813 else
4814 ret = 0;
4586end_trans: 4815end_trans:
4587 dput(old_parent); 4816 dput(old_parent);
4588 if (ret < 0) { 4817 if (ret < 0) {