about summary refs log tree commit diff stats
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2016-05-21 13:49:22 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2016-05-21 13:49:22 -0400
commit 07be1337b9e8bfcd855c6e9175b5066a30ac609b (patch)
tree e40ad01dc89f6eb17d461939b809fea3387fc2a5 /fs/btrfs/inode.c
parent 63d222b9d277c4d7bf08afd1631a7f8e327a825c (diff)
parent c315ef8d9db7f1a0ebd023a395ebdfde1c68057e (diff)
Merge branch 'for-linus-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs updates from Chris Mason:
 "This has our merge window series of cleanups and fixes. These target
  a wide range of issues, but do include some important fixes for
  qgroups, O_DIRECT, and fsync handling. Jeff Mahoney moved around a
  few definitions to make them easier for userland to consume.

  Also whiteout support is included now that issues with overlayfs have
  been cleared up.

  I have one more fix pending for page faults during
  btrfs_copy_from_user, but I wanted to get this bulk out the door
  first"

* 'for-linus-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (90 commits)
  btrfs: fix memory leak during RAID 5/6 device replacement
  Btrfs: add semaphore to synchronize direct IO writes with fsync
  Btrfs: fix race between block group relocation and nocow writes
  Btrfs: fix race between fsync and direct IO writes for prealloc extents
  Btrfs: fix number of transaction units for renames with whiteout
  Btrfs: pin logs earlier when doing a rename exchange operation
  Btrfs: unpin logs if rename exchange operation fails
  Btrfs: fix inode leak on failure to setup whiteout inode in rename
  btrfs: add support for RENAME_EXCHANGE and RENAME_WHITEOUT
  Btrfs: pin log earlier when renaming
  Btrfs: unpin log if rename operation fails
  Btrfs: don't do unnecessary delalloc flushes when relocating
  Btrfs: don't wait for unrelated IO to finish before relocation
  Btrfs: fix empty symlink after creating symlink and fsync parent dir
  Btrfs: fix for incorrect directory entries after fsync log replay
  btrfs: build fixup for qgroup_account_snapshot
  btrfs: qgroup: Fix qgroup accounting when creating snapshot
  Btrfs: fix fspath error deallocation
  btrfs: make find_workspace warn if there are no workspaces
  btrfs: make find_workspace always succeed
  ...
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- fs/btrfs/inode.c 466
1 files changed, 400 insertions, 66 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6b7fe291a174..91419ef79b00 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -824,6 +824,7 @@ retry:
824 async_extent->ram_size - 1, 0); 824 async_extent->ram_size - 1, 0);
825 goto out_free_reserve; 825 goto out_free_reserve;
826 } 826 }
827 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
827 828
828 /* 829 /*
829 * clear dirty, set writeback and unlock the pages. 830 * clear dirty, set writeback and unlock the pages.
@@ -861,6 +862,7 @@ retry:
861 } 862 }
862 return; 863 return;
863out_free_reserve: 864out_free_reserve:
865 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
864 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 866 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
865out_free: 867out_free:
866 extent_clear_unlock_delalloc(inode, async_extent->start, 868 extent_clear_unlock_delalloc(inode, async_extent->start,
@@ -1038,6 +1040,8 @@ static noinline int cow_file_range(struct inode *inode,
1038 goto out_drop_extent_cache; 1040 goto out_drop_extent_cache;
1039 } 1041 }
1040 1042
1043 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
1044
1041 if (disk_num_bytes < cur_alloc_size) 1045 if (disk_num_bytes < cur_alloc_size)
1042 break; 1046 break;
1043 1047
@@ -1066,6 +1070,7 @@ out:
1066out_drop_extent_cache: 1070out_drop_extent_cache:
1067 btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); 1071 btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
1068out_reserve: 1072out_reserve:
1073 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
1069 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 1074 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
1070out_unlock: 1075out_unlock:
1071 extent_clear_unlock_delalloc(inode, start, end, locked_page, 1076 extent_clear_unlock_delalloc(inode, start, end, locked_page,
@@ -1377,6 +1382,9 @@ next_slot:
1377 */ 1382 */
1378 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 1383 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1379 goto out_check; 1384 goto out_check;
1385 if (!btrfs_inc_nocow_writers(root->fs_info,
1386 disk_bytenr))
1387 goto out_check;
1380 nocow = 1; 1388 nocow = 1;
1381 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 1389 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1382 extent_end = found_key.offset + 1390 extent_end = found_key.offset +
@@ -1391,6 +1399,9 @@ out_check:
1391 path->slots[0]++; 1399 path->slots[0]++;
1392 if (!nolock && nocow) 1400 if (!nolock && nocow)
1393 btrfs_end_write_no_snapshoting(root); 1401 btrfs_end_write_no_snapshoting(root);
1402 if (nocow)
1403 btrfs_dec_nocow_writers(root->fs_info,
1404 disk_bytenr);
1394 goto next_slot; 1405 goto next_slot;
1395 } 1406 }
1396 if (!nocow) { 1407 if (!nocow) {
@@ -1411,6 +1422,9 @@ out_check:
1411 if (ret) { 1422 if (ret) {
1412 if (!nolock && nocow) 1423 if (!nolock && nocow)
1413 btrfs_end_write_no_snapshoting(root); 1424 btrfs_end_write_no_snapshoting(root);
1425 if (nocow)
1426 btrfs_dec_nocow_writers(root->fs_info,
1427 disk_bytenr);
1414 goto error; 1428 goto error;
1415 } 1429 }
1416 cow_start = (u64)-1; 1430 cow_start = (u64)-1;
@@ -1453,6 +1467,8 @@ out_check:
1453 1467
1454 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, 1468 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1455 num_bytes, num_bytes, type); 1469 num_bytes, num_bytes, type);
1470 if (nocow)
1471 btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
1456 BUG_ON(ret); /* -ENOMEM */ 1472 BUG_ON(ret); /* -ENOMEM */
1457 1473
1458 if (root->root_key.objectid == 1474 if (root->root_key.objectid ==
@@ -7129,6 +7145,43 @@ out:
7129 return em; 7145 return em;
7130} 7146}
7131 7147
7148static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
7149 const u64 start,
7150 const u64 len,
7151 const u64 orig_start,
7152 const u64 block_start,
7153 const u64 block_len,
7154 const u64 orig_block_len,
7155 const u64 ram_bytes,
7156 const int type)
7157{
7158 struct extent_map *em = NULL;
7159 int ret;
7160
7161 down_read(&BTRFS_I(inode)->dio_sem);
7162 if (type != BTRFS_ORDERED_NOCOW) {
7163 em = create_pinned_em(inode, start, len, orig_start,
7164 block_start, block_len, orig_block_len,
7165 ram_bytes, type);
7166 if (IS_ERR(em))
7167 goto out;
7168 }
7169 ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
7170 len, block_len, type);
7171 if (ret) {
7172 if (em) {
7173 free_extent_map(em);
7174 btrfs_drop_extent_cache(inode, start,
7175 start + len - 1, 0);
7176 }
7177 em = ERR_PTR(ret);
7178 }
7179 out:
7180 up_read(&BTRFS_I(inode)->dio_sem);
7181
7182 return em;
7183}
7184
7132static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 7185static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
7133 u64 start, u64 len) 7186 u64 start, u64 len)
7134{ 7187{
@@ -7144,41 +7197,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
7144 if (ret) 7197 if (ret)
7145 return ERR_PTR(ret); 7198 return ERR_PTR(ret);
7146 7199
7147 /* 7200 em = btrfs_create_dio_extent(inode, start, ins.offset, start,
7148 * Create the ordered extent before the extent map. This is to avoid 7201 ins.objectid, ins.offset, ins.offset,
7149 * races with the fast fsync path that would lead to it logging file 7202 ins.offset, 0);
7150 * extent items that point to disk extents that were not yet written to. 7203 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
7151 * The fast fsync path collects ordered extents into a local list and 7204 if (IS_ERR(em))
7152 * then collects all the new extent maps, so we must create the ordered
7153 * extent first and make sure the fast fsync path collects any new
7154 * ordered extents after collecting new extent maps as well.
7155 * The fsync path simply can not rely on inode_dio_wait() because it
7156 * causes deadlock with AIO.
7157 */
7158 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
7159 ins.offset, ins.offset, 0);
7160 if (ret) {
7161 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 7205 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
7162 return ERR_PTR(ret);
7163 }
7164
7165 em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
7166 ins.offset, ins.offset, ins.offset, 0);
7167 if (IS_ERR(em)) {
7168 struct btrfs_ordered_extent *oe;
7169 7206
7170 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
7171 oe = btrfs_lookup_ordered_extent(inode, start);
7172 ASSERT(oe);
7173 if (WARN_ON(!oe))
7174 return em;
7175 set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
7176 set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
7177 btrfs_remove_ordered_extent(inode, oe);
7178 /* Once for our lookup and once for the ordered extents tree. */
7179 btrfs_put_ordered_extent(oe);
7180 btrfs_put_ordered_extent(oe);
7181 }
7182 return em; 7207 return em;
7183} 7208}
7184 7209
@@ -7650,24 +7675,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7650 block_start = em->block_start + (start - em->start); 7675 block_start = em->block_start + (start - em->start);
7651 7676
7652 if (can_nocow_extent(inode, start, &len, &orig_start, 7677 if (can_nocow_extent(inode, start, &len, &orig_start,
7653 &orig_block_len, &ram_bytes) == 1) { 7678 &orig_block_len, &ram_bytes) == 1 &&
7679 btrfs_inc_nocow_writers(root->fs_info, block_start)) {
7680 struct extent_map *em2;
7681
7682 em2 = btrfs_create_dio_extent(inode, start, len,
7683 orig_start, block_start,
7684 len, orig_block_len,
7685 ram_bytes, type);
7686 btrfs_dec_nocow_writers(root->fs_info, block_start);
7654 if (type == BTRFS_ORDERED_PREALLOC) { 7687 if (type == BTRFS_ORDERED_PREALLOC) {
7655 free_extent_map(em); 7688 free_extent_map(em);
7656 em = create_pinned_em(inode, start, len, 7689 em = em2;
7657 orig_start,
7658 block_start, len,
7659 orig_block_len,
7660 ram_bytes, type);
7661 if (IS_ERR(em)) {
7662 ret = PTR_ERR(em);
7663 goto unlock_err;
7664 }
7665 } 7690 }
7666 7691 if (em2 && IS_ERR(em2)) {
7667 ret = btrfs_add_ordered_extent_dio(inode, start, 7692 ret = PTR_ERR(em2);
7668 block_start, len, len, type);
7669 if (ret) {
7670 free_extent_map(em);
7671 goto unlock_err; 7693 goto unlock_err;
7672 } 7694 }
7673 goto unlock; 7695 goto unlock;
@@ -9230,6 +9252,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
9230 INIT_LIST_HEAD(&ei->delalloc_inodes); 9252 INIT_LIST_HEAD(&ei->delalloc_inodes);
9231 INIT_LIST_HEAD(&ei->delayed_iput); 9253 INIT_LIST_HEAD(&ei->delayed_iput);
9232 RB_CLEAR_NODE(&ei->rb_node); 9254 RB_CLEAR_NODE(&ei->rb_node);
9255 init_rwsem(&ei->dio_sem);
9233 9256
9234 return inode; 9257 return inode;
9235} 9258}
@@ -9387,10 +9410,281 @@ static int btrfs_getattr(struct vfsmount *mnt,
9387 return 0; 9410 return 0;
9388} 9411}
9389 9412
9413static int btrfs_rename_exchange(struct inode *old_dir,
9414 struct dentry *old_dentry,
9415 struct inode *new_dir,
9416 struct dentry *new_dentry)
9417{
9418 struct btrfs_trans_handle *trans;
9419 struct btrfs_root *root = BTRFS_I(old_dir)->root;
9420 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9421 struct inode *new_inode = new_dentry->d_inode;
9422 struct inode *old_inode = old_dentry->d_inode;
9423 struct timespec ctime = CURRENT_TIME;
9424 struct dentry *parent;
9425 u64 old_ino = btrfs_ino(old_inode);
9426 u64 new_ino = btrfs_ino(new_inode);
9427 u64 old_idx = 0;
9428 u64 new_idx = 0;
9429 u64 root_objectid;
9430 int ret;
9431 bool root_log_pinned = false;
9432 bool dest_log_pinned = false;
9433
9434 /* we only allow rename subvolume link between subvolumes */
9435 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9436 return -EXDEV;
9437
9438 /* close the race window with snapshot create/destroy ioctl */
9439 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9440 down_read(&root->fs_info->subvol_sem);
9441 if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
9442 down_read(&dest->fs_info->subvol_sem);
9443
9444 /*
9445 * We want to reserve the absolute worst case amount of items. So if
9446 * both inodes are subvols and we need to unlink them then that would
9447 * require 4 item modifications, but if they are both normal inodes it
9448 * would require 5 item modifications, so we'll assume they are normal
9449 * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items
9450 * should cover the worst case number of items we'll modify.
9451 */
9452 trans = btrfs_start_transaction(root, 12);
9453 if (IS_ERR(trans)) {
9454 ret = PTR_ERR(trans);
9455 goto out_notrans;
9456 }
9457
9458 /*
9459 * We need to find a free sequence number both in the source and
9460 * in the destination directory for the exchange.
9461 */
9462 ret = btrfs_set_inode_index(new_dir, &old_idx);
9463 if (ret)
9464 goto out_fail;
9465 ret = btrfs_set_inode_index(old_dir, &new_idx);
9466 if (ret)
9467 goto out_fail;
9468
9469 BTRFS_I(old_inode)->dir_index = 0ULL;
9470 BTRFS_I(new_inode)->dir_index = 0ULL;
9471
9472 /* Reference for the source. */
9473 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9474 /* force full log commit if subvolume involved. */
9475 btrfs_set_log_full_commit(root->fs_info, trans);
9476 } else {
9477 btrfs_pin_log_trans(root);
9478 root_log_pinned = true;
9479 ret = btrfs_insert_inode_ref(trans, dest,
9480 new_dentry->d_name.name,
9481 new_dentry->d_name.len,
9482 old_ino,
9483 btrfs_ino(new_dir), old_idx);
9484 if (ret)
9485 goto out_fail;
9486 }
9487
9488 /* And now for the dest. */
9489 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9490 /* force full log commit if subvolume involved. */
9491 btrfs_set_log_full_commit(dest->fs_info, trans);
9492 } else {
9493 btrfs_pin_log_trans(dest);
9494 dest_log_pinned = true;
9495 ret = btrfs_insert_inode_ref(trans, root,
9496 old_dentry->d_name.name,
9497 old_dentry->d_name.len,
9498 new_ino,
9499 btrfs_ino(old_dir), new_idx);
9500 if (ret)
9501 goto out_fail;
9502 }
9503
9504 /* Update inode version and ctime/mtime. */
9505 inode_inc_iversion(old_dir);
9506 inode_inc_iversion(new_dir);
9507 inode_inc_iversion(old_inode);
9508 inode_inc_iversion(new_inode);
9509 old_dir->i_ctime = old_dir->i_mtime = ctime;
9510 new_dir->i_ctime = new_dir->i_mtime = ctime;
9511 old_inode->i_ctime = ctime;
9512 new_inode->i_ctime = ctime;
9513
9514 if (old_dentry->d_parent != new_dentry->d_parent) {
9515 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
9516 btrfs_record_unlink_dir(trans, new_dir, new_inode, 1);
9517 }
9518
9519 /* src is a subvolume */
9520 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9521 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
9522 ret = btrfs_unlink_subvol(trans, root, old_dir,
9523 root_objectid,
9524 old_dentry->d_name.name,
9525 old_dentry->d_name.len);
9526 } else { /* src is an inode */
9527 ret = __btrfs_unlink_inode(trans, root, old_dir,
9528 old_dentry->d_inode,
9529 old_dentry->d_name.name,
9530 old_dentry->d_name.len);
9531 if (!ret)
9532 ret = btrfs_update_inode(trans, root, old_inode);
9533 }
9534 if (ret) {
9535 btrfs_abort_transaction(trans, root, ret);
9536 goto out_fail;
9537 }
9538
9539 /* dest is a subvolume */
9540 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9541 root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
9542 ret = btrfs_unlink_subvol(trans, dest, new_dir,
9543 root_objectid,
9544 new_dentry->d_name.name,
9545 new_dentry->d_name.len);
9546 } else { /* dest is an inode */
9547 ret = __btrfs_unlink_inode(trans, dest, new_dir,
9548 new_dentry->d_inode,
9549 new_dentry->d_name.name,
9550 new_dentry->d_name.len);
9551 if (!ret)
9552 ret = btrfs_update_inode(trans, dest, new_inode);
9553 }
9554 if (ret) {
9555 btrfs_abort_transaction(trans, root, ret);
9556 goto out_fail;
9557 }
9558
9559 ret = btrfs_add_link(trans, new_dir, old_inode,
9560 new_dentry->d_name.name,
9561 new_dentry->d_name.len, 0, old_idx);
9562 if (ret) {
9563 btrfs_abort_transaction(trans, root, ret);
9564 goto out_fail;
9565 }
9566
9567 ret = btrfs_add_link(trans, old_dir, new_inode,
9568 old_dentry->d_name.name,
9569 old_dentry->d_name.len, 0, new_idx);
9570 if (ret) {
9571 btrfs_abort_transaction(trans, root, ret);
9572 goto out_fail;
9573 }
9574
9575 if (old_inode->i_nlink == 1)
9576 BTRFS_I(old_inode)->dir_index = old_idx;
9577 if (new_inode->i_nlink == 1)
9578 BTRFS_I(new_inode)->dir_index = new_idx;
9579
9580 if (root_log_pinned) {
9581 parent = new_dentry->d_parent;
9582 btrfs_log_new_name(trans, old_inode, old_dir, parent);
9583 btrfs_end_log_trans(root);
9584 root_log_pinned = false;
9585 }
9586 if (dest_log_pinned) {
9587 parent = old_dentry->d_parent;
9588 btrfs_log_new_name(trans, new_inode, new_dir, parent);
9589 btrfs_end_log_trans(dest);
9590 dest_log_pinned = false;
9591 }
9592out_fail:
9593 /*
9594 * If we have pinned a log and an error happened, we unpin tasks
9595 * trying to sync the log and force them to fallback to a transaction
9596 * commit if the log currently contains any of the inodes involved in
9597 * this rename operation (to ensure we do not persist a log with an
9598 * inconsistent state for any of these inodes or leading to any
9599 * inconsistencies when replayed). If the transaction was aborted, the
9600 * abortion reason is propagated to userspace when attempting to commit
9601 * the transaction. If the log does not contain any of these inodes, we
9602 * allow the tasks to sync it.
9603 */
9604 if (ret && (root_log_pinned || dest_log_pinned)) {
9605 if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
9606 btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
9607 btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
9608 (new_inode &&
9609 btrfs_inode_in_log(new_inode, root->fs_info->generation)))
9610 btrfs_set_log_full_commit(root->fs_info, trans);
9611
9612 if (root_log_pinned) {
9613 btrfs_end_log_trans(root);
9614 root_log_pinned = false;
9615 }
9616 if (dest_log_pinned) {
9617 btrfs_end_log_trans(dest);
9618 dest_log_pinned = false;
9619 }
9620 }
9621 ret = btrfs_end_transaction(trans, root);
9622out_notrans:
9623 if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
9624 up_read(&dest->fs_info->subvol_sem);
9625 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9626 up_read(&root->fs_info->subvol_sem);
9627
9628 return ret;
9629}
9630
9631static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
9632 struct btrfs_root *root,
9633 struct inode *dir,
9634 struct dentry *dentry)
9635{
9636 int ret;
9637 struct inode *inode;
9638 u64 objectid;
9639 u64 index;
9640
9641 ret = btrfs_find_free_ino(root, &objectid);
9642 if (ret)
9643 return ret;
9644
9645 inode = btrfs_new_inode(trans, root, dir,
9646 dentry->d_name.name,
9647 dentry->d_name.len,
9648 btrfs_ino(dir),
9649 objectid,
9650 S_IFCHR | WHITEOUT_MODE,
9651 &index);
9652
9653 if (IS_ERR(inode)) {
9654 ret = PTR_ERR(inode);
9655 return ret;
9656 }
9657
9658 inode->i_op = &btrfs_special_inode_operations;
9659 init_special_inode(inode, inode->i_mode,
9660 WHITEOUT_DEV);
9661
9662 ret = btrfs_init_inode_security(trans, inode, dir,
9663 &dentry->d_name);
9664 if (ret)
9665 goto out;
9666
9667 ret = btrfs_add_nondir(trans, dir, dentry,
9668 inode, 0, index);
9669 if (ret)
9670 goto out;
9671
9672 ret = btrfs_update_inode(trans, root, inode);
9673out:
9674 unlock_new_inode(inode);
9675 if (ret)
9676 inode_dec_link_count(inode);
9677 iput(inode);
9678
9679 return ret;
9680}
9681
9390static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 9682static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9391 struct inode *new_dir, struct dentry *new_dentry) 9683 struct inode *new_dir, struct dentry *new_dentry,
9684 unsigned int flags)
9392{ 9685{
9393 struct btrfs_trans_handle *trans; 9686 struct btrfs_trans_handle *trans;
9687 unsigned int trans_num_items;
9394 struct btrfs_root *root = BTRFS_I(old_dir)->root; 9688 struct btrfs_root *root = BTRFS_I(old_dir)->root;
9395 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 9689 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9396 struct inode *new_inode = d_inode(new_dentry); 9690 struct inode *new_inode = d_inode(new_dentry);
@@ -9399,6 +9693,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9399 u64 root_objectid; 9693 u64 root_objectid;
9400 int ret; 9694 int ret;
9401 u64 old_ino = btrfs_ino(old_inode); 9695 u64 old_ino = btrfs_ino(old_inode);
9696 bool log_pinned = false;
9402 9697
9403 if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 9698 if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9404 return -EPERM; 9699 return -EPERM;
@@ -9449,15 +9744,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9449 * We want to reserve the absolute worst case amount of items. So if 9744 * We want to reserve the absolute worst case amount of items. So if
9450 * both inodes are subvols and we need to unlink them then that would 9745 * both inodes are subvols and we need to unlink them then that would
9451 * require 4 item modifications, but if they are both normal inodes it 9746 * require 4 item modifications, but if they are both normal inodes it
9452 * would require 5 item modifications, so we'll assume their normal 9747 * would require 5 item modifications, so we'll assume they are normal
9453 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items 9748 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
9454 * should cover the worst case number of items we'll modify. 9749 * should cover the worst case number of items we'll modify.
9750 * If our rename has the whiteout flag, we need 5 more units for the
9751 * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
9752 * when selinux is enabled).
9455 */ 9753 */
9456 trans = btrfs_start_transaction(root, 11); 9754 trans_num_items = 11;
9755 if (flags & RENAME_WHITEOUT)
9756 trans_num_items += 5;
9757 trans = btrfs_start_transaction(root, trans_num_items);
9457 if (IS_ERR(trans)) { 9758 if (IS_ERR(trans)) {
9458 ret = PTR_ERR(trans); 9759 ret = PTR_ERR(trans);
9459 goto out_notrans; 9760 goto out_notrans;
9460 } 9761 }
9461 9762
9462 if (dest != root) 9763 if (dest != root)
9463 btrfs_record_root_in_trans(trans, dest); 9764 btrfs_record_root_in_trans(trans, dest);
@@ -9471,6 +9772,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9471 /* force full log commit if subvolume involved. */ 9772 /* force full log commit if subvolume involved. */
9472 btrfs_set_log_full_commit(root->fs_info, trans); 9773 btrfs_set_log_full_commit(root->fs_info, trans);
9473 } else { 9774 } else {
9775 btrfs_pin_log_trans(root);
9776 log_pinned = true;
9474 ret = btrfs_insert_inode_ref(trans, dest, 9777 ret = btrfs_insert_inode_ref(trans, dest,
9475 new_dentry->d_name.name, 9778 new_dentry->d_name.name,
9476 new_dentry->d_name.len, 9779 new_dentry->d_name.len,
@@ -9478,14 +9781,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9478 btrfs_ino(new_dir), index); 9781 btrfs_ino(new_dir), index);
9479 if (ret) 9782 if (ret)
9480 goto out_fail; 9783 goto out_fail;
9481 /*
9482 * this is an ugly little race, but the rename is required
9483 * to make sure that if we crash, the inode is either at the
9484 * old name or the new one. pinning the log transaction lets
9485 * us make sure we don't allow a log commit to come in after
9486 * we unlink the name but before we add the new name back in.
9487 */
9488 btrfs_pin_log_trans(root);
9489 } 9784 }
9490 9785
9491 inode_inc_iversion(old_dir); 9786 inode_inc_iversion(old_dir);
@@ -9552,12 +9847,46 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9552 if (old_inode->i_nlink == 1) 9847 if (old_inode->i_nlink == 1)
9553 BTRFS_I(old_inode)->dir_index = index; 9848 BTRFS_I(old_inode)->dir_index = index;
9554 9849
9555 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { 9850 if (log_pinned) {
9556 struct dentry *parent = new_dentry->d_parent; 9851 struct dentry *parent = new_dentry->d_parent;
9852
9557 btrfs_log_new_name(trans, old_inode, old_dir, parent); 9853 btrfs_log_new_name(trans, old_inode, old_dir, parent);
9558 btrfs_end_log_trans(root); 9854 btrfs_end_log_trans(root);
9855 log_pinned = false;
9856 }
9857
9858 if (flags & RENAME_WHITEOUT) {
9859 ret = btrfs_whiteout_for_rename(trans, root, old_dir,
9860 old_dentry);
9861
9862 if (ret) {
9863 btrfs_abort_transaction(trans, root, ret);
9864 goto out_fail;
9865 }
9559 } 9866 }
9560out_fail: 9867out_fail:
9868 /*
9869 * If we have pinned the log and an error happened, we unpin tasks
9870 * trying to sync the log and force them to fallback to a transaction
9871 * commit if the log currently contains any of the inodes involved in
9872 * this rename operation (to ensure we do not persist a log with an
9873 * inconsistent state for any of these inodes or leading to any
9874 * inconsistencies when replayed). If the transaction was aborted, the
9875 * abortion reason is propagated to userspace when attempting to commit
9876 * the transaction. If the log does not contain any of these inodes, we
9877 * allow the tasks to sync it.
9878 */
9879 if (ret && log_pinned) {
9880 if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
9881 btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
9882 btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
9883 (new_inode &&
9884 btrfs_inode_in_log(new_inode, root->fs_info->generation)))
9885 btrfs_set_log_full_commit(root->fs_info, trans);
9886
9887 btrfs_end_log_trans(root);
9888 log_pinned = false;
9889 }
9561 btrfs_end_transaction(trans, root); 9890 btrfs_end_transaction(trans, root);
9562out_notrans: 9891out_notrans:
9563 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 9892 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -9570,10 +9899,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
9570 struct inode *new_dir, struct dentry *new_dentry, 9899 struct inode *new_dir, struct dentry *new_dentry,
9571 unsigned int flags) 9900 unsigned int flags)
9572{ 9901{
9573 if (flags & ~RENAME_NOREPLACE) 9902 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
9574 return -EINVAL; 9903 return -EINVAL;
9575 9904
9576 return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry); 9905 if (flags & RENAME_EXCHANGE)
9906 return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
9907 new_dentry);
9908
9909 return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
9577} 9910}
9578 9911
9579static void btrfs_run_delalloc_work(struct btrfs_work *work) 9912static void btrfs_run_delalloc_work(struct btrfs_work *work)
@@ -9942,6 +10275,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9942 btrfs_end_transaction(trans, root); 10275 btrfs_end_transaction(trans, root);
9943 break; 10276 break;
9944 } 10277 }
10278 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
9945 10279
9946 last_alloc = ins.offset; 10280 last_alloc = ins.offset;
9947 ret = insert_reserved_file_extent(trans, inode, 10281 ret = insert_reserved_file_extent(trans, inode,
@@ -10184,7 +10518,7 @@ static const struct file_operations btrfs_dir_file_operations = {
10184 .iterate = btrfs_real_readdir, 10518 .iterate = btrfs_real_readdir,
10185 .unlocked_ioctl = btrfs_ioctl, 10519 .unlocked_ioctl = btrfs_ioctl,
10186#ifdef CONFIG_COMPAT 10520#ifdef CONFIG_COMPAT
10187 .compat_ioctl = btrfs_ioctl, 10521 .compat_ioctl = btrfs_compat_ioctl,
10188#endif 10522#endif
10189 .release = btrfs_release_file, 10523 .release = btrfs_release_file,
10190 .fsync = btrfs_sync_file, 10524 .fsync = btrfs_sync_file,