aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Filipe Manana <fdmanana@suse.com> 2014-10-29 07:57:59 -0400
committer Chris Mason <clm@fb.com> 2014-11-25 10:41:23 -0500
commit 9ea24bbe17a29f937e7f48e4b15fd52e89e9d386 (patch)
tree 283655f8865c6d4adc6f080151ceb5f951f56b23
parent e5fa8f865b3324aebd055e4054bf479cbab37e5a (diff)
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a file followed by a truncate, with both operations increasing the file's size, we can get a snapshot tree that reflects a state of the source subvolume's tree where the file truncation happened but the write operation didn't. This leaves a gap between 2 file extent items of the inode, which makes btrfs' fsck complain about it.

For example, if we perform the following file operations:

    $ mkfs.btrfs -f /dev/vdd
    $ mount /dev/vdd /mnt
    $ xfs_io -f \
          -c "pwrite -S 0xaa -b 32K 0 32K" \
          -c "fsync" \
          -c "pwrite -S 0xbb -b 32770 16K 32770" \
          -c "truncate 90123" \
          /mnt/foobar

and the snapshot creation ioctl was just called before the second write, we often can get the following inode items in the snapshot's btree:

    item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
        inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
    item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
        inode ref index 282 namelen 10 name: foobar
    item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
        extent data disk byte 1104855040 nr 32768
        extent data offset 0 nr 32768 ram 32768
        extent compression 0
    item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
        extent data disk byte 0 nr 0
        extent data offset 0 nr 40960 ram 40960
        extent compression 0

There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[ for which there's no file extent item covering it. This is because the file write and file truncate operations happened both right after the snapshot creation ioctl called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the ordered extent that matches the write and, in btrfs_setsize(), we were able to call btrfs_cont_expand() before being able to commit the current transaction in the snapshot creation ioctl.
So this made it possible to insert the hole file extent item in the source subvolume (which represents the region added by the truncate) right before the transaction commit from the snapshot creation ioctl. Btrfs' fsck tool complains about such cases with a message like the following: "root 331 inode 257 errors 100, file extent discount" From a user perspective, the expectation when a snapshot is created while those file operations are being performed is that the snapshot will have a file that either: 1) is empty 2) only the first write was captured 3) only the 2 writes were captured 4) both writes and the truncation were captured But never capture a state where only the first write and the truncation were captured (since the second write was performed before the truncation). A test case for xfstests follows. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
-rw-r--r-- fs/btrfs/ctree.h 4
-rw-r--r-- fs/btrfs/extent-tree.c 16
-rw-r--r-- fs/btrfs/file.c 10
-rw-r--r-- fs/btrfs/inode.c 47
-rw-r--r-- fs/btrfs/ioctl.c 7
5 files changed, 60 insertions, 24 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9918ba3ec2b2..fc73e86235e8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3480,8 +3480,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
3480int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 3480int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
3481 struct btrfs_fs_info *fs_info); 3481 struct btrfs_fs_info *fs_info);
3482int __get_raid_index(u64 flags); 3482int __get_raid_index(u64 flags);
3483int btrfs_start_nocow_write(struct btrfs_root *root); 3483int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
3484void btrfs_end_nocow_write(struct btrfs_root *root); 3484void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
3485/* ctree.c */ 3485/* ctree.c */
3486int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3486int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
3487 int level, int *slot); 3487 int level, int *slot);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5e81e3694d92..b4e3ab115f5f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9656,12 +9656,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
9656} 9656}
9657 9657
9658/* 9658/*
9659 * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), 9659 * btrfs_{start,end}_write_no_snapshoting() are similar to
9660 * they are used to prevent the some tasks writing data into the page cache 9660 * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
9661 * by nocow before the subvolume is snapshoted, but flush the data into 9661 * data into the page cache through nocow before the subvolume is snapshoted,
9662 * the disk after the snapshot creation. 9662 * but flush the data into disk after the snapshot creation, or to prevent
9663 * operations while snapshoting is ongoing and that cause the snapshot to be
9664 * inconsistent (writes followed by expanding truncates for example).
9663 */ 9665 */
9664void btrfs_end_nocow_write(struct btrfs_root *root) 9666void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
9665{ 9667{
9666 percpu_counter_dec(&root->subv_writers->counter); 9668 percpu_counter_dec(&root->subv_writers->counter);
9667 /* 9669 /*
@@ -9673,7 +9675,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
9673 wake_up(&root->subv_writers->wait); 9675 wake_up(&root->subv_writers->wait);
9674} 9676}
9675 9677
9676int btrfs_start_nocow_write(struct btrfs_root *root) 9678int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
9677{ 9679{
9678 if (atomic_read(&root->will_be_snapshoted)) 9680 if (atomic_read(&root->will_be_snapshoted))
9679 return 0; 9681 return 0;
@@ -9684,7 +9686,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
9684 */ 9686 */
9685 smp_mb(); 9687 smp_mb();
9686 if (atomic_read(&root->will_be_snapshoted)) { 9688 if (atomic_read(&root->will_be_snapshoted)) {
9687 btrfs_end_nocow_write(root); 9689 btrfs_end_write_no_snapshoting(root);
9688 return 0; 9690 return 0;
9689 } 9691 }
9690 return 1; 9692 return 1;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0fbf0e7bc606..e4090259569b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1428,7 +1428,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1428 u64 num_bytes; 1428 u64 num_bytes;
1429 int ret; 1429 int ret;
1430 1430
1431 ret = btrfs_start_nocow_write(root); 1431 ret = btrfs_start_write_no_snapshoting(root);
1432 if (!ret) 1432 if (!ret)
1433 return -ENOSPC; 1433 return -ENOSPC;
1434 1434
@@ -1451,7 +1451,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1451 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); 1451 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
1452 if (ret <= 0) { 1452 if (ret <= 0) {
1453 ret = 0; 1453 ret = 0;
1454 btrfs_end_nocow_write(root); 1454 btrfs_end_write_no_snapshoting(root);
1455 } else { 1455 } else {
1456 *write_bytes = min_t(size_t, *write_bytes , 1456 *write_bytes = min_t(size_t, *write_bytes ,
1457 num_bytes - pos + lockstart); 1457 num_bytes - pos + lockstart);
@@ -1543,7 +1543,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1543 btrfs_free_reserved_data_space(inode, 1543 btrfs_free_reserved_data_space(inode,
1544 reserve_bytes); 1544 reserve_bytes);
1545 else 1545 else
1546 btrfs_end_nocow_write(root); 1546 btrfs_end_write_no_snapshoting(root);
1547 break; 1547 break;
1548 } 1548 }
1549 1549
@@ -1632,7 +1632,7 @@ again:
1632 1632
1633 release_bytes = 0; 1633 release_bytes = 0;
1634 if (only_release_metadata) 1634 if (only_release_metadata)
1635 btrfs_end_nocow_write(root); 1635 btrfs_end_write_no_snapshoting(root);
1636 1636
1637 if (only_release_metadata && copied > 0) { 1637 if (only_release_metadata && copied > 0) {
1638 u64 lockstart = round_down(pos, root->sectorsize); 1638 u64 lockstart = round_down(pos, root->sectorsize);
@@ -1661,7 +1661,7 @@ again:
1661 1661
1662 if (release_bytes) { 1662 if (release_bytes) {
1663 if (only_release_metadata) { 1663 if (only_release_metadata) {
1664 btrfs_end_nocow_write(root); 1664 btrfs_end_write_no_snapshoting(root);
1665 btrfs_delalloc_release_metadata(inode, release_bytes); 1665 btrfs_delalloc_release_metadata(inode, release_bytes);
1666 } else { 1666 } else {
1667 btrfs_delalloc_release_space(inode, release_bytes); 1667 btrfs_delalloc_release_space(inode, release_bytes);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a5374c2bb943..8de23355f6cf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1337,7 +1337,7 @@ next_slot:
1337 * we fall into common COW way. 1337 * we fall into common COW way.
1338 */ 1338 */
1339 if (!nolock) { 1339 if (!nolock) {
1340 err = btrfs_start_nocow_write(root); 1340 err = btrfs_start_write_no_snapshoting(root);
1341 if (!err) 1341 if (!err)
1342 goto out_check; 1342 goto out_check;
1343 } 1343 }
@@ -1361,7 +1361,7 @@ out_check:
1361 if (extent_end <= start) { 1361 if (extent_end <= start) {
1362 path->slots[0]++; 1362 path->slots[0]++;
1363 if (!nolock && nocow) 1363 if (!nolock && nocow)
1364 btrfs_end_nocow_write(root); 1364 btrfs_end_write_no_snapshoting(root);
1365 goto next_slot; 1365 goto next_slot;
1366 } 1366 }
1367 if (!nocow) { 1367 if (!nocow) {
@@ -1381,7 +1381,7 @@ out_check:
1381 page_started, nr_written, 1); 1381 page_started, nr_written, 1);
1382 if (ret) { 1382 if (ret) {
1383 if (!nolock && nocow) 1383 if (!nolock && nocow)
1384 btrfs_end_nocow_write(root); 1384 btrfs_end_write_no_snapshoting(root);
1385 goto error; 1385 goto error;
1386 } 1386 }
1387 cow_start = (u64)-1; 1387 cow_start = (u64)-1;
@@ -1432,7 +1432,7 @@ out_check:
1432 num_bytes); 1432 num_bytes);
1433 if (ret) { 1433 if (ret) {
1434 if (!nolock && nocow) 1434 if (!nolock && nocow)
1435 btrfs_end_nocow_write(root); 1435 btrfs_end_write_no_snapshoting(root);
1436 goto error; 1436 goto error;
1437 } 1437 }
1438 } 1438 }
@@ -1443,7 +1443,7 @@ out_check:
1443 EXTENT_DELALLOC, PAGE_UNLOCK | 1443 EXTENT_DELALLOC, PAGE_UNLOCK |
1444 PAGE_SET_PRIVATE2); 1444 PAGE_SET_PRIVATE2);
1445 if (!nolock && nocow) 1445 if (!nolock && nocow)
1446 btrfs_end_nocow_write(root); 1446 btrfs_end_write_no_snapshoting(root);
1447 cur_offset = extent_end; 1447 cur_offset = extent_end;
1448 if (cur_offset > end) 1448 if (cur_offset > end)
1449 break; 1449 break;
@@ -4599,6 +4599,26 @@ next:
4599 return err; 4599 return err;
4600} 4600}
4601 4601
4602static int wait_snapshoting_atomic_t(atomic_t *a)
4603{
4604 schedule();
4605 return 0;
4606}
4607
4608static void wait_for_snapshot_creation(struct btrfs_root *root)
4609{
4610 while (true) {
4611 int ret;
4612
4613 ret = btrfs_start_write_no_snapshoting(root);
4614 if (ret)
4615 break;
4616 wait_on_atomic_t(&root->will_be_snapshoted,
4617 wait_snapshoting_atomic_t,
4618 TASK_UNINTERRUPTIBLE);
4619 }
4620}
4621
4602static int btrfs_setsize(struct inode *inode, struct iattr *attr) 4622static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4603{ 4623{
4604 struct btrfs_root *root = BTRFS_I(inode)->root; 4624 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4623,17 +4643,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4623 4643
4624 if (newsize > oldsize) { 4644 if (newsize > oldsize) {
4625 truncate_pagecache(inode, newsize); 4645 truncate_pagecache(inode, newsize);
4646 /*
4647 * Don't do an expanding truncate while snapshoting is ongoing.
4648 * This is to ensure the snapshot captures a fully consistent
4649 * state of this file - if the snapshot captures this expanding
4650 * truncation, it must capture all writes that happened before
4651 * this truncation.
4652 */
4653 wait_for_snapshot_creation(root);
4626 ret = btrfs_cont_expand(inode, oldsize, newsize); 4654 ret = btrfs_cont_expand(inode, oldsize, newsize);
4627 if (ret) 4655 if (ret) {
4656 btrfs_end_write_no_snapshoting(root);
4628 return ret; 4657 return ret;
4658 }
4629 4659
4630 trans = btrfs_start_transaction(root, 1); 4660 trans = btrfs_start_transaction(root, 1);
4631 if (IS_ERR(trans)) 4661 if (IS_ERR(trans)) {
4662 btrfs_end_write_no_snapshoting(root);
4632 return PTR_ERR(trans); 4663 return PTR_ERR(trans);
4664 }
4633 4665
4634 i_size_write(inode, newsize); 4666 i_size_write(inode, newsize);
4635 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 4667 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4636 ret = btrfs_update_inode(trans, root, inode); 4668 ret = btrfs_update_inode(trans, root, inode);
4669 btrfs_end_write_no_snapshoting(root);
4637 btrfs_end_transaction(trans, root); 4670 btrfs_end_transaction(trans, root);
4638 } else { 4671 } else {
4639 4672
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3abc068c5543..b590e23fa03e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -617,7 +617,7 @@ fail:
617 return ret; 617 return ret;
618} 618}
619 619
620static void btrfs_wait_nocow_write(struct btrfs_root *root) 620static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
621{ 621{
622 s64 writers; 622 s64 writers;
623 DEFINE_WAIT(wait); 623 DEFINE_WAIT(wait);
@@ -649,7 +649,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
649 649
650 atomic_inc(&root->will_be_snapshoted); 650 atomic_inc(&root->will_be_snapshoted);
651 smp_mb__after_atomic(); 651 smp_mb__after_atomic();
652 btrfs_wait_nocow_write(root); 652 btrfs_wait_for_no_snapshoting_writes(root);
653 653
654 ret = btrfs_start_delalloc_inodes(root, 0); 654 ret = btrfs_start_delalloc_inodes(root, 0);
655 if (ret) 655 if (ret)
@@ -732,7 +732,8 @@ fail:
732free: 732free:
733 kfree(pending_snapshot); 733 kfree(pending_snapshot);
734out: 734out:
735 atomic_dec(&root->will_be_snapshoted); 735 if (atomic_dec_and_test(&root->will_be_snapshoted))
736 wake_up_atomic_t(&root->will_be_snapshoted);
736 return ret; 737 return ret;
737} 738}
738 739