aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRobbie Ko <robbieko@synology.com>2018-08-05 22:30:30 -0400
committerDavid Sterba <dsterba@suse.com>2018-08-17 12:35:43 -0400
commit8ecebf4d767e2307a946c8905278d6358eda35c3 (patch)
tree4844a64d348f597f6548f739b21c18770ee7889c
parent39379faaad79e3cf403a6904a08676b7850043ae (diff)
Btrfs: fix unexpected failure of nocow buffered writes after snapshotting when low on space
Commit e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") forced nocow writes to fall back to COW, during writeback, when a snapshot is created. This resulted in writes made before creating the snapshot unexpectedly failing with ENOSPC during writeback, even though success (0) had been returned to user space through the write system call. The steps leading to this problem are: 1. When it's not possible to allocate data space for a write, the buffered write path checks if a NOCOW write is possible. If it is, it will not reserve space and success (0) is returned to user space. 2. Then when a snapshot is created, the root's will_be_snapshotted atomic is incremented and writeback is triggered for all inodes that belong to the root being snapshotted. Incrementing that atomic forces all previous writes to fall back to COW during writeback (running delalloc). 3. This causes the writeback for the inodes to fail, which sets the ENOSPC error in their mappings, so that a subsequent fsync on them will report the error to user space. So it's not a completely silent data loss (since fsync will report ENOSPC) but it's a very unexpected and undesirable behaviour, because if a clean shutdown/unmount of the filesystem happens without previous calls to fsync, it is expected to have the data present in the files after mounting the filesystem again. So fix this by adding a new atomic named snapshot_force_cow to the root structure which prevents this behaviour and works the following way: 1. It is incremented when we start to create a snapshot after triggering writeback and before waiting for writeback to finish. 2. This new atomic is now what is used by writeback (running delalloc) to decide whether we need to fall back to COW or not. Because we incremented this new atomic after triggering writeback in the snapshot creation ioctl, we ensure that all buffered writes that happened before snapshot creation will succeed and not fall back to COW (which would make them fail with ENOSPC). 
3. The existing atomic, will_be_snapshotted, is kept because it is used to force new buffered writes, that start after we started snapshotting, to reserve data space even when NOCOW is possible. This makes these writes fail early with ENOSPC when there's no available space to allocate, preventing the unexpected behaviour of writeback later failing with ENOSPC due to a fall back to COW mode. Fixes: e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") Signed-off-by: Robbie Ko <robbieko@synology.com> Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r--fs/btrfs/ctree.h1
-rw-r--r--fs/btrfs/disk-io.c1
-rw-r--r--fs/btrfs/inode.c25
-rw-r--r--fs/btrfs/ioctl.c16
4 files changed, 22 insertions, 21 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 318be7864072..a67cc190a84b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1280,6 +1280,7 @@ struct btrfs_root {
1280 int send_in_progress; 1280 int send_in_progress;
1281 struct btrfs_subvolume_writers *subv_writers; 1281 struct btrfs_subvolume_writers *subv_writers;
1282 atomic_t will_be_snapshotted; 1282 atomic_t will_be_snapshotted;
1283 atomic_t snapshot_force_cow;
1283 1284
1284 /* For qgroup metadata reserved space */ 1285 /* For qgroup metadata reserved space */
1285 spinlock_t qgroup_meta_rsv_lock; 1286 spinlock_t qgroup_meta_rsv_lock;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5124c15705ce..05dc3c17cb62 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1187,6 +1187,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
1187 atomic_set(&root->log_batch, 0); 1187 atomic_set(&root->log_batch, 0);
1188 refcount_set(&root->refs, 1); 1188 refcount_set(&root->refs, 1);
1189 atomic_set(&root->will_be_snapshotted, 0); 1189 atomic_set(&root->will_be_snapshotted, 0);
1190 atomic_set(&root->snapshot_force_cow, 0);
1190 root->log_transid = 0; 1191 root->log_transid = 0;
1191 root->log_transid_committed = -1; 1192 root->log_transid_committed = -1;
1192 root->last_log_commit = 0; 1193 root->last_log_commit = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3f51ddc18f98..c6d8c5d19ff0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1271,7 +1271,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1271 u64 disk_num_bytes; 1271 u64 disk_num_bytes;
1272 u64 ram_bytes; 1272 u64 ram_bytes;
1273 int extent_type; 1273 int extent_type;
1274 int ret, err; 1274 int ret;
1275 int type; 1275 int type;
1276 int nocow; 1276 int nocow;
1277 int check_prev = 1; 1277 int check_prev = 1;
@@ -1403,11 +1403,8 @@ next_slot:
1403 * if there are pending snapshots for this root, 1403 * if there are pending snapshots for this root,
1404 * we fall into common COW way. 1404 * we fall into common COW way.
1405 */ 1405 */
1406 if (!nolock) { 1406 if (!nolock && atomic_read(&root->snapshot_force_cow))
1407 err = btrfs_start_write_no_snapshotting(root); 1407 goto out_check;
1408 if (!err)
1409 goto out_check;
1410 }
1411 /* 1408 /*
1412 * force cow if csum exists in the range. 1409 * force cow if csum exists in the range.
1413 * this ensure that csum for a given extent are 1410 * this ensure that csum for a given extent are
@@ -1416,9 +1413,6 @@ next_slot:
1416 ret = csum_exist_in_range(fs_info, disk_bytenr, 1413 ret = csum_exist_in_range(fs_info, disk_bytenr,
1417 num_bytes); 1414 num_bytes);
1418 if (ret) { 1415 if (ret) {
1419 if (!nolock)
1420 btrfs_end_write_no_snapshotting(root);
1421
1422 /* 1416 /*
1423 * ret could be -EIO if the above fails to read 1417 * ret could be -EIO if the above fails to read
1424 * metadata. 1418 * metadata.
@@ -1431,11 +1425,8 @@ next_slot:
1431 WARN_ON_ONCE(nolock); 1425 WARN_ON_ONCE(nolock);
1432 goto out_check; 1426 goto out_check;
1433 } 1427 }
1434 if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) { 1428 if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
1435 if (!nolock)
1436 btrfs_end_write_no_snapshotting(root);
1437 goto out_check; 1429 goto out_check;
1438 }
1439 nocow = 1; 1430 nocow = 1;
1440 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 1431 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1441 extent_end = found_key.offset + 1432 extent_end = found_key.offset +
@@ -1448,8 +1439,6 @@ next_slot:
1448out_check: 1439out_check:
1449 if (extent_end <= start) { 1440 if (extent_end <= start) {
1450 path->slots[0]++; 1441 path->slots[0]++;
1451 if (!nolock && nocow)
1452 btrfs_end_write_no_snapshotting(root);
1453 if (nocow) 1442 if (nocow)
1454 btrfs_dec_nocow_writers(fs_info, disk_bytenr); 1443 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1455 goto next_slot; 1444 goto next_slot;
@@ -1471,8 +1460,6 @@ out_check:
1471 end, page_started, nr_written, 1, 1460 end, page_started, nr_written, 1,
1472 NULL); 1461 NULL);
1473 if (ret) { 1462 if (ret) {
1474 if (!nolock && nocow)
1475 btrfs_end_write_no_snapshotting(root);
1476 if (nocow) 1463 if (nocow)
1477 btrfs_dec_nocow_writers(fs_info, 1464 btrfs_dec_nocow_writers(fs_info,
1478 disk_bytenr); 1465 disk_bytenr);
@@ -1492,8 +1479,6 @@ out_check:
1492 ram_bytes, BTRFS_COMPRESS_NONE, 1479 ram_bytes, BTRFS_COMPRESS_NONE,
1493 BTRFS_ORDERED_PREALLOC); 1480 BTRFS_ORDERED_PREALLOC);
1494 if (IS_ERR(em)) { 1481 if (IS_ERR(em)) {
1495 if (!nolock && nocow)
1496 btrfs_end_write_no_snapshotting(root);
1497 if (nocow) 1482 if (nocow)
1498 btrfs_dec_nocow_writers(fs_info, 1483 btrfs_dec_nocow_writers(fs_info,
1499 disk_bytenr); 1484 disk_bytenr);
@@ -1532,8 +1517,6 @@ out_check:
1532 EXTENT_CLEAR_DATA_RESV, 1517 EXTENT_CLEAR_DATA_RESV,
1533 PAGE_UNLOCK | PAGE_SET_PRIVATE2); 1518 PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1534 1519
1535 if (!nolock && nocow)
1536 btrfs_end_write_no_snapshotting(root);
1537 cur_offset = extent_end; 1520 cur_offset = extent_end;
1538 1521
1539 /* 1522 /*
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d3a5d2a41e5f..85c4284bb2cf 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -747,6 +747,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
747 struct btrfs_pending_snapshot *pending_snapshot; 747 struct btrfs_pending_snapshot *pending_snapshot;
748 struct btrfs_trans_handle *trans; 748 struct btrfs_trans_handle *trans;
749 int ret; 749 int ret;
750 bool snapshot_force_cow = false;
750 751
751 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 752 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
752 return -EINVAL; 753 return -EINVAL;
@@ -763,6 +764,11 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
763 goto free_pending; 764 goto free_pending;
764 } 765 }
765 766
767 /*
768 * Force new buffered writes to reserve space even when NOCOW is
768 * possible. This is to avoid later writeback (running delalloc) to
770 * fallback to COW mode and unexpectedly fail with ENOSPC.
771 */
766 atomic_inc(&root->will_be_snapshotted); 772 atomic_inc(&root->will_be_snapshotted);
767 smp_mb__after_atomic(); 773 smp_mb__after_atomic();
768 /* wait for no snapshot writes */ 774 /* wait for no snapshot writes */
@@ -773,6 +779,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
773 if (ret) 779 if (ret)
774 goto dec_and_free; 780 goto dec_and_free;
775 781
782 /*
783 * All previous writes have started writeback in NOCOW mode, so now
784 * we force future writes to fallback to COW mode during snapshot
785 * creation.
786 */
787 atomic_inc(&root->snapshot_force_cow);
788 snapshot_force_cow = true;
789
776 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); 790 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
777 791
778 btrfs_init_block_rsv(&pending_snapshot->block_rsv, 792 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
@@ -837,6 +851,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
837fail: 851fail:
838 btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); 852 btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
839dec_and_free: 853dec_and_free:
854 if (snapshot_force_cow)
855 atomic_dec(&root->snapshot_force_cow);
840 if (atomic_dec_and_test(&root->will_be_snapshotted)) 856 if (atomic_dec_and_test(&root->will_be_snapshotted))
841 wake_up_var(&root->will_be_snapshotted); 857 wake_up_var(&root->will_be_snapshotted);
842free_pending: 858free_pending: