From 67710892ec983aa79ad1e2a2642fe8e3a4a194ea Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 6 Jun 2016 11:51:25 +0100 Subject: Btrfs: be more precise on errors when getting an inode from disk When we attempt to read an inode from disk, we end up always returning an -ESTALE error to the caller regardless of the actual failure reason, which can be an out of memory problem (when allocating a path), some error found when reading from the fs/subvolume btree (like a genuine IO error) or the inode does not exists. So lets start returning the real error code to the callers so that they don't treat all -ESTALE errors as meaning that the inode does not exists (such as during orphan cleanup). This will also be needed for a subsequent patch in the same series dealing with a special fsync case. Signed-off-by: Filipe Manana --- fs/btrfs/inode.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3e61bd1f3f65..f9686541997b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3428,10 +3428,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); ret = PTR_ERR_OR_ZERO(inode); - if (ret && ret != -ESTALE) + if (ret && ret != -ENOENT) goto out; - if (ret == -ESTALE && root == root->fs_info->tree_root) { + if (ret == -ENOENT && root == root->fs_info->tree_root) { struct btrfs_root *dead_root; struct btrfs_fs_info *fs_info = root->fs_info; int is_dead_root = 0; @@ -3467,7 +3467,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * Inode is already gone but the orphan item is still there, * kill the orphan item. */ - if (ret == -ESTALE) { + if (ret == -ENOENT) { trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -3626,7 +3626,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, /* * read an inode from the btree into the in-memory inode */ -static void btrfs_read_locked_inode(struct inode *inode) +static int btrfs_read_locked_inode(struct inode *inode) { struct btrfs_path *path; struct extent_buffer *leaf; @@ -3645,14 +3645,19 @@ static void btrfs_read_locked_inode(struct inode *inode) filled = true; path = btrfs_alloc_path(); - if (!path) + if (!path) { + ret = -ENOMEM; goto make_bad; + } memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); - if (ret) + if (ret) { + if (ret > 0) + ret = -ENOENT; goto make_bad; + } leaf = path->nodes[0]; @@ -3805,11 +3810,12 @@ cache_acl: } btrfs_update_iflags(inode); - return; + return 0; make_bad: btrfs_free_path(path); make_bad_inode(inode); + return ret; } /* @@ -5595,7 +5601,9 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - btrfs_read_locked_inode(inode); + int ret; + + ret = btrfs_read_locked_inode(inode); if (!is_bad_inode(inode)) { inode_tree_add(inode); unlock_new_inode(inode); @@ -5604,7 +5612,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, } else { unlock_new_inode(inode); iput(inode); - inode = ERR_PTR(-ESTALE); + ASSERT(ret < 0); + inode = ERR_PTR(ret < 0 ? ret : -ESTALE); } } -- cgit v1.2.2 From 44f714dae50a2e795d3268a6831762aa6fa54f55 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 6 Jun 2016 16:11:13 +0100 Subject: Btrfs: improve performance on fsync against new inode after rename/unlink With commit 56f23fdbb600 ("Btrfs: fix file/data loss caused by fsync after rename and new inode") we got simple fix for a functional issue when the following sequence of actions is done: at transaction N create file A at directory D at transaction N + M (where M >= 1) move/rename existing file A from directory D to directory E create a new file named A at directory D fsync the new file power fail The solution was to simply detect such scenario and fallback to a full transaction commit when we detect it. However this turned out to had a significant impact on throughput (and a bit on latency too) for benchmarks using the dbench tool, which simulates real workloads from smbd (Samba) servers. For example on a test vm (with a debug kernel): Unpatched: Throughput 19.1572 MB/sec 32 clients 32 procs max_latency=1005.229 ms Patched: Throughput 23.7015 MB/sec 32 clients 32 procs max_latency=809.206 ms The patched results (this patch is applied) are similar to the results of a kernel with the commit 56f23fdbb600 ("Btrfs: fix file/data loss caused by fsync after rename and new inode") reverted. This change avoids the fallback to a transaction commit and instead makes sure all the names of the conflicting inode (the one that had a name in a past transaction that matches the name of the new file in the same parent directory) are logged so that at log replay time we don't lose neither the new file nor the old file, and the old file gets the name it was renamed to. This also ends up avoiding a full transaction commit for a similar case that involves an unlink instead of a rename of the old file: at transaction N create file A at directory D at transaction N + M (where M >= 1) remove file A create a new file named A at directory D fsync the new file power fail Signed-off-by: Filipe Manana --- fs/btrfs/inode.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f9686541997b..453b9d0da5f1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4203,6 +4203,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) int err = 0; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; + u64 last_unlink_trans; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; @@ -4225,11 +4226,27 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (err) goto out; + last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; + /* now the directory is empty */ err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), dentry->d_name.name, dentry->d_name.len); - if (!err) + if (!err) { btrfs_i_size_write(inode, 0); + /* + * Propagate the last_unlink_trans value of the deleted dir to + * its parent directory. This is to prevent an unrecoverable + * log tree in the case we do something like this: + * 1) create dir foo + * 2) create snapshot under dir foo + * 3) delete the snapshot + * 4) rmdir foo + * 5) mkdir foo + * 6) fsync foo or some file inside foo + */ + if (last_unlink_trans >= trans->transid) + BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; + } out: btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); -- cgit v1.2.2 From 18513091af9483ba84328d42092bd4d42a3c958f Mon Sep 17 00:00:00 2001 From: Wang Xiaoguang Date: Mon, 25 Jul 2016 15:51:40 +0800 Subject: btrfs: update btrfs_space_info's bytes_may_use timely This patch can fix some false ENOSPC errors, below test script can reproduce one false ENOSPC error: #!/bin/bash dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128 dev=$(losetup --show -f fs.img) mkfs.btrfs -f -M $dev mkdir /tmp/mntpoint mount $dev /tmp/mntpoint cd /tmp/mntpoint xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile Above script will fail for ENOSPC reason, but indeed fs still has free space to satisfy this request. Please see call graph: btrfs_fallocate() |-> btrfs_alloc_data_chunk_ondemand() | bytes_may_use += 64M |-> btrfs_prealloc_file_range() |-> btrfs_reserve_extent() |-> btrfs_add_reserved_bytes() | alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not | change bytes_may_use, and bytes_reserved += 64M. Now | bytes_may_use + bytes_reserved == 128M, which is greater | than btrfs_space_info's total_bytes, false enospc occurs. | Note, the bytes_may_use decrease operation will be done in | end of btrfs_fallocate(), which is too late. Here is another simple case for buffered write: CPU 1 | CPU 2 | |-> cow_file_range() |-> __btrfs_buffered_write() |-> btrfs_reserve_extent() | | | | | | | | | ..... | |-> btrfs_check_data_free_space() | | | | |-> extent_clear_unlock_delalloc() | In CPU 1, btrfs_reserve_extent()->find_free_extent()-> btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease operation will be delayed to be done in extent_clear_unlock_delalloc(). Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's btrfs_check_data_free_space() tries to reserve 100MB data space. If 100MB > data_sinfo->total_bytes - data_sinfo->bytes_used - data_sinfo->bytes_reserved - data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - data_sinfo->bytes_may_use btrfs_check_data_free_space() will try to allcate new data chunk or call btrfs_start_delalloc_roots(), or commit current transaction in order to reserve some free space, obviously a lot of work. But indeed it's not necessary as long as decreasing bytes_may_use timely, we still have free space, decreasing 128M from bytes_may_use. To fix this issue, this patch chooses to update bytes_may_use for both data and metadata in btrfs_add_reserved_bytes(). For compress path, real extent length may not be equal to file content length, so introduce a ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and btrfs_add_reserved_bytes(), it's becasue bytes_may_use is increased by file content length. Then compress path can update bytes_may_use correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC and RESERVE_FREE. As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as PREALLOC, we also need to update bytes_may_use, but can not pass EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook() to update btrfs_space_info's bytes_may_use. Meanwhile __btrfs_prealloc_file_range() will call btrfs_free_reserved_data_space() internally for both sucessful and failed path, btrfs_prealloc_file_range()'s callers does not need to call btrfs_free_reserved_data_space() any more. Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 29636624a427..bcea20015d9a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -566,6 +566,8 @@ cont: PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); + btrfs_free_reserved_data_space_noquota(inode, start, + end - start + 1); goto free_pages_out; } } @@ -742,7 +744,7 @@ retry: lock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1); - ret = btrfs_reserve_extent(root, + ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, async_extent->compressed_size, 0, alloc_hint, &ins, 1, 1); @@ -969,7 +971,8 @@ static noinline int cow_file_range(struct inode *inode, EXTENT_DEFRAG, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); - + btrfs_free_reserved_data_space_noquota(inode, start, + end - start + 1); *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; @@ -989,7 +992,7 @@ static noinline int cow_file_range(struct inode *inode, unsigned long op; cur_alloc_size = disk_num_bytes; - ret = btrfs_reserve_extent(root, cur_alloc_size, + ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, root->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret < 0) @@ -1489,8 +1492,10 @@ out_check: extent_clear_unlock_delalloc(inode, cur_offset, cur_offset + num_bytes - 1, locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC, PAGE_UNLOCK | - PAGE_SET_PRIVATE2); + EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_PRIVATE2); + if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); cur_offset = extent_end; @@ -1807,7 +1812,9 @@ static void btrfs_clear_bit_hook(struct inode *inode, return; if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list && !(state->state & EXTENT_NORESERVE)) + && do_list && !(state->state & EXTENT_NORESERVE) + && (*bits & (EXTENT_DO_ACCOUNTING | + EXTENT_CLEAR_DATA_RESV))) btrfs_free_reserved_data_space_noquota(inode, state->start, len); @@ -7252,7 +7259,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, int ret; alloc_hint = get_extent_allocation_hint(inode, start, len); - ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, + ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret) return ERR_PTR(ret); @@ -7752,6 +7759,13 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, ret = PTR_ERR(em2); goto unlock_err; } + /* + * For inode marked NODATACOW or extent marked PREALLOC, + * use the existing or preallocated extent, so does not + * need to adjust btrfs_space_info's bytes_may_use. + */ + btrfs_free_reserved_data_space_noquota(inode, + start, len); goto unlock; } } @@ -7786,7 +7800,6 @@ unlock: i_size_write(inode, start + len); adjust_dio_outstanding_extents(inode, dio_data, len); - btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; dio_data->unsubmitted_oe_range_end = start + len; @@ -10306,6 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 last_alloc = (u64)-1; int ret = 0; bool own_trans = true; + u64 end = start + num_bytes - 1; if (trans) own_trans = false; @@ -10327,8 +10341,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, * sized chunks. */ cur_bytes = min(cur_bytes, last_alloc); - ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, - *alloc_hint, &ins, 1, 0); + ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, + min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); @@ -10414,6 +10428,9 @@ next: if (own_trans) btrfs_end_transaction(trans, root); } + if (cur_offset < end) + btrfs_free_reserved_data_space(inode, cur_offset, + end - cur_offset + 1); return ret; } -- cgit v1.2.2