aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
authorWang Xiaoguang <wangxg.fnst@cn.fujitsu.com>2016-07-25 03:51:40 -0400
committerChris Mason <clm@fb.com>2016-08-25 06:58:26 -0400
commit18513091af9483ba84328d42092bd4d42a3c958f (patch)
treee5a8dfa241105b399c110cc4f79400b332136272 /fs/btrfs/inode.c
parent4824f1f412f75e9f84b9cecbde828e8f4699f82d (diff)
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can reproduce one false ENOSPC error: #!/bin/bash dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128 dev=$(losetup --show -f fs.img) mkfs.btrfs -f -M $dev mkdir /tmp/mntpoint mount $dev /tmp/mntpoint cd /tmp/mntpoint xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile Above script will fail for ENOSPC reason, but indeed fs still has free space to satisfy this request. Please see call graph: btrfs_fallocate() |-> btrfs_alloc_data_chunk_ondemand() | bytes_may_use += 64M |-> btrfs_prealloc_file_range() |-> btrfs_reserve_extent() |-> btrfs_add_reserved_bytes() | alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not | change bytes_may_use, and bytes_reserved += 64M. Now | bytes_may_use + bytes_reserved == 128M, which is greater | than btrfs_space_info's total_bytes, false enospc occurs. | Note, the bytes_may_use decrease operation will be done in | end of btrfs_fallocate(), which is too late. Here is another simple case for buffered write: CPU 1 | CPU 2 | |-> cow_file_range() |-> __btrfs_buffered_write() |-> btrfs_reserve_extent() | | | | | | | | | ..... | |-> btrfs_check_data_free_space() | | | | |-> extent_clear_unlock_delalloc() | In CPU 1, btrfs_reserve_extent()->find_free_extent()-> btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease operation will be delayed to be done in extent_clear_unlock_delalloc(). Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's btrfs_check_data_free_space() tries to reserve 100MB data space. If 100MB > data_sinfo->total_bytes - data_sinfo->bytes_used - data_sinfo->bytes_reserved - data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - data_sinfo->bytes_may_use btrfs_check_data_free_space() will try to allcate new data chunk or call btrfs_start_delalloc_roots(), or commit current transaction in order to reserve some free space, obviously a lot of work. But indeed it's not necessary as long as decreasing bytes_may_use timely, we still have free space, decreasing 128M from bytes_may_use. To fix this issue, this patch chooses to update bytes_may_use for both data and metadata in btrfs_add_reserved_bytes(). For compress path, real extent length may not be equal to file content length, so introduce a ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and btrfs_add_reserved_bytes(), it's becasue bytes_may_use is increased by file content length. Then compress path can update bytes_may_use correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC and RESERVE_FREE. As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as PREALLOC, we also need to update bytes_may_use, but can not pass EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook() to update btrfs_space_info's bytes_may_use. Meanwhile __btrfs_prealloc_file_range() will call btrfs_free_reserved_data_space() internally for both sucessful and failed path, btrfs_prealloc_file_range()'s callers does not need to call btrfs_free_reserved_data_space() any more. Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com> Reviewed-by: Josef Bacik <jbacik@fb.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c37
1 files changed, 27 insertions, 10 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 29636624a427..bcea20015d9a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -566,6 +566,8 @@ cont:
566 PAGE_SET_WRITEBACK | 566 PAGE_SET_WRITEBACK |
567 page_error_op | 567 page_error_op |
568 PAGE_END_WRITEBACK); 568 PAGE_END_WRITEBACK);
569 btrfs_free_reserved_data_space_noquota(inode, start,
570 end - start + 1);
569 goto free_pages_out; 571 goto free_pages_out;
570 } 572 }
571 } 573 }
@@ -742,7 +744,7 @@ retry:
742 lock_extent(io_tree, async_extent->start, 744 lock_extent(io_tree, async_extent->start,
743 async_extent->start + async_extent->ram_size - 1); 745 async_extent->start + async_extent->ram_size - 1);
744 746
745 ret = btrfs_reserve_extent(root, 747 ret = btrfs_reserve_extent(root, async_extent->ram_size,
746 async_extent->compressed_size, 748 async_extent->compressed_size,
747 async_extent->compressed_size, 749 async_extent->compressed_size,
748 0, alloc_hint, &ins, 1, 1); 750 0, alloc_hint, &ins, 1, 1);
@@ -969,7 +971,8 @@ static noinline int cow_file_range(struct inode *inode,
969 EXTENT_DEFRAG, PAGE_UNLOCK | 971 EXTENT_DEFRAG, PAGE_UNLOCK |
970 PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | 972 PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
971 PAGE_END_WRITEBACK); 973 PAGE_END_WRITEBACK);
972 974 btrfs_free_reserved_data_space_noquota(inode, start,
975 end - start + 1);
973 *nr_written = *nr_written + 976 *nr_written = *nr_written +
974 (end - start + PAGE_SIZE) / PAGE_SIZE; 977 (end - start + PAGE_SIZE) / PAGE_SIZE;
975 *page_started = 1; 978 *page_started = 1;
@@ -989,7 +992,7 @@ static noinline int cow_file_range(struct inode *inode,
989 unsigned long op; 992 unsigned long op;
990 993
991 cur_alloc_size = disk_num_bytes; 994 cur_alloc_size = disk_num_bytes;
992 ret = btrfs_reserve_extent(root, cur_alloc_size, 995 ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
993 root->sectorsize, 0, alloc_hint, 996 root->sectorsize, 0, alloc_hint,
994 &ins, 1, 1); 997 &ins, 1, 1);
995 if (ret < 0) 998 if (ret < 0)
@@ -1489,8 +1492,10 @@ out_check:
1489 extent_clear_unlock_delalloc(inode, cur_offset, 1492 extent_clear_unlock_delalloc(inode, cur_offset,
1490 cur_offset + num_bytes - 1, 1493 cur_offset + num_bytes - 1,
1491 locked_page, EXTENT_LOCKED | 1494 locked_page, EXTENT_LOCKED |
1492 EXTENT_DELALLOC, PAGE_UNLOCK | 1495 EXTENT_DELALLOC |
1493 PAGE_SET_PRIVATE2); 1496 EXTENT_CLEAR_DATA_RESV,
1497 PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1498
1494 if (!nolock && nocow) 1499 if (!nolock && nocow)
1495 btrfs_end_write_no_snapshoting(root); 1500 btrfs_end_write_no_snapshoting(root);
1496 cur_offset = extent_end; 1501 cur_offset = extent_end;
@@ -1807,7 +1812,9 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1807 return; 1812 return;
1808 1813
1809 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 1814 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1810 && do_list && !(state->state & EXTENT_NORESERVE)) 1815 && do_list && !(state->state & EXTENT_NORESERVE)
1816 && (*bits & (EXTENT_DO_ACCOUNTING |
1817 EXTENT_CLEAR_DATA_RESV)))
1811 btrfs_free_reserved_data_space_noquota(inode, 1818 btrfs_free_reserved_data_space_noquota(inode,
1812 state->start, len); 1819 state->start, len);
1813 1820
@@ -7252,7 +7259,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
7252 int ret; 7259 int ret;
7253 7260
7254 alloc_hint = get_extent_allocation_hint(inode, start, len); 7261 alloc_hint = get_extent_allocation_hint(inode, start, len);
7255 ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, 7262 ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0,
7256 alloc_hint, &ins, 1, 1); 7263 alloc_hint, &ins, 1, 1);
7257 if (ret) 7264 if (ret)
7258 return ERR_PTR(ret); 7265 return ERR_PTR(ret);
@@ -7752,6 +7759,13 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7752 ret = PTR_ERR(em2); 7759 ret = PTR_ERR(em2);
7753 goto unlock_err; 7760 goto unlock_err;
7754 } 7761 }
7762 /*
7763 * For inode marked NODATACOW or extent marked PREALLOC,
7764 * use the existing or preallocated extent, so does not
7765 * need to adjust btrfs_space_info's bytes_may_use.
7766 */
7767 btrfs_free_reserved_data_space_noquota(inode,
7768 start, len);
7755 goto unlock; 7769 goto unlock;
7756 } 7770 }
7757 } 7771 }
@@ -7786,7 +7800,6 @@ unlock:
7786 i_size_write(inode, start + len); 7800 i_size_write(inode, start + len);
7787 7801
7788 adjust_dio_outstanding_extents(inode, dio_data, len); 7802 adjust_dio_outstanding_extents(inode, dio_data, len);
7789 btrfs_free_reserved_data_space(inode, start, len);
7790 WARN_ON(dio_data->reserve < len); 7803 WARN_ON(dio_data->reserve < len);
7791 dio_data->reserve -= len; 7804 dio_data->reserve -= len;
7792 dio_data->unsubmitted_oe_range_end = start + len; 7805 dio_data->unsubmitted_oe_range_end = start + len;
@@ -10306,6 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
10306 u64 last_alloc = (u64)-1; 10319 u64 last_alloc = (u64)-1;
10307 int ret = 0; 10320 int ret = 0;
10308 bool own_trans = true; 10321 bool own_trans = true;
10322 u64 end = start + num_bytes - 1;
10309 10323
10310 if (trans) 10324 if (trans)
10311 own_trans = false; 10325 own_trans = false;
@@ -10327,8 +10341,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
10327 * sized chunks. 10341 * sized chunks.
10328 */ 10342 */
10329 cur_bytes = min(cur_bytes, last_alloc); 10343 cur_bytes = min(cur_bytes, last_alloc);
10330 ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, 10344 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
10331 *alloc_hint, &ins, 1, 0); 10345 min_size, 0, *alloc_hint, &ins, 1, 0);
10332 if (ret) { 10346 if (ret) {
10333 if (own_trans) 10347 if (own_trans)
10334 btrfs_end_transaction(trans, root); 10348 btrfs_end_transaction(trans, root);
@@ -10414,6 +10428,9 @@ next:
10414 if (own_trans) 10428 if (own_trans)
10415 btrfs_end_transaction(trans, root); 10429 btrfs_end_transaction(trans, root);
10416 } 10430 }
10431 if (cur_offset < end)
10432 btrfs_free_reserved_data_space(inode, cur_offset,
10433 end - cur_offset + 1);
10417 return ret; 10434 return ret;
10418} 10435}
10419 10436