about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorJosef Bacik <josef@toxicpanda.com>2017-10-19 14:15:57 -0400
committerDavid Sterba <dsterba@suse.com>2017-11-01 15:45:35 -0400
commit69fe2d75dd91d0124ad2ab6e9fef07633bd730e0 (patch)
treef1fce885741f552319a33c65f5957da8fe78aa77
parentdd48d4072e0cdac51edcbff66342fe2f21b5b588 (diff)
btrfs: make the delalloc block rsv per inode
The way we handle delalloc metadata reservations has gotten progressively more complicated over the years. There is so much cruft and weirdness around keeping the reserved count and outstanding counters consistent and handling the error cases that it's impossible to understand. Fix this by making the delalloc block rsv per-inode. This way we can calculate the actual size of the outstanding metadata reservations every time we make a change, and then reserve the delta based on that amount. This greatly simplifies the code everywhere, and makes the error handling in btrfs_delalloc_reserve_metadata far less terrifying. Signed-off-by: Josef Bacik <jbacik@fb.com> Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r--fs/btrfs/btrfs_inode.h26
-rw-r--r--fs/btrfs/ctree.h5
-rw-r--r--fs/btrfs/delayed-inode.c46
-rw-r--r--fs/btrfs/disk-io.c18
-rw-r--r--fs/btrfs/extent-tree.c320
-rw-r--r--fs/btrfs/inode.c18
6 files changed, 141 insertions, 292 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 427c8738a3bd..63f0ccc92a71 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -36,14 +36,13 @@
36#define BTRFS_INODE_ORPHAN_META_RESERVED 1 36#define BTRFS_INODE_ORPHAN_META_RESERVED 1
37#define BTRFS_INODE_DUMMY 2 37#define BTRFS_INODE_DUMMY 2
38#define BTRFS_INODE_IN_DEFRAG 3 38#define BTRFS_INODE_IN_DEFRAG 3
39#define BTRFS_INODE_DELALLOC_META_RESERVED 4 39#define BTRFS_INODE_HAS_ORPHAN_ITEM 4
40#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 40#define BTRFS_INODE_HAS_ASYNC_EXTENT 5
41#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 41#define BTRFS_INODE_NEEDS_FULL_SYNC 6
42#define BTRFS_INODE_NEEDS_FULL_SYNC 7 42#define BTRFS_INODE_COPY_EVERYTHING 7
43#define BTRFS_INODE_COPY_EVERYTHING 8 43#define BTRFS_INODE_IN_DELALLOC_LIST 8
44#define BTRFS_INODE_IN_DELALLOC_LIST 9 44#define BTRFS_INODE_READDIO_NEED_LOCK 9
45#define BTRFS_INODE_READDIO_NEED_LOCK 10 45#define BTRFS_INODE_HAS_PROPS 10
46#define BTRFS_INODE_HAS_PROPS 11
47 46
48/* in memory btrfs inode */ 47/* in memory btrfs inode */
49struct btrfs_inode { 48struct btrfs_inode {
@@ -176,7 +175,8 @@ struct btrfs_inode {
176 * of extent items we've reserved metadata for. 175 * of extent items we've reserved metadata for.
177 */ 176 */
178 unsigned outstanding_extents; 177 unsigned outstanding_extents;
179 unsigned reserved_extents; 178
179 struct btrfs_block_rsv block_rsv;
180 180
181 /* 181 /*
182 * Cached values of inode properties 182 * Cached values of inode properties
@@ -278,14 +278,6 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
278 mod); 278 mod);
279} 279}
280 280
281static inline void btrfs_mod_reserved_extents(struct btrfs_inode *inode, int mod)
282{
283 lockdep_assert_held(&inode->lock);
284 inode->reserved_extents += mod;
285 if (btrfs_is_free_space_inode(inode))
286 return;
287}
288
289static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) 281static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
290{ 282{
291 int ret = 0; 283 int ret = 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2ede3b6ceb68..f7df5536ab61 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -763,8 +763,6 @@ struct btrfs_fs_info {
763 * delayed dir index item 763 * delayed dir index item
764 */ 764 */
765 struct btrfs_block_rsv global_block_rsv; 765 struct btrfs_block_rsv global_block_rsv;
766 /* block reservation for delay allocation */
767 struct btrfs_block_rsv delalloc_block_rsv;
768 /* block reservation for metadata operations */ 766 /* block reservation for metadata operations */
769 struct btrfs_block_rsv trans_block_rsv; 767 struct btrfs_block_rsv trans_block_rsv;
770 /* block reservation for chunk tree */ 768 /* block reservation for chunk tree */
@@ -2757,6 +2755,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
2757void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); 2755void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
2758struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 2756struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
2759 unsigned short type); 2757 unsigned short type);
2758void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
2759 struct btrfs_block_rsv *rsv,
2760 unsigned short type);
2760void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, 2761void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
2761 struct btrfs_block_rsv *rsv); 2762 struct btrfs_block_rsv *rsv);
2762void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv); 2763void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 19e4ad2f3f2e..5d73f79ded8b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -581,7 +581,6 @@ static int btrfs_delayed_inode_reserve_metadata(
581 struct btrfs_block_rsv *dst_rsv; 581 struct btrfs_block_rsv *dst_rsv;
582 u64 num_bytes; 582 u64 num_bytes;
583 int ret; 583 int ret;
584 bool release = false;
585 584
586 src_rsv = trans->block_rsv; 585 src_rsv = trans->block_rsv;
587 dst_rsv = &fs_info->delayed_block_rsv; 586 dst_rsv = &fs_info->delayed_block_rsv;
@@ -589,36 +588,13 @@ static int btrfs_delayed_inode_reserve_metadata(
589 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 588 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
590 589
591 /* 590 /*
592 * If our block_rsv is the delalloc block reserve then check and see if
593 * we have our extra reservation for updating the inode. If not fall
594 * through and try to reserve space quickly.
595 *
596 * We used to try and steal from the delalloc block rsv or the global
597 * reserve, but we'd steal a full reservation, which isn't kind. We are
598 * here through delalloc which means we've likely just cowed down close
599 * to the leaf that contains the inode, so we would steal less just
600 * doing the fallback inode update, so if we do end up having to steal
601 * from the global block rsv we hopefully only steal one or two blocks
602 * worth which is less likely to hurt us.
603 */
604 if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
605 spin_lock(&inode->lock);
606 if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
607 &inode->runtime_flags))
608 release = true;
609 else
610 src_rsv = NULL;
611 spin_unlock(&inode->lock);
612 }
613
614 /*
615 * btrfs_dirty_inode will update the inode under btrfs_join_transaction 591 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
616 * which doesn't reserve space for speed. This is a problem since we 592 * which doesn't reserve space for speed. This is a problem since we
617 * still need to reserve space for this update, so try to reserve the 593 * still need to reserve space for this update, so try to reserve the
618 * space. 594 * space.
619 * 595 *
620 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since 596 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
621 * we're accounted for. 597 * we always reserve enough to update the inode item.
622 */ 598 */
623 if (!src_rsv || (!trans->bytes_reserved && 599 if (!src_rsv || (!trans->bytes_reserved &&
624 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { 600 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
@@ -643,32 +619,12 @@ static int btrfs_delayed_inode_reserve_metadata(
643 } 619 }
644 620
645 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); 621 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
646
647 /*
648 * Migrate only takes a reservation, it doesn't touch the size of the
649 * block_rsv. This is to simplify people who don't normally have things
650 * migrated from their block rsv. If they go to release their
651 * reservation, that will decrease the size as well, so if migrate
652 * reduced size we'd end up with a negative size. But for the
653 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
654 * but we could in fact do this reserve/migrate dance several times
655 * between the time we did the original reservation and we'd clean it
656 * up. So to take care of this, release the space for the meta
657 * reservation here. I think it may be time for a documentation page on
658 * how block rsvs. work.
659 */
660 if (!ret) { 622 if (!ret) {
661 trace_btrfs_space_reservation(fs_info, "delayed_inode", 623 trace_btrfs_space_reservation(fs_info, "delayed_inode",
662 btrfs_ino(inode), num_bytes, 1); 624 btrfs_ino(inode), num_bytes, 1);
663 node->bytes_reserved = num_bytes; 625 node->bytes_reserved = num_bytes;
664 } 626 }
665 627
666 if (release) {
667 trace_btrfs_space_reservation(fs_info, "delalloc",
668 btrfs_ino(inode), num_bytes, 0);
669 btrfs_block_rsv_release(fs_info, src_rsv, num_bytes);
670 }
671
672 return ret; 628 return ret;
673} 629}
674 630
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 484cf8fc952c..d1f396f72979 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2447,14 +2447,6 @@ int open_ctree(struct super_block *sb,
2447 goto fail_delalloc_bytes; 2447 goto fail_delalloc_bytes;
2448 } 2448 }
2449 2449
2450 fs_info->btree_inode = new_inode(sb);
2451 if (!fs_info->btree_inode) {
2452 err = -ENOMEM;
2453 goto fail_bio_counter;
2454 }
2455
2456 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
2457
2458 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 2450 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
2459 INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); 2451 INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
2460 INIT_LIST_HEAD(&fs_info->trans_list); 2452 INIT_LIST_HEAD(&fs_info->trans_list);
@@ -2487,8 +2479,6 @@ int open_ctree(struct super_block *sb,
2487 btrfs_mapping_init(&fs_info->mapping_tree); 2479 btrfs_mapping_init(&fs_info->mapping_tree);
2488 btrfs_init_block_rsv(&fs_info->global_block_rsv, 2480 btrfs_init_block_rsv(&fs_info->global_block_rsv,
2489 BTRFS_BLOCK_RSV_GLOBAL); 2481 BTRFS_BLOCK_RSV_GLOBAL);
2490 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
2491 BTRFS_BLOCK_RSV_DELALLOC);
2492 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); 2482 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
2493 btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); 2483 btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
2494 btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); 2484 btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
@@ -2517,6 +2507,14 @@ int open_ctree(struct super_block *sb,
2517 2507
2518 INIT_LIST_HEAD(&fs_info->ordered_roots); 2508 INIT_LIST_HEAD(&fs_info->ordered_roots);
2519 spin_lock_init(&fs_info->ordered_root_lock); 2509 spin_lock_init(&fs_info->ordered_root_lock);
2510
2511 fs_info->btree_inode = new_inode(sb);
2512 if (!fs_info->btree_inode) {
2513 err = -ENOMEM;
2514 goto fail_bio_counter;
2515 }
2516 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
2517
2520 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), 2518 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2521 GFP_KERNEL); 2519 GFP_KERNEL);
2522 if (!fs_info->delayed_root) { 2520 if (!fs_info->delayed_root) {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index aaa346562df6..fc9720e28005 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -26,6 +26,7 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/ratelimit.h> 27#include <linux/ratelimit.h>
28#include <linux/percpu_counter.h> 28#include <linux/percpu_counter.h>
29#include <linux/lockdep.h>
29#include "hash.h" 30#include "hash.h"
30#include "tree-log.h" 31#include "tree-log.h"
31#include "disk-io.h" 32#include "disk-io.h"
@@ -4811,7 +4812,6 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4811static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, 4812static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4812 u64 orig, bool wait_ordered) 4813 u64 orig, bool wait_ordered)
4813{ 4814{
4814 struct btrfs_block_rsv *block_rsv;
4815 struct btrfs_space_info *space_info; 4815 struct btrfs_space_info *space_info;
4816 struct btrfs_trans_handle *trans; 4816 struct btrfs_trans_handle *trans;
4817 u64 delalloc_bytes; 4817 u64 delalloc_bytes;
@@ -4827,8 +4827,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4827 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 4827 to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4828 4828
4829 trans = (struct btrfs_trans_handle *)current->journal_info; 4829 trans = (struct btrfs_trans_handle *)current->journal_info;
4830 block_rsv = &fs_info->delalloc_block_rsv; 4830 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4831 space_info = block_rsv->space_info;
4832 4831
4833 delalloc_bytes = percpu_counter_sum_positive( 4832 delalloc_bytes = percpu_counter_sum_positive(
4834 &fs_info->delalloc_bytes); 4833 &fs_info->delalloc_bytes);
@@ -5564,11 +5563,12 @@ again:
5564 } 5563 }
5565} 5564}
5566 5565
5567static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 5566static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5568 struct btrfs_block_rsv *block_rsv, 5567 struct btrfs_block_rsv *block_rsv,
5569 struct btrfs_block_rsv *dest, u64 num_bytes) 5568 struct btrfs_block_rsv *dest, u64 num_bytes)
5570{ 5569{
5571 struct btrfs_space_info *space_info = block_rsv->space_info; 5570 struct btrfs_space_info *space_info = block_rsv->space_info;
5571 u64 ret;
5572 5572
5573 spin_lock(&block_rsv->lock); 5573 spin_lock(&block_rsv->lock);
5574 if (num_bytes == (u64)-1) 5574 if (num_bytes == (u64)-1)
@@ -5583,6 +5583,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5583 } 5583 }
5584 spin_unlock(&block_rsv->lock); 5584 spin_unlock(&block_rsv->lock);
5585 5585
5586 ret = num_bytes;
5586 if (num_bytes > 0) { 5587 if (num_bytes > 0) {
5587 if (dest) { 5588 if (dest) {
5588 spin_lock(&dest->lock); 5589 spin_lock(&dest->lock);
@@ -5602,6 +5603,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5602 space_info_add_old_bytes(fs_info, space_info, 5603 space_info_add_old_bytes(fs_info, space_info,
5603 num_bytes); 5604 num_bytes);
5604 } 5605 }
5606 return ret;
5605} 5607}
5606 5608
5607int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, 5609int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
@@ -5625,6 +5627,15 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5625 rsv->type = type; 5627 rsv->type = type;
5626} 5628}
5627 5629
5630void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5631 struct btrfs_block_rsv *rsv,
5632 unsigned short type)
5633{
5634 btrfs_init_block_rsv(rsv, type);
5635 rsv->space_info = __find_space_info(fs_info,
5636 BTRFS_BLOCK_GROUP_METADATA);
5637}
5638
5628struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 5639struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5629 unsigned short type) 5640 unsigned short type)
5630{ 5641{
@@ -5634,9 +5645,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5634 if (!block_rsv) 5645 if (!block_rsv)
5635 return NULL; 5646 return NULL;
5636 5647
5637 btrfs_init_block_rsv(block_rsv, type); 5648 btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5638 block_rsv->space_info = __find_space_info(fs_info,
5639 BTRFS_BLOCK_GROUP_METADATA);
5640 return block_rsv; 5649 return block_rsv;
5641} 5650}
5642 5651
@@ -5719,6 +5728,66 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
5719 return ret; 5728 return ret;
5720} 5729}
5721 5730
5731/**
5732 * btrfs_inode_rsv_refill - refill the inode block rsv.
5733 * @inode - the inode we are refilling.
5734 * @flush - the flushing restriction.
5735 *
5736 * Essentially the same as btrfs_block_rsv_refill, except it uses the
5737 * block_rsv->size as the minimum size. We'll either refill the missing amount
5738 * or return if we already have enough space. This will also handle the reserve
5739 * tracepoint for the reserved amount.
5740 */
5741int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5742 enum btrfs_reserve_flush_enum flush)
5743{
5744 struct btrfs_root *root = inode->root;
5745 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5746 u64 num_bytes = 0;
5747 int ret = -ENOSPC;
5748
5749 spin_lock(&block_rsv->lock);
5750 if (block_rsv->reserved < block_rsv->size)
5751 num_bytes = block_rsv->size - block_rsv->reserved;
5752 spin_unlock(&block_rsv->lock);
5753
5754 if (num_bytes == 0)
5755 return 0;
5756
5757 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5758 if (!ret) {
5759 block_rsv_add_bytes(block_rsv, num_bytes, 0);
5760 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5761 btrfs_ino(inode), num_bytes, 1);
5762 }
5763 return ret;
5764}
5765
5766/**
5767 * btrfs_inode_rsv_release - release any excessive reservation.
5768 * @inode - the inode we need to release from.
5769 *
5770 * This is the same as btrfs_block_rsv_release, except that it handles the
5771 * tracepoint for the reservation.
5772 */
5773void btrfs_inode_rsv_release(struct btrfs_inode *inode)
5774{
5775 struct btrfs_fs_info *fs_info = inode->root->fs_info;
5776 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5777 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5778 u64 released = 0;
5779
5780 /*
5781 * Since we statically set the block_rsv->size we just want to say we
5782 * are releasing 0 bytes, and then we'll just get the reservation over
5783 * the size free'd.
5784 */
5785 released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0);
5786 if (released > 0)
5787 trace_btrfs_space_reservation(fs_info, "delalloc",
5788 btrfs_ino(inode), released, 0);
5789}
5790
5722void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 5791void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5723 struct btrfs_block_rsv *block_rsv, 5792 struct btrfs_block_rsv *block_rsv,
5724 u64 num_bytes) 5793 u64 num_bytes)
@@ -5790,7 +5859,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5790 5859
5791 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5860 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5792 fs_info->global_block_rsv.space_info = space_info; 5861 fs_info->global_block_rsv.space_info = space_info;
5793 fs_info->delalloc_block_rsv.space_info = space_info;
5794 fs_info->trans_block_rsv.space_info = space_info; 5862 fs_info->trans_block_rsv.space_info = space_info;
5795 fs_info->empty_block_rsv.space_info = space_info; 5863 fs_info->empty_block_rsv.space_info = space_info;
5796 fs_info->delayed_block_rsv.space_info = space_info; 5864 fs_info->delayed_block_rsv.space_info = space_info;
@@ -5810,8 +5878,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5810{ 5878{
5811 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 5879 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5812 (u64)-1); 5880 (u64)-1);
5813 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
5814 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
5815 WARN_ON(fs_info->trans_block_rsv.size > 0); 5881 WARN_ON(fs_info->trans_block_rsv.size > 0);
5816 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 5882 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5817 WARN_ON(fs_info->chunk_block_rsv.size > 0); 5883 WARN_ON(fs_info->chunk_block_rsv.size > 0);
@@ -5953,95 +6019,37 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
5953 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 6019 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5954} 6020}
5955 6021
5956/** 6022static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
5957 * drop_over_reserved_extents - drop our extra extent reservations 6023 struct btrfs_inode *inode)
5958 * @inode: the inode we're dropping the extent for
5959 *
5960 * We reserve extents we may use, but they may have been merged with other
5961 * extents and we may not need the extra reservation.
5962 *
5963 * We also call this when we've completed io to an extent or had an error and
5964 * cleared the outstanding extent, in either case we no longer need our
5965 * reservation and can drop the excess.
5966 */
5967static unsigned drop_over_reserved_extents(struct btrfs_inode *inode)
5968{
5969 unsigned num_extents = 0;
5970
5971 if (inode->reserved_extents > inode->outstanding_extents) {
5972 num_extents = inode->reserved_extents -
5973 inode->outstanding_extents;
5974 btrfs_mod_reserved_extents(inode, -num_extents);
5975 }
5976
5977 if (inode->outstanding_extents == 0 &&
5978 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5979 &inode->runtime_flags))
5980 num_extents++;
5981 return num_extents;
5982}
5983
5984/**
5985 * calc_csum_metadata_size - return the amount of metadata space that must be
5986 * reserved/freed for the given bytes.
5987 * @inode: the inode we're manipulating
5988 * @num_bytes: the number of bytes in question
5989 * @reserve: 1 if we are reserving space, 0 if we are freeing space
5990 *
5991 * This adjusts the number of csum_bytes in the inode and then returns the
5992 * correct amount of metadata that must either be reserved or freed. We
5993 * calculate how many checksums we can fit into one leaf and then divide the
5994 * number of bytes that will need to be checksumed by this value to figure out
5995 * how many checksums will be required. If we are adding bytes then the number
5996 * may go up and we will return the number of additional bytes that must be
5997 * reserved. If it is going down we will return the number of bytes that must
5998 * be freed.
5999 *
6000 * This must be called with BTRFS_I(inode)->lock held.
6001 */
6002static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes,
6003 int reserve)
6004{ 6024{
6005 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 6025 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
6006 u64 old_csums, num_csums; 6026 u64 reserve_size = 0;
6007 6027 u64 csum_leaves;
6008 if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0) 6028 unsigned outstanding_extents;
6009 return 0;
6010
6011 old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
6012 if (reserve)
6013 inode->csum_bytes += num_bytes;
6014 else
6015 inode->csum_bytes -= num_bytes;
6016 num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
6017
6018 /* No change, no need to reserve more */
6019 if (old_csums == num_csums)
6020 return 0;
6021 6029
6022 if (reserve) 6030 lockdep_assert_held(&inode->lock);
6023 return btrfs_calc_trans_metadata_size(fs_info, 6031 outstanding_extents = inode->outstanding_extents;
6024 num_csums - old_csums); 6032 if (outstanding_extents)
6033 reserve_size = btrfs_calc_trans_metadata_size(fs_info,
6034 outstanding_extents + 1);
6035 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
6036 inode->csum_bytes);
6037 reserve_size += btrfs_calc_trans_metadata_size(fs_info,
6038 csum_leaves);
6025 6039
6026 return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums); 6040 spin_lock(&block_rsv->lock);
6041 block_rsv->size = reserve_size;
6042 spin_unlock(&block_rsv->lock);
6027} 6043}
6028 6044
6029int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) 6045int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
6030{ 6046{
6031 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 6047 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6032 struct btrfs_root *root = inode->root; 6048 struct btrfs_root *root = inode->root;
6033 struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv; 6049 unsigned nr_extents;
6034 u64 to_reserve = 0;
6035 u64 csum_bytes;
6036 unsigned nr_extents, reserve_extents;
6037 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 6050 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
6038 int ret = 0; 6051 int ret = 0;
6039 bool delalloc_lock = true; 6052 bool delalloc_lock = true;
6040 u64 to_free = 0;
6041 unsigned dropped;
6042 bool release_extra = false;
6043 bool underflow = false;
6044 bool did_retry = false;
6045 6053
6046 /* If we are a free space inode we need to not flush since we will be in 6054 /* If we are a free space inode we need to not flush since we will be in
6047 * the middle of a transaction commit. We also don't need the delalloc 6055 * the middle of a transaction commit. We also don't need the delalloc
@@ -6066,33 +6074,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
6066 mutex_lock(&inode->delalloc_mutex); 6074 mutex_lock(&inode->delalloc_mutex);
6067 6075
6068 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 6076 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6069retry: 6077
6078 /* Add our new extents and calculate the new rsv size. */
6070 spin_lock(&inode->lock); 6079 spin_lock(&inode->lock);
6071 reserve_extents = nr_extents = count_max_extents(num_bytes); 6080 nr_extents = count_max_extents(num_bytes);
6072 btrfs_mod_outstanding_extents(inode, nr_extents); 6081 btrfs_mod_outstanding_extents(inode, nr_extents);
6073 6082 inode->csum_bytes += num_bytes;
6074 /* 6083 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6075 * Because we add an outstanding extent for ordered before we clear
6076 * delalloc we will double count our outstanding extents slightly. This
6077 * could mean that we transiently over-reserve, which could result in an
6078 * early ENOSPC if our timing is unlucky. Keep track of the case that
6079 * we had a reservation underflow so we can retry if we fail.
6080 *
6081 * Keep in mind we can legitimately have more outstanding extents than
6082 * reserved because of fragmentation, so only allow a retry once.
6083 */
6084 if (inode->outstanding_extents >
6085 inode->reserved_extents + nr_extents) {
6086 reserve_extents = inode->outstanding_extents -
6087 inode->reserved_extents;
6088 underflow = true;
6089 }
6090
6091 /* We always want to reserve a slot for updating the inode. */
6092 to_reserve = btrfs_calc_trans_metadata_size(fs_info,
6093 reserve_extents + 1);
6094 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
6095 csum_bytes = inode->csum_bytes;
6096 spin_unlock(&inode->lock); 6084 spin_unlock(&inode->lock);
6097 6085
6098 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 6086 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
@@ -6102,100 +6090,26 @@ retry:
6102 goto out_fail; 6090 goto out_fail;
6103 } 6091 }
6104 6092
6105 ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); 6093 ret = btrfs_inode_rsv_refill(inode, flush);
6106 if (unlikely(ret)) { 6094 if (unlikely(ret)) {
6107 btrfs_qgroup_free_meta(root, 6095 btrfs_qgroup_free_meta(root,
6108 nr_extents * fs_info->nodesize); 6096 nr_extents * fs_info->nodesize);
6109 goto out_fail; 6097 goto out_fail;
6110 } 6098 }
6111 6099
6112 spin_lock(&inode->lock);
6113 if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
6114 &inode->runtime_flags)) {
6115 to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1);
6116 release_extra = true;
6117 }
6118 btrfs_mod_reserved_extents(inode, reserve_extents);
6119 spin_unlock(&inode->lock);
6120
6121 if (delalloc_lock) 6100 if (delalloc_lock)
6122 mutex_unlock(&inode->delalloc_mutex); 6101 mutex_unlock(&inode->delalloc_mutex);
6123
6124 if (to_reserve)
6125 trace_btrfs_space_reservation(fs_info, "delalloc",
6126 btrfs_ino(inode), to_reserve, 1);
6127 if (release_extra)
6128 btrfs_block_rsv_release(fs_info, block_rsv,
6129 btrfs_calc_trans_metadata_size(fs_info, 1));
6130 return 0; 6102 return 0;
6131 6103
6132out_fail: 6104out_fail:
6133 spin_lock(&inode->lock); 6105 spin_lock(&inode->lock);
6134 nr_extents = count_max_extents(num_bytes); 6106 nr_extents = count_max_extents(num_bytes);
6135 btrfs_mod_outstanding_extents(inode, -nr_extents); 6107 btrfs_mod_outstanding_extents(inode, -nr_extents);
6136 6108 inode->csum_bytes -= num_bytes;
6137 dropped = drop_over_reserved_extents(inode); 6109 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6138 /*
6139 * If the inodes csum_bytes is the same as the original
6140 * csum_bytes then we know we haven't raced with any free()ers
6141 * so we can just reduce our inodes csum bytes and carry on.
6142 */
6143 if (inode->csum_bytes == csum_bytes) {
6144 calc_csum_metadata_size(inode, num_bytes, 0);
6145 } else {
6146 u64 orig_csum_bytes = inode->csum_bytes;
6147 u64 bytes;
6148
6149 /*
6150 * This is tricky, but first we need to figure out how much we
6151 * freed from any free-ers that occurred during this
6152 * reservation, so we reset ->csum_bytes to the csum_bytes
6153 * before we dropped our lock, and then call the free for the
6154 * number of bytes that were freed while we were trying our
6155 * reservation.
6156 */
6157 bytes = csum_bytes - inode->csum_bytes;
6158 inode->csum_bytes = csum_bytes;
6159 to_free = calc_csum_metadata_size(inode, bytes, 0);
6160
6161
6162 /*
6163 * Now we need to see how much we would have freed had we not
6164 * been making this reservation and our ->csum_bytes were not
6165 * artificially inflated.
6166 */
6167 inode->csum_bytes = csum_bytes - num_bytes;
6168 bytes = csum_bytes - orig_csum_bytes;
6169 bytes = calc_csum_metadata_size(inode, bytes, 0);
6170
6171 /*
6172 * Now reset ->csum_bytes to what it should be. If bytes is
6173 * more than to_free then we would have freed more space had we
6174 * not had an artificially high ->csum_bytes, so we need to free
6175 * the remainder. If bytes is the same or less then we don't
6176 * need to do anything, the other free-ers did the correct
6177 * thing.
6178 */
6179 inode->csum_bytes = orig_csum_bytes - num_bytes;
6180 if (bytes > to_free)
6181 to_free = bytes - to_free;
6182 else
6183 to_free = 0;
6184 }
6185 spin_unlock(&inode->lock); 6110 spin_unlock(&inode->lock);
6186 if (dropped)
6187 to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
6188 6111
6189 if (to_free) { 6112 btrfs_inode_rsv_release(inode);
6190 btrfs_block_rsv_release(fs_info, block_rsv, to_free);
6191 trace_btrfs_space_reservation(fs_info, "delalloc",
6192 btrfs_ino(inode), to_free, 0);
6193 }
6194 if (underflow && !did_retry) {
6195 did_retry = true;
6196 underflow = false;
6197 goto retry;
6198 }
6199 if (delalloc_lock) 6113 if (delalloc_lock)
6200 mutex_unlock(&inode->delalloc_mutex); 6114 mutex_unlock(&inode->delalloc_mutex);
6201 return ret; 6115 return ret;
@@ -6213,25 +6127,17 @@ out_fail:
6213void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) 6127void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
6214{ 6128{
6215 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 6129 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6216 u64 to_free = 0;
6217 unsigned dropped;
6218 6130
6219 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 6131 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6220 spin_lock(&inode->lock); 6132 spin_lock(&inode->lock);
6221 dropped = drop_over_reserved_extents(inode); 6133 inode->csum_bytes -= num_bytes;
6222 if (num_bytes) 6134 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6223 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
6224 spin_unlock(&inode->lock); 6135 spin_unlock(&inode->lock);
6225 if (dropped > 0)
6226 to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
6227 6136
6228 if (btrfs_is_testing(fs_info)) 6137 if (btrfs_is_testing(fs_info))
6229 return; 6138 return;
6230 6139
6231 trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), 6140 btrfs_inode_rsv_release(inode);
6232 to_free, 0);
6233
6234 btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
6235} 6141}
6236 6142
6237/** 6143/**
@@ -6249,25 +6155,17 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
6249{ 6155{
6250 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 6156 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6251 unsigned num_extents; 6157 unsigned num_extents;
6252 u64 to_free;
6253 unsigned dropped;
6254 6158
6255 spin_lock(&inode->lock); 6159 spin_lock(&inode->lock);
6256 num_extents = count_max_extents(num_bytes); 6160 num_extents = count_max_extents(num_bytes);
6257 btrfs_mod_outstanding_extents(inode, -num_extents); 6161 btrfs_mod_outstanding_extents(inode, -num_extents);
6258 dropped = drop_over_reserved_extents(inode); 6162 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6259 spin_unlock(&inode->lock); 6163 spin_unlock(&inode->lock);
6260 6164
6261 if (!dropped)
6262 return;
6263
6264 if (btrfs_is_testing(fs_info)) 6165 if (btrfs_is_testing(fs_info))
6265 return; 6166 return;
6266 6167
6267 to_free = btrfs_calc_trans_metadata_size(fs_info, dropped); 6168 btrfs_inode_rsv_release(inode);
6268 trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
6269 to_free, 0);
6270 btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
6271} 6169}
6272 6170
6273/** 6171/**
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5b0de1e120d1..b71731ef28c4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -42,6 +42,7 @@
42#include <linux/blkdev.h> 42#include <linux/blkdev.h>
43#include <linux/posix_acl_xattr.h> 43#include <linux/posix_acl_xattr.h>
44#include <linux/uio.h> 44#include <linux/uio.h>
45#include <linux/magic.h>
45#include "ctree.h" 46#include "ctree.h"
46#include "disk-io.h" 47#include "disk-io.h"
47#include "transaction.h" 48#include "transaction.h"
@@ -315,7 +316,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
315 btrfs_free_path(path); 316 btrfs_free_path(path);
316 return PTR_ERR(trans); 317 return PTR_ERR(trans);
317 } 318 }
318 trans->block_rsv = &fs_info->delalloc_block_rsv; 319 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
319 320
320 if (compressed_size && compressed_pages) 321 if (compressed_size && compressed_pages)
321 extent_item_size = btrfs_file_extent_calc_inline_size( 322 extent_item_size = btrfs_file_extent_calc_inline_size(
@@ -2954,7 +2955,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2954 trans = NULL; 2955 trans = NULL;
2955 goto out; 2956 goto out;
2956 } 2957 }
2957 trans->block_rsv = &fs_info->delalloc_block_rsv; 2958 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
2958 ret = btrfs_update_inode_fallback(trans, root, inode); 2959 ret = btrfs_update_inode_fallback(trans, root, inode);
2959 if (ret) /* -ENOMEM or corruption */ 2960 if (ret) /* -ENOMEM or corruption */
2960 btrfs_abort_transaction(trans, ret); 2961 btrfs_abort_transaction(trans, ret);
@@ -2990,7 +2991,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2990 goto out; 2991 goto out;
2991 } 2992 }
2992 2993
2993 trans->block_rsv = &fs_info->delalloc_block_rsv; 2994 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
2994 2995
2995 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 2996 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
2996 compress_type = ordered_extent->compress_type; 2997 compress_type = ordered_extent->compress_type;
@@ -8845,7 +8846,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8845 if (iov_iter_rw(iter) == WRITE) { 8846 if (iov_iter_rw(iter) == WRITE) {
8846 up_read(&BTRFS_I(inode)->dio_sem); 8847 up_read(&BTRFS_I(inode)->dio_sem);
8847 current->journal_info = NULL; 8848 current->journal_info = NULL;
8848 btrfs_delalloc_release_extents(BTRFS_I(inode), count);
8849 if (ret < 0 && ret != -EIOCBQUEUED) { 8849 if (ret < 0 && ret != -EIOCBQUEUED) {
8850 if (dio_data.reserve) 8850 if (dio_data.reserve)
8851 btrfs_delalloc_release_space(inode, data_reserved, 8851 btrfs_delalloc_release_space(inode, data_reserved,
@@ -8866,6 +8866,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8866 } else if (ret >= 0 && (size_t)ret < count) 8866 } else if (ret >= 0 && (size_t)ret < count)
8867 btrfs_delalloc_release_space(inode, data_reserved, 8867 btrfs_delalloc_release_space(inode, data_reserved,
8868 offset, count - (size_t)ret); 8868 offset, count - (size_t)ret);
8869 btrfs_delalloc_release_extents(BTRFS_I(inode), count);
8869 } 8870 }
8870out: 8871out:
8871 if (wakeup) 8872 if (wakeup)
@@ -9430,6 +9431,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
9430 9431
9431struct inode *btrfs_alloc_inode(struct super_block *sb) 9432struct inode *btrfs_alloc_inode(struct super_block *sb)
9432{ 9433{
9434 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
9433 struct btrfs_inode *ei; 9435 struct btrfs_inode *ei;
9434 struct inode *inode; 9436 struct inode *inode;
9435 9437
@@ -9456,8 +9458,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
9456 9458
9457 spin_lock_init(&ei->lock); 9459 spin_lock_init(&ei->lock);
9458 ei->outstanding_extents = 0; 9460 ei->outstanding_extents = 0;
9459 ei->reserved_extents = 0; 9461 if (sb->s_magic != BTRFS_TEST_MAGIC)
9460 9462 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
9463 BTRFS_BLOCK_RSV_DELALLOC);
9461 ei->runtime_flags = 0; 9464 ei->runtime_flags = 0;
9462 ei->prop_compress = BTRFS_COMPRESS_NONE; 9465 ei->prop_compress = BTRFS_COMPRESS_NONE;
9463 ei->defrag_compress = BTRFS_COMPRESS_NONE; 9466 ei->defrag_compress = BTRFS_COMPRESS_NONE;
@@ -9507,8 +9510,9 @@ void btrfs_destroy_inode(struct inode *inode)
9507 9510
9508 WARN_ON(!hlist_empty(&inode->i_dentry)); 9511 WARN_ON(!hlist_empty(&inode->i_dentry));
9509 WARN_ON(inode->i_data.nrpages); 9512 WARN_ON(inode->i_data.nrpages);
9513 WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
9514 WARN_ON(BTRFS_I(inode)->block_rsv.size);
9510 WARN_ON(BTRFS_I(inode)->outstanding_extents); 9515 WARN_ON(BTRFS_I(inode)->outstanding_extents);
9511 WARN_ON(BTRFS_I(inode)->reserved_extents);
9512 WARN_ON(BTRFS_I(inode)->delalloc_bytes); 9516 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
9513 WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); 9517 WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
9514 WARN_ON(BTRFS_I(inode)->csum_bytes); 9518 WARN_ON(BTRFS_I(inode)->csum_bytes);