author		Josef Bacik <jbacik@fb.com>	2016-03-25 13:25:50 -0400
committer	David Sterba <dsterba@suse.com>	2016-07-07 12:45:53 -0400
commit		c48f49d63dfea12c0173fb2e99db1308f39c23ae
tree		928ece07ce25e6c771f97de68fe2f7bdd091716a
parent		48c3d480e4f746edb0bd3b84b4bd987ef8728560
Btrfs: change delayed reservation fallback behavior
We reserve space for the inode update when we first reserve space for writing to a file. However, there are lots of ways that we can use this reservation and not have it for subsequent ordered extents. Previously we'd fall through and try to reserve metadata bytes for this, then we'd just steal the full reservation from the delalloc_block_rsv, and if that didn't have enough space we'd steal the full reservation from the global reserve. The problem with this is that we can easily just return ENOSPC and fall back to updating the inode item directly. In the worst case (assuming a 4k nodesize) we'd steal 64KiB from the global reserve if we fall all the way through; however, if we just fall back and update the inode directly we'd only steal 4k * BTRFS_PATH_MAX in the worst case, which is 32KiB.

We would have also just added the extent item for the inode, so we will likely have already cow'ed down most of the way to the leaf containing the inode item, and more often than not we only need one or two nodesizes' worth of reservations. Given that the reservation for the extent itself is also a worst-case amount, we will likely already have space to cover the inode update.

This change will make us behave better in the theoretical worst case, and much better in the case that we don't have our reservation and cannot reserve more metadata. Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
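[editor's illustration] A small standalone C sketch of the worst-case arithmetic quoted above. This is not kernel code: it assumes the definition btrfs_calc_trans_metadata_size(root, nr) == nodesize * BTRFS_MAX_LEVEL * 2 * nr used by kernels of this era, and takes the tree-depth limit BTRFS_MAX_LEVEL == 8 as the constant the commit message calls "BTRFS_PATH_MAX"; the helper name calc_trans_metadata_size below exists only in this sketch.

/*
 * Sketch of the 64KiB vs 32KiB comparison from the commit message.
 * Assumption: btrfs_calc_trans_metadata_size() expands to
 * nodesize * BTRFS_MAX_LEVEL * 2 * num_items (verify against your tree).
 */
#include <stdio.h>

#define BTRFS_MAX_LEVEL	8	/* maximum btree depth */

/* mirrors the assumed btrfs_calc_trans_metadata_size() formula */
static unsigned long calc_trans_metadata_size(unsigned long nodesize,
					      unsigned int num_items)
{
	return nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}

int main(void)
{
	unsigned long nodesize = 4096;	/* the 4k nodesize assumed above */

	/* Old behaviour: steal a full one-item reservation from the global rsv. */
	unsigned long full_rsv = calc_trans_metadata_size(nodesize, 1);

	/*
	 * New behaviour: fall back to a direct inode update, which at worst
	 * COWs one block per level of the path down to the inode item.
	 */
	unsigned long direct_update = nodesize * BTRFS_MAX_LEVEL;

	printf("full reservation stolen:  %lu KiB\n", full_rsv >> 10);      /* 64 */
	printf("direct update worst case: %lu KiB\n", direct_update >> 10); /* 32 */
	return 0;
}

With a 4k nodesize this reproduces the 64KiB and 32KiB figures given in the commit message.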
-rw-r--r--	fs/btrfs/delayed-inode.c	64
1 file changed, 23 insertions(+), 41 deletions(-)
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index f749a5447b2a..dd3c040139a2 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -598,6 +598,29 @@ static int btrfs_delayed_inode_reserve_metadata(
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
 
 	/*
+	 * If our block_rsv is the delalloc block reserve then check and see if
+	 * we have our extra reservation for updating the inode.  If not fall
+	 * through and try to reserve space quickly.
+	 *
+	 * We used to try and steal from the delalloc block rsv or the global
+	 * reserve, but we'd steal a full reservation, which isn't kind.  We are
+	 * here through delalloc which means we've likely just cowed down close
+	 * to the leaf that contains the inode, so we would steal less just
+	 * doing the fallback inode update, so if we do end up having to steal
+	 * from the global block rsv we hopefully only steal one or two blocks
+	 * worth which is less likely to hurt us.
+	 */
+	if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
+		spin_lock(&BTRFS_I(inode)->lock);
+		if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+				       &BTRFS_I(inode)->runtime_flags))
+			release = true;
+		else
+			src_rsv = NULL;
+		spin_unlock(&BTRFS_I(inode)->lock);
+	}
+
+	/*
 	 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
 	 * which doesn't reserve space for speed.  This is a problem since we
 	 * still need to reserve space for this update, so try to reserve the
@@ -626,51 +649,10 @@ static int btrfs_delayed_inode_reserve_metadata(
 						      num_bytes, 1);
 		}
 		return ret;
-	} else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
-		spin_lock(&BTRFS_I(inode)->lock);
-		if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
-				       &BTRFS_I(inode)->runtime_flags)) {
-			spin_unlock(&BTRFS_I(inode)->lock);
-			release = true;
-			goto migrate;
-		}
-		spin_unlock(&BTRFS_I(inode)->lock);
-
-		/* Ok we didn't have space pre-reserved.  This shouldn't happen
-		 * too often but it can happen if we do delalloc to an existing
-		 * inode which gets dirtied because of the time update, and then
-		 * isn't touched again until after the transaction commits and
-		 * then we try to write out the data.  First try to be nice and
-		 * reserve something strictly for us.  If not be a pain and try
-		 * to steal from the delalloc block rsv.
-		 */
-		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
-					  BTRFS_RESERVE_NO_FLUSH);
-		if (!ret)
-			goto out;
-
-		ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
-		if (!ret)
-			goto out;
-
-		if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
-			btrfs_debug(root->fs_info,
-				    "block rsv migrate returned %d", ret);
-			WARN_ON(1);
-		}
-		/*
-		 * Ok this is a problem, let's just steal from the global rsv
-		 * since this really shouldn't happen that often.
-		 */
-		ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
-					      dst_rsv, num_bytes, 1);
-		goto out;
 	}
 
-migrate:
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
 
-out:
 	/*
 	 * Migrate only takes a reservation, it doesn't touch the size of the
 	 * block_rsv.  This is to simplify people who don't normally have things