aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
author Josef Bacik <josef@redhat.com> 2011-11-08 15:47:34 -0500
committer Chris Mason <chris.mason@oracle.com> 2011-11-08 15:47:34 -0500
commit 7fd2ae21a42d178982679b86086661292b4afe4a (patch)
tree 236c22d807ca5f2419a1b0394bd1092aab730cde /fs
parent 917c16b2b69fc2eeb432eabca73258f08c58361e (diff)
Btrfs: fix our reservations for updating an inode when completing io
People have been reporting ENOSPC crashes in finish_ordered_io. This is because we try to steal from the delalloc block rsv to satisfy a reservation to update the inode. The problem with this is that we don't explicitly save space for updating the inode when doing delalloc. This is kind of a problem, and we've gotten away with it because way back when, we just stole from the delalloc reserve without any questions. That worked out fine because, generally speaking, the leaf had already been modified, either by the mtime update when we did the original write or because we had just updated the leaf when we inserted the file extent item. Only on rare occasions had the leaf not actually been modified, and that was still ok because we'd just use a block or two out of the over-reservation that is delalloc.

Then came the delayed inode stuff. This is amazing, except it wants a full reservation for updating the inode, since it may do the update at some point down the road after we've written the blocks, and we have to recow everything again. This worked out because the delayed inode stuff just stole from the global reserve, that is, until recently when I changed that because it caused other problems.

So here we are, we're doing everything right and being screwed for it. So take an extra reservation for the inode at delalloc reservation time and carry it through the life of the delalloc reservation. If we need it, we can steal it in the delayed inode stuff. If we have already stolen it, try to do a normal metadata reservation. If that fails, try to steal from the delalloc reservation. If _that_ fails, we'll get a WARN_ON() so I can start thinking of a better way to solve this, and in the meantime we'll steal from the global reserve.

With this patch I ran xfstests 13 in a loop for a couple of hours and didn't see any problems.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs')
-rw-r--r-- fs/btrfs/btrfs_inode.h 4
-rw-r--r-- fs/btrfs/delayed-inode.c 65
-rw-r--r-- fs/btrfs/extent-tree.c 22
-rw-r--r-- fs/btrfs/inode.c 1
4 files changed, 83 insertions, 9 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5a5d325a3935..634608d2a6d0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -147,14 +147,12 @@ struct btrfs_inode {
147 * the btrfs file release call will add this inode to the 147 * the btrfs file release call will add this inode to the
148 * ordered operations list so that we make sure to flush out any 148 * ordered operations list so that we make sure to flush out any
149 * new data the application may have written before commit. 149 * new data the application may have written before commit.
150 *
151 * yes, its silly to have a single bitflag, but we might grow more
152 * of these.
153 */ 150 */
154 unsigned ordered_data_close:1; 151 unsigned ordered_data_close:1;
155 unsigned orphan_meta_reserved:1; 152 unsigned orphan_meta_reserved:1;
156 unsigned dummy_inode:1; 153 unsigned dummy_inode:1;
157 unsigned in_defrag:1; 154 unsigned in_defrag:1;
155 unsigned delalloc_meta_reserved:1;
158 156
159 /* 157 /*
160 * always compress this one file 158 * always compress this one file
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index bbe8496d5339..313ee14cf3b7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -617,12 +617,14 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
617static int btrfs_delayed_inode_reserve_metadata( 617static int btrfs_delayed_inode_reserve_metadata(
618 struct btrfs_trans_handle *trans, 618 struct btrfs_trans_handle *trans,
619 struct btrfs_root *root, 619 struct btrfs_root *root,
620 struct inode *inode,
620 struct btrfs_delayed_node *node) 621 struct btrfs_delayed_node *node)
621{ 622{
622 struct btrfs_block_rsv *src_rsv; 623 struct btrfs_block_rsv *src_rsv;
623 struct btrfs_block_rsv *dst_rsv; 624 struct btrfs_block_rsv *dst_rsv;
624 u64 num_bytes; 625 u64 num_bytes;
625 int ret; 626 int ret;
627 int release = false;
626 628
627 src_rsv = trans->block_rsv; 629 src_rsv = trans->block_rsv;
628 dst_rsv = &root->fs_info->delayed_block_rsv; 630 dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -652,11 +654,67 @@ static int btrfs_delayed_inode_reserve_metadata(
652 if (!ret) 654 if (!ret)
653 node->bytes_reserved = num_bytes; 655 node->bytes_reserved = num_bytes;
654 return ret; 656 return ret;
657 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
658 spin_lock(&BTRFS_I(inode)->lock);
659 if (BTRFS_I(inode)->delalloc_meta_reserved) {
660 BTRFS_I(inode)->delalloc_meta_reserved = 0;
661 spin_unlock(&BTRFS_I(inode)->lock);
662 release = true;
663 goto migrate;
664 }
665 spin_unlock(&BTRFS_I(inode)->lock);
666
667 /* Ok we didn't have space pre-reserved. This shouldn't happen
668 * too often but it can happen if we do delalloc to an existing
669 * inode which gets dirtied because of the time update, and then
670 * isn't touched again until after the transaction commits and
671 * then we try to write out the data. First try to be nice and
672 * reserve something strictly for us. If not be a pain and try
673 * to steal from the delalloc block rsv.
674 */
675 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
676 if (!ret)
677 goto out;
678
679 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
680 if (!ret)
681 goto out;
682
683 /*
684 * Ok this is a problem, let's just steal from the global rsv
685 * since this really shouldn't happen that often.
686 */
687 WARN_ON(1);
688 ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
689 dst_rsv, num_bytes);
690 goto out;
655 } 691 }
656 692
693migrate:
657 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 694 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
658 if (!ret) 695 if (unlikely(ret)) {
659 node->bytes_reserved = num_bytes; 696 /* This shouldn't happen */
697 BUG_ON(release);
698 return ret;
699 }
700
701out:
702 /*
703 * Migrate only takes a reservation, it doesn't touch the size of the
704 * block_rsv. This is to simplify people who don't normally have things
705 * migrated from their block rsv. If they go to release their
706 * reservation, that will decrease the size as well, so if migrate
707 * reduced size we'd end up with a negative size. But for the
708 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
709 * but we could in fact do this reserve/migrate dance several times
710 * between the time we did the original reservation and we'd clean it
711 * up. So to take care of this, release the space for the meta
712 * reservation here. I think it may be time for a documentation page on
713 * how block rsvs. work.
714 */
715 if (release)
716 btrfs_block_rsv_release(root, src_rsv, num_bytes);
717 node->bytes_reserved = num_bytes;
660 718
661 return ret; 719 return ret;
662} 720}
@@ -1708,7 +1766,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1708 goto release_node; 1766 goto release_node;
1709 } 1767 }
1710 1768
1711 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); 1769 ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
1770 delayed_node);
1712 if (ret) 1771 if (ret)
1713 goto release_node; 1772 goto release_node;
1714 1773
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 18ea90c8943b..0b044e509e9f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4063,23 +4063,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4063 */ 4063 */
4064static unsigned drop_outstanding_extent(struct inode *inode) 4064static unsigned drop_outstanding_extent(struct inode *inode)
4065{ 4065{
4066 unsigned drop_inode_space = 0;
4066 unsigned dropped_extents = 0; 4067 unsigned dropped_extents = 0;
4067 4068
4068 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4069 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4069 BTRFS_I(inode)->outstanding_extents--; 4070 BTRFS_I(inode)->outstanding_extents--;
4070 4071
4072 if (BTRFS_I(inode)->outstanding_extents == 0 &&
4073 BTRFS_I(inode)->delalloc_meta_reserved) {
4074 drop_inode_space = 1;
4075 BTRFS_I(inode)->delalloc_meta_reserved = 0;
4076 }
4077
4071 /* 4078 /*
4072 * If we have more or the same amount of outsanding extents than we have 4079 * If we have more or the same amount of outsanding extents than we have
4073 * reserved then we need to leave the reserved extents count alone. 4080 * reserved then we need to leave the reserved extents count alone.
4074 */ 4081 */
4075 if (BTRFS_I(inode)->outstanding_extents >= 4082 if (BTRFS_I(inode)->outstanding_extents >=
4076 BTRFS_I(inode)->reserved_extents) 4083 BTRFS_I(inode)->reserved_extents)
4077 return 0; 4084 return drop_inode_space;
4078 4085
4079 dropped_extents = BTRFS_I(inode)->reserved_extents - 4086 dropped_extents = BTRFS_I(inode)->reserved_extents -
4080 BTRFS_I(inode)->outstanding_extents; 4087 BTRFS_I(inode)->outstanding_extents;
4081 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4088 BTRFS_I(inode)->reserved_extents -= dropped_extents;
4082 return dropped_extents; 4089 return dropped_extents + drop_inode_space;
4083} 4090}
4084 4091
4085/** 4092/**
@@ -4165,9 +4172,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4165 nr_extents = BTRFS_I(inode)->outstanding_extents - 4172 nr_extents = BTRFS_I(inode)->outstanding_extents -
4166 BTRFS_I(inode)->reserved_extents; 4173 BTRFS_I(inode)->reserved_extents;
4167 BTRFS_I(inode)->reserved_extents += nr_extents; 4174 BTRFS_I(inode)->reserved_extents += nr_extents;
4175 }
4168 4176
4169 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4177 /*
4178 * Add an item to reserve for updating the inode when we complete the
4179 * delalloc io.
4180 */
4181 if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4182 nr_extents++;
4183 BTRFS_I(inode)->delalloc_meta_reserved = 1;
4170 } 4184 }
4185
4186 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4171 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 4187 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4172 spin_unlock(&BTRFS_I(inode)->lock); 4188 spin_unlock(&BTRFS_I(inode)->lock);
4173 4189
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f60e2490bd0d..2b920596c126 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6607,6 +6607,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6607 ei->orphan_meta_reserved = 0; 6607 ei->orphan_meta_reserved = 0;
6608 ei->dummy_inode = 0; 6608 ei->dummy_inode = 0;
6609 ei->in_defrag = 0; 6609 ei->in_defrag = 0;
6610 ei->delalloc_meta_reserved = 0;
6610 ei->force_compress = BTRFS_COMPRESS_NONE; 6611 ei->force_compress = BTRFS_COMPRESS_NONE;
6611 6612
6612 ei->delayed_node = NULL; 6613 ei->delayed_node = NULL;