aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
author: Josef Bacik <josef@redhat.com> 2012-05-23 14:26:42 -0400
committer: Josef Bacik <josef@redhat.com> 2012-05-30 10:23:37 -0400
commit: 8a35d95ff4680a456d3ce47df9638f33d4f54f20 (patch)
tree: 8adb116747b84209406d29767b56e2a6ade2f2f4 /fs
parent: 72ac3c0d7921f943d92d1ef42a549fb52e56817d (diff)
Btrfs: fix how we deal with the orphan block rsv
Ceph was hitting this race where we would remove an inode from the per-root orphan list before we would release the space we had reserved for the inode. We actually don't need a list or anything, we just need to make sure the root doesn't try to free up the orphan reserve until after the inodes have released their reservations. So use an atomic counter instead of a list on the root and only decrement the counter after we've released our reservation. I've tested this as well as several others and we no longer see the warnings that you would see while running ceph. Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Diffstat (limited to 'fs')
-rw-r--r-- fs/btrfs/btrfs_inode.h 4
-rw-r--r-- fs/btrfs/ctree.h 2
-rw-r--r-- fs/btrfs/disk-io.c 2
-rw-r--r-- fs/btrfs/inode.c 38
4 files changed, 24 insertions, 22 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6265edb219e..ce2c9d60031 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -36,6 +36,7 @@
36#define BTRFS_INODE_DUMMY 2 36#define BTRFS_INODE_DUMMY 2
37#define BTRFS_INODE_IN_DEFRAG 3 37#define BTRFS_INODE_IN_DEFRAG 3
38#define BTRFS_INODE_DELALLOC_META_RESERVED 4 38#define BTRFS_INODE_DELALLOC_META_RESERVED 4
39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
39 40
40/* in memory btrfs inode */ 41/* in memory btrfs inode */
41struct btrfs_inode { 42struct btrfs_inode {
@@ -70,9 +71,6 @@ struct btrfs_inode {
70 /* used to order data wrt metadata */ 71 /* used to order data wrt metadata */
71 struct btrfs_ordered_inode_tree ordered_tree; 72 struct btrfs_ordered_inode_tree ordered_tree;
72 73
73 /* for keeping track of orphaned inodes */
74 struct list_head i_orphan;
75
76 /* list of all the delalloc inodes in the FS. There are times we need 74 /* list of all the delalloc inodes in the FS. There are times we need
77 * to write all the delalloc pages to disk, and this list is used 75 * to write all the delalloc pages to disk, and this list is used
78 * to walk them all. 76 * to walk them all.
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8fd72331d60..aad2600718a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1375,7 +1375,7 @@ struct btrfs_root {
1375 struct list_head root_list; 1375 struct list_head root_list;
1376 1376
1377 spinlock_t orphan_lock; 1377 spinlock_t orphan_lock;
1378 struct list_head orphan_list; 1378 atomic_t orphan_inodes;
1379 struct btrfs_block_rsv *orphan_block_rsv; 1379 struct btrfs_block_rsv *orphan_block_rsv;
1380 int orphan_item_inserted; 1380 int orphan_item_inserted;
1381 int orphan_cleanup_state; 1381 int orphan_cleanup_state;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0cf8ef2b5b1..297e5a8ed93 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1153 root->orphan_block_rsv = NULL; 1153 root->orphan_block_rsv = NULL;
1154 1154
1155 INIT_LIST_HEAD(&root->dirty_list); 1155 INIT_LIST_HEAD(&root->dirty_list);
1156 INIT_LIST_HEAD(&root->orphan_list);
1157 INIT_LIST_HEAD(&root->root_list); 1156 INIT_LIST_HEAD(&root->root_list);
1158 spin_lock_init(&root->orphan_lock); 1157 spin_lock_init(&root->orphan_lock);
1159 spin_lock_init(&root->inode_lock); 1158 spin_lock_init(&root->inode_lock);
@@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1166 atomic_set(&root->log_commit[0], 0); 1165 atomic_set(&root->log_commit[0], 0);
1167 atomic_set(&root->log_commit[1], 0); 1166 atomic_set(&root->log_commit[1], 0);
1168 atomic_set(&root->log_writers, 0); 1167 atomic_set(&root->log_writers, 0);
1168 atomic_set(&root->orphan_inodes, 0);
1169 root->log_batch = 0; 1169 root->log_batch = 0;
1170 root->log_transid = 0; 1170 root->log_transid = 0;
1171 root->last_log_commit = 0; 1171 root->last_log_commit = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 91ad6390175..029892887fc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2104,12 +2104,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2104 struct btrfs_block_rsv *block_rsv; 2104 struct btrfs_block_rsv *block_rsv;
2105 int ret; 2105 int ret;
2106 2106
2107 if (!list_empty(&root->orphan_list) || 2107 if (atomic_read(&root->orphan_inodes) ||
2108 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 2108 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2109 return; 2109 return;
2110 2110
2111 spin_lock(&root->orphan_lock); 2111 spin_lock(&root->orphan_lock);
2112 if (!list_empty(&root->orphan_list)) { 2112 if (atomic_read(&root->orphan_inodes)) {
2113 spin_unlock(&root->orphan_lock); 2113 spin_unlock(&root->orphan_lock);
2114 return; 2114 return;
2115 } 2115 }
@@ -2166,8 +2166,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2166 block_rsv = NULL; 2166 block_rsv = NULL;
2167 } 2167 }
2168 2168
2169 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2169 if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2170 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2170 &BTRFS_I(inode)->runtime_flags)) {
2171#if 0 2171#if 0
2172 /* 2172 /*
2173 * For proper ENOSPC handling, we should do orphan 2173 * For proper ENOSPC handling, we should do orphan
@@ -2180,6 +2180,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2180 insert = 1; 2180 insert = 1;
2181#endif 2181#endif
2182 insert = 1; 2182 insert = 1;
2183 atomic_dec(&root->orphan_inodes);
2183 } 2184 }
2184 2185
2185 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2186 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
@@ -2197,6 +2198,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2197 if (insert >= 1) { 2198 if (insert >= 1) {
2198 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2199 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2199 if (ret && ret != -EEXIST) { 2200 if (ret && ret != -EEXIST) {
2201 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2202 &BTRFS_I(inode)->runtime_flags);
2200 btrfs_abort_transaction(trans, root, ret); 2203 btrfs_abort_transaction(trans, root, ret);
2201 return ret; 2204 return ret;
2202 } 2205 }
@@ -2227,10 +2230,9 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2227 int ret = 0; 2230 int ret = 0;
2228 2231
2229 spin_lock(&root->orphan_lock); 2232 spin_lock(&root->orphan_lock);
2230 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2233 if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2231 list_del_init(&BTRFS_I(inode)->i_orphan); 2234 &BTRFS_I(inode)->runtime_flags))
2232 delete_item = 1; 2235 delete_item = 1;
2233 }
2234 2236
2235 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2237 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2236 &BTRFS_I(inode)->runtime_flags)) 2238 &BTRFS_I(inode)->runtime_flags))
@@ -2242,8 +2244,10 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2242 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2244 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
2243 } 2245 }
2244 2246
2245 if (release_rsv) 2247 if (release_rsv) {
2246 btrfs_orphan_release_metadata(inode); 2248 btrfs_orphan_release_metadata(inode);
2249 atomic_dec(&root->orphan_inodes);
2250 }
2247 2251
2248 return 0; 2252 return 0;
2249} 2253}
@@ -2371,6 +2375,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2371 ret = PTR_ERR(trans); 2375 ret = PTR_ERR(trans);
2372 goto out; 2376 goto out;
2373 } 2377 }
2378 printk(KERN_ERR "auto deleting %Lu\n",
2379 found_key.objectid);
2374 ret = btrfs_del_orphan_item(trans, root, 2380 ret = btrfs_del_orphan_item(trans, root,
2375 found_key.objectid); 2381 found_key.objectid);
2376 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2382 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
@@ -2382,9 +2388,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2382 * add this inode to the orphan list so btrfs_orphan_del does 2388 * add this inode to the orphan list so btrfs_orphan_del does
2383 * the proper thing when we hit it 2389 * the proper thing when we hit it
2384 */ 2390 */
2385 spin_lock(&root->orphan_lock); 2391 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2386 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2392 &BTRFS_I(inode)->runtime_flags);
2387 spin_unlock(&root->orphan_lock);
2388 2393
2389 /* if we have links, this was a truncate, lets do that */ 2394 /* if we have links, this was a truncate, lets do that */
2390 if (inode->i_nlink) { 2395 if (inode->i_nlink) {
@@ -3706,7 +3711,8 @@ void btrfs_evict_inode(struct inode *inode)
3706 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3711 btrfs_wait_ordered_range(inode, 0, (u64)-1);
3707 3712
3708 if (root->fs_info->log_root_recovering) { 3713 if (root->fs_info->log_root_recovering) {
3709 BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); 3714 BUG_ON(!test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3715 &BTRFS_I(inode)->runtime_flags));
3710 goto no_delete; 3716 goto no_delete;
3711 } 3717 }
3712 3718
@@ -6903,7 +6909,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6903 mutex_init(&ei->log_mutex); 6909 mutex_init(&ei->log_mutex);
6904 mutex_init(&ei->delalloc_mutex); 6910 mutex_init(&ei->delalloc_mutex);
6905 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6911 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
6906 INIT_LIST_HEAD(&ei->i_orphan);
6907 INIT_LIST_HEAD(&ei->delalloc_inodes); 6912 INIT_LIST_HEAD(&ei->delalloc_inodes);
6908 INIT_LIST_HEAD(&ei->ordered_operations); 6913 INIT_LIST_HEAD(&ei->ordered_operations);
6909 RB_CLEAR_NODE(&ei->rb_node); 6914 RB_CLEAR_NODE(&ei->rb_node);
@@ -6948,13 +6953,12 @@ void btrfs_destroy_inode(struct inode *inode)
6948 spin_unlock(&root->fs_info->ordered_extent_lock); 6953 spin_unlock(&root->fs_info->ordered_extent_lock);
6949 } 6954 }
6950 6955
6951 spin_lock(&root->orphan_lock); 6956 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
6952 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6957 &BTRFS_I(inode)->runtime_flags)) {
6953 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", 6958 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
6954 (unsigned long long)btrfs_ino(inode)); 6959 (unsigned long long)btrfs_ino(inode));
6955 list_del_init(&BTRFS_I(inode)->i_orphan); 6960 atomic_dec(&root->orphan_inodes);
6956 } 6961 }
6957 spin_unlock(&root->orphan_lock);
6958 6962
6959 while (1) { 6963 while (1) {
6960 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 6964 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);