author     Chris Mason <clm@fb.com>            2017-12-15 14:58:27 -0500
committer  David Sterba <dsterba@suse.com>     2018-01-02 12:00:14 -0500
commit     ec35e48b286959991cdbb886f1bdeda4575c80b4
tree       58d7678f440f948d056510da948b6d3caa3d5a18
parent     beed9263f4000c48a5c48912f26576f6fa091181
btrfs: fix refcount_t usage when deleting btrfs_delayed_nodes
refcounts have a generic implementation and an asm optimized one. The
generic version has extra debugging to make sure that once a refcount
goes to zero, refcount_inc won't increase it.
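As a rough illustration, the debug behaviour amounts to something like the
following (a simplified sketch only; the kernel's actual refcount_inc() is a
saturating cmpxchg loop, and sketch_refcount_inc() is a hypothetical name):

  /*
   * Simplified sketch of the generic debug semantics: once the count
   * has dropped to zero, an increment is refused with a warning
   * instead of resurrecting the object.  (The real code also closes
   * the window between the read and the increment; elided here.)
   */
  static inline void sketch_refcount_inc(refcount_t *r)
  {
          if (refcount_read(r) == 0) {
                  WARN_ONCE(1, "refcount_inc() on a zero refcount");
                  return;
          }
          atomic_inc(&r->refs);
  }

The asm optimized version omits this zero check, so the same race passes
silently there.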
The btrfs delayed inode code wasn't expecting this, and we're tripping
over the warnings when the generic refcounts are used. We ended up with
this race:
Process A                                    Process B
                                             btrfs_get_delayed_node()
                                             spin_lock(root->inode_lock)
                                             radix_tree_lookup()
__btrfs_release_delayed_node()
refcount_dec_and_test(&delayed_node->refs)
our refcount is now zero
                                             refcount_add(2) <---
                                             warning here, refcount
                                             unchanged

spin_lock(root->inode_lock)
radix_tree_delete()
With the generic refcounts, we actually warn again when process B above
tries to drop its refcount, because refcount_add() had turned into a
no-op.
We saw this in production on older kernels without the asm optimized
refcounts.
The fix used here is to use refcount_inc_not_zero() to detect when the
object is in the middle of being freed and return NULL. This is almost
always the right answer anyway, since we usually end up pitching the
delayed_node if it didn't have fresh data in it.
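Condensed, the fixed lookup has this shape (a sketch of the pattern only,
not the full hunk; the real change in the diff below also takes a second
reference so the node can be cached in the inode):

  spin_lock(&root->inode_lock);
  node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
  if (node && !refcount_inc_not_zero(&node->refs))
          node = NULL;    /* racing with release; treat as never found */
  spin_unlock(&root->inode_lock);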
This also changes __btrfs_release_delayed_node() to remove the extra
check for zero refcounts before radix tree deletion.
btrfs_get_delayed_node() was the only path that was allowing refcounts
to go from zero to one.
Fixes: 6de5f18e7b0da ("btrfs: convert btrfs_delayed_node.refs from atomic_t to refcount_t")
CC: <stable@vger.kernel.org> # 4.12+
Signed-off-by: Chris Mason <clm@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r--  fs/btrfs/delayed-inode.c | 45 ++++++++++++++++++++++++++++++++++-----------
1 file changed, 34 insertions(+), 11 deletions(-)
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 5d73f79ded8b..056276101c63 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -87,6 +87,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 
 	spin_lock(&root->inode_lock);
 	node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+
 	if (node) {
 		if (btrfs_inode->delayed_node) {
 			refcount_inc(&node->refs);	/* can be accessed */
@@ -94,9 +95,30 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 			spin_unlock(&root->inode_lock);
 			return node;
 		}
-		btrfs_inode->delayed_node = node;
-		/* can be accessed and cached in the inode */
-		refcount_add(2, &node->refs);
+
+		/*
+		 * It's possible that we're racing into the middle of removing
+		 * this node from the radix tree.  In this case, the refcount
+		 * was zero and it should never go back to one.  Just return
+		 * NULL like it was never in the radix at all; our release
+		 * function is in the process of removing it.
+		 *
+		 * Some implementations of refcount_inc refuse to bump the
+		 * refcount once it has hit zero.  If we don't do this dance
+		 * here, refcount_inc() may decide to just WARN_ONCE() instead
+		 * of actually bumping the refcount.
+		 *
+		 * If this node is properly in the radix, we want to bump the
+		 * refcount twice, once for the inode and once for this get
+		 * operation.
+		 */
+		if (refcount_inc_not_zero(&node->refs)) {
+			refcount_inc(&node->refs);
+			btrfs_inode->delayed_node = node;
+		} else {
+			node = NULL;
+		}
+
 		spin_unlock(&root->inode_lock);
 		return node;
 	}
@@ -254,17 +276,18 @@ static void __btrfs_release_delayed_node(
 	mutex_unlock(&delayed_node->mutex);
 
 	if (refcount_dec_and_test(&delayed_node->refs)) {
-		bool free = false;
 		struct btrfs_root *root = delayed_node->root;
+
 		spin_lock(&root->inode_lock);
-		if (refcount_read(&delayed_node->refs) == 0) {
-			radix_tree_delete(&root->delayed_nodes_tree,
-					  delayed_node->inode_id);
-			free = true;
-		}
+		/*
+		 * Once our refcount goes to zero, nobody is allowed to bump it
+		 * back up.  We can delete it now.
+		 */
+		ASSERT(refcount_read(&delayed_node->refs) == 0);
+		radix_tree_delete(&root->delayed_nodes_tree,
+				  delayed_node->inode_id);
 		spin_unlock(&root->inode_lock);
-		if (free)
-			kmem_cache_free(delayed_node_cache, delayed_node);
+		kmem_cache_free(delayed_node_cache, delayed_node);
 	}
 }