From a717531942f488209dded30f6bc648167bcefa72 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 22 Jan 2009 09:23:10 -0500 Subject: Btrfs: do less aggressive btree readahead Just before reading a leaf, btrfs scans the node for blocks that are close by and reads them too. It tries to build up a large window of IO looking for blocks that are within a max distance from the top and bottom of the IO window. This patch changes things to just look for blocks within 64k of the target block. It will trigger less IO and make for lower latencies on the read side. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 9e46c0776816..2603ee539b7a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1210,8 +1210,7 @@ static noinline void reada_for_search(struct btrfs_root *root, struct btrfs_disk_key disk_key; u32 nritems; u64 search; - u64 lowest_read; - u64 highest_read; + u64 target; u64 nread = 0; int direction = path->reada; struct extent_buffer *eb; @@ -1235,8 +1234,7 @@ static noinline void reada_for_search(struct btrfs_root *root, return; } - highest_read = search; - lowest_read = search; + target = search; nritems = btrfs_header_nritems(node); nr = slot; @@ -1256,24 +1254,15 @@ static noinline void reada_for_search(struct btrfs_root *root, break; } search = btrfs_node_blockptr(node, nr); - if ((search >= lowest_read && search <= highest_read) || - (search < lowest_read && lowest_read - search <= 16384) || - (search > highest_read && search - highest_read <= 16384)) { + if ((search <= target && target - search <= 65536) || - (search > target && search - target <= 65536)) { readahead_tree_block(root, search, blocksize, btrfs_node_ptr_generation(node, nr)); nread += blocksize; } nscan++; - if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32)) + if ((nread > 65536 || nscan > 32)) break; - - if (nread > (256 * 1024) || nscan > 128) - break; - - if (search < lowest_read) - lowest_read = search; - if (search > highest_read) - highest_read = search; } } -- cgit v1.2.2 From c487685d7c18a8481900755aa5c56a7a74193101 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 4 Feb 2009 09:24:25 -0500 Subject: Btrfs: hash_lock is no longer needed Before metadata is written to disk, it is updated to reflect that writeout has begun. Once this update is done, the block must be cow'd before it can be modified again. This update was originally synchronized by using a per-fs spinlock. Today the buffers for the metadata blocks are locked before writeout begins, and everyone that tests the flag has the buffer locked as well. So, the per-fs spinlock (called hash_lock for no good reason) is no longer required.
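As an illustration of the point above, here is a minimal standalone sketch (not the btrfs code; the struct and function names are made up and a pthread mutex stands in for the buffer lock): the "written" flag is only set and tested while the buffer's own lock is held, so a filesystem-wide spinlock adds nothing.

#include <pthread.h>
#include <stdbool.h>

struct buffer {
	pthread_mutex_t lock;     /* per-buffer lock, held during writeout */
	bool written;             /* set once writeout of this block begins */
	unsigned long generation; /* transaction that last touched the block */
	unsigned long owner;      /* root that owns the block */
};

/* writeout path: mark the buffer under its own lock */
static void mark_written(struct buffer *buf)
{
	pthread_mutex_lock(&buf->lock);
	buf->written = true;
	pthread_mutex_unlock(&buf->lock);
}

/* COW check: the same per-buffer lock protects the flag, no global lock */
static bool can_modify_in_place(struct buffer *buf, unsigned long transid,
				unsigned long root_owner)
{
	bool ok;

	pthread_mutex_lock(&buf->lock);
	ok = buf->generation == transid &&
	     buf->owner == root_owner &&
	     !buf->written;
	pthread_mutex_unlock(&buf->lock);
	return ok;
}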
Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2603ee539b7a..3b6e35aafc9e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -388,16 +388,14 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, WARN_ON(1); } - spin_lock(&root->fs_info->hash_lock); if (btrfs_header_generation(buf) == trans->transid && btrfs_header_owner(buf) == root->root_key.objectid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { *cow_ret = buf; - spin_unlock(&root->fs_info->hash_lock); WARN_ON(prealloc_dest); return 0; } - spin_unlock(&root->fs_info->hash_lock); + search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); ret = __btrfs_cow_block(trans, root, buf, parent, parent_slot, cow_ret, search_start, 0, @@ -1376,14 +1374,11 @@ again: int wret; /* is a cow on this block not required */ - spin_lock(&root->fs_info->hash_lock); if (btrfs_header_generation(b) == trans->transid && btrfs_header_owner(b) == root->root_key.objectid && !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { - spin_unlock(&root->fs_info->hash_lock); goto cow_done; } - spin_unlock(&root->fs_info->hash_lock); /* ok, we have to cow, is our old prealloc the right * size? -- cgit v1.2.2 From b4ce94de9b4d64e8ab3cf155d13653c666e22b9b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 4 Feb 2009 09:25:08 -0500 Subject: Btrfs: Change btree locking to use explicit blocking points Most of the btrfs metadata operations can be protected by a spinlock, but some operations still need to schedule. So far, btrfs has been using a mutex along with a trylock loop; most of the time it is able to avoid going for the full mutex, so the trylock loop is a big performance gain. This commit is step one for getting rid of the blocking locks entirely. btrfs_tree_lock takes a spinlock, and the code explicitly switches to a blocking lock when it starts an operation that can schedule. We'll be able to get rid of the blocking locks in smaller pieces over time. Tracing allows us to find the most common cause of blocking, so we can start with the hot spots first. The basic idea is: btrfs_tree_lock() returns with the spin lock held. btrfs_set_lock_blocking() sets the EXTENT_BUFFER_BLOCKING bit in the extent buffer flags, and then drops the spin lock. The buffer is still considered locked by all of the btrfs code. If btrfs_tree_lock gets the spinlock but finds the blocking bit set, it drops the spin lock and waits on a wait queue for the blocking bit to go away. Much of the code that needs to set the blocking bit finishes without actually blocking a good percentage of the time. So, an adaptive spin is still used against the blocking bit to avoid very high context switch rates. btrfs_clear_lock_blocking() clears the blocking bit and returns with the spinlock held again. btrfs_tree_unlock() can be called on either blocking or spinning locks; it does the right thing based on the blocking bit. ctree.c has a helper function to set/clear all the locked buffers in a path as blocking.
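Before the diff, a simplified userspace model of the spinning/blocking scheme just described (a sketch only: a mutex and condition variable stand in for the spinlock, wait queue and EXTENT_BUFFER_BLOCKING bit, the adaptive spin is omitted, and none of the names are the real btrfs API):

#include <pthread.h>
#include <stdbool.h>

struct eb_lock {
	pthread_mutex_t spin;	/* stands in for the spinlock */
	pthread_cond_t wait;	/* stands in for the wait queue */
	bool blocking;		/* stands in for EXTENT_BUFFER_BLOCKING */
};

/* take the lock; returns with "spin" held and blocking clear */
static void tree_lock(struct eb_lock *l)
{
	pthread_mutex_lock(&l->spin);
	while (l->blocking)		/* someone holds it in blocking mode */
		pthread_cond_wait(&l->wait, &l->spin);
}

/* switch a held spinning lock to blocking mode before scheduling */
static void set_lock_blocking(struct eb_lock *l)
{
	l->blocking = true;		/* still "locked" as far as callers go */
	pthread_mutex_unlock(&l->spin);	/* drop the spinlock so we may sleep */
}

/* switch back: retake the spinlock and clear the blocking bit */
static void clear_lock_blocking(struct eb_lock *l)
{
	pthread_mutex_lock(&l->spin);
	l->blocking = false;
}

/* works on either a spinning or a blocking lock */
static void tree_unlock(struct eb_lock *l)
{
	if (l->blocking) {
		pthread_mutex_lock(&l->spin);
		l->blocking = false;
		pthread_cond_broadcast(&l->wait);
		pthread_mutex_unlock(&l->spin);
	} else {
		pthread_cond_broadcast(&l->wait);
		pthread_mutex_unlock(&l->spin);
	}
}

The real code additionally spins for a while on the blocking bit before sleeping, since most holders clear it quickly.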
Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 234 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 226 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3b6e35aafc9e..3af777357acb 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -54,6 +54,31 @@ struct btrfs_path *btrfs_alloc_path(void) return path; } +/* + * set all locked nodes in the path to blocking locks. This should + * be done before scheduling + */ +noinline void btrfs_set_path_blocking(struct btrfs_path *p) +{ + int i; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (p->nodes[i] && p->locks[i]) + btrfs_set_lock_blocking(p->nodes[i]); + } +} + +/* + * reset all the locked nodes in the patch to spinning locks. + */ +noinline void btrfs_clear_path_blocking(struct btrfs_path *p) +{ + int i; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (p->nodes[i] && p->locks[i]) + btrfs_clear_lock_blocking(p->nodes[i]); + } +} + /* this also releases the path */ void btrfs_free_path(struct btrfs_path *p) { @@ -272,6 +297,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (IS_ERR(cow)) return PTR_ERR(cow); + /* cow is set to blocking by btrfs_init_new_buffer */ + copy_extent_buffer(cow, buf, 0, 0, cow->len); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); @@ -397,6 +424,11 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, } search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + + if (parent) + btrfs_set_lock_blocking(parent); + btrfs_set_lock_blocking(buf); + ret = __btrfs_cow_block(trans, root, buf, parent, parent_slot, cow_ret, search_start, 0, prealloc_dest); @@ -502,6 +534,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (parent_nritems == 1) return 0; + btrfs_set_lock_blocking(parent); + for (i = start_slot; i < end_slot; i++) { int close = 1; @@ -562,6 +596,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, search_start = last_block; btrfs_tree_lock(cur); + btrfs_set_lock_blocking(cur); err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, @@ -860,6 +895,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, return 0; mid = path->nodes[level]; + WARN_ON(!path->locks[level]); WARN_ON(btrfs_header_generation(mid) != trans->transid); @@ -882,6 +918,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); btrfs_tree_lock(child); + btrfs_set_lock_blocking(child); BUG_ON(!child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); BUG_ON(ret); @@ -898,6 +935,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); btrfs_tree_unlock(child); + path->locks[level] = 0; path->nodes[level] = NULL; clean_tree_block(trans, root, mid); @@ -922,6 +960,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, left = read_node_slot(root, parent, pslot - 1); if (left) { btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, 0); if (wret) { @@ -932,6 +971,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right = read_node_slot(root, parent, pslot + 1); if (right) { btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, 0); if (wret) { @@ -1107,6 +1147,8 @@ static 
noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, u32 left_nr; btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + left_nr = btrfs_header_nritems(left); if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { wret = 1; @@ -1153,7 +1195,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, */ if (right) { u32 right_nr; + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + right_nr = btrfs_header_nritems(right); if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { wret = 1; @@ -1264,6 +1309,68 @@ static noinline void reada_for_search(struct btrfs_root *root, } } +/* + * returns -EAGAIN if it had to drop the path, or zero if everything was in + * cache + */ +static noinline int reada_for_balance(struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + int slot; + int nritems; + struct extent_buffer *parent; + struct extent_buffer *eb; + u64 gen; + u64 block1 = 0; + u64 block2 = 0; + int ret = 0; + int blocksize; + + parent = path->nodes[level - 1]; + if (!parent) + return 0; + + nritems = btrfs_header_nritems(parent); + slot = path->slots[level]; + blocksize = btrfs_level_size(root, level); + + if (slot > 0) { + block1 = btrfs_node_blockptr(parent, slot - 1); + gen = btrfs_node_ptr_generation(parent, slot - 1); + eb = btrfs_find_tree_block(root, block1, blocksize); + if (eb && btrfs_buffer_uptodate(eb, gen)) + block1 = 0; + free_extent_buffer(eb); + } + if (slot < nritems) { + block2 = btrfs_node_blockptr(parent, slot + 1); + gen = btrfs_node_ptr_generation(parent, slot + 1); + eb = btrfs_find_tree_block(root, block2, blocksize); + if (eb && btrfs_buffer_uptodate(eb, gen)) + block2 = 0; + free_extent_buffer(eb); + } + if (block1 || block2) { + ret = -EAGAIN; + btrfs_release_path(root, path); + if (block1) + readahead_tree_block(root, block1, blocksize, 0); + if (block2) + readahead_tree_block(root, block2, blocksize, 0); + + if (block1) { + eb = read_tree_block(root, block1, blocksize, 0); + free_extent_buffer(eb); + } + if (block1) { + eb = read_tree_block(root, block2, blocksize, 0); + free_extent_buffer(eb); + } + } + return ret; +} + + /* * when we walk down the tree, it is usually safe to unlock the higher layers * in the tree. The exceptions are when our path goes through slot 0, because @@ -1314,6 +1421,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level, } } +/* + * This releases any locks held in the path starting at level and + * going all the way up to the root. + * + * btrfs_search_slot will keep the lock held on higher nodes in a few + * corner cases, such as COW of the block at slot zero in the node. This + * ignores those rules, and it should only be called when there are no + * more updates to be done higher up in the tree. + */ +noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) +{ + int i; + + if (path->keep_locks || path->lowest_level) + return; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + break; + if (!path->locks[i]) + break; + btrfs_tree_unlock(path->nodes[i]); + path->locks[i] = 0; + } +} + /* * look for key in the tree. 
path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf @@ -1385,6 +1518,7 @@ again: */ if (prealloc_block.objectid && prealloc_block.offset != b->len) { + btrfs_set_path_blocking(p); btrfs_free_reserved_extent(root, prealloc_block.objectid, prealloc_block.offset); @@ -1409,6 +1543,8 @@ again: goto again; } + btrfs_set_path_blocking(p); + wret = btrfs_cow_block(trans, root, b, p->nodes[level + 1], p->slots[level + 1], @@ -1430,6 +1566,22 @@ cow_done: if (!p->skip_locking) p->locks[level] = 1; + btrfs_clear_path_blocking(p); + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + * + * If cow is true, then we might be changing slot zero, + * which may require changing the parent. So, we can't + * drop the lock until after we know which slot we're + * operating on. + */ + if (!cow) + btrfs_unlock_up_safe(p, level + 1); + ret = check_block(root, p, level); if (ret) { ret = -1; @@ -1437,6 +1589,7 @@ cow_done: } ret = bin_search(b, key, level, &slot); + if (level != 0) { if (ret && slot > 0) slot -= 1; @@ -1444,7 +1597,16 @@ cow_done: if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { - int sret = split_node(trans, root, p, level); + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = split_node(trans, root, p, level); + btrfs_clear_path_blocking(p); + BUG_ON(sret > 0); if (sret) { ret = sret; @@ -1453,8 +1615,16 @@ cow_done: b = p->nodes[level]; slot = p->slots[level]; } else if (ins_len < 0) { - int sret = balance_level(trans, root, p, - level); + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = balance_level(trans, root, p, level); + btrfs_clear_path_blocking(p); + if (sret) { ret = sret; goto done; @@ -1488,7 +1658,7 @@ cow_done: * of the btree by dropping locks before * we read. 
*/ - if (level > 1) { + if (level > 0) { btrfs_release_path(NULL, p); if (tmp) free_extent_buffer(tmp); @@ -1503,6 +1673,7 @@ cow_done: free_extent_buffer(tmp); goto again; } else { + btrfs_set_path_blocking(p); if (tmp) free_extent_buffer(tmp); if (should_reada) @@ -1512,14 +1683,29 @@ cow_done: b = read_node_slot(root, b, slot); } } - if (!p->skip_locking) - btrfs_tree_lock(b); + if (!p->skip_locking) { + int lret; + + btrfs_clear_path_blocking(p); + lret = btrfs_try_spin_lock(b); + + if (!lret) { + btrfs_set_path_blocking(p); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(p); + } + } } else { p->slots[level] = slot; if (ins_len > 0 && btrfs_leaf_free_space(root, b) < ins_len) { - int sret = split_leaf(trans, root, key, + int sret; + + btrfs_set_path_blocking(p); + sret = split_leaf(trans, root, key, p, ins_len, ret == 0); + btrfs_clear_path_blocking(p); + BUG_ON(sret > 0); if (sret) { ret = sret; @@ -1533,12 +1719,16 @@ cow_done: } ret = 1; done: + /* + * we don't really know what they plan on doing with the path + * from here on, so for now just mark it as blocking + */ + btrfs_set_path_blocking(p); if (prealloc_block.objectid) { btrfs_free_reserved_extent(root, prealloc_block.objectid, prealloc_block.offset); } - return ret; } @@ -1562,6 +1752,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); BUG_ON(ret); + btrfs_set_lock_blocking(eb); + parent = eb; while (1) { level = btrfs_header_level(parent); @@ -1586,6 +1778,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, eb = read_tree_block(root, bytenr, blocksize, generation); btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); } /* @@ -1610,6 +1803,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, eb = read_tree_block(root, bytenr, blocksize, generation); btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); } ret = btrfs_cow_block(trans, root, eb, parent, slot, @@ -2156,6 +2350,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root right = read_node_slot(root, upper, slot + 1); btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + free_space = btrfs_leaf_free_space(root, right); if (free_space < data_size) goto out_unlock; @@ -2351,6 +2547,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root left = read_node_slot(root, path->nodes[1], slot - 1); btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + free_space = btrfs_leaf_free_space(root, left); if (free_space < data_size) { ret = 1; @@ -2809,6 +3007,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, path->keep_locks = 0; BUG_ON(ret); + /* + * make sure any changes to the path from split_leaf leave it + * in a blocking state + */ + btrfs_set_path_blocking(path); + leaf = path->nodes[0]; BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); @@ -3338,6 +3542,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, BUG(); } out: + btrfs_unlock_up_safe(path, 1); return ret; } @@ -3705,12 +3910,14 @@ find_next_key: */ if (slot >= nritems) { path->slots[level] = slot; + btrfs_set_path_blocking(path); sret = btrfs_find_next_key(root, path, min_key, level, cache_only, min_trans); if (sret == 0) { btrfs_release_path(root, path); goto again; } else { + btrfs_clear_path_blocking(path); goto out; } } @@ -3722,16 +3929,20 @@ find_next_key: unlock_up(path, level, 1); goto out; } + btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); btrfs_tree_lock(cur); + path->locks[level - 1] = 1; 
path->nodes[level - 1] = cur; unlock_up(path, level, 1); + btrfs_clear_path_blocking(path); } out: if (ret == 0) memcpy(min_key, &found_key, sizeof(found_key)); + btrfs_set_path_blocking(path); return ret; } @@ -3827,6 +4038,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) if (ret < 0) return ret; + btrfs_set_path_blocking(path); nritems = btrfs_header_nritems(path->nodes[0]); /* * by releasing the path above we dropped all our locks. A balance @@ -3857,6 +4069,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) free_extent_buffer(next); } + /* the path was set to blocking above */ if (level == 1 && (path->locks[1] || path->skip_locking) && path->reada) reada_for_search(root, path, level, slot, 0); @@ -3865,6 +4078,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) if (!path->skip_locking) { WARN_ON(!btrfs_tree_locked(c)); btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); } break; } @@ -3881,12 +4095,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) path->locks[level] = 1; if (!level) break; + + btrfs_set_path_blocking(path); if (level == 1 && path->locks[1] && path->reada) reada_for_search(root, path, level, slot, 0); next = read_node_slot(root, next, 0); if (!path->skip_locking) { WARN_ON(!btrfs_tree_locked(path->nodes[level])); btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); } } done: @@ -3911,6 +4128,7 @@ int btrfs_previous_item(struct btrfs_root *root, while (1) { if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); ret = btrfs_prev_leaf(root, path); if (ret != 0) return ret; -- cgit v1.2.2 From 4d081c41a4f98aecb5e86ef7d3e644cc7b52131f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 4 Feb 2009 09:31:28 -0500 Subject: Btrfs: change btrfs_del_leaf to drop locks earlier btrfs_del_leaf does two things. First it removes the pointer in the parent, and then it frees the block that has the leaf. It has the parent node locked for both operations. But, it only needs the parent locked while it is deleting the pointer. After that it can safely free the block without the parent locked. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3af777357acb..f6916ceb3920 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3630,15 +3630,22 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, { int ret; u64 root_gen = btrfs_header_generation(path->nodes[1]); + u64 parent_start = path->nodes[1]->start; + u64 parent_owner = btrfs_header_owner(path->nodes[1]); ret = del_ptr(trans, root, path, 1, path->slots[1]); if (ret) return ret; + /* + * btrfs_free_extent is expensive, we want to make sure we + * aren't holding any locks when we call it + */ + btrfs_unlock_up_safe(path, 0); + ret = btrfs_free_extent(trans, root, bytenr, btrfs_level_size(root, 0), - path->nodes[1]->start, - btrfs_header_owner(path->nodes[1]), + parent_start, parent_owner, root_gen, 0, 1); return ret; } -- cgit v1.2.2 From 12f4daccfc3732280debba8f9ba49720372de831 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 4 Feb 2009 09:31:42 -0500 Subject: Btrfs: fix btrfs_unlock_up_safe to walk the entire path btrfs_unlock_up_safe would break out at the first NULL node entry or unlocked node it found in the path. 
Some of the callers have missing nodes at the lower levels of the path, so this commit fixes things to check all the nodes in the path before returning. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f6916ceb3920..0d1e3b91e7bd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1439,9 +1439,9 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i]) - break; + continue; if (!path->locks[i]) - break; + continue; btrfs_tree_unlock(path->nodes[i]); path->locks[i] = 0; } -- cgit v1.2.2 From 7b78c170dc4f538cc7ee66f47b3aac3f3974a36c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 4 Feb 2009 09:12:46 -0500 Subject: Btrfs: Only prep for btree deletion balances when nodes are mostly empty Whenever an item deletion is done, we need to balance all the nodes in the tree to make sure we don't end up with an empty node if a pointer is deleted. This balance prep happens from the root of the tree down so we can drop our locks as we go. reada_for_balance was triggering read-ahead on neighboring nodes even when no balancing was required. This adds an extra check to avoid calling balance_level() and avoid reada_for_balance() when a balance won't be required. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0d1e3b91e7bd..551177c0011a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1518,18 +1518,19 @@ again: */ if (prealloc_block.objectid && prealloc_block.offset != b->len) { - btrfs_set_path_blocking(p); + btrfs_release_path(root, p); btrfs_free_reserved_extent(root, prealloc_block.objectid, prealloc_block.offset); prealloc_block.objectid = 0; + goto again; } /* * for higher level blocks, try not to allocate blocks * with the block and the parent locks held. */ - if (level > 1 && !prealloc_block.objectid && + if (level > 0 && !prealloc_block.objectid && btrfs_path_lock_waiting(p, level)) { u32 size = b->len; u64 hint = b->start; @@ -1614,7 +1615,9 @@ cow_done: } b = p->nodes[level]; slot = p->slots[level]; - } else if (ins_len < 0) { + } else if (ins_len < 0 && + btrfs_header_nritems(b) < + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { int sret; sret = reada_for_balance(root, p, level); -- cgit v1.2.2 From 284b066af41579f62649048fdec5c5e7091703e6 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 9 Feb 2009 16:22:03 -0500 Subject: Btrfs: don't use spin_is_contended Btrfs was using spin_is_contended to see if it should drop locks before doing extent allocations during btrfs_search_slot. The idea was to avoid expensive searches in the tree unless the lock was actually contended. But, spin_is_contended is specific to the ticket spinlocks on x86, so this is causing compile errors everywhere else. In practice, the contention could easily appear some time after we started doing the extent allocation, and it makes more sense to always drop the lock instead. 
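As a rough illustration of the pattern this patch settles on (always drop the lock around the expensive work instead of asking whether the lock is contended), here is a hedged standalone sketch; the struct, fields and function names are invented for the example, and a generation counter stands in for btrfs releasing and re-searching the path:

#include <pthread.h>

struct node {
	pthread_mutex_t lock;
	unsigned long generation;	/* bumped whenever the node is modified */
};

/*
 * Caller holds n->lock on entry.  Returns 0 if the node is unchanged
 * after the allocation, -1 if the caller must retry its search.
 */
static int alloc_with_lock_dropped(struct node *n, void (*expensive_alloc)(void))
{
	unsigned long gen = n->generation;

	pthread_mutex_unlock(&n->lock);	/* never hold the lock across the allocation */
	expensive_alloc();		/* may block for a long time */
	pthread_mutex_lock(&n->lock);

	return n->generation == gen ? 0 : -1;
}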
Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 551177c0011a..35443cc4b9a9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1530,8 +1530,7 @@ again: * for higher level blocks, try not to allocate blocks * with the block and the parent locks held. */ - if (level > 0 && !prealloc_block.objectid && - btrfs_path_lock_waiting(p, level)) { + if (level > 0 && !prealloc_block.objectid) { u32 size = b->len; u64 hint = b->start; -- cgit v1.2.2 From 7951f3cefbd711f4429a0cd014aa83a844c399a0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 12 Feb 2009 10:06:15 -0500 Subject: Btrfs: balance_level checks !child after access The BUG_ON() is in the wrong spot. Signed-off-by: Jeff Mahoney Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 35443cc4b9a9..6674692f7023 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -917,9 +917,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); + BUG_ON(!child); btrfs_tree_lock(child); btrfs_set_lock_blocking(child); - BUG_ON(!child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); BUG_ON(ret); -- cgit v1.2.2 From e00f7308658622fbd483cb0d9fe41165bf9050d0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 12 Feb 2009 14:11:25 -0500 Subject: Btrfs: remove btrfs_init_path btrfs_init_path was initially used when the path objects were on the stack. Now all the work is done by btrfs_alloc_path and btrfs_init_path isn't required. This patch removes it, and just uses kmem_cache_zalloc to zero out the object. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6674692f7023..c8f4c540cc2c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -38,19 +38,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans, static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -inline void btrfs_init_path(struct btrfs_path *p) -{ - memset(p, 0, sizeof(*p)); -} - struct btrfs_path *btrfs_alloc_path(void) { struct btrfs_path *path; - path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); - if (path) { - btrfs_init_path(path); + path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); + if (path) path->reada = 1; - } return path; } -- cgit v1.2.2 From 4008c04a07c73ec3cb1be4c1391d2159a8f75d6d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Feb 2009 14:09:45 -0500 Subject: Btrfs: make a lockdep class for the extent buffer locks Btrfs is currently using spin_lock_nested with a nested value based on the tree depth of the block. But, this doesn't quite work because the max tree depth is bigger than what spin_lock_nested can deal with, and because locks are sometimes taken before the level field is filled in. The solution here is to use lockdep_set_class_and_name instead, and to set the class before unlocking the pages when the block is read from the disk and just after init of a freshly allocated tree block. 
btrfs_clear_path_blocking is also changed to take the locks in the proper order, and it also makes sure all the locks currently held are properly set to blocking before it tries to retake the spinlocks. Otherwise, lockdep gets upset about bad lock ordering. The lockdep magic came from Peter Zijlstra. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 45 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c8f4c540cc2c..42491d728e99 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -62,14 +62,38 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) /* * reset all the locked nodes in the patch to spinning locks. + * + * held is used to keep lockdep happy, when lockdep is enabled + * we set held to a blocking lock before we go around and + * retake all the spinlocks in the path. You can safely use NULL + * for held */ -noinline void btrfs_clear_path_blocking(struct btrfs_path *p) +noinline void btrfs_clear_path_blocking(struct btrfs_path *p, + struct extent_buffer *held) { int i; - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* lockdep really cares that we take all of these spinlocks + * in the right order. If any of the locks in the path are not + * currently blocking, it is going to complain. So, make really + * really sure by forcing the path to blocking before we clear + * the path blocking. + */ + if (held) + btrfs_set_lock_blocking(held); + btrfs_set_path_blocking(p); +#endif + + for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { if (p->nodes[i] && p->locks[i]) btrfs_clear_lock_blocking(p->nodes[i]); } + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (held) + btrfs_clear_lock_blocking(held); +#endif } /* this also releases the path */ @@ -279,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid, level, &ins); BUG_ON(ret); cow = btrfs_init_new_buffer(trans, root, prealloc_dest, - buf->len); + buf->len, level); } else { cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, @@ -1559,7 +1583,7 @@ cow_done: if (!p->skip_locking) p->locks[level] = 1; - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); /* * we have a lock on b and as long as we aren't changing @@ -1598,7 +1622,7 @@ cow_done: btrfs_set_path_blocking(p); sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); BUG_ON(sret > 0); if (sret) { @@ -1618,7 +1642,7 @@ cow_done: btrfs_set_path_blocking(p); sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); if (sret) { ret = sret; @@ -1681,13 +1705,13 @@ cow_done: if (!p->skip_locking) { int lret; - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); lret = btrfs_try_spin_lock(b); if (!lret) { btrfs_set_path_blocking(p); btrfs_tree_lock(b); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, b); } } } else { @@ -1699,7 +1723,7 @@ cow_done: btrfs_set_path_blocking(p); sret = split_leaf(trans, root, key, p, ins_len, ret == 0); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); BUG_ON(sret > 0); if (sret) { @@ -3919,7 +3943,6 @@ find_next_key: btrfs_release_path(root, path); goto again; } else { - btrfs_clear_path_blocking(path); goto out; } } @@ -3939,7 +3962,7 @@ find_next_key: path->locks[level - 1] = 1; path->nodes[level - 1] = cur; unlock_up(path, level, 1); - btrfs_clear_path_blocking(path); +
btrfs_clear_path_blocking(path, NULL); } out: if (ret == 0) -- cgit v1.2.2
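As a closing illustration of the lockdep change in the last patch: one lock_class_key per btree level, applied with lockdep_set_class_and_name once the block's level is known. This is a hedged sketch; the array size, names and helper function are illustrative and not the exact btrfs code.

#include <linux/lockdep.h>
#include <linux/spinlock.h>

#define BTREE_MAX_LEVEL 8	/* illustrative; btrfs uses BTRFS_MAX_LEVEL */

static struct lock_class_key level_keys[BTREE_MAX_LEVEL];
static const char *level_names[BTREE_MAX_LEVEL] = {
	"btree-level-0", "btree-level-1", "btree-level-2", "btree-level-3",
	"btree-level-4", "btree-level-5", "btree-level-6", "btree-level-7",
};

/*
 * Call once the header has been read (or the buffer freshly initialized)
 * and the level is known, before the lock is first taken under that class.
 */
static void set_block_lock_class(spinlock_t *lock, int level)
{
	if (level < 0 || level >= BTREE_MAX_LEVEL)
		return;
	lockdep_set_class_and_name(lock, &level_keys[level],
				   level_names[level]);
}

Because each level gets its own class, taking a child's lock while holding the parent's is no longer seen by lockdep as recursive locking, without relying on spin_lock_nested's limited nesting depth.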