aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2009-02-04 09:25:08 -0500
committerChris Mason <chris.mason@oracle.com>2009-02-04 09:25:08 -0500
commitb4ce94de9b4d64e8ab3cf155d13653c666e22b9b (patch)
treeebc44a9554a50b495b091cb0979d79fd29e50fe7
parentc487685d7c18a8481900755aa5c56a7a74193101 (diff)
Btrfs: Change btree locking to use explicit blocking points
Most of the btrfs metadata operations can be protected by a spinlock, but some operations still need to schedule. So far, btrfs has been using a mutex along with a trylock loop, most of the time it is able to avoid going for the full mutex, so the trylock loop is a big performance gain. This commit is step one for getting rid of the blocking locks entirely. btrfs_tree_lock takes a spinlock, and the code explicitly switches to a blocking lock when it starts an operation that can schedule. We'll be able to get rid of the blocking locks in smaller pieces over time. Tracing allows us to find the most common cause of blocking, so we can start with the hot spots first. The basic idea is: btrfs_tree_lock() returns with the spin lock held btrfs_set_lock_blocking() sets the EXTENT_BUFFER_BLOCKING bit in the extent buffer flags, and then drops the spin lock. The buffer is still considered locked by all of the btrfs code. If btrfs_tree_lock gets the spinlock but finds the blocking bit set, it drops the spin lock and waits on a wait queue for the blocking bit to go away. Much of the code that needs to set the blocking bit finishes without actually blocking a good percentage of the time. So, an adaptive spin is still used against the blocking bit to avoid very high context switch rates. btrfs_clear_lock_blocking() clears the blocking bit and returns with the spinlock held again. btrfs_tree_unlock() can be called on either blocking or spinning locks, it does the right thing based on the blocking bit. ctree.c has a helper function to set/clear all the locked buffers in a path as blocking. Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r--fs/btrfs/ctree.c234
-rw-r--r--fs/btrfs/ctree.h4
-rw-r--r--fs/btrfs/disk-io.c10
-rw-r--r--fs/btrfs/extent-tree.c5
-rw-r--r--fs/btrfs/extent_io.c18
-rw-r--r--fs/btrfs/extent_io.h16
-rw-r--r--fs/btrfs/inode.c3
-rw-r--r--fs/btrfs/locking.c208
-rw-r--r--fs/btrfs/locking.h6
-rw-r--r--fs/btrfs/tree-defrag.c1
-rw-r--r--fs/btrfs/tree-log.c4
11 files changed, 470 insertions, 39 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3b6e35aafc9e..3af777357acb 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,6 +54,31 @@ struct btrfs_path *btrfs_alloc_path(void)
54 return path; 54 return path;
55} 55}
56 56
57/*
58 * set every node that is both present and held locked in the path to a
59 * blocking lock; must be done before any operation that can schedule
60 */
61noinline void btrfs_set_path_blocking(struct btrfs_path *p)
62{
63 int i;
64 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
65 if (p->nodes[i] && p->locks[i])
66 btrfs_set_lock_blocking(p->nodes[i]);
67 }
68}
69
70/*
71 * reset all the locked nodes in the path to spinning locks; the
72 * inverse of btrfs_set_path_blocking.
73 */
74noinline void btrfs_clear_path_blocking(struct btrfs_path *p)
75{
76 int i;
77 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
78 if (p->nodes[i] && p->locks[i])
79 btrfs_clear_lock_blocking(p->nodes[i]);
80 }
81}
81
57/* this also releases the path */ 82/* this also releases the path */
58void btrfs_free_path(struct btrfs_path *p) 83void btrfs_free_path(struct btrfs_path *p)
59{ 84{
@@ -272,6 +297,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
272 if (IS_ERR(cow)) 297 if (IS_ERR(cow))
273 return PTR_ERR(cow); 298 return PTR_ERR(cow);
274 299
300 /* cow is set to blocking by btrfs_init_new_buffer */
301
275 copy_extent_buffer(cow, buf, 0, 0, cow->len); 302 copy_extent_buffer(cow, buf, 0, 0, cow->len);
276 btrfs_set_header_bytenr(cow, cow->start); 303 btrfs_set_header_bytenr(cow, cow->start);
277 btrfs_set_header_generation(cow, trans->transid); 304 btrfs_set_header_generation(cow, trans->transid);
@@ -397,6 +424,11 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
397 } 424 }
398 425
399 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); 426 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
427
428 if (parent)
429 btrfs_set_lock_blocking(parent);
430 btrfs_set_lock_blocking(buf);
431
400 ret = __btrfs_cow_block(trans, root, buf, parent, 432 ret = __btrfs_cow_block(trans, root, buf, parent,
401 parent_slot, cow_ret, search_start, 0, 433 parent_slot, cow_ret, search_start, 0,
402 prealloc_dest); 434 prealloc_dest);
@@ -502,6 +534,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
502 if (parent_nritems == 1) 534 if (parent_nritems == 1)
503 return 0; 535 return 0;
504 536
537 btrfs_set_lock_blocking(parent);
538
505 for (i = start_slot; i < end_slot; i++) { 539 for (i = start_slot; i < end_slot; i++) {
506 int close = 1; 540 int close = 1;
507 541
@@ -562,6 +596,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
562 search_start = last_block; 596 search_start = last_block;
563 597
564 btrfs_tree_lock(cur); 598 btrfs_tree_lock(cur);
599 btrfs_set_lock_blocking(cur);
565 err = __btrfs_cow_block(trans, root, cur, parent, i, 600 err = __btrfs_cow_block(trans, root, cur, parent, i,
566 &cur, search_start, 601 &cur, search_start,
567 min(16 * blocksize, 602 min(16 * blocksize,
@@ -860,6 +895,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
860 return 0; 895 return 0;
861 896
862 mid = path->nodes[level]; 897 mid = path->nodes[level];
898
863 WARN_ON(!path->locks[level]); 899 WARN_ON(!path->locks[level]);
864 WARN_ON(btrfs_header_generation(mid) != trans->transid); 900 WARN_ON(btrfs_header_generation(mid) != trans->transid);
865 901
@@ -882,6 +918,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
882 /* promote the child to a root */ 918 /* promote the child to a root */
883 child = read_node_slot(root, mid, 0); 919 child = read_node_slot(root, mid, 0);
884 btrfs_tree_lock(child); 920 btrfs_tree_lock(child);
921 btrfs_set_lock_blocking(child);
885 BUG_ON(!child); 922 BUG_ON(!child);
886 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); 923 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
887 BUG_ON(ret); 924 BUG_ON(ret);
@@ -898,6 +935,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
898 935
899 add_root_to_dirty_list(root); 936 add_root_to_dirty_list(root);
900 btrfs_tree_unlock(child); 937 btrfs_tree_unlock(child);
938
901 path->locks[level] = 0; 939 path->locks[level] = 0;
902 path->nodes[level] = NULL; 940 path->nodes[level] = NULL;
903 clean_tree_block(trans, root, mid); 941 clean_tree_block(trans, root, mid);
@@ -922,6 +960,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
922 left = read_node_slot(root, parent, pslot - 1); 960 left = read_node_slot(root, parent, pslot - 1);
923 if (left) { 961 if (left) {
924 btrfs_tree_lock(left); 962 btrfs_tree_lock(left);
963 btrfs_set_lock_blocking(left);
925 wret = btrfs_cow_block(trans, root, left, 964 wret = btrfs_cow_block(trans, root, left,
926 parent, pslot - 1, &left, 0); 965 parent, pslot - 1, &left, 0);
927 if (wret) { 966 if (wret) {
@@ -932,6 +971,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
932 right = read_node_slot(root, parent, pslot + 1); 971 right = read_node_slot(root, parent, pslot + 1);
933 if (right) { 972 if (right) {
934 btrfs_tree_lock(right); 973 btrfs_tree_lock(right);
974 btrfs_set_lock_blocking(right);
935 wret = btrfs_cow_block(trans, root, right, 975 wret = btrfs_cow_block(trans, root, right,
936 parent, pslot + 1, &right, 0); 976 parent, pslot + 1, &right, 0);
937 if (wret) { 977 if (wret) {
@@ -1107,6 +1147,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1107 u32 left_nr; 1147 u32 left_nr;
1108 1148
1109 btrfs_tree_lock(left); 1149 btrfs_tree_lock(left);
1150 btrfs_set_lock_blocking(left);
1151
1110 left_nr = btrfs_header_nritems(left); 1152 left_nr = btrfs_header_nritems(left);
1111 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1153 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1112 wret = 1; 1154 wret = 1;
@@ -1153,7 +1195,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1153 */ 1195 */
1154 if (right) { 1196 if (right) {
1155 u32 right_nr; 1197 u32 right_nr;
1198
1156 btrfs_tree_lock(right); 1199 btrfs_tree_lock(right);
1200 btrfs_set_lock_blocking(right);
1201
1157 right_nr = btrfs_header_nritems(right); 1202 right_nr = btrfs_header_nritems(right);
1158 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1203 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1159 wret = 1; 1204 wret = 1;
@@ -1265,6 +1310,68 @@ static noinline void reada_for_search(struct btrfs_root *root,
1265} 1310}
1266 1311
1267/* 1312/*
1313 * read in the siblings of path->nodes[level] before a balance or split.
1314 * returns -EAGAIN if it had to drop the path, or zero if everything was in cache
1315 */
1316static noinline int reada_for_balance(struct btrfs_root *root,
1317 struct btrfs_path *path, int level)
1318{
1319 int slot;
1320 int nritems;
1321 struct extent_buffer *parent;
1322 struct extent_buffer *eb;
1323 u64 gen;
1324 u64 block1 = 0;
1325 u64 block2 = 0;
1326 int ret = 0;
1327 int blocksize;
1328
1329 parent = path->nodes[level + 1];
1330 if (!parent)
1331 return 0;
1332
1333 nritems = btrfs_header_nritems(parent);
1334 slot = path->slots[level + 1];
1335 blocksize = btrfs_level_size(root, level);
1336
1337 if (slot > 0) {
1338 block1 = btrfs_node_blockptr(parent, slot - 1);
1339 gen = btrfs_node_ptr_generation(parent, slot - 1);
1340 eb = btrfs_find_tree_block(root, block1, blocksize);
1341 if (eb && btrfs_buffer_uptodate(eb, gen))
1342 block1 = 0;
1343 free_extent_buffer(eb);
1344 }
1345 if (slot + 1 < nritems) {
1346 block2 = btrfs_node_blockptr(parent, slot + 1);
1347 gen = btrfs_node_ptr_generation(parent, slot + 1);
1348 eb = btrfs_find_tree_block(root, block2, blocksize);
1349 if (eb && btrfs_buffer_uptodate(eb, gen))
1350 block2 = 0;
1351 free_extent_buffer(eb);
1352 }
1353 if (block1 || block2) {
1354 ret = -EAGAIN;
1355 btrfs_release_path(root, path);
1356 if (block1)
1357 readahead_tree_block(root, block1, blocksize, 0);
1358 if (block2)
1359 readahead_tree_block(root, block2, blocksize, 0);
1360
1361 if (block1) {
1362 eb = read_tree_block(root, block1, blocksize, 0);
1363 free_extent_buffer(eb);
1364 }
1365 if (block2) {
1366 eb = read_tree_block(root, block2, blocksize, 0);
1367 free_extent_buffer(eb);
1368 }
1369 }
1370 return ret;
1371}
1372
1373
1374/*
1268 * when we walk down the tree, it is usually safe to unlock the higher layers 1375 * when we walk down the tree, it is usually safe to unlock the higher layers
1269 * in the tree. The exceptions are when our path goes through slot 0, because 1376 * in the tree. The exceptions are when our path goes through slot 0, because
1270 * operations on the tree might require changing key pointers higher up in the 1377 * operations on the tree might require changing key pointers higher up in the
@@ -1315,6 +1422,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
1315} 1422}
1316 1423
1317/* 1424/*
1425 * This releases any locks held in the path starting at level and
1426 * going all the way up to the root.
1427 * (it is a no-op when keep_locks or lowest_level is set on the path)
1428 * btrfs_search_slot will keep the lock held on higher nodes in a few
1429 * corner cases, such as COW of the block at slot zero in the node. This
1430 * ignores those rules, and it should only be called when there are no
1431 * more updates to be done higher up in the tree.
1432 */
1433noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1434{
1435 int i;
1436
1437 if (path->keep_locks || path->lowest_level)
1438 return;
1439
1440 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1441 if (!path->nodes[i])
1442 break;
1443 if (!path->locks[i])
1444 break;
1445 btrfs_tree_unlock(path->nodes[i]);
1446 path->locks[i] = 0;
1447 }
1448}
1449
1450/*
1318 * look for key in the tree. path is filled in with nodes along the way 1451 * look for key in the tree. path is filled in with nodes along the way
1319 * if key is found, we return zero and you can find the item in the leaf 1452 * if key is found, we return zero and you can find the item in the leaf
1320 * level of the path (level 0) 1453 * level of the path (level 0)
@@ -1385,6 +1518,7 @@ again:
1385 */ 1518 */
1386 if (prealloc_block.objectid && 1519 if (prealloc_block.objectid &&
1387 prealloc_block.offset != b->len) { 1520 prealloc_block.offset != b->len) {
1521 btrfs_set_path_blocking(p);
1388 btrfs_free_reserved_extent(root, 1522 btrfs_free_reserved_extent(root,
1389 prealloc_block.objectid, 1523 prealloc_block.objectid,
1390 prealloc_block.offset); 1524 prealloc_block.offset);
@@ -1409,6 +1543,8 @@ again:
1409 goto again; 1543 goto again;
1410 } 1544 }
1411 1545
1546 btrfs_set_path_blocking(p);
1547
1412 wret = btrfs_cow_block(trans, root, b, 1548 wret = btrfs_cow_block(trans, root, b,
1413 p->nodes[level + 1], 1549 p->nodes[level + 1],
1414 p->slots[level + 1], 1550 p->slots[level + 1],
@@ -1430,6 +1566,22 @@ cow_done:
1430 if (!p->skip_locking) 1566 if (!p->skip_locking)
1431 p->locks[level] = 1; 1567 p->locks[level] = 1;
1432 1568
1569 btrfs_clear_path_blocking(p);
1570
1571 /*
1572 * we have a lock on b and as long as we aren't changing
1573 * the tree, there is no way for the items in b to change.
1574 * It is safe to drop the lock on our parent before we
1575 * go through the expensive btree search on b.
1576 *
1577 * If cow is true, then we might be changing slot zero,
1578 * which may require changing the parent. So, we can't
1579 * drop the lock until after we know which slot we're
1580 * operating on.
1581 */
1582 if (!cow)
1583 btrfs_unlock_up_safe(p, level + 1);
1584
1433 ret = check_block(root, p, level); 1585 ret = check_block(root, p, level);
1434 if (ret) { 1586 if (ret) {
1435 ret = -1; 1587 ret = -1;
@@ -1437,6 +1589,7 @@ cow_done:
1437 } 1589 }
1438 1590
1439 ret = bin_search(b, key, level, &slot); 1591 ret = bin_search(b, key, level, &slot);
1592
1440 if (level != 0) { 1593 if (level != 0) {
1441 if (ret && slot > 0) 1594 if (ret && slot > 0)
1442 slot -= 1; 1595 slot -= 1;
@@ -1444,7 +1597,16 @@ cow_done:
1444 if ((p->search_for_split || ins_len > 0) && 1597 if ((p->search_for_split || ins_len > 0) &&
1445 btrfs_header_nritems(b) >= 1598 btrfs_header_nritems(b) >=
1446 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1599 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1447 int sret = split_node(trans, root, p, level); 1600 int sret;
1601
1602 sret = reada_for_balance(root, p, level);
1603 if (sret)
1604 goto again;
1605
1606 btrfs_set_path_blocking(p);
1607 sret = split_node(trans, root, p, level);
1608 btrfs_clear_path_blocking(p);
1609
1448 BUG_ON(sret > 0); 1610 BUG_ON(sret > 0);
1449 if (sret) { 1611 if (sret) {
1450 ret = sret; 1612 ret = sret;
@@ -1453,8 +1615,16 @@ cow_done:
1453 b = p->nodes[level]; 1615 b = p->nodes[level];
1454 slot = p->slots[level]; 1616 slot = p->slots[level];
1455 } else if (ins_len < 0) { 1617 } else if (ins_len < 0) {
1456 int sret = balance_level(trans, root, p, 1618 int sret;
1457 level); 1619
1620 sret = reada_for_balance(root, p, level);
1621 if (sret)
1622 goto again;
1623
1624 btrfs_set_path_blocking(p);
1625 sret = balance_level(trans, root, p, level);
1626 btrfs_clear_path_blocking(p);
1627
1458 if (sret) { 1628 if (sret) {
1459 ret = sret; 1629 ret = sret;
1460 goto done; 1630 goto done;
@@ -1488,7 +1658,7 @@ cow_done:
1488 * of the btree by dropping locks before 1658 * of the btree by dropping locks before
1489 * we read. 1659 * we read.
1490 */ 1660 */
1491 if (level > 1) { 1661 if (level > 0) {
1492 btrfs_release_path(NULL, p); 1662 btrfs_release_path(NULL, p);
1493 if (tmp) 1663 if (tmp)
1494 free_extent_buffer(tmp); 1664 free_extent_buffer(tmp);
@@ -1503,6 +1673,7 @@ cow_done:
1503 free_extent_buffer(tmp); 1673 free_extent_buffer(tmp);
1504 goto again; 1674 goto again;
1505 } else { 1675 } else {
1676 btrfs_set_path_blocking(p);
1506 if (tmp) 1677 if (tmp)
1507 free_extent_buffer(tmp); 1678 free_extent_buffer(tmp);
1508 if (should_reada) 1679 if (should_reada)
@@ -1512,14 +1683,29 @@ cow_done:
1512 b = read_node_slot(root, b, slot); 1683 b = read_node_slot(root, b, slot);
1513 } 1684 }
1514 } 1685 }
1515 if (!p->skip_locking) 1686 if (!p->skip_locking) {
1516 btrfs_tree_lock(b); 1687 int lret;
1688
1689 btrfs_clear_path_blocking(p);
1690 lret = btrfs_try_spin_lock(b);
1691
1692 if (!lret) {
1693 btrfs_set_path_blocking(p);
1694 btrfs_tree_lock(b);
1695 btrfs_clear_path_blocking(p);
1696 }
1697 }
1517 } else { 1698 } else {
1518 p->slots[level] = slot; 1699 p->slots[level] = slot;
1519 if (ins_len > 0 && 1700 if (ins_len > 0 &&
1520 btrfs_leaf_free_space(root, b) < ins_len) { 1701 btrfs_leaf_free_space(root, b) < ins_len) {
1521 int sret = split_leaf(trans, root, key, 1702 int sret;
1703
1704 btrfs_set_path_blocking(p);
1705 sret = split_leaf(trans, root, key,
1522 p, ins_len, ret == 0); 1706 p, ins_len, ret == 0);
1707 btrfs_clear_path_blocking(p);
1708
1523 BUG_ON(sret > 0); 1709 BUG_ON(sret > 0);
1524 if (sret) { 1710 if (sret) {
1525 ret = sret; 1711 ret = sret;
@@ -1533,12 +1719,16 @@ cow_done:
1533 } 1719 }
1534 ret = 1; 1720 ret = 1;
1535done: 1721done:
1722 /*
1723 * we don't really know what they plan on doing with the path
1724 * from here on, so for now just mark it as blocking
1725 */
1726 btrfs_set_path_blocking(p);
1536 if (prealloc_block.objectid) { 1727 if (prealloc_block.objectid) {
1537 btrfs_free_reserved_extent(root, 1728 btrfs_free_reserved_extent(root,
1538 prealloc_block.objectid, 1729 prealloc_block.objectid,
1539 prealloc_block.offset); 1730 prealloc_block.offset);
1540 } 1731 }
1541
1542 return ret; 1732 return ret;
1543} 1733}
1544 1734
@@ -1562,6 +1752,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1562 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); 1752 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1563 BUG_ON(ret); 1753 BUG_ON(ret);
1564 1754
1755 btrfs_set_lock_blocking(eb);
1756
1565 parent = eb; 1757 parent = eb;
1566 while (1) { 1758 while (1) {
1567 level = btrfs_header_level(parent); 1759 level = btrfs_header_level(parent);
@@ -1586,6 +1778,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1586 eb = read_tree_block(root, bytenr, blocksize, 1778 eb = read_tree_block(root, bytenr, blocksize,
1587 generation); 1779 generation);
1588 btrfs_tree_lock(eb); 1780 btrfs_tree_lock(eb);
1781 btrfs_set_lock_blocking(eb);
1589 } 1782 }
1590 1783
1591 /* 1784 /*
@@ -1610,6 +1803,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1610 eb = read_tree_block(root, bytenr, blocksize, 1803 eb = read_tree_block(root, bytenr, blocksize,
1611 generation); 1804 generation);
1612 btrfs_tree_lock(eb); 1805 btrfs_tree_lock(eb);
1806 btrfs_set_lock_blocking(eb);
1613 } 1807 }
1614 1808
1615 ret = btrfs_cow_block(trans, root, eb, parent, slot, 1809 ret = btrfs_cow_block(trans, root, eb, parent, slot,
@@ -2156,6 +2350,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2156 2350
2157 right = read_node_slot(root, upper, slot + 1); 2351 right = read_node_slot(root, upper, slot + 1);
2158 btrfs_tree_lock(right); 2352 btrfs_tree_lock(right);
2353 btrfs_set_lock_blocking(right);
2354
2159 free_space = btrfs_leaf_free_space(root, right); 2355 free_space = btrfs_leaf_free_space(root, right);
2160 if (free_space < data_size) 2356 if (free_space < data_size)
2161 goto out_unlock; 2357 goto out_unlock;
@@ -2351,6 +2547,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2351 2547
2352 left = read_node_slot(root, path->nodes[1], slot - 1); 2548 left = read_node_slot(root, path->nodes[1], slot - 1);
2353 btrfs_tree_lock(left); 2549 btrfs_tree_lock(left);
2550 btrfs_set_lock_blocking(left);
2551
2354 free_space = btrfs_leaf_free_space(root, left); 2552 free_space = btrfs_leaf_free_space(root, left);
2355 if (free_space < data_size) { 2553 if (free_space < data_size) {
2356 ret = 1; 2554 ret = 1;
@@ -2809,6 +3007,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
2809 path->keep_locks = 0; 3007 path->keep_locks = 0;
2810 BUG_ON(ret); 3008 BUG_ON(ret);
2811 3009
3010 /*
3011 * make sure any changes to the path from split_leaf leave it
3012 * in a blocking state
3013 */
3014 btrfs_set_path_blocking(path);
3015
2812 leaf = path->nodes[0]; 3016 leaf = path->nodes[0];
2813 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); 3017 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2814 3018
@@ -3338,6 +3542,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3338 BUG(); 3542 BUG();
3339 } 3543 }
3340out: 3544out:
3545 btrfs_unlock_up_safe(path, 1);
3341 return ret; 3546 return ret;
3342} 3547}
3343 3548
@@ -3705,12 +3910,14 @@ find_next_key:
3705 */ 3910 */
3706 if (slot >= nritems) { 3911 if (slot >= nritems) {
3707 path->slots[level] = slot; 3912 path->slots[level] = slot;
3913 btrfs_set_path_blocking(path);
3708 sret = btrfs_find_next_key(root, path, min_key, level, 3914 sret = btrfs_find_next_key(root, path, min_key, level,
3709 cache_only, min_trans); 3915 cache_only, min_trans);
3710 if (sret == 0) { 3916 if (sret == 0) {
3711 btrfs_release_path(root, path); 3917 btrfs_release_path(root, path);
3712 goto again; 3918 goto again;
3713 } else { 3919 } else {
3920 btrfs_clear_path_blocking(path);
3714 goto out; 3921 goto out;
3715 } 3922 }
3716 } 3923 }
@@ -3722,16 +3929,20 @@ find_next_key:
3722 unlock_up(path, level, 1); 3929 unlock_up(path, level, 1);
3723 goto out; 3930 goto out;
3724 } 3931 }
3932 btrfs_set_path_blocking(path);
3725 cur = read_node_slot(root, cur, slot); 3933 cur = read_node_slot(root, cur, slot);
3726 3934
3727 btrfs_tree_lock(cur); 3935 btrfs_tree_lock(cur);
3936
3728 path->locks[level - 1] = 1; 3937 path->locks[level - 1] = 1;
3729 path->nodes[level - 1] = cur; 3938 path->nodes[level - 1] = cur;
3730 unlock_up(path, level, 1); 3939 unlock_up(path, level, 1);
3940 btrfs_clear_path_blocking(path);
3731 } 3941 }
3732out: 3942out:
3733 if (ret == 0) 3943 if (ret == 0)
3734 memcpy(min_key, &found_key, sizeof(found_key)); 3944 memcpy(min_key, &found_key, sizeof(found_key));
3945 btrfs_set_path_blocking(path);
3735 return ret; 3946 return ret;
3736} 3947}
3737 3948
@@ -3827,6 +4038,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3827 if (ret < 0) 4038 if (ret < 0)
3828 return ret; 4039 return ret;
3829 4040
4041 btrfs_set_path_blocking(path);
3830 nritems = btrfs_header_nritems(path->nodes[0]); 4042 nritems = btrfs_header_nritems(path->nodes[0]);
3831 /* 4043 /*
3832 * by releasing the path above we dropped all our locks. A balance 4044 * by releasing the path above we dropped all our locks. A balance
@@ -3857,6 +4069,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3857 free_extent_buffer(next); 4069 free_extent_buffer(next);
3858 } 4070 }
3859 4071
4072 /* the path was set to blocking above */
3860 if (level == 1 && (path->locks[1] || path->skip_locking) && 4073 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3861 path->reada) 4074 path->reada)
3862 reada_for_search(root, path, level, slot, 0); 4075 reada_for_search(root, path, level, slot, 0);
@@ -3865,6 +4078,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3865 if (!path->skip_locking) { 4078 if (!path->skip_locking) {
3866 WARN_ON(!btrfs_tree_locked(c)); 4079 WARN_ON(!btrfs_tree_locked(c));
3867 btrfs_tree_lock(next); 4080 btrfs_tree_lock(next);
4081 btrfs_set_lock_blocking(next);
3868 } 4082 }
3869 break; 4083 break;
3870 } 4084 }
@@ -3881,12 +4095,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3881 path->locks[level] = 1; 4095 path->locks[level] = 1;
3882 if (!level) 4096 if (!level)
3883 break; 4097 break;
4098
4099 btrfs_set_path_blocking(path);
3884 if (level == 1 && path->locks[1] && path->reada) 4100 if (level == 1 && path->locks[1] && path->reada)
3885 reada_for_search(root, path, level, slot, 0); 4101 reada_for_search(root, path, level, slot, 0);
3886 next = read_node_slot(root, next, 0); 4102 next = read_node_slot(root, next, 0);
3887 if (!path->skip_locking) { 4103 if (!path->skip_locking) {
3888 WARN_ON(!btrfs_tree_locked(path->nodes[level])); 4104 WARN_ON(!btrfs_tree_locked(path->nodes[level]));
3889 btrfs_tree_lock(next); 4105 btrfs_tree_lock(next);
4106 btrfs_set_lock_blocking(next);
3890 } 4107 }
3891 } 4108 }
3892done: 4109done:
@@ -3911,6 +4128,7 @@ int btrfs_previous_item(struct btrfs_root *root,
3911 4128
3912 while (1) { 4129 while (1) {
3913 if (path->slots[0] == 0) { 4130 if (path->slots[0] == 0) {
4131 btrfs_set_path_blocking(path);
3914 ret = btrfs_prev_leaf(root, path); 4132 ret = btrfs_prev_leaf(root, path);
3915 if (ret != 0) 4133 if (ret != 0)
3916 return ret; 4134 return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f2b8d26b0472..531db112c8bd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1835,6 +1835,10 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1835struct btrfs_path *btrfs_alloc_path(void); 1835struct btrfs_path *btrfs_alloc_path(void);
1836void btrfs_free_path(struct btrfs_path *p); 1836void btrfs_free_path(struct btrfs_path *p);
1837void btrfs_init_path(struct btrfs_path *p); 1837void btrfs_init_path(struct btrfs_path *p);
1838void btrfs_set_path_blocking(struct btrfs_path *p);
1839void btrfs_clear_path_blocking(struct btrfs_path *p);
1840void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
1841
1838int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1842int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1839 struct btrfs_path *path, int slot, int nr); 1843 struct btrfs_path *path, int slot, int nr);
1840int btrfs_del_leaf(struct btrfs_trans_handle *trans, 1844int btrfs_del_leaf(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 549271607c17..5aebddd71193 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -799,7 +799,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
799 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 799 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
800 800
801 if (ret == 0) 801 if (ret == 0)
802 buf->flags |= EXTENT_UPTODATE; 802 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
803 else 803 else
804 WARN_ON(1); 804 WARN_ON(1);
805 return buf; 805 return buf;
@@ -813,6 +813,10 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
813 if (btrfs_header_generation(buf) == 813 if (btrfs_header_generation(buf) ==
814 root->fs_info->running_transaction->transid) { 814 root->fs_info->running_transaction->transid) {
815 WARN_ON(!btrfs_tree_locked(buf)); 815 WARN_ON(!btrfs_tree_locked(buf));
816
817 /* ugh, clear_extent_buffer_dirty can be expensive */
818 btrfs_set_lock_blocking(buf);
819
816 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 820 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
817 buf); 821 buf);
818 } 822 }
@@ -2311,6 +2315,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2311 u64 transid = btrfs_header_generation(buf); 2315 u64 transid = btrfs_header_generation(buf);
2312 struct inode *btree_inode = root->fs_info->btree_inode; 2316 struct inode *btree_inode = root->fs_info->btree_inode;
2313 2317
2318 btrfs_set_lock_blocking(buf);
2319
2314 WARN_ON(!btrfs_tree_locked(buf)); 2320 WARN_ON(!btrfs_tree_locked(buf));
2315 if (transid != root->fs_info->generation) { 2321 if (transid != root->fs_info->generation) {
2316 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 2322 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
@@ -2353,7 +2359,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2353 int ret; 2359 int ret;
2354 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 2360 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2355 if (ret == 0) 2361 if (ret == 0)
2356 buf->flags |= EXTENT_UPTODATE; 2362 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
2357 return ret; 2363 return ret;
2358} 2364}
2359 2365
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7a22f2e6ec47..ed1e25d72483 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3407,7 +3407,10 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3407 btrfs_set_header_generation(buf, trans->transid); 3407 btrfs_set_header_generation(buf, trans->transid);
3408 btrfs_tree_lock(buf); 3408 btrfs_tree_lock(buf);
3409 clean_tree_block(trans, root, buf); 3409 clean_tree_block(trans, root, buf);
3410
3411 btrfs_set_lock_blocking(buf);
3410 btrfs_set_buffer_uptodate(buf); 3412 btrfs_set_buffer_uptodate(buf);
3413
3411 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3414 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3412 set_extent_dirty(&root->dirty_log_pages, buf->start, 3415 set_extent_dirty(&root->dirty_log_pages, buf->start,
3413 buf->start + buf->len - 1, GFP_NOFS); 3416 buf->start + buf->len - 1, GFP_NOFS);
@@ -3416,6 +3419,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3416 buf->start + buf->len - 1, GFP_NOFS); 3419 buf->start + buf->len - 1, GFP_NOFS);
3417 } 3420 }
3418 trans->blocks_used++; 3421 trans->blocks_used++;
3422 /* this returns a buffer locked for blocking */
3419 return buf; 3423 return buf;
3420} 3424}
3421 3425
@@ -3752,6 +3756,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3752 3756
3753 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 3757 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3754 btrfs_tree_lock(next); 3758 btrfs_tree_lock(next);
3759 btrfs_set_lock_blocking(next);
3755 3760
3756 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, 3761 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3757 &refs); 3762 &refs);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2ea7f052722c..dd5df53e045a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2990,7 +2990,9 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2990 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 2990 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2991 eb->start = start; 2991 eb->start = start;
2992 eb->len = len; 2992 eb->len = len;
2993 mutex_init(&eb->mutex); 2993 spin_lock_init(&eb->lock);
2994 init_waitqueue_head(&eb->lock_wq);
2995
2994#if LEAK_DEBUG 2996#if LEAK_DEBUG
2995 spin_lock_irqsave(&leak_lock, flags); 2997 spin_lock_irqsave(&leak_lock, flags);
2996 list_add(&eb->leak_list, &buffers); 2998 list_add(&eb->leak_list, &buffers);
@@ -3071,8 +3073,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3071 unlock_page(p); 3073 unlock_page(p);
3072 } 3074 }
3073 if (uptodate) 3075 if (uptodate)
3074 eb->flags |= EXTENT_UPTODATE; 3076 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3075 eb->flags |= EXTENT_BUFFER_FILLED;
3076 3077
3077 spin_lock(&tree->buffer_lock); 3078 spin_lock(&tree->buffer_lock);
3078 exists = buffer_tree_insert(tree, start, &eb->rb_node); 3079 exists = buffer_tree_insert(tree, start, &eb->rb_node);
@@ -3226,7 +3227,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3226 unsigned long num_pages; 3227 unsigned long num_pages;
3227 3228
3228 num_pages = num_extent_pages(eb->start, eb->len); 3229 num_pages = num_extent_pages(eb->start, eb->len);
3229 eb->flags &= ~EXTENT_UPTODATE; 3230 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3230 3231
3231 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3232 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3232 GFP_NOFS); 3233 GFP_NOFS);
@@ -3297,7 +3298,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3297 struct page *page; 3298 struct page *page;
3298 int pg_uptodate = 1; 3299 int pg_uptodate = 1;
3299 3300
3300 if (eb->flags & EXTENT_UPTODATE) 3301 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3301 return 1; 3302 return 1;
3302 3303
3303 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3304 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3333,7 +3334,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3333 struct bio *bio = NULL; 3334 struct bio *bio = NULL;
3334 unsigned long bio_flags = 0; 3335 unsigned long bio_flags = 0;
3335 3336
3336 if (eb->flags & EXTENT_UPTODATE) 3337 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3337 return 0; 3338 return 0;
3338 3339
3339 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3340 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3364,7 +3365,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3364 } 3365 }
3365 if (all_uptodate) { 3366 if (all_uptodate) {
3366 if (start_i == 0) 3367 if (start_i == 0)
3367 eb->flags |= EXTENT_UPTODATE; 3368 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3368 goto unlock_exit; 3369 goto unlock_exit;
3369 } 3370 }
3370 3371
@@ -3400,7 +3401,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3400 } 3401 }
3401 3402
3402 if (!ret) 3403 if (!ret)
3403 eb->flags |= EXTENT_UPTODATE; 3404 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3404 return ret; 3405 return ret;
3405 3406
3406unlock_exit: 3407unlock_exit:
@@ -3497,7 +3498,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3497 unmap_extent_buffer(eb, eb->map_token, km); 3498 unmap_extent_buffer(eb, eb->map_token, km);
3498 eb->map_token = NULL; 3499 eb->map_token = NULL;
3499 save = 1; 3500 save = 1;
3500 WARN_ON(!mutex_is_locked(&eb->mutex));
3501 } 3501 }
3502 err = map_private_extent_buffer(eb, start, min_len, token, map, 3502 err = map_private_extent_buffer(eb, start, min_len, token, map,
3503 map_start, map_len, km); 3503 map_start, map_len, km);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index e80c6d96b318..1f9df88afbf6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -22,6 +22,10 @@
22/* flags for bio submission */ 22/* flags for bio submission */
23#define EXTENT_BIO_COMPRESSED 1 23#define EXTENT_BIO_COMPRESSED 1
24 24
25/* these are bit numbers for test/set bit */
26#define EXTENT_BUFFER_UPTODATE 0
27#define EXTENT_BUFFER_BLOCKING 1
28
25/* 29/*
26 * page->private values. Every page that is controlled by the extent 30 * page->private values. Every page that is controlled by the extent
27 * map has page->private set to one. 31 * map has page->private set to one.
@@ -95,11 +99,19 @@ struct extent_buffer {
95 unsigned long map_start; 99 unsigned long map_start;
96 unsigned long map_len; 100 unsigned long map_len;
97 struct page *first_page; 101 struct page *first_page;
102 unsigned long bflags;
98 atomic_t refs; 103 atomic_t refs;
99 int flags;
100 struct list_head leak_list; 104 struct list_head leak_list;
101 struct rb_node rb_node; 105 struct rb_node rb_node;
102 struct mutex mutex; 106
107 /* the spinlock is used to protect most operations */
108 spinlock_t lock;
109
110 /*
111 * when we keep the lock held while blocking, waiters go onto
112 * the wq
113 */
114 wait_queue_head_t lock_wq;
103}; 115};
104 116
105struct extent_map_tree; 117struct extent_map_tree;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4a79e1c5ebd0..ebd7d6c37df8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -50,6 +50,7 @@
50#include "tree-log.h" 50#include "tree-log.h"
51#include "ref-cache.h" 51#include "ref-cache.h"
52#include "compression.h" 52#include "compression.h"
53#include "locking.h"
53 54
54struct btrfs_iget_args { 55struct btrfs_iget_args {
55 u64 ino; 56 u64 ino;
@@ -2021,6 +2022,7 @@ void btrfs_read_locked_inode(struct inode *inode)
2021 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2022 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2022 2023
2023 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 2024 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2025
2024 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2026 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2025 alloc_group_block, 0); 2027 alloc_group_block, 0);
2026 btrfs_free_path(path); 2028 btrfs_free_path(path);
@@ -2117,6 +2119,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2117 goto failed; 2119 goto failed;
2118 } 2120 }
2119 2121
2122 btrfs_unlock_up_safe(path, 1);
2120 leaf = path->nodes[0]; 2123 leaf = path->nodes[0];
2121 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2124 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2122 struct btrfs_inode_item); 2125 struct btrfs_inode_item);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae7761db6..68fd9ccf1805 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -26,45 +26,215 @@
26#include "locking.h" 26#include "locking.h"
27 27
28/* 28/*
29 * locks the per buffer mutex in an extent buffer. This uses adaptive locks 29 * btrfs_header_level() isn't free, so don't call it when lockdep isn't
30 * and the spin is not tuned very extensively. The spinning does make a big 30 * on
31 * difference in almost every workload, but spinning for the right amount of
32 * time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree
35 * searches, and we should give up if they are in more expensive code.
36 */ 31 */
32#ifdef CONFIG_DEBUG_LOCK_ALLOC
33static inline void spin_nested(struct extent_buffer *eb)
34{
35 spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
36}
37#else
38static inline void spin_nested(struct extent_buffer *eb)
39{
40 spin_lock(&eb->lock);
41}
42#endif
37 43
38int btrfs_tree_lock(struct extent_buffer *eb) 44/*
45 * Setting a lock to blocking will drop the spinlock and set the
46 * flag that forces other procs who want the lock to wait. After
47 * this you can safely schedule with the lock held.
48 */
49void btrfs_set_lock_blocking(struct extent_buffer *eb)
39{ 50{
40 int i; 51 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
52 set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
53 spin_unlock(&eb->lock);
54 }
55 /* exit with the spin lock released and the bit set */
56}
41 57
42 if (mutex_trylock(&eb->mutex)) 58/*
43 return 0; 59 * clearing the blocking flag will take the spinlock again.
60 * After this you can't safely schedule
61 */
62void btrfs_clear_lock_blocking(struct extent_buffer *eb)
63{
64 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
65 spin_nested(eb);
66 clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
67 smp_mb__after_clear_bit();
68 }
69 /* exit with the spin lock held */
70}
71
72/*
73 * unfortunately, many of the places that currently set a lock to blocking
 74 * don't end up blocking for very long, and often they don't block
 75 * at all. For a dbench 50 run, if we don't spin on the blocking bit
76 * at all, the context switch rate can jump up to 400,000/sec or more.
77 *
78 * So, we're still stuck with this crummy spin on the blocking bit,
79 * at least until the most common causes of the short blocks
80 * can be dealt with.
81 */
82static int btrfs_spin_on_block(struct extent_buffer *eb)
83{
84 int i;
44 for (i = 0; i < 512; i++) { 85 for (i = 0; i < 512; i++) {
45 cpu_relax(); 86 cpu_relax();
46 if (mutex_trylock(&eb->mutex)) 87 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
88 return 1;
89 if (need_resched())
90 break;
91 }
92 return 0;
93}
94
95/*
96 * This is somewhat different from trylock. It will take the
97 * spinlock but if it finds the lock is set to blocking, it will
98 * return without the lock held.
99 *
100 * returns 1 if it was able to take the lock and zero otherwise
101 *
102 * After this call, scheduling is not safe without first calling
103 * btrfs_set_lock_blocking()
104 */
105int btrfs_try_spin_lock(struct extent_buffer *eb)
106{
107 int i;
108
109 spin_nested(eb);
110 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
111 return 1;
112 spin_unlock(&eb->lock);
113
114 /* spin for a bit on the BLOCKING flag */
115 for (i = 0; i < 2; i++) {
116 if (!btrfs_spin_on_block(eb))
117 break;
118
119 spin_nested(eb);
120 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
121 return 1;
122 spin_unlock(&eb->lock);
123 }
124 return 0;
125}
126
127/*
128 * the autoremove wake function will return 0 if it tried to wake up
129 * a process that was already awake, which means that process won't
130 * count as an exclusive wakeup. The waitq code will continue waking
131 * procs until it finds one that was actually sleeping.
132 *
133 * For btrfs, this isn't quite what we want. We want a single proc
134 * to be notified that the lock is ready for taking. If that proc
 135 * already happens to be awake, great, it will loop around and try for
136 * the lock.
137 *
138 * So, btrfs_wake_function always returns 1, even when the proc that we
139 * tried to wake up was already awake.
140 */
141static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
142 int sync, void *key)
143{
144 autoremove_wake_function(wait, mode, sync, key);
145 return 1;
146}
147
148/*
149 * returns with the extent buffer spinlocked.
150 *
151 * This will spin and/or wait as required to take the lock, and then
152 * return with the spinlock held.
153 *
154 * After this call, scheduling is not safe without first calling
155 * btrfs_set_lock_blocking()
156 */
157int btrfs_tree_lock(struct extent_buffer *eb)
158{
159 DEFINE_WAIT(wait);
160 wait.func = btrfs_wake_function;
161
162 while(1) {
163 spin_nested(eb);
164
165 /* nobody is blocking, exit with the spinlock held */
166 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
47 return 0; 167 return 0;
168
169 /*
170 * we have the spinlock, but the real owner is blocking.
171 * wait for them
172 */
173 spin_unlock(&eb->lock);
174
175 /*
176 * spin for a bit, and if the blocking flag goes away,
177 * loop around
178 */
179 if (btrfs_spin_on_block(eb))
180 continue;
181
182 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
183 TASK_UNINTERRUPTIBLE);
184
185 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
186 schedule();
187
188 finish_wait(&eb->lock_wq, &wait);
48 } 189 }
49 cpu_relax();
50 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
51 return 0; 190 return 0;
52} 191}
53 192
193/*
194 * Very quick trylock, this does not spin or schedule. It returns
195 * 1 with the spinlock held if it was able to take the lock, or it
196 * returns zero if it was unable to take the lock.
197 *
198 * After this call, scheduling is not safe without first calling
199 * btrfs_set_lock_blocking()
200 */
54int btrfs_try_tree_lock(struct extent_buffer *eb) 201int btrfs_try_tree_lock(struct extent_buffer *eb)
55{ 202{
56 return mutex_trylock(&eb->mutex); 203 if (spin_trylock(&eb->lock)) {
204 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
205 /*
206 * we've got the spinlock, but the real owner is
207 * blocking. Drop the spinlock and return failure
208 */
209 spin_unlock(&eb->lock);
210 return 0;
211 }
212 return 1;
213 }
 214 /* someone else has the spinlock; give up */
215 return 0;
57} 216}
58 217
59int btrfs_tree_unlock(struct extent_buffer *eb) 218int btrfs_tree_unlock(struct extent_buffer *eb)
60{ 219{
61 mutex_unlock(&eb->mutex); 220 /*
221 * if we were a blocking owner, we don't have the spinlock held
222 * just clear the bit and look for waiters
223 */
224 if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
225 smp_mb__after_clear_bit();
226 else
227 spin_unlock(&eb->lock);
228
229 if (waitqueue_active(&eb->lock_wq))
230 wake_up(&eb->lock_wq);
62 return 0; 231 return 0;
63} 232}
64 233
65int btrfs_tree_locked(struct extent_buffer *eb) 234int btrfs_tree_locked(struct extent_buffer *eb)
66{ 235{
67 return mutex_is_locked(&eb->mutex); 236 return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
237 spin_is_locked(&eb->lock);
68} 238}
69 239
70/* 240/*
@@ -75,12 +245,14 @@ int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
75{ 245{
76 int i; 246 int i;
77 struct extent_buffer *eb; 247 struct extent_buffer *eb;
248
78 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { 249 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
79 eb = path->nodes[i]; 250 eb = path->nodes[i];
80 if (!eb) 251 if (!eb)
81 break; 252 break;
82 smp_mb(); 253 smp_mb();
83 if (!list_empty(&eb->mutex.wait_list)) 254 if (spin_is_contended(&eb->lock) ||
255 waitqueue_active(&eb->lock_wq))
84 return 1; 256 return 1;
85 } 257 }
86 return 0; 258 return 0;
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index bc1faef12519..d92e707f5870 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -22,6 +22,12 @@
22int btrfs_tree_lock(struct extent_buffer *eb); 22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 23int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_tree_locked(struct extent_buffer *eb); 24int btrfs_tree_locked(struct extent_buffer *eb);
25
25int btrfs_try_tree_lock(struct extent_buffer *eb); 26int btrfs_try_tree_lock(struct extent_buffer *eb);
27int btrfs_try_spin_lock(struct extent_buffer *eb);
28
26int btrfs_path_lock_waiting(struct btrfs_path *path, int level); 29int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
30
31void btrfs_set_lock_blocking(struct extent_buffer *eb);
32void btrfs_clear_lock_blocking(struct extent_buffer *eb);
27#endif 33#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3e8358c36165..98d25fa4570e 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
74 u32 nritems; 74 u32 nritems;
75 75
76 root_node = btrfs_lock_root_node(root); 76 root_node = btrfs_lock_root_node(root);
77 btrfs_set_lock_blocking(root_node);
77 nritems = btrfs_header_nritems(root_node); 78 nritems = btrfs_header_nritems(root_node);
78 root->defrag_max.objectid = 0; 79 root->defrag_max.objectid = 0;
79 /* from above we know this is not a leaf */ 80 /* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4f26f3ed0c87..20794290256b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1615,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1615 1615
1616 btrfs_tree_lock(next); 1616 btrfs_tree_lock(next);
1617 clean_tree_block(trans, root, next); 1617 clean_tree_block(trans, root, next);
1618 btrfs_set_lock_blocking(next);
1618 btrfs_wait_tree_block_writeback(next); 1619 btrfs_wait_tree_block_writeback(next);
1619 btrfs_tree_unlock(next); 1620 btrfs_tree_unlock(next);
1620 1621
@@ -1661,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1661 next = path->nodes[*level]; 1662 next = path->nodes[*level];
1662 btrfs_tree_lock(next); 1663 btrfs_tree_lock(next);
1663 clean_tree_block(trans, root, next); 1664 clean_tree_block(trans, root, next);
1665 btrfs_set_lock_blocking(next);
1664 btrfs_wait_tree_block_writeback(next); 1666 btrfs_wait_tree_block_writeback(next);
1665 btrfs_tree_unlock(next); 1667 btrfs_tree_unlock(next);
1666 1668
@@ -1718,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1718 1720
1719 btrfs_tree_lock(next); 1721 btrfs_tree_lock(next);
1720 clean_tree_block(trans, root, next); 1722 clean_tree_block(trans, root, next);
1723 btrfs_set_lock_blocking(next);
1721 btrfs_wait_tree_block_writeback(next); 1724 btrfs_wait_tree_block_writeback(next);
1722 btrfs_tree_unlock(next); 1725 btrfs_tree_unlock(next);
1723 1726
@@ -1790,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1790 1793
1791 btrfs_tree_lock(next); 1794 btrfs_tree_lock(next);
1792 clean_tree_block(trans, log, next); 1795 clean_tree_block(trans, log, next);
1796 btrfs_set_lock_blocking(next);
1793 btrfs_wait_tree_block_writeback(next); 1797 btrfs_wait_tree_block_writeback(next);
1794 btrfs_tree_unlock(next); 1798 btrfs_tree_unlock(next);
1795 1799