Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/ctree.c        234
-rw-r--r--  fs/btrfs/ctree.h          4
-rw-r--r--  fs/btrfs/disk-io.c       10
-rw-r--r--  fs/btrfs/extent-tree.c    5
-rw-r--r--  fs/btrfs/extent_io.c     18
-rw-r--r--  fs/btrfs/extent_io.h     16
-rw-r--r--  fs/btrfs/inode.c          3
-rw-r--r--  fs/btrfs/locking.c      208
-rw-r--r--  fs/btrfs/locking.h        6
-rw-r--r--  fs/btrfs/tree-defrag.c    1
-rw-r--r--  fs/btrfs/tree-log.c       4
11 files changed, 470 insertions, 39 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3b6e35aafc9..3af777357ac 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,6 +54,31 @@ struct btrfs_path *btrfs_alloc_path(void)
 	return path;
 }
 
+/*
+ * set all locked nodes in the path to blocking locks.  This should
+ * be done before scheduling
+ */
+noinline void btrfs_set_path_blocking(struct btrfs_path *p)
+{
+	int i;
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+		if (p->nodes[i] && p->locks[i])
+			btrfs_set_lock_blocking(p->nodes[i]);
+	}
+}
+
+/*
+ * reset all the locked nodes in the path to spinning locks.
+ */
+noinline void btrfs_clear_path_blocking(struct btrfs_path *p)
+{
+	int i;
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+		if (p->nodes[i] && p->locks[i])
+			btrfs_clear_lock_blocking(p->nodes[i]);
+	}
+}
+
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
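[Editorial sketch, not part of the patch: the calling pattern these two helpers enable later in this commit, around split_node() in btrfs_search_slot():]

	btrfs_set_path_blocking(p);	/* every held lock may now be kept across a sleep */
	sret = split_node(trans, root, p, level);
	btrfs_clear_path_blocking(p);	/* back to cheap spinning locks for the search loop */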
@@ -272,6 +297,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
+	/* cow is set to blocking by btrfs_init_new_buffer */
+
 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
 	btrfs_set_header_bytenr(cow, cow->start);
 	btrfs_set_header_generation(cow, trans->transid);
@@ -397,6 +424,11 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	}
 
 	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+
+	if (parent)
+		btrfs_set_lock_blocking(parent);
+	btrfs_set_lock_blocking(buf);
+
 	ret = __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0,
 				 prealloc_dest);
@@ -502,6 +534,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	if (parent_nritems == 1)
 		return 0;
 
+	btrfs_set_lock_blocking(parent);
+
 	for (i = start_slot; i < end_slot; i++) {
 		int close = 1;
 
@@ -562,6 +596,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		search_start = last_block;
 
 		btrfs_tree_lock(cur);
+		btrfs_set_lock_blocking(cur);
 		err = __btrfs_cow_block(trans, root, cur, parent, i,
 					&cur, search_start,
 					min(16 * blocksize,
@@ -860,6 +895,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		return 0;
 
 	mid = path->nodes[level];
+
 	WARN_ON(!path->locks[level]);
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
@@ -882,6 +918,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		/* promote the child to a root */
 		child = read_node_slot(root, mid, 0);
 		btrfs_tree_lock(child);
+		btrfs_set_lock_blocking(child);
 		BUG_ON(!child);
 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
 		BUG_ON(ret);
@@ -898,6 +935,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 		add_root_to_dirty_list(root);
 		btrfs_tree_unlock(child);
+
 		path->locks[level] = 0;
 		path->nodes[level] = NULL;
 		clean_tree_block(trans, root, mid);
@@ -922,6 +960,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	left = read_node_slot(root, parent, pslot - 1);
 	if (left) {
 		btrfs_tree_lock(left);
+		btrfs_set_lock_blocking(left);
 		wret = btrfs_cow_block(trans, root, left,
 				       parent, pslot - 1, &left, 0);
 		if (wret) {
@@ -932,6 +971,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	right = read_node_slot(root, parent, pslot + 1);
 	if (right) {
 		btrfs_tree_lock(right);
+		btrfs_set_lock_blocking(right);
 		wret = btrfs_cow_block(trans, root, right,
 				       parent, pslot + 1, &right, 0);
 		if (wret) {
@@ -1107,6 +1147,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 		u32 left_nr;
 
 		btrfs_tree_lock(left);
+		btrfs_set_lock_blocking(left);
+
 		left_nr = btrfs_header_nritems(left);
 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
@@ -1153,7 +1195,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	 */
 	if (right) {
 		u32 right_nr;
+
 		btrfs_tree_lock(right);
+		btrfs_set_lock_blocking(right);
+
 		right_nr = btrfs_header_nritems(right);
 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
@@ -1265,6 +1310,68 @@ static noinline void reada_for_search(struct btrfs_root *root,
 }
 
 /*
+ * returns -EAGAIN if it had to drop the path, or zero if everything was in
+ * cache
+ */
+static noinline int reada_for_balance(struct btrfs_root *root,
+				      struct btrfs_path *path, int level)
+{
+	int slot;
+	int nritems;
+	struct extent_buffer *parent;
+	struct extent_buffer *eb;
+	u64 gen;
+	u64 block1 = 0;
+	u64 block2 = 0;
+	int ret = 0;
+	int blocksize;
+
+	parent = path->nodes[level - 1];
+	if (!parent)
+		return 0;
+
+	nritems = btrfs_header_nritems(parent);
+	slot = path->slots[level];
+	blocksize = btrfs_level_size(root, level);
+
+	if (slot > 0) {
+		block1 = btrfs_node_blockptr(parent, slot - 1);
+		gen = btrfs_node_ptr_generation(parent, slot - 1);
+		eb = btrfs_find_tree_block(root, block1, blocksize);
+		if (eb && btrfs_buffer_uptodate(eb, gen))
+			block1 = 0;
+		free_extent_buffer(eb);
+	}
+	if (slot < nritems) {
+		block2 = btrfs_node_blockptr(parent, slot + 1);
+		gen = btrfs_node_ptr_generation(parent, slot + 1);
+		eb = btrfs_find_tree_block(root, block2, blocksize);
+		if (eb && btrfs_buffer_uptodate(eb, gen))
+			block2 = 0;
+		free_extent_buffer(eb);
+	}
+	if (block1 || block2) {
+		ret = -EAGAIN;
+		btrfs_release_path(root, path);
+		if (block1)
+			readahead_tree_block(root, block1, blocksize, 0);
+		if (block2)
+			readahead_tree_block(root, block2, blocksize, 0);
+
+		if (block1) {
+			eb = read_tree_block(root, block1, blocksize, 0);
+			free_extent_buffer(eb);
+		}
+		if (block2) {
+			eb = read_tree_block(root, block2, blocksize, 0);
+			free_extent_buffer(eb);
+		}
+	}
+	return ret;
+}
+
+
+/*
  * when we walk down the tree, it is usually safe to unlock the higher layers
  * in the tree.  The exceptions are when our path goes through slot 0, because
  * operations on the tree might require changing key pointers higher up in the
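[Editorial sketch: how the -EAGAIN contract above is consumed later in this commit. Because reada_for_balance() releases the path before issuing readahead, the caller must restart the whole search:]

	sret = reada_for_balance(root, p, level);
	if (sret)
		goto again;	/* path was dropped; restart from the root */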
@@ -1315,6 +1422,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 }
 
 /*
+ * This releases any locks held in the path starting at level and
+ * going all the way up to the root.
+ *
+ * btrfs_search_slot will keep the lock held on higher nodes in a few
+ * corner cases, such as COW of the block at slot zero in the node.  This
+ * ignores those rules, and it should only be called when there are no
+ * more updates to be done higher up in the tree.
+ */
+noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
+{
+	int i;
+
+	if (path->keep_locks || path->lowest_level)
+		return;
+
+	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+		if (!path->nodes[i])
+			break;
+		if (!path->locks[i])
+			break;
+		btrfs_tree_unlock(path->nodes[i]);
+		path->locks[i] = 0;
+	}
+}
+
+/*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
  * level of the path (level 0)
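[Editorial sketch: elsewhere in this commit (btrfs_insert_empty_items() and btrfs_update_inode()), once only the leaf will be modified the callers drop everything above it:]

	btrfs_unlock_up_safe(path, 1);	/* keep the leaf locked, release all parents */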
@@ -1385,6 +1518,7 @@ again:
 	 */
 	if (prealloc_block.objectid &&
 	    prealloc_block.offset != b->len) {
+		btrfs_set_path_blocking(p);
 		btrfs_free_reserved_extent(root,
 					   prealloc_block.objectid,
 					   prealloc_block.offset);
@@ -1409,6 +1543,8 @@ again:
 				goto again;
 			}
 
+			btrfs_set_path_blocking(p);
+
 			wret = btrfs_cow_block(trans, root, b,
 					       p->nodes[level + 1],
 					       p->slots[level + 1],
@@ -1430,6 +1566,22 @@ cow_done:
 		if (!p->skip_locking)
 			p->locks[level] = 1;
 
+		btrfs_clear_path_blocking(p);
+
+		/*
+		 * we have a lock on b and as long as we aren't changing
+		 * the tree, there is no way for the items in b to change.
+		 * It is safe to drop the lock on our parent before we
+		 * go through the expensive btree search on b.
+		 *
+		 * If cow is true, then we might be changing slot zero,
+		 * which may require changing the parent.  So, we can't
+		 * drop the lock until after we know which slot we're
+		 * operating on.
+		 */
+		if (!cow)
+			btrfs_unlock_up_safe(p, level + 1);
+
 		ret = check_block(root, p, level);
 		if (ret) {
 			ret = -1;
@@ -1437,6 +1589,7 @@ cow_done:
 		}
 
 		ret = bin_search(b, key, level, &slot);
+
 		if (level != 0) {
 			if (ret && slot > 0)
 				slot -= 1;
@@ -1444,7 +1597,16 @@ cow_done:
 			if ((p->search_for_split || ins_len > 0) &&
 			    btrfs_header_nritems(b) >=
 			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
-				int sret = split_node(trans, root, p, level);
+				int sret;
+
+				sret = reada_for_balance(root, p, level);
+				if (sret)
+					goto again;
+
+				btrfs_set_path_blocking(p);
+				sret = split_node(trans, root, p, level);
+				btrfs_clear_path_blocking(p);
+
 				BUG_ON(sret > 0);
 				if (sret) {
 					ret = sret;
@@ -1453,8 +1615,16 @@ cow_done:
 				b = p->nodes[level];
 				slot = p->slots[level];
 			} else if (ins_len < 0) {
-				int sret = balance_level(trans, root, p,
-							 level);
+				int sret;
+
+				sret = reada_for_balance(root, p, level);
+				if (sret)
+					goto again;
+
+				btrfs_set_path_blocking(p);
+				sret = balance_level(trans, root, p, level);
+				btrfs_clear_path_blocking(p);
+
 				if (sret) {
 					ret = sret;
 					goto done;
@@ -1488,7 +1658,7 @@ cow_done:
 				 * of the btree by dropping locks before
 				 * we read.
 				 */
-				if (level > 1) {
+				if (level > 0) {
 					btrfs_release_path(NULL, p);
 					if (tmp)
 						free_extent_buffer(tmp);
@@ -1503,6 +1673,7 @@ cow_done:
 						free_extent_buffer(tmp);
 					goto again;
 				} else {
+					btrfs_set_path_blocking(p);
 					if (tmp)
 						free_extent_buffer(tmp);
 					if (should_reada)
@@ -1512,14 +1683,29 @@ cow_done:
 				b = read_node_slot(root, b, slot);
 			}
 		}
-		if (!p->skip_locking)
-			btrfs_tree_lock(b);
+		if (!p->skip_locking) {
+			int lret;
+
+			btrfs_clear_path_blocking(p);
+			lret = btrfs_try_spin_lock(b);
+
+			if (!lret) {
+				btrfs_set_path_blocking(p);
+				btrfs_tree_lock(b);
+				btrfs_clear_path_blocking(p);
+			}
+		}
 	} else {
 		p->slots[level] = slot;
 		if (ins_len > 0 &&
 		    btrfs_leaf_free_space(root, b) < ins_len) {
-			int sret = split_leaf(trans, root, key,
+			int sret;
+
+			btrfs_set_path_blocking(p);
+			sret = split_leaf(trans, root, key,
 					      p, ins_len, ret == 0);
+			btrfs_clear_path_blocking(p);
+
 			BUG_ON(sret > 0);
 			if (sret) {
 				ret = sret;
@@ -1533,12 +1719,16 @@ cow_done:
 	}
 	ret = 1;
 done:
+	/*
+	 * we don't really know what they plan on doing with the path
+	 * from here on, so for now just mark it as blocking
+	 */
+	btrfs_set_path_blocking(p);
 	if (prealloc_block.objectid) {
 		btrfs_free_reserved_extent(root,
 					   prealloc_block.objectid,
 					   prealloc_block.offset);
 	}
-
 	return ret;
 }
 
@@ -1562,6 +1752,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
 	BUG_ON(ret);
 
+	btrfs_set_lock_blocking(eb);
+
 	parent = eb;
 	while (1) {
 		level = btrfs_header_level(parent);
@@ -1586,6 +1778,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 			eb = read_tree_block(root, bytenr, blocksize,
 					     generation);
 			btrfs_tree_lock(eb);
+			btrfs_set_lock_blocking(eb);
 		}
 
 		/*
@@ -1610,6 +1803,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 			eb = read_tree_block(root, bytenr, blocksize,
 					     generation);
 			btrfs_tree_lock(eb);
+			btrfs_set_lock_blocking(eb);
 		}
 
 		ret = btrfs_cow_block(trans, root, eb, parent, slot,
@@ -2156,6 +2350,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	right = read_node_slot(root, upper, slot + 1);
 	btrfs_tree_lock(right);
+	btrfs_set_lock_blocking(right);
+
 	free_space = btrfs_leaf_free_space(root, right);
 	if (free_space < data_size)
 		goto out_unlock;
@@ -2351,6 +2547,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	left = read_node_slot(root, path->nodes[1], slot - 1);
 	btrfs_tree_lock(left);
+	btrfs_set_lock_blocking(left);
+
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size) {
 		ret = 1;
@@ -2809,6 +3007,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
 	path->keep_locks = 0;
 	BUG_ON(ret);
 
+	/*
+	 * make sure any changes to the path from split_leaf leave it
+	 * in a blocking state
+	 */
+	btrfs_set_path_blocking(path);
+
 	leaf = path->nodes[0];
 	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
 
@@ -3338,6 +3542,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 		BUG();
 	}
 out:
+	btrfs_unlock_up_safe(path, 1);
 	return ret;
 }
 
@@ -3705,12 +3910,14 @@ find_next_key:
 	 */
 	if (slot >= nritems) {
 		path->slots[level] = slot;
+		btrfs_set_path_blocking(path);
 		sret = btrfs_find_next_key(root, path, min_key, level,
 					   cache_only, min_trans);
 		if (sret == 0) {
 			btrfs_release_path(root, path);
 			goto again;
 		} else {
+			btrfs_clear_path_blocking(path);
 			goto out;
 		}
 	}
@@ -3722,16 +3929,20 @@ find_next_key:
 			unlock_up(path, level, 1);
 			goto out;
 		}
+		btrfs_set_path_blocking(path);
 		cur = read_node_slot(root, cur, slot);
 
 		btrfs_tree_lock(cur);
+
 		path->locks[level - 1] = 1;
 		path->nodes[level - 1] = cur;
 		unlock_up(path, level, 1);
+		btrfs_clear_path_blocking(path);
 	}
 out:
 	if (ret == 0)
 		memcpy(min_key, &found_key, sizeof(found_key));
+	btrfs_set_path_blocking(path);
 	return ret;
 }
 
@@ -3827,6 +4038,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	if (ret < 0)
 		return ret;
 
+	btrfs_set_path_blocking(path);
 	nritems = btrfs_header_nritems(path->nodes[0]);
 	/*
 	 * by releasing the path above we dropped all our locks.  A balance
@@ -3857,6 +4069,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			free_extent_buffer(next);
 		}
 
+		/* the path was set to blocking above */
 		if (level == 1 && (path->locks[1] || path->skip_locking) &&
 		    path->reada)
 			reada_for_search(root, path, level, slot, 0);
@@ -3865,6 +4078,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		if (!path->skip_locking) {
 			WARN_ON(!btrfs_tree_locked(c));
 			btrfs_tree_lock(next);
+			btrfs_set_lock_blocking(next);
 		}
 		break;
 	}
@@ -3881,12 +4095,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		path->locks[level] = 1;
 		if (!level)
 			break;
+
+		btrfs_set_path_blocking(path);
 		if (level == 1 && path->locks[1] && path->reada)
 			reada_for_search(root, path, level, slot, 0);
 		next = read_node_slot(root, next, 0);
 		if (!path->skip_locking) {
 			WARN_ON(!btrfs_tree_locked(path->nodes[level]));
 			btrfs_tree_lock(next);
+			btrfs_set_lock_blocking(next);
 		}
 	}
 done:
@@ -3911,6 +4128,7 @@ int btrfs_previous_item(struct btrfs_root *root,
 
 	while (1) {
 		if (path->slots[0] == 0) {
+			btrfs_set_path_blocking(path);
 			ret = btrfs_prev_leaf(root, path);
 			if (ret != 0)
 				return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f2b8d26b047..531db112c8b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1835,6 +1835,10 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 void btrfs_init_path(struct btrfs_path *p);
+void btrfs_set_path_blocking(struct btrfs_path *p);
+void btrfs_clear_path_blocking(struct btrfs_path *p);
+void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
+
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		    struct btrfs_path *path, int slot, int nr);
 int btrfs_del_leaf(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 549271607c1..5aebddd7119 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -799,7 +799,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 
 	if (ret == 0)
-		buf->flags |= EXTENT_UPTODATE;
+		set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
 	else
 		WARN_ON(1);
 	return buf;
@@ -813,6 +813,10 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	if (btrfs_header_generation(buf) ==
 	    root->fs_info->running_transaction->transid) {
 		WARN_ON(!btrfs_tree_locked(buf));
+
+		/* ugh, clear_extent_buffer_dirty can be expensive */
+		btrfs_set_lock_blocking(buf);
+
 		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
 					  buf);
 	}
@@ -2311,6 +2315,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	u64 transid = btrfs_header_generation(buf);
 	struct inode *btree_inode = root->fs_info->btree_inode;
 
+	btrfs_set_lock_blocking(buf);
+
 	WARN_ON(!btrfs_tree_locked(buf));
 	if (transid != root->fs_info->generation) {
 		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
@@ -2353,7 +2359,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	int ret;
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 	if (ret == 0)
-		buf->flags |= EXTENT_UPTODATE;
+		set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
 	return ret;
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7a22f2e6ec4..ed1e25d7248 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3407,7 +3407,10 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 	btrfs_set_header_generation(buf, trans->transid);
 	btrfs_tree_lock(buf);
 	clean_tree_block(trans, root, buf);
+
+	btrfs_set_lock_blocking(buf);
 	btrfs_set_buffer_uptodate(buf);
+
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
 		set_extent_dirty(&root->dirty_log_pages, buf->start,
 				 buf->start + buf->len - 1, GFP_NOFS);
@@ -3416,6 +3419,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 				 buf->start + buf->len - 1, GFP_NOFS);
 	}
 	trans->blocks_used++;
+	/* this returns a buffer locked for blocking */
 	return buf;
 }
 
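[Editorial sketch of the contract that comment describes, with arguments elided; the __btrfs_cow_block() hunk earlier in this commit relies on it:]

	cow = btrfs_alloc_free_block(...);	/* ends in btrfs_init_new_buffer() */
	/* cow comes back locked and already set to blocking */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);	/* bulk copy; may schedule */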
@@ -3752,6 +3756,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 
 		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
 		btrfs_tree_lock(next);
+		btrfs_set_lock_blocking(next);
 
 		ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
 					      &refs);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2ea7f052722..dd5df53e045 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2990,7 +2990,9 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
 	eb->start = start;
 	eb->len = len;
-	mutex_init(&eb->mutex);
+	spin_lock_init(&eb->lock);
+	init_waitqueue_head(&eb->lock_wq);
+
 #if LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
 	list_add(&eb->leak_list, &buffers);
@@ -3071,8 +3073,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		unlock_page(p);
 	}
 	if (uptodate)
-		eb->flags |= EXTENT_UPTODATE;
-	eb->flags |= EXTENT_BUFFER_FILLED;
+		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
 	spin_lock(&tree->buffer_lock);
 	exists = buffer_tree_insert(tree, start, &eb->rb_node);
@@ -3226,7 +3227,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 	unsigned long num_pages;
 
 	num_pages = num_extent_pages(eb->start, eb->len);
-	eb->flags &= ~EXTENT_UPTODATE;
+	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
 			      GFP_NOFS);
@@ -3297,7 +3298,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 	struct page *page;
 	int pg_uptodate = 1;
 
-	if (eb->flags & EXTENT_UPTODATE)
+	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 1;
 
 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3333,7 +3334,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	struct bio *bio = NULL;
 	unsigned long bio_flags = 0;
 
-	if (eb->flags & EXTENT_UPTODATE)
+	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 0;
 
 	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3364,7 +3365,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 	if (all_uptodate) {
 		if (start_i == 0)
-			eb->flags |= EXTENT_UPTODATE;
+			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 		goto unlock_exit;
 	}
 
@@ -3400,7 +3401,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 
 	if (!ret)
-		eb->flags |= EXTENT_UPTODATE;
+		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 	return ret;
 
 unlock_exit:
@@ -3497,7 +3498,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		unmap_extent_buffer(eb, eb->map_token, km);
 		eb->map_token = NULL;
 		save = 1;
-		WARN_ON(!mutex_is_locked(&eb->mutex));
 	}
 	err = map_private_extent_buffer(eb, start, min_len, token, map,
 					map_start, map_len, km);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index e80c6d96b31..1f9df88afbf 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -22,6 +22,10 @@
 /* flags for bio submission */
 #define EXTENT_BIO_COMPRESSED 1
 
+/* these are bit numbers for test/set bit */
+#define EXTENT_BUFFER_UPTODATE 0
+#define EXTENT_BUFFER_BLOCKING 1
+
 /*
  * page->private values.  Every page that is controlled by the extent
  * map has page->private set to one.
@@ -95,11 +99,19 @@ struct extent_buffer {
 	unsigned long map_start;
 	unsigned long map_len;
 	struct page *first_page;
+	unsigned long bflags;
 	atomic_t refs;
-	int flags;
 	struct list_head leak_list;
 	struct rb_node rb_node;
-	struct mutex mutex;
+
+	/* the spinlock is used to protect most operations */
+	spinlock_t lock;
+
+	/*
+	 * when we keep the lock held while blocking, waiters go onto
+	 * the wq
+	 */
+	wait_queue_head_t lock_wq;
 };
 
 struct extent_map_tree;
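[Editorial note: the atomic bflags word replaces the old int flags so the lock paths can flip state with no lock held. A hedged illustration of the accessor pattern used throughout this commit; wait_for_lock_owner() is hypothetical:]

	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);	/* replaces eb->flags |= EXTENT_UPTODATE */
	if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
		wait_for_lock_owner(eb);		/* hypothetical slow path */
	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);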
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4a79e1c5ebd..ebd7d6c37df 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -50,6 +50,7 @@
 #include "tree-log.h"
 #include "ref-cache.h"
 #include "compression.h"
+#include "locking.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -2021,6 +2022,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
 
 	alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
+
 	BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
 						alloc_group_block, 0);
 	btrfs_free_path(path);
@@ -2117,6 +2119,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 		goto failed;
 	}
 
+	btrfs_unlock_up_safe(path, 1);
 	leaf = path->nodes[0];
 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_inode_item);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae7761db..68fd9ccf180 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -26,45 +26,215 @@
 #include "locking.h"
 
 /*
- * locks the per buffer mutex in an extent buffer.  This uses adaptive locks
- * and the spin is not tuned very extensively.  The spinning does make a big
- * difference in almost every workload, but spinning for the right amount of
- * time needs some help.
- *
- * In general, we want to spin as long as the lock holder is doing btree
- * searches, and we should give up if they are in more expensive code.
+ * btrfs_header_level() isn't free, so don't call it when lockdep isn't
+ * on
  */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void spin_nested(struct extent_buffer *eb)
+{
+	spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
+}
+#else
+static inline void spin_nested(struct extent_buffer *eb)
+{
+	spin_lock(&eb->lock);
+}
+#endif
 
-int btrfs_tree_lock(struct extent_buffer *eb)
+/*
+ * Setting a lock to blocking will drop the spinlock and set the
+ * flag that forces other procs who want the lock to wait.  After
+ * this you can safely schedule with the lock held.
+ */
+void btrfs_set_lock_blocking(struct extent_buffer *eb)
 {
-	int i;
+	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+		set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+		spin_unlock(&eb->lock);
+	}
+	/* exit with the spin lock released and the bit set */
+}
 
-	if (mutex_trylock(&eb->mutex))
-		return 0;
+/*
+ * clearing the blocking flag will take the spinlock again.
+ * After this you can't safely schedule
+ */
+void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+{
+	if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+		spin_nested(eb);
+		clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+		smp_mb__after_clear_bit();
+	}
+	/* exit with the spin lock held */
+}
+
+/*
+ * unfortunately, many of the places that currently set a lock to blocking
+ * don't end up blocking for very long, and often they don't block
+ * at all.  For a dbench 50 run, if we don't spin on the blocking bit
+ * at all, the context switch rate can jump up to 400,000/sec or more.
+ *
+ * So, we're still stuck with this crummy spin on the blocking bit,
+ * at least until the most common causes of the short blocks
+ * can be dealt with.
+ */
+static int btrfs_spin_on_block(struct extent_buffer *eb)
+{
+	int i;
 	for (i = 0; i < 512; i++) {
 		cpu_relax();
-		if (mutex_trylock(&eb->mutex))
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			return 1;
+		if (need_resched())
+			break;
+	}
+	return 0;
+}
+
+/*
+ * This is somewhat different from trylock.  It will take the
+ * spinlock but if it finds the lock is set to blocking, it will
+ * return without the lock held.
+ *
+ * returns 1 if it was able to take the lock and zero otherwise
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_try_spin_lock(struct extent_buffer *eb)
+{
+	int i;
+
+	spin_nested(eb);
+	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+		return 1;
+	spin_unlock(&eb->lock);
+
+	/* spin for a bit on the BLOCKING flag */
+	for (i = 0; i < 2; i++) {
+		if (!btrfs_spin_on_block(eb))
+			break;
+
+		spin_nested(eb);
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			return 1;
+		spin_unlock(&eb->lock);
+	}
+	return 0;
+}
+
+/*
+ * the autoremove wake function will return 0 if it tried to wake up
+ * a process that was already awake, which means that process won't
+ * count as an exclusive wakeup.  The waitq code will continue waking
+ * procs until it finds one that was actually sleeping.
+ *
+ * For btrfs, this isn't quite what we want.  We want a single proc
+ * to be notified that the lock is ready for taking.  If that proc
+ * already happens to be awake, great, it will loop around and try for
+ * the lock.
+ *
+ * So, btrfs_wake_function always returns 1, even when the proc that we
+ * tried to wake up was already awake.
+ */
+static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
+			       int sync, void *key)
+{
+	autoremove_wake_function(wait, mode, sync, key);
+	return 1;
+}
+
+/*
+ * returns with the extent buffer spinlocked.
+ *
+ * This will spin and/or wait as required to take the lock, and then
+ * return with the spinlock held.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_tree_lock(struct extent_buffer *eb)
+{
+	DEFINE_WAIT(wait);
+	wait.func = btrfs_wake_function;
+
+	while (1) {
+		spin_nested(eb);
+
+		/* nobody is blocking, exit with the spinlock held */
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
 			return 0;
+
+		/*
+		 * we have the spinlock, but the real owner is blocking.
+		 * wait for them
+		 */
+		spin_unlock(&eb->lock);
+
+		/*
+		 * spin for a bit, and if the blocking flag goes away,
+		 * loop around
+		 */
+		if (btrfs_spin_on_block(eb))
+			continue;
+
+		prepare_to_wait_exclusive(&eb->lock_wq, &wait,
+					  TASK_UNINTERRUPTIBLE);
+
+		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			schedule();
+
+		finish_wait(&eb->lock_wq, &wait);
 	}
-	cpu_relax();
-	mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
 	return 0;
 }
 
+/*
+ * Very quick trylock, this does not spin or schedule.  It returns
+ * 1 with the spinlock held if it was able to take the lock, or it
+ * returns zero if it was unable to take the lock.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
 int btrfs_try_tree_lock(struct extent_buffer *eb)
 {
-	return mutex_trylock(&eb->mutex);
+	if (spin_trylock(&eb->lock)) {
+		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+			/*
+			 * we've got the spinlock, but the real owner is
+			 * blocking.  Drop the spinlock and return failure
+			 */
+			spin_unlock(&eb->lock);
+			return 0;
+		}
+		return 1;
+	}
+	/* someone else has the spinlock, give up */
+	return 0;
 }
 
 int btrfs_tree_unlock(struct extent_buffer *eb)
 {
-	mutex_unlock(&eb->mutex);
+	/*
+	 * if we were a blocking owner, we don't have the spinlock held
+	 * just clear the bit and look for waiters
+	 */
+	if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+		smp_mb__after_clear_bit();
+	else
+		spin_unlock(&eb->lock);
+
+	if (waitqueue_active(&eb->lock_wq))
+		wake_up(&eb->lock_wq);
 	return 0;
 }
 
 int btrfs_tree_locked(struct extent_buffer *eb)
 {
-	return mutex_is_locked(&eb->mutex);
+	return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
+		spin_is_locked(&eb->lock);
 }
 
 /*
@@ -75,12 +245,14 @@ int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
 {
 	int i;
 	struct extent_buffer *eb;
+
 	for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
 		eb = path->nodes[i];
 		if (!eb)
 			break;
 		smp_mb();
-		if (!list_empty(&eb->mutex.wait_list))
+		if (spin_is_contended(&eb->lock) ||
+		    waitqueue_active(&eb->lock_wq))
 			return 1;
 	}
 	return 0;
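[Editorial: a minimal, compilable userspace model of the two lock states above, for readers who want to experiment outside the kernel. This is an approximation, not the patch's code: a pthread spinlock stands in for eb->lock, an atomic int for the EXTENT_BUFFER_BLOCKING bit, and a mutex/condvar pair for eb->lock_wq; the kernel version additionally spins on the bit before sleeping.]

#include <pthread.h>
#include <stdatomic.h>

struct model_eb {
	pthread_spinlock_t lock;	/* models eb->lock */
	atomic_int blocking;		/* models the EXTENT_BUFFER_BLOCKING bit */
	pthread_mutex_t wq_mtx;		/* with wq, models eb->lock_wq */
	pthread_cond_t wq;
};

static void model_tree_lock(struct model_eb *eb)
{
	for (;;) {
		pthread_spin_lock(&eb->lock);
		if (!atomic_load(&eb->blocking))
			return;			/* lock taken, spinning state */
		pthread_spin_unlock(&eb->lock);

		/* real owner is blocking: sleep until it unlocks */
		pthread_mutex_lock(&eb->wq_mtx);
		while (atomic_load(&eb->blocking))
			pthread_cond_wait(&eb->wq, &eb->wq_mtx);
		pthread_mutex_unlock(&eb->wq_mtx);
	}
}

static void model_set_lock_blocking(struct model_eb *eb)
{
	/* keep logical ownership but drop the spinlock; sleeping is now legal */
	if (!atomic_exchange(&eb->blocking, 1))
		pthread_spin_unlock(&eb->lock);
}

static void model_clear_lock_blocking(struct model_eb *eb)
{
	/* retake the spinlock before going back to the spinning state */
	if (atomic_load(&eb->blocking)) {
		pthread_spin_lock(&eb->lock);
		atomic_store(&eb->blocking, 0);
	}
}

static void model_tree_unlock(struct model_eb *eb)
{
	/* blocking owners hold no spinlock; spinning owners do */
	if (!atomic_exchange(&eb->blocking, 0))
		pthread_spin_unlock(&eb->lock);
	pthread_mutex_lock(&eb->wq_mtx);
	pthread_cond_broadcast(&eb->wq);
	pthread_mutex_unlock(&eb->wq_mtx);
}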
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index bc1faef1251..d92e707f587 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -22,6 +22,12 @@
 int btrfs_tree_lock(struct extent_buffer *eb);
 int btrfs_tree_unlock(struct extent_buffer *eb);
 int btrfs_tree_locked(struct extent_buffer *eb);
+
 int btrfs_try_tree_lock(struct extent_buffer *eb);
+int btrfs_try_spin_lock(struct extent_buffer *eb);
+
 int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
+
+void btrfs_set_lock_blocking(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking(struct extent_buffer *eb);
 #endif
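[Editorial sketch: the full lock lifecycle as the rest of this commit uses the new API; the comments paraphrase the rules stated in locking.c:]

	btrfs_tree_lock(eb);		/* returns with the spinlock held; no sleeping yet */
	/* ... short non-sleeping work ... */
	btrfs_set_lock_blocking(eb);	/* still the owner, but now safe to schedule */
	/* ... I/O or other work that may sleep ... */
	btrfs_clear_lock_blocking(eb);	/* spinlock held again; no sleeping */
	btrfs_tree_unlock(eb);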
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3e8358c3616..98d25fa4570 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	u32 nritems;
 
 	root_node = btrfs_lock_root_node(root);
+	btrfs_set_lock_blocking(root_node);
 	nritems = btrfs_header_nritems(root_node);
 	root->defrag_max.objectid = 0;
 	/* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4f26f3ed0c8..20794290256 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1615,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 
 			btrfs_tree_lock(next);
 			clean_tree_block(trans, root, next);
+			btrfs_set_lock_blocking(next);
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 
@@ -1661,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 		next = path->nodes[*level];
 		btrfs_tree_lock(next);
 		clean_tree_block(trans, root, next);
+		btrfs_set_lock_blocking(next);
 		btrfs_wait_tree_block_writeback(next);
 		btrfs_tree_unlock(next);
 
@@ -1718,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 
 		btrfs_tree_lock(next);
 		clean_tree_block(trans, root, next);
+		btrfs_set_lock_blocking(next);
 		btrfs_wait_tree_block_writeback(next);
 		btrfs_tree_unlock(next);
 
@@ -1790,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 
 		btrfs_tree_lock(next);
 		clean_tree_block(trans, log, next);
+		btrfs_set_lock_blocking(next);
 		btrfs_wait_tree_block_writeback(next);
 		btrfs_tree_unlock(next);
 