aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2011-07-16 15:23:14 -0400
committerChris Mason <chris.mason@oracle.com>2011-07-27 12:46:46 -0400
commitbd681513fa6f2ff29aa391f01e413a2d1c59fd77 (patch)
treebb10ec6ef876b4d7a553cbe54976ec49a0d10b21
parent81317fdeddcef259b6ecf7b5c0d04caa167c6b54 (diff)
Btrfs: switch the btrfs tree locks to reader/writer
The btrfs metadata btree is the source of significant lock contention, especially in the root node. This commit changes our locking to use a reader/writer lock. The lock is built on top of rw spinlocks, and it extends the lock tracking to remember if we have a read lock or a write lock when we go to blocking. Atomics count the number of blocking readers or writers at any given time. It removes all of the adaptive spinning from the old code and uses only the spinning/blocking hints inside of btrfs to decide when it should continue spinning. In read heavy workloads this is dramatically faster. In write heavy workloads we're still faster because of less contention on the root node lock. We suffer slightly in dbench because we schedule more often during write locks, but all other benchmarks so far are improved. Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r--fs/btrfs/ctree.c268
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/delayed-inode.c2
-rw-r--r--fs/btrfs/extent-tree.c20
-rw-r--r--fs/btrfs/extent_io.c11
-rw-r--r--fs/btrfs/extent_io.h24
-rw-r--r--fs/btrfs/locking.c280
-rw-r--r--fs/btrfs/locking.h36
-rw-r--r--fs/btrfs/tree-log.c6
9 files changed, 431 insertions, 218 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d2431284b913..0ad48e782d37 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,8 +54,13 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
54{ 54{
55 int i; 55 int i;
56 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 56 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
57 if (p->nodes[i] && p->locks[i]) 57 if (!p->nodes[i] || !p->locks[i])
58 btrfs_set_lock_blocking(p->nodes[i]); 58 continue;
59 btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
60 if (p->locks[i] == BTRFS_READ_LOCK)
61 p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
62 else if (p->locks[i] == BTRFS_WRITE_LOCK)
63 p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
59 } 64 }
60} 65}
61 66
@@ -68,7 +73,7 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
68 * for held 73 * for held
69 */ 74 */
70noinline void btrfs_clear_path_blocking(struct btrfs_path *p, 75noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
71 struct extent_buffer *held) 76 struct extent_buffer *held, int held_rw)
72{ 77{
73 int i; 78 int i;
74 79
@@ -79,19 +84,29 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
79 * really sure by forcing the path to blocking before we clear 84 * really sure by forcing the path to blocking before we clear
80 * the path blocking. 85 * the path blocking.
81 */ 86 */
82 if (held) 87 if (held) {
83 btrfs_set_lock_blocking(held); 88 btrfs_set_lock_blocking_rw(held, held_rw);
89 if (held_rw == BTRFS_WRITE_LOCK)
90 held_rw = BTRFS_WRITE_LOCK_BLOCKING;
91 else if (held_rw == BTRFS_READ_LOCK)
92 held_rw = BTRFS_READ_LOCK_BLOCKING;
93 }
84 btrfs_set_path_blocking(p); 94 btrfs_set_path_blocking(p);
85#endif 95#endif
86 96
87 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { 97 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
88 if (p->nodes[i] && p->locks[i]) 98 if (p->nodes[i] && p->locks[i]) {
89 btrfs_clear_lock_blocking(p->nodes[i]); 99 btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
100 if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
101 p->locks[i] = BTRFS_WRITE_LOCK;
102 else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
103 p->locks[i] = BTRFS_READ_LOCK;
104 }
90 } 105 }
91 106
92#ifdef CONFIG_DEBUG_LOCK_ALLOC 107#ifdef CONFIG_DEBUG_LOCK_ALLOC
93 if (held) 108 if (held)
94 btrfs_clear_lock_blocking(held); 109 btrfs_clear_lock_blocking_rw(held, held_rw);
95#endif 110#endif
96} 111}
97 112
@@ -119,7 +134,7 @@ noinline void btrfs_release_path(struct btrfs_path *p)
119 if (!p->nodes[i]) 134 if (!p->nodes[i])
120 continue; 135 continue;
121 if (p->locks[i]) { 136 if (p->locks[i]) {
122 btrfs_tree_unlock(p->nodes[i]); 137 btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
123 p->locks[i] = 0; 138 p->locks[i] = 0;
124 } 139 }
125 free_extent_buffer(p->nodes[i]); 140 free_extent_buffer(p->nodes[i]);
@@ -167,6 +182,25 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
167 return eb; 182 return eb;
168} 183}
169 184
185/* loop around taking references on and locking the root node of the
186 * tree until you end up with a lock on the root. A locked buffer
187 * is returned, with a reference held.
188 */
189struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
190{
191 struct extent_buffer *eb;
192
193 while (1) {
194 eb = btrfs_root_node(root);
195 btrfs_tree_read_lock(eb);
196 if (eb == root->node)
197 break;
198 btrfs_tree_read_unlock(eb);
199 free_extent_buffer(eb);
200 }
201 return eb;
202}
203
170/* cowonly root (everything not a reference counted cow subvolume), just get 204/* cowonly root (everything not a reference counted cow subvolume), just get
171 * put onto a simple dirty list. transaction.c walks this to make sure they 205 * put onto a simple dirty list. transaction.c walks this to make sure they
172 * get properly updated on disk. 206 * get properly updated on disk.
@@ -862,7 +896,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
862 896
863 mid = path->nodes[level]; 897 mid = path->nodes[level];
864 898
865 WARN_ON(!path->locks[level]); 899 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
900 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
866 WARN_ON(btrfs_header_generation(mid) != trans->transid); 901 WARN_ON(btrfs_header_generation(mid) != trans->transid);
867 902
868 orig_ptr = btrfs_node_blockptr(mid, orig_slot); 903 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
@@ -1360,7 +1395,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
1360 1395
1361 t = path->nodes[i]; 1396 t = path->nodes[i];
1362 if (i >= lowest_unlock && i > skip_level && path->locks[i]) { 1397 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1363 btrfs_tree_unlock(t); 1398 btrfs_tree_unlock_rw(t, path->locks[i]);
1364 path->locks[i] = 0; 1399 path->locks[i] = 0;
1365 } 1400 }
1366 } 1401 }
@@ -1387,7 +1422,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1387 continue; 1422 continue;
1388 if (!path->locks[i]) 1423 if (!path->locks[i])
1389 continue; 1424 continue;
1390 btrfs_tree_unlock(path->nodes[i]); 1425 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
1391 path->locks[i] = 0; 1426 path->locks[i] = 0;
1392 } 1427 }
1393} 1428}
@@ -1436,6 +1471,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1436 * we can trust our generation number 1471 * we can trust our generation number
1437 */ 1472 */
1438 free_extent_buffer(tmp); 1473 free_extent_buffer(tmp);
1474 btrfs_set_path_blocking(p);
1475
1439 tmp = read_tree_block(root, blocknr, blocksize, gen); 1476 tmp = read_tree_block(root, blocknr, blocksize, gen);
1440 if (tmp && btrfs_buffer_uptodate(tmp, gen)) { 1477 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1441 *eb_ret = tmp; 1478 *eb_ret = tmp;
@@ -1491,20 +1528,27 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1491static int 1528static int
1492setup_nodes_for_search(struct btrfs_trans_handle *trans, 1529setup_nodes_for_search(struct btrfs_trans_handle *trans,
1493 struct btrfs_root *root, struct btrfs_path *p, 1530 struct btrfs_root *root, struct btrfs_path *p,
1494 struct extent_buffer *b, int level, int ins_len) 1531 struct extent_buffer *b, int level, int ins_len,
1532 int *write_lock_level)
1495{ 1533{
1496 int ret; 1534 int ret;
1497 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= 1535 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1498 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1536 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1499 int sret; 1537 int sret;
1500 1538
1539 if (*write_lock_level < level + 1) {
1540 *write_lock_level = level + 1;
1541 btrfs_release_path(p);
1542 goto again;
1543 }
1544
1501 sret = reada_for_balance(root, p, level); 1545 sret = reada_for_balance(root, p, level);
1502 if (sret) 1546 if (sret)
1503 goto again; 1547 goto again;
1504 1548
1505 btrfs_set_path_blocking(p); 1549 btrfs_set_path_blocking(p);
1506 sret = split_node(trans, root, p, level); 1550 sret = split_node(trans, root, p, level);
1507 btrfs_clear_path_blocking(p, NULL); 1551 btrfs_clear_path_blocking(p, NULL, 0);
1508 1552
1509 BUG_ON(sret > 0); 1553 BUG_ON(sret > 0);
1510 if (sret) { 1554 if (sret) {
@@ -1516,13 +1560,19 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
1516 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { 1560 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
1517 int sret; 1561 int sret;
1518 1562
1563 if (*write_lock_level < level + 1) {
1564 *write_lock_level = level + 1;
1565 btrfs_release_path(p);
1566 goto again;
1567 }
1568
1519 sret = reada_for_balance(root, p, level); 1569 sret = reada_for_balance(root, p, level);
1520 if (sret) 1570 if (sret)
1521 goto again; 1571 goto again;
1522 1572
1523 btrfs_set_path_blocking(p); 1573 btrfs_set_path_blocking(p);
1524 sret = balance_level(trans, root, p, level); 1574 sret = balance_level(trans, root, p, level);
1525 btrfs_clear_path_blocking(p, NULL); 1575 btrfs_clear_path_blocking(p, NULL, 0);
1526 1576
1527 if (sret) { 1577 if (sret) {
1528 ret = sret; 1578 ret = sret;
@@ -1566,27 +1616,78 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1566 int err; 1616 int err;
1567 int level; 1617 int level;
1568 int lowest_unlock = 1; 1618 int lowest_unlock = 1;
1619 int root_lock;
1620 /* everything at write_lock_level or lower must be write locked */
1621 int write_lock_level = 0;
1569 u8 lowest_level = 0; 1622 u8 lowest_level = 0;
1570 1623
1571 lowest_level = p->lowest_level; 1624 lowest_level = p->lowest_level;
1572 WARN_ON(lowest_level && ins_len > 0); 1625 WARN_ON(lowest_level && ins_len > 0);
1573 WARN_ON(p->nodes[0] != NULL); 1626 WARN_ON(p->nodes[0] != NULL);
1574 1627
1575 if (ins_len < 0) 1628 if (ins_len < 0) {
1576 lowest_unlock = 2; 1629 lowest_unlock = 2;
1577 1630
1631 /* when we are removing items, we might have to go up to level
1632 * two as we update tree pointers Make sure we keep write
1633 * for those levels as well
1634 */
1635 write_lock_level = 2;
1636 } else if (ins_len > 0) {
1637 /*
1638 * for inserting items, make sure we have a write lock on
1639 * level 1 so we can update keys
1640 */
1641 write_lock_level = 1;
1642 }
1643
1644 if (!cow)
1645 write_lock_level = -1;
1646
1647 if (cow && (p->keep_locks || p->lowest_level))
1648 write_lock_level = BTRFS_MAX_LEVEL;
1649
1578again: 1650again:
1651 /*
1652 * we try very hard to do read locks on the root
1653 */
1654 root_lock = BTRFS_READ_LOCK;
1655 level = 0;
1579 if (p->search_commit_root) { 1656 if (p->search_commit_root) {
1657 /*
1658 * the commit roots are read only
1659 * so we always do read locks
1660 */
1580 b = root->commit_root; 1661 b = root->commit_root;
1581 extent_buffer_get(b); 1662 extent_buffer_get(b);
1663 level = btrfs_header_level(b);
1582 if (!p->skip_locking) 1664 if (!p->skip_locking)
1583 btrfs_tree_lock(b); 1665 btrfs_tree_read_lock(b);
1584 } else { 1666 } else {
1585 if (p->skip_locking) 1667 if (p->skip_locking) {
1586 b = btrfs_root_node(root); 1668 b = btrfs_root_node(root);
1587 else 1669 level = btrfs_header_level(b);
1588 b = btrfs_lock_root_node(root); 1670 } else {
1671 /* we don't know the level of the root node
1672 * until we actually have it read locked
1673 */
1674 b = btrfs_read_lock_root_node(root);
1675 level = btrfs_header_level(b);
1676 if (level <= write_lock_level) {
1677 /* whoops, must trade for write lock */
1678 btrfs_tree_read_unlock(b);
1679 free_extent_buffer(b);
1680 b = btrfs_lock_root_node(root);
1681 root_lock = BTRFS_WRITE_LOCK;
1682
1683 /* the level might have changed, check again */
1684 level = btrfs_header_level(b);
1685 }
1686 }
1589 } 1687 }
1688 p->nodes[level] = b;
1689 if (!p->skip_locking)
1690 p->locks[level] = root_lock;
1590 1691
1591 while (b) { 1692 while (b) {
1592 level = btrfs_header_level(b); 1693 level = btrfs_header_level(b);
@@ -1595,10 +1696,6 @@ again:
1595 * setup the path here so we can release it under lock 1696 * setup the path here so we can release it under lock
1596 * contention with the cow code 1697 * contention with the cow code
1597 */ 1698 */
1598 p->nodes[level] = b;
1599 if (!p->skip_locking)
1600 p->locks[level] = 1;
1601
1602 if (cow) { 1699 if (cow) {
1603 /* 1700 /*
1604 * if we don't really need to cow this block 1701 * if we don't really need to cow this block
@@ -1610,6 +1707,16 @@ again:
1610 1707
1611 btrfs_set_path_blocking(p); 1708 btrfs_set_path_blocking(p);
1612 1709
1710 /*
1711 * must have write locks on this node and the
1712 * parent
1713 */
1714 if (level + 1 > write_lock_level) {
1715 write_lock_level = level + 1;
1716 btrfs_release_path(p);
1717 goto again;
1718 }
1719
1613 err = btrfs_cow_block(trans, root, b, 1720 err = btrfs_cow_block(trans, root, b,
1614 p->nodes[level + 1], 1721 p->nodes[level + 1],
1615 p->slots[level + 1], &b); 1722 p->slots[level + 1], &b);
@@ -1622,10 +1729,7 @@ cow_done:
1622 BUG_ON(!cow && ins_len); 1729 BUG_ON(!cow && ins_len);
1623 1730
1624 p->nodes[level] = b; 1731 p->nodes[level] = b;
1625 if (!p->skip_locking) 1732 btrfs_clear_path_blocking(p, NULL, 0);
1626 p->locks[level] = 1;
1627
1628 btrfs_clear_path_blocking(p, NULL);
1629 1733
1630 /* 1734 /*
1631 * we have a lock on b and as long as we aren't changing 1735 * we have a lock on b and as long as we aren't changing
@@ -1651,7 +1755,7 @@ cow_done:
1651 } 1755 }
1652 p->slots[level] = slot; 1756 p->slots[level] = slot;
1653 err = setup_nodes_for_search(trans, root, p, b, level, 1757 err = setup_nodes_for_search(trans, root, p, b, level,
1654 ins_len); 1758 ins_len, &write_lock_level);
1655 if (err == -EAGAIN) 1759 if (err == -EAGAIN)
1656 goto again; 1760 goto again;
1657 if (err) { 1761 if (err) {
@@ -1661,6 +1765,19 @@ cow_done:
1661 b = p->nodes[level]; 1765 b = p->nodes[level];
1662 slot = p->slots[level]; 1766 slot = p->slots[level];
1663 1767
1768 /*
1769 * slot 0 is special, if we change the key
1770 * we have to update the parent pointer
1771 * which means we must have a write lock
1772 * on the parent
1773 */
1774 if (slot == 0 && cow &&
1775 write_lock_level < level + 1) {
1776 write_lock_level = level + 1;
1777 btrfs_release_path(p);
1778 goto again;
1779 }
1780
1664 unlock_up(p, level, lowest_unlock); 1781 unlock_up(p, level, lowest_unlock);
1665 1782
1666 if (level == lowest_level) { 1783 if (level == lowest_level) {
@@ -1679,23 +1796,42 @@ cow_done:
1679 } 1796 }
1680 1797
1681 if (!p->skip_locking) { 1798 if (!p->skip_locking) {
1682 btrfs_clear_path_blocking(p, NULL); 1799 level = btrfs_header_level(b);
1683 err = btrfs_try_spin_lock(b); 1800 if (level <= write_lock_level) {
1684 1801 err = btrfs_try_tree_write_lock(b);
1685 if (!err) { 1802 if (!err) {
1686 btrfs_set_path_blocking(p); 1803 btrfs_set_path_blocking(p);
1687 btrfs_tree_lock(b); 1804 btrfs_tree_lock(b);
1688 btrfs_clear_path_blocking(p, b); 1805 btrfs_clear_path_blocking(p, b,
1806 BTRFS_WRITE_LOCK);
1807 }
1808 p->locks[level] = BTRFS_WRITE_LOCK;
1809 } else {
1810 err = btrfs_try_tree_read_lock(b);
1811 if (!err) {
1812 btrfs_set_path_blocking(p);
1813 btrfs_tree_read_lock(b);
1814 btrfs_clear_path_blocking(p, b,
1815 BTRFS_READ_LOCK);
1816 }
1817 p->locks[level] = BTRFS_READ_LOCK;
1689 } 1818 }
1819 p->nodes[level] = b;
1690 } 1820 }
1691 } else { 1821 } else {
1692 p->slots[level] = slot; 1822 p->slots[level] = slot;
1693 if (ins_len > 0 && 1823 if (ins_len > 0 &&
1694 btrfs_leaf_free_space(root, b) < ins_len) { 1824 btrfs_leaf_free_space(root, b) < ins_len) {
1825 if (write_lock_level < 1) {
1826 write_lock_level = 1;
1827 btrfs_release_path(p);
1828 goto again;
1829 }
1830
1695 btrfs_set_path_blocking(p); 1831 btrfs_set_path_blocking(p);
1696 err = split_leaf(trans, root, key, 1832 err = split_leaf(trans, root, key,
1697 p, ins_len, ret == 0); 1833 p, ins_len, ret == 0);
1698 btrfs_clear_path_blocking(p, NULL); 1834 btrfs_clear_path_blocking(p, NULL, 0);
1699 1835
1700 BUG_ON(err > 0); 1836 BUG_ON(err > 0);
1701 if (err) { 1837 if (err) {
@@ -1976,7 +2112,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
1976 add_root_to_dirty_list(root); 2112 add_root_to_dirty_list(root);
1977 extent_buffer_get(c); 2113 extent_buffer_get(c);
1978 path->nodes[level] = c; 2114 path->nodes[level] = c;
1979 path->locks[level] = 1; 2115 path->locks[level] = BTRFS_WRITE_LOCK;
1980 path->slots[level] = 0; 2116 path->slots[level] = 0;
1981 return 0; 2117 return 0;
1982} 2118}
@@ -3819,11 +3955,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3819 3955
3820 WARN_ON(!path->keep_locks); 3956 WARN_ON(!path->keep_locks);
3821again: 3957again:
3822 cur = btrfs_lock_root_node(root); 3958 cur = btrfs_read_lock_root_node(root);
3823 level = btrfs_header_level(cur); 3959 level = btrfs_header_level(cur);
3824 WARN_ON(path->nodes[level]); 3960 WARN_ON(path->nodes[level]);
3825 path->nodes[level] = cur; 3961 path->nodes[level] = cur;
3826 path->locks[level] = 1; 3962 path->locks[level] = BTRFS_READ_LOCK;
3827 3963
3828 if (btrfs_header_generation(cur) < min_trans) { 3964 if (btrfs_header_generation(cur) < min_trans) {
3829 ret = 1; 3965 ret = 1;
@@ -3913,12 +4049,12 @@ find_next_key:
3913 cur = read_node_slot(root, cur, slot); 4049 cur = read_node_slot(root, cur, slot);
3914 BUG_ON(!cur); 4050 BUG_ON(!cur);
3915 4051
3916 btrfs_tree_lock(cur); 4052 btrfs_tree_read_lock(cur);
3917 4053
3918 path->locks[level - 1] = 1; 4054 path->locks[level - 1] = BTRFS_READ_LOCK;
3919 path->nodes[level - 1] = cur; 4055 path->nodes[level - 1] = cur;
3920 unlock_up(path, level, 1); 4056 unlock_up(path, level, 1);
3921 btrfs_clear_path_blocking(path, NULL); 4057 btrfs_clear_path_blocking(path, NULL, 0);
3922 } 4058 }
3923out: 4059out:
3924 if (ret == 0) 4060 if (ret == 0)
@@ -4034,6 +4170,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4034 int ret; 4170 int ret;
4035 int old_spinning = path->leave_spinning; 4171 int old_spinning = path->leave_spinning;
4036 int force_blocking = 0; 4172 int force_blocking = 0;
4173 int next_rw_lock = 0;
4037 4174
4038 nritems = btrfs_header_nritems(path->nodes[0]); 4175 nritems = btrfs_header_nritems(path->nodes[0]);
4039 if (nritems == 0) 4176 if (nritems == 0)
@@ -4051,6 +4188,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4051again: 4188again:
4052 level = 1; 4189 level = 1;
4053 next = NULL; 4190 next = NULL;
4191 next_rw_lock = 0;
4054 btrfs_release_path(path); 4192 btrfs_release_path(path);
4055 4193
4056 path->keep_locks = 1; 4194 path->keep_locks = 1;
@@ -4096,11 +4234,12 @@ again:
4096 } 4234 }
4097 4235
4098 if (next) { 4236 if (next) {
4099 btrfs_tree_unlock(next); 4237 btrfs_tree_unlock_rw(next, next_rw_lock);
4100 free_extent_buffer(next); 4238 free_extent_buffer(next);
4101 } 4239 }
4102 4240
4103 next = c; 4241 next = c;
4242 next_rw_lock = path->locks[level];
4104 ret = read_block_for_search(NULL, root, path, &next, level, 4243 ret = read_block_for_search(NULL, root, path, &next, level,
4105 slot, &key); 4244 slot, &key);
4106 if (ret == -EAGAIN) 4245 if (ret == -EAGAIN)
@@ -4112,15 +4251,22 @@ again:
4112 } 4251 }
4113 4252
4114 if (!path->skip_locking) { 4253 if (!path->skip_locking) {
4115 ret = btrfs_try_spin_lock(next); 4254 ret = btrfs_try_tree_read_lock(next);
4116 if (!ret) { 4255 if (!ret) {
4117 btrfs_set_path_blocking(path); 4256 btrfs_set_path_blocking(path);
4118 btrfs_tree_lock(next); 4257 btrfs_tree_read_lock(next);
4119 if (!force_blocking) 4258 if (!force_blocking) {
4120 btrfs_clear_path_blocking(path, next); 4259 btrfs_clear_path_blocking(path, next,
4260 BTRFS_READ_LOCK);
4261 }
4262 }
4263 if (force_blocking) {
4264 btrfs_set_lock_blocking_rw(next,
4265 BTRFS_READ_LOCK);
4266 next_rw_lock = BTRFS_READ_LOCK_BLOCKING;
4267 } else {
4268 next_rw_lock = BTRFS_READ_LOCK;
4121 } 4269 }
4122 if (force_blocking)
4123 btrfs_set_lock_blocking(next);
4124 } 4270 }
4125 break; 4271 break;
4126 } 4272 }
@@ -4129,14 +4275,13 @@ again:
4129 level--; 4275 level--;
4130 c = path->nodes[level]; 4276 c = path->nodes[level];
4131 if (path->locks[level]) 4277 if (path->locks[level])
4132 btrfs_tree_unlock(c); 4278 btrfs_tree_unlock_rw(c, path->locks[level]);
4133 4279
4134 free_extent_buffer(c); 4280 free_extent_buffer(c);
4135 path->nodes[level] = next; 4281 path->nodes[level] = next;
4136 path->slots[level] = 0; 4282 path->slots[level] = 0;
4137 if (!path->skip_locking) 4283 if (!path->skip_locking)
4138 path->locks[level] = 1; 4284 path->locks[level] = next_rw_lock;
4139
4140 if (!level) 4285 if (!level)
4141 break; 4286 break;
4142 4287
@@ -4151,16 +4296,21 @@ again:
4151 } 4296 }
4152 4297
4153 if (!path->skip_locking) { 4298 if (!path->skip_locking) {
4154 btrfs_assert_tree_locked(path->nodes[level]); 4299 ret = btrfs_try_tree_read_lock(next);
4155 ret = btrfs_try_spin_lock(next);
4156 if (!ret) { 4300 if (!ret) {
4157 btrfs_set_path_blocking(path); 4301 btrfs_set_path_blocking(path);
4158 btrfs_tree_lock(next); 4302 btrfs_tree_read_lock(next);
4159 if (!force_blocking) 4303 if (!force_blocking)
4160 btrfs_clear_path_blocking(path, next); 4304 btrfs_clear_path_blocking(path, next,
4305 BTRFS_READ_LOCK);
4306 }
4307 if (force_blocking) {
4308 btrfs_set_lock_blocking_rw(next,
4309 BTRFS_READ_LOCK);
4310 next_rw_lock = BTRFS_READ_LOCK_BLOCKING;
4311 } else {
4312 next_rw_lock = BTRFS_READ_LOCK;
4161 } 4313 }
4162 if (force_blocking)
4163 btrfs_set_lock_blocking(next);
4164 } 4314 }
4165 } 4315 }
4166 ret = 0; 4316 ret = 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3063f21d3fc6..40235e10cfb9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2333,7 +2333,7 @@ struct btrfs_path *btrfs_alloc_path(void);
2333void btrfs_free_path(struct btrfs_path *p); 2333void btrfs_free_path(struct btrfs_path *p);
2334void btrfs_set_path_blocking(struct btrfs_path *p); 2334void btrfs_set_path_blocking(struct btrfs_path *p);
2335void btrfs_clear_path_blocking(struct btrfs_path *p, 2335void btrfs_clear_path_blocking(struct btrfs_path *p,
2336 struct extent_buffer *held); 2336 struct extent_buffer *held, int held_rw);
2337void btrfs_unlock_up_safe(struct btrfs_path *p, int level); 2337void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
2338 2338
2339int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2339int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 98c68e658a9b..b52c672f4c18 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -735,7 +735,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
735 } 735 }
736 736
737 /* reset all the locked nodes in the patch to spinning locks. */ 737 /* reset all the locked nodes in the patch to spinning locks. */
738 btrfs_clear_path_blocking(path, NULL); 738 btrfs_clear_path_blocking(path, NULL, 0);
739 739
740 /* insert the keys of the items */ 740 /* insert the keys of the items */
741 ret = setup_items_for_insert(trans, root, path, keys, data_size, 741 ret = setup_items_for_insert(trans, root, path, keys, data_size,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7021dde74d81..2a782c2fcb62 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5912,7 +5912,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5912 return 1; 5912 return 1;
5913 5913
5914 if (path->locks[level] && !wc->keep_locks) { 5914 if (path->locks[level] && !wc->keep_locks) {
5915 btrfs_tree_unlock(eb); 5915 btrfs_tree_unlock_rw(eb, path->locks[level]);
5916 path->locks[level] = 0; 5916 path->locks[level] = 0;
5917 } 5917 }
5918 return 0; 5918 return 0;
@@ -5936,7 +5936,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5936 * keep the tree lock 5936 * keep the tree lock
5937 */ 5937 */
5938 if (path->locks[level] && level > 0) { 5938 if (path->locks[level] && level > 0) {
5939 btrfs_tree_unlock(eb); 5939 btrfs_tree_unlock_rw(eb, path->locks[level]);
5940 path->locks[level] = 0; 5940 path->locks[level] = 0;
5941 } 5941 }
5942 return 0; 5942 return 0;
@@ -6049,7 +6049,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6049 BUG_ON(level != btrfs_header_level(next)); 6049 BUG_ON(level != btrfs_header_level(next));
6050 path->nodes[level] = next; 6050 path->nodes[level] = next;
6051 path->slots[level] = 0; 6051 path->slots[level] = 0;
6052 path->locks[level] = 1; 6052 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6053 wc->level = level; 6053 wc->level = level;
6054 if (wc->level == 1) 6054 if (wc->level == 1)
6055 wc->reada_slot = 0; 6055 wc->reada_slot = 0;
@@ -6120,7 +6120,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6120 BUG_ON(level == 0); 6120 BUG_ON(level == 0);
6121 btrfs_tree_lock(eb); 6121 btrfs_tree_lock(eb);
6122 btrfs_set_lock_blocking(eb); 6122 btrfs_set_lock_blocking(eb);
6123 path->locks[level] = 1; 6123 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6124 6124
6125 ret = btrfs_lookup_extent_info(trans, root, 6125 ret = btrfs_lookup_extent_info(trans, root,
6126 eb->start, eb->len, 6126 eb->start, eb->len,
@@ -6129,8 +6129,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6129 BUG_ON(ret); 6129 BUG_ON(ret);
6130 BUG_ON(wc->refs[level] == 0); 6130 BUG_ON(wc->refs[level] == 0);
6131 if (wc->refs[level] == 1) { 6131 if (wc->refs[level] == 1) {
6132 btrfs_tree_unlock(eb); 6132 btrfs_tree_unlock_rw(eb, path->locks[level]);
6133 path->locks[level] = 0;
6134 return 1; 6133 return 1;
6135 } 6134 }
6136 } 6135 }
@@ -6152,7 +6151,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6152 btrfs_header_generation(eb) == trans->transid) { 6151 btrfs_header_generation(eb) == trans->transid) {
6153 btrfs_tree_lock(eb); 6152 btrfs_tree_lock(eb);
6154 btrfs_set_lock_blocking(eb); 6153 btrfs_set_lock_blocking(eb);
6155 path->locks[level] = 1; 6154 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6156 } 6155 }
6157 clean_tree_block(trans, root, eb); 6156 clean_tree_block(trans, root, eb);
6158 } 6157 }
@@ -6231,7 +6230,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6231 return 0; 6230 return 0;
6232 6231
6233 if (path->locks[level]) { 6232 if (path->locks[level]) {
6234 btrfs_tree_unlock(path->nodes[level]); 6233 btrfs_tree_unlock_rw(path->nodes[level],
6234 path->locks[level]);
6235 path->locks[level] = 0; 6235 path->locks[level] = 0;
6236 } 6236 }
6237 free_extent_buffer(path->nodes[level]); 6237 free_extent_buffer(path->nodes[level]);
@@ -6283,7 +6283,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6283 path->nodes[level] = btrfs_lock_root_node(root); 6283 path->nodes[level] = btrfs_lock_root_node(root);
6284 btrfs_set_lock_blocking(path->nodes[level]); 6284 btrfs_set_lock_blocking(path->nodes[level]);
6285 path->slots[level] = 0; 6285 path->slots[level] = 0;
6286 path->locks[level] = 1; 6286 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6287 memset(&wc->update_progress, 0, 6287 memset(&wc->update_progress, 0,
6288 sizeof(wc->update_progress)); 6288 sizeof(wc->update_progress));
6289 } else { 6289 } else {
@@ -6451,7 +6451,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6451 level = btrfs_header_level(node); 6451 level = btrfs_header_level(node);
6452 path->nodes[level] = node; 6452 path->nodes[level] = node;
6453 path->slots[level] = 0; 6453 path->slots[level] = 0;
6454 path->locks[level] = 1; 6454 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6455 6455
6456 wc->refs[parent_level] = 1; 6456 wc->refs[parent_level] = 1;
6457 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 6457 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 76ecbb8ed0e0..5392c3b12fc1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3017,8 +3017,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3017 return NULL; 3017 return NULL;
3018 eb->start = start; 3018 eb->start = start;
3019 eb->len = len; 3019 eb->len = len;
3020 spin_lock_init(&eb->lock); 3020 rwlock_init(&eb->lock);
3021 init_waitqueue_head(&eb->lock_wq); 3021 atomic_set(&eb->write_locks, 0);
3022 atomic_set(&eb->read_locks, 0);
3023 atomic_set(&eb->blocking_readers, 0);
3024 atomic_set(&eb->blocking_writers, 0);
3025 atomic_set(&eb->spinning_readers, 0);
3026 atomic_set(&eb->spinning_writers, 0);
3027 init_waitqueue_head(&eb->write_lock_wq);
3028 init_waitqueue_head(&eb->read_lock_wq);
3022 3029
3023#if LEAK_DEBUG 3030#if LEAK_DEBUG
3024 spin_lock_irqsave(&leak_lock, flags); 3031 spin_lock_irqsave(&leak_lock, flags);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b5f120cca916..21a7ca9e7282 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -128,14 +128,26 @@ struct extent_buffer {
128 struct rcu_head rcu_head; 128 struct rcu_head rcu_head;
129 atomic_t refs; 129 atomic_t refs;
130 130
131 /* the spinlock is used to protect most operations */ 131 /* count of read lock holders on the extent buffer */
132 spinlock_t lock; 132 atomic_t write_locks;
133 atomic_t read_locks;
134 atomic_t blocking_writers;
135 atomic_t blocking_readers;
136 atomic_t spinning_readers;
137 atomic_t spinning_writers;
138
139 /* protects write locks */
140 rwlock_t lock;
141
142 /* readers use lock_wq while they wait for the write
143 * lock holders to unlock
144 */
145 wait_queue_head_t write_lock_wq;
133 146
134 /* 147 /* writers use read_lock_wq while they wait for readers
135 * when we keep the lock held while blocking, waiters go onto 148 * to unlock
136 * the wq
137 */ 149 */
138 wait_queue_head_t lock_wq; 150 wait_queue_head_t read_lock_wq;
139}; 151};
140 152
141static inline void extent_set_compress_type(unsigned long *bio_flags, 153static inline void extent_set_compress_type(unsigned long *bio_flags,
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 66fa43dc3f0f..d77b67c4b275 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -24,185 +24,197 @@
24#include "extent_io.h" 24#include "extent_io.h"
25#include "locking.h" 25#include "locking.h"
26 26
27static inline void spin_nested(struct extent_buffer *eb) 27void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
28{
29 spin_lock(&eb->lock);
30}
31 28
32/* 29/*
33 * Setting a lock to blocking will drop the spinlock and set the 30 * if we currently have a spinning reader or writer lock
34 * flag that forces other procs who want the lock to wait. After 31 * (indicated by the rw flag) this will bump the count
35 * this you can safely schedule with the lock held. 32 * of blocking holders and drop the spinlock.
36 */ 33 */
37void btrfs_set_lock_blocking(struct extent_buffer *eb) 34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
38{ 35{
39 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 36 if (rw == BTRFS_WRITE_LOCK) {
40 set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 37 if (atomic_read(&eb->blocking_writers) == 0) {
41 spin_unlock(&eb->lock); 38 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
39 atomic_dec(&eb->spinning_writers);
40 btrfs_assert_tree_locked(eb);
41 atomic_inc(&eb->blocking_writers);
42 write_unlock(&eb->lock);
43 }
44 } else if (rw == BTRFS_READ_LOCK) {
45 btrfs_assert_tree_read_locked(eb);
46 atomic_inc(&eb->blocking_readers);
47 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
48 atomic_dec(&eb->spinning_readers);
49 read_unlock(&eb->lock);
42 } 50 }
43 /* exit with the spin lock released and the bit set */ 51 return;
44} 52}
45 53
46/* 54/*
47 * clearing the blocking flag will take the spinlock again. 55 * if we currently have a blocking lock, take the spinlock
48 * After this you can't safely schedule 56 * and drop our blocking count
49 */ 57 */
50void btrfs_clear_lock_blocking(struct extent_buffer *eb) 58void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
51{ 59{
52 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 60 if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
53 spin_nested(eb); 61 BUG_ON(atomic_read(&eb->blocking_writers) != 1);
54 clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 62 write_lock(&eb->lock);
55 smp_mb__after_clear_bit(); 63 WARN_ON(atomic_read(&eb->spinning_writers));
64 atomic_inc(&eb->spinning_writers);
65 if (atomic_dec_and_test(&eb->blocking_writers))
66 wake_up(&eb->write_lock_wq);
67 } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
68 BUG_ON(atomic_read(&eb->blocking_readers) == 0);
69 read_lock(&eb->lock);
70 atomic_inc(&eb->spinning_readers);
71 if (atomic_dec_and_test(&eb->blocking_readers))
72 wake_up(&eb->read_lock_wq);
56 } 73 }
57 /* exit with the spin lock held */ 74 return;
58} 75}
59 76
60/* 77/*
61 * unfortunately, many of the places that currently set a lock to blocking 78 * take a spinning read lock. This will wait for any blocking
62 * don't end up blocking for very long, and often they don't block 79 * writers
63 * at all. For a dbench 50 run, if we don't spin on the blocking bit
64 * at all, the context switch rate can jump up to 400,000/sec or more.
65 *
66 * So, we're still stuck with this crummy spin on the blocking bit,
67 * at least until the most common causes of the short blocks
68 * can be dealt with.
69 */ 80 */
70static int btrfs_spin_on_block(struct extent_buffer *eb) 81void btrfs_tree_read_lock(struct extent_buffer *eb)
71{ 82{
72 int i; 83again:
73 84 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
74 for (i = 0; i < 512; i++) { 85 read_lock(&eb->lock);
75 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 86 if (atomic_read(&eb->blocking_writers)) {
76 return 1; 87 read_unlock(&eb->lock);
77 if (need_resched()) 88 wait_event(eb->write_lock_wq,
78 break; 89 atomic_read(&eb->blocking_writers) == 0);
79 cpu_relax(); 90 goto again;
80 } 91 }
81 return 0; 92 atomic_inc(&eb->read_locks);
93 atomic_inc(&eb->spinning_readers);
82} 94}
83 95
84/* 96/*
85 * This is somewhat different from trylock. It will take the 97 * returns 1 if we get the read lock and 0 if we don't
86 * spinlock but if it finds the lock is set to blocking, it will 98 * this won't wait for blocking writers
87 * return without the lock held.
88 *
89 * returns 1 if it was able to take the lock and zero otherwise
90 *
91 * After this call, scheduling is not safe without first calling
92 * btrfs_set_lock_blocking()
93 */ 99 */
94int btrfs_try_spin_lock(struct extent_buffer *eb) 100int btrfs_try_tree_read_lock(struct extent_buffer *eb)
95{ 101{
96 int i; 102 if (atomic_read(&eb->blocking_writers))
103 return 0;
97 104
98 if (btrfs_spin_on_block(eb)) { 105 read_lock(&eb->lock);
99 spin_nested(eb); 106 if (atomic_read(&eb->blocking_writers)) {
100 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 107 read_unlock(&eb->lock);
101 return 1; 108 return 0;
102 spin_unlock(&eb->lock);
103 } 109 }
104 /* spin for a bit on the BLOCKING flag */ 110 atomic_inc(&eb->read_locks);
105 for (i = 0; i < 2; i++) { 111 atomic_inc(&eb->spinning_readers);
106 cpu_relax(); 112 return 1;
107 if (!btrfs_spin_on_block(eb))
108 break;
109
110 spin_nested(eb);
111 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
112 return 1;
113 spin_unlock(&eb->lock);
114 }
115 return 0;
116} 113}
117 114
118/* 115/*
119 * the autoremove wake function will return 0 if it tried to wake up 116 * returns 1 if we get the read lock and 0 if we don't
120 * a process that was already awake, which means that process won't 117 * this won't wait for blocking writers or readers
121 * count as an exclusive wakeup. The waitq code will continue waking
122 * procs until it finds one that was actually sleeping.
123 *
124 * For btrfs, this isn't quite what we want. We want a single proc
125 * to be notified that the lock is ready for taking. If that proc
126 * already happen to be awake, great, it will loop around and try for
127 * the lock.
128 *
129 * So, btrfs_wake_function always returns 1, even when the proc that we
130 * tried to wake up was already awake.
131 */ 118 */
132static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, 119int btrfs_try_tree_write_lock(struct extent_buffer *eb)
133 int sync, void *key)
134{ 120{
135 autoremove_wake_function(wait, mode, sync, key); 121 if (atomic_read(&eb->blocking_writers) ||
122 atomic_read(&eb->blocking_readers))
123 return 0;
124 write_lock(&eb->lock);
125 if (atomic_read(&eb->blocking_writers) ||
126 atomic_read(&eb->blocking_readers)) {
127 write_unlock(&eb->lock);
128 return 0;
129 }
130 atomic_inc(&eb->write_locks);
131 atomic_inc(&eb->spinning_writers);
136 return 1; 132 return 1;
137} 133}
138 134
139/* 135/*
140 * returns with the extent buffer spinlocked. 136 * drop a spinning read lock
141 * 137 */
142 * This will spin and/or wait as required to take the lock, and then 138void btrfs_tree_read_unlock(struct extent_buffer *eb)
143 * return with the spinlock held. 139{
144 * 140 btrfs_assert_tree_read_locked(eb);
145 * After this call, scheduling is not safe without first calling 141 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
146 * btrfs_set_lock_blocking() 142 atomic_dec(&eb->spinning_readers);
143 atomic_dec(&eb->read_locks);
144 read_unlock(&eb->lock);
145}
146
147/*
148 * drop a blocking read lock
149 */
150void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
151{
152 btrfs_assert_tree_read_locked(eb);
153 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
154 if (atomic_dec_and_test(&eb->blocking_readers))
155 wake_up(&eb->read_lock_wq);
156 atomic_dec(&eb->read_locks);
157}
158
159/*
160 * take a spinning write lock. This will wait for both
161 * blocking readers or writers
147 */ 162 */
148int btrfs_tree_lock(struct extent_buffer *eb) 163int btrfs_tree_lock(struct extent_buffer *eb)
149{ 164{
150 DEFINE_WAIT(wait); 165again:
151 wait.func = btrfs_wake_function; 166 wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
152 167 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
153 if (!btrfs_spin_on_block(eb)) 168 write_lock(&eb->lock);
154 goto sleep; 169 if (atomic_read(&eb->blocking_readers)) {
155 170 write_unlock(&eb->lock);
156 while(1) { 171 wait_event(eb->read_lock_wq,
157 spin_nested(eb); 172 atomic_read(&eb->blocking_readers) == 0);
158 173 goto again;
159 /* nobody is blocking, exit with the spinlock held */
160 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
161 return 0;
162
163 /*
164 * we have the spinlock, but the real owner is blocking.
165 * wait for them
166 */
167 spin_unlock(&eb->lock);
168
169 /*
170 * spin for a bit, and if the blocking flag goes away,
171 * loop around
172 */
173 cpu_relax();
174 if (btrfs_spin_on_block(eb))
175 continue;
176sleep:
177 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
178 TASK_UNINTERRUPTIBLE);
179
180 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
181 schedule();
182
183 finish_wait(&eb->lock_wq, &wait);
184 } 174 }
175 if (atomic_read(&eb->blocking_writers)) {
176 write_unlock(&eb->lock);
177 wait_event(eb->write_lock_wq,
178 atomic_read(&eb->blocking_writers) == 0);
179 goto again;
180 }
181 WARN_ON(atomic_read(&eb->spinning_writers));
182 atomic_inc(&eb->spinning_writers);
183 atomic_inc(&eb->write_locks);
185 return 0; 184 return 0;
186} 185}
187 186
187/*
188 * drop a spinning or a blocking write lock.
189 */
188int btrfs_tree_unlock(struct extent_buffer *eb) 190int btrfs_tree_unlock(struct extent_buffer *eb)
189{ 191{
190 /* 192 int blockers = atomic_read(&eb->blocking_writers);
191 * if we were a blocking owner, we don't have the spinlock held 193
192 * just clear the bit and look for waiters 194 BUG_ON(blockers > 1);
193 */ 195
194 if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 196 btrfs_assert_tree_locked(eb);
195 smp_mb__after_clear_bit(); 197 atomic_dec(&eb->write_locks);
196 else 198
197 spin_unlock(&eb->lock); 199 if (blockers) {
198 200 WARN_ON(atomic_read(&eb->spinning_writers));
199 if (waitqueue_active(&eb->lock_wq)) 201 atomic_dec(&eb->blocking_writers);
200 wake_up(&eb->lock_wq); 202 smp_wmb();
203 wake_up(&eb->write_lock_wq);
204 } else {
205 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
206 atomic_dec(&eb->spinning_writers);
207 write_unlock(&eb->lock);
208 }
201 return 0; 209 return 0;
202} 210}
203 211
204void btrfs_assert_tree_locked(struct extent_buffer *eb) 212void btrfs_assert_tree_locked(struct extent_buffer *eb)
205{ 213{
206 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 214 BUG_ON(!atomic_read(&eb->write_locks));
207 assert_spin_locked(&eb->lock); 215}
216
217void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
218{
219 BUG_ON(!atomic_read(&eb->read_locks));
208} 220}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 5c33a560a2f1..17247ddb81a0 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -19,11 +19,43 @@
19#ifndef __BTRFS_LOCKING_ 19#ifndef __BTRFS_LOCKING_
20#define __BTRFS_LOCKING_ 20#define __BTRFS_LOCKING_
21 21
22#define BTRFS_WRITE_LOCK 1
23#define BTRFS_READ_LOCK 2
24#define BTRFS_WRITE_LOCK_BLOCKING 3
25#define BTRFS_READ_LOCK_BLOCKING 4
26
22int btrfs_tree_lock(struct extent_buffer *eb); 27int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 28int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_try_spin_lock(struct extent_buffer *eb); 29int btrfs_try_spin_lock(struct extent_buffer *eb);
25 30
26void btrfs_set_lock_blocking(struct extent_buffer *eb); 31void btrfs_tree_read_lock(struct extent_buffer *eb);
27void btrfs_clear_lock_blocking(struct extent_buffer *eb); 32void btrfs_tree_read_unlock(struct extent_buffer *eb);
33void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
35void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
28void btrfs_assert_tree_locked(struct extent_buffer *eb); 36void btrfs_assert_tree_locked(struct extent_buffer *eb);
37int btrfs_try_tree_read_lock(struct extent_buffer *eb);
38int btrfs_try_tree_write_lock(struct extent_buffer *eb);
39
40static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
41{
42 if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
43 btrfs_tree_unlock(eb);
44 else if (rw == BTRFS_READ_LOCK_BLOCKING)
45 btrfs_tree_read_unlock_blocking(eb);
46 else if (rw == BTRFS_READ_LOCK)
47 btrfs_tree_read_unlock(eb);
48 else
49 BUG();
50}
51
52static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
53{
54 btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
55}
56
57static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
58{
59 btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
60}
29#endif 61#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4ce8a9f41d1e..ac278dd83175 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1730,8 +1730,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1730 btrfs_read_buffer(next, ptr_gen); 1730 btrfs_read_buffer(next, ptr_gen);
1731 1731
1732 btrfs_tree_lock(next); 1732 btrfs_tree_lock(next);
1733 clean_tree_block(trans, root, next);
1734 btrfs_set_lock_blocking(next); 1733 btrfs_set_lock_blocking(next);
1734 clean_tree_block(trans, root, next);
1735 btrfs_wait_tree_block_writeback(next); 1735 btrfs_wait_tree_block_writeback(next);
1736 btrfs_tree_unlock(next); 1736 btrfs_tree_unlock(next);
1737 1737
@@ -1796,8 +1796,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1796 next = path->nodes[*level]; 1796 next = path->nodes[*level];
1797 1797
1798 btrfs_tree_lock(next); 1798 btrfs_tree_lock(next);
1799 clean_tree_block(trans, root, next);
1800 btrfs_set_lock_blocking(next); 1799 btrfs_set_lock_blocking(next);
1800 clean_tree_block(trans, root, next);
1801 btrfs_wait_tree_block_writeback(next); 1801 btrfs_wait_tree_block_writeback(next);
1802 btrfs_tree_unlock(next); 1802 btrfs_tree_unlock(next);
1803 1803
@@ -1864,8 +1864,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1864 next = path->nodes[orig_level]; 1864 next = path->nodes[orig_level];
1865 1865
1866 btrfs_tree_lock(next); 1866 btrfs_tree_lock(next);
1867 clean_tree_block(trans, log, next);
1868 btrfs_set_lock_blocking(next); 1867 btrfs_set_lock_blocking(next);
1868 clean_tree_block(trans, log, next);
1869 btrfs_wait_tree_block_writeback(next); 1869 btrfs_wait_tree_block_writeback(next);
1870 btrfs_tree_unlock(next); 1870 btrfs_tree_unlock(next);
1871 1871