Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/ctree.c          | 268
-rw-r--r--  fs/btrfs/ctree.h          |   2
-rw-r--r--  fs/btrfs/delayed-inode.c  |   2
-rw-r--r--  fs/btrfs/extent-tree.c    |  20
-rw-r--r--  fs/btrfs/extent_io.c      |  11
-rw-r--r--  fs/btrfs/extent_io.h      |  24
-rw-r--r--  fs/btrfs/locking.c        | 280
-rw-r--r--  fs/btrfs/locking.h        |  36
-rw-r--r--  fs/btrfs/tree-log.c       |   6
9 files changed, 431 insertions(+), 218 deletions(-)
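
The patch replaces the old extent buffer lock (a spinlock plus an EXTENT_BUFFER_BLOCKING bit) with a reader/writer scheme, so searches can walk most of the tree under shared read locks and only take write locks on the levels they will modify. A btrfs_path now records one of four lock states per level. Below is a minimal standalone sketch of that state model, using the constants the patch adds to fs/btrfs/locking.h; the helper names are illustrative, not the kernel's.

/*
 * Standalone model of the four path lock states introduced by this
 * patch (values match fs/btrfs/locking.h below).  A path remembers,
 * per tree level, which flavor of lock it holds so that unlock and
 * blocking/spinning transitions can be dispatched correctly.
 */
#include <stdio.h>

#define BTRFS_WRITE_LOCK          1
#define BTRFS_READ_LOCK           2
#define BTRFS_WRITE_LOCK_BLOCKING 3
#define BTRFS_READ_LOCK_BLOCKING  4

/* model of btrfs_set_path_blocking(): spinning -> blocking */
static int to_blocking(int lock)
{
	if (lock == BTRFS_READ_LOCK)
		return BTRFS_READ_LOCK_BLOCKING;
	if (lock == BTRFS_WRITE_LOCK)
		return BTRFS_WRITE_LOCK_BLOCKING;
	return lock;		/* already blocking, or unlocked (0) */
}

/* model of btrfs_clear_path_blocking(): blocking -> spinning */
static int to_spinning(int lock)
{
	if (lock == BTRFS_READ_LOCK_BLOCKING)
		return BTRFS_READ_LOCK;
	if (lock == BTRFS_WRITE_LOCK_BLOCKING)
		return BTRFS_WRITE_LOCK;
	return lock;
}

int main(void)
{
	int locks[8] = { BTRFS_READ_LOCK, BTRFS_WRITE_LOCK, 0 };

	for (int i = 0; i < 8; i++)
		locks[i] = to_blocking(locks[i]);
	printf("level 0 now %d (read, blocking)\n", locks[0]);
	for (int i = 0; i < 8; i++)
		locks[i] = to_spinning(locks[i]);
	printf("level 0 back to %d (read, spinning)\n", locks[0]);
	return 0;
}
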
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d2431284b913..0ad48e782d37 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,8 +54,13 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
 {
 	int i;
 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
-		if (p->nodes[i] && p->locks[i])
-			btrfs_set_lock_blocking(p->nodes[i]);
+		if (!p->nodes[i] || !p->locks[i])
+			continue;
+		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
+		if (p->locks[i] == BTRFS_READ_LOCK)
+			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
+		else if (p->locks[i] == BTRFS_WRITE_LOCK)
+			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
 	}
 }
 
@@ -68,7 +73,7 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
  * for held
  */
 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
-					struct extent_buffer *held)
+					struct extent_buffer *held, int held_rw)
 {
 	int i;
 
@@ -79,19 +84,29 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 	 * really sure by forcing the path to blocking before we clear
 	 * the path blocking.
 	 */
-	if (held)
-		btrfs_set_lock_blocking(held);
+	if (held) {
+		btrfs_set_lock_blocking_rw(held, held_rw);
+		if (held_rw == BTRFS_WRITE_LOCK)
+			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
+		else if (held_rw == BTRFS_READ_LOCK)
+			held_rw = BTRFS_READ_LOCK_BLOCKING;
+	}
 	btrfs_set_path_blocking(p);
 #endif
 
 	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
-		if (p->nodes[i] && p->locks[i])
-			btrfs_clear_lock_blocking(p->nodes[i]);
+		if (p->nodes[i] && p->locks[i]) {
+			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
+			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
+				p->locks[i] = BTRFS_WRITE_LOCK;
+			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
+				p->locks[i] = BTRFS_READ_LOCK;
+		}
 	}
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	if (held)
-		btrfs_clear_lock_blocking(held);
+		btrfs_clear_lock_blocking_rw(held, held_rw);
 #endif
 }
 
@@ -119,7 +134,7 @@ noinline void btrfs_release_path(struct btrfs_path *p)
 		if (!p->nodes[i])
 			continue;
 		if (p->locks[i]) {
-			btrfs_tree_unlock(p->nodes[i]);
+			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
 			p->locks[i] = 0;
 		}
 		free_extent_buffer(p->nodes[i]);
@@ -167,6 +182,25 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 	return eb;
 }
 
+/* loop around taking references on and locking the root node of the
+ * tree until you end up with a lock on the root.  A locked buffer
+ * is returned, with a reference held.
+ */
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+	struct extent_buffer *eb;
+
+	while (1) {
+		eb = btrfs_root_node(root);
+		btrfs_tree_read_lock(eb);
+		if (eb == root->node)
+			break;
+		btrfs_tree_read_unlock(eb);
+		free_extent_buffer(eb);
+	}
+	return eb;
+}
+
 /* cowonly root (everything not a reference counted cow subvolume), just get
  * put onto a simple dirty list.  transaction.c walks this to make sure they
  * get properly updated on disk.
@@ -862,7 +896,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 	mid = path->nodes[level];
 
-	WARN_ON(!path->locks[level]);
+	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
+		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
@@ -1360,7 +1395,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 
 		t = path->nodes[i];
 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
-			btrfs_tree_unlock(t);
+			btrfs_tree_unlock_rw(t, path->locks[i]);
 			path->locks[i] = 0;
 		}
 	}
@@ -1387,7 +1422,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 			continue;
 		if (!path->locks[i])
 			continue;
-		btrfs_tree_unlock(path->nodes[i]);
+		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
 		path->locks[i] = 0;
 	}
 }
@@ -1436,6 +1471,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 		 * we can trust our generation number
 		 */
 		free_extent_buffer(tmp);
+		btrfs_set_path_blocking(p);
+
 		tmp = read_tree_block(root, blocknr, blocksize, gen);
 		if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
 			*eb_ret = tmp;
@@ -1491,20 +1528,27 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 static int
 setup_nodes_for_search(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct btrfs_path *p,
-		       struct extent_buffer *b, int level, int ins_len)
+		       struct extent_buffer *b, int level, int ins_len,
+		       int *write_lock_level)
 {
 	int ret;
 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
 	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
 		int sret;
 
+		if (*write_lock_level < level + 1) {
+			*write_lock_level = level + 1;
+			btrfs_release_path(p);
+			goto again;
+		}
+
 		sret = reada_for_balance(root, p, level);
 		if (sret)
 			goto again;
 
 		btrfs_set_path_blocking(p);
 		sret = split_node(trans, root, p, level);
-		btrfs_clear_path_blocking(p, NULL);
+		btrfs_clear_path_blocking(p, NULL, 0);
 
 		BUG_ON(sret > 0);
 		if (sret) {
@@ -1516,13 +1560,19 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
 		int sret;
 
+		if (*write_lock_level < level + 1) {
+			*write_lock_level = level + 1;
+			btrfs_release_path(p);
+			goto again;
+		}
+
 		sret = reada_for_balance(root, p, level);
 		if (sret)
 			goto again;
 
 		btrfs_set_path_blocking(p);
 		sret = balance_level(trans, root, p, level);
-		btrfs_clear_path_blocking(p, NULL);
+		btrfs_clear_path_blocking(p, NULL, 0);
 
 		if (sret) {
 			ret = sret;
@@ -1566,27 +1616,78 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	int err;
 	int level;
 	int lowest_unlock = 1;
+	int root_lock;
+	/* everything at write_lock_level or lower must be write locked */
+	int write_lock_level = 0;
 	u8 lowest_level = 0;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len > 0);
 	WARN_ON(p->nodes[0] != NULL);
 
-	if (ins_len < 0)
+	if (ins_len < 0) {
 		lowest_unlock = 2;
 
+		/* when we are removing items, we might have to go up to level
+		 * two as we update tree pointers  Make sure we keep write
+		 * for those levels as well
+		 */
+		write_lock_level = 2;
+	} else if (ins_len > 0) {
+		/*
+		 * for inserting items, make sure we have a write lock on
+		 * level 1 so we can update keys
+		 */
+		write_lock_level = 1;
+	}
+
+	if (!cow)
+		write_lock_level = -1;
+
+	if (cow && (p->keep_locks || p->lowest_level))
+		write_lock_level = BTRFS_MAX_LEVEL;
+
 again:
+	/*
+	 * we try very hard to do read locks on the root
+	 */
+	root_lock = BTRFS_READ_LOCK;
+	level = 0;
 	if (p->search_commit_root) {
+		/*
+		 * the commit roots are read only
+		 * so we always do read locks
+		 */
 		b = root->commit_root;
 		extent_buffer_get(b);
+		level = btrfs_header_level(b);
 		if (!p->skip_locking)
-			btrfs_tree_lock(b);
+			btrfs_tree_read_lock(b);
 	} else {
-		if (p->skip_locking)
+		if (p->skip_locking) {
 			b = btrfs_root_node(root);
-		else
-			b = btrfs_lock_root_node(root);
+			level = btrfs_header_level(b);
+		} else {
+			/* we don't know the level of the root node
+			 * until we actually have it read locked
+			 */
+			b = btrfs_read_lock_root_node(root);
+			level = btrfs_header_level(b);
+			if (level <= write_lock_level) {
+				/* whoops, must trade for write lock */
+				btrfs_tree_read_unlock(b);
+				free_extent_buffer(b);
+				b = btrfs_lock_root_node(root);
+				root_lock = BTRFS_WRITE_LOCK;
+
+				/* the level might have changed, check again */
+				level = btrfs_header_level(b);
+			}
+		}
 	}
+	p->nodes[level] = b;
+	if (!p->skip_locking)
+		p->locks[level] = root_lock;
 
 	while (b) {
 		level = btrfs_header_level(b);
@@ -1595,10 +1696,6 @@ again:
 		 * setup the path here so we can release it under lock
 		 * contention with the cow code
 		 */
-		p->nodes[level] = b;
-		if (!p->skip_locking)
-			p->locks[level] = 1;
-
 		if (cow) {
 			/*
 			 * if we don't really need to cow this block
@@ -1610,6 +1707,16 @@ again:
 
 			btrfs_set_path_blocking(p);
 
+			/*
+			 * must have write locks on this node and the
+			 * parent
+			 */
+			if (level + 1 > write_lock_level) {
+				write_lock_level = level + 1;
+				btrfs_release_path(p);
+				goto again;
+			}
+
 			err = btrfs_cow_block(trans, root, b,
 					      p->nodes[level + 1],
 					      p->slots[level + 1], &b);
@@ -1622,10 +1729,7 @@ cow_done:
 		BUG_ON(!cow && ins_len);
 
 		p->nodes[level] = b;
-		if (!p->skip_locking)
-			p->locks[level] = 1;
-
-		btrfs_clear_path_blocking(p, NULL);
+		btrfs_clear_path_blocking(p, NULL, 0);
 
 		/*
 		 * we have a lock on b and as long as we aren't changing
@@ -1651,7 +1755,7 @@ cow_done:
 			}
 			p->slots[level] = slot;
 			err = setup_nodes_for_search(trans, root, p, b, level,
-						     ins_len);
+						     ins_len, &write_lock_level);
 			if (err == -EAGAIN)
 				goto again;
 			if (err) {
@@ -1661,6 +1765,19 @@ cow_done:
 			b = p->nodes[level];
 			slot = p->slots[level];
 
+			/*
+			 * slot 0 is special, if we change the key
+			 * we have to update the parent pointer
+			 * which means we must have a write lock
+			 * on the parent
+			 */
+			if (slot == 0 && cow &&
+			    write_lock_level < level + 1) {
+				write_lock_level = level + 1;
+				btrfs_release_path(p);
+				goto again;
+			}
+
 			unlock_up(p, level, lowest_unlock);
 
 			if (level == lowest_level) {
@@ -1679,23 +1796,42 @@ cow_done:
 			}
 
 			if (!p->skip_locking) {
-				btrfs_clear_path_blocking(p, NULL);
-				err = btrfs_try_spin_lock(b);
-
-				if (!err) {
-					btrfs_set_path_blocking(p);
-					btrfs_tree_lock(b);
-					btrfs_clear_path_blocking(p, b);
+				level = btrfs_header_level(b);
+				if (level <= write_lock_level) {
+					err = btrfs_try_tree_write_lock(b);
+					if (!err) {
+						btrfs_set_path_blocking(p);
+						btrfs_tree_lock(b);
+						btrfs_clear_path_blocking(p, b,
+								  BTRFS_WRITE_LOCK);
+					}
+					p->locks[level] = BTRFS_WRITE_LOCK;
+				} else {
+					err = btrfs_try_tree_read_lock(b);
+					if (!err) {
+						btrfs_set_path_blocking(p);
+						btrfs_tree_read_lock(b);
+						btrfs_clear_path_blocking(p, b,
+								  BTRFS_READ_LOCK);
+					}
+					p->locks[level] = BTRFS_READ_LOCK;
 				}
+				p->nodes[level] = b;
 			}
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 &&
 			    btrfs_leaf_free_space(root, b) < ins_len) {
+				if (write_lock_level < 1) {
+					write_lock_level = 1;
+					btrfs_release_path(p);
+					goto again;
+				}
+
 				btrfs_set_path_blocking(p);
 				err = split_leaf(trans, root, key,
 						 p, ins_len, ret == 0);
-				btrfs_clear_path_blocking(p, NULL);
+				btrfs_clear_path_blocking(p, NULL, 0);
 
 				BUG_ON(err > 0);
 				if (err) {
@@ -1976,7 +2112,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	add_root_to_dirty_list(root);
 	extent_buffer_get(c);
 	path->nodes[level] = c;
-	path->locks[level] = 1;
+	path->locks[level] = BTRFS_WRITE_LOCK;
 	path->slots[level] = 0;
 	return 0;
 }
@@ -3819,11 +3955,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 
 	WARN_ON(!path->keep_locks);
 again:
-	cur = btrfs_lock_root_node(root);
+	cur = btrfs_read_lock_root_node(root);
 	level = btrfs_header_level(cur);
 	WARN_ON(path->nodes[level]);
 	path->nodes[level] = cur;
-	path->locks[level] = 1;
+	path->locks[level] = BTRFS_READ_LOCK;
 
 	if (btrfs_header_generation(cur) < min_trans) {
 		ret = 1;
@@ -3913,12 +4049,12 @@ find_next_key:
 		cur = read_node_slot(root, cur, slot);
 		BUG_ON(!cur);
 
-		btrfs_tree_lock(cur);
+		btrfs_tree_read_lock(cur);
 
-		path->locks[level - 1] = 1;
+		path->locks[level - 1] = BTRFS_READ_LOCK;
 		path->nodes[level - 1] = cur;
 		unlock_up(path, level, 1);
-		btrfs_clear_path_blocking(path, NULL);
+		btrfs_clear_path_blocking(path, NULL, 0);
 	}
 out:
 	if (ret == 0)
@@ -4034,6 +4170,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	int ret;
 	int old_spinning = path->leave_spinning;
 	int force_blocking = 0;
+	int next_rw_lock = 0;
 
 	nritems = btrfs_header_nritems(path->nodes[0]);
 	if (nritems == 0)
@@ -4051,6 +4188,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 again:
 	level = 1;
 	next = NULL;
+	next_rw_lock = 0;
 	btrfs_release_path(path);
 
 	path->keep_locks = 1;
@@ -4096,11 +4234,12 @@ again:
 		}
 
 		if (next) {
-			btrfs_tree_unlock(next);
+			btrfs_tree_unlock_rw(next, next_rw_lock);
 			free_extent_buffer(next);
 		}
 
 		next = c;
+		next_rw_lock = path->locks[level];
 		ret = read_block_for_search(NULL, root, path, &next, level,
 					    slot, &key);
 		if (ret == -EAGAIN)
@@ -4112,15 +4251,22 @@ again:
 		}
 
 		if (!path->skip_locking) {
-			ret = btrfs_try_spin_lock(next);
+			ret = btrfs_try_tree_read_lock(next);
 			if (!ret) {
 				btrfs_set_path_blocking(path);
-				btrfs_tree_lock(next);
-				if (!force_blocking)
-					btrfs_clear_path_blocking(path, next);
+				btrfs_tree_read_lock(next);
+				if (!force_blocking) {
+					btrfs_clear_path_blocking(path, next,
+							  BTRFS_READ_LOCK);
+				}
+			}
+			if (force_blocking) {
+				btrfs_set_lock_blocking_rw(next,
+							   BTRFS_READ_LOCK);
+				next_rw_lock = BTRFS_READ_LOCK_BLOCKING;
+			} else {
+				next_rw_lock = BTRFS_READ_LOCK;
 			}
-			if (force_blocking)
-				btrfs_set_lock_blocking(next);
 		}
 		break;
 	}
@@ -4129,14 +4275,13 @@ again:
 		level--;
 		c = path->nodes[level];
 		if (path->locks[level])
-			btrfs_tree_unlock(c);
+			btrfs_tree_unlock_rw(c, path->locks[level]);
 
 		free_extent_buffer(c);
 		path->nodes[level] = next;
 		path->slots[level] = 0;
 		if (!path->skip_locking)
-			path->locks[level] = 1;
-
+			path->locks[level] = next_rw_lock;
 		if (!level)
 			break;
 
@@ -4151,16 +4296,21 @@ again:
 		}
 
 		if (!path->skip_locking) {
-			btrfs_assert_tree_locked(path->nodes[level]);
-			ret = btrfs_try_spin_lock(next);
+			ret = btrfs_try_tree_read_lock(next);
 			if (!ret) {
 				btrfs_set_path_blocking(path);
-				btrfs_tree_lock(next);
-				if (!force_blocking)
-					btrfs_clear_path_blocking(path, next);
+				btrfs_tree_read_lock(next);
+				if (!force_blocking)
+					btrfs_clear_path_blocking(path, next,
+							  BTRFS_READ_LOCK);
+			}
+			if (force_blocking) {
+				btrfs_set_lock_blocking_rw(next,
+							   BTRFS_READ_LOCK);
+				next_rw_lock = BTRFS_READ_LOCK_BLOCKING;
+			} else {
+				next_rw_lock = BTRFS_READ_LOCK;
 			}
-			if (force_blocking)
-				btrfs_set_lock_blocking(next);
 		}
 	}
 	ret = 0;
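
The btrfs_search_slot() changes above all follow one pattern: start optimistic with read locks, and whenever the walk discovers it needs write access at some level (a node that must be split or balanced, a slot 0 key update, a block that must be COWed), release the whole path and retry with a larger write_lock_level. A standalone sketch of that escalation loop follows; the types and helpers here are invented for illustration, only the shape mirrors the kernel code.

#include <stdio.h>

#define MAX_LEVEL 8

struct path {
	int locks[MAX_LEVEL];	/* 0 = unlocked, 1 = read, 2 = write */
};

static int root_level = 3;

static void release_path(struct path *p)
{
	for (int i = 0; i < MAX_LEVEL; i++)
		p->locks[i] = 0;
}

/*
 * Walk down, read locking by default and write locking every level at
 * or below write_lock_level.  'needs_write_at' stands in for the
 * conditions the real code checks (node nearly full, slot 0 key
 * update, COW required, ...).
 */
static void search(struct path *p, int needs_write_at)
{
	int write_lock_level = 0;

again:
	release_path(p);
	for (int level = root_level; level >= 0; level--) {
		p->locks[level] = (level <= write_lock_level) ? 2 : 1;

		if (level == needs_write_at && write_lock_level < level + 1) {
			/* fixing this level touches its parent too, so both
			 * must be write locked: escalate and start over */
			write_lock_level = level + 1;
			goto again;
		}
	}
}

int main(void)
{
	struct path p = { {0} };

	search(&p, 2);	/* pretend level 2 needs a split */
	for (int i = root_level; i >= 0; i--)
		printf("level %d: %s\n", i, p.locks[i] == 2 ? "write" : "read");
	return 0;
}
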
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3063f21d3fc6..40235e10cfb9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2333,7 +2333,7 @@ struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 void btrfs_set_path_blocking(struct btrfs_path *p);
 void btrfs_clear_path_blocking(struct btrfs_path *p,
-			       struct extent_buffer *held);
+			       struct extent_buffer *held, int held_rw);
 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 98c68e658a9b..b52c672f4c18 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -735,7 +735,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
 	}
 
 	/* reset all the locked nodes in the patch to spinning locks. */
-	btrfs_clear_path_blocking(path, NULL);
+	btrfs_clear_path_blocking(path, NULL, 0);
 
 	/* insert the keys of the items */
 	ret = setup_items_for_insert(trans, root, path, keys, data_size,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7021dde74d81..2a782c2fcb62 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5912,7 +5912,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 			return 1;
 
 		if (path->locks[level] && !wc->keep_locks) {
-			btrfs_tree_unlock(eb);
+			btrfs_tree_unlock_rw(eb, path->locks[level]);
 			path->locks[level] = 0;
 		}
 		return 0;
@@ -5936,7 +5936,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 		 * keep the tree lock
 		 */
 		if (path->locks[level] && level > 0) {
-			btrfs_tree_unlock(eb);
+			btrfs_tree_unlock_rw(eb, path->locks[level]);
 			path->locks[level] = 0;
 		}
 		return 0;
@@ -6049,7 +6049,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	BUG_ON(level != btrfs_header_level(next));
 	path->nodes[level] = next;
 	path->slots[level] = 0;
-	path->locks[level] = 1;
+	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 	wc->level = level;
 	if (wc->level == 1)
 		wc->reada_slot = 0;
@@ -6120,7 +6120,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 			BUG_ON(level == 0);
 			btrfs_tree_lock(eb);
 			btrfs_set_lock_blocking(eb);
-			path->locks[level] = 1;
+			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
 			ret = btrfs_lookup_extent_info(trans, root,
 						       eb->start, eb->len,
@@ -6129,8 +6129,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 			BUG_ON(wc->refs[level] == 0);
 			if (wc->refs[level] == 1) {
-				btrfs_tree_unlock(eb);
-				path->locks[level] = 0;
+				btrfs_tree_unlock_rw(eb, path->locks[level]);
 				return 1;
 			}
 		}
@@ -6152,7 +6151,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 		    btrfs_header_generation(eb) == trans->transid) {
 			btrfs_tree_lock(eb);
 			btrfs_set_lock_blocking(eb);
-			path->locks[level] = 1;
+			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 		}
 		clean_tree_block(trans, root, eb);
 	}
@@ -6231,7 +6230,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 			return 0;
 
 		if (path->locks[level]) {
-			btrfs_tree_unlock(path->nodes[level]);
+			btrfs_tree_unlock_rw(path->nodes[level],
+					     path->locks[level]);
 			path->locks[level] = 0;
 		}
 		free_extent_buffer(path->nodes[level]);
@@ -6283,7 +6283,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 		path->nodes[level] = btrfs_lock_root_node(root);
 		btrfs_set_lock_blocking(path->nodes[level]);
 		path->slots[level] = 0;
-		path->locks[level] = 1;
+		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 		memset(&wc->update_progress, 0,
 		       sizeof(wc->update_progress));
 	} else {
@@ -6451,7 +6451,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 	level = btrfs_header_level(node);
 	path->nodes[level] = node;
 	path->slots[level] = 0;
-	path->locks[level] = 1;
+	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
 	wc->refs[parent_level] = 1;
 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
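
All of the extent-tree.c hunks converge on one idiom: the walk code takes a write lock, immediately converts it to the blocking form because it may sleep while processing the block, and records BTRFS_WRITE_LOCK_BLOCKING in the path so that a later btrfs_tree_unlock_rw() drops the matching flavor. Condensed into one sketch below; the stub declarations only exist so the fragment stands alone and are not additional kernel code.

/* Condensed shape of the locking pattern in the hunks above.  The
 * btrfs_* calls are the kernel's; the stubs just let this compile
 * on its own. */
struct extent_buffer;

int  btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_set_lock_blocking(struct extent_buffer *eb);
void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw);

#define BTRFS_WRITE_LOCK_BLOCKING 3

static void walk_one_block(struct extent_buffer *eb, int *lock_state)
{
	btrfs_tree_lock(eb);		/* spinning write lock */
	btrfs_set_lock_blocking(eb);	/* may sleep from here on */
	*lock_state = BTRFS_WRITE_LOCK_BLOCKING;

	/* ... process the block; this work may schedule ... */

	btrfs_tree_unlock_rw(eb, *lock_state);	/* drops the right flavor */
	*lock_state = 0;
}
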
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 76ecbb8ed0e0..5392c3b12fc1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3017,8 +3017,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 		return NULL;
 	eb->start = start;
 	eb->len = len;
-	spin_lock_init(&eb->lock);
-	init_waitqueue_head(&eb->lock_wq);
+	rwlock_init(&eb->lock);
+	atomic_set(&eb->write_locks, 0);
+	atomic_set(&eb->read_locks, 0);
+	atomic_set(&eb->blocking_readers, 0);
+	atomic_set(&eb->blocking_writers, 0);
+	atomic_set(&eb->spinning_readers, 0);
+	atomic_set(&eb->spinning_writers, 0);
+	init_waitqueue_head(&eb->write_lock_wq);
+	init_waitqueue_head(&eb->read_lock_wq);
 
 #if LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b5f120cca916..21a7ca9e7282 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -128,14 +128,26 @@ struct extent_buffer {
 	struct rcu_head rcu_head;
 	atomic_t refs;
 
-	/* the spinlock is used to protect most operations */
-	spinlock_t lock;
+	/* count of read lock holders on the extent buffer */
+	atomic_t write_locks;
+	atomic_t read_locks;
+	atomic_t blocking_writers;
+	atomic_t blocking_readers;
+	atomic_t spinning_readers;
+	atomic_t spinning_writers;
+
+	/* protects write locks */
+	rwlock_t lock;
+
+	/* readers use lock_wq while they wait for the write
+	 * lock holders to unlock
+	 */
+	wait_queue_head_t write_lock_wq;
 
-	/*
-	 * when we keep the lock held while blocking, waiters go onto
-	 * the wq
+	/* writers use read_lock_wq while they wait for readers
+	 * to unlock
 	 */
-	wait_queue_head_t lock_wq;
+	wait_queue_head_t read_lock_wq;
 };
 
 static inline void extent_set_compress_type(unsigned long *bio_flags,
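
One way to read the six new counters: at most one task holds the write lock, any number hold read locks, and every holder is either "spinning" (still inside eb->lock) or "blocking" (has dropped eb->lock but still owns the tree lock). These invariants are inferred from the WARN_ON/BUG_ON calls in locking.c below rather than stated anywhere in the patch, so treat the standalone checker below as a reading aid, not a specification.

#include <assert.h>

struct eb_lock_counts {
	int write_locks;	/* write holders: 0 or 1 */
	int read_locks;		/* total read holders */
	int blocking_writers;	/* write holder who dropped the rwlock */
	int blocking_readers;	/* read holders who dropped the rwlock */
	int spinning_writers;	/* write holder still inside the rwlock */
	int spinning_readers;	/* read holders still inside the rwlock */
};

static void check_invariants(const struct eb_lock_counts *c)
{
	/* at most one writer, spinning or blocking, never both */
	assert(c->write_locks <= 1);
	assert(c->spinning_writers + c->blocking_writers == c->write_locks);

	/* every reader is accounted for in exactly one of the two modes */
	assert(c->spinning_readers + c->blocking_readers == c->read_locks);
}

int main(void)
{
	struct eb_lock_counts c = { .write_locks = 1, .spinning_writers = 1 };

	check_invariants(&c);
	return 0;
}
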
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 66fa43dc3f0f..d77b67c4b275 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -24,185 +24,197 @@
 #include "extent_io.h"
 #include "locking.h"
 
-static inline void spin_nested(struct extent_buffer *eb)
-{
-	spin_lock(&eb->lock);
-}
+void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
 
 /*
- * Setting a lock to blocking will drop the spinlock and set the
- * flag that forces other procs who want the lock to wait.  After
- * this you can safely schedule with the lock held.
+ * if we currently have a spinning reader or writer lock
+ * (indicated by the rw flag) this will bump the count
+ * of blocking holders and drop the spinlock.
  */
-void btrfs_set_lock_blocking(struct extent_buffer *eb)
+void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
-	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
-		set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
-		spin_unlock(&eb->lock);
+	if (rw == BTRFS_WRITE_LOCK) {
+		if (atomic_read(&eb->blocking_writers) == 0) {
+			WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+			atomic_dec(&eb->spinning_writers);
+			btrfs_assert_tree_locked(eb);
+			atomic_inc(&eb->blocking_writers);
+			write_unlock(&eb->lock);
+		}
+	} else if (rw == BTRFS_READ_LOCK) {
+		btrfs_assert_tree_read_locked(eb);
+		atomic_inc(&eb->blocking_readers);
+		WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+		atomic_dec(&eb->spinning_readers);
+		read_unlock(&eb->lock);
 	}
-	/* exit with the spin lock released and the bit set */
+	return;
 }
 
 /*
- * clearing the blocking flag will take the spinlock again.
- * After this you can't safely schedule
+ * if we currently have a blocking lock, take the spinlock
+ * and drop our blocking count
  */
-void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
-	if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
-		spin_nested(eb);
-		clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
-		smp_mb__after_clear_bit();
+	if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
+		BUG_ON(atomic_read(&eb->blocking_writers) != 1);
+		write_lock(&eb->lock);
+		WARN_ON(atomic_read(&eb->spinning_writers));
+		atomic_inc(&eb->spinning_writers);
+		if (atomic_dec_and_test(&eb->blocking_writers))
+			wake_up(&eb->write_lock_wq);
+	} else if (rw == BTRFS_READ_LOCK_BLOCKING) {
+		BUG_ON(atomic_read(&eb->blocking_readers) == 0);
+		read_lock(&eb->lock);
+		atomic_inc(&eb->spinning_readers);
+		if (atomic_dec_and_test(&eb->blocking_readers))
+			wake_up(&eb->read_lock_wq);
 	}
-	/* exit with the spin lock held */
+	return;
 }
 
 /*
- * unfortunately, many of the places that currently set a lock to blocking
- * don't end up blocking for very long, and often they don't block
- * at all.  For a dbench 50 run, if we don't spin on the blocking bit
- * at all, the context switch rate can jump up to 400,000/sec or more.
- *
- * So, we're still stuck with this crummy spin on the blocking bit,
- * at least until the most common causes of the short blocks
- * can be dealt with.
+ * take a spinning read lock.  This will wait for any blocking
+ * writers
  */
-static int btrfs_spin_on_block(struct extent_buffer *eb)
+void btrfs_tree_read_lock(struct extent_buffer *eb)
 {
-	int i;
-
-	for (i = 0; i < 512; i++) {
-		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-			return 1;
-		if (need_resched())
-			break;
-		cpu_relax();
+again:
+	wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
+	read_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_writers)) {
+		read_unlock(&eb->lock);
+		wait_event(eb->write_lock_wq,
+			   atomic_read(&eb->blocking_writers) == 0);
+		goto again;
 	}
-	return 0;
+	atomic_inc(&eb->read_locks);
+	atomic_inc(&eb->spinning_readers);
 }
 
 /*
- * This is somewhat different from trylock.  It will take the
- * spinlock but if it finds the lock is set to blocking, it will
- * return without the lock held.
- *
- * returns 1 if it was able to take the lock and zero otherwise
- *
- * After this call, scheduling is not safe without first calling
- * btrfs_set_lock_blocking()
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers
  */
-int btrfs_try_spin_lock(struct extent_buffer *eb)
+int btrfs_try_tree_read_lock(struct extent_buffer *eb)
 {
-	int i;
+	if (atomic_read(&eb->blocking_writers))
+		return 0;
 
-	if (btrfs_spin_on_block(eb)) {
-		spin_nested(eb);
-		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-			return 1;
-		spin_unlock(&eb->lock);
+	read_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_writers)) {
+		read_unlock(&eb->lock);
+		return 0;
 	}
-	/* spin for a bit on the BLOCKING flag */
-	for (i = 0; i < 2; i++) {
-		cpu_relax();
-		if (!btrfs_spin_on_block(eb))
-			break;
-
-		spin_nested(eb);
-		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-			return 1;
-		spin_unlock(&eb->lock);
-	}
-	return 0;
+	atomic_inc(&eb->read_locks);
+	atomic_inc(&eb->spinning_readers);
+	return 1;
 }
 
 /*
- * the autoremove wake function will return 0 if it tried to wake up
- * a process that was already awake, which means that process won't
- * count as an exclusive wakeup.  The waitq code will continue waking
- * procs until it finds one that was actually sleeping.
- *
- * For btrfs, this isn't quite what we want.  We want a single proc
- * to be notified that the lock is ready for taking.  If that proc
- * already happen to be awake, great, it will loop around and try for
- * the lock.
- *
- * So, btrfs_wake_function always returns 1, even when the proc that we
- * tried to wake up was already awake.
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers or readers
  */
-static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
-			       int sync, void *key)
+int btrfs_try_tree_write_lock(struct extent_buffer *eb)
 {
-	autoremove_wake_function(wait, mode, sync, key);
+	if (atomic_read(&eb->blocking_writers) ||
+	    atomic_read(&eb->blocking_readers))
+		return 0;
+	write_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_writers) ||
+	    atomic_read(&eb->blocking_readers)) {
+		write_unlock(&eb->lock);
+		return 0;
+	}
+	atomic_inc(&eb->write_locks);
+	atomic_inc(&eb->spinning_writers);
 	return 1;
 }
 
 /*
- * returns with the extent buffer spinlocked.
- *
- * This will spin and/or wait as required to take the lock, and then
- * return with the spinlock held.
- *
- * After this call, scheduling is not safe without first calling
- * btrfs_set_lock_blocking()
+ * drop a spinning read lock
+ */
+void btrfs_tree_read_unlock(struct extent_buffer *eb)
+{
+	btrfs_assert_tree_read_locked(eb);
+	WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+	atomic_dec(&eb->spinning_readers);
+	atomic_dec(&eb->read_locks);
+	read_unlock(&eb->lock);
+}
+
+/*
+ * drop a blocking read lock
+ */
+void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
+{
+	btrfs_assert_tree_read_locked(eb);
+	WARN_ON(atomic_read(&eb->blocking_readers) == 0);
+	if (atomic_dec_and_test(&eb->blocking_readers))
+		wake_up(&eb->read_lock_wq);
+	atomic_dec(&eb->read_locks);
+}
+
+/*
+ * take a spinning write lock.  This will wait for both
+ * blocking readers or writers
  */
 int btrfs_tree_lock(struct extent_buffer *eb)
 {
-	DEFINE_WAIT(wait);
-	wait.func = btrfs_wake_function;
-
-	if (!btrfs_spin_on_block(eb))
-		goto sleep;
-
-	while(1) {
-		spin_nested(eb);
-
-		/* nobody is blocking, exit with the spinlock held */
-		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-			return 0;
-
-		/*
-		 * we have the spinlock, but the real owner is blocking.
-		 * wait for them
-		 */
-		spin_unlock(&eb->lock);
-
-		/*
-		 * spin for a bit, and if the blocking flag goes away,
-		 * loop around
-		 */
-		cpu_relax();
-		if (btrfs_spin_on_block(eb))
-			continue;
-sleep:
-		prepare_to_wait_exclusive(&eb->lock_wq, &wait,
-					  TASK_UNINTERRUPTIBLE);
-
-		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-			schedule();
-
-		finish_wait(&eb->lock_wq, &wait);
+again:
+	wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
+	wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
+	write_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_readers)) {
+		write_unlock(&eb->lock);
+		wait_event(eb->read_lock_wq,
+			   atomic_read(&eb->blocking_readers) == 0);
+		goto again;
 	}
+	if (atomic_read(&eb->blocking_writers)) {
+		write_unlock(&eb->lock);
+		wait_event(eb->write_lock_wq,
+			   atomic_read(&eb->blocking_writers) == 0);
+		goto again;
+	}
+	WARN_ON(atomic_read(&eb->spinning_writers));
+	atomic_inc(&eb->spinning_writers);
+	atomic_inc(&eb->write_locks);
 	return 0;
 }
 
+/*
+ * drop a spinning or a blocking write lock.
+ */
 int btrfs_tree_unlock(struct extent_buffer *eb)
 {
-	/*
-	 * if we were a blocking owner, we don't have the spinlock held
-	 * just clear the bit and look for waiters
-	 */
-	if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-		smp_mb__after_clear_bit();
-	else
-		spin_unlock(&eb->lock);
-
-	if (waitqueue_active(&eb->lock_wq))
-		wake_up(&eb->lock_wq);
+	int blockers = atomic_read(&eb->blocking_writers);
+
+	BUG_ON(blockers > 1);
+
+	btrfs_assert_tree_locked(eb);
+	atomic_dec(&eb->write_locks);
+
+	if (blockers) {
+		WARN_ON(atomic_read(&eb->spinning_writers));
+		atomic_dec(&eb->blocking_writers);
+		smp_wmb();
+		wake_up(&eb->write_lock_wq);
+	} else {
+		WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+		atomic_dec(&eb->spinning_writers);
+		write_unlock(&eb->lock);
+	}
 	return 0;
 }
 
 void btrfs_assert_tree_locked(struct extent_buffer *eb)
 {
-	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-		assert_spin_locked(&eb->lock);
+	BUG_ON(!atomic_read(&eb->write_locks));
+}
+
+void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+{
+	BUG_ON(!atomic_read(&eb->read_locks));
 }
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 5c33a560a2f1..17247ddb81a0 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -19,11 +19,43 @@
 #ifndef __BTRFS_LOCKING_
 #define __BTRFS_LOCKING_
 
+#define BTRFS_WRITE_LOCK 1
+#define BTRFS_READ_LOCK 2
+#define BTRFS_WRITE_LOCK_BLOCKING 3
+#define BTRFS_READ_LOCK_BLOCKING 4
+
 int btrfs_tree_lock(struct extent_buffer *eb);
 int btrfs_tree_unlock(struct extent_buffer *eb);
 int btrfs_try_spin_lock(struct extent_buffer *eb);
 
-void btrfs_set_lock_blocking(struct extent_buffer *eb);
-void btrfs_clear_lock_blocking(struct extent_buffer *eb);
+void btrfs_tree_read_lock(struct extent_buffer *eb);
+void btrfs_tree_read_unlock(struct extent_buffer *eb);
+void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
+void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
+void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
 void btrfs_assert_tree_locked(struct extent_buffer *eb);
+int btrfs_try_tree_read_lock(struct extent_buffer *eb);
+int btrfs_try_tree_write_lock(struct extent_buffer *eb);
+
+static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
+{
+	if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
+		btrfs_tree_unlock(eb);
+	else if (rw == BTRFS_READ_LOCK_BLOCKING)
+		btrfs_tree_read_unlock_blocking(eb);
+	else if (rw == BTRFS_READ_LOCK)
+		btrfs_tree_read_unlock(eb);
+	else
+		BUG();
+}
+
+static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
+{
+	btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
+}
+
+static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+{
+	btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
+}
 #endif
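
Since a path can now hold any of the four lock flavors at a given level, generic teardown code (btrfs_release_path() and unlock_up() in ctree.c above) just passes the recorded state to btrfs_tree_unlock_rw(). A standalone model of that dispatch, printing the kernel call each state would resolve to:

#include <stdio.h>
#include <stdlib.h>

#define BTRFS_WRITE_LOCK          1
#define BTRFS_READ_LOCK           2
#define BTRFS_WRITE_LOCK_BLOCKING 3
#define BTRFS_READ_LOCK_BLOCKING  4

static void unlock_rw(int rw)
{
	if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
		puts("btrfs_tree_unlock()");
	else if (rw == BTRFS_READ_LOCK_BLOCKING)
		puts("btrfs_tree_read_unlock_blocking()");
	else if (rw == BTRFS_READ_LOCK)
		puts("btrfs_tree_read_unlock()");
	else
		abort();	/* BUG() in the kernel version */
}

int main(void)
{
	int locks[] = { BTRFS_READ_LOCK, BTRFS_WRITE_LOCK_BLOCKING };

	for (int i = 0; i < 2; i++)
		unlock_rw(locks[i]);
	return 0;
}
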
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4ce8a9f41d1e..ac278dd83175 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1730,8 +1730,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 				btrfs_read_buffer(next, ptr_gen);
 
 				btrfs_tree_lock(next);
-				clean_tree_block(trans, root, next);
 				btrfs_set_lock_blocking(next);
+				clean_tree_block(trans, root, next);
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
@@ -1796,8 +1796,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 			next = path->nodes[*level];
 
 			btrfs_tree_lock(next);
-			clean_tree_block(trans, root, next);
 			btrfs_set_lock_blocking(next);
+			clean_tree_block(trans, root, next);
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 
@@ -1864,8 +1864,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 		next = path->nodes[orig_level];
 
 		btrfs_tree_lock(next);
-		clean_tree_block(trans, log, next);
 		btrfs_set_lock_blocking(next);
+		clean_tree_block(trans, log, next);
 		btrfs_wait_tree_block_writeback(next);
 		btrfs_tree_unlock(next);
 
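
The three tree-log.c hunks only reorder two calls, but the order now matters: under the new scheme the spinning state holds eb->lock, a real rwlock where sleeping is forbidden, while clean_tree_block() can reach code that may schedule. Moving btrfs_set_lock_blocking() first keeps any sleep out of the rwlock-held region. That rationale is inferred from the locking change, not stated by the commit; the stub declarations below just make the ordering sketch stand alone.

struct btrfs_trans_handle;
struct btrfs_root;
struct extent_buffer;

int  btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_set_lock_blocking(struct extent_buffer *eb);
int  clean_tree_block(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root, struct extent_buffer *eb);

static void safe_order(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *next)
{
	btrfs_tree_lock(next);		/* rwlock held: no sleeping yet */
	btrfs_set_lock_blocking(next);	/* rwlock dropped, lock still owned */
	clean_tree_block(trans, root, next);	/* now free to schedule */
}
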