Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/ctree.c          | 268
-rw-r--r--  fs/btrfs/ctree.h          |   2
-rw-r--r--  fs/btrfs/delayed-inode.c  |   2
-rw-r--r--  fs/btrfs/extent-tree.c    |  20
-rw-r--r--  fs/btrfs/extent_io.c      |  11
-rw-r--r--  fs/btrfs/extent_io.h      |  24
-rw-r--r--  fs/btrfs/locking.c        | 280
-rw-r--r--  fs/btrfs/locking.h        |  36
-rw-r--r--  fs/btrfs/tree-log.c       |   6
9 files changed, 431 insertions(+), 218 deletions(-)
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d2431284b913..0ad48e782d37 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,8 +54,13 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
 {
 	int i;
 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
-		if (p->nodes[i] && p->locks[i])
-			btrfs_set_lock_blocking(p->nodes[i]);
+		if (!p->nodes[i] || !p->locks[i])
+			continue;
+		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
+		if (p->locks[i] == BTRFS_READ_LOCK)
+			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
+		else if (p->locks[i] == BTRFS_WRITE_LOCK)
+			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
 	}
 }
 
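Note: the loop above now records the blocking variant of each lock state back into p->locks[i]. A minimal sketch of the state mapping, using the constants this patch adds to fs/btrfs/locking.h (the helper names here are illustrative, not part of the patch):

	#define BTRFS_WRITE_LOCK          1
	#define BTRFS_READ_LOCK           2
	#define BTRFS_WRITE_LOCK_BLOCKING 3
	#define BTRFS_READ_LOCK_BLOCKING  4

	/* illustrative helpers only: map a spinning lock state to its
	 * blocking counterpart and back, mirroring what
	 * btrfs_set_path_blocking() and btrfs_clear_path_blocking()
	 * do inline in the hunks above and below */
	static int lock_state_to_blocking(int rw)
	{
		if (rw == BTRFS_READ_LOCK)
			return BTRFS_READ_LOCK_BLOCKING;
		if (rw == BTRFS_WRITE_LOCK)
			return BTRFS_WRITE_LOCK_BLOCKING;
		return rw;	/* already blocking, or not locked (0) */
	}

	static int lock_state_to_spinning(int rw)
	{
		if (rw == BTRFS_READ_LOCK_BLOCKING)
			return BTRFS_READ_LOCK;
		if (rw == BTRFS_WRITE_LOCK_BLOCKING)
			return BTRFS_WRITE_LOCK;
		return rw;
	}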
@@ -68,7 +73,7 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
  * for held
  */
 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
-					struct extent_buffer *held)
+					struct extent_buffer *held, int held_rw)
 {
 	int i;
 
@@ -79,19 +84,29 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 	 * really sure by forcing the path to blocking before we clear
 	 * the path blocking.
 	 */
-	if (held)
-		btrfs_set_lock_blocking(held);
+	if (held) {
+		btrfs_set_lock_blocking_rw(held, held_rw);
+		if (held_rw == BTRFS_WRITE_LOCK)
+			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
+		else if (held_rw == BTRFS_READ_LOCK)
+			held_rw = BTRFS_READ_LOCK_BLOCKING;
+	}
 	btrfs_set_path_blocking(p);
 #endif
 
 	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
-		if (p->nodes[i] && p->locks[i])
-			btrfs_clear_lock_blocking(p->nodes[i]);
+		if (p->nodes[i] && p->locks[i]) {
+			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
+			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
+				p->locks[i] = BTRFS_WRITE_LOCK;
+			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
+				p->locks[i] = BTRFS_READ_LOCK;
+		}
 	}
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	if (held)
-		btrfs_clear_lock_blocking(held);
+		btrfs_clear_lock_blocking_rw(held, held_rw);
 #endif
 }
 
@@ -119,7 +134,7 @@ noinline void btrfs_release_path(struct btrfs_path *p)
 		if (!p->nodes[i])
 			continue;
 		if (p->locks[i]) {
-			btrfs_tree_unlock(p->nodes[i]);
+			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
 			p->locks[i] = 0;
 		}
 		free_extent_buffer(p->nodes[i]);
@@ -167,6 +182,25 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 	return eb;
 }
 
+/* loop around taking references on and locking the root node of the
+ * tree until you end up with a lock on the root.  A locked buffer
+ * is returned, with a reference held.
+ */
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+	struct extent_buffer *eb;
+
+	while (1) {
+		eb = btrfs_root_node(root);
+		btrfs_tree_read_lock(eb);
+		if (eb == root->node)
+			break;
+		btrfs_tree_read_unlock(eb);
+		free_extent_buffer(eb);
+	}
+	return eb;
+}
+
 /* cowonly root (everything not a reference counted cow subvolume), just get
  * put onto a simple dirty list.  transaction.c walks this to make sure they
  * get properly updated on disk.
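The new helper mirrors the existing btrfs_lock_root_node(): the root can be COWed while we wait for the lock, so the loop rechecks root->node after each acquisition and retries until the locked buffer really is the current root. A hypothetical caller would pair it with a read unlock and a reference drop, e.g.:

	/* hypothetical example, not part of the patch */
	static u64 root_node_generation(struct btrfs_root *root)
	{
		struct extent_buffer *eb;
		u64 gen;

		eb = btrfs_read_lock_root_node(root);	/* locked + referenced */
		gen = btrfs_header_generation(eb);
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
		return gen;
	}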
@@ -862,7 +896,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 	mid = path->nodes[level];
 
-	WARN_ON(!path->locks[level]);
+	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
+		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
@@ -1360,7 +1395,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 
 		t = path->nodes[i];
 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
-			btrfs_tree_unlock(t);
+			btrfs_tree_unlock_rw(t, path->locks[i]);
 			path->locks[i] = 0;
 		}
 	}
@@ -1387,7 +1422,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 			continue;
 		if (!path->locks[i])
 			continue;
-		btrfs_tree_unlock(path->nodes[i]);
+		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
 		path->locks[i] = 0;
 	}
 }
@@ -1436,6 +1471,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 		 * we can trust our generation number
 		 */
 		free_extent_buffer(tmp);
+		btrfs_set_path_blocking(p);
+
 		tmp = read_tree_block(root, blocknr, blocksize, gen);
 		if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
 			*eb_ret = tmp;
@@ -1491,20 +1528,27 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 static int
 setup_nodes_for_search(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct btrfs_path *p,
-		       struct extent_buffer *b, int level, int ins_len)
+		       struct extent_buffer *b, int level, int ins_len,
+		       int *write_lock_level)
 {
 	int ret;
 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
 	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
 		int sret;
 
+		if (*write_lock_level < level + 1) {
+			*write_lock_level = level + 1;
+			btrfs_release_path(p);
+			goto again;
+		}
+
 		sret = reada_for_balance(root, p, level);
 		if (sret)
 			goto again;
 
 		btrfs_set_path_blocking(p);
 		sret = split_node(trans, root, p, level);
-		btrfs_clear_path_blocking(p, NULL);
+		btrfs_clear_path_blocking(p, NULL, 0);
 
 		BUG_ON(sret > 0);
 		if (sret) {
@@ -1516,13 +1560,19 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
 		int sret;
 
+		if (*write_lock_level < level + 1) {
+			*write_lock_level = level + 1;
+			btrfs_release_path(p);
+			goto again;
+		}
+
 		sret = reada_for_balance(root, p, level);
 		if (sret)
 			goto again;
 
 		btrfs_set_path_blocking(p);
 		sret = balance_level(trans, root, p, level);
-		btrfs_clear_path_blocking(p, NULL);
+		btrfs_clear_path_blocking(p, NULL, 0);
 
 		if (sret) {
 			ret = sret;
@@ -1566,27 +1616,78 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	int err;
 	int level;
 	int lowest_unlock = 1;
+	int root_lock;
+	/* everything at write_lock_level or lower must be write locked */
+	int write_lock_level = 0;
 	u8 lowest_level = 0;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len > 0);
 	WARN_ON(p->nodes[0] != NULL);
 
-	if (ins_len < 0)
+	if (ins_len < 0) {
 		lowest_unlock = 2;
 
+		/* when we are removing items, we might have to go up to level
+		 * two as we update tree pointers  Make sure we keep write
+		 * for those levels as well
+		 */
+		write_lock_level = 2;
+	} else if (ins_len > 0) {
+		/*
+		 * for inserting items, make sure we have a write lock on
+		 * level 1 so we can update keys
+		 */
+		write_lock_level = 1;
+	}
+
+	if (!cow)
+		write_lock_level = -1;
+
+	if (cow && (p->keep_locks || p->lowest_level))
+		write_lock_level = BTRFS_MAX_LEVEL;
+
 again:
+	/*
+	 * we try very hard to do read locks on the root
+	 */
+	root_lock = BTRFS_READ_LOCK;
+	level = 0;
 	if (p->search_commit_root) {
+		/*
+		 * the commit roots are read only
+		 * so we always do read locks
+		 */
 		b = root->commit_root;
 		extent_buffer_get(b);
+		level = btrfs_header_level(b);
 		if (!p->skip_locking)
-			btrfs_tree_lock(b);
+			btrfs_tree_read_lock(b);
 	} else {
-		if (p->skip_locking)
-			b = btrfs_root_node(root);
-		else
-			b = btrfs_lock_root_node(root);
+		if (p->skip_locking) {
+			b = btrfs_root_node(root);
+			level = btrfs_header_level(b);
+		} else {
+			/* we don't know the level of the root node
+			 * until we actually have it read locked
+			 */
+			b = btrfs_read_lock_root_node(root);
+			level = btrfs_header_level(b);
+			if (level <= write_lock_level) {
+				/* whoops, must trade for write lock */
+				btrfs_tree_read_unlock(b);
+				free_extent_buffer(b);
+				b = btrfs_lock_root_node(root);
+				root_lock = BTRFS_WRITE_LOCK;
+
+				/* the level might have changed, check again */
+				level = btrfs_header_level(b);
+			}
+		}
 	}
+	p->nodes[level] = b;
+	if (!p->skip_locking)
+		p->locks[level] = root_lock;
 
 	while (b) {
 		level = btrfs_header_level(b);
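The hunk above sets up an optimistic strategy: descend with cheap read locks, and whenever a level at or below write_lock_level turns out to need modification, raise the threshold and restart the whole search. A standalone model of that retry loop (illustrative userspace C, not btrfs code):

	#include <stdbool.h>
	#include <stdio.h>

	#define MAX_LEVEL 8

	/* pretend the tree tells us which levels must be modified */
	static bool level_needs_modification(int level)
	{
		return level == 2;	/* e.g. a node that must be split */
	}

	static void search(void)
	{
		int write_lock_level = 0;	/* only leaves write locked so far */

	again:
		for (int level = MAX_LEVEL - 1; level >= 0; level--) {
			bool write_locked = level <= write_lock_level;

			if (level_needs_modification(level) && !write_locked) {
				/* changing this node also updates the key in
				 * its parent, so write locks must reach
				 * level + 1; drop everything and redo the
				 * descent from the root */
				write_lock_level = level + 1;
				printf("restart, write locks up to level %d\n",
				       write_lock_level);
				goto again;
			}
		}
		printf("search finished\n");
	}

	int main(void)
	{
		search();
		return 0;
	}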
@@ -1595,10 +1696,6 @@ again:
 		 * setup the path here so we can release it under lock
 		 * contention with the cow code
 		 */
-		p->nodes[level] = b;
-		if (!p->skip_locking)
-			p->locks[level] = 1;
-
 		if (cow) {
 			/*
 			 * if we don't really need to cow this block
@@ -1610,6 +1707,16 @@ again:
 
 			btrfs_set_path_blocking(p);
 
+			/*
+			 * must have write locks on this node and the
+			 * parent
+			 */
+			if (level + 1 > write_lock_level) {
+				write_lock_level = level + 1;
+				btrfs_release_path(p);
+				goto again;
+			}
+
 			err = btrfs_cow_block(trans, root, b,
 					      p->nodes[level + 1],
 					      p->slots[level + 1], &b);
@@ -1622,10 +1729,7 @@ cow_done:
 		BUG_ON(!cow && ins_len);
 
 		p->nodes[level] = b;
-		if (!p->skip_locking)
-			p->locks[level] = 1;
-
-		btrfs_clear_path_blocking(p, NULL);
+		btrfs_clear_path_blocking(p, NULL, 0);
 
 		/*
 		 * we have a lock on b and as long as we aren't changing
@@ -1651,7 +1755,7 @@ cow_done:
 		}
 		p->slots[level] = slot;
 		err = setup_nodes_for_search(trans, root, p, b, level,
-					     ins_len);
+					     ins_len, &write_lock_level);
 		if (err == -EAGAIN)
 			goto again;
 		if (err) {
@@ -1661,6 +1765,19 @@ cow_done:
 		b = p->nodes[level];
 		slot = p->slots[level];
 
+		/*
+		 * slot 0 is special, if we change the key
+		 * we have to update the parent pointer
+		 * which means we must have a write lock
+		 * on the parent
+		 */
+		if (slot == 0 && cow &&
+		    write_lock_level < level + 1) {
+			write_lock_level = level + 1;
+			btrfs_release_path(p);
+			goto again;
+		}
+
 		unlock_up(p, level, lowest_unlock);
 
 		if (level == lowest_level) {
@@ -1679,23 +1796,42 @@ cow_done:
 			}
 
 			if (!p->skip_locking) {
-				btrfs_clear_path_blocking(p, NULL);
-				err = btrfs_try_spin_lock(b);
-
-				if (!err) {
-					btrfs_set_path_blocking(p);
-					btrfs_tree_lock(b);
-					btrfs_clear_path_blocking(p, b);
+				level = btrfs_header_level(b);
+				if (level <= write_lock_level) {
+					err = btrfs_try_tree_write_lock(b);
+					if (!err) {
+						btrfs_set_path_blocking(p);
+						btrfs_tree_lock(b);
+						btrfs_clear_path_blocking(p, b,
+								  BTRFS_WRITE_LOCK);
+					}
+					p->locks[level] = BTRFS_WRITE_LOCK;
+				} else {
+					err = btrfs_try_tree_read_lock(b);
+					if (!err) {
+						btrfs_set_path_blocking(p);
+						btrfs_tree_read_lock(b);
+						btrfs_clear_path_blocking(p, b,
+								  BTRFS_READ_LOCK);
+					}
+					p->locks[level] = BTRFS_READ_LOCK;
 				}
+				p->nodes[level] = b;
 			}
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 &&
 			    btrfs_leaf_free_space(root, b) < ins_len) {
+				if (write_lock_level < 1) {
+					write_lock_level = 1;
+					btrfs_release_path(p);
+					goto again;
+				}
+
 				btrfs_set_path_blocking(p);
 				err = split_leaf(trans, root, key,
 						 p, ins_len, ret == 0);
-				btrfs_clear_path_blocking(p, NULL);
+				btrfs_clear_path_blocking(p, NULL, 0);
 
 				BUG_ON(err > 0);
 				if (err) {
@@ -1976,7 +2112,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	add_root_to_dirty_list(root);
 	extent_buffer_get(c);
 	path->nodes[level] = c;
-	path->locks[level] = 1;
+	path->locks[level] = BTRFS_WRITE_LOCK;
 	path->slots[level] = 0;
 	return 0;
 }
@@ -3819,11 +3955,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 
 	WARN_ON(!path->keep_locks);
 again:
-	cur = btrfs_lock_root_node(root);
+	cur = btrfs_read_lock_root_node(root);
 	level = btrfs_header_level(cur);
 	WARN_ON(path->nodes[level]);
 	path->nodes[level] = cur;
-	path->locks[level] = 1;
+	path->locks[level] = BTRFS_READ_LOCK;
 
 	if (btrfs_header_generation(cur) < min_trans) {
 		ret = 1;
@@ -3913,12 +4049,12 @@ find_next_key:
 		cur = read_node_slot(root, cur, slot);
 		BUG_ON(!cur);
 
-		btrfs_tree_lock(cur);
+		btrfs_tree_read_lock(cur);
 
-		path->locks[level - 1] = 1;
+		path->locks[level - 1] = BTRFS_READ_LOCK;
 		path->nodes[level - 1] = cur;
 		unlock_up(path, level, 1);
-		btrfs_clear_path_blocking(path, NULL);
+		btrfs_clear_path_blocking(path, NULL, 0);
 	}
 out:
 	if (ret == 0)
@@ -4034,6 +4170,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	int ret;
 	int old_spinning = path->leave_spinning;
 	int force_blocking = 0;
+	int next_rw_lock = 0;
 
 	nritems = btrfs_header_nritems(path->nodes[0]);
 	if (nritems == 0)
@@ -4051,6 +4188,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 again:
 	level = 1;
 	next = NULL;
+	next_rw_lock = 0;
 	btrfs_release_path(path);
 
 	path->keep_locks = 1;
@@ -4096,11 +4234,12 @@ again:
 	}
 
 	if (next) {
-		btrfs_tree_unlock(next);
+		btrfs_tree_unlock_rw(next, next_rw_lock);
 		free_extent_buffer(next);
 	}
 
 	next = c;
+	next_rw_lock = path->locks[level];
 	ret = read_block_for_search(NULL, root, path, &next, level,
 				    slot, &key);
 	if (ret == -EAGAIN)
@@ -4112,15 +4251,22 @@ again:
 		}
 
 		if (!path->skip_locking) {
-			ret = btrfs_try_spin_lock(next);
+			ret = btrfs_try_tree_read_lock(next);
 			if (!ret) {
 				btrfs_set_path_blocking(path);
-				btrfs_tree_lock(next);
-				if (!force_blocking)
-					btrfs_clear_path_blocking(path, next);
+				btrfs_tree_read_lock(next);
+				if (!force_blocking) {
+					btrfs_clear_path_blocking(path, next,
+								  BTRFS_READ_LOCK);
+				}
+			}
+			if (force_blocking) {
+				btrfs_set_lock_blocking_rw(next,
+							   BTRFS_READ_LOCK);
+				next_rw_lock = BTRFS_READ_LOCK_BLOCKING;
+			} else {
+				next_rw_lock = BTRFS_READ_LOCK;
 			}
-			if (force_blocking)
-				btrfs_set_lock_blocking(next);
 		}
 		break;
 	}
@@ -4129,14 +4275,13 @@ again:
 		level--;
 		c = path->nodes[level];
 		if (path->locks[level])
-			btrfs_tree_unlock(c);
+			btrfs_tree_unlock_rw(c, path->locks[level]);
 
 		free_extent_buffer(c);
 		path->nodes[level] = next;
 		path->slots[level] = 0;
 		if (!path->skip_locking)
-			path->locks[level] = 1;
-
+			path->locks[level] = next_rw_lock;
 		if (!level)
 			break;
 
@@ -4151,16 +4296,21 @@ again:
 		}
 
 		if (!path->skip_locking) {
-			btrfs_assert_tree_locked(path->nodes[level]);
-			ret = btrfs_try_spin_lock(next);
+			ret = btrfs_try_tree_read_lock(next);
 			if (!ret) {
 				btrfs_set_path_blocking(path);
-				btrfs_tree_lock(next);
+				btrfs_tree_read_lock(next);
 				if (!force_blocking)
-					btrfs_clear_path_blocking(path, next);
+					btrfs_clear_path_blocking(path, next,
+							  BTRFS_READ_LOCK);
+			}
+			if (force_blocking) {
+				btrfs_set_lock_blocking_rw(next,
+						   BTRFS_READ_LOCK);
+				next_rw_lock = BTRFS_READ_LOCK_BLOCKING;
+			} else {
+				next_rw_lock = BTRFS_READ_LOCK;
 			}
-			if (force_blocking)
-				btrfs_set_lock_blocking(next);
 		}
 	}
 	ret = 0;
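btrfs_next_leaf() now tracks which flavor it took in next_rw_lock so the later unlock matches. The try-then-block pattern it uses reduces to roughly this shape (a sketch condensed from the two hunks above; the force_blocking case is omitted):

	if (!btrfs_try_tree_read_lock(next)) {
		/* contended: allow sleeping, then take the lock for real */
		btrfs_set_path_blocking(path);
		btrfs_tree_read_lock(next);
		btrfs_clear_path_blocking(path, next, BTRFS_READ_LOCK);
	}
	next_rw_lock = BTRFS_READ_LOCK;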
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3063f21d3fc6..40235e10cfb9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2333,7 +2333,7 @@ struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 void btrfs_set_path_blocking(struct btrfs_path *p);
 void btrfs_clear_path_blocking(struct btrfs_path *p,
-			       struct extent_buffer *held);
+			       struct extent_buffer *held, int held_rw);
 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 98c68e658a9b..b52c672f4c18 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -735,7 +735,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
 	}
 
 	/* reset all the locked nodes in the patch to spinning locks. */
-	btrfs_clear_path_blocking(path, NULL);
+	btrfs_clear_path_blocking(path, NULL, 0);
 
 	/* insert the keys of the items */
 	ret = setup_items_for_insert(trans, root, path, keys, data_size,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7021dde74d81..2a782c2fcb62 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5912,7 +5912,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 			return 1;
 
 		if (path->locks[level] && !wc->keep_locks) {
-			btrfs_tree_unlock(eb);
+			btrfs_tree_unlock_rw(eb, path->locks[level]);
 			path->locks[level] = 0;
 		}
 		return 0;
@@ -5936,7 +5936,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 		 * keep the tree lock
 		 */
 		if (path->locks[level] && level > 0) {
-			btrfs_tree_unlock(eb);
+			btrfs_tree_unlock_rw(eb, path->locks[level]);
 			path->locks[level] = 0;
 		}
 		return 0;
@@ -6049,7 +6049,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	BUG_ON(level != btrfs_header_level(next));
 	path->nodes[level] = next;
 	path->slots[level] = 0;
-	path->locks[level] = 1;
+	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 	wc->level = level;
 	if (wc->level == 1)
 		wc->reada_slot = 0;
@@ -6120,7 +6120,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 		BUG_ON(level == 0);
 		btrfs_tree_lock(eb);
 		btrfs_set_lock_blocking(eb);
-		path->locks[level] = 1;
+		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
 		ret = btrfs_lookup_extent_info(trans, root,
 					       eb->start, eb->len,
@@ -6129,8 +6129,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 		BUG_ON(ret);
 		BUG_ON(wc->refs[level] == 0);
 		if (wc->refs[level] == 1) {
-			btrfs_tree_unlock(eb);
-			path->locks[level] = 0;
+			btrfs_tree_unlock_rw(eb, path->locks[level]);
 			return 1;
 		}
 	}
@@ -6152,7 +6151,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 	    btrfs_header_generation(eb) == trans->transid) {
 		btrfs_tree_lock(eb);
 		btrfs_set_lock_blocking(eb);
-		path->locks[level] = 1;
+		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 	}
 	clean_tree_block(trans, root, eb);
 }
@@ -6231,7 +6230,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 				return 0;
 
 			if (path->locks[level]) {
-				btrfs_tree_unlock(path->nodes[level]);
+				btrfs_tree_unlock_rw(path->nodes[level],
+						     path->locks[level]);
 				path->locks[level] = 0;
 			}
 			free_extent_buffer(path->nodes[level]);
@@ -6283,7 +6283,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 		path->nodes[level] = btrfs_lock_root_node(root);
 		btrfs_set_lock_blocking(path->nodes[level]);
 		path->slots[level] = 0;
-		path->locks[level] = 1;
+		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 		memset(&wc->update_progress, 0,
 		       sizeof(wc->update_progress));
 	} else {
@@ -6451,7 +6451,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 	level = btrfs_header_level(node);
 	path->nodes[level] = node;
 	path->slots[level] = 0;
-	path->locks[level] = 1;
+	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
 	wc->refs[parent_level] = 1;
 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
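These extent-tree.c hunks all follow one idiom: when a walker takes a write lock and immediately converts it to blocking, the path now records BTRFS_WRITE_LOCK_BLOCKING instead of the old boolean 1, so a later btrfs_tree_unlock_rw() releases the right flavor. Schematically (a sketch of the idiom, not a new function):

	btrfs_tree_lock(eb);				/* spinning write lock */
	btrfs_set_lock_blocking(eb);			/* may now schedule() */
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;	/* record exact state */
	/* ... later, release whatever state was recorded ... */
	btrfs_tree_unlock_rw(path->nodes[level], path->locks[level]);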
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 76ecbb8ed0e0..5392c3b12fc1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3017,8 +3017,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 		return NULL;
 	eb->start = start;
 	eb->len = len;
-	spin_lock_init(&eb->lock);
-	init_waitqueue_head(&eb->lock_wq);
+	rwlock_init(&eb->lock);
+	atomic_set(&eb->write_locks, 0);
+	atomic_set(&eb->read_locks, 0);
+	atomic_set(&eb->blocking_readers, 0);
+	atomic_set(&eb->blocking_writers, 0);
+	atomic_set(&eb->spinning_readers, 0);
+	atomic_set(&eb->spinning_writers, 0);
+	init_waitqueue_head(&eb->write_lock_wq);
+	init_waitqueue_head(&eb->read_lock_wq);
 
 #if LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b5f120cca916..21a7ca9e7282 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -128,14 +128,26 @@ struct extent_buffer {
 	struct rcu_head rcu_head;
 	atomic_t refs;
 
-	/* the spinlock is used to protect most operations */
-	spinlock_t lock;
+	/* count of read lock holders on the extent buffer */
+	atomic_t write_locks;
+	atomic_t read_locks;
+	atomic_t blocking_writers;
+	atomic_t blocking_readers;
+	atomic_t spinning_readers;
+	atomic_t spinning_writers;
+
+	/* protects write locks */
+	rwlock_t lock;
+
+	/* readers use lock_wq while they wait for the write
+	 * lock holders to unlock
+	 */
+	wait_queue_head_t write_lock_wq;
 
-	/*
-	 * when we keep the lock held while blocking, waiters go onto
-	 * the wq
-	 */
-	wait_queue_head_t lock_wq;
+	/* writers use read_lock_wq while they wait for readers
+	 * to unlock
+	 */
+	wait_queue_head_t read_lock_wq;
 };
 
 static inline void extent_set_compress_type(unsigned long *bio_flags,
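One spinlock and one wait queue become a rwlock, six counters, and two wait queues. An annotated restatement of the new fields (the per-field comments are this reader's interpretation of locking.c below, not text from the patch):

	struct extent_buffer_lock_state {	/* illustrative grouping only */
		atomic_t write_locks;		/* write lock holders (0 or 1) */
		atomic_t read_locks;		/* read lock holders */
		atomic_t blocking_writers;	/* writers that went blocking and
						   dropped eb->lock */
		atomic_t blocking_readers;	/* readers that went blocking */
		atomic_t spinning_readers;	/* readers still holding eb->lock */
		atomic_t spinning_writers;	/* writers still holding eb->lock */
	};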
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 66fa43dc3f0f..d77b67c4b275 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -24,185 +24,197 @@
 #include "extent_io.h"
 #include "locking.h"
 
-static inline void spin_nested(struct extent_buffer *eb)
-{
-	spin_lock(&eb->lock);
-}
+void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
 
 /*
- * Setting a lock to blocking will drop the spinlock and set the
- * flag that forces other procs who want the lock to wait.  After
- * this you can safely schedule with the lock held.
+ * if we currently have a spinning reader or writer lock
+ * (indicated by the rw flag) this will bump the count
+ * of blocking holders and drop the spinlock.
  */
-void btrfs_set_lock_blocking(struct extent_buffer *eb)
+void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
-	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
-		set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
-		spin_unlock(&eb->lock);
+	if (rw == BTRFS_WRITE_LOCK) {
+		if (atomic_read(&eb->blocking_writers) == 0) {
+			WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+			atomic_dec(&eb->spinning_writers);
+			btrfs_assert_tree_locked(eb);
+			atomic_inc(&eb->blocking_writers);
+			write_unlock(&eb->lock);
+		}
+	} else if (rw == BTRFS_READ_LOCK) {
+		btrfs_assert_tree_read_locked(eb);
+		atomic_inc(&eb->blocking_readers);
+		WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+		atomic_dec(&eb->spinning_readers);
+		read_unlock(&eb->lock);
 	}
-	/* exit with the spin lock released and the bit set */
+	return;
 }
 
 /*
- * clearing the blocking flag will take the spinlock again.
- * After this you can't safely schedule
+ * if we currently have a blocking lock, take the spinlock
+ * and drop our blocking count
  */
-void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
-	if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
-		spin_nested(eb);
-		clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
-		smp_mb__after_clear_bit();
+	if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
+		BUG_ON(atomic_read(&eb->blocking_writers) != 1);
+		write_lock(&eb->lock);
+		WARN_ON(atomic_read(&eb->spinning_writers));
+		atomic_inc(&eb->spinning_writers);
+		if (atomic_dec_and_test(&eb->blocking_writers))
+			wake_up(&eb->write_lock_wq);
+	} else if (rw == BTRFS_READ_LOCK_BLOCKING) {
+		BUG_ON(atomic_read(&eb->blocking_readers) == 0);
+		read_lock(&eb->lock);
+		atomic_inc(&eb->spinning_readers);
+		if (atomic_dec_and_test(&eb->blocking_readers))
+			wake_up(&eb->read_lock_wq);
 	}
-	/* exit with the spin lock held */
+	return;
 }
 
 /*
- * unfortunately, many of the places that currently set a lock to blocking
- * don't end up blocking for very long, and often they don't block
- * at all.  For a dbench 50 run, if we don't spin on the blocking bit
- * at all, the context switch rate can jump up to 400,000/sec or more.
- *
- * So, we're still stuck with this crummy spin on the blocking bit,
- * at least until the most common causes of the short blocks
- * can be dealt with.
+ * take a spinning read lock.  This will wait for any blocking
+ * writers
  */
-static int btrfs_spin_on_block(struct extent_buffer *eb)
+void btrfs_tree_read_lock(struct extent_buffer *eb)
 {
-	int i;
-
-	for (i = 0; i < 512; i++) {
-		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-			return 1;
-		if (need_resched())
-			break;
-		cpu_relax();
+again:
+	wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
+	read_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_writers)) {
+		read_unlock(&eb->lock);
+		wait_event(eb->write_lock_wq,
+			   atomic_read(&eb->blocking_writers) == 0);
+		goto again;
 	}
-	return 0;
+	atomic_inc(&eb->read_locks);
+	atomic_inc(&eb->spinning_readers);
 }
 
 /*
- * This is somewhat different from trylock.  It will take the
- * spinlock but if it finds the lock is set to blocking, it will
- * return without the lock held.
- *
- * returns 1 if it was able to take the lock and zero otherwise
- *
- * After this call, scheduling is not safe without first calling
- * btrfs_set_lock_blocking()
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers
  */
-int btrfs_try_spin_lock(struct extent_buffer *eb)
+int btrfs_try_tree_read_lock(struct extent_buffer *eb)
 {
-	int i;
+	if (atomic_read(&eb->blocking_writers))
+		return 0;
 
-	if (btrfs_spin_on_block(eb)) {
-		spin_nested(eb);
-		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-			return 1;
-		spin_unlock(&eb->lock);
+	read_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_writers)) {
+		read_unlock(&eb->lock);
+		return 0;
 	}
-	/* spin for a bit on the BLOCKING flag */
-	for (i = 0; i < 2; i++) {
-		cpu_relax();
-		if (!btrfs_spin_on_block(eb))
-			break;
-
-		spin_nested(eb);
-		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-			return 1;
-		spin_unlock(&eb->lock);
-	}
-	return 0;
+	atomic_inc(&eb->read_locks);
+	atomic_inc(&eb->spinning_readers);
+	return 1;
 }
 
 /*
- * the autoremove wake function will return 0 if it tried to wake up
- * a process that was already awake, which means that process won't
- * count as an exclusive wakeup.  The waitq code will continue waking
- * procs until it finds one that was actually sleeping.
- *
- * For btrfs, this isn't quite what we want.  We want a single proc
- * to be notified that the lock is ready for taking.  If that proc
- * already happen to be awake, great, it will loop around and try for
- * the lock.
- *
- * So, btrfs_wake_function always returns 1, even when the proc that we
- * tried to wake up was already awake.
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers or readers
  */
-static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
-			       int sync, void *key)
+int btrfs_try_tree_write_lock(struct extent_buffer *eb)
 {
-	autoremove_wake_function(wait, mode, sync, key);
+	if (atomic_read(&eb->blocking_writers) ||
+	    atomic_read(&eb->blocking_readers))
+		return 0;
+	write_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_writers) ||
+	    atomic_read(&eb->blocking_readers)) {
+		write_unlock(&eb->lock);
+		return 0;
+	}
+	atomic_inc(&eb->write_locks);
+	atomic_inc(&eb->spinning_writers);
 	return 1;
 }
 
 /*
- * returns with the extent buffer spinlocked.
- *
- * This will spin and/or wait as required to take the lock, and then
- * return with the spinlock held.
- *
- * After this call, scheduling is not safe without first calling
- * btrfs_set_lock_blocking()
+ * drop a spinning read lock
+ */
+void btrfs_tree_read_unlock(struct extent_buffer *eb)
+{
+	btrfs_assert_tree_read_locked(eb);
+	WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+	atomic_dec(&eb->spinning_readers);
+	atomic_dec(&eb->read_locks);
+	read_unlock(&eb->lock);
+}
+
+/*
+ * drop a blocking read lock
+ */
+void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
+{
+	btrfs_assert_tree_read_locked(eb);
+	WARN_ON(atomic_read(&eb->blocking_readers) == 0);
+	if (atomic_dec_and_test(&eb->blocking_readers))
+		wake_up(&eb->read_lock_wq);
+	atomic_dec(&eb->read_locks);
+}
+
+/*
+ * take a spinning write lock.  This will wait for both
+ * blocking readers or writers
  */
 int btrfs_tree_lock(struct extent_buffer *eb)
 {
-	DEFINE_WAIT(wait);
-	wait.func = btrfs_wake_function;
-
-	if (!btrfs_spin_on_block(eb))
-		goto sleep;
-
-	while(1) {
-		spin_nested(eb);
-
-		/* nobody is blocking, exit with the spinlock held */
-		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-			return 0;
-
-		/*
-		 * we have the spinlock, but the real owner is blocking.
-		 * wait for them
-		 */
-		spin_unlock(&eb->lock);
-
-		/*
-		 * spin for a bit, and if the blocking flag goes away,
-		 * loop around
-		 */
-		cpu_relax();
-		if (btrfs_spin_on_block(eb))
-			continue;
-sleep:
-		prepare_to_wait_exclusive(&eb->lock_wq, &wait,
-					  TASK_UNINTERRUPTIBLE);
-
-		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-			schedule();
-
-		finish_wait(&eb->lock_wq, &wait);
+again:
+	wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
+	wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
+	write_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_readers)) {
+		write_unlock(&eb->lock);
+		wait_event(eb->read_lock_wq,
+			   atomic_read(&eb->blocking_readers) == 0);
+		goto again;
 	}
+	if (atomic_read(&eb->blocking_writers)) {
+		write_unlock(&eb->lock);
+		wait_event(eb->write_lock_wq,
+			   atomic_read(&eb->blocking_writers) == 0);
+		goto again;
+	}
+	WARN_ON(atomic_read(&eb->spinning_writers));
+	atomic_inc(&eb->spinning_writers);
+	atomic_inc(&eb->write_locks);
 	return 0;
 }
 
+/*
+ * drop a spinning or a blocking write lock.
+ */
 int btrfs_tree_unlock(struct extent_buffer *eb)
 {
-	/*
-	 * if we were a blocking owner, we don't have the spinlock held
-	 * just clear the bit and look for waiters
-	 */
-	if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-		smp_mb__after_clear_bit();
-	else
-		spin_unlock(&eb->lock);
-
-	if (waitqueue_active(&eb->lock_wq))
-		wake_up(&eb->lock_wq);
+	int blockers = atomic_read(&eb->blocking_writers);
+
+	BUG_ON(blockers > 1);
+
+	btrfs_assert_tree_locked(eb);
+	atomic_dec(&eb->write_locks);
+
+	if (blockers) {
+		WARN_ON(atomic_read(&eb->spinning_writers));
+		atomic_dec(&eb->blocking_writers);
+		smp_wmb();
+		wake_up(&eb->write_lock_wq);
+	} else {
+		WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+		atomic_dec(&eb->spinning_writers);
+		write_unlock(&eb->lock);
+	}
 	return 0;
 }
 
 void btrfs_assert_tree_locked(struct extent_buffer *eb)
 {
-	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-		assert_spin_locked(&eb->lock);
+	BUG_ON(!atomic_read(&eb->write_locks));
+}
+
+void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+{
+	BUG_ON(!atomic_read(&eb->read_locks));
 }
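Taken together, the new API gives each lock two modes: a spinning mode (eb->lock held, no sleeping allowed) and a blocking mode (only the counters held, eb->lock dropped, sleeping allowed). A minimal sketch of the intended write-side lifecycle, using only functions from this patch (the surrounding function is hypothetical):

	static void modify_block(struct extent_buffer *eb)
	{
		btrfs_tree_lock(eb);		/* spinning write lock */

		/* about to do something that can sleep: go blocking, which
		 * drops eb->lock but keeps writers excluded via the counters */
		btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);

		/* ... work that may schedule() while the block stays locked ... */

		/* back to a spinning lock for short critical sections */
		btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);

		btrfs_tree_unlock(eb);		/* handles either flavor */
	}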
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 5c33a560a2f1..17247ddb81a0 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -19,11 +19,43 @@
 #ifndef __BTRFS_LOCKING_
 #define __BTRFS_LOCKING_
 
+#define BTRFS_WRITE_LOCK 1
+#define BTRFS_READ_LOCK 2
+#define BTRFS_WRITE_LOCK_BLOCKING 3
+#define BTRFS_READ_LOCK_BLOCKING 4
+
 int btrfs_tree_lock(struct extent_buffer *eb);
 int btrfs_tree_unlock(struct extent_buffer *eb);
 int btrfs_try_spin_lock(struct extent_buffer *eb);
 
-void btrfs_set_lock_blocking(struct extent_buffer *eb);
-void btrfs_clear_lock_blocking(struct extent_buffer *eb);
+void btrfs_tree_read_lock(struct extent_buffer *eb);
+void btrfs_tree_read_unlock(struct extent_buffer *eb);
+void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
+void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
+void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
 void btrfs_assert_tree_locked(struct extent_buffer *eb);
+int btrfs_try_tree_read_lock(struct extent_buffer *eb);
+int btrfs_try_tree_write_lock(struct extent_buffer *eb);
+
+static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
+{
+	if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
+		btrfs_tree_unlock(eb);
+	else if (rw == BTRFS_READ_LOCK_BLOCKING)
+		btrfs_tree_read_unlock_blocking(eb);
+	else if (rw == BTRFS_READ_LOCK)
+		btrfs_tree_read_unlock(eb);
+	else
+		BUG();
+}
+
+static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
+{
+	btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
+}
+
+static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+{
+	btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
+}
 #endif
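btrfs_tree_unlock_rw() exists because a path now holds different lock flavors at different levels, and each release must match what was taken. A sketch of the release loop, mirroring the btrfs_release_path() hunk in ctree.c above (not a new function from the patch):

	static void release_path_locks(struct btrfs_path *p)
	{
		int i;

		for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
			if (!p->nodes[i])
				continue;
			if (p->locks[i]) {
				/* release exactly the flavor we recorded */
				btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
				p->locks[i] = 0;
			}
			free_extent_buffer(p->nodes[i]);
			p->nodes[i] = NULL;
		}
	}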
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4ce8a9f41d1e..ac278dd83175 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1730,8 +1730,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 			btrfs_read_buffer(next, ptr_gen);
 
 			btrfs_tree_lock(next);
-			clean_tree_block(trans, root, next);
 			btrfs_set_lock_blocking(next);
+			clean_tree_block(trans, root, next);
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 
@@ -1796,8 +1796,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				next = path->nodes[*level];
 
 				btrfs_tree_lock(next);
-				clean_tree_block(trans, root, next);
 				btrfs_set_lock_blocking(next);
+				clean_tree_block(trans, root, next);
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
@@ -1864,8 +1864,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 		next = path->nodes[orig_level];
 
 		btrfs_tree_lock(next);
-		clean_tree_block(trans, log, next);
 		btrfs_set_lock_blocking(next);
+		clean_tree_block(trans, log, next);
 		btrfs_wait_tree_block_writeback(next);
 		btrfs_tree_unlock(next);
 
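The three tree-log hunks swap two calls so the lock is converted to blocking before clean_tree_block() runs, presumably because clean_tree_block() can end up sleeping, which is only safe once the spinning lock has been traded for the blocking state. The resulting ordering (annotations are the editor's reading, not text from the patch):

	btrfs_tree_lock(next);			/* spinning: must not sleep */
	btrfs_set_lock_blocking(next);		/* sleeping is now allowed */
	clean_tree_block(trans, root, next);	/* may block */
	btrfs_wait_tree_block_writeback(next);
	btrfs_tree_unlock(next);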