diff options
author | Chris Mason <chris.mason@oracle.com> | 2011-07-16 15:23:14 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@oracle.com> | 2011-07-27 12:46:46 -0400 |
commit | bd681513fa6f2ff29aa391f01e413a2d1c59fd77 (patch) | |
tree | bb10ec6ef876b4d7a553cbe54976ec49a0d10b21 | |
parent | 81317fdeddcef259b6ecf7b5c0d04caa167c6b54 (diff) |
Btrfs: switch the btrfs tree locks to reader/writer
The btrfs metadata btree is the source of significant
lock contention, especially in the root node. This
commit changes our locking to use a reader/writer
lock.
The lock is built on top of rw spinlocks, and it
extends the lock tracking to remember if we have a
read lock or a write lock when we go to blocking. Atomics
count the number of blocking readers or writers at any
given time.
It removes all of the adaptive spinning from the old code
and uses only the spinning/blocking hints inside of btrfs
to decide when it should continue spinning.
In read heavy workloads this is dramatically faster. In write
heavy workloads we're still faster because of less contention
on the root node lock.
We suffer slightly in dbench because we schedule more often
during write locks, but all other benchmarks so far are improved.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r-- | fs/btrfs/ctree.c | 268 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 2 | ||||
-rw-r--r-- | fs/btrfs/delayed-inode.c | 2 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 20 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 11 | ||||
-rw-r--r-- | fs/btrfs/extent_io.h | 24 | ||||
-rw-r--r-- | fs/btrfs/locking.c | 280 | ||||
-rw-r--r-- | fs/btrfs/locking.h | 36 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 6 |
9 files changed, 431 insertions, 218 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d2431284b913..0ad48e782d37 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c | |||
@@ -54,8 +54,13 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) | |||
54 | { | 54 | { |
55 | int i; | 55 | int i; |
56 | for (i = 0; i < BTRFS_MAX_LEVEL; i++) { | 56 | for (i = 0; i < BTRFS_MAX_LEVEL; i++) { |
57 | if (p->nodes[i] && p->locks[i]) | 57 | if (!p->nodes[i] || !p->locks[i]) |
58 | btrfs_set_lock_blocking(p->nodes[i]); | 58 | continue; |
59 | btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]); | ||
60 | if (p->locks[i] == BTRFS_READ_LOCK) | ||
61 | p->locks[i] = BTRFS_READ_LOCK_BLOCKING; | ||
62 | else if (p->locks[i] == BTRFS_WRITE_LOCK) | ||
63 | p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; | ||
59 | } | 64 | } |
60 | } | 65 | } |
61 | 66 | ||
@@ -68,7 +73,7 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) | |||
68 | * for held | 73 | * for held |
69 | */ | 74 | */ |
70 | noinline void btrfs_clear_path_blocking(struct btrfs_path *p, | 75 | noinline void btrfs_clear_path_blocking(struct btrfs_path *p, |
71 | struct extent_buffer *held) | 76 | struct extent_buffer *held, int held_rw) |
72 | { | 77 | { |
73 | int i; | 78 | int i; |
74 | 79 | ||
@@ -79,19 +84,29 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, | |||
79 | * really sure by forcing the path to blocking before we clear | 84 | * really sure by forcing the path to blocking before we clear |
80 | * the path blocking. | 85 | * the path blocking. |
81 | */ | 86 | */ |
82 | if (held) | 87 | if (held) { |
83 | btrfs_set_lock_blocking(held); | 88 | btrfs_set_lock_blocking_rw(held, held_rw); |
89 | if (held_rw == BTRFS_WRITE_LOCK) | ||
90 | held_rw = BTRFS_WRITE_LOCK_BLOCKING; | ||
91 | else if (held_rw == BTRFS_READ_LOCK) | ||
92 | held_rw = BTRFS_READ_LOCK_BLOCKING; | ||
93 | } | ||
84 | btrfs_set_path_blocking(p); | 94 | btrfs_set_path_blocking(p); |
85 | #endif | 95 | #endif |
86 | 96 | ||
87 | for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { | 97 | for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { |
88 | if (p->nodes[i] && p->locks[i]) | 98 | if (p->nodes[i] && p->locks[i]) { |
89 | btrfs_clear_lock_blocking(p->nodes[i]); | 99 | btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]); |
100 | if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING) | ||
101 | p->locks[i] = BTRFS_WRITE_LOCK; | ||
102 | else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING) | ||
103 | p->locks[i] = BTRFS_READ_LOCK; | ||
104 | } | ||
90 | } | 105 | } |
91 | 106 | ||
92 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 107 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
93 | if (held) | 108 | if (held) |
94 | btrfs_clear_lock_blocking(held); | 109 | btrfs_clear_lock_blocking_rw(held, held_rw); |
95 | #endif | 110 | #endif |
96 | } | 111 | } |
97 | 112 | ||
@@ -119,7 +134,7 @@ noinline void btrfs_release_path(struct btrfs_path *p) | |||
119 | if (!p->nodes[i]) | 134 | if (!p->nodes[i]) |
120 | continue; | 135 | continue; |
121 | if (p->locks[i]) { | 136 | if (p->locks[i]) { |
122 | btrfs_tree_unlock(p->nodes[i]); | 137 | btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]); |
123 | p->locks[i] = 0; | 138 | p->locks[i] = 0; |
124 | } | 139 | } |
125 | free_extent_buffer(p->nodes[i]); | 140 | free_extent_buffer(p->nodes[i]); |
@@ -167,6 +182,25 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) | |||
167 | return eb; | 182 | return eb; |
168 | } | 183 | } |
169 | 184 | ||
185 | /* loop around taking references on and locking the root node of the | ||
186 | * tree until you end up with a lock on the root. A locked buffer | ||
187 | * is returned, with a reference held. | ||
188 | */ | ||
189 | struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) | ||
190 | { | ||
191 | struct extent_buffer *eb; | ||
192 | |||
193 | while (1) { | ||
194 | eb = btrfs_root_node(root); | ||
195 | btrfs_tree_read_lock(eb); | ||
196 | if (eb == root->node) | ||
197 | break; | ||
198 | btrfs_tree_read_unlock(eb); | ||
199 | free_extent_buffer(eb); | ||
200 | } | ||
201 | return eb; | ||
202 | } | ||
203 | |||
170 | /* cowonly root (everything not a reference counted cow subvolume), just get | 204 | /* cowonly root (everything not a reference counted cow subvolume), just get |
171 | * put onto a simple dirty list. transaction.c walks this to make sure they | 205 | * put onto a simple dirty list. transaction.c walks this to make sure they |
172 | * get properly updated on disk. | 206 | * get properly updated on disk. |
@@ -862,7 +896,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
862 | 896 | ||
863 | mid = path->nodes[level]; | 897 | mid = path->nodes[level]; |
864 | 898 | ||
865 | WARN_ON(!path->locks[level]); | 899 | WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK && |
900 | path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING); | ||
866 | WARN_ON(btrfs_header_generation(mid) != trans->transid); | 901 | WARN_ON(btrfs_header_generation(mid) != trans->transid); |
867 | 902 | ||
868 | orig_ptr = btrfs_node_blockptr(mid, orig_slot); | 903 | orig_ptr = btrfs_node_blockptr(mid, orig_slot); |
@@ -1360,7 +1395,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level, | |||
1360 | 1395 | ||
1361 | t = path->nodes[i]; | 1396 | t = path->nodes[i]; |
1362 | if (i >= lowest_unlock && i > skip_level && path->locks[i]) { | 1397 | if (i >= lowest_unlock && i > skip_level && path->locks[i]) { |
1363 | btrfs_tree_unlock(t); | 1398 | btrfs_tree_unlock_rw(t, path->locks[i]); |
1364 | path->locks[i] = 0; | 1399 | path->locks[i] = 0; |
1365 | } | 1400 | } |
1366 | } | 1401 | } |
@@ -1387,7 +1422,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) | |||
1387 | continue; | 1422 | continue; |
1388 | if (!path->locks[i]) | 1423 | if (!path->locks[i]) |
1389 | continue; | 1424 | continue; |
1390 | btrfs_tree_unlock(path->nodes[i]); | 1425 | btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); |
1391 | path->locks[i] = 0; | 1426 | path->locks[i] = 0; |
1392 | } | 1427 | } |
1393 | } | 1428 | } |
@@ -1436,6 +1471,8 @@ read_block_for_search(struct btrfs_trans_handle *trans, | |||
1436 | * we can trust our generation number | 1471 | * we can trust our generation number |
1437 | */ | 1472 | */ |
1438 | free_extent_buffer(tmp); | 1473 | free_extent_buffer(tmp); |
1474 | btrfs_set_path_blocking(p); | ||
1475 | |||
1439 | tmp = read_tree_block(root, blocknr, blocksize, gen); | 1476 | tmp = read_tree_block(root, blocknr, blocksize, gen); |
1440 | if (tmp && btrfs_buffer_uptodate(tmp, gen)) { | 1477 | if (tmp && btrfs_buffer_uptodate(tmp, gen)) { |
1441 | *eb_ret = tmp; | 1478 | *eb_ret = tmp; |
@@ -1491,20 +1528,27 @@ read_block_for_search(struct btrfs_trans_handle *trans, | |||
1491 | static int | 1528 | static int |
1492 | setup_nodes_for_search(struct btrfs_trans_handle *trans, | 1529 | setup_nodes_for_search(struct btrfs_trans_handle *trans, |
1493 | struct btrfs_root *root, struct btrfs_path *p, | 1530 | struct btrfs_root *root, struct btrfs_path *p, |
1494 | struct extent_buffer *b, int level, int ins_len) | 1531 | struct extent_buffer *b, int level, int ins_len, |
1532 | int *write_lock_level) | ||
1495 | { | 1533 | { |
1496 | int ret; | 1534 | int ret; |
1497 | if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= | 1535 | if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= |
1498 | BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { | 1536 | BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { |
1499 | int sret; | 1537 | int sret; |
1500 | 1538 | ||
1539 | if (*write_lock_level < level + 1) { | ||
1540 | *write_lock_level = level + 1; | ||
1541 | btrfs_release_path(p); | ||
1542 | goto again; | ||
1543 | } | ||
1544 | |||
1501 | sret = reada_for_balance(root, p, level); | 1545 | sret = reada_for_balance(root, p, level); |
1502 | if (sret) | 1546 | if (sret) |
1503 | goto again; | 1547 | goto again; |
1504 | 1548 | ||
1505 | btrfs_set_path_blocking(p); | 1549 | btrfs_set_path_blocking(p); |
1506 | sret = split_node(trans, root, p, level); | 1550 | sret = split_node(trans, root, p, level); |
1507 | btrfs_clear_path_blocking(p, NULL); | 1551 | btrfs_clear_path_blocking(p, NULL, 0); |
1508 | 1552 | ||
1509 | BUG_ON(sret > 0); | 1553 | BUG_ON(sret > 0); |
1510 | if (sret) { | 1554 | if (sret) { |
@@ -1516,13 +1560,19 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, | |||
1516 | BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { | 1560 | BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { |
1517 | int sret; | 1561 | int sret; |
1518 | 1562 | ||
1563 | if (*write_lock_level < level + 1) { | ||
1564 | *write_lock_level = level + 1; | ||
1565 | btrfs_release_path(p); | ||
1566 | goto again; | ||
1567 | } | ||
1568 | |||
1519 | sret = reada_for_balance(root, p, level); | 1569 | sret = reada_for_balance(root, p, level); |
1520 | if (sret) | 1570 | if (sret) |
1521 | goto again; | 1571 | goto again; |
1522 | 1572 | ||
1523 | btrfs_set_path_blocking(p); | 1573 | btrfs_set_path_blocking(p); |
1524 | sret = balance_level(trans, root, p, level); | 1574 | sret = balance_level(trans, root, p, level); |
1525 | btrfs_clear_path_blocking(p, NULL); | 1575 | btrfs_clear_path_blocking(p, NULL, 0); |
1526 | 1576 | ||
1527 | if (sret) { | 1577 | if (sret) { |
1528 | ret = sret; | 1578 | ret = sret; |
@@ -1566,27 +1616,78 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
1566 | int err; | 1616 | int err; |
1567 | int level; | 1617 | int level; |
1568 | int lowest_unlock = 1; | 1618 | int lowest_unlock = 1; |
1619 | int root_lock; | ||
1620 | /* everything at write_lock_level or lower must be write locked */ | ||
1621 | int write_lock_level = 0; | ||
1569 | u8 lowest_level = 0; | 1622 | u8 lowest_level = 0; |
1570 | 1623 | ||
1571 | lowest_level = p->lowest_level; | 1624 | lowest_level = p->lowest_level; |
1572 | WARN_ON(lowest_level && ins_len > 0); | 1625 | WARN_ON(lowest_level && ins_len > 0); |
1573 | WARN_ON(p->nodes[0] != NULL); | 1626 | WARN_ON(p->nodes[0] != NULL); |
1574 | 1627 | ||
1575 | if (ins_len < 0) | 1628 | if (ins_len < 0) { |
1576 | lowest_unlock = 2; | 1629 | lowest_unlock = 2; |
1577 | 1630 | ||
1631 | /* when we are removing items, we might have to go up to level | ||
1632 | * two as we update tree pointers Make sure we keep write | ||
1633 | * for those levels as well | ||
1634 | */ | ||
1635 | write_lock_level = 2; | ||
1636 | } else if (ins_len > 0) { | ||
1637 | /* | ||
1638 | * for inserting items, make sure we have a write lock on | ||
1639 | * level 1 so we can update keys | ||
1640 | */ | ||
1641 | write_lock_level = 1; | ||
1642 | } | ||
1643 | |||
1644 | if (!cow) | ||
1645 | write_lock_level = -1; | ||
1646 | |||
1647 | if (cow && (p->keep_locks || p->lowest_level)) | ||
1648 | write_lock_level = BTRFS_MAX_LEVEL; | ||
1649 | |||
1578 | again: | 1650 | again: |
1651 | /* | ||
1652 | * we try very hard to do read locks on the root | ||
1653 | */ | ||
1654 | root_lock = BTRFS_READ_LOCK; | ||
1655 | level = 0; | ||
1579 | if (p->search_commit_root) { | 1656 | if (p->search_commit_root) { |
1657 | /* | ||
1658 | * the commit roots are read only | ||
1659 | * so we always do read locks | ||
1660 | */ | ||
1580 | b = root->commit_root; | 1661 | b = root->commit_root; |
1581 | extent_buffer_get(b); | 1662 | extent_buffer_get(b); |
1663 | level = btrfs_header_level(b); | ||
1582 | if (!p->skip_locking) | 1664 | if (!p->skip_locking) |
1583 | btrfs_tree_lock(b); | 1665 | btrfs_tree_read_lock(b); |
1584 | } else { | 1666 | } else { |
1585 | if (p->skip_locking) | 1667 | if (p->skip_locking) { |
1586 | b = btrfs_root_node(root); | 1668 | b = btrfs_root_node(root); |
1587 | else | 1669 | level = btrfs_header_level(b); |
1588 | b = btrfs_lock_root_node(root); | 1670 | } else { |
1671 | /* we don't know the level of the root node | ||
1672 | * until we actually have it read locked | ||
1673 | */ | ||
1674 | b = btrfs_read_lock_root_node(root); | ||
1675 | level = btrfs_header_level(b); | ||
1676 | if (level <= write_lock_level) { | ||
1677 | /* whoops, must trade for write lock */ | ||
1678 | btrfs_tree_read_unlock(b); | ||
1679 | free_extent_buffer(b); | ||
1680 | b = btrfs_lock_root_node(root); | ||
1681 | root_lock = BTRFS_WRITE_LOCK; | ||
1682 | |||
1683 | /* the level might have changed, check again */ | ||
1684 | level = btrfs_header_level(b); | ||
1685 | } | ||
1686 | } | ||
1589 | } | 1687 | } |
1688 | p->nodes[level] = b; | ||
1689 | if (!p->skip_locking) | ||
1690 | p->locks[level] = root_lock; | ||
1590 | 1691 | ||
1591 | while (b) { | 1692 | while (b) { |
1592 | level = btrfs_header_level(b); | 1693 | level = btrfs_header_level(b); |
@@ -1595,10 +1696,6 @@ again: | |||
1595 | * setup the path here so we can release it under lock | 1696 | * setup the path here so we can release it under lock |
1596 | * contention with the cow code | 1697 | * contention with the cow code |
1597 | */ | 1698 | */ |
1598 | p->nodes[level] = b; | ||
1599 | if (!p->skip_locking) | ||
1600 | p->locks[level] = 1; | ||
1601 | |||
1602 | if (cow) { | 1699 | if (cow) { |
1603 | /* | 1700 | /* |
1604 | * if we don't really need to cow this block | 1701 | * if we don't really need to cow this block |
@@ -1610,6 +1707,16 @@ again: | |||
1610 | 1707 | ||
1611 | btrfs_set_path_blocking(p); | 1708 | btrfs_set_path_blocking(p); |
1612 | 1709 | ||
1710 | /* | ||
1711 | * must have write locks on this node and the | ||
1712 | * parent | ||
1713 | */ | ||
1714 | if (level + 1 > write_lock_level) { | ||
1715 | write_lock_level = level + 1; | ||
1716 | btrfs_release_path(p); | ||
1717 | goto again; | ||
1718 | } | ||
1719 | |||
1613 | err = btrfs_cow_block(trans, root, b, | 1720 | err = btrfs_cow_block(trans, root, b, |
1614 | p->nodes[level + 1], | 1721 | p->nodes[level + 1], |
1615 | p->slots[level + 1], &b); | 1722 | p->slots[level + 1], &b); |
@@ -1622,10 +1729,7 @@ cow_done: | |||
1622 | BUG_ON(!cow && ins_len); | 1729 | BUG_ON(!cow && ins_len); |
1623 | 1730 | ||
1624 | p->nodes[level] = b; | 1731 | p->nodes[level] = b; |
1625 | if (!p->skip_locking) | 1732 | btrfs_clear_path_blocking(p, NULL, 0); |
1626 | p->locks[level] = 1; | ||
1627 | |||
1628 | btrfs_clear_path_blocking(p, NULL); | ||
1629 | 1733 | ||
1630 | /* | 1734 | /* |
1631 | * we have a lock on b and as long as we aren't changing | 1735 | * we have a lock on b and as long as we aren't changing |
@@ -1651,7 +1755,7 @@ cow_done: | |||
1651 | } | 1755 | } |
1652 | p->slots[level] = slot; | 1756 | p->slots[level] = slot; |
1653 | err = setup_nodes_for_search(trans, root, p, b, level, | 1757 | err = setup_nodes_for_search(trans, root, p, b, level, |
1654 | ins_len); | 1758 | ins_len, &write_lock_level); |
1655 | if (err == -EAGAIN) | 1759 | if (err == -EAGAIN) |
1656 | goto again; | 1760 | goto again; |
1657 | if (err) { | 1761 | if (err) { |
@@ -1661,6 +1765,19 @@ cow_done: | |||
1661 | b = p->nodes[level]; | 1765 | b = p->nodes[level]; |
1662 | slot = p->slots[level]; | 1766 | slot = p->slots[level]; |
1663 | 1767 | ||
1768 | /* | ||
1769 | * slot 0 is special, if we change the key | ||
1770 | * we have to update the parent pointer | ||
1771 | * which means we must have a write lock | ||
1772 | * on the parent | ||
1773 | */ | ||
1774 | if (slot == 0 && cow && | ||
1775 | write_lock_level < level + 1) { | ||
1776 | write_lock_level = level + 1; | ||
1777 | btrfs_release_path(p); | ||
1778 | goto again; | ||
1779 | } | ||
1780 | |||
1664 | unlock_up(p, level, lowest_unlock); | 1781 | unlock_up(p, level, lowest_unlock); |
1665 | 1782 | ||
1666 | if (level == lowest_level) { | 1783 | if (level == lowest_level) { |
@@ -1679,23 +1796,42 @@ cow_done: | |||
1679 | } | 1796 | } |
1680 | 1797 | ||
1681 | if (!p->skip_locking) { | 1798 | if (!p->skip_locking) { |
1682 | btrfs_clear_path_blocking(p, NULL); | 1799 | level = btrfs_header_level(b); |
1683 | err = btrfs_try_spin_lock(b); | 1800 | if (level <= write_lock_level) { |
1684 | 1801 | err = btrfs_try_tree_write_lock(b); | |
1685 | if (!err) { | 1802 | if (!err) { |
1686 | btrfs_set_path_blocking(p); | 1803 | btrfs_set_path_blocking(p); |
1687 | btrfs_tree_lock(b); | 1804 | btrfs_tree_lock(b); |
1688 | btrfs_clear_path_blocking(p, b); | 1805 | btrfs_clear_path_blocking(p, b, |
1806 | BTRFS_WRITE_LOCK); | ||
1807 | } | ||
1808 | p->locks[level] = BTRFS_WRITE_LOCK; | ||
1809 | } else { | ||
1810 | err = btrfs_try_tree_read_lock(b); | ||
1811 | if (!err) { | ||
1812 | btrfs_set_path_blocking(p); | ||
1813 | btrfs_tree_read_lock(b); | ||
1814 | btrfs_clear_path_blocking(p, b, | ||
1815 | BTRFS_READ_LOCK); | ||
1816 | } | ||
1817 | p->locks[level] = BTRFS_READ_LOCK; | ||
1689 | } | 1818 | } |
1819 | p->nodes[level] = b; | ||
1690 | } | 1820 | } |
1691 | } else { | 1821 | } else { |
1692 | p->slots[level] = slot; | 1822 | p->slots[level] = slot; |
1693 | if (ins_len > 0 && | 1823 | if (ins_len > 0 && |
1694 | btrfs_leaf_free_space(root, b) < ins_len) { | 1824 | btrfs_leaf_free_space(root, b) < ins_len) { |
1825 | if (write_lock_level < 1) { | ||
1826 | write_lock_level = 1; | ||
1827 | btrfs_release_path(p); | ||
1828 | goto again; | ||
1829 | } | ||
1830 | |||
1695 | btrfs_set_path_blocking(p); | 1831 | btrfs_set_path_blocking(p); |
1696 | err = split_leaf(trans, root, key, | 1832 | err = split_leaf(trans, root, key, |
1697 | p, ins_len, ret == 0); | 1833 | p, ins_len, ret == 0); |
1698 | btrfs_clear_path_blocking(p, NULL); | 1834 | btrfs_clear_path_blocking(p, NULL, 0); |
1699 | 1835 | ||
1700 | BUG_ON(err > 0); | 1836 | BUG_ON(err > 0); |
1701 | if (err) { | 1837 | if (err) { |
@@ -1976,7 +2112,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, | |||
1976 | add_root_to_dirty_list(root); | 2112 | add_root_to_dirty_list(root); |
1977 | extent_buffer_get(c); | 2113 | extent_buffer_get(c); |
1978 | path->nodes[level] = c; | 2114 | path->nodes[level] = c; |
1979 | path->locks[level] = 1; | 2115 | path->locks[level] = BTRFS_WRITE_LOCK; |
1980 | path->slots[level] = 0; | 2116 | path->slots[level] = 0; |
1981 | return 0; | 2117 | return 0; |
1982 | } | 2118 | } |
@@ -3819,11 +3955,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, | |||
3819 | 3955 | ||
3820 | WARN_ON(!path->keep_locks); | 3956 | WARN_ON(!path->keep_locks); |
3821 | again: | 3957 | again: |
3822 | cur = btrfs_lock_root_node(root); | 3958 | cur = btrfs_read_lock_root_node(root); |
3823 | level = btrfs_header_level(cur); | 3959 | level = btrfs_header_level(cur); |
3824 | WARN_ON(path->nodes[level]); | 3960 | WARN_ON(path->nodes[level]); |
3825 | path->nodes[level] = cur; | 3961 | path->nodes[level] = cur; |
3826 | path->locks[level] = 1; | 3962 | path->locks[level] = BTRFS_READ_LOCK; |
3827 | 3963 | ||
3828 | if (btrfs_header_generation(cur) < min_trans) { | 3964 | if (btrfs_header_generation(cur) < min_trans) { |
3829 | ret = 1; | 3965 | ret = 1; |
@@ -3913,12 +4049,12 @@ find_next_key: | |||
3913 | cur = read_node_slot(root, cur, slot); | 4049 | cur = read_node_slot(root, cur, slot); |
3914 | BUG_ON(!cur); | 4050 | BUG_ON(!cur); |
3915 | 4051 | ||
3916 | btrfs_tree_lock(cur); | 4052 | btrfs_tree_read_lock(cur); |
3917 | 4053 | ||
3918 | path->locks[level - 1] = 1; | 4054 | path->locks[level - 1] = BTRFS_READ_LOCK; |
3919 | path->nodes[level - 1] = cur; | 4055 | path->nodes[level - 1] = cur; |
3920 | unlock_up(path, level, 1); | 4056 | unlock_up(path, level, 1); |
3921 | btrfs_clear_path_blocking(path, NULL); | 4057 | btrfs_clear_path_blocking(path, NULL, 0); |
3922 | } | 4058 | } |
3923 | out: | 4059 | out: |
3924 | if (ret == 0) | 4060 | if (ret == 0) |
@@ -4034,6 +4170,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) | |||
4034 | int ret; | 4170 | int ret; |
4035 | int old_spinning = path->leave_spinning; | 4171 | int old_spinning = path->leave_spinning; |
4036 | int force_blocking = 0; | 4172 | int force_blocking = 0; |
4173 | int next_rw_lock = 0; | ||
4037 | 4174 | ||
4038 | nritems = btrfs_header_nritems(path->nodes[0]); | 4175 | nritems = btrfs_header_nritems(path->nodes[0]); |
4039 | if (nritems == 0) | 4176 | if (nritems == 0) |
@@ -4051,6 +4188,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) | |||
4051 | again: | 4188 | again: |
4052 | level = 1; | 4189 | level = 1; |
4053 | next = NULL; | 4190 | next = NULL; |
4191 | next_rw_lock = 0; | ||
4054 | btrfs_release_path(path); | 4192 | btrfs_release_path(path); |
4055 | 4193 | ||
4056 | path->keep_locks = 1; | 4194 | path->keep_locks = 1; |
@@ -4096,11 +4234,12 @@ again: | |||
4096 | } | 4234 | } |
4097 | 4235 | ||
4098 | if (next) { | 4236 | if (next) { |
4099 | btrfs_tree_unlock(next); | 4237 | btrfs_tree_unlock_rw(next, next_rw_lock); |
4100 | free_extent_buffer(next); | 4238 | free_extent_buffer(next); |
4101 | } | 4239 | } |
4102 | 4240 | ||
4103 | next = c; | 4241 | next = c; |
4242 | next_rw_lock = path->locks[level]; | ||
4104 | ret = read_block_for_search(NULL, root, path, &next, level, | 4243 | ret = read_block_for_search(NULL, root, path, &next, level, |
4105 | slot, &key); | 4244 | slot, &key); |
4106 | if (ret == -EAGAIN) | 4245 | if (ret == -EAGAIN) |
@@ -4112,15 +4251,22 @@ again: | |||
4112 | } | 4251 | } |
4113 | 4252 | ||
4114 | if (!path->skip_locking) { | 4253 | if (!path->skip_locking) { |
4115 | ret = btrfs_try_spin_lock(next); | 4254 | ret = btrfs_try_tree_read_lock(next); |
4116 | if (!ret) { | 4255 | if (!ret) { |
4117 | btrfs_set_path_blocking(path); | 4256 | btrfs_set_path_blocking(path); |
4118 | btrfs_tree_lock(next); | 4257 | btrfs_tree_read_lock(next); |
4119 | if (!force_blocking) | 4258 | if (!force_blocking) { |
4120 | btrfs_clear_path_blocking(path, next); | 4259 | btrfs_clear_path_blocking(path, next, |
4260 | BTRFS_READ_LOCK); | ||
4261 | } | ||
4262 | } | ||
4263 | if (force_blocking) { | ||
4264 | btrfs_set_lock_blocking_rw(next, | ||
4265 | BTRFS_READ_LOCK); | ||
4266 | next_rw_lock = BTRFS_READ_LOCK_BLOCKING; | ||
4267 | } else { | ||
4268 | next_rw_lock = BTRFS_READ_LOCK; | ||
4121 | } | 4269 | } |
4122 | if (force_blocking) | ||
4123 | btrfs_set_lock_blocking(next); | ||
4124 | } | 4270 | } |
4125 | break; | 4271 | break; |
4126 | } | 4272 | } |
@@ -4129,14 +4275,13 @@ again: | |||
4129 | level--; | 4275 | level--; |
4130 | c = path->nodes[level]; | 4276 | c = path->nodes[level]; |
4131 | if (path->locks[level]) | 4277 | if (path->locks[level]) |
4132 | btrfs_tree_unlock(c); | 4278 | btrfs_tree_unlock_rw(c, path->locks[level]); |
4133 | 4279 | ||
4134 | free_extent_buffer(c); | 4280 | free_extent_buffer(c); |
4135 | path->nodes[level] = next; | 4281 | path->nodes[level] = next; |
4136 | path->slots[level] = 0; | 4282 | path->slots[level] = 0; |
4137 | if (!path->skip_locking) | 4283 | if (!path->skip_locking) |
4138 | path->locks[level] = 1; | 4284 | path->locks[level] = next_rw_lock; |
4139 | |||
4140 | if (!level) | 4285 | if (!level) |
4141 | break; | 4286 | break; |
4142 | 4287 | ||
@@ -4151,16 +4296,21 @@ again: | |||
4151 | } | 4296 | } |
4152 | 4297 | ||
4153 | if (!path->skip_locking) { | 4298 | if (!path->skip_locking) { |
4154 | btrfs_assert_tree_locked(path->nodes[level]); | 4299 | ret = btrfs_try_tree_read_lock(next); |
4155 | ret = btrfs_try_spin_lock(next); | ||
4156 | if (!ret) { | 4300 | if (!ret) { |
4157 | btrfs_set_path_blocking(path); | 4301 | btrfs_set_path_blocking(path); |
4158 | btrfs_tree_lock(next); | 4302 | btrfs_tree_read_lock(next); |
4159 | if (!force_blocking) | 4303 | if (!force_blocking) |
4160 | btrfs_clear_path_blocking(path, next); | 4304 | btrfs_clear_path_blocking(path, next, |
4305 | BTRFS_READ_LOCK); | ||
4306 | } | ||
4307 | if (force_blocking) { | ||
4308 | btrfs_set_lock_blocking_rw(next, | ||
4309 | BTRFS_READ_LOCK); | ||
4310 | next_rw_lock = BTRFS_READ_LOCK_BLOCKING; | ||
4311 | } else { | ||
4312 | next_rw_lock = BTRFS_READ_LOCK; | ||
4161 | } | 4313 | } |
4162 | if (force_blocking) | ||
4163 | btrfs_set_lock_blocking(next); | ||
4164 | } | 4314 | } |
4165 | } | 4315 | } |
4166 | ret = 0; | 4316 | ret = 0; |
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3063f21d3fc6..40235e10cfb9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h | |||
@@ -2333,7 +2333,7 @@ struct btrfs_path *btrfs_alloc_path(void); | |||
2333 | void btrfs_free_path(struct btrfs_path *p); | 2333 | void btrfs_free_path(struct btrfs_path *p); |
2334 | void btrfs_set_path_blocking(struct btrfs_path *p); | 2334 | void btrfs_set_path_blocking(struct btrfs_path *p); |
2335 | void btrfs_clear_path_blocking(struct btrfs_path *p, | 2335 | void btrfs_clear_path_blocking(struct btrfs_path *p, |
2336 | struct extent_buffer *held); | 2336 | struct extent_buffer *held, int held_rw); |
2337 | void btrfs_unlock_up_safe(struct btrfs_path *p, int level); | 2337 | void btrfs_unlock_up_safe(struct btrfs_path *p, int level); |
2338 | 2338 | ||
2339 | int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 2339 | int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 98c68e658a9b..b52c672f4c18 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c | |||
@@ -735,7 +735,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, | |||
735 | } | 735 | } |
736 | 736 | ||
737 | /* reset all the locked nodes in the patch to spinning locks. */ | 737 | /* reset all the locked nodes in the patch to spinning locks. */ |
738 | btrfs_clear_path_blocking(path, NULL); | 738 | btrfs_clear_path_blocking(path, NULL, 0); |
739 | 739 | ||
740 | /* insert the keys of the items */ | 740 | /* insert the keys of the items */ |
741 | ret = setup_items_for_insert(trans, root, path, keys, data_size, | 741 | ret = setup_items_for_insert(trans, root, path, keys, data_size, |
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7021dde74d81..2a782c2fcb62 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -5912,7 +5912,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, | |||
5912 | return 1; | 5912 | return 1; |
5913 | 5913 | ||
5914 | if (path->locks[level] && !wc->keep_locks) { | 5914 | if (path->locks[level] && !wc->keep_locks) { |
5915 | btrfs_tree_unlock(eb); | 5915 | btrfs_tree_unlock_rw(eb, path->locks[level]); |
5916 | path->locks[level] = 0; | 5916 | path->locks[level] = 0; |
5917 | } | 5917 | } |
5918 | return 0; | 5918 | return 0; |
@@ -5936,7 +5936,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, | |||
5936 | * keep the tree lock | 5936 | * keep the tree lock |
5937 | */ | 5937 | */ |
5938 | if (path->locks[level] && level > 0) { | 5938 | if (path->locks[level] && level > 0) { |
5939 | btrfs_tree_unlock(eb); | 5939 | btrfs_tree_unlock_rw(eb, path->locks[level]); |
5940 | path->locks[level] = 0; | 5940 | path->locks[level] = 0; |
5941 | } | 5941 | } |
5942 | return 0; | 5942 | return 0; |
@@ -6049,7 +6049,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, | |||
6049 | BUG_ON(level != btrfs_header_level(next)); | 6049 | BUG_ON(level != btrfs_header_level(next)); |
6050 | path->nodes[level] = next; | 6050 | path->nodes[level] = next; |
6051 | path->slots[level] = 0; | 6051 | path->slots[level] = 0; |
6052 | path->locks[level] = 1; | 6052 | path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; |
6053 | wc->level = level; | 6053 | wc->level = level; |
6054 | if (wc->level == 1) | 6054 | if (wc->level == 1) |
6055 | wc->reada_slot = 0; | 6055 | wc->reada_slot = 0; |
@@ -6120,7 +6120,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, | |||
6120 | BUG_ON(level == 0); | 6120 | BUG_ON(level == 0); |
6121 | btrfs_tree_lock(eb); | 6121 | btrfs_tree_lock(eb); |
6122 | btrfs_set_lock_blocking(eb); | 6122 | btrfs_set_lock_blocking(eb); |
6123 | path->locks[level] = 1; | 6123 | path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; |
6124 | 6124 | ||
6125 | ret = btrfs_lookup_extent_info(trans, root, | 6125 | ret = btrfs_lookup_extent_info(trans, root, |
6126 | eb->start, eb->len, | 6126 | eb->start, eb->len, |
@@ -6129,8 +6129,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, | |||
6129 | BUG_ON(ret); | 6129 | BUG_ON(ret); |
6130 | BUG_ON(wc->refs[level] == 0); | 6130 | BUG_ON(wc->refs[level] == 0); |
6131 | if (wc->refs[level] == 1) { | 6131 | if (wc->refs[level] == 1) { |
6132 | btrfs_tree_unlock(eb); | 6132 | btrfs_tree_unlock_rw(eb, path->locks[level]); |
6133 | path->locks[level] = 0; | ||
6134 | return 1; | 6133 | return 1; |
6135 | } | 6134 | } |
6136 | } | 6135 | } |
@@ -6152,7 +6151,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, | |||
6152 | btrfs_header_generation(eb) == trans->transid) { | 6151 | btrfs_header_generation(eb) == trans->transid) { |
6153 | btrfs_tree_lock(eb); | 6152 | btrfs_tree_lock(eb); |
6154 | btrfs_set_lock_blocking(eb); | 6153 | btrfs_set_lock_blocking(eb); |
6155 | path->locks[level] = 1; | 6154 | path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; |
6156 | } | 6155 | } |
6157 | clean_tree_block(trans, root, eb); | 6156 | clean_tree_block(trans, root, eb); |
6158 | } | 6157 | } |
@@ -6231,7 +6230,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, | |||
6231 | return 0; | 6230 | return 0; |
6232 | 6231 | ||
6233 | if (path->locks[level]) { | 6232 | if (path->locks[level]) { |
6234 | btrfs_tree_unlock(path->nodes[level]); | 6233 | btrfs_tree_unlock_rw(path->nodes[level], |
6234 | path->locks[level]); | ||
6235 | path->locks[level] = 0; | 6235 | path->locks[level] = 0; |
6236 | } | 6236 | } |
6237 | free_extent_buffer(path->nodes[level]); | 6237 | free_extent_buffer(path->nodes[level]); |
@@ -6283,7 +6283,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, | |||
6283 | path->nodes[level] = btrfs_lock_root_node(root); | 6283 | path->nodes[level] = btrfs_lock_root_node(root); |
6284 | btrfs_set_lock_blocking(path->nodes[level]); | 6284 | btrfs_set_lock_blocking(path->nodes[level]); |
6285 | path->slots[level] = 0; | 6285 | path->slots[level] = 0; |
6286 | path->locks[level] = 1; | 6286 | path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; |
6287 | memset(&wc->update_progress, 0, | 6287 | memset(&wc->update_progress, 0, |
6288 | sizeof(wc->update_progress)); | 6288 | sizeof(wc->update_progress)); |
6289 | } else { | 6289 | } else { |
@@ -6451,7 +6451,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, | |||
6451 | level = btrfs_header_level(node); | 6451 | level = btrfs_header_level(node); |
6452 | path->nodes[level] = node; | 6452 | path->nodes[level] = node; |
6453 | path->slots[level] = 0; | 6453 | path->slots[level] = 0; |
6454 | path->locks[level] = 1; | 6454 | path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; |
6455 | 6455 | ||
6456 | wc->refs[parent_level] = 1; | 6456 | wc->refs[parent_level] = 1; |
6457 | wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; | 6457 | wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 76ecbb8ed0e0..5392c3b12fc1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -3017,8 +3017,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, | |||
3017 | return NULL; | 3017 | return NULL; |
3018 | eb->start = start; | 3018 | eb->start = start; |
3019 | eb->len = len; | 3019 | eb->len = len; |
3020 | spin_lock_init(&eb->lock); | 3020 | rwlock_init(&eb->lock); |
3021 | init_waitqueue_head(&eb->lock_wq); | 3021 | atomic_set(&eb->write_locks, 0); |
3022 | atomic_set(&eb->read_locks, 0); | ||
3023 | atomic_set(&eb->blocking_readers, 0); | ||
3024 | atomic_set(&eb->blocking_writers, 0); | ||
3025 | atomic_set(&eb->spinning_readers, 0); | ||
3026 | atomic_set(&eb->spinning_writers, 0); | ||
3027 | init_waitqueue_head(&eb->write_lock_wq); | ||
3028 | init_waitqueue_head(&eb->read_lock_wq); | ||
3022 | 3029 | ||
3023 | #if LEAK_DEBUG | 3030 | #if LEAK_DEBUG |
3024 | spin_lock_irqsave(&leak_lock, flags); | 3031 | spin_lock_irqsave(&leak_lock, flags); |
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b5f120cca916..21a7ca9e7282 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h | |||
@@ -128,14 +128,26 @@ struct extent_buffer { | |||
128 | struct rcu_head rcu_head; | 128 | struct rcu_head rcu_head; |
129 | atomic_t refs; | 129 | atomic_t refs; |
130 | 130 | ||
131 | /* the spinlock is used to protect most operations */ | 131 | /* count of read lock holders on the extent buffer */ |
132 | spinlock_t lock; | 132 | atomic_t write_locks; |
133 | atomic_t read_locks; | ||
134 | atomic_t blocking_writers; | ||
135 | atomic_t blocking_readers; | ||
136 | atomic_t spinning_readers; | ||
137 | atomic_t spinning_writers; | ||
138 | |||
139 | /* protects write locks */ | ||
140 | rwlock_t lock; | ||
141 | |||
142 | /* readers use lock_wq while they wait for the write | ||
143 | * lock holders to unlock | ||
144 | */ | ||
145 | wait_queue_head_t write_lock_wq; | ||
133 | 146 | ||
134 | /* | 147 | /* writers use read_lock_wq while they wait for readers |
135 | * when we keep the lock held while blocking, waiters go onto | 148 | * to unlock |
136 | * the wq | ||
137 | */ | 149 | */ |
138 | wait_queue_head_t lock_wq; | 150 | wait_queue_head_t read_lock_wq; |
139 | }; | 151 | }; |
140 | 152 | ||
141 | static inline void extent_set_compress_type(unsigned long *bio_flags, | 153 | static inline void extent_set_compress_type(unsigned long *bio_flags, |
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 66fa43dc3f0f..d77b67c4b275 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c | |||
@@ -24,185 +24,197 @@ | |||
24 | #include "extent_io.h" | 24 | #include "extent_io.h" |
25 | #include "locking.h" | 25 | #include "locking.h" |
26 | 26 | ||
27 | static inline void spin_nested(struct extent_buffer *eb) | 27 | void btrfs_assert_tree_read_locked(struct extent_buffer *eb); |
28 | { | ||
29 | spin_lock(&eb->lock); | ||
30 | } | ||
31 | 28 | ||
32 | /* | 29 | /* |
33 | * Setting a lock to blocking will drop the spinlock and set the | 30 | * if we currently have a spinning reader or writer lock |
34 | * flag that forces other procs who want the lock to wait. After | 31 | * (indicated by the rw flag) this will bump the count |
35 | * this you can safely schedule with the lock held. | 32 | * of blocking holders and drop the spinlock. |
36 | */ | 33 | */ |
37 | void btrfs_set_lock_blocking(struct extent_buffer *eb) | 34 | void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) |
38 | { | 35 | { |
39 | if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { | 36 | if (rw == BTRFS_WRITE_LOCK) { |
40 | set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); | 37 | if (atomic_read(&eb->blocking_writers) == 0) { |
41 | spin_unlock(&eb->lock); | 38 | WARN_ON(atomic_read(&eb->spinning_writers) != 1); |
39 | atomic_dec(&eb->spinning_writers); | ||
40 | btrfs_assert_tree_locked(eb); | ||
41 | atomic_inc(&eb->blocking_writers); | ||
42 | write_unlock(&eb->lock); | ||
43 | } | ||
44 | } else if (rw == BTRFS_READ_LOCK) { | ||
45 | btrfs_assert_tree_read_locked(eb); | ||
46 | atomic_inc(&eb->blocking_readers); | ||
47 | WARN_ON(atomic_read(&eb->spinning_readers) == 0); | ||
48 | atomic_dec(&eb->spinning_readers); | ||
49 | read_unlock(&eb->lock); | ||
42 | } | 50 | } |
43 | /* exit with the spin lock released and the bit set */ | 51 | return; |
44 | } | 52 | } |
45 | 53 | ||
46 | /* | 54 | /* |
47 | * clearing the blocking flag will take the spinlock again. | 55 | * if we currently have a blocking lock, take the spinlock |
48 | * After this you can't safely schedule | 56 | * and drop our blocking count |
49 | */ | 57 | */ |
50 | void btrfs_clear_lock_blocking(struct extent_buffer *eb) | 58 | void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) |
51 | { | 59 | { |
52 | if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { | 60 | if (rw == BTRFS_WRITE_LOCK_BLOCKING) { |
53 | spin_nested(eb); | 61 | BUG_ON(atomic_read(&eb->blocking_writers) != 1); |
54 | clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); | 62 | write_lock(&eb->lock); |
55 | smp_mb__after_clear_bit(); | 63 | WARN_ON(atomic_read(&eb->spinning_writers)); |
64 | atomic_inc(&eb->spinning_writers); | ||
65 | if (atomic_dec_and_test(&eb->blocking_writers)) | ||
66 | wake_up(&eb->write_lock_wq); | ||
67 | } else if (rw == BTRFS_READ_LOCK_BLOCKING) { | ||
68 | BUG_ON(atomic_read(&eb->blocking_readers) == 0); | ||
69 | read_lock(&eb->lock); | ||
70 | atomic_inc(&eb->spinning_readers); | ||
71 | if (atomic_dec_and_test(&eb->blocking_readers)) | ||
72 | wake_up(&eb->read_lock_wq); | ||
56 | } | 73 | } |
57 | /* exit with the spin lock held */ | 74 | return; |
58 | } | 75 | } |
59 | 76 | ||
60 | /* | 77 | /* |
61 | * unfortunately, many of the places that currently set a lock to blocking | 78 | * take a spinning read lock. This will wait for any blocking |
62 | * don't end up blocking for very long, and often they don't block | 79 | * writers |
63 | * at all. For a dbench 50 run, if we don't spin on the blocking bit | ||
64 | * at all, the context switch rate can jump up to 400,000/sec or more. | ||
65 | * | ||
66 | * So, we're still stuck with this crummy spin on the blocking bit, | ||
67 | * at least until the most common causes of the short blocks | ||
68 | * can be dealt with. | ||
69 | */ | 80 | */ |
70 | static int btrfs_spin_on_block(struct extent_buffer *eb) | 81 | void btrfs_tree_read_lock(struct extent_buffer *eb) |
71 | { | 82 | { |
72 | int i; | 83 | again: |
73 | 84 | wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); | |
74 | for (i = 0; i < 512; i++) { | 85 | read_lock(&eb->lock); |
75 | if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) | 86 | if (atomic_read(&eb->blocking_writers)) { |
76 | return 1; | 87 | read_unlock(&eb->lock); |
77 | if (need_resched()) | 88 | wait_event(eb->write_lock_wq, |
78 | break; | 89 | atomic_read(&eb->blocking_writers) == 0); |
79 | cpu_relax(); | 90 | goto again; |
80 | } | 91 | } |
81 | return 0; | 92 | atomic_inc(&eb->read_locks); |
93 | atomic_inc(&eb->spinning_readers); | ||
82 | } | 94 | } |
83 | 95 | ||
84 | /* | 96 | /* |
85 | * This is somewhat different from trylock. It will take the | 97 | * returns 1 if we get the read lock and 0 if we don't |
86 | * spinlock but if it finds the lock is set to blocking, it will | 98 | * this won't wait for blocking writers |
87 | * return without the lock held. | ||
88 | * | ||
89 | * returns 1 if it was able to take the lock and zero otherwise | ||
90 | * | ||
91 | * After this call, scheduling is not safe without first calling | ||
92 | * btrfs_set_lock_blocking() | ||
93 | */ | 99 | */ |
94 | int btrfs_try_spin_lock(struct extent_buffer *eb) | 100 | int btrfs_try_tree_read_lock(struct extent_buffer *eb) |
95 | { | 101 | { |
96 | int i; | 102 | if (atomic_read(&eb->blocking_writers)) |
103 | return 0; | ||
97 | 104 | ||
98 | if (btrfs_spin_on_block(eb)) { | 105 | read_lock(&eb->lock); |
99 | spin_nested(eb); | 106 | if (atomic_read(&eb->blocking_writers)) { |
100 | if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) | 107 | read_unlock(&eb->lock); |
101 | return 1; | 108 | return 0; |
102 | spin_unlock(&eb->lock); | ||
103 | } | 109 | } |
104 | /* spin for a bit on the BLOCKING flag */ | 110 | atomic_inc(&eb->read_locks); |
105 | for (i = 0; i < 2; i++) { | 111 | atomic_inc(&eb->spinning_readers); |
106 | cpu_relax(); | 112 | return 1; |
107 | if (!btrfs_spin_on_block(eb)) | ||
108 | break; | ||
109 | |||
110 | spin_nested(eb); | ||
111 | if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) | ||
112 | return 1; | ||
113 | spin_unlock(&eb->lock); | ||
114 | } | ||
115 | return 0; | ||
116 | } | 113 | } |
117 | 114 | ||
118 | /* | 115 | /* |
119 | * the autoremove wake function will return 0 if it tried to wake up | 116 | * returns 1 if we get the read lock and 0 if we don't |
120 | * a process that was already awake, which means that process won't | 117 | * this won't wait for blocking writers or readers |
121 | * count as an exclusive wakeup. The waitq code will continue waking | ||
122 | * procs until it finds one that was actually sleeping. | ||
123 | * | ||
124 | * For btrfs, this isn't quite what we want. We want a single proc | ||
125 | * to be notified that the lock is ready for taking. If that proc | ||
126 | * already happen to be awake, great, it will loop around and try for | ||
127 | * the lock. | ||
128 | * | ||
129 | * So, btrfs_wake_function always returns 1, even when the proc that we | ||
130 | * tried to wake up was already awake. | ||
131 | */ | 118 | */ |
132 | static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, | 119 | int btrfs_try_tree_write_lock(struct extent_buffer *eb) |
133 | int sync, void *key) | ||
134 | { | 120 | { |
135 | autoremove_wake_function(wait, mode, sync, key); | 121 | if (atomic_read(&eb->blocking_writers) || |
122 | atomic_read(&eb->blocking_readers)) | ||
123 | return 0; | ||
124 | write_lock(&eb->lock); | ||
125 | if (atomic_read(&eb->blocking_writers) || | ||
126 | atomic_read(&eb->blocking_readers)) { | ||
127 | write_unlock(&eb->lock); | ||
128 | return 0; | ||
129 | } | ||
130 | atomic_inc(&eb->write_locks); | ||
131 | atomic_inc(&eb->spinning_writers); | ||
136 | return 1; | 132 | return 1; |
137 | } | 133 | } |
138 | 134 | ||
139 | /* | 135 | /* |
140 | * returns with the extent buffer spinlocked. | 136 | * drop a spinning read lock |
141 | * | 137 | */ |
142 | * This will spin and/or wait as required to take the lock, and then | 138 | void btrfs_tree_read_unlock(struct extent_buffer *eb) |
143 | * return with the spinlock held. | 139 | { |
144 | * | 140 | btrfs_assert_tree_read_locked(eb); |
145 | * After this call, scheduling is not safe without first calling | 141 | WARN_ON(atomic_read(&eb->spinning_readers) == 0); |
146 | * btrfs_set_lock_blocking() | 142 | atomic_dec(&eb->spinning_readers); |
143 | atomic_dec(&eb->read_locks); | ||
144 | read_unlock(&eb->lock); | ||
145 | } | ||
146 | |||
147 | /* | ||
148 | * drop a blocking read lock | ||
149 | */ | ||
150 | void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) | ||
151 | { | ||
152 | btrfs_assert_tree_read_locked(eb); | ||
153 | WARN_ON(atomic_read(&eb->blocking_readers) == 0); | ||
154 | if (atomic_dec_and_test(&eb->blocking_readers)) | ||
155 | wake_up(&eb->read_lock_wq); | ||
156 | atomic_dec(&eb->read_locks); | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * take a spinning write lock. This will wait for both | ||
161 | * blocking readers or writers | ||
147 | */ | 162 | */ |
148 | int btrfs_tree_lock(struct extent_buffer *eb) | 163 | int btrfs_tree_lock(struct extent_buffer *eb) |
149 | { | 164 | { |
150 | DEFINE_WAIT(wait); | 165 | again: |
151 | wait.func = btrfs_wake_function; | 166 | wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); |
152 | 167 | wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); | |
153 | if (!btrfs_spin_on_block(eb)) | 168 | write_lock(&eb->lock); |
154 | goto sleep; | 169 | if (atomic_read(&eb->blocking_readers)) { |
155 | 170 | write_unlock(&eb->lock); | |
156 | while(1) { | 171 | wait_event(eb->read_lock_wq, |
157 | spin_nested(eb); | 172 | atomic_read(&eb->blocking_readers) == 0); |
158 | 173 | goto again; | |
159 | /* nobody is blocking, exit with the spinlock held */ | ||
160 | if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) | ||
161 | return 0; | ||
162 | |||
163 | /* | ||
164 | * we have the spinlock, but the real owner is blocking. | ||
165 | * wait for them | ||
166 | */ | ||
167 | spin_unlock(&eb->lock); | ||
168 | |||
169 | /* | ||
170 | * spin for a bit, and if the blocking flag goes away, | ||
171 | * loop around | ||
172 | */ | ||
173 | cpu_relax(); | ||
174 | if (btrfs_spin_on_block(eb)) | ||
175 | continue; | ||
176 | sleep: | ||
177 | prepare_to_wait_exclusive(&eb->lock_wq, &wait, | ||
178 | TASK_UNINTERRUPTIBLE); | ||
179 | |||
180 | if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) | ||
181 | schedule(); | ||
182 | |||
183 | finish_wait(&eb->lock_wq, &wait); | ||
184 | } | 174 | } |
175 | if (atomic_read(&eb->blocking_writers)) { | ||
176 | write_unlock(&eb->lock); | ||
177 | wait_event(eb->write_lock_wq, | ||
178 | atomic_read(&eb->blocking_writers) == 0); | ||
179 | goto again; | ||
180 | } | ||
181 | WARN_ON(atomic_read(&eb->spinning_writers)); | ||
182 | atomic_inc(&eb->spinning_writers); | ||
183 | atomic_inc(&eb->write_locks); | ||
185 | return 0; | 184 | return 0; |
186 | } | 185 | } |
187 | 186 | ||
187 | /* | ||
188 | * drop a spinning or a blocking write lock. | ||
189 | */ | ||
188 | int btrfs_tree_unlock(struct extent_buffer *eb) | 190 | int btrfs_tree_unlock(struct extent_buffer *eb) |
189 | { | 191 | { |
190 | /* | 192 | int blockers = atomic_read(&eb->blocking_writers); |
191 | * if we were a blocking owner, we don't have the spinlock held | 193 | |
192 | * just clear the bit and look for waiters | 194 | BUG_ON(blockers > 1); |
193 | */ | 195 | |
194 | if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) | 196 | btrfs_assert_tree_locked(eb); |
195 | smp_mb__after_clear_bit(); | 197 | atomic_dec(&eb->write_locks); |
196 | else | 198 | |
197 | spin_unlock(&eb->lock); | 199 | if (blockers) { |
198 | 200 | WARN_ON(atomic_read(&eb->spinning_writers)); | |
199 | if (waitqueue_active(&eb->lock_wq)) | 201 | atomic_dec(&eb->blocking_writers); |
200 | wake_up(&eb->lock_wq); | 202 | smp_wmb(); |
203 | wake_up(&eb->write_lock_wq); | ||
204 | } else { | ||
205 | WARN_ON(atomic_read(&eb->spinning_writers) != 1); | ||
206 | atomic_dec(&eb->spinning_writers); | ||
207 | write_unlock(&eb->lock); | ||
208 | } | ||
201 | return 0; | 209 | return 0; |
202 | } | 210 | } |
203 | 211 | ||
204 | void btrfs_assert_tree_locked(struct extent_buffer *eb) | 212 | void btrfs_assert_tree_locked(struct extent_buffer *eb) |
205 | { | 213 | { |
206 | if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) | 214 | BUG_ON(!atomic_read(&eb->write_locks)); |
207 | assert_spin_locked(&eb->lock); | 215 | } |
216 | |||
217 | void btrfs_assert_tree_read_locked(struct extent_buffer *eb) | ||
218 | { | ||
219 | BUG_ON(!atomic_read(&eb->read_locks)); | ||
208 | } | 220 | } |
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 5c33a560a2f1..17247ddb81a0 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h | |||
@@ -19,11 +19,43 @@ | |||
19 | #ifndef __BTRFS_LOCKING_ | 19 | #ifndef __BTRFS_LOCKING_ |
20 | #define __BTRFS_LOCKING_ | 20 | #define __BTRFS_LOCKING_ |
21 | 21 | ||
22 | #define BTRFS_WRITE_LOCK 1 | ||
23 | #define BTRFS_READ_LOCK 2 | ||
24 | #define BTRFS_WRITE_LOCK_BLOCKING 3 | ||
25 | #define BTRFS_READ_LOCK_BLOCKING 4 | ||
26 | |||
22 | int btrfs_tree_lock(struct extent_buffer *eb); | 27 | int btrfs_tree_lock(struct extent_buffer *eb); |
23 | int btrfs_tree_unlock(struct extent_buffer *eb); | 28 | int btrfs_tree_unlock(struct extent_buffer *eb); |
24 | int btrfs_try_spin_lock(struct extent_buffer *eb); | 29 | int btrfs_try_spin_lock(struct extent_buffer *eb); |
25 | 30 | ||
26 | void btrfs_set_lock_blocking(struct extent_buffer *eb); | 31 | void btrfs_tree_read_lock(struct extent_buffer *eb); |
27 | void btrfs_clear_lock_blocking(struct extent_buffer *eb); | 32 | void btrfs_tree_read_unlock(struct extent_buffer *eb); |
33 | void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); | ||
34 | void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw); | ||
35 | void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw); | ||
28 | void btrfs_assert_tree_locked(struct extent_buffer *eb); | 36 | void btrfs_assert_tree_locked(struct extent_buffer *eb); |
37 | int btrfs_try_tree_read_lock(struct extent_buffer *eb); | ||
38 | int btrfs_try_tree_write_lock(struct extent_buffer *eb); | ||
39 | |||
40 | static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) | ||
41 | { | ||
42 | if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING) | ||
43 | btrfs_tree_unlock(eb); | ||
44 | else if (rw == BTRFS_READ_LOCK_BLOCKING) | ||
45 | btrfs_tree_read_unlock_blocking(eb); | ||
46 | else if (rw == BTRFS_READ_LOCK) | ||
47 | btrfs_tree_read_unlock(eb); | ||
48 | else | ||
49 | BUG(); | ||
50 | } | ||
51 | |||
52 | static inline void btrfs_set_lock_blocking(struct extent_buffer *eb) | ||
53 | { | ||
54 | btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK); | ||
55 | } | ||
56 | |||
57 | static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb) | ||
58 | { | ||
59 | btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING); | ||
60 | } | ||
29 | #endif | 61 | #endif |
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4ce8a9f41d1e..ac278dd83175 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
@@ -1730,8 +1730,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, | |||
1730 | btrfs_read_buffer(next, ptr_gen); | 1730 | btrfs_read_buffer(next, ptr_gen); |
1731 | 1731 | ||
1732 | btrfs_tree_lock(next); | 1732 | btrfs_tree_lock(next); |
1733 | clean_tree_block(trans, root, next); | ||
1734 | btrfs_set_lock_blocking(next); | 1733 | btrfs_set_lock_blocking(next); |
1734 | clean_tree_block(trans, root, next); | ||
1735 | btrfs_wait_tree_block_writeback(next); | 1735 | btrfs_wait_tree_block_writeback(next); |
1736 | btrfs_tree_unlock(next); | 1736 | btrfs_tree_unlock(next); |
1737 | 1737 | ||
@@ -1796,8 +1796,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, | |||
1796 | next = path->nodes[*level]; | 1796 | next = path->nodes[*level]; |
1797 | 1797 | ||
1798 | btrfs_tree_lock(next); | 1798 | btrfs_tree_lock(next); |
1799 | clean_tree_block(trans, root, next); | ||
1800 | btrfs_set_lock_blocking(next); | 1799 | btrfs_set_lock_blocking(next); |
1800 | clean_tree_block(trans, root, next); | ||
1801 | btrfs_wait_tree_block_writeback(next); | 1801 | btrfs_wait_tree_block_writeback(next); |
1802 | btrfs_tree_unlock(next); | 1802 | btrfs_tree_unlock(next); |
1803 | 1803 | ||
@@ -1864,8 +1864,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, | |||
1864 | next = path->nodes[orig_level]; | 1864 | next = path->nodes[orig_level]; |
1865 | 1865 | ||
1866 | btrfs_tree_lock(next); | 1866 | btrfs_tree_lock(next); |
1867 | clean_tree_block(trans, log, next); | ||
1868 | btrfs_set_lock_blocking(next); | 1867 | btrfs_set_lock_blocking(next); |
1868 | clean_tree_block(trans, log, next); | ||
1869 | btrfs_wait_tree_block_writeback(next); | 1869 | btrfs_wait_tree_block_writeback(next); |
1870 | btrfs_tree_unlock(next); | 1870 | btrfs_tree_unlock(next); |
1871 | 1871 | ||