author      Johannes Weiner <hannes@cmpxchg.org>           2016-12-12 19:43:46 -0500
committer   Linus Torvalds <torvalds@linux-foundation.org> 2016-12-12 21:55:08 -0500
commit      f4b109c6dad54257eca837f9dd16a23f2eeab832 (patch)
tree        ab2ceb69c33346aeaa8e1420d04fad8623118e26 /lib/radix-tree.c
parent      6d75f366b9242f9b17ed7d0b0604d7460f818f21 (diff)
lib: radix-tree: add entry deletion support to __radix_tree_replace()
Page cache shadow entry handling will be a lot simpler when it can use a
single generic replacement function for pages, shadow entries, and
emptying slots.
Make __radix_tree_replace() properly account insertions and deletions in
node->count and garbage collect nodes as they become empty. Then
re-implement radix_tree_delete() on top of it.
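
As a sketch of the call pattern this enables (not part of the patch;
my_delete() is a hypothetical caller, while the __radix_tree_lookup() and
__radix_tree_replace() signatures are the ones visible in the diff below):

static void *my_delete(struct radix_tree_root *root, unsigned long index)
{
        struct radix_tree_node *node;
        void **slot;
        void *entry;

        entry = __radix_tree_lookup(root, index, &node, &slot);
        if (!entry)
                return NULL;

        /* Storing NULL: replace_slot() accounts the deletion in
         * node->count, then delete_node() reaps nodes left empty. */
        __radix_tree_replace(root, node, slot, NULL);
        return entry;
}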
Link: http://lkml.kernel.org/r/20161117193058.GC23430@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox <mawilcox@linuxonhyperv.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'lib/radix-tree.c')
-rw-r--r--   lib/radix-tree.c   227
1 file changed, 116 insertions(+), 111 deletions(-)
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index f91d5b0af654..5d8930f3b3d8 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -539,6 +539,107 @@ out:
 }
 
 /**
+ * radix_tree_shrink - shrink radix tree to minimum height
+ * @root        radix tree root
+ */
+static inline bool radix_tree_shrink(struct radix_tree_root *root)
+{
+        bool shrunk = false;
+
+        for (;;) {
+                struct radix_tree_node *node = root->rnode;
+                struct radix_tree_node *child;
+
+                if (!radix_tree_is_internal_node(node))
+                        break;
+                node = entry_to_node(node);
+
+                /*
+                 * The candidate node has more than one child, or its child
+                 * is not at the leftmost slot, or the child is a multiorder
+                 * entry, we cannot shrink.
+                 */
+                if (node->count != 1)
+                        break;
+                child = node->slots[0];
+                if (!child)
+                        break;
+                if (!radix_tree_is_internal_node(child) && node->shift)
+                        break;
+
+                if (radix_tree_is_internal_node(child))
+                        entry_to_node(child)->parent = NULL;
+
+                /*
+                 * We don't need rcu_assign_pointer(), since we are simply
+                 * moving the node from one part of the tree to another: if it
+                 * was safe to dereference the old pointer to it
+                 * (node->slots[0]), it will be safe to dereference the new
+                 * one (root->rnode) as far as dependent read barriers go.
+                 */
+                root->rnode = child;
+
+                /*
+                 * We have a dilemma here. The node's slot[0] must not be
+                 * NULLed in case there are concurrent lookups expecting to
+                 * find the item. However if this was a bottom-level node,
+                 * then it may be subject to the slot pointer being visible
+                 * to callers dereferencing it. If item corresponding to
+                 * slot[0] is subsequently deleted, these callers would expect
+                 * their slot to become empty sooner or later.
+                 *
+                 * For example, lockless pagecache will look up a slot, deref
+                 * the page pointer, and if the page has 0 refcount it means it
+                 * was concurrently deleted from pagecache so try the deref
+                 * again. Fortunately there is already a requirement for logic
+                 * to retry the entire slot lookup -- the indirect pointer
+                 * problem (replacing direct root node with an indirect pointer
+                 * also results in a stale slot). So tag the slot as indirect
+                 * to force callers to retry.
+                 */
+                if (!radix_tree_is_internal_node(child))
+                        node->slots[0] = RADIX_TREE_RETRY;
+
+                radix_tree_node_free(node);
+                shrunk = true;
+        }
+
+        return shrunk;
+}
+
+static bool delete_node(struct radix_tree_root *root,
+                        struct radix_tree_node *node)
+{
+        bool deleted = false;
+
+        do {
+                struct radix_tree_node *parent;
+
+                if (node->count) {
+                        if (node == entry_to_node(root->rnode))
+                                deleted |= radix_tree_shrink(root);
+                        return deleted;
+                }
+
+                parent = node->parent;
+                if (parent) {
+                        parent->slots[node->offset] = NULL;
+                        parent->count--;
+                } else {
+                        root_tag_clear_all(root);
+                        root->rnode = NULL;
+                }
+
+                radix_tree_node_free(node);
+                deleted = true;
+
+                node = parent;
+        } while (node);
+
+        return deleted;
+}
+
+/**
  * __radix_tree_create - create a slot in a radix tree
  * @root:       radix tree root
  * @index:      index key
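
The RADIX_TREE_RETRY poisoning above relies on retry logic that lockless
readers must already implement. A hedged sketch of such a reader, modeled
on the pagecache's find_get_entry(); lockless_lookup() itself is
hypothetical, but radix_tree_lookup_slot(), radix_tree_deref_slot() and
radix_tree_deref_retry() are existing helpers:

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>

static void *lockless_lookup(struct radix_tree_root *root, unsigned long index)
{
        void **slot;
        void *entry;

        rcu_read_lock();
repeat:
        entry = NULL;
        slot = radix_tree_lookup_slot(root, index);
        if (slot) {
                entry = radix_tree_deref_slot(slot);
                if (radix_tree_deref_retry(entry))
                        goto repeat;    /* shrink raced with us; restart */
        }
        rcu_read_unlock();
        return entry;
}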
@@ -759,18 +860,20 @@ static void replace_slot(struct radix_tree_root *root,
                         bool warn_typeswitch)
 {
         void *old = rcu_dereference_raw(*slot);
-        int exceptional;
+        int count, exceptional;
 
         WARN_ON_ONCE(radix_tree_is_internal_node(item));
-        WARN_ON_ONCE(!!item - !!old);
 
+        count = !!item - !!old;
         exceptional = !!radix_tree_exceptional_entry(item) -
                       !!radix_tree_exceptional_entry(old);
 
-        WARN_ON_ONCE(warn_typeswitch && exceptional);
+        WARN_ON_ONCE(warn_typeswitch && (count || exceptional));
 
-        if (node)
+        if (node) {
+                node->count += count;
                 node->exceptional += exceptional;
+        }
 
         rcu_assign_pointer(*slot, item);
 }
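
The new node->count accounting hinges on the "!!item - !!old" idiom. A
standalone userspace demonstration of the three possible deltas
(illustrative only; slot_count_delta() is a hypothetical mirror of the
expression in replace_slot()):

#include <stdio.h>

/* !!p collapses a pointer to 0 or 1, so the difference is +1 for an
 * insertion (NULL -> entry), 0 for a replacement, -1 for a deletion. */
static int slot_count_delta(const void *item, const void *old)
{
        return !!item - !!old;
}

int main(void)
{
        int x;

        printf("insert:  %+d\n", slot_count_delta(&x, NULL));  /* +1 */
        printf("replace: %+d\n", slot_count_delta(&x, &x));    /* +0 */
        printf("delete:  %+d\n", slot_count_delta(NULL, &x));  /* -1 */
        return 0;
}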
@@ -790,12 +893,14 @@ void __radix_tree_replace(struct radix_tree_root *root,
                           void **slot, void *item)
 {
         /*
-         * This function supports replacing exceptional entries, but
-         * that needs accounting against the node unless the slot is
-         * root->rnode.
+         * This function supports replacing exceptional entries and
+         * deleting entries, but that needs accounting against the
+         * node unless the slot is root->rnode.
          */
         replace_slot(root, node, slot, item,
                      !node && slot != (void **)&root->rnode);
+
+        delete_node(root, node);
 }
 
 /**
@@ -810,8 +915,8 @@ void __radix_tree_replace(struct radix_tree_root *root,
  *
  * NOTE: This cannot be used to switch between non-entries (empty slots),
  * regular entries, and exceptional entries, as that requires accounting
- * inside the radix tree node. When switching from one type of entry to
- * another, use __radix_tree_lookup() and __radix_tree_replace().
+ * inside the radix tree node. When switching from one type of entry or
+ * deleting, use __radix_tree_lookup() and __radix_tree_replace().
  */
 void radix_tree_replace_slot(struct radix_tree_root *root,
                              void **slot, void *item)
@@ -1467,75 +1572,6 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
 #endif /* CONFIG_SHMEM && CONFIG_SWAP */
 
 /**
- * radix_tree_shrink - shrink radix tree to minimum height
- * @root        radix tree root
- */
-static inline bool radix_tree_shrink(struct radix_tree_root *root)
-{
-        bool shrunk = false;
-
-        for (;;) {
-                struct radix_tree_node *node = root->rnode;
-                struct radix_tree_node *child;
-
-                if (!radix_tree_is_internal_node(node))
-                        break;
-                node = entry_to_node(node);
-
-                /*
-                 * The candidate node has more than one child, or its child
-                 * is not at the leftmost slot, or the child is a multiorder
-                 * entry, we cannot shrink.
-                 */
-                if (node->count != 1)
-                        break;
-                child = node->slots[0];
-                if (!child)
-                        break;
-                if (!radix_tree_is_internal_node(child) && node->shift)
-                        break;
-
-                if (radix_tree_is_internal_node(child))
-                        entry_to_node(child)->parent = NULL;
-
-                /*
-                 * We don't need rcu_assign_pointer(), since we are simply
-                 * moving the node from one part of the tree to another: if it
-                 * was safe to dereference the old pointer to it
-                 * (node->slots[0]), it will be safe to dereference the new
-                 * one (root->rnode) as far as dependent read barriers go.
-                 */
-                root->rnode = child;
-
-                /*
-                 * We have a dilemma here. The node's slot[0] must not be
-                 * NULLed in case there are concurrent lookups expecting to
-                 * find the item. However if this was a bottom-level node,
-                 * then it may be subject to the slot pointer being visible
-                 * to callers dereferencing it. If item corresponding to
-                 * slot[0] is subsequently deleted, these callers would expect
-                 * their slot to become empty sooner or later.
-                 *
-                 * For example, lockless pagecache will look up a slot, deref
-                 * the page pointer, and if the page has 0 refcount it means it
-                 * was concurrently deleted from pagecache so try the deref
-                 * again. Fortunately there is already a requirement for logic
-                 * to retry the entire slot lookup -- the indirect pointer
-                 * problem (replacing direct root node with an indirect pointer
-                 * also results in a stale slot). So tag the slot as indirect
-                 * to force callers to retry.
-                 */
-                if (!radix_tree_is_internal_node(child))
-                        node->slots[0] = RADIX_TREE_RETRY;
-
-                radix_tree_node_free(node);
-                shrunk = true;
-        }
-
-        return shrunk;
-}
-
-/**
  * __radix_tree_delete_node - try to free node after clearing a slot
  * @root:       radix tree root
  * @node:       node containing @index
@@ -1549,33 +1585,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
 bool __radix_tree_delete_node(struct radix_tree_root *root,
                               struct radix_tree_node *node)
 {
-        bool deleted = false;
-
-        do {
-                struct radix_tree_node *parent;
-
-                if (node->count) {
-                        if (node == entry_to_node(root->rnode))
-                                deleted |= radix_tree_shrink(root);
-                        return deleted;
-                }
-
-                parent = node->parent;
-                if (parent) {
-                        parent->slots[node->offset] = NULL;
-                        parent->count--;
-                } else {
-                        root_tag_clear_all(root);
-                        root->rnode = NULL;
-                }
-
-                radix_tree_node_free(node);
-                deleted = true;
-
-                node = parent;
-        } while (node);
-
-        return deleted;
+        return delete_node(root, node);
 }
 
 static inline void delete_sibling_entries(struct radix_tree_node *node,
@@ -1632,12 +1642,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
                 node_tag_clear(root, node, tag, offset);
 
         delete_sibling_entries(node, node_to_entry(slot), offset);
-        node->slots[offset] = NULL;
-        node->count--;
-        if (radix_tree_exceptional_entry(entry))
-                node->exceptional--;
-
-        __radix_tree_delete_node(root, node);
+        __radix_tree_replace(root, node, slot, NULL);
 
         return entry;
 }