diff options
author | Jan Schmidt <list.btrfs@jan-o-sch.net> | 2012-05-16 12:25:47 -0400 |
---|---|---|
committer | Jan Schmidt <list.btrfs@jan-o-sch.net> | 2012-05-30 09:17:33 -0400 |
commit | 5d9e75c41d11ca79b1d1ff6ed17c88c9047d1fea (patch) | |
tree | c7e6e1011997dcba5fd9522cd4172b34dec15860 /fs/btrfs/ctree.c | |
parent | f3ea38da3e76455fbd6d405cdca4d050ed085458 (diff) |
Btrfs: add btrfs_search_old_slot
The tree modification log together with the current state of the tree gives
a consistent, old version of the tree. btrfs_search_old_slot is used to
search through this old version and return old (dummy!) extent buffers.
Naturally, this function cannot do any tree modifications.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Diffstat (limited to 'fs/btrfs/ctree.c')
-rw-r--r-- | fs/btrfs/ctree.c | 319 |
1 files changed, 315 insertions, 4 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 69ce3f7deeef..0954f1770fd0 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c | |||
@@ -960,6 +960,207 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, | |||
960 | return 0; | 960 | return 0; |
961 | } | 961 | } |
962 | 962 | ||
963 | /* | ||
964 | * returns the logical address of the oldest predecessor of the given root. | ||
965 | * entries older than time_seq are ignored. | ||
966 | */ | ||
967 | static struct tree_mod_elem * | ||
968 | __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, | ||
969 | struct btrfs_root *root, u64 time_seq) | ||
970 | { | ||
971 | struct tree_mod_elem *tm; | ||
972 | struct tree_mod_elem *found = NULL; | ||
973 | u64 root_logical = root->node->start; | ||
974 | int looped = 0; | ||
975 | |||
976 | if (!time_seq) | ||
977 | return 0; | ||
978 | |||
979 | /* | ||
980 | * the very last operation that's logged for a root is the replacement | ||
981 | * operation (if it is replaced at all). this has the index of the *new* | ||
982 | * root, making it the very first operation that's logged for this root. | ||
983 | */ | ||
984 | while (1) { | ||
985 | tm = tree_mod_log_search_oldest(fs_info, root_logical, | ||
986 | time_seq); | ||
987 | if (!looped && !tm) | ||
988 | return 0; | ||
989 | /* | ||
990 | * we must have key remove operations in the log before the | ||
991 | * replace operation. | ||
992 | */ | ||
993 | BUG_ON(!tm); | ||
994 | |||
995 | if (tm->op != MOD_LOG_ROOT_REPLACE) | ||
996 | break; | ||
997 | |||
998 | found = tm; | ||
999 | root_logical = tm->old_root.logical; | ||
1000 | BUG_ON(root_logical == root->node->start); | ||
1001 | looped = 1; | ||
1002 | } | ||
1003 | |||
1004 | return found; | ||
1005 | } | ||
1006 | |||
1007 | /* | ||
1008 | * tm is a pointer to the first operation to rewind within eb. then, all | ||
1009 | * previous operations will be rewinded (until we reach something older than | ||
1010 | * time_seq). | ||
1011 | */ | ||
1012 | static void | ||
1013 | __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, | ||
1014 | struct tree_mod_elem *first_tm) | ||
1015 | { | ||
1016 | u32 n; | ||
1017 | struct rb_node *next; | ||
1018 | struct tree_mod_elem *tm = first_tm; | ||
1019 | unsigned long o_dst; | ||
1020 | unsigned long o_src; | ||
1021 | unsigned long p_size = sizeof(struct btrfs_key_ptr); | ||
1022 | |||
1023 | n = btrfs_header_nritems(eb); | ||
1024 | while (tm && tm->elem.seq >= time_seq) { | ||
1025 | /* | ||
1026 | * all the operations are recorded with the operator used for | ||
1027 | * the modification. as we're going backwards, we do the | ||
1028 | * opposite of each operation here. | ||
1029 | */ | ||
1030 | switch (tm->op) { | ||
1031 | case MOD_LOG_KEY_REMOVE_WHILE_FREEING: | ||
1032 | BUG_ON(tm->slot < n); | ||
1033 | case MOD_LOG_KEY_REMOVE_WHILE_MOVING: | ||
1034 | case MOD_LOG_KEY_REMOVE: | ||
1035 | btrfs_set_node_key(eb, &tm->key, tm->slot); | ||
1036 | btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); | ||
1037 | btrfs_set_node_ptr_generation(eb, tm->slot, | ||
1038 | tm->generation); | ||
1039 | n++; | ||
1040 | break; | ||
1041 | case MOD_LOG_KEY_REPLACE: | ||
1042 | BUG_ON(tm->slot >= n); | ||
1043 | btrfs_set_node_key(eb, &tm->key, tm->slot); | ||
1044 | btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); | ||
1045 | btrfs_set_node_ptr_generation(eb, tm->slot, | ||
1046 | tm->generation); | ||
1047 | break; | ||
1048 | case MOD_LOG_KEY_ADD: | ||
1049 | if (tm->slot != n - 1) { | ||
1050 | o_dst = btrfs_node_key_ptr_offset(tm->slot); | ||
1051 | o_src = btrfs_node_key_ptr_offset(tm->slot + 1); | ||
1052 | memmove_extent_buffer(eb, o_dst, o_src, p_size); | ||
1053 | } | ||
1054 | n--; | ||
1055 | break; | ||
1056 | case MOD_LOG_MOVE_KEYS: | ||
1057 | memmove_extent_buffer(eb, tm->slot, tm->move.dst_slot, | ||
1058 | tm->move.nr_items * p_size); | ||
1059 | break; | ||
1060 | case MOD_LOG_ROOT_REPLACE: | ||
1061 | /* | ||
1062 | * this operation is special. for roots, this must be | ||
1063 | * handled explicitly before rewinding. | ||
1064 | * for non-roots, this operation may exist if the node | ||
1065 | * was a root: root A -> child B; then A gets empty and | ||
1066 | * B is promoted to the new root. in the mod log, we'll | ||
1067 | * have a root-replace operation for B, a tree block | ||
1068 | * that is no root. we simply ignore that operation. | ||
1069 | */ | ||
1070 | break; | ||
1071 | } | ||
1072 | next = rb_next(&tm->node); | ||
1073 | if (!next) | ||
1074 | break; | ||
1075 | tm = container_of(next, struct tree_mod_elem, node); | ||
1076 | if (tm->index != first_tm->index) | ||
1077 | break; | ||
1078 | } | ||
1079 | btrfs_set_header_nritems(eb, n); | ||
1080 | } | ||
1081 | |||
1082 | static struct extent_buffer * | ||
1083 | tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, | ||
1084 | u64 time_seq) | ||
1085 | { | ||
1086 | struct extent_buffer *eb_rewin; | ||
1087 | struct tree_mod_elem *tm; | ||
1088 | |||
1089 | if (!time_seq) | ||
1090 | return eb; | ||
1091 | |||
1092 | if (btrfs_header_level(eb) == 0) | ||
1093 | return eb; | ||
1094 | |||
1095 | tm = tree_mod_log_search(fs_info, eb->start, time_seq); | ||
1096 | if (!tm) | ||
1097 | return eb; | ||
1098 | |||
1099 | if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { | ||
1100 | BUG_ON(tm->slot != 0); | ||
1101 | eb_rewin = alloc_dummy_extent_buffer(eb->start, | ||
1102 | fs_info->tree_root->nodesize); | ||
1103 | BUG_ON(!eb_rewin); | ||
1104 | btrfs_set_header_bytenr(eb_rewin, eb->start); | ||
1105 | btrfs_set_header_backref_rev(eb_rewin, | ||
1106 | btrfs_header_backref_rev(eb)); | ||
1107 | btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); | ||
1108 | } else { | ||
1109 | eb_rewin = btrfs_clone_extent_buffer(eb); | ||
1110 | BUG_ON(!eb_rewin); | ||
1111 | } | ||
1112 | |||
1113 | extent_buffer_get(eb_rewin); | ||
1114 | free_extent_buffer(eb); | ||
1115 | |||
1116 | __tree_mod_log_rewind(eb_rewin, time_seq, tm); | ||
1117 | |||
1118 | return eb_rewin; | ||
1119 | } | ||
1120 | |||
1121 | static inline struct extent_buffer * | ||
1122 | get_old_root(struct btrfs_root *root, u64 time_seq) | ||
1123 | { | ||
1124 | struct tree_mod_elem *tm; | ||
1125 | struct extent_buffer *eb; | ||
1126 | struct tree_mod_root *old_root; | ||
1127 | u64 old_generation; | ||
1128 | |||
1129 | tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); | ||
1130 | if (!tm) | ||
1131 | return root->node; | ||
1132 | |||
1133 | old_root = &tm->old_root; | ||
1134 | old_generation = tm->generation; | ||
1135 | |||
1136 | tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq); | ||
1137 | /* | ||
1138 | * there was an item in the log when __tree_mod_log_oldest_root | ||
1139 | * returned. this one must not go away, because the time_seq passed to | ||
1140 | * us must be blocking its removal. | ||
1141 | */ | ||
1142 | BUG_ON(!tm); | ||
1143 | |||
1144 | if (old_root->logical == root->node->start) { | ||
1145 | /* there are logged operations for the current root */ | ||
1146 | eb = btrfs_clone_extent_buffer(root->node); | ||
1147 | } else { | ||
1148 | /* there's a root replace operation for the current root */ | ||
1149 | eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT, | ||
1150 | root->nodesize); | ||
1151 | btrfs_set_header_bytenr(eb, eb->start); | ||
1152 | btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); | ||
1153 | btrfs_set_header_owner(eb, root->root_key.objectid); | ||
1154 | } | ||
1155 | if (!eb) | ||
1156 | return NULL; | ||
1157 | btrfs_set_header_level(eb, old_root->level); | ||
1158 | btrfs_set_header_generation(eb, old_generation); | ||
1159 | __tree_mod_log_rewind(eb, time_seq, tm); | ||
1160 | |||
1161 | return eb; | ||
1162 | } | ||
1163 | |||
963 | static inline int should_cow_block(struct btrfs_trans_handle *trans, | 1164 | static inline int should_cow_block(struct btrfs_trans_handle *trans, |
964 | struct btrfs_root *root, | 1165 | struct btrfs_root *root, |
965 | struct extent_buffer *buf) | 1166 | struct extent_buffer *buf) |
@@ -1930,7 +2131,7 @@ static int | |||
1930 | read_block_for_search(struct btrfs_trans_handle *trans, | 2131 | read_block_for_search(struct btrfs_trans_handle *trans, |
1931 | struct btrfs_root *root, struct btrfs_path *p, | 2132 | struct btrfs_root *root, struct btrfs_path *p, |
1932 | struct extent_buffer **eb_ret, int level, int slot, | 2133 | struct extent_buffer **eb_ret, int level, int slot, |
1933 | struct btrfs_key *key) | 2134 | struct btrfs_key *key, u64 time_seq) |
1934 | { | 2135 | { |
1935 | u64 blocknr; | 2136 | u64 blocknr; |
1936 | u64 gen; | 2137 | u64 gen; |
@@ -2284,7 +2485,7 @@ cow_done: | |||
2284 | } | 2485 | } |
2285 | 2486 | ||
2286 | err = read_block_for_search(trans, root, p, | 2487 | err = read_block_for_search(trans, root, p, |
2287 | &b, level, slot, key); | 2488 | &b, level, slot, key, 0); |
2288 | if (err == -EAGAIN) | 2489 | if (err == -EAGAIN) |
2289 | goto again; | 2490 | goto again; |
2290 | if (err) { | 2491 | if (err) { |
@@ -2356,6 +2557,116 @@ done: | |||
2356 | } | 2557 | } |
2357 | 2558 | ||
2358 | /* | 2559 | /* |
2560 | * Like btrfs_search_slot, this looks for a key in the given tree. It uses the | ||
2561 | * current state of the tree together with the operations recorded in the tree | ||
2562 | * modification log to search for the key in a previous version of this tree, as | ||
2563 | * denoted by the time_seq parameter. | ||
2564 | * | ||
2565 | * Naturally, there is no support for insert, delete or cow operations. | ||
2566 | * | ||
2567 | * The resulting path and return value will be set up as if we called | ||
2568 | * btrfs_search_slot at that point in time with ins_len and cow both set to 0. | ||
2569 | */ | ||
2570 | int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, | ||
2571 | struct btrfs_path *p, u64 time_seq) | ||
2572 | { | ||
2573 | struct extent_buffer *b; | ||
2574 | int slot; | ||
2575 | int ret; | ||
2576 | int err; | ||
2577 | int level; | ||
2578 | int lowest_unlock = 1; | ||
2579 | u8 lowest_level = 0; | ||
2580 | |||
2581 | lowest_level = p->lowest_level; | ||
2582 | WARN_ON(p->nodes[0] != NULL); | ||
2583 | |||
2584 | if (p->search_commit_root) { | ||
2585 | BUG_ON(time_seq); | ||
2586 | return btrfs_search_slot(NULL, root, key, p, 0, 0); | ||
2587 | } | ||
2588 | |||
2589 | again: | ||
2590 | level = 0; | ||
2591 | b = get_old_root(root, time_seq); | ||
2592 | extent_buffer_get(b); | ||
2593 | level = btrfs_header_level(b); | ||
2594 | btrfs_tree_read_lock(b); | ||
2595 | p->locks[level] = BTRFS_READ_LOCK; | ||
2596 | |||
2597 | while (b) { | ||
2598 | level = btrfs_header_level(b); | ||
2599 | p->nodes[level] = b; | ||
2600 | btrfs_clear_path_blocking(p, NULL, 0); | ||
2601 | |||
2602 | /* | ||
2603 | * we have a lock on b and as long as we aren't changing | ||
2604 | * the tree, there is no way to for the items in b to change. | ||
2605 | * It is safe to drop the lock on our parent before we | ||
2606 | * go through the expensive btree search on b. | ||
2607 | */ | ||
2608 | btrfs_unlock_up_safe(p, level + 1); | ||
2609 | |||
2610 | ret = bin_search(b, key, level, &slot); | ||
2611 | |||
2612 | if (level != 0) { | ||
2613 | int dec = 0; | ||
2614 | if (ret && slot > 0) { | ||
2615 | dec = 1; | ||
2616 | slot -= 1; | ||
2617 | } | ||
2618 | p->slots[level] = slot; | ||
2619 | unlock_up(p, level, lowest_unlock, 0, NULL); | ||
2620 | |||
2621 | if (level == lowest_level) { | ||
2622 | if (dec) | ||
2623 | p->slots[level]++; | ||
2624 | goto done; | ||
2625 | } | ||
2626 | |||
2627 | err = read_block_for_search(NULL, root, p, &b, level, | ||
2628 | slot, key, time_seq); | ||
2629 | if (err == -EAGAIN) | ||
2630 | goto again; | ||
2631 | if (err) { | ||
2632 | ret = err; | ||
2633 | goto done; | ||
2634 | } | ||
2635 | |||
2636 | level = btrfs_header_level(b); | ||
2637 | err = btrfs_try_tree_read_lock(b); | ||
2638 | if (!err) { | ||
2639 | btrfs_set_path_blocking(p); | ||
2640 | btrfs_tree_read_lock(b); | ||
2641 | btrfs_clear_path_blocking(p, b, | ||
2642 | BTRFS_READ_LOCK); | ||
2643 | } | ||
2644 | p->locks[level] = BTRFS_READ_LOCK; | ||
2645 | p->nodes[level] = b; | ||
2646 | b = tree_mod_log_rewind(root->fs_info, b, time_seq); | ||
2647 | if (b != p->nodes[level]) { | ||
2648 | btrfs_tree_unlock_rw(p->nodes[level], | ||
2649 | p->locks[level]); | ||
2650 | p->locks[level] = 0; | ||
2651 | p->nodes[level] = b; | ||
2652 | } | ||
2653 | } else { | ||
2654 | p->slots[level] = slot; | ||
2655 | unlock_up(p, level, lowest_unlock, 0, NULL); | ||
2656 | goto done; | ||
2657 | } | ||
2658 | } | ||
2659 | ret = 1; | ||
2660 | done: | ||
2661 | if (!p->leave_spinning) | ||
2662 | btrfs_set_path_blocking(p); | ||
2663 | if (ret < 0) | ||
2664 | btrfs_release_path(p); | ||
2665 | |||
2666 | return ret; | ||
2667 | } | ||
2668 | |||
2669 | /* | ||
2359 | * adjust the pointers going up the tree, starting at level | 2670 | * adjust the pointers going up the tree, starting at level |
2360 | * making sure the right key of each node is points to 'key'. | 2671 | * making sure the right key of each node is points to 'key'. |
2361 | * This is used after shifting pointers to the left, so it stops | 2672 | * This is used after shifting pointers to the left, so it stops |
@@ -4735,7 +5046,7 @@ again: | |||
4735 | next = c; | 5046 | next = c; |
4736 | next_rw_lock = path->locks[level]; | 5047 | next_rw_lock = path->locks[level]; |
4737 | ret = read_block_for_search(NULL, root, path, &next, level, | 5048 | ret = read_block_for_search(NULL, root, path, &next, level, |
4738 | slot, &key); | 5049 | slot, &key, 0); |
4739 | if (ret == -EAGAIN) | 5050 | if (ret == -EAGAIN) |
4740 | goto again; | 5051 | goto again; |
4741 | 5052 | ||
@@ -4772,7 +5083,7 @@ again: | |||
4772 | break; | 5083 | break; |
4773 | 5084 | ||
4774 | ret = read_block_for_search(NULL, root, path, &next, level, | 5085 | ret = read_block_for_search(NULL, root, path, &next, level, |
4775 | 0, &key); | 5086 | 0, &key, 0); |
4776 | if (ret == -EAGAIN) | 5087 | if (ret == -EAGAIN) |
4777 | goto again; | 5088 | goto again; |
4778 | 5089 | ||