aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/file.c
diff options
context:
space:
mode:
authorYan Zheng <zheng.yan@oracle.com>2009-06-10 10:45:14 -0400
committerChris Mason <chris.mason@oracle.com>2009-06-10 11:29:46 -0400
commit5d4f98a28c7d334091c1b7744f48a1acdd2a4ae0 (patch)
treec611d7d824cbcdb777dd2d8e33e2ed1c5df8a9c6 /fs/btrfs/file.c
parent5c939df56c3ea018b58e5aa76181284c2053d699 (diff)
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata. Once a filesystem has been mounted with this commit, IT WILL NO LONGER BE MOUNTABLE BY OLDER KERNELS. When a tree block in subvolume tree is cow'd, the reference counts of all extents it points to are increased by one. At transaction commit time, the old root of the subvolume is recorded in a "dead root" data structure, and the btree it points to is later walked, dropping reference counts and freeing any blocks where the reference count goes to 0. The increments done during cow and decrements done after commit cancel out, and the walk is a very expensive way to go about freeing the blocks that are no longer referenced by the new btree root. This commit reduces the transaction overhead by avoiding the need for dead root records. When a non-shared tree block is cow'd, we free the old block at once, and the new block inherits old block's references. When a tree block with reference count > 1 is cow'd, we increase the reference counts of all extents the new block points to by one, and decrease the old block's reference count by one. This dead tree avoidance code removes the need to modify the reference counts of lower level extents when a non-shared tree block is cow'd. But we still need to update back ref for all pointers in the block. This is because the location of the block is recorded in the back ref item. We can solve this by introducing a new type of back ref. The new back ref provides information about pointer's key, level and in which tree the pointer lives. This information allow us to find the pointer by searching the tree. The shortcoming of the new back ref is that it only works for pointers in tree blocks referenced by their owner trees. This is mostly a problem for snapshots, where resolving one of these fuzzy back references would be O(number_of_snapshots) and quite slow. The solution used here is to use the fuzzy back references in the common case where a given tree block is only referenced by one root, and use the full back references when multiple roots have a reference on a given block. This commit adds per subvolume red-black tree to keep trace of cached inodes. The red-black tree helps the balancing code to find cached inodes whose inode numbers within a given range. This commit improves the balancing code by introducing several data structures to keep the state of balancing. The most important one is the back ref cache. It caches how the upper level tree blocks are referenced. This greatly reduce the overhead of checking back ref. The improved balancing code scales significantly better with a large number of snapshots. This is a very large commit and was written in a number of pieces. But, they depend heavily on the disk format change and were squashed together to make sure git bisect didn't end up in a bad state wrt space balancing or the format change. Signed-off-by: Yan Zheng <zheng.yan@oracle.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--fs/btrfs/file.c76
1 files changed, 22 insertions, 54 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1d51dc38bb49..0726a734ee38 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -291,16 +291,12 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
291{ 291{
292 u64 extent_end = 0; 292 u64 extent_end = 0;
293 u64 search_start = start; 293 u64 search_start = start;
294 u64 leaf_start;
295 u64 ram_bytes = 0; 294 u64 ram_bytes = 0;
296 u64 orig_parent = 0;
297 u64 disk_bytenr = 0; 295 u64 disk_bytenr = 0;
298 u64 orig_locked_end = locked_end; 296 u64 orig_locked_end = locked_end;
299 u8 compression; 297 u8 compression;
300 u8 encryption; 298 u8 encryption;
301 u16 other_encoding = 0; 299 u16 other_encoding = 0;
302 u64 root_gen;
303 u64 root_owner;
304 struct extent_buffer *leaf; 300 struct extent_buffer *leaf;
305 struct btrfs_file_extent_item *extent; 301 struct btrfs_file_extent_item *extent;
306 struct btrfs_path *path; 302 struct btrfs_path *path;
@@ -340,9 +336,6 @@ next_slot:
340 bookend = 0; 336 bookend = 0;
341 found_extent = 0; 337 found_extent = 0;
342 found_inline = 0; 338 found_inline = 0;
343 leaf_start = 0;
344 root_gen = 0;
345 root_owner = 0;
346 compression = 0; 339 compression = 0;
347 encryption = 0; 340 encryption = 0;
348 extent = NULL; 341 extent = NULL;
@@ -417,9 +410,6 @@ next_slot:
417 if (found_extent) { 410 if (found_extent) {
418 read_extent_buffer(leaf, &old, (unsigned long)extent, 411 read_extent_buffer(leaf, &old, (unsigned long)extent,
419 sizeof(old)); 412 sizeof(old));
420 root_gen = btrfs_header_generation(leaf);
421 root_owner = btrfs_header_owner(leaf);
422 leaf_start = leaf->start;
423 } 413 }
424 414
425 if (end < extent_end && end >= key.offset) { 415 if (end < extent_end && end >= key.offset) {
@@ -443,14 +433,14 @@ next_slot:
443 } 433 }
444 locked_end = extent_end; 434 locked_end = extent_end;
445 } 435 }
446 orig_parent = path->nodes[0]->start;
447 disk_bytenr = le64_to_cpu(old.disk_bytenr); 436 disk_bytenr = le64_to_cpu(old.disk_bytenr);
448 if (disk_bytenr != 0) { 437 if (disk_bytenr != 0) {
449 ret = btrfs_inc_extent_ref(trans, root, 438 ret = btrfs_inc_extent_ref(trans, root,
450 disk_bytenr, 439 disk_bytenr,
451 le64_to_cpu(old.disk_num_bytes), 440 le64_to_cpu(old.disk_num_bytes), 0,
452 orig_parent, root->root_key.objectid, 441 root->root_key.objectid,
453 trans->transid, inode->i_ino); 442 key.objectid, key.offset -
443 le64_to_cpu(old.offset));
454 BUG_ON(ret); 444 BUG_ON(ret);
455 } 445 }
456 } 446 }
@@ -568,17 +558,6 @@ next_slot:
568 btrfs_mark_buffer_dirty(path->nodes[0]); 558 btrfs_mark_buffer_dirty(path->nodes[0]);
569 btrfs_set_lock_blocking(path->nodes[0]); 559 btrfs_set_lock_blocking(path->nodes[0]);
570 560
571 if (disk_bytenr != 0) {
572 ret = btrfs_update_extent_ref(trans, root,
573 disk_bytenr,
574 le64_to_cpu(old.disk_num_bytes),
575 orig_parent,
576 leaf->start,
577 root->root_key.objectid,
578 trans->transid, ins.objectid);
579
580 BUG_ON(ret);
581 }
582 path->leave_spinning = 0; 561 path->leave_spinning = 0;
583 btrfs_release_path(root, path); 562 btrfs_release_path(root, path);
584 if (disk_bytenr != 0) 563 if (disk_bytenr != 0)
@@ -594,8 +573,9 @@ next_slot:
594 ret = btrfs_free_extent(trans, root, 573 ret = btrfs_free_extent(trans, root,
595 old_disk_bytenr, 574 old_disk_bytenr,
596 le64_to_cpu(old.disk_num_bytes), 575 le64_to_cpu(old.disk_num_bytes),
597 leaf_start, root_owner, 576 0, root->root_key.objectid,
598 root_gen, key.objectid, 0); 577 key.objectid, key.offset -
578 le64_to_cpu(old.offset));
599 BUG_ON(ret); 579 BUG_ON(ret);
600 *hint_byte = old_disk_bytenr; 580 *hint_byte = old_disk_bytenr;
601 } 581 }
@@ -664,12 +644,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
664 u64 bytenr; 644 u64 bytenr;
665 u64 num_bytes; 645 u64 num_bytes;
666 u64 extent_end; 646 u64 extent_end;
667 u64 extent_offset; 647 u64 orig_offset;
668 u64 other_start; 648 u64 other_start;
669 u64 other_end; 649 u64 other_end;
670 u64 split = start; 650 u64 split = start;
671 u64 locked_end = end; 651 u64 locked_end = end;
672 u64 orig_parent;
673 int extent_type; 652 int extent_type;
674 int split_end = 1; 653 int split_end = 1;
675 int ret; 654 int ret;
@@ -703,7 +682,7 @@ again:
703 682
704 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 683 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
705 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 684 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
706 extent_offset = btrfs_file_extent_offset(leaf, fi); 685 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
707 686
708 if (key.offset == start) 687 if (key.offset == start)
709 split = end; 688 split = end;
@@ -711,8 +690,6 @@ again:
711 if (key.offset == start && extent_end == end) { 690 if (key.offset == start && extent_end == end) {
712 int del_nr = 0; 691 int del_nr = 0;
713 int del_slot = 0; 692 int del_slot = 0;
714 u64 leaf_owner = btrfs_header_owner(leaf);
715 u64 leaf_gen = btrfs_header_generation(leaf);
716 other_start = end; 693 other_start = end;
717 other_end = 0; 694 other_end = 0;
718 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, 695 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
@@ -721,8 +698,8 @@ again:
721 del_slot = path->slots[0] + 1; 698 del_slot = path->slots[0] + 1;
722 del_nr++; 699 del_nr++;
723 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 700 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
724 leaf->start, leaf_owner, 701 0, root->root_key.objectid,
725 leaf_gen, inode->i_ino, 0); 702 inode->i_ino, orig_offset);
726 BUG_ON(ret); 703 BUG_ON(ret);
727 } 704 }
728 other_start = 0; 705 other_start = 0;
@@ -733,8 +710,8 @@ again:
733 del_slot = path->slots[0]; 710 del_slot = path->slots[0];
734 del_nr++; 711 del_nr++;
735 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 712 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
736 leaf->start, leaf_owner, 713 0, root->root_key.objectid,
737 leaf_gen, inode->i_ino, 0); 714 inode->i_ino, orig_offset);
738 BUG_ON(ret); 715 BUG_ON(ret);
739 } 716 }
740 split_end = 0; 717 split_end = 0;
@@ -768,13 +745,12 @@ again:
768 locked_end = extent_end; 745 locked_end = extent_end;
769 } 746 }
770 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); 747 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
771 extent_offset += split - key.offset;
772 } else { 748 } else {
773 BUG_ON(key.offset != start); 749 BUG_ON(key.offset != start);
774 btrfs_set_file_extent_offset(leaf, fi, extent_offset +
775 split - key.offset);
776 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
777 key.offset = split; 750 key.offset = split;
751 btrfs_set_file_extent_offset(leaf, fi, key.offset -
752 orig_offset);
753 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
778 btrfs_set_item_key_safe(trans, root, path, &key); 754 btrfs_set_item_key_safe(trans, root, path, &key);
779 extent_end = split; 755 extent_end = split;
780 } 756 }
@@ -793,7 +769,8 @@ again:
793 struct btrfs_file_extent_item); 769 struct btrfs_file_extent_item);
794 key.offset = split; 770 key.offset = split;
795 btrfs_set_item_key_safe(trans, root, path, &key); 771 btrfs_set_item_key_safe(trans, root, path, &key);
796 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 772 btrfs_set_file_extent_offset(leaf, fi, key.offset -
773 orig_offset);
797 btrfs_set_file_extent_num_bytes(leaf, fi, 774 btrfs_set_file_extent_num_bytes(leaf, fi,
798 other_end - split); 775 other_end - split);
799 goto done; 776 goto done;
@@ -815,10 +792,9 @@ again:
815 792
816 btrfs_mark_buffer_dirty(leaf); 793 btrfs_mark_buffer_dirty(leaf);
817 794
818 orig_parent = leaf->start; 795 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
819 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 796 root->root_key.objectid,
820 orig_parent, root->root_key.objectid, 797 inode->i_ino, orig_offset);
821 trans->transid, inode->i_ino);
822 BUG_ON(ret); 798 BUG_ON(ret);
823 btrfs_release_path(root, path); 799 btrfs_release_path(root, path);
824 800
@@ -833,20 +809,12 @@ again:
833 btrfs_set_file_extent_type(leaf, fi, extent_type); 809 btrfs_set_file_extent_type(leaf, fi, extent_type);
834 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); 810 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
835 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); 811 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
836 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 812 btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset);
837 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); 813 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
838 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 814 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
839 btrfs_set_file_extent_compression(leaf, fi, 0); 815 btrfs_set_file_extent_compression(leaf, fi, 0);
840 btrfs_set_file_extent_encryption(leaf, fi, 0); 816 btrfs_set_file_extent_encryption(leaf, fi, 0);
841 btrfs_set_file_extent_other_encoding(leaf, fi, 0); 817 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
842
843 if (orig_parent != leaf->start) {
844 ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
845 orig_parent, leaf->start,
846 root->root_key.objectid,
847 trans->transid, inode->i_ino);
848 BUG_ON(ret);
849 }
850done: 818done:
851 btrfs_mark_buffer_dirty(leaf); 819 btrfs_mark_buffer_dirty(leaf);
852 820