aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2008-12-08 16:58:54 -0500
committerChris Mason <chris.mason@oracle.com>2008-12-08 16:58:54 -0500
commitd20f7043fa65659136c1a7c3c456eeeb5c6f431f (patch)
tree05d1031cadec6d440a97221e3a32adb504a51699 /fs/btrfs/tree-log.c
parentc99e905c945c462085c6d64646dc5af0c0a16815 (diff)
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have been stored in the subvolume trees, indexed by the inode that is referencing the data block. This means that when we read the inode, we've probably read in at least some checksums as well. But, this has a few problems: * The checksums are indexed by logical offset in the file. When compression is on, this means we have to do the expensive checksumming on the uncompressed data. It would be faster if we could checksum the compressed data instead. * If we implement encryption, we'll be checksumming the plain text and storing that on disk. This is significantly less secure. * For either compression or encryption, we have to get the plain text back before we can verify the checksum as correct. This makes the raid layer balancing and extent moving much more expensive. * It makes the front end caching code more complex, as we have touch the subvolume and inodes as we cache extents. * There is potentitally one copy of the checksum in each subvolume referencing an extent. The solution used here is to store the extent checksums in a dedicated tree. This allows us to index the checksums by phyiscal extent start and length. It means: * The checksum is against the data stored on disk, after any compression or encryption is done. * The checksum is stored in a central location, and can be verified without following back references, or reading inodes. This makes compression significantly faster by reducing the amount of data that needs to be checksummed. It will also allow much faster raid management code in general. The checksums are indexed by a key with a fixed objectid (a magic value in ctree.h) and offset set to the starting byte of the extent. This allows us to copy the checksum items into the fsync log tree directly (or any other tree), without having to invent a second format for them. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c121
1 files changed, 108 insertions, 13 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c766649ad453..08469ec05850 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -934,24 +934,17 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
934 unsigned long file_bytes; 934 unsigned long file_bytes;
935 struct btrfs_ordered_sum *sums; 935 struct btrfs_ordered_sum *sums;
936 struct btrfs_sector_sum *sector_sum; 936 struct btrfs_sector_sum *sector_sum;
937 struct inode *inode;
938 unsigned long ptr; 937 unsigned long ptr;
939 938
940 file_bytes = (item_size / csum_size) * root->sectorsize; 939 file_bytes = (item_size / csum_size) * root->sectorsize;
941 inode = read_one_inode(root, key->objectid);
942 if (!inode) {
943 return -EIO;
944 }
945
946 sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS); 940 sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
947 if (!sums) { 941 if (!sums) {
948 iput(inode);
949 return -ENOMEM; 942 return -ENOMEM;
950 } 943 }
951 944
952 INIT_LIST_HEAD(&sums->list); 945 INIT_LIST_HEAD(&sums->list);
953 sums->len = file_bytes; 946 sums->len = file_bytes;
954 sums->file_offset = key->offset; 947 sums->bytenr = key->offset;
955 948
956 /* 949 /*
957 * copy all the sums into the ordered sum struct 950 * copy all the sums into the ordered sum struct
@@ -960,7 +953,7 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
960 cur_offset = key->offset; 953 cur_offset = key->offset;
961 ptr = btrfs_item_ptr_offset(eb, slot); 954 ptr = btrfs_item_ptr_offset(eb, slot);
962 while(item_size > 0) { 955 while(item_size > 0) {
963 sector_sum->offset = cur_offset; 956 sector_sum->bytenr = cur_offset;
964 read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size); 957 read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size);
965 sector_sum++; 958 sector_sum++;
966 item_size -= csum_size; 959 item_size -= csum_size;
@@ -969,11 +962,9 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
969 } 962 }
970 963
971 /* let btrfs_csum_file_blocks add them into the file */ 964 /* let btrfs_csum_file_blocks add them into the file */
972 ret = btrfs_csum_file_blocks(trans, root, inode, sums); 965 ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums);
973 BUG_ON(ret); 966 BUG_ON(ret);
974 kfree(sums); 967 kfree(sums);
975 iput(inode);
976
977 return 0; 968 return 0;
978} 969}
979/* 970/*
@@ -1670,7 +1661,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1670 ret = replay_one_extent(wc->trans, root, path, 1661 ret = replay_one_extent(wc->trans, root, path,
1671 eb, i, &key); 1662 eb, i, &key);
1672 BUG_ON(ret); 1663 BUG_ON(ret);
1673 } else if (key.type == BTRFS_CSUM_ITEM_KEY) { 1664 } else if (key.type == BTRFS_EXTENT_CSUM_KEY) {
1674 ret = replay_one_csum(wc->trans, root, path, 1665 ret = replay_one_csum(wc->trans, root, path,
1675 eb, i, &key); 1666 eb, i, &key);
1676 BUG_ON(ret); 1667 BUG_ON(ret);
@@ -2466,6 +2457,85 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2466 return 0; 2457 return 0;
2467} 2458}
2468 2459
2460static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
2461 struct list_head *list,
2462 struct btrfs_root *root,
2463 u64 disk_bytenr, u64 len)
2464{
2465 struct btrfs_ordered_sum *sums;
2466 struct btrfs_sector_sum *sector_sum;
2467 int ret;
2468 struct btrfs_path *path;
2469 struct btrfs_csum_item *item = NULL;
2470 u64 end = disk_bytenr + len;
2471 u64 item_start_offset = 0;
2472 u64 item_last_offset = 0;
2473 u32 diff;
2474 u32 sum;
2475 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
2476
2477 sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
2478
2479 sector_sum = sums->sums;
2480 sums->bytenr = disk_bytenr;
2481 sums->len = len;
2482 list_add_tail(&sums->list, list);
2483
2484 path = btrfs_alloc_path();
2485 while(disk_bytenr < end) {
2486 if (!item || disk_bytenr < item_start_offset ||
2487 disk_bytenr >= item_last_offset) {
2488 struct btrfs_key found_key;
2489 u32 item_size;
2490
2491 if (item)
2492 btrfs_release_path(root, path);
2493 item = btrfs_lookup_csum(NULL, root, path,
2494 disk_bytenr, 0);
2495 if (IS_ERR(item)) {
2496 ret = PTR_ERR(item);
2497 if (ret == -ENOENT || ret == -EFBIG)
2498 ret = 0;
2499 sum = 0;
2500 printk("log no csum found for byte %llu\n",
2501 (unsigned long long)disk_bytenr);
2502 item = NULL;
2503 btrfs_release_path(root, path);
2504 goto found;
2505 }
2506 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2507 path->slots[0]);
2508
2509 item_start_offset = found_key.offset;
2510 item_size = btrfs_item_size_nr(path->nodes[0],
2511 path->slots[0]);
2512 item_last_offset = item_start_offset +
2513 (item_size / csum_size) *
2514 root->sectorsize;
2515 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2516 struct btrfs_csum_item);
2517 }
2518 /*
2519 * this byte range must be able to fit inside
2520 * a single leaf so it will also fit inside a u32
2521 */
2522 diff = disk_bytenr - item_start_offset;
2523 diff = diff / root->sectorsize;
2524 diff = diff * csum_size;
2525
2526 read_extent_buffer(path->nodes[0], &sum,
2527 ((unsigned long)item) + diff,
2528 csum_size);
2529found:
2530 sector_sum->bytenr = disk_bytenr;
2531 sector_sum->sum = sum;
2532 disk_bytenr += root->sectorsize;
2533 sector_sum++;
2534 }
2535 btrfs_free_path(path);
2536 return 0;
2537}
2538
2469static noinline int copy_items(struct btrfs_trans_handle *trans, 2539static noinline int copy_items(struct btrfs_trans_handle *trans,
2470 struct btrfs_root *log, 2540 struct btrfs_root *log,
2471 struct btrfs_path *dst_path, 2541 struct btrfs_path *dst_path,
@@ -2481,6 +2551,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2481 u32 *ins_sizes; 2551 u32 *ins_sizes;
2482 char *ins_data; 2552 char *ins_data;
2483 int i; 2553 int i;
2554 struct list_head ordered_sums;
2555
2556 INIT_LIST_HEAD(&ordered_sums);
2484 2557
2485 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2558 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2486 nr * sizeof(u32), GFP_NOFS); 2559 nr * sizeof(u32), GFP_NOFS);
@@ -2535,6 +2608,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2535 extent); 2608 extent);
2536 u64 dl = btrfs_file_extent_disk_num_bytes(src, 2609 u64 dl = btrfs_file_extent_disk_num_bytes(src,
2537 extent); 2610 extent);
2611 u64 cs = btrfs_file_extent_offset(src, extent);
2612 u64 cl = btrfs_file_extent_num_bytes(src,
2613 extent);;
2538 /* ds == 0 is a hole */ 2614 /* ds == 0 is a hole */
2539 if (ds != 0) { 2615 if (ds != 0) {
2540 ret = btrfs_inc_extent_ref(trans, log, 2616 ret = btrfs_inc_extent_ref(trans, log,
@@ -2544,6 +2620,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2544 trans->transid, 2620 trans->transid,
2545 ins_keys[i].objectid); 2621 ins_keys[i].objectid);
2546 BUG_ON(ret); 2622 BUG_ON(ret);
2623 ret = copy_extent_csums(trans,
2624 &ordered_sums,
2625 log->fs_info->csum_root,
2626 ds + cs, cl);
2627 BUG_ON(ret);
2547 } 2628 }
2548 } 2629 }
2549 } 2630 }
@@ -2553,6 +2634,20 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2553 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 2634 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2554 btrfs_release_path(log, dst_path); 2635 btrfs_release_path(log, dst_path);
2555 kfree(ins_data); 2636 kfree(ins_data);
2637
2638 /*
2639 * we have to do this after the loop above to avoid changing the
2640 * log tree while trying to change the log tree.
2641 */
2642 while(!list_empty(&ordered_sums)) {
2643 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2644 struct btrfs_ordered_sum,
2645 list);
2646 ret = btrfs_csum_file_blocks(trans, log, sums);
2647 BUG_ON(ret);
2648 list_del(&sums->list);
2649 kfree(sums);
2650 }
2556 return 0; 2651 return 0;
2557} 2652}
2558 2653