aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/ioctl.c
diff options
context:
space:
mode:
author Chris Mason <chris.mason@oracle.com> 2008-12-08 16:58:54 -0500
committer Chris Mason <chris.mason@oracle.com> 2008-12-08 16:58:54 -0500
commit d20f7043fa65659136c1a7c3c456eeeb5c6f431f (patch)
tree 05d1031cadec6d440a97221e3a32adb504a51699 /fs/btrfs/ioctl.c
parent c99e905c945c462085c6d64646dc5af0c0a16815 (diff)
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have been stored in the subvolume trees, indexed by the inode that is referencing the data block. This means that when we read the inode, we've probably read in at least some checksums as well. But, this has a few problems: * The checksums are indexed by logical offset in the file. When compression is on, this means we have to do the expensive checksumming on the uncompressed data. It would be faster if we could checksum the compressed data instead. * If we implement encryption, we'll be checksumming the plain text and storing that on disk. This is significantly less secure. * For either compression or encryption, we have to get the plain text back before we can verify the checksum as correct. This makes the raid layer balancing and extent moving much more expensive. * It makes the front end caching code more complex, as we have to touch the subvolume and inodes as we cache extents. * There is potentially one copy of the checksum in each subvolume referencing an extent. The solution used here is to store the extent checksums in a dedicated tree. This allows us to index the checksums by physical extent start and length. It means: * The checksum is against the data stored on disk, after any compression or encryption is done. * The checksum is stored in a central location, and can be verified without following back references, or reading inodes. This makes compression significantly faster by reducing the amount of data that needs to be checksummed. It will also allow much faster raid management code in general. The checksums are indexed by a key with a fixed objectid (a magic value in ctree.h) and offset set to the starting byte of the extent. This allows us to copy the checksum items into the fsync log tree directly (or any other tree), without having to invent a second format for them. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/ioctl.c')
-rw-r--r--fs/btrfs/ioctl.c55
1 files changed, 2 insertions, 53 deletions
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b4da53d55c82..6228b69c2b93 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -714,8 +714,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
714 u64 len = olen; 714 u64 len = olen;
715 u64 bs = root->fs_info->sb->s_blocksize; 715 u64 bs = root->fs_info->sb->s_blocksize;
716 u64 hint_byte; 716 u64 hint_byte;
717 u16 csum_size = 717
718 btrfs_super_csum_size(&root->fs_info->super_copy);
719 /* 718 /*
720 * TODO: 719 * TODO:
721 * - split compressed inline extents. annoying: we need to 720 * - split compressed inline extents. annoying: we need to
@@ -833,7 +832,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
833 slot = path->slots[0]; 832 slot = path->slots[0];
834 833
835 btrfs_item_key_to_cpu(leaf, &key, slot); 834 btrfs_item_key_to_cpu(leaf, &key, slot);
836 if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY || 835 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
837 key.objectid != src->i_ino) 836 key.objectid != src->i_ino)
838 break; 837 break;
839 838
@@ -958,56 +957,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
958 btrfs_mark_buffer_dirty(leaf); 957 btrfs_mark_buffer_dirty(leaf);
959 } 958 }
960 959
961 if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
962 u32 size;
963 struct btrfs_key new_key;
964 u64 coverslen;
965 int coff, clen;
966
967 size = btrfs_item_size_nr(leaf, slot);
968 coverslen = (size / csum_size) <<
969 root->fs_info->sb->s_blocksize_bits;
970 printk("csums for %llu~%llu\n",
971 key.offset, coverslen);
972 if (key.offset + coverslen < off ||
973 key.offset >= off+len)
974 goto next;
975
976 read_extent_buffer(leaf, buf,
977 btrfs_item_ptr_offset(leaf, slot),
978 size);
979 btrfs_release_path(root, path);
980
981 coff = 0;
982 if (off > key.offset)
983 coff = ((off - key.offset) >>
984 root->fs_info->sb->s_blocksize_bits) *
985 csum_size;
986 clen = size - coff;
987 if (key.offset + coverslen > off+len)
988 clen -= ((key.offset+coverslen-off-len) >>
989 root->fs_info->sb->s_blocksize_bits) *
990 csum_size;
991 printk(" will dup %d~%d of %d\n",
992 coff, clen, size);
993
994 memcpy(&new_key, &key, sizeof(new_key));
995 new_key.objectid = inode->i_ino;
996 new_key.offset = key.offset + destoff - off;
997
998 ret = btrfs_insert_empty_item(trans, root, path,
999 &new_key, clen);
1000 if (ret)
1001 goto out;
1002
1003 leaf = path->nodes[0];
1004 slot = path->slots[0];
1005 write_extent_buffer(leaf, buf + coff,
1006 btrfs_item_ptr_offset(leaf, slot),
1007 clen);
1008 btrfs_mark_buffer_dirty(leaf);
1009 }
1010
1011 next: 960 next:
1012 btrfs_release_path(root, path); 961 btrfs_release_path(root, path);
1013 key.offset++; 962 key.offset++;