aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2008-12-08 16:58:54 -0500
committerChris Mason <chris.mason@oracle.com>2008-12-08 16:58:54 -0500
commitd20f7043fa65659136c1a7c3c456eeeb5c6f431f (patch)
tree05d1031cadec6d440a97221e3a32adb504a51699 /fs/btrfs/inode.c
parentc99e905c945c462085c6d64646dc5af0c0a16815 (diff)
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have been stored in the subvolume trees, indexed by the inode that is referencing the data block. This means that when we read the inode, we've probably read in at least some checksums as well. But, this has a few problems: * The checksums are indexed by logical offset in the file. When compression is on, this means we have to do the expensive checksumming on the uncompressed data. It would be faster if we could checksum the compressed data instead. * If we implement encryption, we'll be checksumming the plain text and storing that on disk. This is significantly less secure. * For either compression or encryption, we have to get the plain text back before we can verify the checksum as correct. This makes the raid layer balancing and extent moving much more expensive. * It makes the front end caching code more complex, as we have touch the subvolume and inodes as we cache extents. * There is potentitally one copy of the checksum in each subvolume referencing an extent. The solution used here is to store the extent checksums in a dedicated tree. This allows us to index the checksums by phyiscal extent start and length. It means: * The checksum is against the data stored on disk, after any compression or encryption is done. * The checksum is stored in a central location, and can be verified without following back references, or reading inodes. This makes compression significantly faster by reducing the amount of data that needs to be checksummed. It will also allow much faster raid management code in general. The checksums are indexed by a key with a fixed objectid (a magic value in ctree.h) and offset set to the starting byte of the extent. This allows us to copy the checksum items into the fsync log tree directly (or any other tree), without having to invent a second format for them. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c45
1 files changed, 24 insertions, 21 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 09efc9473a3d..c03d847b8c4e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1221,7 +1221,7 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio
1221 struct btrfs_root *root = BTRFS_I(inode)->root; 1221 struct btrfs_root *root = BTRFS_I(inode)->root;
1222 int ret = 0; 1222 int ret = 0;
1223 1223
1224 ret = btrfs_csum_one_bio(root, inode, bio); 1224 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1225 BUG_ON(ret); 1225 BUG_ON(ret);
1226 return 0; 1226 return 0;
1227} 1227}
@@ -1259,12 +1259,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1259 btrfs_test_flag(inode, NODATASUM); 1259 btrfs_test_flag(inode, NODATASUM);
1260 1260
1261 if (!(rw & (1 << BIO_RW))) { 1261 if (!(rw & (1 << BIO_RW))) {
1262 1262 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1263 if (bio_flags & EXTENT_BIO_COMPRESSED)
1264 return btrfs_submit_compressed_read(inode, bio, 1263 return btrfs_submit_compressed_read(inode, bio,
1265 mirror_num, bio_flags); 1264 mirror_num, bio_flags);
1266 else if (!skip_sum) 1265 } else if (!skip_sum)
1267 btrfs_lookup_bio_sums(root, inode, bio); 1266 btrfs_lookup_bio_sums(root, inode, bio, NULL);
1268 goto mapit; 1267 goto mapit;
1269 } else if (!skip_sum) { 1268 } else if (!skip_sum) {
1270 /* we're doing a write, do the async checksumming */ 1269 /* we're doing a write, do the async checksumming */
@@ -1292,8 +1291,8 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1292 btrfs_set_trans_block_group(trans, inode); 1291 btrfs_set_trans_block_group(trans, inode);
1293 list_for_each(cur, list) { 1292 list_for_each(cur, list) {
1294 sum = list_entry(cur, struct btrfs_ordered_sum, list); 1293 sum = list_entry(cur, struct btrfs_ordered_sum, list);
1295 btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root, 1294 btrfs_csum_file_blocks(trans,
1296 inode, sum); 1295 BTRFS_I(inode)->root->fs_info->csum_root, sum);
1297 } 1296 }
1298 return 0; 1297 return 0;
1299} 1298}
@@ -1545,6 +1544,7 @@ struct io_failure_record {
1545 u64 start; 1544 u64 start;
1546 u64 len; 1545 u64 len;
1547 u64 logical; 1546 u64 logical;
1547 unsigned long bio_flags;
1548 int last_mirror; 1548 int last_mirror;
1549}; 1549};
1550 1550
@@ -1563,7 +1563,6 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1563 int ret; 1563 int ret;
1564 int rw; 1564 int rw;
1565 u64 logical; 1565 u64 logical;
1566 unsigned long bio_flags = 0;
1567 1566
1568 ret = get_state_private(failure_tree, start, &private); 1567 ret = get_state_private(failure_tree, start, &private);
1569 if (ret) { 1568 if (ret) {
@@ -1573,6 +1572,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1573 failrec->start = start; 1572 failrec->start = start;
1574 failrec->len = end - start + 1; 1573 failrec->len = end - start + 1;
1575 failrec->last_mirror = 0; 1574 failrec->last_mirror = 0;
1575 failrec->bio_flags = 0;
1576 1576
1577 spin_lock(&em_tree->lock); 1577 spin_lock(&em_tree->lock);
1578 em = lookup_extent_mapping(em_tree, start, failrec->len); 1578 em = lookup_extent_mapping(em_tree, start, failrec->len);
@@ -1588,8 +1588,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1588 } 1588 }
1589 logical = start - em->start; 1589 logical = start - em->start;
1590 logical = em->block_start + logical; 1590 logical = em->block_start + logical;
1591 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 1591 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1592 bio_flags = EXTENT_BIO_COMPRESSED; 1592 logical = em->block_start;
1593 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1594 }
1593 failrec->logical = logical; 1595 failrec->logical = logical;
1594 free_extent_map(em); 1596 free_extent_map(em);
1595 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | 1597 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
@@ -1626,6 +1628,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1626 bio->bi_sector = failrec->logical >> 9; 1628 bio->bi_sector = failrec->logical >> 9;
1627 bio->bi_bdev = failed_bio->bi_bdev; 1629 bio->bi_bdev = failed_bio->bi_bdev;
1628 bio->bi_size = 0; 1630 bio->bi_size = 0;
1631
1629 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 1632 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1630 if (failed_bio->bi_rw & (1 << BIO_RW)) 1633 if (failed_bio->bi_rw & (1 << BIO_RW))
1631 rw = WRITE; 1634 rw = WRITE;
@@ -1634,7 +1637,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1634 1637
1635 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1638 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1636 failrec->last_mirror, 1639 failrec->last_mirror,
1637 bio_flags); 1640 failrec->bio_flags);
1638 return 0; 1641 return 0;
1639} 1642}
1640 1643
@@ -1688,9 +1691,14 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1688 u32 csum = ~(u32)0; 1691 u32 csum = ~(u32)0;
1689 unsigned long flags; 1692 unsigned long flags;
1690 1693
1694 if (PageChecked(page)) {
1695 ClearPageChecked(page);
1696 goto good;
1697 }
1691 if (btrfs_test_opt(root, NODATASUM) || 1698 if (btrfs_test_opt(root, NODATASUM) ||
1692 btrfs_test_flag(inode, NODATASUM)) 1699 btrfs_test_flag(inode, NODATASUM))
1693 return 0; 1700 return 0;
1701
1694 if (state && state->start == start) { 1702 if (state && state->start == start) {
1695 private = state->private; 1703 private = state->private;
1696 ret = 0; 1704 ret = 0;
@@ -1709,7 +1717,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1709 } 1717 }
1710 kunmap_atomic(kaddr, KM_IRQ0); 1718 kunmap_atomic(kaddr, KM_IRQ0);
1711 local_irq_restore(flags); 1719 local_irq_restore(flags);
1712 1720good:
1713 /* if the io failure tree for this inode is non-empty, 1721 /* if the io failure tree for this inode is non-empty,
1714 * check to see if we've recovered from a failed IO 1722 * check to see if we've recovered from a failed IO
1715 */ 1723 */
@@ -2243,6 +2251,7 @@ fail:
2243 return err; 2251 return err;
2244} 2252}
2245 2253
2254#if 0
2246/* 2255/*
2247 * when truncating bytes in a file, it is possible to avoid reading 2256 * when truncating bytes in a file, it is possible to avoid reading
2248 * the leaves that contain only checksum items. This can be the 2257 * the leaves that contain only checksum items. This can be the
@@ -2410,6 +2419,8 @@ out:
2410 return ret; 2419 return ret;
2411} 2420}
2412 2421
2422#endif
2423
2413/* 2424/*
2414 * this can truncate away extent items, csum items and directory items. 2425 * this can truncate away extent items, csum items and directory items.
2415 * It starts at a high offset and removes keys until it can't find 2426 * It starts at a high offset and removes keys until it can't find
@@ -2459,9 +2470,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2459 2470
2460 btrfs_init_path(path); 2471 btrfs_init_path(path);
2461 2472
2462 ret = drop_csum_leaves(trans, root, path, inode, new_size);
2463 BUG_ON(ret);
2464
2465search_again: 2473search_again:
2466 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2474 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2467 if (ret < 0) { 2475 if (ret < 0) {
@@ -2509,16 +2517,11 @@ search_again:
2509 } 2517 }
2510 item_end--; 2518 item_end--;
2511 } 2519 }
2512 if (found_type == BTRFS_CSUM_ITEM_KEY) {
2513 ret = btrfs_csum_truncate(trans, root, path,
2514 new_size);
2515 BUG_ON(ret);
2516 }
2517 if (item_end < new_size) { 2520 if (item_end < new_size) {
2518 if (found_type == BTRFS_DIR_ITEM_KEY) { 2521 if (found_type == BTRFS_DIR_ITEM_KEY) {
2519 found_type = BTRFS_INODE_ITEM_KEY; 2522 found_type = BTRFS_INODE_ITEM_KEY;
2520 } else if (found_type == BTRFS_EXTENT_ITEM_KEY) { 2523 } else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
2521 found_type = BTRFS_CSUM_ITEM_KEY; 2524 found_type = BTRFS_EXTENT_DATA_KEY;
2522 } else if (found_type == BTRFS_EXTENT_DATA_KEY) { 2525 } else if (found_type == BTRFS_EXTENT_DATA_KEY) {
2523 found_type = BTRFS_XATTR_ITEM_KEY; 2526 found_type = BTRFS_XATTR_ITEM_KEY;
2524 } else if (found_type == BTRFS_XATTR_ITEM_KEY) { 2527 } else if (found_type == BTRFS_XATTR_ITEM_KEY) {