diff options
author | Chris Mason <chris.mason@oracle.com> | 2008-12-08 16:58:54 -0500 |
---|---|---|
committer | Chris Mason <chris.mason@oracle.com> | 2008-12-08 16:58:54 -0500 |
commit | d20f7043fa65659136c1a7c3c456eeeb5c6f431f (patch) | |
tree | 05d1031cadec6d440a97221e3a32adb504a51699 /fs/btrfs/disk-io.c | |
parent | c99e905c945c462085c6d64646dc5af0c0a16815 (diff) |
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have to touch
the subvolume and inodes as we cache extents.
* There is potentially one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by physical extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r-- | fs/btrfs/disk-io.c | 45 |
1 files changed, 39 insertions, 6 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3eb7c2576fe..61dc3b2c834 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
@@ -445,11 +445,18 @@ static void end_workqueue_bio(struct bio *bio, int err) | |||
445 | end_io_wq->error = err; | 445 | end_io_wq->error = err; |
446 | end_io_wq->work.func = end_workqueue_fn; | 446 | end_io_wq->work.func = end_workqueue_fn; |
447 | end_io_wq->work.flags = 0; | 447 | end_io_wq->work.flags = 0; |
448 | if (bio->bi_rw & (1 << BIO_RW)) | 448 | |
449 | if (bio->bi_rw & (1 << BIO_RW)) { | ||
449 | btrfs_queue_worker(&fs_info->endio_write_workers, | 450 | btrfs_queue_worker(&fs_info->endio_write_workers, |
450 | &end_io_wq->work); | 451 | &end_io_wq->work); |
451 | else | 452 | } else { |
452 | btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work); | 453 | if (end_io_wq->metadata) |
454 | btrfs_queue_worker(&fs_info->endio_meta_workers, | ||
455 | &end_io_wq->work); | ||
456 | else | ||
457 | btrfs_queue_worker(&fs_info->endio_workers, | ||
458 | &end_io_wq->work); | ||
459 | } | ||
453 | } | 460 | } |
454 | 461 | ||
455 | int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, | 462 | int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, |
@@ -1208,6 +1215,9 @@ static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | |||
1208 | info = (struct btrfs_fs_info *)bdi->unplug_io_data; | 1215 | info = (struct btrfs_fs_info *)bdi->unplug_io_data; |
1209 | list_for_each(cur, &info->fs_devices->devices) { | 1216 | list_for_each(cur, &info->fs_devices->devices) { |
1210 | device = list_entry(cur, struct btrfs_device, dev_list); | 1217 | device = list_entry(cur, struct btrfs_device, dev_list); |
1218 | if (!device->bdev) | ||
1219 | continue; | ||
1220 | |||
1211 | bdi = blk_get_backing_dev_info(device->bdev); | 1221 | bdi = blk_get_backing_dev_info(device->bdev); |
1212 | if (bdi->unplug_io_fn) { | 1222 | if (bdi->unplug_io_fn) { |
1213 | bdi->unplug_io_fn(bdi, page); | 1223 | bdi->unplug_io_fn(bdi, page); |
@@ -1344,7 +1354,7 @@ static void end_workqueue_fn(struct btrfs_work *work) | |||
1344 | * blocksize <= pagesize, it is basically a noop | 1354 | * blocksize <= pagesize, it is basically a noop |
1345 | */ | 1355 | */ |
1346 | if (end_io_wq->metadata && !bio_ready_for_csum(bio)) { | 1356 | if (end_io_wq->metadata && !bio_ready_for_csum(bio)) { |
1347 | btrfs_queue_worker(&fs_info->endio_workers, | 1357 | btrfs_queue_worker(&fs_info->endio_meta_workers, |
1348 | &end_io_wq->work); | 1358 | &end_io_wq->work); |
1349 | return; | 1359 | return; |
1350 | } | 1360 | } |
@@ -1454,6 +1464,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1454 | struct buffer_head *bh; | 1464 | struct buffer_head *bh; |
1455 | struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), | 1465 | struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), |
1456 | GFP_NOFS); | 1466 | GFP_NOFS); |
1467 | struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), | ||
1468 | GFP_NOFS); | ||
1457 | struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), | 1469 | struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), |
1458 | GFP_NOFS); | 1470 | GFP_NOFS); |
1459 | struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), | 1471 | struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), |
@@ -1470,7 +1482,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1470 | struct btrfs_super_block *disk_super; | 1482 | struct btrfs_super_block *disk_super; |
1471 | 1483 | ||
1472 | if (!extent_root || !tree_root || !fs_info || | 1484 | if (!extent_root || !tree_root || !fs_info || |
1473 | !chunk_root || !dev_root) { | 1485 | !chunk_root || !dev_root || !csum_root) { |
1474 | err = -ENOMEM; | 1486 | err = -ENOMEM; |
1475 | goto fail; | 1487 | goto fail; |
1476 | } | 1488 | } |
@@ -1487,6 +1499,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1487 | init_completion(&fs_info->kobj_unregister); | 1499 | init_completion(&fs_info->kobj_unregister); |
1488 | fs_info->tree_root = tree_root; | 1500 | fs_info->tree_root = tree_root; |
1489 | fs_info->extent_root = extent_root; | 1501 | fs_info->extent_root = extent_root; |
1502 | fs_info->csum_root = csum_root; | ||
1490 | fs_info->chunk_root = chunk_root; | 1503 | fs_info->chunk_root = chunk_root; |
1491 | fs_info->dev_root = dev_root; | 1504 | fs_info->dev_root = dev_root; |
1492 | fs_info->fs_devices = fs_devices; | 1505 | fs_info->fs_devices = fs_devices; |
@@ -1652,6 +1665,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1652 | btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); | 1665 | btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); |
1653 | btrfs_init_workers(&fs_info->endio_workers, "endio", | 1666 | btrfs_init_workers(&fs_info->endio_workers, "endio", |
1654 | fs_info->thread_pool_size); | 1667 | fs_info->thread_pool_size); |
1668 | btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", | ||
1669 | fs_info->thread_pool_size); | ||
1655 | btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", | 1670 | btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", |
1656 | fs_info->thread_pool_size); | 1671 | fs_info->thread_pool_size); |
1657 | 1672 | ||
@@ -1667,6 +1682,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1667 | btrfs_start_workers(&fs_info->delalloc_workers, 1); | 1682 | btrfs_start_workers(&fs_info->delalloc_workers, 1); |
1668 | btrfs_start_workers(&fs_info->fixup_workers, 1); | 1683 | btrfs_start_workers(&fs_info->fixup_workers, 1); |
1669 | btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); | 1684 | btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); |
1685 | btrfs_start_workers(&fs_info->endio_meta_workers, | ||
1686 | fs_info->thread_pool_size); | ||
1670 | btrfs_start_workers(&fs_info->endio_write_workers, | 1687 | btrfs_start_workers(&fs_info->endio_write_workers, |
1671 | fs_info->thread_pool_size); | 1688 | fs_info->thread_pool_size); |
1672 | 1689 | ||
@@ -1751,6 +1768,13 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1751 | if (ret) | 1768 | if (ret) |
1752 | goto fail_extent_root; | 1769 | goto fail_extent_root; |
1753 | 1770 | ||
1771 | ret = find_and_setup_root(tree_root, fs_info, | ||
1772 | BTRFS_CSUM_TREE_OBJECTID, csum_root); | ||
1773 | if (ret) | ||
1774 | goto fail_extent_root; | ||
1775 | |||
1776 | csum_root->track_dirty = 1; | ||
1777 | |||
1754 | btrfs_read_block_groups(extent_root); | 1778 | btrfs_read_block_groups(extent_root); |
1755 | 1779 | ||
1756 | fs_info->generation = generation + 1; | 1780 | fs_info->generation = generation + 1; |
@@ -1761,7 +1785,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1761 | fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, | 1785 | fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, |
1762 | "btrfs-cleaner"); | 1786 | "btrfs-cleaner"); |
1763 | if (!fs_info->cleaner_kthread) | 1787 | if (!fs_info->cleaner_kthread) |
1764 | goto fail_extent_root; | 1788 | goto fail_csum_root; |
1765 | 1789 | ||
1766 | fs_info->transaction_kthread = kthread_run(transaction_kthread, | 1790 | fs_info->transaction_kthread = kthread_run(transaction_kthread, |
1767 | tree_root, | 1791 | tree_root, |
@@ -1825,6 +1849,8 @@ fail_cleaner: | |||
1825 | filemap_write_and_wait(fs_info->btree_inode->i_mapping); | 1849 | filemap_write_and_wait(fs_info->btree_inode->i_mapping); |
1826 | invalidate_inode_pages2(fs_info->btree_inode->i_mapping); | 1850 | invalidate_inode_pages2(fs_info->btree_inode->i_mapping); |
1827 | 1851 | ||
1852 | fail_csum_root: | ||
1853 | free_extent_buffer(csum_root->node); | ||
1828 | fail_extent_root: | 1854 | fail_extent_root: |
1829 | free_extent_buffer(extent_root->node); | 1855 | free_extent_buffer(extent_root->node); |
1830 | fail_tree_root: | 1856 | fail_tree_root: |
@@ -1838,6 +1864,7 @@ fail_sb_buffer: | |||
1838 | btrfs_stop_workers(&fs_info->delalloc_workers); | 1864 | btrfs_stop_workers(&fs_info->delalloc_workers); |
1839 | btrfs_stop_workers(&fs_info->workers); | 1865 | btrfs_stop_workers(&fs_info->workers); |
1840 | btrfs_stop_workers(&fs_info->endio_workers); | 1866 | btrfs_stop_workers(&fs_info->endio_workers); |
1867 | btrfs_stop_workers(&fs_info->endio_meta_workers); | ||
1841 | btrfs_stop_workers(&fs_info->endio_write_workers); | 1868 | btrfs_stop_workers(&fs_info->endio_write_workers); |
1842 | btrfs_stop_workers(&fs_info->submit_workers); | 1869 | btrfs_stop_workers(&fs_info->submit_workers); |
1843 | fail_iput: | 1870 | fail_iput: |
@@ -1853,6 +1880,7 @@ fail: | |||
1853 | kfree(fs_info); | 1880 | kfree(fs_info); |
1854 | kfree(chunk_root); | 1881 | kfree(chunk_root); |
1855 | kfree(dev_root); | 1882 | kfree(dev_root); |
1883 | kfree(csum_root); | ||
1856 | return ERR_PTR(err); | 1884 | return ERR_PTR(err); |
1857 | } | 1885 | } |
1858 | 1886 | ||
@@ -2131,6 +2159,9 @@ int close_ctree(struct btrfs_root *root) | |||
2131 | if (root->fs_info->dev_root->node); | 2159 | if (root->fs_info->dev_root->node); |
2132 | free_extent_buffer(root->fs_info->dev_root->node); | 2160 | free_extent_buffer(root->fs_info->dev_root->node); |
2133 | 2161 | ||
2162 | if (root->fs_info->csum_root->node); | ||
2163 | free_extent_buffer(root->fs_info->csum_root->node); | ||
2164 | |||
2134 | btrfs_free_block_groups(root->fs_info); | 2165 | btrfs_free_block_groups(root->fs_info); |
2135 | 2166 | ||
2136 | del_fs_roots(fs_info); | 2167 | del_fs_roots(fs_info); |
@@ -2141,6 +2172,7 @@ int close_ctree(struct btrfs_root *root) | |||
2141 | btrfs_stop_workers(&fs_info->delalloc_workers); | 2172 | btrfs_stop_workers(&fs_info->delalloc_workers); |
2142 | btrfs_stop_workers(&fs_info->workers); | 2173 | btrfs_stop_workers(&fs_info->workers); |
2143 | btrfs_stop_workers(&fs_info->endio_workers); | 2174 | btrfs_stop_workers(&fs_info->endio_workers); |
2175 | btrfs_stop_workers(&fs_info->endio_meta_workers); | ||
2144 | btrfs_stop_workers(&fs_info->endio_write_workers); | 2176 | btrfs_stop_workers(&fs_info->endio_write_workers); |
2145 | btrfs_stop_workers(&fs_info->submit_workers); | 2177 | btrfs_stop_workers(&fs_info->submit_workers); |
2146 | 2178 | ||
@@ -2163,6 +2195,7 @@ int close_ctree(struct btrfs_root *root) | |||
2163 | kfree(fs_info->tree_root); | 2195 | kfree(fs_info->tree_root); |
2164 | kfree(fs_info->chunk_root); | 2196 | kfree(fs_info->chunk_root); |
2165 | kfree(fs_info->dev_root); | 2197 | kfree(fs_info->dev_root); |
2198 | kfree(fs_info->csum_root); | ||
2166 | return 0; | 2199 | return 0; |
2167 | } | 2200 | } |
2168 | 2201 | ||