aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/disk-io.c
diff options
context:
space:
mode:
author Chris Mason <chris.mason@oracle.com> 2008-12-08 16:58:54 -0500
committer Chris Mason <chris.mason@oracle.com> 2008-12-08 16:58:54 -0500
commit d20f7043fa65659136c1a7c3c456eeeb5c6f431f (patch)
tree 05d1031cadec6d440a97221e3a32adb504a51699 /fs/btrfs/disk-io.c
parent c99e905c945c462085c6d64646dc5af0c0a16815 (diff)
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have been stored in the subvolume trees, indexed by the inode that is referencing the data block. This means that when we read the inode, we've probably read in at least some checksums as well.

But, this has a few problems:

* The checksums are indexed by logical offset in the file. When compression is on, this means we have to do the expensive checksumming on the uncompressed data. It would be faster if we could checksum the compressed data instead.

* If we implement encryption, we'll be checksumming the plain text and storing that on disk. This is significantly less secure.

* For either compression or encryption, we have to get the plain text back before we can verify the checksum as correct. This makes the raid layer balancing and extent moving much more expensive.

* It makes the front end caching code more complex, as we have to touch the subvolume and inodes as we cache extents.

* There is potentially one copy of the checksum in each subvolume referencing an extent.

The solution used here is to store the extent checksums in a dedicated tree. This allows us to index the checksums by physical extent start and length. It means:

* The checksum is against the data stored on disk, after any compression or encryption is done.

* The checksum is stored in a central location, and can be verified without following back references, or reading inodes.

This makes compression significantly faster by reducing the amount of data that needs to be checksummed. It will also allow much faster raid management code in general.

The checksums are indexed by a key with a fixed objectid (a magic value in ctree.h) and offset set to the starting byte of the extent. This allows us to copy the checksum items into the fsync log tree directly (or any other tree), without having to invent a second format for them.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r-- fs/btrfs/disk-io.c 45
1 files changed, 39 insertions, 6 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3eb7c2576fe..61dc3b2c834 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -445,11 +445,18 @@ static void end_workqueue_bio(struct bio *bio, int err)
445 end_io_wq->error = err; 445 end_io_wq->error = err;
446 end_io_wq->work.func = end_workqueue_fn; 446 end_io_wq->work.func = end_workqueue_fn;
447 end_io_wq->work.flags = 0; 447 end_io_wq->work.flags = 0;
448 if (bio->bi_rw & (1 << BIO_RW)) 448
449 if (bio->bi_rw & (1 << BIO_RW)) {
449 btrfs_queue_worker(&fs_info->endio_write_workers, 450 btrfs_queue_worker(&fs_info->endio_write_workers,
450 &end_io_wq->work); 451 &end_io_wq->work);
451 else 452 } else {
452 btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work); 453 if (end_io_wq->metadata)
454 btrfs_queue_worker(&fs_info->endio_meta_workers,
455 &end_io_wq->work);
456 else
457 btrfs_queue_worker(&fs_info->endio_workers,
458 &end_io_wq->work);
459 }
453} 460}
454 461
455int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 462int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
@@ -1208,6 +1215,9 @@ static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1208 info = (struct btrfs_fs_info *)bdi->unplug_io_data; 1215 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1209 list_for_each(cur, &info->fs_devices->devices) { 1216 list_for_each(cur, &info->fs_devices->devices) {
1210 device = list_entry(cur, struct btrfs_device, dev_list); 1217 device = list_entry(cur, struct btrfs_device, dev_list);
1218 if (!device->bdev)
1219 continue;
1220
1211 bdi = blk_get_backing_dev_info(device->bdev); 1221 bdi = blk_get_backing_dev_info(device->bdev);
1212 if (bdi->unplug_io_fn) { 1222 if (bdi->unplug_io_fn) {
1213 bdi->unplug_io_fn(bdi, page); 1223 bdi->unplug_io_fn(bdi, page);
@@ -1344,7 +1354,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
1344 * blocksize <= pagesize, it is basically a noop 1354 * blocksize <= pagesize, it is basically a noop
1345 */ 1355 */
1346 if (end_io_wq->metadata && !bio_ready_for_csum(bio)) { 1356 if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
1347 btrfs_queue_worker(&fs_info->endio_workers, 1357 btrfs_queue_worker(&fs_info->endio_meta_workers,
1348 &end_io_wq->work); 1358 &end_io_wq->work);
1349 return; 1359 return;
1350 } 1360 }
@@ -1454,6 +1464,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1454 struct buffer_head *bh; 1464 struct buffer_head *bh;
1455 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), 1465 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
1456 GFP_NOFS); 1466 GFP_NOFS);
1467 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1468 GFP_NOFS);
1457 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), 1469 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
1458 GFP_NOFS); 1470 GFP_NOFS);
1459 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), 1471 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
@@ -1470,7 +1482,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1470 struct btrfs_super_block *disk_super; 1482 struct btrfs_super_block *disk_super;
1471 1483
1472 if (!extent_root || !tree_root || !fs_info || 1484 if (!extent_root || !tree_root || !fs_info ||
1473 !chunk_root || !dev_root) { 1485 !chunk_root || !dev_root || !csum_root) {
1474 err = -ENOMEM; 1486 err = -ENOMEM;
1475 goto fail; 1487 goto fail;
1476 } 1488 }
@@ -1487,6 +1499,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1487 init_completion(&fs_info->kobj_unregister); 1499 init_completion(&fs_info->kobj_unregister);
1488 fs_info->tree_root = tree_root; 1500 fs_info->tree_root = tree_root;
1489 fs_info->extent_root = extent_root; 1501 fs_info->extent_root = extent_root;
1502 fs_info->csum_root = csum_root;
1490 fs_info->chunk_root = chunk_root; 1503 fs_info->chunk_root = chunk_root;
1491 fs_info->dev_root = dev_root; 1504 fs_info->dev_root = dev_root;
1492 fs_info->fs_devices = fs_devices; 1505 fs_info->fs_devices = fs_devices;
@@ -1652,6 +1665,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1652 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); 1665 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
1653 btrfs_init_workers(&fs_info->endio_workers, "endio", 1666 btrfs_init_workers(&fs_info->endio_workers, "endio",
1654 fs_info->thread_pool_size); 1667 fs_info->thread_pool_size);
1668 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
1669 fs_info->thread_pool_size);
1655 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 1670 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1656 fs_info->thread_pool_size); 1671 fs_info->thread_pool_size);
1657 1672
@@ -1667,6 +1682,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1667 btrfs_start_workers(&fs_info->delalloc_workers, 1); 1682 btrfs_start_workers(&fs_info->delalloc_workers, 1);
1668 btrfs_start_workers(&fs_info->fixup_workers, 1); 1683 btrfs_start_workers(&fs_info->fixup_workers, 1);
1669 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); 1684 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
1685 btrfs_start_workers(&fs_info->endio_meta_workers,
1686 fs_info->thread_pool_size);
1670 btrfs_start_workers(&fs_info->endio_write_workers, 1687 btrfs_start_workers(&fs_info->endio_write_workers,
1671 fs_info->thread_pool_size); 1688 fs_info->thread_pool_size);
1672 1689
@@ -1751,6 +1768,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1751 if (ret) 1768 if (ret)
1752 goto fail_extent_root; 1769 goto fail_extent_root;
1753 1770
1771 ret = find_and_setup_root(tree_root, fs_info,
1772 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1773 if (ret)
1774 goto fail_extent_root;
1775
1776 csum_root->track_dirty = 1;
1777
1754 btrfs_read_block_groups(extent_root); 1778 btrfs_read_block_groups(extent_root);
1755 1779
1756 fs_info->generation = generation + 1; 1780 fs_info->generation = generation + 1;
@@ -1761,7 +1785,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1761 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 1785 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1762 "btrfs-cleaner"); 1786 "btrfs-cleaner");
1763 if (!fs_info->cleaner_kthread) 1787 if (!fs_info->cleaner_kthread)
1764 goto fail_extent_root; 1788 goto fail_csum_root;
1765 1789
1766 fs_info->transaction_kthread = kthread_run(transaction_kthread, 1790 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1767 tree_root, 1791 tree_root,
@@ -1825,6 +1849,8 @@ fail_cleaner:
1825 filemap_write_and_wait(fs_info->btree_inode->i_mapping); 1849 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
1826 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 1850 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1827 1851
1852fail_csum_root:
1853 free_extent_buffer(csum_root->node);
1828fail_extent_root: 1854fail_extent_root:
1829 free_extent_buffer(extent_root->node); 1855 free_extent_buffer(extent_root->node);
1830fail_tree_root: 1856fail_tree_root:
@@ -1838,6 +1864,7 @@ fail_sb_buffer:
1838 btrfs_stop_workers(&fs_info->delalloc_workers); 1864 btrfs_stop_workers(&fs_info->delalloc_workers);
1839 btrfs_stop_workers(&fs_info->workers); 1865 btrfs_stop_workers(&fs_info->workers);
1840 btrfs_stop_workers(&fs_info->endio_workers); 1866 btrfs_stop_workers(&fs_info->endio_workers);
1867 btrfs_stop_workers(&fs_info->endio_meta_workers);
1841 btrfs_stop_workers(&fs_info->endio_write_workers); 1868 btrfs_stop_workers(&fs_info->endio_write_workers);
1842 btrfs_stop_workers(&fs_info->submit_workers); 1869 btrfs_stop_workers(&fs_info->submit_workers);
1843fail_iput: 1870fail_iput:
@@ -1853,6 +1880,7 @@ fail:
1853 kfree(fs_info); 1880 kfree(fs_info);
1854 kfree(chunk_root); 1881 kfree(chunk_root);
1855 kfree(dev_root); 1882 kfree(dev_root);
1883 kfree(csum_root);
1856 return ERR_PTR(err); 1884 return ERR_PTR(err);
1857} 1885}
1858 1886
@@ -2131,6 +2159,9 @@ int close_ctree(struct btrfs_root *root)
2131 if (root->fs_info->dev_root->node); 2159 if (root->fs_info->dev_root->node);
2132 free_extent_buffer(root->fs_info->dev_root->node); 2160 free_extent_buffer(root->fs_info->dev_root->node);
2133 2161
2162 if (root->fs_info->csum_root->node);
2163 free_extent_buffer(root->fs_info->csum_root->node);
2164
2134 btrfs_free_block_groups(root->fs_info); 2165 btrfs_free_block_groups(root->fs_info);
2135 2166
2136 del_fs_roots(fs_info); 2167 del_fs_roots(fs_info);
@@ -2141,6 +2172,7 @@ int close_ctree(struct btrfs_root *root)
2141 btrfs_stop_workers(&fs_info->delalloc_workers); 2172 btrfs_stop_workers(&fs_info->delalloc_workers);
2142 btrfs_stop_workers(&fs_info->workers); 2173 btrfs_stop_workers(&fs_info->workers);
2143 btrfs_stop_workers(&fs_info->endio_workers); 2174 btrfs_stop_workers(&fs_info->endio_workers);
2175 btrfs_stop_workers(&fs_info->endio_meta_workers);
2144 btrfs_stop_workers(&fs_info->endio_write_workers); 2176 btrfs_stop_workers(&fs_info->endio_write_workers);
2145 btrfs_stop_workers(&fs_info->submit_workers); 2177 btrfs_stop_workers(&fs_info->submit_workers);
2146 2178
@@ -2163,6 +2195,7 @@ int close_ctree(struct btrfs_root *root)
2163 kfree(fs_info->tree_root); 2195 kfree(fs_info->tree_root);
2164 kfree(fs_info->chunk_root); 2196 kfree(fs_info->chunk_root);
2165 kfree(fs_info->dev_root); 2197 kfree(fs_info->dev_root);
2198 kfree(fs_info->csum_root);
2166 return 0; 2199 return 0;
2167} 2200}
2168 2201