author     Chris Mason <chris.mason@oracle.com>    2008-12-08 16:58:54 -0500
committer  Chris Mason <chris.mason@oracle.com>    2008-12-08 16:58:54 -0500
commit     d20f7043fa65659136c1a7c3c456eeeb5c6f431f
tree       05d1031cadec6d440a97221e3a32adb504a51699    /fs/btrfs/compression.c
parent     c99e905c945c462085c6d64646dc5af0c0a16815
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have been stored in the subvolume trees, indexed by the inode that is referencing the data block. This means that when we read the inode, we've probably read in at least some checksums as well.

But, this has a few problems:

* The checksums are indexed by logical offset in the file. When compression is on, this means we have to do the expensive checksumming on the uncompressed data. It would be faster if we could checksum the compressed data instead.

* If we implement encryption, we'll be checksumming the plain text and storing that on disk. This is significantly less secure.

* For either compression or encryption, we have to get the plain text back before we can verify the checksum as correct. This makes the raid layer balancing and extent moving much more expensive.

* It makes the front end caching code more complex, as we have to touch the subvolume and inodes as we cache extents.

* There is potentially one copy of the checksum in each subvolume referencing an extent.

The solution used here is to store the extent checksums in a dedicated tree. This allows us to index the checksums by physical extent start and length. It means:

* The checksum is against the data stored on disk, after any compression or encryption is done.

* The checksum is stored in a central location, and can be verified without following back references, or reading inodes. This makes compression significantly faster by reducing the amount of data that needs to be checksummed. It will also allow much faster raid management code in general.

The checksums are indexed by a key with a fixed objectid (a magic value in ctree.h) and offset set to the starting byte of the extent. This allows us to copy the checksum items into the fsync log tree directly (or any other tree), without having to invent a second format for them.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
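As a sketch of the keying described above: a checksum item covering an on-disk extent starting at disk_bytenr could be looked up with a key built along these lines. This is illustrative only; csum_key_for_extent is a hypothetical helper, and the two constant names are assumed from the "magic value in ctree.h" the message mentions, not taken from this file's diff.

        /* hypothetical helper, not part of this patch */
        static void csum_key_for_extent(u64 disk_bytenr, struct btrfs_key *key)
        {
                /* fixed magic objectid shared by every checksum item */
                key->objectid = BTRFS_EXTENT_CSUM_OBJECTID;
                /* item type for extent checksums (assumed constant name) */
                key->type = BTRFS_EXTENT_CSUM_KEY;
                /* offset is the starting byte of the on-disk extent */
                key->offset = disk_bytenr;
        }

Because the key depends only on the extent's location on disk, the same item can be copied verbatim into the fsync log tree or any other tree.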
Diffstat (limited to 'fs/btrfs/compression.c')
-rw-r--r--    fs/btrfs/compression.c    124
1 file changed, 109 insertions(+), 15 deletions(-)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 4febe2eb0b83..ad7274137309 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -69,11 +69,27 @@ struct compressed_bio {
 
         /* IO errors */
         int errors;
+        int mirror_num;
 
         /* for reads, this is the bio we are copying the data into */
         struct bio *orig_bio;
+
+        /*
+         * the start of a variable length array of checksums only
+         * used by reads
+         */
+        u32 sums;
 };
 
+static inline int compressed_bio_size(struct btrfs_root *root,
+                                      unsigned long disk_size)
+{
+        u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+        return sizeof(struct compressed_bio) +
+                ((disk_size + root->sectorsize - 1) / root->sectorsize) *
+                csum_size;
+}
+
 static struct bio *compressed_bio_alloc(struct block_device *bdev,
                                         u64 first_byte, gfp_t gfp_flags)
 {
@@ -96,6 +112,47 @@ static struct bio *compressed_bio_alloc(struct block_device *bdev,
         return bio;
 }
 
+static int check_compressed_csum(struct inode *inode,
+                                 struct compressed_bio *cb,
+                                 u64 disk_start)
+{
+        int ret;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct page *page;
+        unsigned long i;
+        char *kaddr;
+        u32 csum;
+        u32 *cb_sum = &cb->sums;
+
+        if (btrfs_test_opt(root, NODATASUM) ||
+            btrfs_test_flag(inode, NODATASUM))
+                return 0;
+
+        for (i = 0; i < cb->nr_pages; i++) {
+                page = cb->compressed_pages[i];
+                csum = ~(u32)0;
+
+                kaddr = kmap_atomic(page, KM_USER0);
+                csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
+                btrfs_csum_final(csum, (char *)&csum);
+                kunmap_atomic(kaddr, KM_USER0);
+
+                if (csum != *cb_sum) {
+                        printk("btrfs csum failed ino %lu extent %llu csum %u "
+                               "wanted %u mirror %d\n", inode->i_ino,
+                               (unsigned long long)disk_start,
+                               csum, *cb_sum, cb->mirror_num);
+                        ret = -EIO;
+                        goto fail;
+                }
+                cb_sum++;
+
+        }
+        ret = 0;
+fail:
+        return ret;
+}
+
 /* when we finish reading compressed pages from the disk, we
  * decompress them and then run the bio end_io routines on the
  * decompressed pages (in the inode address space).
@@ -124,16 +181,21 @@ static void end_compressed_bio_read(struct bio *bio, int err)
         if (!atomic_dec_and_test(&cb->pending_bios))
                 goto out;
 
+        inode = cb->inode;
+        ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
+        if (ret)
+                goto csum_failed;
+
         /* ok, we're the last bio for this extent, lets start
          * the decompression.
          */
-        inode = cb->inode;
         tree = &BTRFS_I(inode)->io_tree;
         ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
                                            cb->start,
                                            cb->orig_bio->bi_io_vec,
                                            cb->orig_bio->bi_vcnt,
                                            cb->compressed_len);
+csum_failed:
         if (ret)
                 cb->errors = 1;
 
@@ -148,8 +210,21 @@ static void end_compressed_bio_read(struct bio *bio, int err)
         /* do io completion on the original bio */
         if (cb->errors) {
                 bio_io_error(cb->orig_bio);
-        } else
+        } else {
+                int bio_index = 0;
+                struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
+
+                /*
+                 * we have verified the checksum already, set page
+                 * checked so the end_io handlers know about it
+                 */
+                while(bio_index < cb->orig_bio->bi_vcnt) {
+                        SetPageChecked(bvec->bv_page);
+                        bvec++;
+                        bio_index++;
+                }
                 bio_endio(cb->orig_bio, 0);
+        }
 
         /* finally free the cb struct */
         kfree(cb->compressed_pages);
@@ -277,12 +352,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
         int ret;
 
         WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
-        cb = kmalloc(sizeof(*cb), GFP_NOFS);
+        cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
         atomic_set(&cb->pending_bios, 0);
         cb->errors = 0;
         cb->inode = inode;
         cb->start = start;
         cb->len = len;
+        cb->mirror_num = 0;
         cb->compressed_pages = compressed_pages;
         cb->compressed_len = compressed_len;
         cb->orig_bio = NULL;
@@ -290,9 +366,6 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 
         bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
-        ret = btrfs_csum_file_bytes(root, inode, start, len);
-        BUG_ON(ret);
-
         bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
         bio->bi_private = cb;
         bio->bi_end_io = end_compressed_bio_write;
@@ -325,6 +398,9 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
                         ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
                         BUG_ON(ret);
 
+                        ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+                        BUG_ON(ret);
+
                         ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
                         BUG_ON(ret);
 
@@ -348,6 +424,9 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
         ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
         BUG_ON(ret);
 
+        ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+        BUG_ON(ret);
+
         ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
         BUG_ON(ret);
 
@@ -510,6 +589,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
         u64 em_start;
         struct extent_map *em;
         int ret;
+        u32 *sums;
 
         tree = &BTRFS_I(inode)->io_tree;
         em_tree = &BTRFS_I(inode)->extent_tree;
@@ -521,15 +601,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
                                    PAGE_CACHE_SIZE);
         spin_unlock(&em_tree->lock);
 
-        cb = kmalloc(sizeof(*cb), GFP_NOFS);
+        compressed_len = em->block_len;
+        cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
         atomic_set(&cb->pending_bios, 0);
         cb->errors = 0;
         cb->inode = inode;
+        cb->mirror_num = mirror_num;
+        sums = &cb->sums;
 
         cb->start = em->orig_start;
-        compressed_len = em->block_len;
         em_len = em->len;
         em_start = em->start;
+
         free_extent_map(em);
         em = NULL;
 
@@ -551,11 +634,6 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
         add_ra_bio_pages(inode, em_start + em_len, cb);
 
-        if (!btrfs_test_opt(root, NODATASUM) &&
-            !btrfs_test_flag(inode, NODATASUM)) {
-                btrfs_lookup_bio_sums(root, inode, cb->orig_bio);
-        }
-
         /* include any pages we added in add_ra-bio_pages */
         uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
         cb->len = uncompressed_len;
@@ -568,6 +646,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
         for (page_index = 0; page_index < nr_pages; page_index++) {
                 page = cb->compressed_pages[page_index];
                 page->mapping = inode->i_mapping;
+                page->index = em_start >> PAGE_CACHE_SHIFT;
+
                 if (comp_bio->bi_size)
                         ret = tree->ops->merge_bio_hook(page, 0,
                                                         PAGE_CACHE_SIZE,
@@ -591,7 +671,16 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
                          */
                         atomic_inc(&cb->pending_bios);
 
-                        ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+                        if (!btrfs_test_opt(root, NODATASUM) &&
+                            !btrfs_test_flag(inode, NODATASUM)) {
+                                btrfs_lookup_bio_sums(root, inode, comp_bio,
+                                                      sums);
+                        }
+                        sums += (comp_bio->bi_size + root->sectorsize - 1) /
+                                root->sectorsize;
+
+                        ret = btrfs_map_bio(root, READ, comp_bio,
+                                            mirror_num, 0);
                         BUG_ON(ret);
 
                         bio_put(comp_bio);
@@ -610,7 +699,12 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
         ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
         BUG_ON(ret);
 
-        ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+        if (!btrfs_test_opt(root, NODATASUM) &&
+            !btrfs_test_flag(inode, NODATASUM)) {
+                btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+        }
+
+        ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
         BUG_ON(ret);
 
         bio_put(comp_bio);
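
One detail worth calling out: the new "u32 sums" member is the start of a variable-length trailing array, and compressed_bio_size() rounds the compressed extent up to whole sectors so the allocation holds one checksum slot per sector. A minimal standalone sketch of the same two idioms, in plain userspace C with hypothetical names rather than the kernel APIs above:

        #include <stdio.h>
        #include <stdlib.h>

        struct blob {
                size_t nr_sums;
                unsigned int sums;      /* start of a variable length array */
        };

        int main(void)
        {
                size_t disk_size = 9000, sector = 4096;
                /* round up to whole sectors: (9000 + 4095) / 4096 = 3 slots */
                size_t nr = (disk_size + sector - 1) / sector;
                /* over-allocate past the end of the struct, the way
                 * compressed_bio_size() does for struct compressed_bio */
                struct blob *b = malloc(sizeof(*b) + nr * sizeof(unsigned int));
                unsigned int *sum;
                size_t i;

                if (!b)
                        return 1;
                b->nr_sums = nr;
                sum = &b->sums;         /* &b->sums addresses the whole array */
                for (i = 0; i < nr; i++)
                        sum[i] = (unsigned int)(i * 0x101);
                for (i = 0; i < nr; i++)
                        printf("sum[%zu] = %u\n", i, sum[i]);
                free(b);
                return 0;
        }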