author      Chris Mason <chris.mason@oracle.com>   2008-04-09 16:28:12 -0400
committer   Chris Mason <chris.mason@oracle.com>   2008-09-25 11:04:01 -0400
commit      ce9adaa5a792c2099a83246265eb4055bc38b6b8 (patch)
tree        3ac03bb73cd3aad0c0b194d387cbb5b136c92fa7 /fs/btrfs/disk-io.c
parent      728131d8e40c2a47c59ca91a806299c4708029f9 (diff)
Btrfs: Do metadata checksums for reads via a workqueue
Before, metadata checksumming was done by the callers of read_tree_block,
which would set EXTENT_CSUM bits in the extent tree to show that a given
range of pages was already checksummed and didn't need to be verified
again.
But those bits could be cleared by try_to_releasepage, and the end
result was bogus checksum failures on pages that never left the cache.
The new code validates checksums when the page is read. It is a little
tricky because metadata blocks can span pages and a single read may
end up going via multiple bios.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
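
[Editor's note] To make the ordering concrete, here is a minimal userspace sketch of the pattern the patch introduces: the I/O completion handler only queues an item, and a worker thread performs the checksum verification later, re-queueing entries whose blocks are not fully read yet. This is an illustrative analogue built on pthreads, not the btrfs code; every name in it (struct completion, end_io, worker) is invented for the example.

/*
 * Illustrative userspace analogue of the patch's deferral pattern.
 * A completion handler must not do heavy work (in the kernel it can run
 * in interrupt context), so it only queues an item; a worker thread
 * verifies checksums later and re-queues items that are not ready yet.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct completion {
        int id;                 /* stands in for the bio */
        int passes_needed;      /* >0: block not fully read yet */
        struct completion *next;
};

static struct completion *queue_head;
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t queue_cond = PTHREAD_COND_INITIALIZER;
static int no_more_io;

/* "Interrupt" side: just add the completion to the list and wake the worker. */
static void end_io(struct completion *c)
{
        pthread_mutex_lock(&queue_lock);
        c->next = queue_head;
        queue_head = c;
        pthread_cond_signal(&queue_cond);
        pthread_mutex_unlock(&queue_lock);
}

/* Worker: pop completions, verify the ready ones, re-queue the rest. */
static void *worker(void *arg)
{
        (void)arg;
        for (;;) {
                pthread_mutex_lock(&queue_lock);
                while (!queue_head && !no_more_io)
                        pthread_cond_wait(&queue_cond, &queue_lock);
                if (!queue_head && no_more_io) {
                        pthread_mutex_unlock(&queue_lock);
                        return NULL;
                }
                struct completion *c = queue_head;
                queue_head = c->next;
                pthread_mutex_unlock(&queue_lock);

                if (c->passes_needed > 0) {
                        /* block spans several bios and is incomplete: retry */
                        c->passes_needed--;
                        end_io(c);
                        continue;
                }
                printf("verifying checksum for block %d\n", c->id);
                free(c);
        }
}

int main(void)
{
        pthread_t t;
        pthread_create(&t, NULL, worker, NULL);

        for (int i = 0; i < 4; i++) {
                struct completion *c = malloc(sizeof(*c));
                c->id = i;
                c->passes_needed = i % 2;   /* some blocks need a second pass */
                c->next = NULL;
                end_io(c);
        }

        pthread_mutex_lock(&queue_lock);
        no_more_io = 1;
        pthread_cond_broadcast(&queue_cond);
        pthread_mutex_unlock(&queue_lock);

        pthread_join(t, NULL);
        return 0;
}

Build with `cc -pthread sketch.c`. The re-queue branch plays the role of the bio_ready_for_csum() check in btrfs_end_io_csum() below: a metadata block that spans multiple bios is only verified once all of its pages have completed.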
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--   fs/btrfs/disk-io.c   258
1 file changed, 224 insertions(+), 34 deletions(-)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e444b99e02da..82109204788c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -24,6 +24,7 @@
 #include <linux/radix-tree.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h> // for block_sync_page
+#include <linux/workqueue.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -45,6 +46,16 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 #endif
 
 static struct extent_io_ops btree_extent_io_ops;
+static struct workqueue_struct *end_io_workqueue;
+
+struct end_io_wq {
+        struct bio *bio;
+        bio_end_io_t *end_io;
+        void *private;
+        struct btrfs_fs_info *info;
+        int error;
+        struct list_head list;
+};
 
 struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
                                     size_t page_offset, u64 start, u64 len,
@@ -219,11 +230,108 @@ static int btree_writepage_io_hook(struct page *page, u64 start, u64 end)
         return 0;
 }
 
+int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+                               struct extent_state *state)
+{
+        struct extent_io_tree *tree;
+        u64 found_start;
+        int found_level;
+        unsigned long len;
+        struct extent_buffer *eb;
+        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+        int ret;
+
+        tree = &BTRFS_I(page->mapping->host)->io_tree;
+        if (page->private == EXTENT_PAGE_PRIVATE)
+                goto out;
+        if (!page->private)
+                goto out;
+        len = page->private >> 2;
+        if (len == 0) {
+                WARN_ON(1);
+        }
+        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+        read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1,
+                                 btree_get_extent);
+        btrfs_clear_buffer_defrag(eb);
+        found_start = btrfs_header_bytenr(eb);
+        if (found_start != start) {
+                printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
+                       start, found_start, len);
+                WARN_ON(1);
+                goto err;
+        }
+        if (eb->first_page != page) {
+                printk("bad first page %lu %lu\n", eb->first_page->index,
+                       page->index);
+                WARN_ON(1);
+                goto err;
+        }
+        found_level = btrfs_header_level(eb);
+
+        ret = csum_tree_block(root, eb, 1);
+
+        end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
+        end = eb->start + end - 1;
+        release_extent_buffer_tail_pages(eb);
+err:
+        free_extent_buffer(eb);
+out:
+        return 0;
+}
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_workqueue_bio(struct bio *bio, int err)
+#else
+static int end_workqueue_bio(struct bio *bio,
+                             unsigned int bytes_done, int err)
+#endif
+{
+        struct end_io_wq *end_io_wq = bio->bi_private;
+        struct btrfs_fs_info *fs_info;
+        unsigned long flags;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+        if (bio->bi_size)
+                return 1;
+#endif
+
+        fs_info = end_io_wq->info;
+        spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
+        end_io_wq->error = err;
+        list_add_tail(&end_io_wq->list, &fs_info->end_io_work_list);
+        spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);
+        queue_work(end_io_workqueue, &fs_info->end_io_work);
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+        return 0;
+#endif
+}
+
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
 {
         struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct end_io_wq *end_io_wq;
         u64 offset;
         offset = bio->bi_sector << 9;
+
+        if (rw & (1 << BIO_RW)) {
+                return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio);
+        }
+
+        end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
+        if (!end_io_wq)
+                return -ENOMEM;
+
+        end_io_wq->private = bio->bi_private;
+        end_io_wq->end_io = bio->bi_end_io;
+        end_io_wq->info = root->fs_info;
+        end_io_wq->error = 0;
+        end_io_wq->bio = bio;
+
+        bio->bi_private = end_io_wq;
+        bio->bi_end_io = end_workqueue_bio;
+
         if (offset == BTRFS_SUPER_INFO_OFFSET) {
                 bio->bi_bdev = root->fs_info->sb->s_bdev;
                 submit_bio(rw, bio);
@@ -363,36 +471,7 @@ static int close_all_devices(struct btrfs_fs_info *fs_info)
 int btrfs_verify_block_csum(struct btrfs_root *root,
                             struct extent_buffer *buf)
 {
-        struct extent_io_tree *io_tree;
-        u64 end;
-        int ret;
-
-        io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
-        if (buf->flags & EXTENT_CSUM)
-                return 0;
-
-        end = min_t(u64, buf->len, PAGE_CACHE_SIZE);
-        end = buf->start + end - 1;
-        if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) {
-                buf->flags |= EXTENT_CSUM;
-                return 0;
-        }
-        lock_extent(io_tree, buf->start, end, GFP_NOFS);
-
-        if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) {
-                buf->flags |= EXTENT_CSUM;
-                ret = 0;
-                goto out_unlock;
-        }
-        WARN_ON(buf->flags & EXTENT_CSUM);
-
-        ret = csum_tree_block(root, buf, 1);
-        set_extent_bits(io_tree, buf->start, end, EXTENT_CSUM, GFP_NOFS);
-        buf->flags |= EXTENT_CSUM;
-
-out_unlock:
-        unlock_extent(io_tree, buf->start, end, GFP_NOFS);
-        return ret;
+        return btrfs_buffer_uptodate(buf);
 }
 
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
@@ -430,11 +509,15 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
         if (!buf)
                 return NULL;
-        read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, 1,
-                                 btree_get_extent);
 
-        ret = btrfs_verify_block_csum(root, buf);
+        ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0,
+                                       1, btree_get_extent);
+
+        if (ret == 0) {
+                buf->flags |= EXTENT_UPTODATE;
+        }
         return buf;
+
 }
 
 int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -724,6 +807,99 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
         return 0;
 }
 
+static int bio_ready_for_csum(struct bio *bio)
+{
+        u64 length = 0;
+        u64 buf_len = 0;
+        u64 start = 0;
+        struct page *page;
+        struct extent_io_tree *io_tree = NULL;
+        struct btrfs_fs_info *info = NULL;
+        struct bio_vec *bvec;
+        int i;
+        int ret;
+
+        bio_for_each_segment(bvec, bio, i) {
+                page = bvec->bv_page;
+                if (page->private == EXTENT_PAGE_PRIVATE) {
+                        length += bvec->bv_len;
+                        continue;
+                }
+                if (!page->private) {
+                        length += bvec->bv_len;
+                        continue;
+                }
+                length = bvec->bv_len;
+                buf_len = page->private >> 2;
+                start = page_offset(page) + bvec->bv_offset;
+                io_tree = &BTRFS_I(page->mapping->host)->io_tree;
+                info = BTRFS_I(page->mapping->host)->root->fs_info;
+        }
+        /* are we fully contained in this bio? */
+        if (buf_len <= length)
+                return 1;
+
+        ret = extent_range_uptodate(io_tree, start + length,
+                                    start + buf_len - 1);
+        if (ret == 1)
+                return ret;
+        return ret;
+}
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+void btrfs_end_io_csum(void *p)
+#else
+void btrfs_end_io_csum(struct work_struct *work)
+#endif
+{
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+        struct btrfs_fs_info *fs_info = p;
+#else
+        struct btrfs_fs_info *fs_info = container_of(work,
+                                                     struct btrfs_fs_info,
+                                                     end_io_work);
+#endif
+        unsigned long flags;
+        struct end_io_wq *end_io_wq;
+        struct bio *bio;
+        struct list_head *next;
+        int error;
+        int was_empty;
+
+        while(1) {
+                spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
+                if (list_empty(&fs_info->end_io_work_list)) {
+                        spin_unlock_irqrestore(&fs_info->end_io_work_lock,
+                                               flags);
+                        return;
+                }
+                next = fs_info->end_io_work_list.next;
+                list_del(next);
+                spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);
+
+                end_io_wq = list_entry(next, struct end_io_wq, list);
+
+                bio = end_io_wq->bio;
+                if (!bio_ready_for_csum(bio)) {
+                        spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
+                        was_empty = list_empty(&fs_info->end_io_work_list);
+                        list_add_tail(&end_io_wq->list,
+                                      &fs_info->end_io_work_list);
+                        spin_unlock_irqrestore(&fs_info->end_io_work_lock,
+                                               flags);
+                        if (was_empty)
+                                return;
+                        continue;
+                }
+                error = end_io_wq->error;
+                bio->bi_private = end_io_wq->private;
+                bio->bi_end_io = end_io_wq->end_io;
+                kfree(end_io_wq);
+                bio_endio(bio, error);
+        }
+}
+
+
 struct btrfs_root *open_ctree(struct super_block *sb,
                               struct btrfs_fs_devices *fs_devices)
 {
@@ -750,11 +926,16 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                 err = -ENOMEM;
                 goto fail;
         }
+        end_io_workqueue = create_workqueue("btrfs-end-io");
+        BUG_ON(!end_io_workqueue);
+
         INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
         INIT_LIST_HEAD(&fs_info->trans_list);
         INIT_LIST_HEAD(&fs_info->dead_roots);
         INIT_LIST_HEAD(&fs_info->hashers);
+        INIT_LIST_HEAD(&fs_info->end_io_work_list);
         spin_lock_init(&fs_info->hash_lock);
+        spin_lock_init(&fs_info->end_io_work_lock);
         spin_lock_init(&fs_info->delalloc_lock);
         spin_lock_init(&fs_info->new_trans_lock);
 
@@ -799,6 +980,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                            fs_info->btree_inode->i_mapping, GFP_NOFS);
         fs_info->do_barriers = 1;
 
+        INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum);
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
         INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
 #else
@@ -1044,6 +1226,8 @@ int close_ctree(struct btrfs_root *root)
         extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 
         truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
+        flush_workqueue(end_io_workqueue);
+        destroy_workqueue(end_io_workqueue);
 
         iput(fs_info->btree_inode);
 #if 0
@@ -1171,12 +1355,18 @@ int btrfs_read_buffer(struct extent_buffer *buf)
 {
         struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
         struct inode *btree_inode = root->fs_info->btree_inode;
-        return read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
+        int ret;
+        ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
                                         buf, 0, 1, btree_get_extent);
+        if (ret == 0) {
+                buf->flags |= EXTENT_UPTODATE;
+        }
+        return ret;
 }
 
 static struct extent_io_ops btree_extent_io_ops = {
         .writepage_io_hook = btree_writepage_io_hook,
+        .readpage_end_io_hook = btree_readpage_end_io_hook,
         .submit_bio_hook = btree_submit_bio_hook,
         /* note we're sharing with inode.c for the merge bio hook */
         .merge_bio_hook = btrfs_merge_bio_hook,