Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--  fs/btrfs/disk-io.c  233
1 file changed, 41 insertions(+), 192 deletions(-)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dfdab849037b..10a2a579cc7f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,6 +50,8 @@
 #include "sysfs.h"
 #include "qgroup.h"
 #include "compression.h"
+#include "tree-checker.h"
+#include "ref-verify.h"
 
 #ifdef CONFIG_X86
 #include <asm/cpufeature.h>
@@ -543,146 +545,6 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
-#define CORRUPT(reason, eb, root, slot)					\
-	btrfs_crit(root->fs_info,					\
-		   "corrupt %s, %s: block=%llu, root=%llu, slot=%d",	\
-		   btrfs_header_level(eb) == 0 ? "leaf" : "node",	\
-		   reason, btrfs_header_bytenr(eb), root->objectid, slot)
-
-static noinline int check_leaf(struct btrfs_root *root,
-			       struct extent_buffer *leaf)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_key key;
-	struct btrfs_key leaf_key;
-	u32 nritems = btrfs_header_nritems(leaf);
-	int slot;
-
-	/*
-	 * Extent buffers from a relocation tree have a owner field that
-	 * corresponds to the subvolume tree they are based on. So just from an
-	 * extent buffer alone we can not find out what is the id of the
-	 * corresponding subvolume tree, so we can not figure out if the extent
-	 * buffer corresponds to the root of the relocation tree or not. So skip
-	 * this check for relocation trees.
-	 */
-	if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
-		struct btrfs_root *check_root;
-
-		key.objectid = btrfs_header_owner(leaf);
-		key.type = BTRFS_ROOT_ITEM_KEY;
-		key.offset = (u64)-1;
-
-		check_root = btrfs_get_fs_root(fs_info, &key, false);
-		/*
-		 * The only reason we also check NULL here is that during
-		 * open_ctree() some roots has not yet been set up.
-		 */
-		if (!IS_ERR_OR_NULL(check_root)) {
-			struct extent_buffer *eb;
-
-			eb = btrfs_root_node(check_root);
-			/* if leaf is the root, then it's fine */
-			if (leaf != eb) {
-				CORRUPT("non-root leaf's nritems is 0",
-					leaf, check_root, 0);
-				free_extent_buffer(eb);
-				return -EIO;
-			}
-			free_extent_buffer(eb);
-		}
-		return 0;
-	}
-
-	if (nritems == 0)
-		return 0;
-
-	/* Check the 0 item */
-	if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
-	    BTRFS_LEAF_DATA_SIZE(fs_info)) {
-		CORRUPT("invalid item offset size pair", leaf, root, 0);
-		return -EIO;
-	}
-
-	/*
-	 * Check to make sure each items keys are in the correct order and their
-	 * offsets make sense. We only have to loop through nritems-1 because
-	 * we check the current slot against the next slot, which verifies the
-	 * next slot's offset+size makes sense and that the current's slot
-	 * offset is correct.
-	 */
-	for (slot = 0; slot < nritems - 1; slot++) {
-		btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
-		btrfs_item_key_to_cpu(leaf, &key, slot + 1);
-
-		/* Make sure the keys are in the right order */
-		if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
-			CORRUPT("bad key order", leaf, root, slot);
-			return -EIO;
-		}
-
-		/*
-		 * Make sure the offset and ends are right, remember that the
-		 * item data starts at the end of the leaf and grows towards the
-		 * front.
-		 */
-		if (btrfs_item_offset_nr(leaf, slot) !=
-			btrfs_item_end_nr(leaf, slot + 1)) {
-			CORRUPT("slot offset bad", leaf, root, slot);
-			return -EIO;
-		}
-
-		/*
-		 * Check to make sure that we don't point outside of the leaf,
-		 * just in case all the items are consistent to each other, but
-		 * all point outside of the leaf.
-		 */
-		if (btrfs_item_end_nr(leaf, slot) >
-		    BTRFS_LEAF_DATA_SIZE(fs_info)) {
-			CORRUPT("slot end outside of leaf", leaf, root, slot);
-			return -EIO;
-		}
-	}
-
-	return 0;
-}
-
-static int check_node(struct btrfs_root *root, struct extent_buffer *node)
-{
-	unsigned long nr = btrfs_header_nritems(node);
-	struct btrfs_key key, next_key;
-	int slot;
-	u64 bytenr;
-	int ret = 0;
-
-	if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) {
-		btrfs_crit(root->fs_info,
-			   "corrupt node: block %llu root %llu nritems %lu",
-			   node->start, root->objectid, nr);
-		return -EIO;
-	}
-
-	for (slot = 0; slot < nr - 1; slot++) {
-		bytenr = btrfs_node_blockptr(node, slot);
-		btrfs_node_key_to_cpu(node, &key, slot);
-		btrfs_node_key_to_cpu(node, &next_key, slot + 1);
-
-		if (!bytenr) {
-			CORRUPT("invalid item slot", node, root, slot);
-			ret = -EIO;
-			goto out;
-		}
-
-		if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
-			CORRUPT("bad key order", node, root, slot);
-			ret = -EIO;
-			goto out;
-		}
-	}
-out:
-	return ret;
-}
-
 static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 				      u64 phy_offset, struct page *page,
 				      u64 start, u64 end, int mirror)
@@ -748,12 +610,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 	 * that we don't try and read the other copies of this block, just
 	 * return -EIO.
 	 */
-	if (found_level == 0 && check_leaf(root, eb)) {
+	if (found_level == 0 && btrfs_check_leaf_full(root, eb)) {
 		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 		ret = -EIO;
 	}
 
-	if (found_level > 0 && check_node(root, eb))
+	if (found_level > 0 && btrfs_check_node(root, eb))
 		ret = -EIO;
 
 	if (!ret)
@@ -879,22 +741,9 @@ static void run_one_async_start(struct btrfs_work *work)
 
 static void run_one_async_done(struct btrfs_work *work)
 {
-	struct btrfs_fs_info *fs_info;
 	struct async_submit_bio *async;
-	int limit;
 
 	async = container_of(work, struct async_submit_bio, work);
-	fs_info = async->fs_info;
-
-	limit = btrfs_async_submit_limit(fs_info);
-	limit = limit * 2 / 3;
-
-	/*
-	 * atomic_dec_return implies a barrier for waitqueue_active
-	 */
-	if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
-	    waitqueue_active(&fs_info->async_submit_wait))
-		wake_up(&fs_info->async_submit_wait);
 
 	/* If an error occurred we just want to clean up the bio and move on */
 	if (async->status) {
@@ -942,19 +791,10 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 
 	async->status = 0;
 
-	atomic_inc(&fs_info->nr_async_submits);
-
 	if (op_is_sync(bio->bi_opf))
 		btrfs_set_work_high_priority(&async->work);
 
 	btrfs_queue_work(fs_info->workers, &async->work);
-
-	while (atomic_read(&fs_info->async_submit_draining) &&
-	       atomic_read(&fs_info->nr_async_submits)) {
-		wait_event(fs_info->async_submit_wait,
-			   (atomic_read(&fs_info->nr_async_submits) == 0));
-	}
-
 	return 0;
 }
 
@@ -1005,9 +845,9 @@ static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio,
 	return ret;
 }
 
-static int check_async_write(unsigned long bio_flags)
+static int check_async_write(struct btrfs_inode *bi)
 {
-	if (bio_flags & EXTENT_BIO_TREE_LOG)
+	if (atomic_read(&bi->sync_writers))
 		return 0;
 #ifdef CONFIG_X86
 	if (static_cpu_has(X86_FEATURE_XMM4_2))
@@ -1022,7 +862,7 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio,
 {
 	struct inode *inode = private_data;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	int async = check_async_write(bio_flags);
+	int async = check_async_write(BTRFS_I(inode));
 	blk_status_t ret;
 
 	if (bio_op(bio) != REQ_OP_WRITE) {
@@ -2607,14 +2447,6 @@ int open_ctree(struct super_block *sb,
 		goto fail_delalloc_bytes;
 	}
 
-	fs_info->btree_inode = new_inode(sb);
-	if (!fs_info->btree_inode) {
-		err = -ENOMEM;
-		goto fail_bio_counter;
-	}
-
-	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
-
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
 	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
 	INIT_LIST_HEAD(&fs_info->trans_list);
@@ -2647,17 +2479,12 @@ int open_ctree(struct super_block *sb,
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	btrfs_init_block_rsv(&fs_info->global_block_rsv,
 			     BTRFS_BLOCK_RSV_GLOBAL);
-	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
-			     BTRFS_BLOCK_RSV_DELALLOC);
 	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
 	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
 	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
 	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
 			     BTRFS_BLOCK_RSV_DELOPS);
-	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
-	atomic_set(&fs_info->async_submit_draining, 0);
-	atomic_set(&fs_info->nr_async_bios, 0);
 	atomic_set(&fs_info->defrag_running, 0);
 	atomic_set(&fs_info->qgroup_op_seq, 0);
 	atomic_set(&fs_info->reada_works_cnt, 0);
@@ -2673,12 +2500,21 @@ int open_ctree(struct super_block *sb,
 	/* readahead state */
 	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 	spin_lock_init(&fs_info->reada_lock);
+	btrfs_init_ref_verify(fs_info);
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
 
 	INIT_LIST_HEAD(&fs_info->ordered_roots);
 	spin_lock_init(&fs_info->ordered_root_lock);
+
+	fs_info->btree_inode = new_inode(sb);
+	if (!fs_info->btree_inode) {
+		err = -ENOMEM;
+		goto fail_bio_counter;
+	}
+	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
+
 	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
 					GFP_KERNEL);
 	if (!fs_info->delayed_root) {
@@ -2895,12 +2731,13 @@ int open_ctree(struct super_block *sb,
 	sb->s_bdi->congested_fn = btrfs_congested_fn;
 	sb->s_bdi->congested_data = fs_info;
 	sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
-	sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+	sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE;
 	sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
 	sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
 
 	sb->s_blocksize = sectorsize;
 	sb->s_blocksize_bits = blksize_bits(sectorsize);
+	memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE);
 
 	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_sys_array(fs_info);
@@ -3083,6 +2920,9 @@ retry_root_backup:
 	if (ret)
 		goto fail_trans_kthread;
 
+	if (btrfs_build_ref_tree(fs_info))
+		btrfs_err(fs_info, "couldn't build ref tree");
+
 	/* do not make disk changes in broken FS or nologreplay is given */
 	if (btrfs_super_log_root(disk_super) != 0 &&
 	    !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
@@ -3948,6 +3788,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 
 	btrfs_free_stripe_hash_table(fs_info);
+	btrfs_free_ref_cache(fs_info);
 
 	__btrfs_free_block_rsv(root->orphan_block_rsv);
 	root->orphan_block_rsv = NULL;
@@ -4007,7 +3848,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 					 buf->len,
 					 fs_info->dirty_metadata_batch);
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-	if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
+	/*
+	 * Since btrfs_mark_buffer_dirty() can be called with item pointer set
+	 * but item data not updated.
+	 * So here we should only check item pointers, not item data.
+	 */
+	if (btrfs_header_level(buf) == 0 &&
+	    btrfs_check_leaf_relaxed(root, buf)) {
 		btrfs_print_leaf(buf);
 		ASSERT(0);
 	}
@@ -4272,26 +4119,28 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 
 	while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
 		struct btrfs_delayed_ref_head *head;
-		struct btrfs_delayed_ref_node *tmp;
+		struct rb_node *n;
 		bool pin_bytes = false;
 
 		head = rb_entry(node, struct btrfs_delayed_ref_head,
 				href_node);
 		if (!mutex_trylock(&head->mutex)) {
-			refcount_inc(&head->node.refs);
+			refcount_inc(&head->refs);
 			spin_unlock(&delayed_refs->lock);
 
 			mutex_lock(&head->mutex);
 			mutex_unlock(&head->mutex);
-			btrfs_put_delayed_ref(&head->node);
+			btrfs_put_delayed_ref_head(head);
 			spin_lock(&delayed_refs->lock);
 			continue;
 		}
 		spin_lock(&head->lock);
-		list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list,
-						 list) {
+		while ((n = rb_first(&head->ref_tree)) != NULL) {
+			ref = rb_entry(n, struct btrfs_delayed_ref_node,
+				       ref_node);
 			ref->in_tree = 0;
-			list_del(&ref->list);
+			rb_erase(&ref->ref_node, &head->ref_tree);
+			RB_CLEAR_NODE(&ref->ref_node);
 			if (!list_empty(&ref->add_list))
 				list_del(&ref->add_list);
 			atomic_dec(&delayed_refs->num_entries);
@@ -4304,16 +4153,16 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 		if (head->processing == 0)
 			delayed_refs->num_heads_ready--;
 		atomic_dec(&delayed_refs->num_entries);
-		head->node.in_tree = 0;
 		rb_erase(&head->href_node, &delayed_refs->href_root);
+		RB_CLEAR_NODE(&head->href_node);
 		spin_unlock(&head->lock);
 		spin_unlock(&delayed_refs->lock);
 		mutex_unlock(&head->mutex);
 
 		if (pin_bytes)
-			btrfs_pin_extent(fs_info, head->node.bytenr,
-					 head->node.num_bytes, 1);
+			btrfs_pin_extent(fs_info, head->bytenr,
+					 head->num_bytes, 1);
-		btrfs_put_delayed_ref(&head->node);
+		btrfs_put_delayed_ref_head(head);
 		cond_resched();
 		spin_lock(&delayed_refs->lock);
 	}