Diffstat (limited to 'fs/btrfs/disk-io.c')
 fs/btrfs/disk-io.c | 370 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 227 insertions(+), 143 deletions(-)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 22e98e04c2ea..a8f652dc940b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -45,6 +45,11 @@
45#include "inode-map.h" 45#include "inode-map.h"
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h"
49
50#ifdef CONFIG_X86
51#include <asm/cpufeature.h>
52#endif
48 53
49static struct extent_io_ops btree_extent_io_ops; 54static struct extent_io_ops btree_extent_io_ops;
50static void end_workqueue_fn(struct btrfs_work *work); 55static void end_workqueue_fn(struct btrfs_work *work);
@@ -217,26 +222,16 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	if (ret == -EEXIST) {
-		u64 failed_start = em->start;
-		u64 failed_len = em->len;
-
 		free_extent_map(em);
 		em = lookup_extent_mapping(em_tree, start, len);
-		if (em) {
-			ret = 0;
-		} else {
-			em = lookup_extent_mapping(em_tree, failed_start,
-						   failed_len);
-			ret = -EIO;
-		}
+		if (!em)
+			em = ERR_PTR(-EIO);
 	} else if (ret) {
 		free_extent_map(em);
-		em = NULL;
+		em = ERR_PTR(ret);
 	}
 	write_unlock(&em_tree->lock);
 
-	if (ret)
-		em = ERR_PTR(ret);
 out:
 	return em;
 }
@@ -393,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 		if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
 			break;
 
-		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
+		num_copies = btrfs_num_copies(root->fs_info,
 					      eb->start, eb->len);
 		if (num_copies == 1)
 			break;
@@ -439,10 +434,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 		WARN_ON(1);
 		return 0;
 	}
-	if (eb->pages[0] != page) {
-		WARN_ON(1);
-		return 0;
-	}
 	if (!PageUptodate(page)) {
 		WARN_ON(1);
 		return 0;
@@ -862,21 +853,37 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
+	int ret;
+
 	/*
 	 * when we're called for a write, we're already in the async
 	 * submission context. Just jump into btrfs_map_bio
 	 */
-	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+	ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+	if (ret)
+		bio_endio(bio, ret);
+	return ret;
+}
+
+static int check_async_write(struct inode *inode, unsigned long bio_flags)
+{
+	if (bio_flags & EXTENT_BIO_TREE_LOG)
+		return 0;
+#ifdef CONFIG_X86
+	if (cpu_has_xmm4_2)
+		return 0;
+#endif
+	return 1;
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
+	int async = check_async_write(inode, bio_flags);
 	int ret;
 
 	if (!(rw & REQ_WRITE)) {
-
 		/*
 		 * called for a read, do the setup so that checksum validation
 		 * can happen in the async kernel threads
@@ -884,20 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
 					  bio, 1);
 		if (ret)
-			return ret;
-		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+			goto out_w_error;
+		ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
 				     mirror_num, 0);
+	} else if (!async) {
+		ret = btree_csum_one_bio(bio);
+		if (ret)
+			goto out_w_error;
+		ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+				    mirror_num, 0);
+	} else {
+		/*
+		 * kthread helpers are used to submit writes so that
+		 * checksumming can happen in parallel across all CPUs
+		 */
+		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+					  inode, rw, bio, mirror_num, 0,
+					  bio_offset,
+					  __btree_submit_bio_start,
+					  __btree_submit_bio_done);
 	}
 
-	/*
-	 * kthread helpers are used to submit writes so that checksumming
-	 * can happen in parallel across all CPUs
-	 */
-	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-				   inode, rw, bio, mirror_num, 0,
-				   bio_offset,
-				   __btree_submit_bio_start,
-				   __btree_submit_bio_done);
+	if (ret) {
+out_w_error:
+		bio_endio(bio, ret);
+	}
+	return ret;
 }
 
 #ifdef CONFIG_MIGRATION
@@ -982,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
 
 static int btree_set_page_dirty(struct page *page)
 {
+#ifdef DEBUG
 	struct extent_buffer *eb;
 
 	BUG_ON(!PagePrivate(page));
@@ -990,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page)
 	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
 	BUG_ON(!atomic_read(&eb->refs));
 	btrfs_assert_tree_locked(eb);
+#endif
 	return __set_page_dirty_nobuffers(page);
 }
 
@@ -1121,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 					     root->fs_info->dirty_metadata_bytes);
 		}
 		spin_unlock(&root->fs_info->delalloc_lock);
-	}
 
-	/* ugh, clear_extent_buffer_dirty needs to lock the page */
-	btrfs_set_lock_blocking(buf);
-	clear_extent_buffer_dirty(buf);
+		/* ugh, clear_extent_buffer_dirty needs to lock the page */
+		btrfs_set_lock_blocking(buf);
+		clear_extent_buffer_dirty(buf);
+	}
 	}
 }
 
@@ -1168,8 +1189,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	atomic_set(&root->log_commit[0], 0);
 	atomic_set(&root->log_commit[1], 0);
 	atomic_set(&root->log_writers, 0);
+	atomic_set(&root->log_batch, 0);
 	atomic_set(&root->orphan_inodes, 0);
-	root->log_batch = 0;
 	root->log_transid = 0;
 	root->last_log_commit = 0;
 	extent_io_tree_init(&root->dirty_log_pages,
@@ -1185,7 +1206,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->root_key.objectid = objectid;
 	root->anon_dev = 0;
 
-	spin_lock_init(&root->root_times_lock);
+	spin_lock_init(&root->root_item_lock);
 }
 
 static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -1667,9 +1688,10 @@ static int transaction_kthread(void *arg)
 		spin_unlock(&root->fs_info->trans_lock);
 
 		/* If the file system is aborted, this will always fail. */
-		trans = btrfs_join_transaction(root);
+		trans = btrfs_attach_transaction(root);
 		if (IS_ERR(trans)) {
-			cannot_commit = true;
+			if (PTR_ERR(trans) != -ENOENT)
+				cannot_commit = true;
 			goto sleep;
 		}
 		if (transid == trans->transid) {
@@ -1994,13 +2016,11 @@ int open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->delayed_iputs);
-	INIT_LIST_HEAD(&fs_info->hashers);
 	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
 	INIT_LIST_HEAD(&fs_info->ordered_operations);
 	INIT_LIST_HEAD(&fs_info->caching_block_groups);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->trans_lock);
-	spin_lock_init(&fs_info->ref_cache_lock);
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
 	spin_lock_init(&fs_info->delayed_iput_lock);
 	spin_lock_init(&fs_info->defrag_inodes_lock);
@@ -2014,12 +2034,15 @@ int open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->space_info);
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
 	btrfs_mapping_init(&fs_info->mapping_tree);
-	btrfs_init_block_rsv(&fs_info->global_block_rsv);
-	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
-	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
-	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
-	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
-	btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
+	btrfs_init_block_rsv(&fs_info->global_block_rsv,
+			     BTRFS_BLOCK_RSV_GLOBAL);
+	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
+			     BTRFS_BLOCK_RSV_DELALLOC);
+	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
+	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
+	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
+			     BTRFS_BLOCK_RSV_DELOPS);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
@@ -2121,6 +2144,11 @@ int open_ctree(struct super_block *sb,
 	init_rwsem(&fs_info->extent_commit_sem);
 	init_rwsem(&fs_info->cleanup_work_sem);
 	init_rwsem(&fs_info->subvol_sem);
+	fs_info->dev_replace.lock_owner = 0;
+	atomic_set(&fs_info->dev_replace.nesting_level, 0);
+	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
+	mutex_init(&fs_info->dev_replace.lock_management_lock);
+	mutex_init(&fs_info->dev_replace.lock);
 
 	spin_lock_init(&fs_info->qgroup_lock);
 	fs_info->qgroup_tree = RB_ROOT;
@@ -2269,6 +2297,10 @@ int open_ctree(struct super_block *sb,
 			   fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
 
+	btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
+
 	btrfs_init_workers(&fs_info->submit_workers, "submit",
 			   min_t(u64, fs_devices->num_devices,
 			   fs_info->thread_pool_size),
@@ -2340,6 +2372,7 @@ int open_ctree(struct super_block *sb,
 	ret |= btrfs_start_workers(&fs_info->delayed_workers);
 	ret |= btrfs_start_workers(&fs_info->caching_workers);
 	ret |= btrfs_start_workers(&fs_info->readahead_workers);
+	ret |= btrfs_start_workers(&fs_info->flush_workers);
 	if (ret) {
 		err = -ENOMEM;
 		goto fail_sb_buffer;
@@ -2408,7 +2441,11 @@ int open_ctree(struct super_block *sb,
 		goto fail_tree_roots;
 	}
 
-	btrfs_close_extra_devices(fs_devices);
+	/*
+	 * keep the device that is marked to be the target device for the
+	 * dev_replace procedure
+	 */
+	btrfs_close_extra_devices(fs_info, fs_devices, 0);
 
 	if (!fs_devices->latest_bdev) {
 		printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@ -2480,6 +2517,14 @@ retry_root_backup:
 		goto fail_block_groups;
 	}
 
+	ret = btrfs_init_dev_replace(fs_info);
+	if (ret) {
+		pr_err("btrfs: failed to init dev_replace: %d\n", ret);
+		goto fail_block_groups;
+	}
+
+	btrfs_close_extra_devices(fs_info, fs_devices, 1);
+
 	ret = btrfs_init_space_info(fs_info);
 	if (ret) {
 		printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2491,6 +2536,15 @@ retry_root_backup:
 		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
 		goto fail_block_groups;
 	}
+	fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+	if (fs_info->fs_devices->missing_devices >
+	     fs_info->num_tolerated_disk_barrier_failures &&
+	    !(sb->s_flags & MS_RDONLY)) {
+		printk(KERN_WARNING
+		       "Btrfs: too many missing devices, writeable mount is not allowed\n");
+		goto fail_block_groups;
+	}
 
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
@@ -2619,6 +2673,13 @@ retry_root_backup:
 		return ret;
 	}
 
+	ret = btrfs_resume_dev_replace_async(fs_info);
+	if (ret) {
+		pr_warn("btrfs: failed to resume dev_replace\n");
+		close_ctree(tree_root);
+		return ret;
+	}
+
 	return 0;
 
 fail_qgroup:
@@ -2655,6 +2716,7 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->submit_workers);
 	btrfs_stop_workers(&fs_info->delayed_workers);
 	btrfs_stop_workers(&fs_info->caching_workers);
+	btrfs_stop_workers(&fs_info->flush_workers);
 fail_alloc:
 fail_iput:
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2874,12 +2936,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 		printk_in_rcu("btrfs: disabling barriers on dev %s\n",
 			      rcu_str_deref(device->name));
 		device->nobarriers = 1;
-	}
-	if (!bio_flagged(bio, BIO_UPTODATE)) {
+	} else if (!bio_flagged(bio, BIO_UPTODATE)) {
 		ret = -EIO;
-		if (!bio_flagged(bio, BIO_EOPNOTSUPP))
-			btrfs_dev_stat_inc_and_print(device,
-				BTRFS_DEV_STAT_FLUSH_ERRS);
+		btrfs_dev_stat_inc_and_print(device,
+			BTRFS_DEV_STAT_FLUSH_ERRS);
 	}
 
 	/* drop the reference from the wait == 0 run */
@@ -2918,14 +2978,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 {
 	struct list_head *head;
 	struct btrfs_device *dev;
-	int errors = 0;
+	int errors_send = 0;
+	int errors_wait = 0;
 	int ret;
 
 	/* send down all the barriers */
 	head = &info->fs_devices->devices;
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
-			errors++;
+			errors_send++;
 			continue;
 		}
 		if (!dev->in_fs_metadata || !dev->writeable)
@@ -2933,13 +2994,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 
 		ret = write_dev_flush(dev, 0);
 		if (ret)
-			errors++;
+			errors_send++;
 	}
 
 	/* wait for all the barriers */
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
-			errors++;
+			errors_wait++;
 			continue;
 		}
 		if (!dev->in_fs_metadata || !dev->writeable)
@@ -2947,13 +3008,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 
 		ret = write_dev_flush(dev, 1);
 		if (ret)
-			errors++;
+			errors_wait++;
 	}
-	if (errors)
+	if (errors_send > info->num_tolerated_disk_barrier_failures ||
+	    errors_wait > info->num_tolerated_disk_barrier_failures)
 		return -EIO;
 	return 0;
 }
 
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+	struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_ioctl_space_info space;
+	struct btrfs_space_info *sinfo;
+	u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+		       BTRFS_BLOCK_GROUP_SYSTEM,
+		       BTRFS_BLOCK_GROUP_METADATA,
+		       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+	int num_types = 4;
+	int i;
+	int c;
+	int num_tolerated_disk_barrier_failures =
+		(int)fs_info->fs_devices->num_devices;
+
+	for (i = 0; i < num_types; i++) {
+		struct btrfs_space_info *tmp;
+
+		sinfo = NULL;
+		rcu_read_lock();
+		list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
+			if (tmp->flags == types[i]) {
+				sinfo = tmp;
+				break;
+			}
+		}
+		rcu_read_unlock();
+
+		if (!sinfo)
+			continue;
+
+		down_read(&sinfo->groups_sem);
+		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+			if (!list_empty(&sinfo->block_groups[c])) {
+				u64 flags;
+
+				btrfs_get_block_group_info(
+					&sinfo->block_groups[c], &space);
+				if (space.total_bytes == 0 ||
+				    space.used_bytes == 0)
+					continue;
+				flags = space.flags;
+				/*
+				 * return
+				 * 0: if dup, single or RAID0 is configured for
+				 *    any of metadata, system or data, else
+				 * 1: if RAID5 is configured, or if RAID1 or
+				 *    RAID10 is configured and only two mirrors
+				 *    are used, else
+				 * 2: if RAID6 is configured, else
+				 * num_mirrors - 1: if RAID1 or RAID10 is
+				 *                  configured and more than
+				 *                  2 mirrors are used.
+				 */
+				if (num_tolerated_disk_barrier_failures > 0 &&
+				    ((flags & (BTRFS_BLOCK_GROUP_DUP |
+					       BTRFS_BLOCK_GROUP_RAID0)) ||
+				     ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
+				      == 0)))
+					num_tolerated_disk_barrier_failures = 0;
+				else if (num_tolerated_disk_barrier_failures > 1
+					 &&
+					 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+						   BTRFS_BLOCK_GROUP_RAID10)))
+					num_tolerated_disk_barrier_failures = 1;
+			}
+		}
+		up_read(&sinfo->groups_sem);
+	}
+
+	return num_tolerated_disk_barrier_failures;
+}
+
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
 	struct list_head *head;
@@ -2976,8 +3111,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	head = &root->fs_info->fs_devices->devices;
 
-	if (do_barriers)
-		barrier_all_devices(root->fs_info);
+	if (do_barriers) {
+		ret = barrier_all_devices(root->fs_info);
+		if (ret) {
+			mutex_unlock(
+				&root->fs_info->fs_devices->device_list_mutex);
+			btrfs_error(root->fs_info, ret,
+				    "errors while submitting device barriers.");
+			return ret;
+		}
+	}
 
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
@@ -3177,16 +3320,18 @@ int close_ctree(struct btrfs_root *root)
 	smp_mb();
 
 	/* pause restriper - we want to resume on mount */
-	btrfs_pause_balance(root->fs_info);
+	btrfs_pause_balance(fs_info);
 
-	btrfs_scrub_cancel(root);
+	btrfs_dev_replace_suspend_for_unmount(fs_info);
+
+	btrfs_scrub_cancel(fs_info);
 
 	/* wait for any defraggers to finish */
 	wait_event(fs_info->transaction_wait,
 		   (atomic_read(&fs_info->defrag_running) == 0));
 
 	/* clear out the rbtree of defraggable inodes */
-	btrfs_run_defrag_inodes(fs_info);
+	btrfs_cleanup_defrag_inodes(fs_info);
 
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_commit_super(root);
@@ -3211,10 +3356,6 @@ int close_ctree(struct btrfs_root *root)
 		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
 		       (unsigned long long)fs_info->delalloc_bytes);
 	}
-	if (fs_info->total_ref_cache_size) {
-		printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
-		       (unsigned long long)fs_info->total_ref_cache_size);
-	}
 
 	free_extent_buffer(fs_info->extent_root->node);
 	free_extent_buffer(fs_info->extent_root->commit_root);
@@ -3250,6 +3391,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->delayed_workers);
 	btrfs_stop_workers(&fs_info->caching_workers);
 	btrfs_stop_workers(&fs_info->readahead_workers);
+	btrfs_stop_workers(&fs_info->flush_workers);
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3294,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	int was_dirty;
 
 	btrfs_assert_tree_locked(buf);
-	if (transid != root->fs_info->generation) {
-		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
+	if (transid != root->fs_info->generation)
+		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
 		       "found %llu running %llu\n",
 		       (unsigned long long)buf->start,
 		       (unsigned long long)transid,
 		       (unsigned long long)root->fs_info->generation);
-		WARN_ON(1);
-	}
 	was_dirty = set_extent_buffer_dirty(buf);
 	if (!was_dirty) {
 		spin_lock(&root->fs_info->delalloc_lock);
@@ -3310,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	}
 }
 
-void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
+					int flush_delayed)
 {
 	/*
 	 * looks as though older kernels can get into trouble with
@@ -3322,36 +3463,26 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	if (current->flags & PF_MEMALLOC)
 		return;
 
-	btrfs_balance_delayed_items(root);
+	if (flush_delayed)
+		btrfs_balance_delayed_items(root);
 
 	num_dirty = root->fs_info->dirty_metadata_bytes;
 
 	if (num_dirty > thresh) {
-		balance_dirty_pages_ratelimited_nr(
-			root->fs_info->btree_inode->i_mapping, 1);
+		balance_dirty_pages_ratelimited(
+			root->fs_info->btree_inode->i_mapping);
 	}
 	return;
 }
 
-void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+void btrfs_btree_balance_dirty(struct btrfs_root *root)
 {
-	/*
-	 * looks as though older kernels can get into trouble with
-	 * this code, they end up stuck in balance_dirty_pages forever
-	 */
-	u64 num_dirty;
-	unsigned long thresh = 32 * 1024 * 1024;
-
-	if (current->flags & PF_MEMALLOC)
-		return;
-
-	num_dirty = root->fs_info->dirty_metadata_bytes;
+	__btrfs_btree_balance_dirty(root, 1);
+}
 
-	if (num_dirty > thresh) {
-		balance_dirty_pages_ratelimited_nr(
-			root->fs_info->btree_inode->i_mapping, 1);
-	}
-	return;
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
+{
+	__btrfs_btree_balance_dirty(root, 0);
 }
 
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
@@ -3360,52 +3491,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 }
 
-int btree_lock_page_hook(struct page *page, void *data,
-				void (*flush_fn)(void *))
-{
-	struct inode *inode = page->mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_buffer *eb;
-
-	/*
-	 * We culled this eb but the page is still hanging out on the mapping,
-	 * carry on.
-	 */
-	if (!PagePrivate(page))
-		goto out;
-
-	eb = (struct extent_buffer *)page->private;
-	if (!eb) {
-		WARN_ON(1);
-		goto out;
-	}
-	if (page != eb->pages[0])
-		goto out;
-
-	if (!btrfs_try_tree_write_lock(eb)) {
-		flush_fn(data);
-		btrfs_tree_lock(eb);
-	}
-	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-
-	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-		spin_lock(&root->fs_info->delalloc_lock);
-		if (root->fs_info->dirty_metadata_bytes >= eb->len)
-			root->fs_info->dirty_metadata_bytes -= eb->len;
-		else
-			WARN_ON(1);
-		spin_unlock(&root->fs_info->delalloc_lock);
-	}
-
-	btrfs_tree_unlock(eb);
-out:
-	if (!trylock_page(page)) {
-		flush_fn(data);
-		lock_page(page);
-	}
-	return 0;
-}
-
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 				   int read_only)
 {
@@ -3608,7 +3693,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 
 	while (1) {
 		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
-					    mark);
+					    mark, NULL);
 		if (ret)
 			break;
 
@@ -3663,7 +3748,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
 again:
 	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 		if (ret)
 			break;
 
@@ -3800,7 +3885,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
 }
 
 static struct extent_io_ops btree_extent_io_ops = {
-	.write_cache_pages_lock_hook = btree_lock_page_hook,
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
 	.readpage_io_failed_hook = btree_io_failed_hook,
 	.submit_bio_hook = btree_submit_bio_hook,