aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-12-18 12:42:05 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-18 12:42:05 -0500
commita22180d2666c018f4fef6818074d78bb76ff2bda (patch)
treea633aaf423ff39f94d00502d03dbbd99dab4b2ee /fs
parent2d4dce0070448bcb5ccd04553a4be4635417f565 (diff)
parent213490b301773ea9c6fb89a86424a6901fcdd069 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs update from Chris Mason: "A big set of fixes and features. In terms of line count, most of the code comes from Stefan, who added the ability to replace a single drive in place. This is different from how btrfs normally replaces drives, and is much much much faster. Josef is plowing through our synchronous write performance. This pull request does not include the DIO_OWN_WAITING patch that was discussed on the list, but it has a number of other improvements to cut down our latencies and CPU time during fsync/O_DIRECT writes. Miao Xie has a big series of fixes and is spreading out ordered operations over more CPUs. This improves performance and reduces contention. I've put in fixes for error handling around hash collisions. These are going back to individual stable kernels as I test against them. Otherwise we have a lot of fixes and cleanups, thanks everyone! raid5/6 is being rebased against the device replacement code. I'll have it posted this Friday along with a nice series of benchmarks." 
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (115 commits) Btrfs: fix a bug of per-file nocow Btrfs: fix hash overflow handling Btrfs: don't take inode delalloc mutex if we're a free space inode Btrfs: fix autodefrag and umount lockup Btrfs: fix permissions of empty files not affected by umask Btrfs: put raid properties into global table Btrfs: fix BUG() in scrub when first superblock reading gives EIO Btrfs: do not call file_update_time in aio_write Btrfs: only unlock and relock if we have to Btrfs: use tokens where we can in the tree log Btrfs: optimize leaf_space_used Btrfs: don't memset new tokens Btrfs: only clear dirty on the buffer if it is marked as dirty Btrfs: move checks in set_page_dirty under DEBUG Btrfs: log changed inodes based on the extent map tree Btrfs: add path->really_keep_locks Btrfs: do not mark ems as prealloc if we are writing to them Btrfs: keep track of the extents original block length Btrfs: inline csums if we're fsyncing Btrfs: don't bother copying if we're only logging the inode ...
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/acl.c2
-rw-r--r--fs/btrfs/backref.c16
-rw-r--r--fs/btrfs/btrfs_inode.h4
-rw-r--r--fs/btrfs/check-integrity.c31
-rw-r--r--fs/btrfs/compression.c6
-rw-r--r--fs/btrfs/ctree.c241
-rw-r--r--fs/btrfs/ctree.h182
-rw-r--r--fs/btrfs/delayed-inode.c11
-rw-r--r--fs/btrfs/dev-replace.c856
-rw-r--r--fs/btrfs/dev-replace.h44
-rw-r--r--fs/btrfs/dir-item.c59
-rw-r--r--fs/btrfs/disk-io.c142
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c227
-rw-r--r--fs/btrfs/extent_io.c37
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/extent_map.c24
-rw-r--r--fs/btrfs/extent_map.h2
-rw-r--r--fs/btrfs/file-item.c21
-rw-r--r--fs/btrfs/file.c406
-rw-r--r--fs/btrfs/free-space-cache.c51
-rw-r--r--fs/btrfs/inode-map.c5
-rw-r--r--fs/btrfs/inode.c484
-rw-r--r--fs/btrfs/ioctl.c317
-rw-r--r--fs/btrfs/ioctl.h48
-rw-r--r--fs/btrfs/math.h44
-rw-r--r--fs/btrfs/ordered-data.c90
-rw-r--r--fs/btrfs/ordered-data.h7
-rw-r--r--fs/btrfs/print-tree.c3
-rw-r--r--fs/btrfs/reada.c31
-rw-r--r--fs/btrfs/relocation.c40
-rw-r--r--fs/btrfs/root-tree.c4
-rw-r--r--fs/btrfs/scrub.c1836
-rw-r--r--fs/btrfs/send.c8
-rw-r--r--fs/btrfs/super.c48
-rw-r--r--fs/btrfs/transaction.c170
-rw-r--r--fs/btrfs/transaction.h2
-rw-r--r--fs/btrfs/tree-log.c477
-rw-r--r--fs/btrfs/volumes.c966
-rw-r--r--fs/btrfs/volumes.h35
-rw-r--r--fs/btrfs/xattr.c13
42 files changed, 5255 insertions, 1745 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d7fcdba141a2..7df3e0f0ee51 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o backref.o ulist.o qgroup.o send.o 11 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
12 12
13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o 14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0c16e3dbfd56..e15d2b0d8d3b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
121 ret = posix_acl_equiv_mode(acl, &inode->i_mode); 121 ret = posix_acl_equiv_mode(acl, &inode->i_mode);
122 if (ret < 0) 122 if (ret < 0)
123 return ret; 123 return ret;
124 if (ret == 0)
125 acl = NULL;
124 } 126 }
125 ret = 0; 127 ret = 0;
126 break; 128 break;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 208d8aa5b07e..04edf69be875 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -461,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode)
461 pos2 = n2, n2 = pos2->next) { 461 pos2 = n2, n2 = pos2->next) {
462 struct __prelim_ref *ref2; 462 struct __prelim_ref *ref2;
463 struct __prelim_ref *xchg; 463 struct __prelim_ref *xchg;
464 struct extent_inode_elem *eie;
464 465
465 ref2 = list_entry(pos2, struct __prelim_ref, list); 466 ref2 = list_entry(pos2, struct __prelim_ref, list);
466 467
@@ -472,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode)
472 ref1 = ref2; 473 ref1 = ref2;
473 ref2 = xchg; 474 ref2 = xchg;
474 } 475 }
475 ref1->count += ref2->count;
476 } else { 476 } else {
477 if (ref1->parent != ref2->parent) 477 if (ref1->parent != ref2->parent)
478 continue; 478 continue;
479 ref1->count += ref2->count;
480 } 479 }
480
481 eie = ref1->inode_list;
482 while (eie && eie->next)
483 eie = eie->next;
484 if (eie)
485 eie->next = ref2->inode_list;
486 else
487 ref1->inode_list = ref2->inode_list;
488 ref1->count += ref2->count;
489
481 list_del(&ref2->list); 490 list_del(&ref2->list);
482 kfree(ref2); 491 kfree(ref2);
483 } 492 }
@@ -890,8 +899,7 @@ again:
890 while (!list_empty(&prefs)) { 899 while (!list_empty(&prefs)) {
891 ref = list_first_entry(&prefs, struct __prelim_ref, list); 900 ref = list_first_entry(&prefs, struct __prelim_ref, list);
892 list_del(&ref->list); 901 list_del(&ref->list);
893 if (ref->count < 0) 902 WARN_ON(ref->count < 0);
894 WARN_ON(1);
895 if (ref->count && ref->root_id && ref->parent == 0) { 903 if (ref->count && ref->root_id && ref->parent == 0) {
896 /* no parent == root of tree */ 904 /* no parent == root of tree */
897 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); 905 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ed8ca7ca5eff..2a8c242bc4f5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -39,6 +39,7 @@
39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
41#define BTRFS_INODE_NEEDS_FULL_SYNC 7 41#define BTRFS_INODE_NEEDS_FULL_SYNC 7
42#define BTRFS_INODE_COPY_EVERYTHING 8
42 43
43/* in memory btrfs inode */ 44/* in memory btrfs inode */
44struct btrfs_inode { 45struct btrfs_inode {
@@ -90,6 +91,9 @@ struct btrfs_inode {
90 91
91 unsigned long runtime_flags; 92 unsigned long runtime_flags;
92 93
94 /* Keep track of who's O_SYNC/fsycing currently */
95 atomic_t sync_writers;
96
93 /* full 64 bit generation number, struct vfs_inode doesn't have a big 97 /* full 64 bit generation number, struct vfs_inode doesn't have a big
94 * enough field for this. 98 * enough field for this.
95 */ 99 */
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 5a3e45db642a..11d47bfb62b4 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -137,7 +137,7 @@ struct btrfsic_block {
137 unsigned int never_written:1; /* block was added because it was 137 unsigned int never_written:1; /* block was added because it was
138 * referenced, not because it was 138 * referenced, not because it was
139 * written */ 139 * written */
140 unsigned int mirror_num:2; /* large enough to hold 140 unsigned int mirror_num; /* large enough to hold
141 * BTRFS_SUPER_MIRROR_MAX */ 141 * BTRFS_SUPER_MIRROR_MAX */
142 struct btrfsic_dev_state *dev_state; 142 struct btrfsic_dev_state *dev_state;
143 u64 dev_bytenr; /* key, physical byte num on disk */ 143 u64 dev_bytenr; /* key, physical byte num on disk */
@@ -723,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
723 } 723 }
724 724
725 num_copies = 725 num_copies =
726 btrfs_num_copies(&state->root->fs_info->mapping_tree, 726 btrfs_num_copies(state->root->fs_info,
727 next_bytenr, state->metablock_size); 727 next_bytenr, state->metablock_size);
728 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 728 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
729 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 729 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -903,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(
903 } 903 }
904 904
905 num_copies = 905 num_copies =
906 btrfs_num_copies(&state->root->fs_info->mapping_tree, 906 btrfs_num_copies(state->root->fs_info,
907 next_bytenr, state->metablock_size); 907 next_bytenr, state->metablock_size);
908 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 908 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
909 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 909 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1287,7 +1287,7 @@ static int btrfsic_create_link_to_next_block(
1287 *next_blockp = NULL; 1287 *next_blockp = NULL;
1288 if (0 == *num_copiesp) { 1288 if (0 == *num_copiesp) {
1289 *num_copiesp = 1289 *num_copiesp =
1290 btrfs_num_copies(&state->root->fs_info->mapping_tree, 1290 btrfs_num_copies(state->root->fs_info,
1291 next_bytenr, state->metablock_size); 1291 next_bytenr, state->metablock_size);
1292 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1292 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1293 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 1293 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1489,7 +1489,7 @@ static int btrfsic_handle_extent_data(
1489 chunk_len = num_bytes; 1489 chunk_len = num_bytes;
1490 1490
1491 num_copies = 1491 num_copies =
1492 btrfs_num_copies(&state->root->fs_info->mapping_tree, 1492 btrfs_num_copies(state->root->fs_info,
1493 next_bytenr, state->datablock_size); 1493 next_bytenr, state->datablock_size);
1494 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1494 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1495 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 1495 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1582,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1582 struct btrfs_device *device; 1582 struct btrfs_device *device;
1583 1583
1584 length = len; 1584 length = len;
1585 ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, 1585 ret = btrfs_map_block(state->root->fs_info, READ,
1586 bytenr, &length, &multi, mirror_num); 1586 bytenr, &length, &multi, mirror_num);
1587 1587
1588 if (ret) {
1589 block_ctx_out->start = 0;
1590 block_ctx_out->dev_bytenr = 0;
1591 block_ctx_out->len = 0;
1592 block_ctx_out->dev = NULL;
1593 block_ctx_out->datav = NULL;
1594 block_ctx_out->pagev = NULL;
1595 block_ctx_out->mem_to_free = NULL;
1596
1597 return ret;
1598 }
1599
1588 device = multi->stripes[0].dev; 1600 device = multi->stripes[0].dev;
1589 block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); 1601 block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
1590 block_ctx_out->dev_bytenr = multi->stripes[0].physical; 1602 block_ctx_out->dev_bytenr = multi->stripes[0].physical;
@@ -1594,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1594 block_ctx_out->pagev = NULL; 1606 block_ctx_out->pagev = NULL;
1595 block_ctx_out->mem_to_free = NULL; 1607 block_ctx_out->mem_to_free = NULL;
1596 1608
1597 if (0 == ret) 1609 kfree(multi);
1598 kfree(multi);
1599 if (NULL == block_ctx_out->dev) { 1610 if (NULL == block_ctx_out->dev) {
1600 ret = -ENXIO; 1611 ret = -ENXIO;
1601 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); 1612 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
@@ -2463,7 +2474,7 @@ static int btrfsic_process_written_superblock(
2463 } 2474 }
2464 2475
2465 num_copies = 2476 num_copies =
2466 btrfs_num_copies(&state->root->fs_info->mapping_tree, 2477 btrfs_num_copies(state->root->fs_info,
2467 next_bytenr, BTRFS_SUPER_INFO_SIZE); 2478 next_bytenr, BTRFS_SUPER_INFO_SIZE);
2468 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 2479 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
2469 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 2480 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -2960,7 +2971,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2960 struct btrfsic_block_data_ctx block_ctx; 2971 struct btrfsic_block_data_ctx block_ctx;
2961 int match = 0; 2972 int match = 0;
2962 2973
2963 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, 2974 num_copies = btrfs_num_copies(state->root->fs_info,
2964 bytenr, state->metablock_size); 2975 bytenr, state->metablock_size);
2965 2976
2966 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { 2977 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c6467aa88bee..94ab2f80e7e3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -687,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
687 687
688 ret = btrfs_map_bio(root, READ, comp_bio, 688 ret = btrfs_map_bio(root, READ, comp_bio,
689 mirror_num, 0); 689 mirror_num, 0);
690 BUG_ON(ret); /* -ENOMEM */ 690 if (ret)
691 bio_endio(comp_bio, ret);
691 692
692 bio_put(comp_bio); 693 bio_put(comp_bio);
693 694
@@ -712,7 +713,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
712 } 713 }
713 714
714 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); 715 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
715 BUG_ON(ret); /* -ENOMEM */ 716 if (ret)
717 bio_endio(comp_bio, ret);
716 718
717 bio_put(comp_bio); 719 bio_put(comp_bio);
718 return 0; 720 return 0;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cdfb4c49a806..c7b67cf24bba 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
38 struct extent_buffer *dst_buf, 38 struct extent_buffer *dst_buf,
39 struct extent_buffer *src_buf); 39 struct extent_buffer *src_buf);
40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
41 struct btrfs_path *path, int level, int slot, 41 struct btrfs_path *path, int level, int slot);
42 int tree_mod_log);
43static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 42static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
44 struct extent_buffer *eb); 43 struct extent_buffer *eb);
45struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, 44struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -776,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
776 775
777static noinline void 776static noinline void
778tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, 777tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
779 struct extent_buffer *eb, 778 struct extent_buffer *eb, int slot, int atomic)
780 struct btrfs_disk_key *disk_key, int slot, int atomic)
781{ 779{
782 int ret; 780 int ret;
783 781
@@ -1140,13 +1138,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1140 switch (tm->op) { 1138 switch (tm->op) {
1141 case MOD_LOG_KEY_REMOVE_WHILE_FREEING: 1139 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1142 BUG_ON(tm->slot < n); 1140 BUG_ON(tm->slot < n);
1143 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1144 case MOD_LOG_KEY_REMOVE: 1141 case MOD_LOG_KEY_REMOVE:
1142 n++;
1143 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1145 btrfs_set_node_key(eb, &tm->key, tm->slot); 1144 btrfs_set_node_key(eb, &tm->key, tm->slot);
1146 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); 1145 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1147 btrfs_set_node_ptr_generation(eb, tm->slot, 1146 btrfs_set_node_ptr_generation(eb, tm->slot,
1148 tm->generation); 1147 tm->generation);
1149 n++;
1150 break; 1148 break;
1151 case MOD_LOG_KEY_REPLACE: 1149 case MOD_LOG_KEY_REPLACE:
1152 BUG_ON(tm->slot >= n); 1150 BUG_ON(tm->slot >= n);
@@ -1361,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1361 u64 search_start; 1359 u64 search_start;
1362 int ret; 1360 int ret;
1363 1361
1364 if (trans->transaction != root->fs_info->running_transaction) { 1362 if (trans->transaction != root->fs_info->running_transaction)
1365 printk(KERN_CRIT "trans %llu running %llu\n", 1363 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1366 (unsigned long long)trans->transid, 1364 (unsigned long long)trans->transid,
1367 (unsigned long long) 1365 (unsigned long long)
1368 root->fs_info->running_transaction->transid); 1366 root->fs_info->running_transaction->transid);
1369 WARN_ON(1); 1367
1370 } 1368 if (trans->transid != root->fs_info->generation)
1371 if (trans->transid != root->fs_info->generation) { 1369 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1372 printk(KERN_CRIT "trans %llu running %llu\n",
1373 (unsigned long long)trans->transid, 1370 (unsigned long long)trans->transid,
1374 (unsigned long long)root->fs_info->generation); 1371 (unsigned long long)root->fs_info->generation);
1375 WARN_ON(1);
1376 }
1377 1372
1378 if (!should_cow_block(trans, root, buf)) { 1373 if (!should_cow_block(trans, root, buf)) {
1379 *cow_ret = buf; 1374 *cow_ret = buf;
@@ -1469,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1469 if (cache_only && parent_level != 1) 1464 if (cache_only && parent_level != 1)
1470 return 0; 1465 return 0;
1471 1466
1472 if (trans->transaction != root->fs_info->running_transaction) 1467 WARN_ON(trans->transaction != root->fs_info->running_transaction);
1473 WARN_ON(1); 1468 WARN_ON(trans->transid != root->fs_info->generation);
1474 if (trans->transid != root->fs_info->generation)
1475 WARN_ON(1);
1476 1469
1477 parent_nritems = btrfs_header_nritems(parent); 1470 parent_nritems = btrfs_header_nritems(parent);
1478 blocksize = btrfs_level_size(root, parent_level - 1); 1471 blocksize = btrfs_level_size(root, parent_level - 1);
@@ -1827,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1827 if (btrfs_header_nritems(right) == 0) { 1820 if (btrfs_header_nritems(right) == 0) {
1828 clean_tree_block(trans, root, right); 1821 clean_tree_block(trans, root, right);
1829 btrfs_tree_unlock(right); 1822 btrfs_tree_unlock(right);
1830 del_ptr(trans, root, path, level + 1, pslot + 1, 1); 1823 del_ptr(trans, root, path, level + 1, pslot + 1);
1831 root_sub_used(root, right->len); 1824 root_sub_used(root, right->len);
1832 btrfs_free_tree_block(trans, root, right, 0, 1); 1825 btrfs_free_tree_block(trans, root, right, 0, 1);
1833 free_extent_buffer_stale(right); 1826 free_extent_buffer_stale(right);
@@ -1836,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1836 struct btrfs_disk_key right_key; 1829 struct btrfs_disk_key right_key;
1837 btrfs_node_key(right, &right_key, 0); 1830 btrfs_node_key(right, &right_key, 0);
1838 tree_mod_log_set_node_key(root->fs_info, parent, 1831 tree_mod_log_set_node_key(root->fs_info, parent,
1839 &right_key, pslot + 1, 0); 1832 pslot + 1, 0);
1840 btrfs_set_node_key(parent, &right_key, pslot + 1); 1833 btrfs_set_node_key(parent, &right_key, pslot + 1);
1841 btrfs_mark_buffer_dirty(parent); 1834 btrfs_mark_buffer_dirty(parent);
1842 } 1835 }
@@ -1871,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1871 if (btrfs_header_nritems(mid) == 0) { 1864 if (btrfs_header_nritems(mid) == 0) {
1872 clean_tree_block(trans, root, mid); 1865 clean_tree_block(trans, root, mid);
1873 btrfs_tree_unlock(mid); 1866 btrfs_tree_unlock(mid);
1874 del_ptr(trans, root, path, level + 1, pslot, 1); 1867 del_ptr(trans, root, path, level + 1, pslot);
1875 root_sub_used(root, mid->len); 1868 root_sub_used(root, mid->len);
1876 btrfs_free_tree_block(trans, root, mid, 0, 1); 1869 btrfs_free_tree_block(trans, root, mid, 0, 1);
1877 free_extent_buffer_stale(mid); 1870 free_extent_buffer_stale(mid);
@@ -1880,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1880 /* update the parent key to reflect our changes */ 1873 /* update the parent key to reflect our changes */
1881 struct btrfs_disk_key mid_key; 1874 struct btrfs_disk_key mid_key;
1882 btrfs_node_key(mid, &mid_key, 0); 1875 btrfs_node_key(mid, &mid_key, 0);
1883 tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, 1876 tree_mod_log_set_node_key(root->fs_info, parent,
1884 pslot, 0); 1877 pslot, 0);
1885 btrfs_set_node_key(parent, &mid_key, pslot); 1878 btrfs_set_node_key(parent, &mid_key, pslot);
1886 btrfs_mark_buffer_dirty(parent); 1879 btrfs_mark_buffer_dirty(parent);
@@ -1980,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1980 orig_slot += left_nr; 1973 orig_slot += left_nr;
1981 btrfs_node_key(mid, &disk_key, 0); 1974 btrfs_node_key(mid, &disk_key, 0);
1982 tree_mod_log_set_node_key(root->fs_info, parent, 1975 tree_mod_log_set_node_key(root->fs_info, parent,
1983 &disk_key, pslot, 0); 1976 pslot, 0);
1984 btrfs_set_node_key(parent, &disk_key, pslot); 1977 btrfs_set_node_key(parent, &disk_key, pslot);
1985 btrfs_mark_buffer_dirty(parent); 1978 btrfs_mark_buffer_dirty(parent);
1986 if (btrfs_header_nritems(left) > orig_slot) { 1979 if (btrfs_header_nritems(left) > orig_slot) {
@@ -2033,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2033 2026
2034 btrfs_node_key(right, &disk_key, 0); 2027 btrfs_node_key(right, &disk_key, 0);
2035 tree_mod_log_set_node_key(root->fs_info, parent, 2028 tree_mod_log_set_node_key(root->fs_info, parent,
2036 &disk_key, pslot + 1, 0); 2029 pslot + 1, 0);
2037 btrfs_set_node_key(parent, &disk_key, pslot + 1); 2030 btrfs_set_node_key(parent, &disk_key, pslot + 1);
2038 btrfs_mark_buffer_dirty(parent); 2031 btrfs_mark_buffer_dirty(parent);
2039 2032
@@ -2219,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
2219 int no_skips = 0; 2212 int no_skips = 0;
2220 struct extent_buffer *t; 2213 struct extent_buffer *t;
2221 2214
2215 if (path->really_keep_locks)
2216 return;
2217
2222 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2218 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2223 if (!path->nodes[i]) 2219 if (!path->nodes[i])
2224 break; 2220 break;
@@ -2266,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2266{ 2262{
2267 int i; 2263 int i;
2268 2264
2269 if (path->keep_locks) 2265 if (path->keep_locks || path->really_keep_locks)
2270 return; 2266 return;
2271 2267
2272 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2268 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -2499,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2499 if (!cow) 2495 if (!cow)
2500 write_lock_level = -1; 2496 write_lock_level = -1;
2501 2497
2502 if (cow && (p->keep_locks || p->lowest_level)) 2498 if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
2503 write_lock_level = BTRFS_MAX_LEVEL; 2499 write_lock_level = BTRFS_MAX_LEVEL;
2504 2500
2505 min_write_lock_level = write_lock_level; 2501 min_write_lock_level = write_lock_level;
@@ -2568,7 +2564,10 @@ again:
2568 * must have write locks on this node and the 2564 * must have write locks on this node and the
2569 * parent 2565 * parent
2570 */ 2566 */
2571 if (level + 1 > write_lock_level) { 2567 if (level > write_lock_level ||
2568 (level + 1 > write_lock_level &&
2569 level + 1 < BTRFS_MAX_LEVEL &&
2570 p->nodes[level + 1])) {
2572 write_lock_level = level + 1; 2571 write_lock_level = level + 1;
2573 btrfs_release_path(p); 2572 btrfs_release_path(p);
2574 goto again; 2573 goto again;
@@ -2917,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
2917 if (!path->nodes[i]) 2916 if (!path->nodes[i])
2918 break; 2917 break;
2919 t = path->nodes[i]; 2918 t = path->nodes[i];
2920 tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); 2919 tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
2921 btrfs_set_node_key(t, key, tslot); 2920 btrfs_set_node_key(t, key, tslot);
2922 btrfs_mark_buffer_dirty(path->nodes[i]); 2921 btrfs_mark_buffer_dirty(path->nodes[i]);
2923 if (tslot != 0) 2922 if (tslot != 0)
@@ -3302,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
3302 */ 3301 */
3303static int leaf_space_used(struct extent_buffer *l, int start, int nr) 3302static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3304{ 3303{
3304 struct btrfs_item *start_item;
3305 struct btrfs_item *end_item;
3306 struct btrfs_map_token token;
3305 int data_len; 3307 int data_len;
3306 int nritems = btrfs_header_nritems(l); 3308 int nritems = btrfs_header_nritems(l);
3307 int end = min(nritems, start + nr) - 1; 3309 int end = min(nritems, start + nr) - 1;
3308 3310
3309 if (!nr) 3311 if (!nr)
3310 return 0; 3312 return 0;
3311 data_len = btrfs_item_end_nr(l, start); 3313 btrfs_init_map_token(&token);
3312 data_len = data_len - btrfs_item_offset_nr(l, end); 3314 start_item = btrfs_item_nr(l, start);
3315 end_item = btrfs_item_nr(l, end);
3316 data_len = btrfs_token_item_offset(l, start_item, &token) +
3317 btrfs_token_item_size(l, start_item, &token);
3318 data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3313 data_len += sizeof(struct btrfs_item) * nr; 3319 data_len += sizeof(struct btrfs_item) * nr;
3314 WARN_ON(data_len < 0); 3320 WARN_ON(data_len < 0);
3315 return data_len; 3321 return data_len;
@@ -3403,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3403 if (push_items == 0) 3409 if (push_items == 0)
3404 goto out_unlock; 3410 goto out_unlock;
3405 3411
3406 if (!empty && push_items == left_nritems) 3412 WARN_ON(!empty && push_items == left_nritems);
3407 WARN_ON(1);
3408 3413
3409 /* push left to right */ 3414 /* push left to right */
3410 right_nritems = btrfs_header_nritems(right); 3415 right_nritems = btrfs_header_nritems(right);
@@ -3642,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3642 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3647 btrfs_set_header_nritems(left, old_left_nritems + push_items);
3643 3648
3644 /* fixup right node */ 3649 /* fixup right node */
3645 if (push_items > right_nritems) { 3650 if (push_items > right_nritems)
3646 printk(KERN_CRIT "push items %d nr %u\n", push_items, 3651 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3647 right_nritems); 3652 right_nritems);
3648 WARN_ON(1);
3649 }
3650 3653
3651 if (push_items < right_nritems) { 3654 if (push_items < right_nritems) {
3652 push_space = btrfs_item_offset_nr(right, push_items - 1) - 3655 push_space = btrfs_item_offset_nr(right, push_items - 1) -
@@ -4602,16 +4605,21 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4602 * empty a node. 4605 * empty a node.
4603 */ 4606 */
4604static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4607static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4605 struct btrfs_path *path, int level, int slot, 4608 struct btrfs_path *path, int level, int slot)
4606 int tree_mod_log)
4607{ 4609{
4608 struct extent_buffer *parent = path->nodes[level]; 4610 struct extent_buffer *parent = path->nodes[level];
4609 u32 nritems; 4611 u32 nritems;
4610 int ret; 4612 int ret;
4611 4613
4614 if (level) {
4615 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4616 MOD_LOG_KEY_REMOVE);
4617 BUG_ON(ret < 0);
4618 }
4619
4612 nritems = btrfs_header_nritems(parent); 4620 nritems = btrfs_header_nritems(parent);
4613 if (slot != nritems - 1) { 4621 if (slot != nritems - 1) {
4614 if (tree_mod_log && level) 4622 if (level)
4615 tree_mod_log_eb_move(root->fs_info, parent, slot, 4623 tree_mod_log_eb_move(root->fs_info, parent, slot,
4616 slot + 1, nritems - slot - 1); 4624 slot + 1, nritems - slot - 1);
4617 memmove_extent_buffer(parent, 4625 memmove_extent_buffer(parent,
@@ -4619,10 +4627,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4619 btrfs_node_key_ptr_offset(slot + 1), 4627 btrfs_node_key_ptr_offset(slot + 1),
4620 sizeof(struct btrfs_key_ptr) * 4628 sizeof(struct btrfs_key_ptr) *
4621 (nritems - slot - 1)); 4629 (nritems - slot - 1));
4622 } else if (tree_mod_log && level) {
4623 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4624 MOD_LOG_KEY_REMOVE);
4625 BUG_ON(ret < 0);
4626 } 4630 }
4627 4631
4628 nritems--; 4632 nritems--;
@@ -4656,7 +4660,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4656 struct extent_buffer *leaf) 4660 struct extent_buffer *leaf)
4657{ 4661{
4658 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4662 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4659 del_ptr(trans, root, path, 1, path->slots[1], 1); 4663 del_ptr(trans, root, path, 1, path->slots[1]);
4660 4664
4661 /* 4665 /*
4662 * btrfs_free_extent is expensive, we want to make sure we 4666 * btrfs_free_extent is expensive, we want to make sure we
@@ -5123,13 +5127,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5123 right_path->search_commit_root = 1; 5127 right_path->search_commit_root = 1;
5124 right_path->skip_locking = 1; 5128 right_path->skip_locking = 1;
5125 5129
5126 spin_lock(&left_root->root_times_lock); 5130 spin_lock(&left_root->root_item_lock);
5127 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); 5131 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5128 spin_unlock(&left_root->root_times_lock); 5132 spin_unlock(&left_root->root_item_lock);
5129 5133
5130 spin_lock(&right_root->root_times_lock); 5134 spin_lock(&right_root->root_item_lock);
5131 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); 5135 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5132 spin_unlock(&right_root->root_times_lock); 5136 spin_unlock(&right_root->root_item_lock);
5133 5137
5134 trans = btrfs_join_transaction(left_root); 5138 trans = btrfs_join_transaction(left_root);
5135 if (IS_ERR(trans)) { 5139 if (IS_ERR(trans)) {
@@ -5224,15 +5228,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5224 goto out; 5228 goto out;
5225 } 5229 }
5226 5230
5227 spin_lock(&left_root->root_times_lock); 5231 spin_lock(&left_root->root_item_lock);
5228 ctransid = btrfs_root_ctransid(&left_root->root_item); 5232 ctransid = btrfs_root_ctransid(&left_root->root_item);
5229 spin_unlock(&left_root->root_times_lock); 5233 spin_unlock(&left_root->root_item_lock);
5230 if (ctransid != left_start_ctransid) 5234 if (ctransid != left_start_ctransid)
5231 left_start_ctransid = 0; 5235 left_start_ctransid = 0;
5232 5236
5233 spin_lock(&right_root->root_times_lock); 5237 spin_lock(&right_root->root_item_lock);
5234 ctransid = btrfs_root_ctransid(&right_root->root_item); 5238 ctransid = btrfs_root_ctransid(&right_root->root_item);
5235 spin_unlock(&right_root->root_times_lock); 5239 spin_unlock(&right_root->root_item_lock);
5236 if (ctransid != right_start_ctransid) 5240 if (ctransid != right_start_ctransid)
5237 right_start_ctransid = 0; 5241 right_start_ctransid = 0;
5238 5242
@@ -5496,6 +5500,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5496 return btrfs_next_old_leaf(root, path, 0); 5500 return btrfs_next_old_leaf(root, path, 0);
5497} 5501}
5498 5502
5503/* Release the path up to but not including the given level */
5504static void btrfs_release_level(struct btrfs_path *path, int level)
5505{
5506 int i;
5507
5508 for (i = 0; i < level; i++) {
5509 path->slots[i] = 0;
5510 if (!path->nodes[i])
5511 continue;
5512 if (path->locks[i]) {
5513 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
5514 path->locks[i] = 0;
5515 }
5516 free_extent_buffer(path->nodes[i]);
5517 path->nodes[i] = NULL;
5518 }
5519}
5520
/*
 * Advance @path to the next leaf while keeping write locks, so the caller
 * can continue modifying the tree without re-searching from the root for
 * every leaf.  @del nonzero tells the search to reserve room for deletions
 * (ins_len = -1).
 *
 * This function assumes 2 things
 *
 * 1) You are using path->keep_locks
 * 2) You are not inserting items.
 *
 * If either of these are not true do not use this function. If you need a next
 * leaf with either of these not being true then this function can be easily
 * adapted to do that, but at the moment these are the limitations.
 *
 * Returns 0 when positioned at the next leaf, 1 when there is no next leaf,
 * and < 0 on error.  On nonzero return the path has been released.
 */
int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct btrfs_path *path,
			  int del)
{
	struct extent_buffer *b;
	struct btrfs_key key;
	u32 nritems;
	int level = 1;
	int slot;
	int ret = 1;
	int write_lock_level = BTRFS_MAX_LEVEL;
	/* -1 reserves rebalance room for a pending deletion */
	int ins_len = del ? -1 : 0;

	WARN_ON(!(path->keep_locks || path->really_keep_locks));

	/* remember the last key of the current leaf for re-searching */
	nritems = btrfs_header_nritems(path->nodes[0]);
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);

	/*
	 * Climb from level 1 upwards until we find a write-locked node
	 * that still has a slot to the right of the current position.
	 */
	while (path->nodes[level]) {
		nritems = btrfs_header_nritems(path->nodes[level]);
		if (!(path->locks[level] & BTRFS_WRITE_LOCK)) {
search:
			/*
			 * This level isn't write locked: drop everything and
			 * redo a full write-locked search from the root.
			 */
			btrfs_release_path(path);
			ret = btrfs_search_slot(trans, root, &key, path,
						ins_len, 1);
			if (ret < 0)
				goto out;
			level = 1;
			continue;
		}

		/* node exhausted at this level, try its parent */
		if (path->slots[level] >= nritems - 1) {
			level++;
			continue;
		}

		/* found a usable node, drop the stale levels below it */
		btrfs_release_level(path, level);
		break;
	}

	/* walked off the top of the tree: no next leaf */
	if (!path->nodes[level]) {
		ret = 1;
		goto out;
	}

	/* step to the next child pointer and descend toward its leaf */
	path->slots[level]++;
	b = path->nodes[level];

	while (b) {
		level = btrfs_header_level(b);

		if (!should_cow_block(trans, root, b))
			goto cow_done;

		btrfs_set_path_blocking(path);
		ret = btrfs_cow_block(trans, root, b,
				      path->nodes[level + 1],
				      path->slots[level + 1], &b);
		if (ret)
			goto out;
cow_done:
		path->nodes[level] = b;
		btrfs_clear_path_blocking(path, NULL, 0);
		if (level != 0) {
			/* may rebalance; -EAGAIN means redo the search */
			ret = setup_nodes_for_search(trans, root, path, b,
						     level, ins_len,
						     &write_lock_level);
			if (ret == -EAGAIN)
				goto search;
			if (ret)
				goto out;

			b = path->nodes[level];
			slot = path->slots[level];

			ret = read_block_for_search(trans, root, path,
						    &b, level, slot, &key, 0);
			if (ret == -EAGAIN)
				goto search;
			if (ret)
				goto out;
			level = btrfs_header_level(b);
			/* take the write lock, only blocking if we must */
			if (!btrfs_try_tree_write_lock(b)) {
				btrfs_set_path_blocking(path);
				btrfs_tree_lock(b);
				btrfs_clear_path_blocking(path, b,
							  BTRFS_WRITE_LOCK);
			}
			path->locks[level] = BTRFS_WRITE_LOCK;
			path->nodes[level] = b;
			path->slots[level] = 0;
		} else {
			/* reached the next leaf */
			path->slots[level] = 0;
			ret = 0;
			break;
		}
	}

out:
	if (ret)
		btrfs_release_path(path);

	return ret;
}
5635
5499int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 5636int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5500 u64 time_seq) 5637 u64 time_seq)
5501{ 5638{
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 596617ecd329..547b7b05727f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,7 +48,7 @@ struct btrfs_ordered_sum;
48 48
49#define BTRFS_MAGIC "_BHRfS_M" 49#define BTRFS_MAGIC "_BHRfS_M"
50 50
51#define BTRFS_MAX_MIRRORS 2 51#define BTRFS_MAX_MIRRORS 3
52 52
53#define BTRFS_MAX_LEVEL 8 53#define BTRFS_MAX_LEVEL 8
54 54
@@ -142,6 +142,8 @@ struct btrfs_ordered_sum;
142 142
143#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 143#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
144 144
145#define BTRFS_DEV_REPLACE_DEVID 0
146
145/* 147/*
146 * the max metadata block size. This limit is somewhat artificial, 148 * the max metadata block size. This limit is somewhat artificial,
147 * but the memmove costs go through the roof for larger blocks. 149 * but the memmove costs go through the roof for larger blocks.
@@ -172,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
172/* four bytes for CRC32 */ 174/* four bytes for CRC32 */
173#define BTRFS_EMPTY_DIR_SIZE 0 175#define BTRFS_EMPTY_DIR_SIZE 0
174 176
 177/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
178#define REQ_GET_READ_MIRRORS (1 << 30)
179
175#define BTRFS_FT_UNKNOWN 0 180#define BTRFS_FT_UNKNOWN 0
176#define BTRFS_FT_REG_FILE 1 181#define BTRFS_FT_REG_FILE 1
177#define BTRFS_FT_DIR 2 182#define BTRFS_FT_DIR 2
@@ -571,6 +576,7 @@ struct btrfs_path {
571 unsigned int skip_locking:1; 576 unsigned int skip_locking:1;
572 unsigned int leave_spinning:1; 577 unsigned int leave_spinning:1;
573 unsigned int search_commit_root:1; 578 unsigned int search_commit_root:1;
579 unsigned int really_keep_locks:1;
574}; 580};
575 581
576/* 582/*
@@ -885,6 +891,59 @@ struct btrfs_dev_stats_item {
885 __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; 891 __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
886} __attribute__ ((__packed__)); 892} __attribute__ ((__packed__));
887 893
894#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
895#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
896#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
897#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1
898#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
899#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3
900#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4
901
/*
 * In-memory state of a device replace operation.  Persisted to disk as a
 * struct btrfs_dev_replace_item (see btrfs_run_dev_replace /
 * btrfs_init_dev_replace in dev-replace.c).
 */
struct btrfs_dev_replace {
	u64 replace_state;	/* see #define above */
	u64 time_started;	/* seconds since 1-Jan-1970 */
	u64 time_stopped;	/* seconds since 1-Jan-1970 */
	atomic64_t num_write_errors;
	atomic64_t num_uncorrectable_read_errors;

	/* progress cursors through the logical address space being copied */
	u64 cursor_left;
	/* NOTE(review): presumably cursor_left as of the last transaction
	 * commit / last item write — confirm against dev-replace.c */
	u64 committed_cursor_left;
	u64 cursor_left_last_write_of_item;
	u64 cursor_right;

	u64 cont_reading_from_srcdev_mode;	/* see #define above */

	int is_valid;		/* nonzero once loaded from the on-disk item */
	int item_needs_writeback;	/* dirty: item must be rewritten */
	struct btrfs_device *srcdev;	/* device being replaced */
	struct btrfs_device *tgtdev;	/* replacement device */

	/* serialization of access to this state */
	pid_t lock_owner;
	atomic_t nesting_level;
	struct mutex lock_finishing_cancel_unmount;
	struct mutex lock_management_lock;
	struct mutex lock;

	struct btrfs_scrub_progress scrub_progress;
};
929
930struct btrfs_dev_replace_item {
931 /*
932 * grow this item struct at the end for future enhancements and keep
933 * the existing values unchanged
934 */
935 __le64 src_devid;
936 __le64 cursor_left;
937 __le64 cursor_right;
938 __le64 cont_reading_from_srcdev_mode;
939
940 __le64 replace_state;
941 __le64 time_started;
942 __le64 time_stopped;
943 __le64 num_write_errors;
944 __le64 num_uncorrectable_read_errors;
945} __attribute__ ((__packed__));
946
888/* different types of block groups (and chunks) */ 947/* different types of block groups (and chunks) */
889#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) 948#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
890#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) 949#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
@@ -1333,6 +1392,7 @@ struct btrfs_fs_info {
1333 struct btrfs_workers generic_worker; 1392 struct btrfs_workers generic_worker;
1334 struct btrfs_workers workers; 1393 struct btrfs_workers workers;
1335 struct btrfs_workers delalloc_workers; 1394 struct btrfs_workers delalloc_workers;
1395 struct btrfs_workers flush_workers;
1336 struct btrfs_workers endio_workers; 1396 struct btrfs_workers endio_workers;
1337 struct btrfs_workers endio_meta_workers; 1397 struct btrfs_workers endio_meta_workers;
1338 struct btrfs_workers endio_meta_write_workers; 1398 struct btrfs_workers endio_meta_write_workers;
@@ -1429,6 +1489,8 @@ struct btrfs_fs_info {
1429 struct rw_semaphore scrub_super_lock; 1489 struct rw_semaphore scrub_super_lock;
1430 int scrub_workers_refcnt; 1490 int scrub_workers_refcnt;
1431 struct btrfs_workers scrub_workers; 1491 struct btrfs_workers scrub_workers;
1492 struct btrfs_workers scrub_wr_completion_workers;
1493 struct btrfs_workers scrub_nocow_workers;
1432 1494
1433#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1495#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1434 u32 check_integrity_print_mask; 1496 u32 check_integrity_print_mask;
@@ -1470,6 +1532,11 @@ struct btrfs_fs_info {
1470 int backup_root_index; 1532 int backup_root_index;
1471 1533
1472 int num_tolerated_disk_barrier_failures; 1534 int num_tolerated_disk_barrier_failures;
1535
1536 /* device replace state */
1537 struct btrfs_dev_replace dev_replace;
1538
1539 atomic_t mutually_exclusive_operation_running;
1473}; 1540};
1474 1541
1475/* 1542/*
@@ -1579,7 +1646,7 @@ struct btrfs_root {
1579 1646
1580 int force_cow; 1647 int force_cow;
1581 1648
1582 spinlock_t root_times_lock; 1649 spinlock_t root_item_lock;
1583}; 1650};
1584 1651
1585struct btrfs_ioctl_defrag_range_args { 1652struct btrfs_ioctl_defrag_range_args {
@@ -1723,6 +1790,12 @@ struct btrfs_ioctl_defrag_range_args {
1723#define BTRFS_DEV_STATS_KEY 249 1790#define BTRFS_DEV_STATS_KEY 249
1724 1791
1725/* 1792/*
 1793 * Persistently stores the device replace state in the device tree.
1794 * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
1795 */
1796#define BTRFS_DEV_REPLACE_KEY 250
1797
1798/*
1726 * string items are for debugging. They just store a short string of 1799 * string items are for debugging. They just store a short string of
1727 * data in the FS 1800 * data in the FS
1728 */ 1801 */
@@ -1787,7 +1860,7 @@ struct btrfs_map_token {
1787 1860
1788static inline void btrfs_init_map_token (struct btrfs_map_token *token) 1861static inline void btrfs_init_map_token (struct btrfs_map_token *token)
1789{ 1862{
1790 memset(token, 0, sizeof(*token)); 1863 token->kaddr = NULL;
1791} 1864}
1792 1865
1793/* some macros to generate set/get funcs for the struct fields. This 1866/* some macros to generate set/get funcs for the struct fields. This
@@ -2755,6 +2828,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
2755BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, 2828BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
2756 rsv_excl, 64); 2829 rsv_excl, 64);
2757 2830
2831/* btrfs_dev_replace_item */
2832BTRFS_SETGET_FUNCS(dev_replace_src_devid,
2833 struct btrfs_dev_replace_item, src_devid, 64);
2834BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
2835 struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
2836 64);
2837BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
2838 replace_state, 64);
2839BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
2840 time_started, 64);
2841BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
2842 time_stopped, 64);
2843BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
2844 num_write_errors, 64);
2845BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
2846 struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
2847 64);
2848BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
2849 cursor_left, 64);
2850BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
2851 cursor_right, 64);
2852
2853BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
2854 struct btrfs_dev_replace_item, src_devid, 64);
2855BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
2856 struct btrfs_dev_replace_item,
2857 cont_reading_from_srcdev_mode, 64);
2858BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
2859 struct btrfs_dev_replace_item, replace_state, 64);
2860BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
2861 struct btrfs_dev_replace_item, time_started, 64);
2862BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
2863 struct btrfs_dev_replace_item, time_stopped, 64);
2864BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
2865 struct btrfs_dev_replace_item, num_write_errors, 64);
2866BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
2867 struct btrfs_dev_replace_item,
2868 num_uncorrectable_read_errors, 64);
2869BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
2870 struct btrfs_dev_replace_item, cursor_left, 64);
2871BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
2872 struct btrfs_dev_replace_item, cursor_right, 64);
2873
2758static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 2874static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2759{ 2875{
2760 return sb->s_fs_info; 2876 return sb->s_fs_info;
@@ -2900,6 +3016,18 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
2900u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 3016u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2901u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 3017u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2902void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 3018void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
3019
3020enum btrfs_reserve_flush_enum {
3021 /* If we are in the transaction, we can't flush anything.*/
3022 BTRFS_RESERVE_NO_FLUSH,
3023 /*
3024 * Flushing delalloc may cause deadlock somewhere, in this
3025 * case, use FLUSH LIMIT
3026 */
3027 BTRFS_RESERVE_FLUSH_LIMIT,
3028 BTRFS_RESERVE_FLUSH_ALL,
3029};
3030
2903int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 3031int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2904void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 3032void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2905void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 3033void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2919,19 +3047,13 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
2919void btrfs_free_block_rsv(struct btrfs_root *root, 3047void btrfs_free_block_rsv(struct btrfs_root *root,
2920 struct btrfs_block_rsv *rsv); 3048 struct btrfs_block_rsv *rsv);
2921int btrfs_block_rsv_add(struct btrfs_root *root, 3049int btrfs_block_rsv_add(struct btrfs_root *root,
2922 struct btrfs_block_rsv *block_rsv, 3050 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
2923 u64 num_bytes); 3051 enum btrfs_reserve_flush_enum flush);
2924int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
2925 struct btrfs_block_rsv *block_rsv,
2926 u64 num_bytes);
2927int btrfs_block_rsv_check(struct btrfs_root *root, 3052int btrfs_block_rsv_check(struct btrfs_root *root,
2928 struct btrfs_block_rsv *block_rsv, int min_factor); 3053 struct btrfs_block_rsv *block_rsv, int min_factor);
2929int btrfs_block_rsv_refill(struct btrfs_root *root, 3054int btrfs_block_rsv_refill(struct btrfs_root *root,
2930 struct btrfs_block_rsv *block_rsv, 3055 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
2931 u64 min_reserved); 3056 enum btrfs_reserve_flush_enum flush);
2932int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
2933 struct btrfs_block_rsv *block_rsv,
2934 u64 min_reserved);
2935int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3057int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2936 struct btrfs_block_rsv *dst_rsv, 3058 struct btrfs_block_rsv *dst_rsv,
2937 u64 num_bytes); 3059 u64 num_bytes);
@@ -2955,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
2955int btrfs_init_space_info(struct btrfs_fs_info *fs_info); 3077int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
2956int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 3078int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2957 struct btrfs_fs_info *fs_info); 3079 struct btrfs_fs_info *fs_info);
3080int __get_raid_index(u64 flags);
2958/* ctree.c */ 3081/* ctree.c */
2959int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3082int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2960 int level, int *slot); 3083 int level, int *slot);
@@ -3065,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
3065} 3188}
3066 3189
3067int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 3190int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
3191int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
3192 struct btrfs_root *root, struct btrfs_path *path,
3193 int del);
3068int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 3194int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
3069 u64 time_seq); 3195 u64 time_seq);
3070static inline int btrfs_next_old_item(struct btrfs_root *root, 3196static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3157,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
3157 struct btrfs_root *root); 3283 struct btrfs_root *root);
3158 3284
3159/* dir-item.c */ 3285/* dir-item.c */
3286int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
3287 const char *name, int name_len);
3160int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 3288int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
3161 struct btrfs_root *root, const char *name, 3289 struct btrfs_root *root, const char *name,
3162 int name_len, struct inode *dir, 3290 int name_len, struct inode *dir,
@@ -3256,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
3256 struct btrfs_root *root, 3384 struct btrfs_root *root,
3257 struct btrfs_path *path, u64 objectid, 3385 struct btrfs_path *path, u64 objectid,
3258 u64 bytenr, int mod); 3386 u64 bytenr, int mod);
3387u64 btrfs_file_extent_length(struct btrfs_path *path);
3259int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 3388int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
3260 struct btrfs_root *root, 3389 struct btrfs_root *root,
3261 struct btrfs_ordered_sum *sums); 3390 struct btrfs_ordered_sum *sums);
@@ -3271,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
3271int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 3400int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
3272 struct list_head *list, int search_commit); 3401 struct list_head *list, int search_commit);
3273/* inode.c */ 3402/* inode.c */
3403struct btrfs_delalloc_work {
3404 struct inode *inode;
3405 int wait;
3406 int delay_iput;
3407 struct completion completion;
3408 struct list_head list;
3409 struct btrfs_work work;
3410};
3411
3412struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
3413 int wait, int delay_iput);
3414void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
3415
3274struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 3416struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
3275 size_t pg_offset, u64 start, u64 len, 3417 size_t pg_offset, u64 start, u64 len,
3276 int create); 3418 int create);
@@ -3370,9 +3512,12 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
3370 struct btrfs_ioctl_space_info *space); 3512 struct btrfs_ioctl_space_info *space);
3371 3513
3372/* file.c */ 3514/* file.c */
3515int btrfs_auto_defrag_init(void);
3516void btrfs_auto_defrag_exit(void);
3373int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 3517int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
3374 struct inode *inode); 3518 struct inode *inode);
3375int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); 3519int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
3520void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
3376int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); 3521int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
3377void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 3522void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
3378 int skip_pinned); 3523 int skip_pinned);
@@ -3519,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
3519 struct btrfs_pending_snapshot *pending); 3664 struct btrfs_pending_snapshot *pending);
3520 3665
3521/* scrub.c */ 3666/* scrub.c */
3522int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 3667int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3523 struct btrfs_scrub_progress *progress, int readonly); 3668 u64 end, struct btrfs_scrub_progress *progress,
3669 int readonly, int is_dev_replace);
3524void btrfs_scrub_pause(struct btrfs_root *root); 3670void btrfs_scrub_pause(struct btrfs_root *root);
3525void btrfs_scrub_pause_super(struct btrfs_root *root); 3671void btrfs_scrub_pause_super(struct btrfs_root *root);
3526void btrfs_scrub_continue(struct btrfs_root *root); 3672void btrfs_scrub_continue(struct btrfs_root *root);
3527void btrfs_scrub_continue_super(struct btrfs_root *root); 3673void btrfs_scrub_continue_super(struct btrfs_root *root);
3528int __btrfs_scrub_cancel(struct btrfs_fs_info *info); 3674int btrfs_scrub_cancel(struct btrfs_fs_info *info);
3529int btrfs_scrub_cancel(struct btrfs_root *root); 3675int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
3530int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); 3676 struct btrfs_device *dev);
3531int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); 3677int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
3532int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 3678int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3533 struct btrfs_scrub_progress *progress); 3679 struct btrfs_scrub_progress *progress);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 478f66bdc57b..34836036f01b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -651,7 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata(
651 */ 651 */
652 if (!src_rsv || (!trans->bytes_reserved && 652 if (!src_rsv || (!trans->bytes_reserved &&
653 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { 653 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
654 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 654 ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
655 BTRFS_RESERVE_NO_FLUSH);
655 /* 656 /*
656 * Since we're under a transaction reserve_metadata_bytes could 657 * Since we're under a transaction reserve_metadata_bytes could
657 * try to commit the transaction which will make it return 658 * try to commit the transaction which will make it return
@@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata(
686 * reserve something strictly for us. If not be a pain and try 687 * reserve something strictly for us. If not be a pain and try
687 * to steal from the delalloc block rsv. 688 * to steal from the delalloc block rsv.
688 */ 689 */
689 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 690 ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
691 BTRFS_RESERVE_NO_FLUSH);
690 if (!ret) 692 if (!ret)
691 goto out; 693 goto out;
692 694
@@ -1255,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1255 struct btrfs_delayed_node *delayed_node = NULL; 1257 struct btrfs_delayed_node *delayed_node = NULL;
1256 struct btrfs_root *root; 1258 struct btrfs_root *root;
1257 struct btrfs_block_rsv *block_rsv; 1259 struct btrfs_block_rsv *block_rsv;
1258 unsigned long nr = 0;
1259 int need_requeue = 0; 1260 int need_requeue = 0;
1260 int ret; 1261 int ret;
1261 1262
@@ -1316,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1316 delayed_node); 1317 delayed_node);
1317 mutex_unlock(&delayed_node->mutex); 1318 mutex_unlock(&delayed_node->mutex);
1318 1319
1319 nr = trans->blocks_used;
1320
1321 trans->block_rsv = block_rsv; 1320 trans->block_rsv = block_rsv;
1322 btrfs_end_transaction_dmeta(trans, root); 1321 btrfs_end_transaction_dmeta(trans, root);
1323 __btrfs_btree_balance_dirty(root, nr); 1322 btrfs_btree_balance_dirty_nodelay(root);
1324free_path: 1323free_path:
1325 btrfs_free_path(path); 1324 btrfs_free_path(path);
1326out: 1325out:
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 000000000000..66dbc8dbddf7
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,856 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/slab.h>
21#include <linux/buffer_head.h>
22#include <linux/blkdev.h>
23#include <linux/random.h>
24#include <linux/iocontext.h>
25#include <linux/capability.h>
26#include <linux/kthread.h>
27#include <linux/math64.h>
28#include <asm/div64.h>
29#include "compat.h"
30#include "ctree.h"
31#include "extent_map.h"
32#include "disk-io.h"
33#include "transaction.h"
34#include "print-tree.h"
35#include "volumes.h"
36#include "async-thread.h"
37#include "check-integrity.h"
38#include "rcu-string.h"
39#include "dev-replace.h"
40
41static u64 btrfs_get_seconds_since_1970(void);
42static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
43 int scrub_ret);
44static void btrfs_dev_replace_update_device_in_mapping_tree(
45 struct btrfs_fs_info *fs_info,
46 struct btrfs_device *srcdev,
47 struct btrfs_device *tgtdev);
48static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
49 char *srcdev_name,
50 struct btrfs_device **device);
51static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
52static int btrfs_dev_replace_kthread(void *data);
53static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
54
55
56int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
57{
58 struct btrfs_key key;
59 struct btrfs_root *dev_root = fs_info->dev_root;
60 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
61 struct extent_buffer *eb;
62 int slot;
63 int ret = 0;
64 struct btrfs_path *path = NULL;
65 int item_size;
66 struct btrfs_dev_replace_item *ptr;
67 u64 src_devid;
68
69 path = btrfs_alloc_path();
70 if (!path) {
71 ret = -ENOMEM;
72 goto out;
73 }
74
75 key.objectid = 0;
76 key.type = BTRFS_DEV_REPLACE_KEY;
77 key.offset = 0;
78 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
79 if (ret) {
80no_valid_dev_replace_entry_found:
81 ret = 0;
82 dev_replace->replace_state =
83 BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
84 dev_replace->cont_reading_from_srcdev_mode =
85 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
86 dev_replace->replace_state = 0;
87 dev_replace->time_started = 0;
88 dev_replace->time_stopped = 0;
89 atomic64_set(&dev_replace->num_write_errors, 0);
90 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
91 dev_replace->cursor_left = 0;
92 dev_replace->committed_cursor_left = 0;
93 dev_replace->cursor_left_last_write_of_item = 0;
94 dev_replace->cursor_right = 0;
95 dev_replace->srcdev = NULL;
96 dev_replace->tgtdev = NULL;
97 dev_replace->is_valid = 0;
98 dev_replace->item_needs_writeback = 0;
99 goto out;
100 }
101 slot = path->slots[0];
102 eb = path->nodes[0];
103 item_size = btrfs_item_size_nr(eb, slot);
104 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
105
106 if (item_size != sizeof(struct btrfs_dev_replace_item)) {
107 pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n");
108 goto no_valid_dev_replace_entry_found;
109 }
110
111 src_devid = btrfs_dev_replace_src_devid(eb, ptr);
112 dev_replace->cont_reading_from_srcdev_mode =
113 btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
114 dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
115 dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
116 dev_replace->time_stopped =
117 btrfs_dev_replace_time_stopped(eb, ptr);
118 atomic64_set(&dev_replace->num_write_errors,
119 btrfs_dev_replace_num_write_errors(eb, ptr));
120 atomic64_set(&dev_replace->num_uncorrectable_read_errors,
121 btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
122 dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
123 dev_replace->committed_cursor_left = dev_replace->cursor_left;
124 dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
125 dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
126 dev_replace->is_valid = 1;
127
128 dev_replace->item_needs_writeback = 0;
129 switch (dev_replace->replace_state) {
130 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
131 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
132 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
133 dev_replace->srcdev = NULL;
134 dev_replace->tgtdev = NULL;
135 break;
136 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
137 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
138 dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
139 NULL, NULL);
140 dev_replace->tgtdev = btrfs_find_device(fs_info,
141 BTRFS_DEV_REPLACE_DEVID,
142 NULL, NULL);
143 /*
144 * allow 'btrfs dev replace_cancel' if src/tgt device is
145 * missing
146 */
147 if (!dev_replace->srcdev &&
148 !btrfs_test_opt(dev_root, DEGRADED)) {
149 ret = -EIO;
150 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
151 (unsigned long long)src_devid);
152 }
153 if (!dev_replace->tgtdev &&
154 !btrfs_test_opt(dev_root, DEGRADED)) {
155 ret = -EIO;
156 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n",
157 (unsigned long long)BTRFS_DEV_REPLACE_DEVID);
158 }
159 if (dev_replace->tgtdev) {
160 if (dev_replace->srcdev) {
161 dev_replace->tgtdev->total_bytes =
162 dev_replace->srcdev->total_bytes;
163 dev_replace->tgtdev->disk_total_bytes =
164 dev_replace->srcdev->disk_total_bytes;
165 dev_replace->tgtdev->bytes_used =
166 dev_replace->srcdev->bytes_used;
167 }
168 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
169 btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
170 dev_replace->tgtdev);
171 }
172 break;
173 }
174
175out:
176 if (path)
177 btrfs_free_path(path);
178 return ret;
179}
180
181/*
182 * called from commit_transaction. Writes changed device replace state to
183 * disk.
184 */
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info)
{
	int ret;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_replace_item *ptr;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	/*
	 * Fast path: nothing to write back. The flags are read under the
	 * dev_replace lock, which is dropped again before the tree search;
	 * item_needs_writeback is re-cleared under the lock below once the
	 * item has been filled in.
	 */
	btrfs_dev_replace_lock(dev_replace);
	if (!dev_replace->is_valid ||
	    !dev_replace->item_needs_writeback) {
		btrfs_dev_replace_unlock(dev_replace);
		return 0;
	}
	btrfs_dev_replace_unlock(dev_replace);

	/* the dev_replace state lives in a single well-known item */
	key.objectid = 0;
	key.type = BTRFS_DEV_REPLACE_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
	/* ins_len == -1: we may need to delete the item if it is too small */
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		pr_warn("btrfs: error %d while searching for dev_replace item!\n",
			ret);
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/*
		 * need to delete old one and insert a new one.
		 * Since no attempt is made to recover any old state, if the
		 * dev_replace state is 'running', the data on the target
		 * drive is lost.
		 * It would be possible to recover the state: just make sure
		 * that the beginning of the item is never changed and always
		 * contains all the essential information. Then read this
		 * minimal set of information and use it as a base for the
		 * new state.
		 */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			pr_warn("btrfs: delete too small dev_replace item failed %d!\n",
				ret);
			goto out;
		}
		/* pretend "not found" so the insertion path below runs */
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			pr_warn("btrfs: insert dev_replace item failed %d!\n",
				ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0],
			     struct btrfs_dev_replace_item);

	/* copy the in-memory state into the on-disk item under the lock */
	btrfs_dev_replace_lock(dev_replace);
	if (dev_replace->srcdev)
		btrfs_set_dev_replace_src_devid(eb, ptr,
			dev_replace->srcdev->devid);
	else
		btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
	btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
		dev_replace->cont_reading_from_srcdev_mode);
	btrfs_set_dev_replace_replace_state(eb, ptr,
		dev_replace->replace_state);
	btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
	btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
	btrfs_set_dev_replace_num_write_errors(eb, ptr,
		atomic64_read(&dev_replace->num_write_errors));
	btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
		atomic64_read(&dev_replace->num_uncorrectable_read_errors));
	/*
	 * remember which cursor_left value went into this item;
	 * btrfs_after_dev_replace_commit() promotes it to
	 * committed_cursor_left once the transaction commits
	 */
	dev_replace->cursor_left_last_write_of_item =
		dev_replace->cursor_left;
	btrfs_set_dev_replace_cursor_left(eb, ptr,
		dev_replace->cursor_left_last_write_of_item);
	btrfs_set_dev_replace_cursor_right(eb, ptr,
		dev_replace->cursor_right);
	dev_replace->item_needs_writeback = 0;
	btrfs_dev_replace_unlock(dev_replace);

	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);

	return ret;
}
290
291void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
292{
293 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
294
295 dev_replace->committed_cursor_left =
296 dev_replace->cursor_left_last_write_of_item;
297}
298
299static u64 btrfs_get_seconds_since_1970(void)
300{
301 struct timespec t = CURRENT_TIME_SEC;
302
303 return t.tv_sec;
304}
305
306int btrfs_dev_replace_start(struct btrfs_root *root,
307 struct btrfs_ioctl_dev_replace_args *args)
308{
309 struct btrfs_trans_handle *trans;
310 struct btrfs_fs_info *fs_info = root->fs_info;
311 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
312 int ret;
313 struct btrfs_device *tgt_device = NULL;
314 struct btrfs_device *src_device = NULL;
315
316 switch (args->start.cont_reading_from_srcdev_mode) {
317 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
318 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
319 break;
320 default:
321 return -EINVAL;
322 }
323
324 if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
325 args->start.tgtdev_name[0] == '\0')
326 return -EINVAL;
327
328 mutex_lock(&fs_info->volume_mutex);
329 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
330 &tgt_device);
331 if (ret) {
332 pr_err("btrfs: target device %s is invalid!\n",
333 args->start.tgtdev_name);
334 mutex_unlock(&fs_info->volume_mutex);
335 return -EINVAL;
336 }
337
338 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
339 args->start.srcdev_name,
340 &src_device);
341 mutex_unlock(&fs_info->volume_mutex);
342 if (ret) {
343 ret = -EINVAL;
344 goto leave_no_lock;
345 }
346
347 if (tgt_device->total_bytes < src_device->total_bytes) {
348 pr_err("btrfs: target device is smaller than source device!\n");
349 ret = -EINVAL;
350 goto leave_no_lock;
351 }
352
353 btrfs_dev_replace_lock(dev_replace);
354 switch (dev_replace->replace_state) {
355 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
356 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
357 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
358 break;
359 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
360 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
361 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
362 goto leave;
363 }
364
365 dev_replace->cont_reading_from_srcdev_mode =
366 args->start.cont_reading_from_srcdev_mode;
367 WARN_ON(!src_device);
368 dev_replace->srcdev = src_device;
369 WARN_ON(!tgt_device);
370 dev_replace->tgtdev = tgt_device;
371
372 printk_in_rcu(KERN_INFO
373 "btrfs: dev_replace from %s (devid %llu) to %s) started\n",
374 src_device->missing ? "<missing disk>" :
375 rcu_str_deref(src_device->name),
376 src_device->devid,
377 rcu_str_deref(tgt_device->name));
378
379 tgt_device->total_bytes = src_device->total_bytes;
380 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
381 tgt_device->bytes_used = src_device->bytes_used;
382
383 /*
384 * from now on, the writes to the srcdev are all duplicated to
385 * go to the tgtdev as well (refer to btrfs_map_block()).
386 */
387 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
388 dev_replace->time_started = btrfs_get_seconds_since_1970();
389 dev_replace->cursor_left = 0;
390 dev_replace->committed_cursor_left = 0;
391 dev_replace->cursor_left_last_write_of_item = 0;
392 dev_replace->cursor_right = 0;
393 dev_replace->is_valid = 1;
394 dev_replace->item_needs_writeback = 1;
395 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
396 btrfs_dev_replace_unlock(dev_replace);
397
398 btrfs_wait_ordered_extents(root, 0);
399
400 /* force writing the updated state information to disk */
401 trans = btrfs_start_transaction(root, 0);
402 if (IS_ERR(trans)) {
403 ret = PTR_ERR(trans);
404 btrfs_dev_replace_lock(dev_replace);
405 goto leave;
406 }
407
408 ret = btrfs_commit_transaction(trans, root);
409 WARN_ON(ret);
410
411 /* the disk copy procedure reuses the scrub code */
412 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
413 src_device->total_bytes,
414 &dev_replace->scrub_progress, 0, 1);
415
416 ret = btrfs_dev_replace_finishing(root->fs_info, ret);
417 WARN_ON(ret);
418
419 return 0;
420
421leave:
422 dev_replace->srcdev = NULL;
423 dev_replace->tgtdev = NULL;
424 btrfs_dev_replace_unlock(dev_replace);
425leave_no_lock:
426 if (tgt_device)
427 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
428 return ret;
429}
430
431static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
432 int scrub_ret)
433{
434 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
435 struct btrfs_device *tgt_device;
436 struct btrfs_device *src_device;
437 struct btrfs_root *root = fs_info->tree_root;
438 u8 uuid_tmp[BTRFS_UUID_SIZE];
439 struct btrfs_trans_handle *trans;
440 int ret = 0;
441
442 /* don't allow cancel or unmount to disturb the finishing procedure */
443 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
444
445 btrfs_dev_replace_lock(dev_replace);
446 /* was the operation canceled, or is it finished? */
447 if (dev_replace->replace_state !=
448 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
449 btrfs_dev_replace_unlock(dev_replace);
450 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
451 return 0;
452 }
453
454 tgt_device = dev_replace->tgtdev;
455 src_device = dev_replace->srcdev;
456 btrfs_dev_replace_unlock(dev_replace);
457
458 /* replace old device with new one in mapping tree */
459 if (!scrub_ret)
460 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
461 src_device,
462 tgt_device);
463
464 /*
465 * flush all outstanding I/O and inode extent mappings before the
466 * copy operation is declared as being finished
467 */
468 btrfs_start_delalloc_inodes(root, 0);
469 btrfs_wait_ordered_extents(root, 0);
470
471 trans = btrfs_start_transaction(root, 0);
472 if (IS_ERR(trans)) {
473 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
474 return PTR_ERR(trans);
475 }
476 ret = btrfs_commit_transaction(trans, root);
477 WARN_ON(ret);
478
479 /* keep away write_all_supers() during the finishing procedure */
480 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
481 btrfs_dev_replace_lock(dev_replace);
482 dev_replace->replace_state =
483 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
484 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
485 dev_replace->tgtdev = NULL;
486 dev_replace->srcdev = NULL;
487 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
488 dev_replace->item_needs_writeback = 1;
489
490 if (scrub_ret) {
491 printk_in_rcu(KERN_ERR
492 "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
493 src_device->missing ? "<missing disk>" :
494 rcu_str_deref(src_device->name),
495 src_device->devid,
496 rcu_str_deref(tgt_device->name), scrub_ret);
497 btrfs_dev_replace_unlock(dev_replace);
498 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
499 if (tgt_device)
500 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
501 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
502
503 return 0;
504 }
505
506 printk_in_rcu(KERN_INFO
507 "btrfs: dev_replace from %s (devid %llu) to %s) finished\n",
508 src_device->missing ? "<missing disk>" :
509 rcu_str_deref(src_device->name),
510 src_device->devid,
511 rcu_str_deref(tgt_device->name));
512 tgt_device->is_tgtdev_for_dev_replace = 0;
513 tgt_device->devid = src_device->devid;
514 src_device->devid = BTRFS_DEV_REPLACE_DEVID;
515 tgt_device->bytes_used = src_device->bytes_used;
516 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
517 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
518 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
519 tgt_device->total_bytes = src_device->total_bytes;
520 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
521 tgt_device->bytes_used = src_device->bytes_used;
522 if (fs_info->sb->s_bdev == src_device->bdev)
523 fs_info->sb->s_bdev = tgt_device->bdev;
524 if (fs_info->fs_devices->latest_bdev == src_device->bdev)
525 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
526 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
527
528 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
529 if (src_device->bdev) {
530 /* zero out the old super */
531 btrfs_scratch_superblock(src_device);
532 }
533 /*
534 * this is again a consistent state where no dev_replace procedure
535 * is running, the target device is part of the filesystem, the
536 * source device is not part of the filesystem anymore and its 1st
537 * superblock is scratched out so that it is no longer marked to
538 * belong to this filesystem.
539 */
540 btrfs_dev_replace_unlock(dev_replace);
541 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
542
543 /* write back the superblocks */
544 trans = btrfs_start_transaction(root, 0);
545 if (!IS_ERR(trans))
546 btrfs_commit_transaction(trans, root);
547
548 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
549
550 return 0;
551}
552
553static void btrfs_dev_replace_update_device_in_mapping_tree(
554 struct btrfs_fs_info *fs_info,
555 struct btrfs_device *srcdev,
556 struct btrfs_device *tgtdev)
557{
558 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
559 struct extent_map *em;
560 struct map_lookup *map;
561 u64 start = 0;
562 int i;
563
564 write_lock(&em_tree->lock);
565 do {
566 em = lookup_extent_mapping(em_tree, start, (u64)-1);
567 if (!em)
568 break;
569 map = (struct map_lookup *)em->bdev;
570 for (i = 0; i < map->num_stripes; i++)
571 if (srcdev == map->stripes[i].dev)
572 map->stripes[i].dev = tgtdev;
573 start = em->start + em->len;
574 free_extent_map(em);
575 } while (start);
576 write_unlock(&em_tree->lock);
577}
578
579static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
580 char *srcdev_name,
581 struct btrfs_device **device)
582{
583 int ret;
584
585 if (srcdevid) {
586 ret = 0;
587 *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
588 NULL);
589 if (!*device)
590 ret = -ENOENT;
591 } else {
592 ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
593 device);
594 }
595 return ret;
596}
597
598void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
599 struct btrfs_ioctl_dev_replace_args *args)
600{
601 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
602
603 btrfs_dev_replace_lock(dev_replace);
604 /* even if !dev_replace_is_valid, the values are good enough for
605 * the replace_status ioctl */
606 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
607 args->status.replace_state = dev_replace->replace_state;
608 args->status.time_started = dev_replace->time_started;
609 args->status.time_stopped = dev_replace->time_stopped;
610 args->status.num_write_errors =
611 atomic64_read(&dev_replace->num_write_errors);
612 args->status.num_uncorrectable_read_errors =
613 atomic64_read(&dev_replace->num_uncorrectable_read_errors);
614 switch (dev_replace->replace_state) {
615 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
616 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
617 args->status.progress_1000 = 0;
618 break;
619 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
620 args->status.progress_1000 = 1000;
621 break;
622 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
623 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
624 args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
625 div64_u64(dev_replace->srcdev->total_bytes, 1000));
626 break;
627 }
628 btrfs_dev_replace_unlock(dev_replace);
629}
630
631int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
632 struct btrfs_ioctl_dev_replace_args *args)
633{
634 args->result = __btrfs_dev_replace_cancel(fs_info);
635 return 0;
636}
637
/*
 * Cancel a running or suspended dev_replace operation: mark the state
 * CANCELED, stop the scrub-based copy, persist the new state and drop
 * the half-written target device. Returns a
 * BTRFS_IOCTL_DEV_REPLACE_RESULT_* code through the u64 return value.
 */
static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_device *tgt_device = NULL;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = fs_info->tree_root;
	u64 result;
	int ret;

	/* serialize against finishing and unmount suspension */
	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
	btrfs_dev_replace_lock(dev_replace);
	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
		/* nothing is running, there is nothing to cancel */
		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
		btrfs_dev_replace_unlock(dev_replace);
		goto leave;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
		/* detach the devices before the locks are dropped */
		tgt_device = dev_replace->tgtdev;
		dev_replace->tgtdev = NULL;
		dev_replace->srcdev = NULL;
		break;
	}
	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
	dev_replace->time_stopped = btrfs_get_seconds_since_1970();
	dev_replace->item_needs_writeback = 1;
	btrfs_dev_replace_unlock(dev_replace);
	/* stop the copy procedure that runs inside the scrub code */
	btrfs_scrub_cancel(fs_info);

	/* commit so that the CANCELED state gets persisted on disk */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
		/*
		 * NOTE(review): this pushes a negative errno through the u64
		 * result channel; the caller copies it unchanged into
		 * args->result, which otherwise only carries the
		 * BTRFS_IOCTL_DEV_REPLACE_RESULT_* codes -- confirm that
		 * user space copes with such a value.
		 */
		return PTR_ERR(trans);
	}
	ret = btrfs_commit_transaction(trans, root);
	WARN_ON(ret);
	if (tgt_device)
		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);

leave:
	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
	return result;
}
684
685void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
686{
687 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
688
689 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
690 btrfs_dev_replace_lock(dev_replace);
691 switch (dev_replace->replace_state) {
692 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
693 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
694 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
695 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
696 break;
697 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
698 dev_replace->replace_state =
699 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
700 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
701 dev_replace->item_needs_writeback = 1;
702 pr_info("btrfs: suspending dev_replace for unmount\n");
703 break;
704 }
705
706 btrfs_dev_replace_unlock(dev_replace);
707 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
708}
709
710/* resume dev_replace procedure that was interrupted by unmount */
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
{
	struct task_struct *task;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	btrfs_dev_replace_lock(dev_replace);
	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
		/* no interrupted operation on record, nothing to resume */
		btrfs_dev_replace_unlock(dev_replace);
		return 0;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
		break;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
		/* switch back to the running state before the copy resumes */
		dev_replace->replace_state =
			BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
		break;
	}
	if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
		pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
			"btrfs: you may cancel the operation after 'mount -o degraded'\n");
		btrfs_dev_replace_unlock(dev_replace);
		return 0;
	}
	btrfs_dev_replace_unlock(dev_replace);

	/*
	 * mark this fs as running a mutually exclusive operation; a prior
	 * nonzero value would mean another such operation is already active
	 */
	WARN_ON(atomic_xchg(
		&fs_info->mutually_exclusive_operation_running, 1));
	/* run the copy in a kthread so that mount does not block on it */
	task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
	return PTR_RET(task);
}
743
static int btrfs_dev_replace_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_ioctl_dev_replace_args *status_args;
	u64 progress;

	/*
	 * Log the resume point. The status query is informational only;
	 * if the allocation fails, the resume continues silently.
	 */
	status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
	if (status_args) {
		btrfs_dev_replace_status(fs_info, status_args);
		progress = status_args->status.progress_1000;
		kfree(status_args);
		/* convert from per-mille to percent */
		do_div(progress, 10);
		printk_in_rcu(KERN_INFO
			"btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
			dev_replace->srcdev->missing ? "<missing disk>" :
			rcu_str_deref(dev_replace->srcdev->name),
			dev_replace->srcdev->devid,
			dev_replace->tgtdev ?
			rcu_str_deref(dev_replace->tgtdev->name) :
			"<missing target disk>",
			(unsigned int)progress);
	}
	btrfs_dev_replace_continue_on_mount(fs_info);
	/* allow other mutually exclusive operations (balance, ...) again */
	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);

	return 0;
}
772
773static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
774{
775 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
776 int ret;
777
778 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
779 dev_replace->committed_cursor_left,
780 dev_replace->srcdev->total_bytes,
781 &dev_replace->scrub_progress, 0, 1);
782 ret = btrfs_dev_replace_finishing(fs_info, ret);
783 WARN_ON(ret);
784 return 0;
785}
786
787int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
788{
789 if (!dev_replace->is_valid)
790 return 0;
791
792 switch (dev_replace->replace_state) {
793 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
794 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
795 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
796 return 0;
797 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
798 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
799 /*
800 * return true even if tgtdev is missing (this is
801 * something that can happen if the dev_replace
802 * procedure is suspended by an umount and then
803 * the tgtdev is missing (or "btrfs dev scan") was
804 * not called and the the filesystem is remounted
805 * in degraded state. This does not stop the
806 * dev_replace procedure. It needs to be canceled
807 * manually if the cancelation is wanted.
808 */
809 break;
810 }
811 return 1;
812}
813
/*
 * Acquire the dev_replace lock. The lock may be taken recursively by
 * the same thread: a nesting counter and the owner pid, both guarded by
 * lock_management_lock, emulate a recursive mutex on top of 'lock'.
 */
void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
{
	/* the beginning is just an optimization for the typical case */
	if (atomic_read(&dev_replace->nesting_level) == 0) {
acquire_lock:
		/* this is not a nested case where the same thread
		 * is trying to acquire the same lock twice */
		mutex_lock(&dev_replace->lock);
		mutex_lock(&dev_replace->lock_management_lock);
		dev_replace->lock_owner = current->pid;
		atomic_inc(&dev_replace->nesting_level);
		mutex_unlock(&dev_replace->lock_management_lock);
		return;
	}

	/* nesting_level != 0: check under the management lock whether the
	 * current thread is the one that already holds the lock */
	mutex_lock(&dev_replace->lock_management_lock);
	if (atomic_read(&dev_replace->nesting_level) > 0 &&
	    dev_replace->lock_owner == current->pid) {
		WARN_ON(!mutex_is_locked(&dev_replace->lock));
		atomic_inc(&dev_replace->nesting_level);
		mutex_unlock(&dev_replace->lock_management_lock);
		return;
	}

	/* some other thread holds the lock: fall back to blocking on it */
	mutex_unlock(&dev_replace->lock_management_lock);
	goto acquire_lock;
}
841
/*
 * Drop one nesting level of the (recursively taken) dev_replace lock;
 * the underlying mutex is released only by the outermost unlock.
 */
void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
{
	WARN_ON(!mutex_is_locked(&dev_replace->lock));
	mutex_lock(&dev_replace->lock_management_lock);
	WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
	WARN_ON(dev_replace->lock_owner != current->pid);
	atomic_dec(&dev_replace->nesting_level);
	if (atomic_read(&dev_replace->nesting_level) == 0) {
		/* outermost unlock: give up ownership and the main mutex */
		dev_replace->lock_owner = 0;
		mutex_unlock(&dev_replace->lock_management_lock);
		mutex_unlock(&dev_replace->lock);
	} else {
		mutex_unlock(&dev_replace->lock_management_lock);
	}
}
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 000000000000..20035cbbf021
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#if !defined(__BTRFS_DEV_REPLACE__)
20#define __BTRFS_DEV_REPLACE__
21
22struct btrfs_ioctl_dev_replace_args;
23
24int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
25int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
26 struct btrfs_fs_info *fs_info);
27void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
28int btrfs_dev_replace_start(struct btrfs_root *root,
29 struct btrfs_ioctl_dev_replace_args *args);
30void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
31 struct btrfs_ioctl_dev_replace_args *args);
32int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
33 struct btrfs_ioctl_dev_replace_args *args);
34void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
35int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
36int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
37void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
38void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
39
40static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
41{
42 atomic64_inc(stat_value);
43}
44#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c1a074d0696f..502c2158167c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
213 return btrfs_match_dir_item_name(root, path, name, name_len); 213 return btrfs_match_dir_item_name(root, path, name, name_len);
214} 214}
215 215
216int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
217 const char *name, int name_len)
218{
219 int ret;
220 struct btrfs_key key;
221 struct btrfs_dir_item *di;
222 int data_size;
223 struct extent_buffer *leaf;
224 int slot;
225 struct btrfs_path *path;
226
227
228 path = btrfs_alloc_path();
229 if (!path)
230 return -ENOMEM;
231
232 key.objectid = dir;
233 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
234 key.offset = btrfs_name_hash(name, name_len);
235
236 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
237
238 /* return back any errors */
239 if (ret < 0)
240 goto out;
241
242 /* nothing found, we're safe */
243 if (ret > 0) {
244 ret = 0;
245 goto out;
246 }
247
248 /* we found an item, look for our name in the item */
249 di = btrfs_match_dir_item_name(root, path, name, name_len);
250 if (di) {
251 /* our exact name was found */
252 ret = -EEXIST;
253 goto out;
254 }
255
256 /*
257 * see if there is room in the item to insert this
258 * name
259 */
260 data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item);
261 leaf = path->nodes[0];
262 slot = path->slots[0];
263 if (data_size + btrfs_item_size_nr(leaf, slot) +
264 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
265 ret = -EOVERFLOW;
266 } else {
267 /* plenty of insertion room */
268 ret = 0;
269 }
270out:
271 btrfs_free_path(path);
272 return ret;
273}
274
216/* 275/*
217 * lookup a directory item based on index. 'dir' is the objectid 276 * lookup a directory item based on index. 'dir' is the objectid
218 * we're searching in, and 'mod' tells us if you plan on deleting the 277 * we're searching in, and 'mod' tells us if you plan on deleting the
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 22a0439e5a86..a8f652dc940b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -45,6 +45,7 @@
45#include "inode-map.h" 45#include "inode-map.h"
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h"
48 49
49#ifdef CONFIG_X86 50#ifdef CONFIG_X86
50#include <asm/cpufeature.h> 51#include <asm/cpufeature.h>
@@ -387,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
387 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) 388 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
388 break; 389 break;
389 390
390 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 391 num_copies = btrfs_num_copies(root->fs_info,
391 eb->start, eb->len); 392 eb->start, eb->len);
392 if (num_copies == 1) 393 if (num_copies == 1)
393 break; 394 break;
@@ -852,11 +853,16 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
852 int mirror_num, unsigned long bio_flags, 853 int mirror_num, unsigned long bio_flags,
853 u64 bio_offset) 854 u64 bio_offset)
854{ 855{
856 int ret;
857
855 /* 858 /*
856 * when we're called for a write, we're already in the async 859 * when we're called for a write, we're already in the async
857 * submission context. Just jump into btrfs_map_bio 860 * submission context. Just jump into btrfs_map_bio
858 */ 861 */
859 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 862 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
863 if (ret)
864 bio_endio(bio, ret);
865 return ret;
860} 866}
861 867
862static int check_async_write(struct inode *inode, unsigned long bio_flags) 868static int check_async_write(struct inode *inode, unsigned long bio_flags)
@@ -878,7 +884,6 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
878 int ret; 884 int ret;
879 885
880 if (!(rw & REQ_WRITE)) { 886 if (!(rw & REQ_WRITE)) {
881
882 /* 887 /*
883 * called for a read, do the setup so that checksum validation 888 * called for a read, do the setup so that checksum validation
884 * can happen in the async kernel threads 889 * can happen in the async kernel threads
@@ -886,26 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
886 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, 891 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
887 bio, 1); 892 bio, 1);
888 if (ret) 893 if (ret)
889 return ret; 894 goto out_w_error;
890 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 895 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
891 mirror_num, 0); 896 mirror_num, 0);
892 } else if (!async) { 897 } else if (!async) {
893 ret = btree_csum_one_bio(bio); 898 ret = btree_csum_one_bio(bio);
894 if (ret) 899 if (ret)
895 return ret; 900 goto out_w_error;
896 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 901 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
897 mirror_num, 0); 902 mirror_num, 0);
903 } else {
904 /*
905 * kthread helpers are used to submit writes so that
906 * checksumming can happen in parallel across all CPUs
907 */
908 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
909 inode, rw, bio, mirror_num, 0,
910 bio_offset,
911 __btree_submit_bio_start,
912 __btree_submit_bio_done);
898 } 913 }
899 914
900 /* 915 if (ret) {
901 * kthread helpers are used to submit writes so that checksumming 916out_w_error:
902 * can happen in parallel across all CPUs 917 bio_endio(bio, ret);
903 */ 918 }
904 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 919 return ret;
905 inode, rw, bio, mirror_num, 0,
906 bio_offset,
907 __btree_submit_bio_start,
908 __btree_submit_bio_done);
909} 920}
910 921
911#ifdef CONFIG_MIGRATION 922#ifdef CONFIG_MIGRATION
@@ -990,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
990 1001
991static int btree_set_page_dirty(struct page *page) 1002static int btree_set_page_dirty(struct page *page)
992{ 1003{
1004#ifdef DEBUG
993 struct extent_buffer *eb; 1005 struct extent_buffer *eb;
994 1006
995 BUG_ON(!PagePrivate(page)); 1007 BUG_ON(!PagePrivate(page));
@@ -998,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page)
998 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 1010 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
999 BUG_ON(!atomic_read(&eb->refs)); 1011 BUG_ON(!atomic_read(&eb->refs));
1000 btrfs_assert_tree_locked(eb); 1012 btrfs_assert_tree_locked(eb);
1013#endif
1001 return __set_page_dirty_nobuffers(page); 1014 return __set_page_dirty_nobuffers(page);
1002} 1015}
1003 1016
@@ -1129,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1129 root->fs_info->dirty_metadata_bytes); 1142 root->fs_info->dirty_metadata_bytes);
1130 } 1143 }
1131 spin_unlock(&root->fs_info->delalloc_lock); 1144 spin_unlock(&root->fs_info->delalloc_lock);
1132 }
1133 1145
1134 /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1146 /* ugh, clear_extent_buffer_dirty needs to lock the page */
1135 btrfs_set_lock_blocking(buf); 1147 btrfs_set_lock_blocking(buf);
1136 clear_extent_buffer_dirty(buf); 1148 clear_extent_buffer_dirty(buf);
1149 }
1137 } 1150 }
1138} 1151}
1139 1152
@@ -1193,7 +1206,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1193 root->root_key.objectid = objectid; 1206 root->root_key.objectid = objectid;
1194 root->anon_dev = 0; 1207 root->anon_dev = 0;
1195 1208
1196 spin_lock_init(&root->root_times_lock); 1209 spin_lock_init(&root->root_item_lock);
1197} 1210}
1198 1211
1199static int __must_check find_and_setup_root(struct btrfs_root *tree_root, 1212static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -2131,6 +2144,11 @@ int open_ctree(struct super_block *sb,
2131 init_rwsem(&fs_info->extent_commit_sem); 2144 init_rwsem(&fs_info->extent_commit_sem);
2132 init_rwsem(&fs_info->cleanup_work_sem); 2145 init_rwsem(&fs_info->cleanup_work_sem);
2133 init_rwsem(&fs_info->subvol_sem); 2146 init_rwsem(&fs_info->subvol_sem);
2147 fs_info->dev_replace.lock_owner = 0;
2148 atomic_set(&fs_info->dev_replace.nesting_level, 0);
2149 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2150 mutex_init(&fs_info->dev_replace.lock_management_lock);
2151 mutex_init(&fs_info->dev_replace.lock);
2134 2152
2135 spin_lock_init(&fs_info->qgroup_lock); 2153 spin_lock_init(&fs_info->qgroup_lock);
2136 fs_info->qgroup_tree = RB_ROOT; 2154 fs_info->qgroup_tree = RB_ROOT;
@@ -2279,6 +2297,10 @@ int open_ctree(struct super_block *sb,
2279 fs_info->thread_pool_size, 2297 fs_info->thread_pool_size,
2280 &fs_info->generic_worker); 2298 &fs_info->generic_worker);
2281 2299
2300 btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
2301 fs_info->thread_pool_size,
2302 &fs_info->generic_worker);
2303
2282 btrfs_init_workers(&fs_info->submit_workers, "submit", 2304 btrfs_init_workers(&fs_info->submit_workers, "submit",
2283 min_t(u64, fs_devices->num_devices, 2305 min_t(u64, fs_devices->num_devices,
2284 fs_info->thread_pool_size), 2306 fs_info->thread_pool_size),
@@ -2350,6 +2372,7 @@ int open_ctree(struct super_block *sb,
2350 ret |= btrfs_start_workers(&fs_info->delayed_workers); 2372 ret |= btrfs_start_workers(&fs_info->delayed_workers);
2351 ret |= btrfs_start_workers(&fs_info->caching_workers); 2373 ret |= btrfs_start_workers(&fs_info->caching_workers);
2352 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2374 ret |= btrfs_start_workers(&fs_info->readahead_workers);
2375 ret |= btrfs_start_workers(&fs_info->flush_workers);
2353 if (ret) { 2376 if (ret) {
2354 err = -ENOMEM; 2377 err = -ENOMEM;
2355 goto fail_sb_buffer; 2378 goto fail_sb_buffer;
@@ -2418,7 +2441,11 @@ int open_ctree(struct super_block *sb,
2418 goto fail_tree_roots; 2441 goto fail_tree_roots;
2419 } 2442 }
2420 2443
2421 btrfs_close_extra_devices(fs_devices); 2444 /*
2445 * keep the device that is marked to be the target device for the
2446 * dev_replace procedure
2447 */
2448 btrfs_close_extra_devices(fs_info, fs_devices, 0);
2422 2449
2423 if (!fs_devices->latest_bdev) { 2450 if (!fs_devices->latest_bdev) {
2424 printk(KERN_CRIT "btrfs: failed to read devices on %s\n", 2451 printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@ -2490,6 +2517,14 @@ retry_root_backup:
2490 goto fail_block_groups; 2517 goto fail_block_groups;
2491 } 2518 }
2492 2519
2520 ret = btrfs_init_dev_replace(fs_info);
2521 if (ret) {
2522 pr_err("btrfs: failed to init dev_replace: %d\n", ret);
2523 goto fail_block_groups;
2524 }
2525
2526 btrfs_close_extra_devices(fs_info, fs_devices, 1);
2527
2493 ret = btrfs_init_space_info(fs_info); 2528 ret = btrfs_init_space_info(fs_info);
2494 if (ret) { 2529 if (ret) {
2495 printk(KERN_ERR "Failed to initial space info: %d\n", ret); 2530 printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2503,6 +2538,13 @@ retry_root_backup:
2503 } 2538 }
2504 fs_info->num_tolerated_disk_barrier_failures = 2539 fs_info->num_tolerated_disk_barrier_failures =
2505 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 2540 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2541 if (fs_info->fs_devices->missing_devices >
2542 fs_info->num_tolerated_disk_barrier_failures &&
2543 !(sb->s_flags & MS_RDONLY)) {
2544 printk(KERN_WARNING
2545 "Btrfs: too many missing devices, writeable mount is not allowed\n");
2546 goto fail_block_groups;
2547 }
2506 2548
2507 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 2549 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2508 "btrfs-cleaner"); 2550 "btrfs-cleaner");
@@ -2631,6 +2673,13 @@ retry_root_backup:
2631 return ret; 2673 return ret;
2632 } 2674 }
2633 2675
2676 ret = btrfs_resume_dev_replace_async(fs_info);
2677 if (ret) {
2678 pr_warn("btrfs: failed to resume dev_replace\n");
2679 close_ctree(tree_root);
2680 return ret;
2681 }
2682
2634 return 0; 2683 return 0;
2635 2684
2636fail_qgroup: 2685fail_qgroup:
@@ -2667,6 +2716,7 @@ fail_sb_buffer:
2667 btrfs_stop_workers(&fs_info->submit_workers); 2716 btrfs_stop_workers(&fs_info->submit_workers);
2668 btrfs_stop_workers(&fs_info->delayed_workers); 2717 btrfs_stop_workers(&fs_info->delayed_workers);
2669 btrfs_stop_workers(&fs_info->caching_workers); 2718 btrfs_stop_workers(&fs_info->caching_workers);
2719 btrfs_stop_workers(&fs_info->flush_workers);
2670fail_alloc: 2720fail_alloc:
2671fail_iput: 2721fail_iput:
2672 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2722 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -3270,16 +3320,18 @@ int close_ctree(struct btrfs_root *root)
3270 smp_mb(); 3320 smp_mb();
3271 3321
3272 /* pause restriper - we want to resume on mount */ 3322 /* pause restriper - we want to resume on mount */
3273 btrfs_pause_balance(root->fs_info); 3323 btrfs_pause_balance(fs_info);
3274 3324
3275 btrfs_scrub_cancel(root); 3325 btrfs_dev_replace_suspend_for_unmount(fs_info);
3326
3327 btrfs_scrub_cancel(fs_info);
3276 3328
3277 /* wait for any defraggers to finish */ 3329 /* wait for any defraggers to finish */
3278 wait_event(fs_info->transaction_wait, 3330 wait_event(fs_info->transaction_wait,
3279 (atomic_read(&fs_info->defrag_running) == 0)); 3331 (atomic_read(&fs_info->defrag_running) == 0));
3280 3332
3281 /* clear out the rbtree of defraggable inodes */ 3333 /* clear out the rbtree of defraggable inodes */
3282 btrfs_run_defrag_inodes(fs_info); 3334 btrfs_cleanup_defrag_inodes(fs_info);
3283 3335
3284 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 3336 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3285 ret = btrfs_commit_super(root); 3337 ret = btrfs_commit_super(root);
@@ -3339,6 +3391,7 @@ int close_ctree(struct btrfs_root *root)
3339 btrfs_stop_workers(&fs_info->delayed_workers); 3391 btrfs_stop_workers(&fs_info->delayed_workers);
3340 btrfs_stop_workers(&fs_info->caching_workers); 3392 btrfs_stop_workers(&fs_info->caching_workers);
3341 btrfs_stop_workers(&fs_info->readahead_workers); 3393 btrfs_stop_workers(&fs_info->readahead_workers);
3394 btrfs_stop_workers(&fs_info->flush_workers);
3342 3395
3343#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 3396#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3344 if (btrfs_test_opt(root, CHECK_INTEGRITY)) 3397 if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3383,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3383 int was_dirty; 3436 int was_dirty;
3384 3437
3385 btrfs_assert_tree_locked(buf); 3438 btrfs_assert_tree_locked(buf);
3386 if (transid != root->fs_info->generation) { 3439 if (transid != root->fs_info->generation)
3387 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 3440 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
3388 "found %llu running %llu\n", 3441 "found %llu running %llu\n",
3389 (unsigned long long)buf->start, 3442 (unsigned long long)buf->start,
3390 (unsigned long long)transid, 3443 (unsigned long long)transid,
3391 (unsigned long long)root->fs_info->generation); 3444 (unsigned long long)root->fs_info->generation);
3392 WARN_ON(1);
3393 }
3394 was_dirty = set_extent_buffer_dirty(buf); 3445 was_dirty = set_extent_buffer_dirty(buf);
3395 if (!was_dirty) { 3446 if (!was_dirty) {
3396 spin_lock(&root->fs_info->delalloc_lock); 3447 spin_lock(&root->fs_info->delalloc_lock);
@@ -3399,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3399 } 3450 }
3400} 3451}
3401 3452
3402void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3453static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3454 int flush_delayed)
3403{ 3455{
3404 /* 3456 /*
3405 * looks as though older kernels can get into trouble with 3457 * looks as though older kernels can get into trouble with
@@ -3411,7 +3463,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
3411 if (current->flags & PF_MEMALLOC) 3463 if (current->flags & PF_MEMALLOC)
3412 return; 3464 return;
3413 3465
3414 btrfs_balance_delayed_items(root); 3466 if (flush_delayed)
3467 btrfs_balance_delayed_items(root);
3415 3468
3416 num_dirty = root->fs_info->dirty_metadata_bytes; 3469 num_dirty = root->fs_info->dirty_metadata_bytes;
3417 3470
@@ -3422,25 +3475,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
3422 return; 3475 return;
3423} 3476}
3424 3477
3425void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3478void btrfs_btree_balance_dirty(struct btrfs_root *root)
3426{ 3479{
3427 /* 3480 __btrfs_btree_balance_dirty(root, 1);
3428 * looks as though older kernels can get into trouble with 3481}
3429 * this code, they end up stuck in balance_dirty_pages forever
3430 */
3431 u64 num_dirty;
3432 unsigned long thresh = 32 * 1024 * 1024;
3433
3434 if (current->flags & PF_MEMALLOC)
3435 return;
3436
3437 num_dirty = root->fs_info->dirty_metadata_bytes;
3438 3482
3439 if (num_dirty > thresh) { 3483void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
3440 balance_dirty_pages_ratelimited( 3484{
3441 root->fs_info->btree_inode->i_mapping); 3485 __btrfs_btree_balance_dirty(root, 0);
3442 }
3443 return;
3444} 3486}
3445 3487
3446int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) 3488int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2025a9132c16..305c33efb0e3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
63 struct btrfs_key *location); 63 struct btrfs_key *location);
64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
65void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 65void btrfs_btree_balance_dirty(struct btrfs_root *root);
66void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 66void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
68void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 68void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 06b2635073f3..521e9d4424f6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
33#include "volumes.h" 33#include "volumes.h"
34#include "locking.h" 34#include "locking.h"
35#include "free-space-cache.h" 35#include "free-space-cache.h"
36#include "math.h"
36 37
37#undef SCRAMBLE_DELAYED_REFS 38#undef SCRAMBLE_DELAYED_REFS
38 39
@@ -649,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
649 rcu_read_unlock(); 650 rcu_read_unlock();
650} 651}
651 652
652static u64 div_factor(u64 num, int factor)
653{
654 if (factor == 10)
655 return num;
656 num *= factor;
657 do_div(num, 10);
658 return num;
659}
660
661static u64 div_factor_fine(u64 num, int factor)
662{
663 if (factor == 100)
664 return num;
665 num *= factor;
666 do_div(num, 100);
667 return num;
668}
669
670u64 btrfs_find_block_group(struct btrfs_root *root, 653u64 btrfs_find_block_group(struct btrfs_root *root,
671 u64 search_start, u64 search_hint, int owner) 654 u64 search_start, u64 search_hint, int owner)
672{ 655{
@@ -1835,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1835 1818
1836 1819
1837 /* Tell the block device(s) that the sectors can be discarded */ 1820 /* Tell the block device(s) that the sectors can be discarded */
1838 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1821 ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1839 bytenr, &num_bytes, &bbio, 0); 1822 bytenr, &num_bytes, &bbio, 0);
1840 /* Error condition is -ENOMEM */ 1823 /* Error condition is -ENOMEM */
1841 if (!ret) { 1824 if (!ret) {
@@ -2314,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2314 kfree(extent_op); 2297 kfree(extent_op);
2315 2298
2316 if (ret) { 2299 if (ret) {
2300 list_del_init(&locked_ref->cluster);
2301 mutex_unlock(&locked_ref->mutex);
2302
2317 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); 2303 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2318 spin_lock(&delayed_refs->lock); 2304 spin_lock(&delayed_refs->lock);
2319 return ret; 2305 return ret;
@@ -2356,6 +2342,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2356 count++; 2342 count++;
2357 2343
2358 if (ret) { 2344 if (ret) {
2345 if (locked_ref) {
2346 list_del_init(&locked_ref->cluster);
2347 mutex_unlock(&locked_ref->mutex);
2348 }
2359 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); 2349 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2360 spin_lock(&delayed_refs->lock); 2350 spin_lock(&delayed_refs->lock);
2361 return ret; 2351 return ret;
@@ -3661,7 +3651,7 @@ out:
3661 3651
3662static int can_overcommit(struct btrfs_root *root, 3652static int can_overcommit(struct btrfs_root *root,
3663 struct btrfs_space_info *space_info, u64 bytes, 3653 struct btrfs_space_info *space_info, u64 bytes,
3664 int flush) 3654 enum btrfs_reserve_flush_enum flush)
3665{ 3655{
3666 u64 profile = btrfs_get_alloc_profile(root, 0); 3656 u64 profile = btrfs_get_alloc_profile(root, 0);
3667 u64 avail; 3657 u64 avail;
@@ -3685,11 +3675,11 @@ static int can_overcommit(struct btrfs_root *root,
3685 avail >>= 1; 3675 avail >>= 1;
3686 3676
3687 /* 3677 /*
3688 * If we aren't flushing don't let us overcommit too much, say 3678 * If we aren't flushing all things, let us overcommit up to
3689 * 1/8th of the space. If we can flush, let it overcommit up to 3679 * 1/2th of the space. If we can flush, don't let us overcommit
3690 * 1/2 of the space. 3680 * too much, let it overcommit up to 1/8 of the space.
3691 */ 3681 */
3692 if (flush) 3682 if (flush == BTRFS_RESERVE_FLUSH_ALL)
3693 avail >>= 3; 3683 avail >>= 3;
3694 else 3684 else
3695 avail >>= 1; 3685 avail >>= 1;
@@ -3699,6 +3689,20 @@ static int can_overcommit(struct btrfs_root *root,
3699 return 0; 3689 return 0;
3700} 3690}
3701 3691
3692static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
3693 unsigned long nr_pages,
3694 enum wb_reason reason)
3695{
3696 if (!writeback_in_progress(sb->s_bdi) &&
3697 down_read_trylock(&sb->s_umount)) {
3698 writeback_inodes_sb_nr(sb, nr_pages, reason);
3699 up_read(&sb->s_umount);
3700 return 1;
3701 }
3702
3703 return 0;
3704}
3705
3702/* 3706/*
3703 * shrink metadata reservation for delalloc 3707 * shrink metadata reservation for delalloc
3704 */ 3708 */
@@ -3713,6 +3717,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3713 long time_left; 3717 long time_left;
3714 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3718 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3715 int loops = 0; 3719 int loops = 0;
3720 enum btrfs_reserve_flush_enum flush;
3716 3721
3717 trans = (struct btrfs_trans_handle *)current->journal_info; 3722 trans = (struct btrfs_trans_handle *)current->journal_info;
3718 block_rsv = &root->fs_info->delalloc_block_rsv; 3723 block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -3730,8 +3735,9 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3730 while (delalloc_bytes && loops < 3) { 3735 while (delalloc_bytes && loops < 3) {
3731 max_reclaim = min(delalloc_bytes, to_reclaim); 3736 max_reclaim = min(delalloc_bytes, to_reclaim);
3732 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3737 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3733 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3738 writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
3734 WB_REASON_FS_FREE_SPACE); 3739 nr_pages,
3740 WB_REASON_FS_FREE_SPACE);
3735 3741
3736 /* 3742 /*
3737 * We need to wait for the async pages to actually start before 3743 * We need to wait for the async pages to actually start before
@@ -3740,8 +3746,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3740 wait_event(root->fs_info->async_submit_wait, 3746 wait_event(root->fs_info->async_submit_wait,
3741 !atomic_read(&root->fs_info->async_delalloc_pages)); 3747 !atomic_read(&root->fs_info->async_delalloc_pages));
3742 3748
3749 if (!trans)
3750 flush = BTRFS_RESERVE_FLUSH_ALL;
3751 else
3752 flush = BTRFS_RESERVE_NO_FLUSH;
3743 spin_lock(&space_info->lock); 3753 spin_lock(&space_info->lock);
3744 if (can_overcommit(root, space_info, orig, !trans)) { 3754 if (can_overcommit(root, space_info, orig, flush)) {
3745 spin_unlock(&space_info->lock); 3755 spin_unlock(&space_info->lock);
3746 break; 3756 break;
3747 } 3757 }
@@ -3899,7 +3909,8 @@ static int flush_space(struct btrfs_root *root,
3899 */ 3909 */
3900static int reserve_metadata_bytes(struct btrfs_root *root, 3910static int reserve_metadata_bytes(struct btrfs_root *root,
3901 struct btrfs_block_rsv *block_rsv, 3911 struct btrfs_block_rsv *block_rsv,
3902 u64 orig_bytes, int flush) 3912 u64 orig_bytes,
3913 enum btrfs_reserve_flush_enum flush)
3903{ 3914{
3904 struct btrfs_space_info *space_info = block_rsv->space_info; 3915 struct btrfs_space_info *space_info = block_rsv->space_info;
3905 u64 used; 3916 u64 used;
@@ -3912,10 +3923,11 @@ again:
3912 ret = 0; 3923 ret = 0;
3913 spin_lock(&space_info->lock); 3924 spin_lock(&space_info->lock);
3914 /* 3925 /*
3915 * We only want to wait if somebody other than us is flushing and we are 3926 * We only want to wait if somebody other than us is flushing and we
3916 * actually alloed to flush. 3927 * are actually allowed to flush all things.
3917 */ 3928 */
3918 while (flush && !flushing && space_info->flush) { 3929 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
3930 space_info->flush) {
3919 spin_unlock(&space_info->lock); 3931 spin_unlock(&space_info->lock);
3920 /* 3932 /*
3921 * If we have a trans handle we can't wait because the flusher 3933 * If we have a trans handle we can't wait because the flusher
@@ -3981,23 +3993,40 @@ again:
3981 * Couldn't make our reservation, save our place so while we're trying 3993 * Couldn't make our reservation, save our place so while we're trying
3982 * to reclaim space we can actually use it instead of somebody else 3994 * to reclaim space we can actually use it instead of somebody else
3983 * stealing it from us. 3995 * stealing it from us.
3996 *
3997 * We make the other tasks wait for the flush only when we can flush
3998 * all things.
3984 */ 3999 */
3985 if (ret && flush) { 4000 if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
3986 flushing = true; 4001 flushing = true;
3987 space_info->flush = 1; 4002 space_info->flush = 1;
3988 } 4003 }
3989 4004
3990 spin_unlock(&space_info->lock); 4005 spin_unlock(&space_info->lock);
3991 4006
3992 if (!ret || !flush) 4007 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
3993 goto out; 4008 goto out;
3994 4009
3995 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4010 ret = flush_space(root, space_info, num_bytes, orig_bytes,
3996 flush_state); 4011 flush_state);
3997 flush_state++; 4012 flush_state++;
4013
4014 /*
4015 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
4016 * would happen. So skip delalloc flush.
4017 */
4018 if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4019 (flush_state == FLUSH_DELALLOC ||
4020 flush_state == FLUSH_DELALLOC_WAIT))
4021 flush_state = ALLOC_CHUNK;
4022
3998 if (!ret) 4023 if (!ret)
3999 goto again; 4024 goto again;
4000 else if (flush_state <= COMMIT_TRANS) 4025 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4026 flush_state < COMMIT_TRANS)
4027 goto again;
4028 else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4029 flush_state <= COMMIT_TRANS)
4001 goto again; 4030 goto again;
4002 4031
4003out: 4032out:
@@ -4148,9 +4177,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
4148 kfree(rsv); 4177 kfree(rsv);
4149} 4178}
4150 4179
4151static inline int __block_rsv_add(struct btrfs_root *root, 4180int btrfs_block_rsv_add(struct btrfs_root *root,
4152 struct btrfs_block_rsv *block_rsv, 4181 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4153 u64 num_bytes, int flush) 4182 enum btrfs_reserve_flush_enum flush)
4154{ 4183{
4155 int ret; 4184 int ret;
4156 4185
@@ -4166,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
4166 return ret; 4195 return ret;
4167} 4196}
4168 4197
4169int btrfs_block_rsv_add(struct btrfs_root *root,
4170 struct btrfs_block_rsv *block_rsv,
4171 u64 num_bytes)
4172{
4173 return __block_rsv_add(root, block_rsv, num_bytes, 1);
4174}
4175
4176int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
4177 struct btrfs_block_rsv *block_rsv,
4178 u64 num_bytes)
4179{
4180 return __block_rsv_add(root, block_rsv, num_bytes, 0);
4181}
4182
4183int btrfs_block_rsv_check(struct btrfs_root *root, 4198int btrfs_block_rsv_check(struct btrfs_root *root,
4184 struct btrfs_block_rsv *block_rsv, int min_factor) 4199 struct btrfs_block_rsv *block_rsv, int min_factor)
4185{ 4200{
@@ -4198,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
4198 return ret; 4213 return ret;
4199} 4214}
4200 4215
4201static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, 4216int btrfs_block_rsv_refill(struct btrfs_root *root,
4202 struct btrfs_block_rsv *block_rsv, 4217 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4203 u64 min_reserved, int flush) 4218 enum btrfs_reserve_flush_enum flush)
4204{ 4219{
4205 u64 num_bytes = 0; 4220 u64 num_bytes = 0;
4206 int ret = -ENOSPC; 4221 int ret = -ENOSPC;
@@ -4228,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
4228 return ret; 4243 return ret;
4229} 4244}
4230 4245
4231int btrfs_block_rsv_refill(struct btrfs_root *root,
4232 struct btrfs_block_rsv *block_rsv,
4233 u64 min_reserved)
4234{
4235 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
4236}
4237
4238int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
4239 struct btrfs_block_rsv *block_rsv,
4240 u64 min_reserved)
4241{
4242 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
4243}
4244
4245int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4246int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4246 struct btrfs_block_rsv *dst_rsv, 4247 struct btrfs_block_rsv *dst_rsv,
4247 u64 num_bytes) 4248 u64 num_bytes)
@@ -4532,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4532 u64 csum_bytes; 4533 u64 csum_bytes;
4533 unsigned nr_extents = 0; 4534 unsigned nr_extents = 0;
4534 int extra_reserve = 0; 4535 int extra_reserve = 0;
4535 int flush = 1; 4536 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4536 int ret; 4537 int ret;
4538 bool delalloc_lock = true;
4537 4539
4538 /* Need to be holding the i_mutex here if we aren't free space cache */ 4540 /* If we are a free space inode we need to not flush since we will be in
4539 if (btrfs_is_free_space_inode(inode)) 4541 * the middle of a transaction commit. We also don't need the delalloc
4540 flush = 0; 4542 * mutex since we won't race with anybody. We need this mostly to make
4543 * lockdep shut its filthy mouth.
4544 */
4545 if (btrfs_is_free_space_inode(inode)) {
4546 flush = BTRFS_RESERVE_NO_FLUSH;
4547 delalloc_lock = false;
4548 }
4541 4549
4542 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4550 if (flush != BTRFS_RESERVE_NO_FLUSH &&
4551 btrfs_transaction_in_commit(root->fs_info))
4543 schedule_timeout(1); 4552 schedule_timeout(1);
4544 4553
4545 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4554 if (delalloc_lock)
4555 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4556
4546 num_bytes = ALIGN(num_bytes, root->sectorsize); 4557 num_bytes = ALIGN(num_bytes, root->sectorsize);
4547 4558
4548 spin_lock(&BTRFS_I(inode)->lock); 4559 spin_lock(&BTRFS_I(inode)->lock);
@@ -4572,7 +4583,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4572 ret = btrfs_qgroup_reserve(root, num_bytes + 4583 ret = btrfs_qgroup_reserve(root, num_bytes +
4573 nr_extents * root->leafsize); 4584 nr_extents * root->leafsize);
4574 if (ret) { 4585 if (ret) {
4575 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4586 spin_lock(&BTRFS_I(inode)->lock);
4587 calc_csum_metadata_size(inode, num_bytes, 0);
4588 spin_unlock(&BTRFS_I(inode)->lock);
4589 if (delalloc_lock)
4590 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4576 return ret; 4591 return ret;
4577 } 4592 }
4578 } 4593 }
@@ -4607,7 +4622,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4607 btrfs_ino(inode), 4622 btrfs_ino(inode),
4608 to_free, 0); 4623 to_free, 0);
4609 } 4624 }
4610 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4625 if (root->fs_info->quota_enabled) {
4626 btrfs_qgroup_free(root, num_bytes +
4627 nr_extents * root->leafsize);
4628 }
4629 if (delalloc_lock)
4630 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4611 return ret; 4631 return ret;
4612 } 4632 }
4613 4633
@@ -4619,7 +4639,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4619 } 4639 }
4620 BTRFS_I(inode)->reserved_extents += nr_extents; 4640 BTRFS_I(inode)->reserved_extents += nr_extents;
4621 spin_unlock(&BTRFS_I(inode)->lock); 4641 spin_unlock(&BTRFS_I(inode)->lock);
4622 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4642
4643 if (delalloc_lock)
4644 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4623 4645
4624 if (to_reserve) 4646 if (to_reserve)
4625 trace_btrfs_space_reservation(root->fs_info,"delalloc", 4647 trace_btrfs_space_reservation(root->fs_info,"delalloc",
@@ -4969,9 +4991,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4969{ 4991{
4970 struct btrfs_fs_info *fs_info = root->fs_info; 4992 struct btrfs_fs_info *fs_info = root->fs_info;
4971 struct btrfs_block_group_cache *cache = NULL; 4993 struct btrfs_block_group_cache *cache = NULL;
4994 struct btrfs_space_info *space_info;
4995 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4972 u64 len; 4996 u64 len;
4997 bool readonly;
4973 4998
4974 while (start <= end) { 4999 while (start <= end) {
5000 readonly = false;
4975 if (!cache || 5001 if (!cache ||
4976 start >= cache->key.objectid + cache->key.offset) { 5002 start >= cache->key.objectid + cache->key.offset) {
4977 if (cache) 5003 if (cache)
@@ -4989,15 +5015,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4989 } 5015 }
4990 5016
4991 start += len; 5017 start += len;
5018 space_info = cache->space_info;
4992 5019
4993 spin_lock(&cache->space_info->lock); 5020 spin_lock(&space_info->lock);
4994 spin_lock(&cache->lock); 5021 spin_lock(&cache->lock);
4995 cache->pinned -= len; 5022 cache->pinned -= len;
4996 cache->space_info->bytes_pinned -= len; 5023 space_info->bytes_pinned -= len;
4997 if (cache->ro) 5024 if (cache->ro) {
4998 cache->space_info->bytes_readonly += len; 5025 space_info->bytes_readonly += len;
5026 readonly = true;
5027 }
4999 spin_unlock(&cache->lock); 5028 spin_unlock(&cache->lock);
5000 spin_unlock(&cache->space_info->lock); 5029 if (!readonly && global_rsv->space_info == space_info) {
5030 spin_lock(&global_rsv->lock);
5031 if (!global_rsv->full) {
5032 len = min(len, global_rsv->size -
5033 global_rsv->reserved);
5034 global_rsv->reserved += len;
5035 space_info->bytes_may_use += len;
5036 if (global_rsv->reserved >= global_rsv->size)
5037 global_rsv->full = 1;
5038 }
5039 spin_unlock(&global_rsv->lock);
5040 }
5041 spin_unlock(&space_info->lock);
5001 } 5042 }
5002 5043
5003 if (cache) 5044 if (cache)
@@ -5466,7 +5507,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5466 return 0; 5507 return 0;
5467} 5508}
5468 5509
5469static int __get_block_group_index(u64 flags) 5510int __get_raid_index(u64 flags)
5470{ 5511{
5471 int index; 5512 int index;
5472 5513
@@ -5486,7 +5527,7 @@ static int __get_block_group_index(u64 flags)
5486 5527
5487static int get_block_group_index(struct btrfs_block_group_cache *cache) 5528static int get_block_group_index(struct btrfs_block_group_cache *cache)
5488{ 5529{
5489 return __get_block_group_index(cache->flags); 5530 return __get_raid_index(cache->flags);
5490} 5531}
5491 5532
5492enum btrfs_loop_type { 5533enum btrfs_loop_type {
@@ -6269,7 +6310,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6269 block_rsv = get_block_rsv(trans, root); 6310 block_rsv = get_block_rsv(trans, root);
6270 6311
6271 if (block_rsv->size == 0) { 6312 if (block_rsv->size == 0) {
6272 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6313 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6314 BTRFS_RESERVE_NO_FLUSH);
6273 /* 6315 /*
6274 * If we couldn't reserve metadata bytes try and use some from 6316 * If we couldn't reserve metadata bytes try and use some from
6275 * the global reserve. 6317 * the global reserve.
@@ -6292,11 +6334,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6292 static DEFINE_RATELIMIT_STATE(_rs, 6334 static DEFINE_RATELIMIT_STATE(_rs,
6293 DEFAULT_RATELIMIT_INTERVAL, 6335 DEFAULT_RATELIMIT_INTERVAL,
6294 /*DEFAULT_RATELIMIT_BURST*/ 2); 6336 /*DEFAULT_RATELIMIT_BURST*/ 2);
6295 if (__ratelimit(&_rs)) { 6337 if (__ratelimit(&_rs))
6296 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); 6338 WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
6297 WARN_ON(1); 6339 ret);
6298 } 6340 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6299 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6341 BTRFS_RESERVE_NO_FLUSH);
6300 if (!ret) { 6342 if (!ret) {
6301 return block_rsv; 6343 return block_rsv;
6302 } else if (ret && block_rsv != global_rsv) { 6344 } else if (ret && block_rsv != global_rsv) {
@@ -7427,7 +7469,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7427 */ 7469 */
7428 target = get_restripe_target(root->fs_info, block_group->flags); 7470 target = get_restripe_target(root->fs_info, block_group->flags);
7429 if (target) { 7471 if (target) {
7430 index = __get_block_group_index(extended_to_chunk(target)); 7472 index = __get_raid_index(extended_to_chunk(target));
7431 } else { 7473 } else {
7432 /* 7474 /*
7433 * this is just a balance, so if we were marked as full 7475 * this is just a balance, so if we were marked as full
@@ -7461,7 +7503,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7461 * check to make sure we can actually find a chunk with enough 7503 * check to make sure we can actually find a chunk with enough
7462 * space to fit our block group in. 7504 * space to fit our block group in.
7463 */ 7505 */
7464 if (device->total_bytes > device->bytes_used + min_free) { 7506 if (device->total_bytes > device->bytes_used + min_free &&
7507 !device->is_tgtdev_for_dev_replace) {
7465 ret = find_free_dev_extent(device, min_free, 7508 ret = find_free_dev_extent(device, min_free,
7466 &dev_offset, NULL); 7509 &dev_offset, NULL);
7467 if (!ret) 7510 if (!ret)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 472873a94d96..1b319df29eee 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -341,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree,
341{ 341{
342 struct rb_node *node; 342 struct rb_node *node;
343 343
344 if (end < start) { 344 if (end < start)
345 printk(KERN_ERR "btrfs end < start %llu %llu\n", 345 WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
346 (unsigned long long)end, 346 (unsigned long long)end,
347 (unsigned long long)start); 347 (unsigned long long)start);
348 WARN_ON(1);
349 }
350 state->start = start; 348 state->start = start;
351 state->end = end; 349 state->end = end;
352 350
@@ -1919,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err)
1919 * the standard behavior is to write all copies in a raid setup. here we only 1917 * the standard behavior is to write all copies in a raid setup. here we only
1920 * want to write the one bad copy. so we do the mapping for ourselves and issue 1918 * want to write the one bad copy. so we do the mapping for ourselves and issue
1921 * submit_bio directly. 1919 * submit_bio directly.
1922 * to avoid any synchonization issues, wait for the data after writing, which 1920 * to avoid any synchronization issues, wait for the data after writing, which
1923 * actually prevents the read that triggered the error from finishing. 1921 * actually prevents the read that triggered the error from finishing.
1924 * currently, there can be no more than two copies of every data bit. thus, 1922 * currently, there can be no more than two copies of every data bit. thus,
1925 * exactly one rewrite is required. 1923 * exactly one rewrite is required.
1926 */ 1924 */
1927int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 1925int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1928 u64 length, u64 logical, struct page *page, 1926 u64 length, u64 logical, struct page *page,
1929 int mirror_num) 1927 int mirror_num)
1930{ 1928{
@@ -1946,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1946 bio->bi_size = 0; 1944 bio->bi_size = 0;
1947 map_length = length; 1945 map_length = length;
1948 1946
1949 ret = btrfs_map_block(map_tree, WRITE, logical, 1947 ret = btrfs_map_block(fs_info, WRITE, logical,
1950 &map_length, &bbio, mirror_num); 1948 &map_length, &bbio, mirror_num);
1951 if (ret) { 1949 if (ret) {
1952 bio_put(bio); 1950 bio_put(bio);
@@ -1984,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1984int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 1982int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
1985 int mirror_num) 1983 int mirror_num)
1986{ 1984{
1987 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
1988 u64 start = eb->start; 1985 u64 start = eb->start;
1989 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 1986 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
1990 int ret = 0; 1987 int ret = 0;
1991 1988
1992 for (i = 0; i < num_pages; i++) { 1989 for (i = 0; i < num_pages; i++) {
1993 struct page *p = extent_buffer_page(eb, i); 1990 struct page *p = extent_buffer_page(eb, i);
1994 ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, 1991 ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
1995 start, p, mirror_num); 1992 start, p, mirror_num);
1996 if (ret) 1993 if (ret)
1997 break; 1994 break;
@@ -2010,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page)
2010 u64 private; 2007 u64 private;
2011 u64 private_failure; 2008 u64 private_failure;
2012 struct io_failure_record *failrec; 2009 struct io_failure_record *failrec;
2013 struct btrfs_mapping_tree *map_tree; 2010 struct btrfs_fs_info *fs_info;
2014 struct extent_state *state; 2011 struct extent_state *state;
2015 int num_copies; 2012 int num_copies;
2016 int did_repair = 0; 2013 int did_repair = 0;
@@ -2046,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page)
2046 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2043 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
2047 2044
2048 if (state && state->start == failrec->start) { 2045 if (state && state->start == failrec->start) {
2049 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 2046 fs_info = BTRFS_I(inode)->root->fs_info;
2050 num_copies = btrfs_num_copies(map_tree, failrec->logical, 2047 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2051 failrec->len); 2048 failrec->len);
2052 if (num_copies > 1) { 2049 if (num_copies > 1) {
2053 ret = repair_io_failure(map_tree, start, failrec->len, 2050 ret = repair_io_failure(fs_info, start, failrec->len,
2054 failrec->logical, page, 2051 failrec->logical, page,
2055 failrec->failed_mirror); 2052 failrec->failed_mirror);
2056 did_repair = !ret; 2053 did_repair = !ret;
@@ -2159,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2159 * clean_io_failure() clean all those errors at once. 2156 * clean_io_failure() clean all those errors at once.
2160 */ 2157 */
2161 } 2158 }
2162 num_copies = btrfs_num_copies( 2159 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
2163 &BTRFS_I(inode)->root->fs_info->mapping_tree, 2160 failrec->logical, failrec->len);
2164 failrec->logical, failrec->len);
2165 if (num_copies == 1) { 2161 if (num_copies == 1) {
2166 /* 2162 /*
2167 * we only have a single copy of the data, so don't bother with 2163 * we only have a single copy of the data, so don't bother with
@@ -2466,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2466 return bio; 2462 return bio;
2467} 2463}
2468 2464
2469/*
2470 * Since writes are async, they will only return -ENOMEM.
2471 * Reads can return the full range of I/O error conditions.
2472 */
2473static int __must_check submit_one_bio(int rw, struct bio *bio, 2465static int __must_check submit_one_bio(int rw, struct bio *bio,
2474 int mirror_num, unsigned long bio_flags) 2466 int mirror_num, unsigned long bio_flags)
2475{ 2467{
@@ -4721,10 +4713,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4721 } 4713 }
4722 4714
4723 if (start + min_len > eb->len) { 4715 if (start + min_len > eb->len) {
4724 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 4716 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4725 "wanted %lu %lu\n", (unsigned long long)eb->start, 4717 "wanted %lu %lu\n", (unsigned long long)eb->start,
4726 eb->len, start, min_len); 4718 eb->len, start, min_len);
4727 WARN_ON(1);
4728 return -EINVAL; 4719 return -EINVAL;
4729 } 4720 }
4730 4721
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 711d12b80028..2eacfabd3263 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -337,9 +337,9 @@ struct bio *
337btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 337btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
338 gfp_t gfp_flags); 338 gfp_t gfp_flags);
339 339
340struct btrfs_mapping_tree; 340struct btrfs_fs_info;
341 341
342int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 342int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
343 u64 length, u64 logical, struct page *page, 343 u64 length, u64 logical, struct page *page,
344 int mirror_num); 344 int mirror_num);
345int end_extent_writepage(struct page *page, int err, u64 start, u64 end); 345int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ce9f79216723..f169d6b11d7f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree)
49struct extent_map *alloc_extent_map(void) 49struct extent_map *alloc_extent_map(void)
50{ 50{
51 struct extent_map *em; 51 struct extent_map *em;
52 em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); 52 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
53 if (!em) 53 if (!em)
54 return NULL; 54 return NULL;
55 em->in_tree = 0; 55 em->in_tree = 0;
@@ -198,16 +198,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
198 merge = rb_entry(rb, struct extent_map, rb_node); 198 merge = rb_entry(rb, struct extent_map, rb_node);
199 if (rb && mergable_maps(merge, em)) { 199 if (rb && mergable_maps(merge, em)) {
200 em->start = merge->start; 200 em->start = merge->start;
201 em->orig_start = merge->orig_start;
201 em->len += merge->len; 202 em->len += merge->len;
202 em->block_len += merge->block_len; 203 em->block_len += merge->block_len;
203 em->block_start = merge->block_start; 204 em->block_start = merge->block_start;
204 merge->in_tree = 0; 205 merge->in_tree = 0;
205 if (merge->generation > em->generation) { 206 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
206 em->mod_start = em->start; 207 em->mod_start = merge->mod_start;
207 em->mod_len = em->len; 208 em->generation = max(em->generation, merge->generation);
208 em->generation = merge->generation; 209 list_move(&em->list, &tree->modified_extents);
209 list_move(&em->list, &tree->modified_extents);
210 }
211 210
212 list_del_init(&merge->list); 211 list_del_init(&merge->list);
213 rb_erase(&merge->rb_node, &tree->map); 212 rb_erase(&merge->rb_node, &tree->map);
@@ -223,11 +222,8 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
223 em->block_len += merge->len; 222 em->block_len += merge->len;
224 rb_erase(&merge->rb_node, &tree->map); 223 rb_erase(&merge->rb_node, &tree->map);
225 merge->in_tree = 0; 224 merge->in_tree = 0;
226 if (merge->generation > em->generation) { 225 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
227 em->mod_len = em->len; 226 em->generation = max(em->generation, merge->generation);
228 em->generation = merge->generation;
229 list_move(&em->list, &tree->modified_extents);
230 }
231 list_del_init(&merge->list); 227 list_del_init(&merge->list);
232 free_extent_map(merge); 228 free_extent_map(merge);
233 } 229 }
@@ -265,9 +261,9 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
265 em->mod_start = em->start; 261 em->mod_start = em->start;
266 em->mod_len = em->len; 262 em->mod_len = em->len;
267 263
268 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 264 if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
269 prealloc = true; 265 prealloc = true;
270 clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); 266 clear_bit(EXTENT_FLAG_FILLING, &em->flags);
271 } 267 }
272 268
273 try_merge_map(tree, em); 269 try_merge_map(tree, em);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 679225555f7b..922943ce29e8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -14,6 +14,7 @@
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ 14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ 15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ 16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
17#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
17 18
18struct extent_map { 19struct extent_map {
19 struct rb_node rb_node; 20 struct rb_node rb_node;
@@ -24,6 +25,7 @@ struct extent_map {
24 u64 mod_start; 25 u64 mod_start;
25 u64 mod_len; 26 u64 mod_len;
26 u64 orig_start; 27 u64 orig_start;
28 u64 orig_block_len;
27 u64 block_start; 29 u64 block_start;
28 u64 block_len; 30 u64 block_len;
29 u64 generation; 31 u64 generation;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1ad08e4e4a15..bd38cef42358 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -133,7 +133,6 @@ fail:
133 return ERR_PTR(ret); 133 return ERR_PTR(ret);
134} 134}
135 135
136
137int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, 136int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
138 struct btrfs_root *root, 137 struct btrfs_root *root,
139 struct btrfs_path *path, u64 objectid, 138 struct btrfs_path *path, u64 objectid,
@@ -151,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
151 return ret; 150 return ret;
152} 151}
153 152
153u64 btrfs_file_extent_length(struct btrfs_path *path)
154{
155 int extent_type;
156 struct btrfs_file_extent_item *fi;
157 u64 len;
158
159 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
160 struct btrfs_file_extent_item);
161 extent_type = btrfs_file_extent_type(path->nodes[0], fi);
162
163 if (extent_type == BTRFS_FILE_EXTENT_REG ||
164 extent_type == BTRFS_FILE_EXTENT_PREALLOC)
165 len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
166 else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
167 len = btrfs_file_extent_inline_len(path->nodes[0], fi);
168 else
169 BUG();
170
171 return len;
172}
154 173
155static int __btrfs_lookup_bio_sums(struct btrfs_root *root, 174static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
156 struct inode *inode, struct bio *bio, 175 struct inode *inode, struct bio *bio,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9c6673a9231f..77061bf43edb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
41#include "compat.h" 41#include "compat.h"
42#include "volumes.h" 42#include "volumes.h"
43 43
44static struct kmem_cache *btrfs_inode_defrag_cachep;
44/* 45/*
45 * when auto defrag is enabled we 46 * when auto defrag is enabled we
46 * queue up these defrag structs to remember which 47 * queue up these defrag structs to remember which
@@ -90,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
90 * If an existing record is found the defrag item you 91 * If an existing record is found the defrag item you
91 * pass in is freed 92 * pass in is freed
92 */ 93 */
93static void __btrfs_add_inode_defrag(struct inode *inode, 94static int __btrfs_add_inode_defrag(struct inode *inode,
94 struct inode_defrag *defrag) 95 struct inode_defrag *defrag)
95{ 96{
96 struct btrfs_root *root = BTRFS_I(inode)->root; 97 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -118,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
118 entry->transid = defrag->transid; 119 entry->transid = defrag->transid;
119 if (defrag->last_offset > entry->last_offset) 120 if (defrag->last_offset > entry->last_offset)
120 entry->last_offset = defrag->last_offset; 121 entry->last_offset = defrag->last_offset;
121 goto exists; 122 return -EEXIST;
122 } 123 }
123 } 124 }
124 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 125 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
125 rb_link_node(&defrag->rb_node, parent, p); 126 rb_link_node(&defrag->rb_node, parent, p);
126 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 127 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
127 return; 128 return 0;
129}
128 130
129exists: 131static inline int __need_auto_defrag(struct btrfs_root *root)
130 kfree(defrag); 132{
131 return; 133 if (!btrfs_test_opt(root, AUTO_DEFRAG))
134 return 0;
135
136 if (btrfs_fs_closing(root->fs_info))
137 return 0;
132 138
139 return 1;
133} 140}
134 141
135/* 142/*
@@ -142,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
142 struct btrfs_root *root = BTRFS_I(inode)->root; 149 struct btrfs_root *root = BTRFS_I(inode)->root;
143 struct inode_defrag *defrag; 150 struct inode_defrag *defrag;
144 u64 transid; 151 u64 transid;
152 int ret;
145 153
146 if (!btrfs_test_opt(root, AUTO_DEFRAG)) 154 if (!__need_auto_defrag(root))
147 return 0;
148
149 if (btrfs_fs_closing(root->fs_info))
150 return 0; 155 return 0;
151 156
152 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 157 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
@@ -157,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
157 else 162 else
158 transid = BTRFS_I(inode)->root->last_trans; 163 transid = BTRFS_I(inode)->root->last_trans;
159 164
160 defrag = kzalloc(sizeof(*defrag), GFP_NOFS); 165 defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
161 if (!defrag) 166 if (!defrag)
162 return -ENOMEM; 167 return -ENOMEM;
163 168
@@ -166,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
166 defrag->root = root->root_key.objectid; 171 defrag->root = root->root_key.objectid;
167 172
168 spin_lock(&root->fs_info->defrag_inodes_lock); 173 spin_lock(&root->fs_info->defrag_inodes_lock);
169 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 174 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
170 __btrfs_add_inode_defrag(inode, defrag); 175 /*
171 else 176 * If we set IN_DEFRAG flag and evict the inode from memory,
172 kfree(defrag); 177 * and then re-read this inode, this new inode doesn't have
178 * IN_DEFRAG flag. At the case, we may find the existed defrag.
179 */
180 ret = __btrfs_add_inode_defrag(inode, defrag);
181 if (ret)
182 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
183 } else {
184 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
185 }
173 spin_unlock(&root->fs_info->defrag_inodes_lock); 186 spin_unlock(&root->fs_info->defrag_inodes_lock);
174 return 0; 187 return 0;
175} 188}
176 189
177/* 190/*
178 * must be called with the defrag_inodes lock held 191 * Requeue the defrag object. If there is a defrag object that points to
192 * the same inode in the tree, we will merge them together (by
193 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
179 */ 194 */
180struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, 195void btrfs_requeue_inode_defrag(struct inode *inode,
181 u64 root, u64 ino, 196 struct inode_defrag *defrag)
182 struct rb_node **next) 197{
198 struct btrfs_root *root = BTRFS_I(inode)->root;
199 int ret;
200
201 if (!__need_auto_defrag(root))
202 goto out;
203
204 /*
205 * Here we don't check the IN_DEFRAG flag, because we need merge
206 * them together.
207 */
208 spin_lock(&root->fs_info->defrag_inodes_lock);
209 ret = __btrfs_add_inode_defrag(inode, defrag);
210 spin_unlock(&root->fs_info->defrag_inodes_lock);
211 if (ret)
212 goto out;
213 return;
214out:
215 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
216}
217
218/*
219 * pick the defragable inode that we want, if it doesn't exist, we will get
220 * the next one.
221 */
222static struct inode_defrag *
223btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
183{ 224{
184 struct inode_defrag *entry = NULL; 225 struct inode_defrag *entry = NULL;
185 struct inode_defrag tmp; 226 struct inode_defrag tmp;
@@ -190,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
190 tmp.ino = ino; 231 tmp.ino = ino;
191 tmp.root = root; 232 tmp.root = root;
192 233
193 p = info->defrag_inodes.rb_node; 234 spin_lock(&fs_info->defrag_inodes_lock);
235 p = fs_info->defrag_inodes.rb_node;
194 while (p) { 236 while (p) {
195 parent = p; 237 parent = p;
196 entry = rb_entry(parent, struct inode_defrag, rb_node); 238 entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -201,52 +243,131 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
201 else if (ret > 0) 243 else if (ret > 0)
202 p = parent->rb_right; 244 p = parent->rb_right;
203 else 245 else
204 return entry; 246 goto out;
205 } 247 }
206 248
207 if (next) { 249 if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
208 while (parent && __compare_inode_defrag(&tmp, entry) > 0) { 250 parent = rb_next(parent);
209 parent = rb_next(parent); 251 if (parent)
210 entry = rb_entry(parent, struct inode_defrag, rb_node); 252 entry = rb_entry(parent, struct inode_defrag, rb_node);
211 } 253 else
212 *next = parent; 254 entry = NULL;
213 } 255 }
214 return NULL; 256out:
257 if (entry)
258 rb_erase(parent, &fs_info->defrag_inodes);
259 spin_unlock(&fs_info->defrag_inodes_lock);
260 return entry;
215} 261}
216 262
217/* 263void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
218 * run through the list of inodes in the FS that need
219 * defragging
220 */
221int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
222{ 264{
223 struct inode_defrag *defrag; 265 struct inode_defrag *defrag;
266 struct rb_node *node;
267
268 spin_lock(&fs_info->defrag_inodes_lock);
269 node = rb_first(&fs_info->defrag_inodes);
270 while (node) {
271 rb_erase(node, &fs_info->defrag_inodes);
272 defrag = rb_entry(node, struct inode_defrag, rb_node);
273 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
274
275 if (need_resched()) {
276 spin_unlock(&fs_info->defrag_inodes_lock);
277 cond_resched();
278 spin_lock(&fs_info->defrag_inodes_lock);
279 }
280
281 node = rb_first(&fs_info->defrag_inodes);
282 }
283 spin_unlock(&fs_info->defrag_inodes_lock);
284}
285
286#define BTRFS_DEFRAG_BATCH 1024
287
288static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
289 struct inode_defrag *defrag)
290{
224 struct btrfs_root *inode_root; 291 struct btrfs_root *inode_root;
225 struct inode *inode; 292 struct inode *inode;
226 struct rb_node *n;
227 struct btrfs_key key; 293 struct btrfs_key key;
228 struct btrfs_ioctl_defrag_range_args range; 294 struct btrfs_ioctl_defrag_range_args range;
229 u64 first_ino = 0;
230 u64 root_objectid = 0;
231 int num_defrag; 295 int num_defrag;
232 int defrag_batch = 1024;
233 296
297 /* get the inode */
298 key.objectid = defrag->root;
299 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
300 key.offset = (u64)-1;
301 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
302 if (IS_ERR(inode_root)) {
303 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
304 return PTR_ERR(inode_root);
305 }
306
307 key.objectid = defrag->ino;
308 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
309 key.offset = 0;
310 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
311 if (IS_ERR(inode)) {
312 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
313 return PTR_ERR(inode);
314 }
315
316 /* do a chunk of defrag */
317 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
234 memset(&range, 0, sizeof(range)); 318 memset(&range, 0, sizeof(range));
235 range.len = (u64)-1; 319 range.len = (u64)-1;
320 range.start = defrag->last_offset;
321
322 sb_start_write(fs_info->sb);
323 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
324 BTRFS_DEFRAG_BATCH);
325 sb_end_write(fs_info->sb);
326 /*
327 * if we filled the whole defrag batch, there
328 * must be more work to do. Queue this defrag
329 * again
330 */
331 if (num_defrag == BTRFS_DEFRAG_BATCH) {
332 defrag->last_offset = range.start;
333 btrfs_requeue_inode_defrag(inode, defrag);
334 } else if (defrag->last_offset && !defrag->cycled) {
335 /*
336 * we didn't fill our defrag batch, but
337 * we didn't start at zero. Make sure we loop
338 * around to the start of the file.
339 */
340 defrag->last_offset = 0;
341 defrag->cycled = 1;
342 btrfs_requeue_inode_defrag(inode, defrag);
343 } else {
344 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
345 }
346
347 iput(inode);
348 return 0;
349}
350
351/*
352 * run through the list of inodes in the FS that need
353 * defragging
354 */
355int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
356{
357 struct inode_defrag *defrag;
358 u64 first_ino = 0;
359 u64 root_objectid = 0;
236 360
237 atomic_inc(&fs_info->defrag_running); 361 atomic_inc(&fs_info->defrag_running);
238 spin_lock(&fs_info->defrag_inodes_lock);
239 while(1) { 362 while(1) {
240 n = NULL; 363 if (!__need_auto_defrag(fs_info->tree_root))
364 break;
241 365
242 /* find an inode to defrag */ 366 /* find an inode to defrag */
243 defrag = btrfs_find_defrag_inode(fs_info, root_objectid, 367 defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
244 first_ino, &n); 368 first_ino);
245 if (!defrag) { 369 if (!defrag) {
246 if (n) { 370 if (root_objectid || first_ino) {
247 defrag = rb_entry(n, struct inode_defrag,
248 rb_node);
249 } else if (root_objectid || first_ino) {
250 root_objectid = 0; 371 root_objectid = 0;
251 first_ino = 0; 372 first_ino = 0;
252 continue; 373 continue;
@@ -255,70 +376,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
255 } 376 }
256 } 377 }
257 378
258 /* remove it from the rbtree */
259 first_ino = defrag->ino + 1; 379 first_ino = defrag->ino + 1;
260 root_objectid = defrag->root; 380 root_objectid = defrag->root;
261 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
262
263 if (btrfs_fs_closing(fs_info))
264 goto next_free;
265
266 spin_unlock(&fs_info->defrag_inodes_lock);
267
268 /* get the inode */
269 key.objectid = defrag->root;
270 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
271 key.offset = (u64)-1;
272 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
273 if (IS_ERR(inode_root))
274 goto next;
275
276 key.objectid = defrag->ino;
277 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
278 key.offset = 0;
279
280 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
281 if (IS_ERR(inode))
282 goto next;
283 381
284 /* do a chunk of defrag */ 382 __btrfs_run_defrag_inode(fs_info, defrag);
285 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
286 range.start = defrag->last_offset;
287 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
288 defrag_batch);
289 /*
290 * if we filled the whole defrag batch, there
291 * must be more work to do. Queue this defrag
292 * again
293 */
294 if (num_defrag == defrag_batch) {
295 defrag->last_offset = range.start;
296 __btrfs_add_inode_defrag(inode, defrag);
297 /*
298 * we don't want to kfree defrag, we added it back to
299 * the rbtree
300 */
301 defrag = NULL;
302 } else if (defrag->last_offset && !defrag->cycled) {
303 /*
304 * we didn't fill our defrag batch, but
305 * we didn't start at zero. Make sure we loop
306 * around to the start of the file.
307 */
308 defrag->last_offset = 0;
309 defrag->cycled = 1;
310 __btrfs_add_inode_defrag(inode, defrag);
311 defrag = NULL;
312 }
313
314 iput(inode);
315next:
316 spin_lock(&fs_info->defrag_inodes_lock);
317next_free:
318 kfree(defrag);
319 } 383 }
320 spin_unlock(&fs_info->defrag_inodes_lock);
321
322 atomic_dec(&fs_info->defrag_running); 384 atomic_dec(&fs_info->defrag_running);
323 385
324 /* 386 /*
@@ -526,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
526 split->block_len = em->block_len; 588 split->block_len = em->block_len;
527 else 589 else
528 split->block_len = split->len; 590 split->block_len = split->len;
591 split->orig_block_len = max(split->block_len,
592 em->orig_block_len);
529 split->generation = gen; 593 split->generation = gen;
530 split->bdev = em->bdev; 594 split->bdev = em->bdev;
531 split->flags = flags; 595 split->flags = flags;
@@ -547,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
547 split->flags = flags; 611 split->flags = flags;
548 split->compress_type = em->compress_type; 612 split->compress_type = em->compress_type;
549 split->generation = gen; 613 split->generation = gen;
614 split->orig_block_len = max(em->block_len,
615 em->orig_block_len);
550 616
551 if (compressed) { 617 if (compressed) {
552 split->block_len = em->block_len; 618 split->block_len = em->block_len;
@@ -555,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
555 } else { 621 } else {
556 split->block_len = split->len; 622 split->block_len = split->len;
557 split->block_start = em->block_start + diff; 623 split->block_start = em->block_start + diff;
558 split->orig_start = split->start; 624 split->orig_start = em->orig_start;
559 } 625 }
560 626
561 ret = add_extent_mapping(em_tree, split); 627 ret = add_extent_mapping(em_tree, split);
@@ -1348,7 +1414,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1348 1414
1349 balance_dirty_pages_ratelimited(inode->i_mapping); 1415 balance_dirty_pages_ratelimited(inode->i_mapping);
1350 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1416 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1351 btrfs_btree_balance_dirty(root, 1); 1417 btrfs_btree_balance_dirty(root);
1352 1418
1353 pos += copied; 1419 pos += copied;
1354 num_written += copied; 1420 num_written += copied;
@@ -1397,6 +1463,24 @@ out:
1397 return written ? written : err; 1463 return written ? written : err;
1398} 1464}
1399 1465
1466static void update_time_for_write(struct inode *inode)
1467{
1468 struct timespec now;
1469
1470 if (IS_NOCMTIME(inode))
1471 return;
1472
1473 now = current_fs_time(inode->i_sb);
1474 if (!timespec_equal(&inode->i_mtime, &now))
1475 inode->i_mtime = now;
1476
1477 if (!timespec_equal(&inode->i_ctime, &now))
1478 inode->i_ctime = now;
1479
1480 if (IS_I_VERSION(inode))
1481 inode_inc_iversion(inode);
1482}
1483
1400static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 1484static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1401 const struct iovec *iov, 1485 const struct iovec *iov,
1402 unsigned long nr_segs, loff_t pos) 1486 unsigned long nr_segs, loff_t pos)
@@ -1409,6 +1493,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1409 ssize_t num_written = 0; 1493 ssize_t num_written = 0;
1410 ssize_t err = 0; 1494 ssize_t err = 0;
1411 size_t count, ocount; 1495 size_t count, ocount;
1496 bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
1412 1497
1413 sb_start_write(inode->i_sb); 1498 sb_start_write(inode->i_sb);
1414 1499
@@ -1451,11 +1536,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1451 goto out; 1536 goto out;
1452 } 1537 }
1453 1538
1454 err = file_update_time(file); 1539 /*
1455 if (err) { 1540 * We reserve space for updating the inode when we reserve space for the
1456 mutex_unlock(&inode->i_mutex); 1541 * extent we are going to write, so we will enospc out there. We don't
1457 goto out; 1542 * need to start yet another transaction to update the inode as we will
1458 } 1543 * update the inode when we finish writing whatever data we write.
1544 */
1545 update_time_for_write(inode);
1459 1546
1460 start_pos = round_down(pos, root->sectorsize); 1547 start_pos = round_down(pos, root->sectorsize);
1461 if (start_pos > i_size_read(inode)) { 1548 if (start_pos > i_size_read(inode)) {
@@ -1466,6 +1553,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1466 } 1553 }
1467 } 1554 }
1468 1555
1556 if (sync)
1557 atomic_inc(&BTRFS_I(inode)->sync_writers);
1558
1469 if (unlikely(file->f_flags & O_DIRECT)) { 1559 if (unlikely(file->f_flags & O_DIRECT)) {
1470 num_written = __btrfs_direct_write(iocb, iov, nr_segs, 1560 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1471 pos, ppos, count, ocount); 1561 pos, ppos, count, ocount);
@@ -1492,14 +1582,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1492 * this will either be one more than the running transaction 1582 * this will either be one more than the running transaction
1493 * or the generation used for the next transaction if there isn't 1583 * or the generation used for the next transaction if there isn't
1494 * one running right now. 1584 * one running right now.
1585 *
1586 * We also have to set last_sub_trans to the current log transid,
1587 * otherwise subsequent syncs to a file that's been synced in this
1588 * transaction will appear to have already occurred.
1495 */ 1589 */
1496 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1590 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1591 BTRFS_I(inode)->last_sub_trans = root->log_transid;
1497 if (num_written > 0 || num_written == -EIOCBQUEUED) { 1592 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1498 err = generic_write_sync(file, pos, num_written); 1593 err = generic_write_sync(file, pos, num_written);
1499 if (err < 0 && num_written > 0) 1594 if (err < 0 && num_written > 0)
1500 num_written = err; 1595 num_written = err;
1501 } 1596 }
1502out: 1597out:
1598 if (sync)
1599 atomic_dec(&BTRFS_I(inode)->sync_writers);
1503 sb_end_write(inode->i_sb); 1600 sb_end_write(inode->i_sb);
1504 current->backing_dev_info = NULL; 1601 current->backing_dev_info = NULL;
1505 return num_written ? num_written : err; 1602 return num_written ? num_written : err;
@@ -1550,7 +1647,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1550 * out of the ->i_mutex. If so, we can flush the dirty pages by 1647 * out of the ->i_mutex. If so, we can flush the dirty pages by
1551 * multi-task, and improve the performance. 1648 * multi-task, and improve the performance.
1552 */ 1649 */
1650 atomic_inc(&BTRFS_I(inode)->sync_writers);
1553 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1651 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1652 atomic_dec(&BTRFS_I(inode)->sync_writers);
1554 if (ret) 1653 if (ret)
1555 return ret; 1654 return ret;
1556 1655
@@ -1561,7 +1660,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1561 * range being left. 1660 * range being left.
1562 */ 1661 */
1563 atomic_inc(&root->log_batch); 1662 atomic_inc(&root->log_batch);
1564 btrfs_wait_ordered_range(inode, start, end); 1663 btrfs_wait_ordered_range(inode, start, end - start + 1);
1565 atomic_inc(&root->log_batch); 1664 atomic_inc(&root->log_batch);
1566 1665
1567 /* 1666 /*
@@ -1767,6 +1866,7 @@ out:
1767 1866
1768 hole_em->block_start = EXTENT_MAP_HOLE; 1867 hole_em->block_start = EXTENT_MAP_HOLE;
1769 hole_em->block_len = 0; 1868 hole_em->block_len = 0;
1869 hole_em->orig_block_len = 0;
1770 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 1870 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
1771 hole_em->compress_type = BTRFS_COMPRESS_NONE; 1871 hole_em->compress_type = BTRFS_COMPRESS_NONE;
1772 hole_em->generation = trans->transid; 1872 hole_em->generation = trans->transid;
@@ -1796,48 +1896,51 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1796 struct btrfs_path *path; 1896 struct btrfs_path *path;
1797 struct btrfs_block_rsv *rsv; 1897 struct btrfs_block_rsv *rsv;
1798 struct btrfs_trans_handle *trans; 1898 struct btrfs_trans_handle *trans;
1799 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 1899 u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
1800 u64 lockstart = (offset + mask) & ~mask; 1900 u64 lockend = round_down(offset + len,
1801 u64 lockend = ((offset + len) & ~mask) - 1; 1901 BTRFS_I(inode)->root->sectorsize) - 1;
1802 u64 cur_offset = lockstart; 1902 u64 cur_offset = lockstart;
1803 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 1903 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
1804 u64 drop_end; 1904 u64 drop_end;
1805 unsigned long nr;
1806 int ret = 0; 1905 int ret = 0;
1807 int err = 0; 1906 int err = 0;
1808 bool same_page = (offset >> PAGE_CACHE_SHIFT) == 1907 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
1809 ((offset + len) >> PAGE_CACHE_SHIFT); 1908 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
1810 1909
1811 btrfs_wait_ordered_range(inode, offset, len); 1910 btrfs_wait_ordered_range(inode, offset, len);
1812 1911
1813 mutex_lock(&inode->i_mutex); 1912 mutex_lock(&inode->i_mutex);
1814 if (offset >= inode->i_size) { 1913 /*
1815 mutex_unlock(&inode->i_mutex); 1914 * We needn't truncate any page which is beyond the end of the file
1816 return 0; 1915 * because we are sure there is no data there.
1817 } 1916 */
1818
1819 /* 1917 /*
1820 * Only do this if we are in the same page and we aren't doing the 1918 * Only do this if we are in the same page and we aren't doing the
1821 * entire page. 1919 * entire page.
1822 */ 1920 */
1823 if (same_page && len < PAGE_CACHE_SIZE) { 1921 if (same_page && len < PAGE_CACHE_SIZE) {
1824 ret = btrfs_truncate_page(inode, offset, len, 0); 1922 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
1923 ret = btrfs_truncate_page(inode, offset, len, 0);
1825 mutex_unlock(&inode->i_mutex); 1924 mutex_unlock(&inode->i_mutex);
1826 return ret; 1925 return ret;
1827 } 1926 }
1828 1927
1829 /* zero back part of the first page */ 1928 /* zero back part of the first page */
1830 ret = btrfs_truncate_page(inode, offset, 0, 0); 1929 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
1831 if (ret) { 1930 ret = btrfs_truncate_page(inode, offset, 0, 0);
1832 mutex_unlock(&inode->i_mutex); 1931 if (ret) {
1833 return ret; 1932 mutex_unlock(&inode->i_mutex);
1933 return ret;
1934 }
1834 } 1935 }
1835 1936
1836 /* zero the front end of the last page */ 1937 /* zero the front end of the last page */
1837 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 1938 if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
1838 if (ret) { 1939 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
1839 mutex_unlock(&inode->i_mutex); 1940 if (ret) {
1840 return ret; 1941 mutex_unlock(&inode->i_mutex);
1942 return ret;
1943 }
1841 } 1944 }
1842 1945
1843 if (lockend < lockstart) { 1946 if (lockend < lockstart) {
@@ -1930,9 +2033,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1930 break; 2033 break;
1931 } 2034 }
1932 2035
1933 nr = trans->blocks_used;
1934 btrfs_end_transaction(trans, root); 2036 btrfs_end_transaction(trans, root);
1935 btrfs_btree_balance_dirty(root, nr); 2037 btrfs_btree_balance_dirty(root);
1936 2038
1937 trans = btrfs_start_transaction(root, 3); 2039 trans = btrfs_start_transaction(root, 3);
1938 if (IS_ERR(trans)) { 2040 if (IS_ERR(trans)) {
@@ -1963,11 +2065,13 @@ out_trans:
1963 if (!trans) 2065 if (!trans)
1964 goto out_free; 2066 goto out_free;
1965 2067
2068 inode_inc_iversion(inode);
2069 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2070
1966 trans->block_rsv = &root->fs_info->trans_block_rsv; 2071 trans->block_rsv = &root->fs_info->trans_block_rsv;
1967 ret = btrfs_update_inode(trans, root, inode); 2072 ret = btrfs_update_inode(trans, root, inode);
1968 nr = trans->blocks_used;
1969 btrfs_end_transaction(trans, root); 2073 btrfs_end_transaction(trans, root);
1970 btrfs_btree_balance_dirty(root, nr); 2074 btrfs_btree_balance_dirty(root);
1971out_free: 2075out_free:
1972 btrfs_free_path(path); 2076 btrfs_free_path(path);
1973 btrfs_free_block_rsv(root, rsv); 2077 btrfs_free_block_rsv(root, rsv);
@@ -1991,12 +2095,12 @@ static long btrfs_fallocate(struct file *file, int mode,
1991 u64 alloc_end; 2095 u64 alloc_end;
1992 u64 alloc_hint = 0; 2096 u64 alloc_hint = 0;
1993 u64 locked_end; 2097 u64 locked_end;
1994 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1995 struct extent_map *em; 2098 struct extent_map *em;
2099 int blocksize = BTRFS_I(inode)->root->sectorsize;
1996 int ret; 2100 int ret;
1997 2101
1998 alloc_start = offset & ~mask; 2102 alloc_start = round_down(offset, blocksize);
1999 alloc_end = (offset + len + mask) & ~mask; 2103 alloc_end = round_up(offset + len, blocksize);
2000 2104
2001 /* Make sure we aren't being given some crap mode */ 2105 /* Make sure we aren't being given some crap mode */
2002 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2106 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2009,7 +2113,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2009 * Make sure we have enough space before we do the 2113 * Make sure we have enough space before we do the
2010 * allocation. 2114 * allocation.
2011 */ 2115 */
2012 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); 2116 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
2013 if (ret) 2117 if (ret)
2014 return ret; 2118 return ret;
2015 2119
@@ -2077,7 +2181,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2077 } 2181 }
2078 last_byte = min(extent_map_end(em), alloc_end); 2182 last_byte = min(extent_map_end(em), alloc_end);
2079 actual_end = min_t(u64, extent_map_end(em), offset + len); 2183 actual_end = min_t(u64, extent_map_end(em), offset + len);
2080 last_byte = (last_byte + mask) & ~mask; 2184 last_byte = ALIGN(last_byte, blocksize);
2081 2185
2082 if (em->block_start == EXTENT_MAP_HOLE || 2186 if (em->block_start == EXTENT_MAP_HOLE ||
2083 (cur_offset >= inode->i_size && 2187 (cur_offset >= inode->i_size &&
@@ -2116,7 +2220,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2116out: 2220out:
2117 mutex_unlock(&inode->i_mutex); 2221 mutex_unlock(&inode->i_mutex);
2118 /* Let go of our reservation. */ 2222 /* Let go of our reservation. */
2119 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); 2223 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
2120 return ret; 2224 return ret;
2121} 2225}
2122 2226
@@ -2292,3 +2396,21 @@ const struct file_operations btrfs_file_operations = {
2292 .compat_ioctl = btrfs_ioctl, 2396 .compat_ioctl = btrfs_ioctl,
2293#endif 2397#endif
2294}; 2398};
2399
2400void btrfs_auto_defrag_exit(void)
2401{
2402 if (btrfs_inode_defrag_cachep)
2403 kmem_cache_destroy(btrfs_inode_defrag_cachep);
2404}
2405
2406int btrfs_auto_defrag_init(void)
2407{
2408 btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
2409 sizeof(struct inode_defrag), 0,
2410 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
2411 NULL);
2412 if (!btrfs_inode_defrag_cachep)
2413 return -ENOMEM;
2414
2415 return 0;
2416}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 1027b854b90c..59ea2e4349c9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)
307 307
308static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) 308static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
309{ 309{
310 WARN_ON(io_ctl->cur);
311 BUG_ON(io_ctl->index >= io_ctl->num_pages); 310 BUG_ON(io_ctl->index >= io_ctl->num_pages);
312 io_ctl->page = io_ctl->pages[io_ctl->index++]; 311 io_ctl->page = io_ctl->pages[io_ctl->index++];
313 io_ctl->cur = kmap(io_ctl->page); 312 io_ctl->cur = kmap(io_ctl->page);
@@ -1250,18 +1249,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
1250 * if previous extent entry covers the offset, 1249 * if previous extent entry covers the offset,
1251 * we should return it instead of the bitmap entry 1250 * we should return it instead of the bitmap entry
1252 */ 1251 */
1253 n = &entry->offset_index; 1252 n = rb_prev(&entry->offset_index);
1254 while (1) { 1253 if (n) {
1255 n = rb_prev(n);
1256 if (!n)
1257 break;
1258 prev = rb_entry(n, struct btrfs_free_space, 1254 prev = rb_entry(n, struct btrfs_free_space,
1259 offset_index); 1255 offset_index);
1260 if (!prev->bitmap) { 1256 if (!prev->bitmap &&
1261 if (prev->offset + prev->bytes > offset) 1257 prev->offset + prev->bytes > offset)
1262 entry = prev; 1258 entry = prev;
1263 break;
1264 }
1265 } 1259 }
1266 } 1260 }
1267 return entry; 1261 return entry;
@@ -1287,18 +1281,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
1287 } 1281 }
1288 1282
1289 if (entry->bitmap) { 1283 if (entry->bitmap) {
1290 n = &entry->offset_index; 1284 n = rb_prev(&entry->offset_index);
1291 while (1) { 1285 if (n) {
1292 n = rb_prev(n);
1293 if (!n)
1294 break;
1295 prev = rb_entry(n, struct btrfs_free_space, 1286 prev = rb_entry(n, struct btrfs_free_space,
1296 offset_index); 1287 offset_index);
1297 if (!prev->bitmap) { 1288 if (!prev->bitmap &&
1298 if (prev->offset + prev->bytes > offset) 1289 prev->offset + prev->bytes > offset)
1299 return prev; 1290 return prev;
1300 break;
1301 }
1302 } 1291 }
1303 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1292 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1304 return entry; 1293 return entry;
@@ -1364,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1364 u64 bitmap_bytes; 1353 u64 bitmap_bytes;
1365 u64 extent_bytes; 1354 u64 extent_bytes;
1366 u64 size = block_group->key.offset; 1355 u64 size = block_group->key.offset;
1367 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; 1356 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1368 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1357 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1369 1358
1370 BUG_ON(ctl->total_bitmaps > max_bitmaps); 1359 BUG_ON(ctl->total_bitmaps > max_bitmaps);
@@ -1650,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1650 * some block groups are so tiny they can't be enveloped by a bitmap, so 1639 * some block groups are so tiny they can't be enveloped by a bitmap, so
1651 * don't even bother to create a bitmap for this 1640 * don't even bother to create a bitmap for this
1652 */ 1641 */
1653 if (BITS_PER_BITMAP * block_group->sectorsize > 1642 if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
1654 block_group->key.offset)
1655 return false; 1643 return false;
1656 1644
1657 return true; 1645 return true;
@@ -2298,10 +2286,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2298 unsigned long total_found = 0; 2286 unsigned long total_found = 0;
2299 int ret; 2287 int ret;
2300 2288
2301 i = offset_to_bit(entry->offset, block_group->sectorsize, 2289 i = offset_to_bit(entry->offset, ctl->unit,
2302 max_t(u64, offset, entry->offset)); 2290 max_t(u64, offset, entry->offset));
2303 want_bits = bytes_to_bits(bytes, block_group->sectorsize); 2291 want_bits = bytes_to_bits(bytes, ctl->unit);
2304 min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2292 min_bits = bytes_to_bits(min_bytes, ctl->unit);
2305 2293
2306again: 2294again:
2307 found_bits = 0; 2295 found_bits = 0;
@@ -2325,23 +2313,22 @@ again:
2325 2313
2326 total_found += found_bits; 2314 total_found += found_bits;
2327 2315
2328 if (cluster->max_size < found_bits * block_group->sectorsize) 2316 if (cluster->max_size < found_bits * ctl->unit)
2329 cluster->max_size = found_bits * block_group->sectorsize; 2317 cluster->max_size = found_bits * ctl->unit;
2330 2318
2331 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 2319 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2332 i = next_zero + 1; 2320 i = next_zero + 1;
2333 goto again; 2321 goto again;
2334 } 2322 }
2335 2323
2336 cluster->window_start = start * block_group->sectorsize + 2324 cluster->window_start = start * ctl->unit + entry->offset;
2337 entry->offset;
2338 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2325 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2339 ret = tree_insert_offset(&cluster->root, entry->offset, 2326 ret = tree_insert_offset(&cluster->root, entry->offset,
2340 &entry->offset_index, 1); 2327 &entry->offset_index, 1);
2341 BUG_ON(ret); /* -EEXIST; Logic error */ 2328 BUG_ON(ret); /* -EEXIST; Logic error */
2342 2329
2343 trace_btrfs_setup_cluster(block_group, cluster, 2330 trace_btrfs_setup_cluster(block_group, cluster,
2344 total_found * block_group->sectorsize, 1); 2331 total_found * ctl->unit, 1);
2345 return 0; 2332 return 0;
2346} 2333}
2347 2334
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b1a1c929ba80..d26f67a59e36 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
434 * 3 items for pre-allocation 434 * 3 items for pre-allocation
435 */ 435 */
436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); 436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
437 ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, 437 ret = btrfs_block_rsv_add(root, trans->block_rsv,
438 trans->bytes_reserved); 438 trans->bytes_reserved,
439 BTRFS_RESERVE_NO_FLUSH);
439 if (ret) 440 if (ret)
440 goto out; 441 goto out;
441 trace_btrfs_space_reservation(root->fs_info, "ino_cache", 442 trace_btrfs_space_reservation(root->fs_info, "ino_cache",
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 95542a1b3dfc..67ed24ae86bb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations;
71static struct extent_io_ops btrfs_extent_io_ops; 71static struct extent_io_ops btrfs_extent_io_ops;
72 72
73static struct kmem_cache *btrfs_inode_cachep; 73static struct kmem_cache *btrfs_inode_cachep;
74static struct kmem_cache *btrfs_delalloc_work_cachep;
74struct kmem_cache *btrfs_trans_handle_cachep; 75struct kmem_cache *btrfs_trans_handle_cachep;
75struct kmem_cache *btrfs_transaction_cachep; 76struct kmem_cache *btrfs_transaction_cachep;
76struct kmem_cache *btrfs_path_cachep; 77struct kmem_cache *btrfs_path_cachep;
@@ -94,6 +95,10 @@ static noinline int cow_file_range(struct inode *inode,
94 struct page *locked_page, 95 struct page *locked_page,
95 u64 start, u64 end, int *page_started, 96 u64 start, u64 end, int *page_started,
96 unsigned long *nr_written, int unlock); 97 unsigned long *nr_written, int unlock);
98static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
99 u64 len, u64 orig_start,
100 u64 block_start, u64 block_len,
101 u64 orig_block_len, int type);
97 102
98static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 103static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
99 struct inode *inode, struct inode *dir, 104 struct inode *inode, struct inode *dir,
@@ -698,14 +703,19 @@ retry:
698 703
699 em->block_start = ins.objectid; 704 em->block_start = ins.objectid;
700 em->block_len = ins.offset; 705 em->block_len = ins.offset;
706 em->orig_block_len = ins.offset;
701 em->bdev = root->fs_info->fs_devices->latest_bdev; 707 em->bdev = root->fs_info->fs_devices->latest_bdev;
702 em->compress_type = async_extent->compress_type; 708 em->compress_type = async_extent->compress_type;
703 set_bit(EXTENT_FLAG_PINNED, &em->flags); 709 set_bit(EXTENT_FLAG_PINNED, &em->flags);
704 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 710 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
711 em->generation = -1;
705 712
706 while (1) { 713 while (1) {
707 write_lock(&em_tree->lock); 714 write_lock(&em_tree->lock);
708 ret = add_extent_mapping(em_tree, em); 715 ret = add_extent_mapping(em_tree, em);
716 if (!ret)
717 list_move(&em->list,
718 &em_tree->modified_extents);
709 write_unlock(&em_tree->lock); 719 write_unlock(&em_tree->lock);
710 if (ret != -EEXIST) { 720 if (ret != -EEXIST) {
711 free_extent_map(em); 721 free_extent_map(em);
@@ -803,14 +813,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
803 * required to start IO on it. It may be clean and already done with 813 * required to start IO on it. It may be clean and already done with
804 * IO when we return. 814 * IO when we return.
805 */ 815 */
806static noinline int cow_file_range(struct inode *inode, 816static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
807 struct page *locked_page, 817 struct inode *inode,
808 u64 start, u64 end, int *page_started, 818 struct btrfs_root *root,
809 unsigned long *nr_written, 819 struct page *locked_page,
810 int unlock) 820 u64 start, u64 end, int *page_started,
821 unsigned long *nr_written,
822 int unlock)
811{ 823{
812 struct btrfs_root *root = BTRFS_I(inode)->root;
813 struct btrfs_trans_handle *trans;
814 u64 alloc_hint = 0; 824 u64 alloc_hint = 0;
815 u64 num_bytes; 825 u64 num_bytes;
816 unsigned long ram_size; 826 unsigned long ram_size;
@@ -823,25 +833,10 @@ static noinline int cow_file_range(struct inode *inode,
823 int ret = 0; 833 int ret = 0;
824 834
825 BUG_ON(btrfs_is_free_space_inode(inode)); 835 BUG_ON(btrfs_is_free_space_inode(inode));
826 trans = btrfs_join_transaction(root);
827 if (IS_ERR(trans)) {
828 extent_clear_unlock_delalloc(inode,
829 &BTRFS_I(inode)->io_tree,
830 start, end, locked_page,
831 EXTENT_CLEAR_UNLOCK_PAGE |
832 EXTENT_CLEAR_UNLOCK |
833 EXTENT_CLEAR_DELALLOC |
834 EXTENT_CLEAR_DIRTY |
835 EXTENT_SET_WRITEBACK |
836 EXTENT_END_WRITEBACK);
837 return PTR_ERR(trans);
838 }
839 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
840 836
841 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 837 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
842 num_bytes = max(blocksize, num_bytes); 838 num_bytes = max(blocksize, num_bytes);
843 disk_num_bytes = num_bytes; 839 disk_num_bytes = num_bytes;
844 ret = 0;
845 840
846 /* if this is a small write inside eof, kick off defrag */ 841 /* if this is a small write inside eof, kick off defrag */
847 if (num_bytes < 64 * 1024 && 842 if (num_bytes < 64 * 1024 &&
@@ -900,12 +895,17 @@ static noinline int cow_file_range(struct inode *inode,
900 895
901 em->block_start = ins.objectid; 896 em->block_start = ins.objectid;
902 em->block_len = ins.offset; 897 em->block_len = ins.offset;
898 em->orig_block_len = ins.offset;
903 em->bdev = root->fs_info->fs_devices->latest_bdev; 899 em->bdev = root->fs_info->fs_devices->latest_bdev;
904 set_bit(EXTENT_FLAG_PINNED, &em->flags); 900 set_bit(EXTENT_FLAG_PINNED, &em->flags);
901 em->generation = -1;
905 902
906 while (1) { 903 while (1) {
907 write_lock(&em_tree->lock); 904 write_lock(&em_tree->lock);
908 ret = add_extent_mapping(em_tree, em); 905 ret = add_extent_mapping(em_tree, em);
906 if (!ret)
907 list_move(&em->list,
908 &em_tree->modified_extents);
909 write_unlock(&em_tree->lock); 909 write_unlock(&em_tree->lock);
910 if (ret != -EEXIST) { 910 if (ret != -EEXIST) {
911 free_extent_map(em); 911 free_extent_map(em);
@@ -952,11 +952,9 @@ static noinline int cow_file_range(struct inode *inode,
952 alloc_hint = ins.objectid + ins.offset; 952 alloc_hint = ins.objectid + ins.offset;
953 start += cur_alloc_size; 953 start += cur_alloc_size;
954 } 954 }
955 ret = 0;
956out: 955out:
957 btrfs_end_transaction(trans, root);
958
959 return ret; 956 return ret;
957
960out_unlock: 958out_unlock:
961 extent_clear_unlock_delalloc(inode, 959 extent_clear_unlock_delalloc(inode,
962 &BTRFS_I(inode)->io_tree, 960 &BTRFS_I(inode)->io_tree,
@@ -971,6 +969,39 @@ out_unlock:
971 goto out; 969 goto out;
972} 970}
973 971
972static noinline int cow_file_range(struct inode *inode,
973 struct page *locked_page,
974 u64 start, u64 end, int *page_started,
975 unsigned long *nr_written,
976 int unlock)
977{
978 struct btrfs_trans_handle *trans;
979 struct btrfs_root *root = BTRFS_I(inode)->root;
980 int ret;
981
982 trans = btrfs_join_transaction(root);
983 if (IS_ERR(trans)) {
984 extent_clear_unlock_delalloc(inode,
985 &BTRFS_I(inode)->io_tree,
986 start, end, locked_page,
987 EXTENT_CLEAR_UNLOCK_PAGE |
988 EXTENT_CLEAR_UNLOCK |
989 EXTENT_CLEAR_DELALLOC |
990 EXTENT_CLEAR_DIRTY |
991 EXTENT_SET_WRITEBACK |
992 EXTENT_END_WRITEBACK);
993 return PTR_ERR(trans);
994 }
995 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
996
997 ret = __cow_file_range(trans, inode, root, locked_page, start, end,
998 page_started, nr_written, unlock);
999
1000 btrfs_end_transaction(trans, root);
1001
1002 return ret;
1003}
1004
974/* 1005/*
975 * work queue call back to started compression on a file and pages 1006 * work queue call back to started compression on a file and pages
976 */ 1007 */
@@ -1126,6 +1157,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1126 u64 extent_offset; 1157 u64 extent_offset;
1127 u64 disk_bytenr; 1158 u64 disk_bytenr;
1128 u64 num_bytes; 1159 u64 num_bytes;
1160 u64 disk_num_bytes;
1129 int extent_type; 1161 int extent_type;
1130 int ret, err; 1162 int ret, err;
1131 int type; 1163 int type;
@@ -1228,6 +1260,8 @@ next_slot:
1228 extent_offset = btrfs_file_extent_offset(leaf, fi); 1260 extent_offset = btrfs_file_extent_offset(leaf, fi);
1229 extent_end = found_key.offset + 1261 extent_end = found_key.offset +
1230 btrfs_file_extent_num_bytes(leaf, fi); 1262 btrfs_file_extent_num_bytes(leaf, fi);
1263 disk_num_bytes =
1264 btrfs_file_extent_disk_num_bytes(leaf, fi);
1231 if (extent_end <= start) { 1265 if (extent_end <= start) {
1232 path->slots[0]++; 1266 path->slots[0]++;
1233 goto next_slot; 1267 goto next_slot;
@@ -1281,9 +1315,9 @@ out_check:
1281 1315
1282 btrfs_release_path(path); 1316 btrfs_release_path(path);
1283 if (cow_start != (u64)-1) { 1317 if (cow_start != (u64)-1) {
1284 ret = cow_file_range(inode, locked_page, cow_start, 1318 ret = __cow_file_range(trans, inode, root, locked_page,
1285 found_key.offset - 1, page_started, 1319 cow_start, found_key.offset - 1,
1286 nr_written, 1); 1320 page_started, nr_written, 1);
1287 if (ret) { 1321 if (ret) {
1288 btrfs_abort_transaction(trans, root, ret); 1322 btrfs_abort_transaction(trans, root, ret);
1289 goto error; 1323 goto error;
@@ -1298,16 +1332,21 @@ out_check:
1298 em = alloc_extent_map(); 1332 em = alloc_extent_map();
1299 BUG_ON(!em); /* -ENOMEM */ 1333 BUG_ON(!em); /* -ENOMEM */
1300 em->start = cur_offset; 1334 em->start = cur_offset;
1301 em->orig_start = em->start; 1335 em->orig_start = found_key.offset - extent_offset;
1302 em->len = num_bytes; 1336 em->len = num_bytes;
1303 em->block_len = num_bytes; 1337 em->block_len = num_bytes;
1304 em->block_start = disk_bytenr; 1338 em->block_start = disk_bytenr;
1339 em->orig_block_len = disk_num_bytes;
1305 em->bdev = root->fs_info->fs_devices->latest_bdev; 1340 em->bdev = root->fs_info->fs_devices->latest_bdev;
1306 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1341 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1307 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 1342 set_bit(EXTENT_FLAG_FILLING, &em->flags);
1343 em->generation = -1;
1308 while (1) { 1344 while (1) {
1309 write_lock(&em_tree->lock); 1345 write_lock(&em_tree->lock);
1310 ret = add_extent_mapping(em_tree, em); 1346 ret = add_extent_mapping(em_tree, em);
1347 if (!ret)
1348 list_move(&em->list,
1349 &em_tree->modified_extents);
1311 write_unlock(&em_tree->lock); 1350 write_unlock(&em_tree->lock);
1312 if (ret != -EEXIST) { 1351 if (ret != -EEXIST) {
1313 free_extent_map(em); 1352 free_extent_map(em);
@@ -1352,8 +1391,9 @@ out_check:
1352 } 1391 }
1353 1392
1354 if (cow_start != (u64)-1) { 1393 if (cow_start != (u64)-1) {
1355 ret = cow_file_range(inode, locked_page, cow_start, end, 1394 ret = __cow_file_range(trans, inode, root, locked_page,
1356 page_started, nr_written, 1); 1395 cow_start, end,
1396 page_started, nr_written, 1);
1357 if (ret) { 1397 if (ret) {
1358 btrfs_abort_transaction(trans, root, ret); 1398 btrfs_abort_transaction(trans, root, ret);
1359 goto error; 1399 goto error;
@@ -1531,7 +1571,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1531 unsigned long bio_flags) 1571 unsigned long bio_flags)
1532{ 1572{
1533 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 1573 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1534 struct btrfs_mapping_tree *map_tree;
1535 u64 logical = (u64)bio->bi_sector << 9; 1574 u64 logical = (u64)bio->bi_sector << 9;
1536 u64 length = 0; 1575 u64 length = 0;
1537 u64 map_length; 1576 u64 map_length;
@@ -1541,11 +1580,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1541 return 0; 1580 return 0;
1542 1581
1543 length = bio->bi_size; 1582 length = bio->bi_size;
1544 map_tree = &root->fs_info->mapping_tree;
1545 map_length = length; 1583 map_length = length;
1546 ret = btrfs_map_block(map_tree, READ, logical, 1584 ret = btrfs_map_block(root->fs_info, READ, logical,
1547 &map_length, NULL, 0); 1585 &map_length, NULL, 0);
1548 /* Will always return 0 or 1 with map_multi == NULL */ 1586 /* Will always return 0 with map_multi == NULL */
1549 BUG_ON(ret < 0); 1587 BUG_ON(ret < 0);
1550 if (map_length < length + size) 1588 if (map_length < length + size)
1551 return 1; 1589 return 1;
@@ -1586,7 +1624,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1586 u64 bio_offset) 1624 u64 bio_offset)
1587{ 1625{
1588 struct btrfs_root *root = BTRFS_I(inode)->root; 1626 struct btrfs_root *root = BTRFS_I(inode)->root;
1589 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1627 int ret;
1628
1629 ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1630 if (ret)
1631 bio_endio(bio, ret);
1632 return ret;
1590} 1633}
1591 1634
1592/* 1635/*
@@ -1601,6 +1644,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1601 int ret = 0; 1644 int ret = 0;
1602 int skip_sum; 1645 int skip_sum;
1603 int metadata = 0; 1646 int metadata = 0;
1647 int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1604 1648
1605 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1649 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1606 1650
@@ -1610,31 +1654,43 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1610 if (!(rw & REQ_WRITE)) { 1654 if (!(rw & REQ_WRITE)) {
1611 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); 1655 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1612 if (ret) 1656 if (ret)
1613 return ret; 1657 goto out;
1614 1658
1615 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1659 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1616 return btrfs_submit_compressed_read(inode, bio, 1660 ret = btrfs_submit_compressed_read(inode, bio,
1617 mirror_num, bio_flags); 1661 mirror_num,
1662 bio_flags);
1663 goto out;
1618 } else if (!skip_sum) { 1664 } else if (!skip_sum) {
1619 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); 1665 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1620 if (ret) 1666 if (ret)
1621 return ret; 1667 goto out;
1622 } 1668 }
1623 goto mapit; 1669 goto mapit;
1624 } else if (!skip_sum) { 1670 } else if (async && !skip_sum) {
1625 /* csum items have already been cloned */ 1671 /* csum items have already been cloned */
1626 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 1672 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1627 goto mapit; 1673 goto mapit;
1628 /* we're doing a write, do the async checksumming */ 1674 /* we're doing a write, do the async checksumming */
1629 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1675 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1630 inode, rw, bio, mirror_num, 1676 inode, rw, bio, mirror_num,
1631 bio_flags, bio_offset, 1677 bio_flags, bio_offset,
1632 __btrfs_submit_bio_start, 1678 __btrfs_submit_bio_start,
1633 __btrfs_submit_bio_done); 1679 __btrfs_submit_bio_done);
1680 goto out;
1681 } else if (!skip_sum) {
1682 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1683 if (ret)
1684 goto out;
1634 } 1685 }
1635 1686
1636mapit: 1687mapit:
1637 return btrfs_map_bio(root, rw, bio, mirror_num, 0); 1688 ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1689
1690out:
1691 if (ret < 0)
1692 bio_endio(bio, ret);
1693 return ret;
1638} 1694}
1639 1695
1640/* 1696/*
@@ -1657,8 +1713,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1657int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 1713int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1658 struct extent_state **cached_state) 1714 struct extent_state **cached_state)
1659{ 1715{
1660 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) 1716 WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1661 WARN_ON(1);
1662 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1717 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1663 cached_state, GFP_NOFS); 1718 cached_state, GFP_NOFS);
1664} 1719}
@@ -1867,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1867 1922
1868 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1923 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1869 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 1924 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
1870 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1925 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1871 if (!ret) { 1926 if (nolock)
1872 if (nolock) 1927 trans = btrfs_join_transaction_nolock(root);
1873 trans = btrfs_join_transaction_nolock(root); 1928 else
1874 else 1929 trans = btrfs_join_transaction(root);
1875 trans = btrfs_join_transaction(root); 1930 if (IS_ERR(trans)) {
1876 if (IS_ERR(trans)) { 1931 ret = PTR_ERR(trans);
1877 ret = PTR_ERR(trans); 1932 trans = NULL;
1878 trans = NULL; 1933 goto out;
1879 goto out;
1880 }
1881 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1882 ret = btrfs_update_inode_fallback(trans, root, inode);
1883 if (ret) /* -ENOMEM or corruption */
1884 btrfs_abort_transaction(trans, root, ret);
1885 } 1934 }
1935 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1936 ret = btrfs_update_inode_fallback(trans, root, inode);
1937 if (ret) /* -ENOMEM or corruption */
1938 btrfs_abort_transaction(trans, root, ret);
1886 goto out; 1939 goto out;
1887 } 1940 }
1888 1941
@@ -1931,15 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1931 add_pending_csums(trans, inode, ordered_extent->file_offset, 1984 add_pending_csums(trans, inode, ordered_extent->file_offset,
1932 &ordered_extent->list); 1985 &ordered_extent->list);
1933 1986
1934 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1987 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1935 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1988 ret = btrfs_update_inode_fallback(trans, root, inode);
1936 ret = btrfs_update_inode_fallback(trans, root, inode); 1989 if (ret) { /* -ENOMEM or corruption */
1937 if (ret) { /* -ENOMEM or corruption */ 1990 btrfs_abort_transaction(trans, root, ret);
1938 btrfs_abort_transaction(trans, root, ret); 1991 goto out_unlock;
1939 goto out_unlock;
1940 }
1941 } else {
1942 btrfs_set_inode_last_trans(trans, inode);
1943 } 1992 }
1944 ret = 0; 1993 ret = 0;
1945out_unlock: 1994out_unlock:
@@ -3074,7 +3123,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3074 struct btrfs_trans_handle *trans; 3123 struct btrfs_trans_handle *trans;
3075 struct inode *inode = dentry->d_inode; 3124 struct inode *inode = dentry->d_inode;
3076 int ret; 3125 int ret;
3077 unsigned long nr = 0;
3078 3126
3079 trans = __unlink_start_trans(dir, dentry); 3127 trans = __unlink_start_trans(dir, dentry);
3080 if (IS_ERR(trans)) 3128 if (IS_ERR(trans))
@@ -3094,9 +3142,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3094 } 3142 }
3095 3143
3096out: 3144out:
3097 nr = trans->blocks_used;
3098 __unlink_end_trans(trans, root); 3145 __unlink_end_trans(trans, root);
3099 btrfs_btree_balance_dirty(root, nr); 3146 btrfs_btree_balance_dirty(root);
3100 return ret; 3147 return ret;
3101} 3148}
3102 3149
@@ -3186,7 +3233,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3186 int err = 0; 3233 int err = 0;
3187 struct btrfs_root *root = BTRFS_I(dir)->root; 3234 struct btrfs_root *root = BTRFS_I(dir)->root;
3188 struct btrfs_trans_handle *trans; 3235 struct btrfs_trans_handle *trans;
3189 unsigned long nr = 0;
3190 3236
3191 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 3237 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3192 return -ENOTEMPTY; 3238 return -ENOTEMPTY;
@@ -3215,9 +3261,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3215 if (!err) 3261 if (!err)
3216 btrfs_i_size_write(inode, 0); 3262 btrfs_i_size_write(inode, 0);
3217out: 3263out:
3218 nr = trans->blocks_used;
3219 __unlink_end_trans(trans, root); 3264 __unlink_end_trans(trans, root);
3220 btrfs_btree_balance_dirty(root, nr); 3265 btrfs_btree_balance_dirty(root);
3221 3266
3222 return err; 3267 return err;
3223} 3268}
@@ -3497,11 +3542,11 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
3497 if (ret) 3542 if (ret)
3498 goto out; 3543 goto out;
3499 3544
3500 ret = -ENOMEM;
3501again: 3545again:
3502 page = find_or_create_page(mapping, index, mask); 3546 page = find_or_create_page(mapping, index, mask);
3503 if (!page) { 3547 if (!page) {
3504 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3548 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3549 ret = -ENOMEM;
3505 goto out; 3550 goto out;
3506 } 3551 }
3507 3552
@@ -3550,7 +3595,6 @@ again:
3550 goto out_unlock; 3595 goto out_unlock;
3551 } 3596 }
3552 3597
3553 ret = 0;
3554 if (offset != PAGE_CACHE_SIZE) { 3598 if (offset != PAGE_CACHE_SIZE) {
3555 if (!len) 3599 if (!len)
3556 len = PAGE_CACHE_SIZE - offset; 3600 len = PAGE_CACHE_SIZE - offset;
@@ -3668,6 +3712,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3668 3712
3669 hole_em->block_start = EXTENT_MAP_HOLE; 3713 hole_em->block_start = EXTENT_MAP_HOLE;
3670 hole_em->block_len = 0; 3714 hole_em->block_len = 0;
3715 hole_em->orig_block_len = 0;
3671 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 3716 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
3672 hole_em->compress_type = BTRFS_COMPRESS_NONE; 3717 hole_em->compress_type = BTRFS_COMPRESS_NONE;
3673 hole_em->generation = trans->transid; 3718 hole_em->generation = trans->transid;
@@ -3783,7 +3828,6 @@ void btrfs_evict_inode(struct inode *inode)
3783 struct btrfs_root *root = BTRFS_I(inode)->root; 3828 struct btrfs_root *root = BTRFS_I(inode)->root;
3784 struct btrfs_block_rsv *rsv, *global_rsv; 3829 struct btrfs_block_rsv *rsv, *global_rsv;
3785 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 3830 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3786 unsigned long nr;
3787 int ret; 3831 int ret;
3788 3832
3789 trace_btrfs_inode_evict(inode); 3833 trace_btrfs_inode_evict(inode);
@@ -3829,7 +3873,8 @@ void btrfs_evict_inode(struct inode *inode)
3829 * inode item when doing the truncate. 3873 * inode item when doing the truncate.
3830 */ 3874 */
3831 while (1) { 3875 while (1) {
3832 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); 3876 ret = btrfs_block_rsv_refill(root, rsv, min_size,
3877 BTRFS_RESERVE_FLUSH_LIMIT);
3833 3878
3834 /* 3879 /*
3835 * Try and steal from the global reserve since we will 3880 * Try and steal from the global reserve since we will
@@ -3847,7 +3892,7 @@ void btrfs_evict_inode(struct inode *inode)
3847 goto no_delete; 3892 goto no_delete;
3848 } 3893 }
3849 3894
3850 trans = btrfs_start_transaction_noflush(root, 1); 3895 trans = btrfs_start_transaction_lflush(root, 1);
3851 if (IS_ERR(trans)) { 3896 if (IS_ERR(trans)) {
3852 btrfs_orphan_del(NULL, inode); 3897 btrfs_orphan_del(NULL, inode);
3853 btrfs_free_block_rsv(root, rsv); 3898 btrfs_free_block_rsv(root, rsv);
@@ -3864,10 +3909,9 @@ void btrfs_evict_inode(struct inode *inode)
3864 ret = btrfs_update_inode(trans, root, inode); 3909 ret = btrfs_update_inode(trans, root, inode);
3865 BUG_ON(ret); 3910 BUG_ON(ret);
3866 3911
3867 nr = trans->blocks_used;
3868 btrfs_end_transaction(trans, root); 3912 btrfs_end_transaction(trans, root);
3869 trans = NULL; 3913 trans = NULL;
3870 btrfs_btree_balance_dirty(root, nr); 3914 btrfs_btree_balance_dirty(root);
3871 } 3915 }
3872 3916
3873 btrfs_free_block_rsv(root, rsv); 3917 btrfs_free_block_rsv(root, rsv);
@@ -3883,9 +3927,8 @@ void btrfs_evict_inode(struct inode *inode)
3883 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3927 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3884 btrfs_return_ino(root, btrfs_ino(inode)); 3928 btrfs_return_ino(root, btrfs_ino(inode));
3885 3929
3886 nr = trans->blocks_used;
3887 btrfs_end_transaction(trans, root); 3930 btrfs_end_transaction(trans, root);
3888 btrfs_btree_balance_dirty(root, nr); 3931 btrfs_btree_balance_dirty(root);
3889no_delete: 3932no_delete:
3890 clear_inode(inode); 3933 clear_inode(inode);
3891 return; 3934 return;
@@ -4775,8 +4818,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4775 if (S_ISREG(mode)) { 4818 if (S_ISREG(mode)) {
4776 if (btrfs_test_opt(root, NODATASUM)) 4819 if (btrfs_test_opt(root, NODATASUM))
4777 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4820 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4778 if (btrfs_test_opt(root, NODATACOW) || 4821 if (btrfs_test_opt(root, NODATACOW))
4779 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
4780 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4822 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
4781 } 4823 }
4782 4824
@@ -4842,7 +4884,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4842 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4884 ret = btrfs_insert_dir_item(trans, root, name, name_len,
4843 parent_inode, &key, 4885 parent_inode, &key,
4844 btrfs_inode_type(inode), index); 4886 btrfs_inode_type(inode), index);
4845 if (ret == -EEXIST) 4887 if (ret == -EEXIST || ret == -EOVERFLOW)
4846 goto fail_dir_item; 4888 goto fail_dir_item;
4847 else if (ret) { 4889 else if (ret) {
4848 btrfs_abort_transaction(trans, root, ret); 4890 btrfs_abort_transaction(trans, root, ret);
@@ -4897,7 +4939,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4897 int err; 4939 int err;
4898 int drop_inode = 0; 4940 int drop_inode = 0;
4899 u64 objectid; 4941 u64 objectid;
4900 unsigned long nr = 0;
4901 u64 index = 0; 4942 u64 index = 0;
4902 4943
4903 if (!new_valid_dev(rdev)) 4944 if (!new_valid_dev(rdev))
@@ -4930,6 +4971,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4930 goto out_unlock; 4971 goto out_unlock;
4931 } 4972 }
4932 4973
4974 err = btrfs_update_inode(trans, root, inode);
4975 if (err) {
4976 drop_inode = 1;
4977 goto out_unlock;
4978 }
4979
4933 /* 4980 /*
4934 * If the active LSM wants to access the inode during 4981 * If the active LSM wants to access the inode during
4935 * d_instantiate it needs these. Smack checks to see 4982 * d_instantiate it needs these. Smack checks to see
@@ -4947,9 +4994,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4947 d_instantiate(dentry, inode); 4994 d_instantiate(dentry, inode);
4948 } 4995 }
4949out_unlock: 4996out_unlock:
4950 nr = trans->blocks_used;
4951 btrfs_end_transaction(trans, root); 4997 btrfs_end_transaction(trans, root);
4952 btrfs_btree_balance_dirty(root, nr); 4998 btrfs_btree_balance_dirty(root);
4953 if (drop_inode) { 4999 if (drop_inode) {
4954 inode_dec_link_count(inode); 5000 inode_dec_link_count(inode);
4955 iput(inode); 5001 iput(inode);
@@ -4963,9 +5009,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4963 struct btrfs_trans_handle *trans; 5009 struct btrfs_trans_handle *trans;
4964 struct btrfs_root *root = BTRFS_I(dir)->root; 5010 struct btrfs_root *root = BTRFS_I(dir)->root;
4965 struct inode *inode = NULL; 5011 struct inode *inode = NULL;
4966 int drop_inode = 0; 5012 int drop_inode_on_err = 0;
4967 int err; 5013 int err;
4968 unsigned long nr = 0;
4969 u64 objectid; 5014 u64 objectid;
4970 u64 index = 0; 5015 u64 index = 0;
4971 5016
@@ -4989,12 +5034,15 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4989 err = PTR_ERR(inode); 5034 err = PTR_ERR(inode);
4990 goto out_unlock; 5035 goto out_unlock;
4991 } 5036 }
5037 drop_inode_on_err = 1;
4992 5038
4993 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 5039 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4994 if (err) { 5040 if (err)
4995 drop_inode = 1; 5041 goto out_unlock;
5042
5043 err = btrfs_update_inode(trans, root, inode);
5044 if (err)
4996 goto out_unlock; 5045 goto out_unlock;
4997 }
4998 5046
4999 /* 5047 /*
5000 * If the active LSM wants to access the inode during 5048 * If the active LSM wants to access the inode during
@@ -5007,21 +5055,20 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
5007 5055
5008 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 5056 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5009 if (err) 5057 if (err)
5010 drop_inode = 1; 5058 goto out_unlock;
5011 else { 5059
5012 inode->i_mapping->a_ops = &btrfs_aops; 5060 inode->i_mapping->a_ops = &btrfs_aops;
5013 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 5061 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
5014 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 5062 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
5015 d_instantiate(dentry, inode); 5063 d_instantiate(dentry, inode);
5016 } 5064
5017out_unlock: 5065out_unlock:
5018 nr = trans->blocks_used;
5019 btrfs_end_transaction(trans, root); 5066 btrfs_end_transaction(trans, root);
5020 if (drop_inode) { 5067 if (err && drop_inode_on_err) {
5021 inode_dec_link_count(inode); 5068 inode_dec_link_count(inode);
5022 iput(inode); 5069 iput(inode);
5023 } 5070 }
5024 btrfs_btree_balance_dirty(root, nr); 5071 btrfs_btree_balance_dirty(root);
5025 return err; 5072 return err;
5026} 5073}
5027 5074
@@ -5032,7 +5079,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5032 struct btrfs_root *root = BTRFS_I(dir)->root; 5079 struct btrfs_root *root = BTRFS_I(dir)->root;
5033 struct inode *inode = old_dentry->d_inode; 5080 struct inode *inode = old_dentry->d_inode;
5034 u64 index; 5081 u64 index;
5035 unsigned long nr = 0;
5036 int err; 5082 int err;
5037 int drop_inode = 0; 5083 int drop_inode = 0;
5038 5084
@@ -5062,6 +5108,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5062 inode_inc_iversion(inode); 5108 inode_inc_iversion(inode);
5063 inode->i_ctime = CURRENT_TIME; 5109 inode->i_ctime = CURRENT_TIME;
5064 ihold(inode); 5110 ihold(inode);
5111 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
5065 5112
5066 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 5113 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
5067 5114
@@ -5076,14 +5123,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5076 btrfs_log_new_name(trans, inode, NULL, parent); 5123 btrfs_log_new_name(trans, inode, NULL, parent);
5077 } 5124 }
5078 5125
5079 nr = trans->blocks_used;
5080 btrfs_end_transaction(trans, root); 5126 btrfs_end_transaction(trans, root);
5081fail: 5127fail:
5082 if (drop_inode) { 5128 if (drop_inode) {
5083 inode_dec_link_count(inode); 5129 inode_dec_link_count(inode);
5084 iput(inode); 5130 iput(inode);
5085 } 5131 }
5086 btrfs_btree_balance_dirty(root, nr); 5132 btrfs_btree_balance_dirty(root);
5087 return err; 5133 return err;
5088} 5134}
5089 5135
@@ -5096,7 +5142,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
5096 int drop_on_err = 0; 5142 int drop_on_err = 0;
5097 u64 objectid = 0; 5143 u64 objectid = 0;
5098 u64 index = 0; 5144 u64 index = 0;
5099 unsigned long nr = 1;
5100 5145
5101 /* 5146 /*
5102 * 2 items for inode and ref 5147 * 2 items for inode and ref
@@ -5142,11 +5187,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
5142 drop_on_err = 0; 5187 drop_on_err = 0;
5143 5188
5144out_fail: 5189out_fail:
5145 nr = trans->blocks_used;
5146 btrfs_end_transaction(trans, root); 5190 btrfs_end_transaction(trans, root);
5147 if (drop_on_err) 5191 if (drop_on_err)
5148 iput(inode); 5192 iput(inode);
5149 btrfs_btree_balance_dirty(root, nr); 5193 btrfs_btree_balance_dirty(root);
5150 return err; 5194 return err;
5151} 5195}
5152 5196
@@ -5340,6 +5384,7 @@ again:
5340 if (start + len <= found_key.offset) 5384 if (start + len <= found_key.offset)
5341 goto not_found; 5385 goto not_found;
5342 em->start = start; 5386 em->start = start;
5387 em->orig_start = start;
5343 em->len = found_key.offset - start; 5388 em->len = found_key.offset - start;
5344 goto not_found_em; 5389 goto not_found_em;
5345 } 5390 }
@@ -5350,6 +5395,8 @@ again:
5350 em->len = extent_end - extent_start; 5395 em->len = extent_end - extent_start;
5351 em->orig_start = extent_start - 5396 em->orig_start = extent_start -
5352 btrfs_file_extent_offset(leaf, item); 5397 btrfs_file_extent_offset(leaf, item);
5398 em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
5399 item);
5353 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 5400 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
5354 if (bytenr == 0) { 5401 if (bytenr == 0) {
5355 em->block_start = EXTENT_MAP_HOLE; 5402 em->block_start = EXTENT_MAP_HOLE;
@@ -5359,8 +5406,7 @@ again:
5359 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5406 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5360 em->compress_type = compress_type; 5407 em->compress_type = compress_type;
5361 em->block_start = bytenr; 5408 em->block_start = bytenr;
5362 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5409 em->block_len = em->orig_block_len;
5363 item);
5364 } else { 5410 } else {
5365 bytenr += btrfs_file_extent_offset(leaf, item); 5411 bytenr += btrfs_file_extent_offset(leaf, item);
5366 em->block_start = bytenr; 5412 em->block_start = bytenr;
@@ -5390,7 +5436,8 @@ again:
5390 em->start = extent_start + extent_offset; 5436 em->start = extent_start + extent_offset;
5391 em->len = (copy_size + root->sectorsize - 1) & 5437 em->len = (copy_size + root->sectorsize - 1) &
5392 ~((u64)root->sectorsize - 1); 5438 ~((u64)root->sectorsize - 1);
5393 em->orig_start = EXTENT_MAP_INLINE; 5439 em->orig_block_len = em->len;
5440 em->orig_start = em->start;
5394 if (compress_type) { 5441 if (compress_type) {
5395 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5442 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5396 em->compress_type = compress_type; 5443 em->compress_type = compress_type;
@@ -5439,11 +5486,11 @@ again:
5439 extent_map_end(em) - 1, NULL, GFP_NOFS); 5486 extent_map_end(em) - 1, NULL, GFP_NOFS);
5440 goto insert; 5487 goto insert;
5441 } else { 5488 } else {
5442 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5489 WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
5443 WARN_ON(1);
5444 } 5490 }
5445not_found: 5491not_found:
5446 em->start = start; 5492 em->start = start;
5493 em->orig_start = start;
5447 em->len = len; 5494 em->len = len;
5448not_found_em: 5495not_found_em:
5449 em->block_start = EXTENT_MAP_HOLE; 5496 em->block_start = EXTENT_MAP_HOLE;
@@ -5645,38 +5692,19 @@ out:
5645} 5692}
5646 5693
5647static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5694static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5648 struct extent_map *em,
5649 u64 start, u64 len) 5695 u64 start, u64 len)
5650{ 5696{
5651 struct btrfs_root *root = BTRFS_I(inode)->root; 5697 struct btrfs_root *root = BTRFS_I(inode)->root;
5652 struct btrfs_trans_handle *trans; 5698 struct btrfs_trans_handle *trans;
5653 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5699 struct extent_map *em;
5654 struct btrfs_key ins; 5700 struct btrfs_key ins;
5655 u64 alloc_hint; 5701 u64 alloc_hint;
5656 int ret; 5702 int ret;
5657 bool insert = false;
5658
5659 /*
5660 * Ok if the extent map we looked up is a hole and is for the exact
5661 * range we want, there is no reason to allocate a new one, however if
5662 * it is not right then we need to free this one and drop the cache for
5663 * our range.
5664 */
5665 if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
5666 em->len != len) {
5667 free_extent_map(em);
5668 em = NULL;
5669 insert = true;
5670 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5671 }
5672 5703
5673 trans = btrfs_join_transaction(root); 5704 trans = btrfs_join_transaction(root);
5674 if (IS_ERR(trans)) 5705 if (IS_ERR(trans))
5675 return ERR_CAST(trans); 5706 return ERR_CAST(trans);
5676 5707
5677 if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
5678 btrfs_add_inode_defrag(trans, inode);
5679
5680 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5708 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5681 5709
5682 alloc_hint = get_extent_allocation_hint(inode, start, len); 5710 alloc_hint = get_extent_allocation_hint(inode, start, len);
@@ -5687,37 +5715,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5687 goto out; 5715 goto out;
5688 } 5716 }
5689 5717
5690 if (!em) { 5718 em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
5691 em = alloc_extent_map(); 5719 ins.offset, ins.offset, 0);
5692 if (!em) { 5720 if (IS_ERR(em))
5693 em = ERR_PTR(-ENOMEM); 5721 goto out;
5694 goto out;
5695 }
5696 }
5697
5698 em->start = start;
5699 em->orig_start = em->start;
5700 em->len = ins.offset;
5701
5702 em->block_start = ins.objectid;
5703 em->block_len = ins.offset;
5704 em->bdev = root->fs_info->fs_devices->latest_bdev;
5705
5706 /*
5707 * We need to do this because if we're using the original em we searched
5708 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
5709 */
5710 em->flags = 0;
5711 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5712
5713 while (insert) {
5714 write_lock(&em_tree->lock);
5715 ret = add_extent_mapping(em_tree, em);
5716 write_unlock(&em_tree->lock);
5717 if (ret != -EEXIST)
5718 break;
5719 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5720 }
5721 5722
5722 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 5723 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5723 ins.offset, ins.offset, 0); 5724 ins.offset, ins.offset, 0);
@@ -5894,7 +5895,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
5894static struct extent_map *create_pinned_em(struct inode *inode, u64 start, 5895static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5895 u64 len, u64 orig_start, 5896 u64 len, u64 orig_start,
5896 u64 block_start, u64 block_len, 5897 u64 block_start, u64 block_len,
5897 int type) 5898 u64 orig_block_len, int type)
5898{ 5899{
5899 struct extent_map_tree *em_tree; 5900 struct extent_map_tree *em_tree;
5900 struct extent_map *em; 5901 struct extent_map *em;
@@ -5912,15 +5913,20 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5912 em->block_len = block_len; 5913 em->block_len = block_len;
5913 em->block_start = block_start; 5914 em->block_start = block_start;
5914 em->bdev = root->fs_info->fs_devices->latest_bdev; 5915 em->bdev = root->fs_info->fs_devices->latest_bdev;
5916 em->orig_block_len = orig_block_len;
5917 em->generation = -1;
5915 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5918 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5916 if (type == BTRFS_ORDERED_PREALLOC) 5919 if (type == BTRFS_ORDERED_PREALLOC)
5917 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 5920 set_bit(EXTENT_FLAG_FILLING, &em->flags);
5918 5921
5919 do { 5922 do {
5920 btrfs_drop_extent_cache(inode, em->start, 5923 btrfs_drop_extent_cache(inode, em->start,
5921 em->start + em->len - 1, 0); 5924 em->start + em->len - 1, 0);
5922 write_lock(&em_tree->lock); 5925 write_lock(&em_tree->lock);
5923 ret = add_extent_mapping(em_tree, em); 5926 ret = add_extent_mapping(em_tree, em);
5927 if (!ret)
5928 list_move(&em->list,
5929 &em_tree->modified_extents);
5924 write_unlock(&em_tree->lock); 5930 write_unlock(&em_tree->lock);
5925 } while (ret == -EEXIST); 5931 } while (ret == -EEXIST);
5926 5932
@@ -6047,13 +6053,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6047 goto must_cow; 6053 goto must_cow;
6048 6054
6049 if (can_nocow_odirect(trans, inode, start, len) == 1) { 6055 if (can_nocow_odirect(trans, inode, start, len) == 1) {
6050 u64 orig_start = em->start; 6056 u64 orig_start = em->orig_start;
6057 u64 orig_block_len = em->orig_block_len;
6051 6058
6052 if (type == BTRFS_ORDERED_PREALLOC) { 6059 if (type == BTRFS_ORDERED_PREALLOC) {
6053 free_extent_map(em); 6060 free_extent_map(em);
6054 em = create_pinned_em(inode, start, len, 6061 em = create_pinned_em(inode, start, len,
6055 orig_start, 6062 orig_start,
6056 block_start, len, type); 6063 block_start, len,
6064 orig_block_len, type);
6057 if (IS_ERR(em)) { 6065 if (IS_ERR(em)) {
6058 btrfs_end_transaction(trans, root); 6066 btrfs_end_transaction(trans, root);
6059 goto unlock_err; 6067 goto unlock_err;
@@ -6077,7 +6085,8 @@ must_cow:
6077 * it above 6085 * it above
6078 */ 6086 */
6079 len = bh_result->b_size; 6087 len = bh_result->b_size;
6080 em = btrfs_new_extent_direct(inode, em, start, len); 6088 free_extent_map(em);
6089 em = btrfs_new_extent_direct(inode, start, len);
6081 if (IS_ERR(em)) { 6090 if (IS_ERR(em)) {
6082 ret = PTR_ERR(em); 6091 ret = PTR_ERR(em);
6083 goto unlock_err; 6092 goto unlock_err;
@@ -6318,6 +6327,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
6318 struct btrfs_root *root = BTRFS_I(inode)->root; 6327 struct btrfs_root *root = BTRFS_I(inode)->root;
6319 int ret; 6328 int ret;
6320 6329
6330 if (async_submit)
6331 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
6332
6321 bio_get(bio); 6333 bio_get(bio);
6322 6334
6323 if (!write) { 6335 if (!write) {
@@ -6362,7 +6374,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6362{ 6374{
6363 struct inode *inode = dip->inode; 6375 struct inode *inode = dip->inode;
6364 struct btrfs_root *root = BTRFS_I(inode)->root; 6376 struct btrfs_root *root = BTRFS_I(inode)->root;
6365 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
6366 struct bio *bio; 6377 struct bio *bio;
6367 struct bio *orig_bio = dip->orig_bio; 6378 struct bio *orig_bio = dip->orig_bio;
6368 struct bio_vec *bvec = orig_bio->bi_io_vec; 6379 struct bio_vec *bvec = orig_bio->bi_io_vec;
@@ -6375,7 +6386,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6375 int async_submit = 0; 6386 int async_submit = 0;
6376 6387
6377 map_length = orig_bio->bi_size; 6388 map_length = orig_bio->bi_size;
6378 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6389 ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
6379 &map_length, NULL, 0); 6390 &map_length, NULL, 0);
6380 if (ret) { 6391 if (ret) {
6381 bio_put(orig_bio); 6392 bio_put(orig_bio);
@@ -6429,7 +6440,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6429 bio->bi_end_io = btrfs_end_dio_bio; 6440 bio->bi_end_io = btrfs_end_dio_bio;
6430 6441
6431 map_length = orig_bio->bi_size; 6442 map_length = orig_bio->bi_size;
6432 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6443 ret = btrfs_map_block(root->fs_info, READ,
6444 start_sector << 9,
6433 &map_length, NULL, 0); 6445 &map_length, NULL, 0);
6434 if (ret) { 6446 if (ret) {
6435 bio_put(bio); 6447 bio_put(bio);
@@ -6582,9 +6594,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
6582 btrfs_submit_direct, 0); 6594 btrfs_submit_direct, 0);
6583} 6595}
6584 6596
6597#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
6598
6585static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6599static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
6586 __u64 start, __u64 len) 6600 __u64 start, __u64 len)
6587{ 6601{
6602 int ret;
6603
6604 ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
6605 if (ret)
6606 return ret;
6607
6588 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); 6608 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
6589} 6609}
6590 6610
@@ -6855,7 +6875,6 @@ static int btrfs_truncate(struct inode *inode)
6855 int ret; 6875 int ret;
6856 int err = 0; 6876 int err = 0;
6857 struct btrfs_trans_handle *trans; 6877 struct btrfs_trans_handle *trans;
6858 unsigned long nr;
6859 u64 mask = root->sectorsize - 1; 6878 u64 mask = root->sectorsize - 1;
6860 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 6879 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6861 6880
@@ -6978,9 +6997,8 @@ static int btrfs_truncate(struct inode *inode)
6978 break; 6997 break;
6979 } 6998 }
6980 6999
6981 nr = trans->blocks_used;
6982 btrfs_end_transaction(trans, root); 7000 btrfs_end_transaction(trans, root);
6983 btrfs_btree_balance_dirty(root, nr); 7001 btrfs_btree_balance_dirty(root);
6984 7002
6985 trans = btrfs_start_transaction(root, 2); 7003 trans = btrfs_start_transaction(root, 2);
6986 if (IS_ERR(trans)) { 7004 if (IS_ERR(trans)) {
@@ -7014,9 +7032,8 @@ static int btrfs_truncate(struct inode *inode)
7014 if (ret && !err) 7032 if (ret && !err)
7015 err = ret; 7033 err = ret;
7016 7034
7017 nr = trans->blocks_used;
7018 ret = btrfs_end_transaction(trans, root); 7035 ret = btrfs_end_transaction(trans, root);
7019 btrfs_btree_balance_dirty(root, nr); 7036 btrfs_btree_balance_dirty(root);
7020 } 7037 }
7021 7038
7022out: 7039out:
@@ -7093,6 +7110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
7093 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 7110 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
7094 ei->io_tree.track_uptodate = 1; 7111 ei->io_tree.track_uptodate = 1;
7095 ei->io_failure_tree.track_uptodate = 1; 7112 ei->io_failure_tree.track_uptodate = 1;
7113 atomic_set(&ei->sync_writers, 0);
7096 mutex_init(&ei->log_mutex); 7114 mutex_init(&ei->log_mutex);
7097 mutex_init(&ei->delalloc_mutex); 7115 mutex_init(&ei->delalloc_mutex);
7098 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 7116 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -7203,6 +7221,8 @@ void btrfs_destroy_cachep(void)
7203 kmem_cache_destroy(btrfs_path_cachep); 7221 kmem_cache_destroy(btrfs_path_cachep);
7204 if (btrfs_free_space_cachep) 7222 if (btrfs_free_space_cachep)
7205 kmem_cache_destroy(btrfs_free_space_cachep); 7223 kmem_cache_destroy(btrfs_free_space_cachep);
7224 if (btrfs_delalloc_work_cachep)
7225 kmem_cache_destroy(btrfs_delalloc_work_cachep);
7206} 7226}
7207 7227
7208int btrfs_init_cachep(void) 7228int btrfs_init_cachep(void)
@@ -7237,6 +7257,13 @@ int btrfs_init_cachep(void)
7237 if (!btrfs_free_space_cachep) 7257 if (!btrfs_free_space_cachep)
7238 goto fail; 7258 goto fail;
7239 7259
7260 btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
7261 sizeof(struct btrfs_delalloc_work), 0,
7262 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
7263 NULL);
7264 if (!btrfs_delalloc_work_cachep)
7265 goto fail;
7266
7240 return 0; 7267 return 0;
7241fail: 7268fail:
7242 btrfs_destroy_cachep(); 7269 btrfs_destroy_cachep();
@@ -7308,6 +7335,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7308 if (S_ISDIR(old_inode->i_mode) && new_inode && 7335 if (S_ISDIR(old_inode->i_mode) && new_inode &&
7309 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 7336 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
7310 return -ENOTEMPTY; 7337 return -ENOTEMPTY;
7338
7339
7340 /* check for collisions, even if the name isn't there */
7341 ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
7342 new_dentry->d_name.name,
7343 new_dentry->d_name.len);
7344
7345 if (ret) {
7346 if (ret == -EEXIST) {
7347 /* we shouldn't get
7348 * eexist without a new_inode */
7349 if (!new_inode) {
7350 WARN_ON(1);
7351 return ret;
7352 }
7353 } else {
7354 /* maybe -EOVERFLOW */
7355 return ret;
7356 }
7357 }
7358 ret = 0;
7359
7311 /* 7360 /*
7312 * we're using rename to replace one file with another. 7361 * we're using rename to replace one file with another.
7313 * and the replacement file is large. Start IO on it now so 7362 * and the replacement file is large. Start IO on it now so
@@ -7447,6 +7496,49 @@ out_notrans:
7447 return ret; 7496 return ret;
7448} 7497}
7449 7498
7499static void btrfs_run_delalloc_work(struct btrfs_work *work)
7500{
7501 struct btrfs_delalloc_work *delalloc_work;
7502
7503 delalloc_work = container_of(work, struct btrfs_delalloc_work,
7504 work);
7505 if (delalloc_work->wait)
7506 btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
7507 else
7508 filemap_flush(delalloc_work->inode->i_mapping);
7509
7510 if (delalloc_work->delay_iput)
7511 btrfs_add_delayed_iput(delalloc_work->inode);
7512 else
7513 iput(delalloc_work->inode);
7514 complete(&delalloc_work->completion);
7515}
7516
7517struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
7518 int wait, int delay_iput)
7519{
7520 struct btrfs_delalloc_work *work;
7521
7522 work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
7523 if (!work)
7524 return NULL;
7525
7526 init_completion(&work->completion);
7527 INIT_LIST_HEAD(&work->list);
7528 work->inode = inode;
7529 work->wait = wait;
7530 work->delay_iput = delay_iput;
7531 work->work.func = btrfs_run_delalloc_work;
7532
7533 return work;
7534}
7535
7536void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
7537{
7538 wait_for_completion(&work->completion);
7539 kmem_cache_free(btrfs_delalloc_work_cachep, work);
7540}
7541
7450/* 7542/*
7451 * some fairly slow code that needs optimization. This walks the list 7543 * some fairly slow code that needs optimization. This walks the list
7452 * of all the inodes with pending delalloc and forces them to disk. 7544 * of all the inodes with pending delalloc and forces them to disk.
@@ -7456,10 +7548,15 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7456 struct list_head *head = &root->fs_info->delalloc_inodes; 7548 struct list_head *head = &root->fs_info->delalloc_inodes;
7457 struct btrfs_inode *binode; 7549 struct btrfs_inode *binode;
7458 struct inode *inode; 7550 struct inode *inode;
7551 struct btrfs_delalloc_work *work, *next;
7552 struct list_head works;
7553 int ret = 0;
7459 7554
7460 if (root->fs_info->sb->s_flags & MS_RDONLY) 7555 if (root->fs_info->sb->s_flags & MS_RDONLY)
7461 return -EROFS; 7556 return -EROFS;
7462 7557
7558 INIT_LIST_HEAD(&works);
7559
7463 spin_lock(&root->fs_info->delalloc_lock); 7560 spin_lock(&root->fs_info->delalloc_lock);
7464 while (!list_empty(head)) { 7561 while (!list_empty(head)) {
7465 binode = list_entry(head->next, struct btrfs_inode, 7562 binode = list_entry(head->next, struct btrfs_inode,
@@ -7469,11 +7566,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7469 list_del_init(&binode->delalloc_inodes); 7566 list_del_init(&binode->delalloc_inodes);
7470 spin_unlock(&root->fs_info->delalloc_lock); 7567 spin_unlock(&root->fs_info->delalloc_lock);
7471 if (inode) { 7568 if (inode) {
7472 filemap_flush(inode->i_mapping); 7569 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
7473 if (delay_iput) 7570 if (!work) {
7474 btrfs_add_delayed_iput(inode); 7571 ret = -ENOMEM;
7475 else 7572 goto out;
7476 iput(inode); 7573 }
7574 list_add_tail(&work->list, &works);
7575 btrfs_queue_worker(&root->fs_info->flush_workers,
7576 &work->work);
7477 } 7577 }
7478 cond_resched(); 7578 cond_resched();
7479 spin_lock(&root->fs_info->delalloc_lock); 7579 spin_lock(&root->fs_info->delalloc_lock);
@@ -7492,7 +7592,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7492 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 7592 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
7493 } 7593 }
7494 atomic_dec(&root->fs_info->async_submit_draining); 7594 atomic_dec(&root->fs_info->async_submit_draining);
7495 return 0; 7595out:
7596 list_for_each_entry_safe(work, next, &works, list) {
7597 list_del_init(&work->list);
7598 btrfs_wait_and_free_delalloc_work(work);
7599 }
7600 return ret;
7496} 7601}
7497 7602
7498static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 7603static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
@@ -7512,7 +7617,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7512 unsigned long ptr; 7617 unsigned long ptr;
7513 struct btrfs_file_extent_item *ei; 7618 struct btrfs_file_extent_item *ei;
7514 struct extent_buffer *leaf; 7619 struct extent_buffer *leaf;
7515 unsigned long nr = 0;
7516 7620
7517 name_len = strlen(symname) + 1; 7621 name_len = strlen(symname) + 1;
7518 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 7622 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
@@ -7610,13 +7714,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7610out_unlock: 7714out_unlock:
7611 if (!err) 7715 if (!err)
7612 d_instantiate(dentry, inode); 7716 d_instantiate(dentry, inode);
7613 nr = trans->blocks_used;
7614 btrfs_end_transaction(trans, root); 7717 btrfs_end_transaction(trans, root);
7615 if (drop_inode) { 7718 if (drop_inode) {
7616 inode_dec_link_count(inode); 7719 inode_dec_link_count(inode);
7617 iput(inode); 7720 iput(inode);
7618 } 7721 }
7619 btrfs_btree_balance_dirty(root, nr); 7722 btrfs_btree_balance_dirty(root);
7620 return err; 7723 return err;
7621} 7724}
7622 7725
@@ -7679,6 +7782,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7679 em->len = ins.offset; 7782 em->len = ins.offset;
7680 em->block_start = ins.objectid; 7783 em->block_start = ins.objectid;
7681 em->block_len = ins.offset; 7784 em->block_len = ins.offset;
7785 em->orig_block_len = ins.offset;
7682 em->bdev = root->fs_info->fs_devices->latest_bdev; 7786 em->bdev = root->fs_info->fs_devices->latest_bdev;
7683 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 7787 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7684 em->generation = trans->transid; 7788 em->generation = trans->transid;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5b3429ab8ec1..4b4516770f05 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -55,6 +55,7 @@
55#include "backref.h" 55#include "backref.h"
56#include "rcu-string.h" 56#include "rcu-string.h"
57#include "send.h" 57#include "send.h"
58#include "dev-replace.h"
58 59
59/* Mask out flags that are inappropriate for the given type of inode. */ 60/* Mask out flags that are inappropriate for the given type of inode. */
60static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 61static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -140,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
140 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; 141 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
141 } 142 }
142 143
143 if (flags & BTRFS_INODE_NODATACOW) 144 if (flags & BTRFS_INODE_NODATACOW) {
144 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 145 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
146 if (S_ISREG(inode->i_mode))
147 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
148 }
145 149
146 btrfs_update_iflags(inode); 150 btrfs_update_iflags(inode);
147} 151}
@@ -571,8 +575,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
571 ret = btrfs_commit_transaction(trans, 575 ret = btrfs_commit_transaction(trans,
572 root->fs_info->extent_root); 576 root->fs_info->extent_root);
573 } 577 }
574 if (ret) 578 if (ret) {
579 /* cleanup_transaction has freed this for us */
580 if (trans->aborted)
581 pending_snapshot = NULL;
575 goto fail; 582 goto fail;
583 }
576 584
577 ret = pending_snapshot->error; 585 ret = pending_snapshot->error;
578 if (ret) 586 if (ret)
@@ -705,6 +713,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
705 if (error) 713 if (error)
706 goto out_dput; 714 goto out_dput;
707 715
716 /*
717 * even if this name doesn't exist, we may get hash collisions.
718 * check for them now when we can safely fail
719 */
720 error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
721 dir->i_ino, name,
722 namelen);
723 if (error)
724 goto out_dput;
725
708 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); 726 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
709 727
710 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) 728 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
@@ -1293,12 +1311,13 @@ out_ra:
1293 return ret; 1311 return ret;
1294} 1312}
1295 1313
1296static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 1314static noinline int btrfs_ioctl_resize(struct file *file,
1297 void __user *arg) 1315 void __user *arg)
1298{ 1316{
1299 u64 new_size; 1317 u64 new_size;
1300 u64 old_size; 1318 u64 old_size;
1301 u64 devid = 1; 1319 u64 devid = 1;
1320 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1302 struct btrfs_ioctl_vol_args *vol_args; 1321 struct btrfs_ioctl_vol_args *vol_args;
1303 struct btrfs_trans_handle *trans; 1322 struct btrfs_trans_handle *trans;
1304 struct btrfs_device *device = NULL; 1323 struct btrfs_device *device = NULL;
@@ -1313,13 +1332,17 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1313 if (!capable(CAP_SYS_ADMIN)) 1332 if (!capable(CAP_SYS_ADMIN))
1314 return -EPERM; 1333 return -EPERM;
1315 1334
1316 mutex_lock(&root->fs_info->volume_mutex); 1335 ret = mnt_want_write_file(file);
1317 if (root->fs_info->balance_ctl) { 1336 if (ret)
1318 printk(KERN_INFO "btrfs: balance in progress\n"); 1337 return ret;
1319 ret = -EINVAL; 1338
1320 goto out; 1339 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1340 1)) {
1341 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
1342 return -EINPROGRESS;
1321 } 1343 }
1322 1344
1345 mutex_lock(&root->fs_info->volume_mutex);
1323 vol_args = memdup_user(arg, sizeof(*vol_args)); 1346 vol_args = memdup_user(arg, sizeof(*vol_args));
1324 if (IS_ERR(vol_args)) { 1347 if (IS_ERR(vol_args)) {
1325 ret = PTR_ERR(vol_args); 1348 ret = PTR_ERR(vol_args);
@@ -1339,7 +1362,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1339 printk(KERN_INFO "btrfs: resizing devid %llu\n", 1362 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1340 (unsigned long long)devid); 1363 (unsigned long long)devid);
1341 } 1364 }
1342 device = btrfs_find_device(root, devid, NULL, NULL); 1365 device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
1343 if (!device) { 1366 if (!device) {
1344 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1367 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1345 (unsigned long long)devid); 1368 (unsigned long long)devid);
@@ -1371,6 +1394,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1371 } 1394 }
1372 } 1395 }
1373 1396
1397 if (device->is_tgtdev_for_dev_replace) {
1398 ret = -EINVAL;
1399 goto out_free;
1400 }
1401
1374 old_size = device->total_bytes; 1402 old_size = device->total_bytes;
1375 1403
1376 if (mod < 0) { 1404 if (mod < 0) {
@@ -1409,12 +1437,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1409 btrfs_commit_transaction(trans, root); 1437 btrfs_commit_transaction(trans, root);
1410 } else if (new_size < old_size) { 1438 } else if (new_size < old_size) {
1411 ret = btrfs_shrink_device(device, new_size); 1439 ret = btrfs_shrink_device(device, new_size);
1412 } 1440 } /* equal, nothing need to do */
1413 1441
1414out_free: 1442out_free:
1415 kfree(vol_args); 1443 kfree(vol_args);
1416out: 1444out:
1417 mutex_unlock(&root->fs_info->volume_mutex); 1445 mutex_unlock(&root->fs_info->volume_mutex);
1446 mnt_drop_write_file(file);
1447 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
1418 return ret; 1448 return ret;
1419} 1449}
1420 1450
@@ -2156,9 +2186,17 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2156 if (btrfs_root_readonly(root)) 2186 if (btrfs_root_readonly(root))
2157 return -EROFS; 2187 return -EROFS;
2158 2188
2189 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2190 1)) {
2191 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2192 return -EINPROGRESS;
2193 }
2159 ret = mnt_want_write_file(file); 2194 ret = mnt_want_write_file(file);
2160 if (ret) 2195 if (ret) {
2196 atomic_set(&root->fs_info->mutually_exclusive_operation_running,
2197 0);
2161 return ret; 2198 return ret;
2199 }
2162 2200
2163 switch (inode->i_mode & S_IFMT) { 2201 switch (inode->i_mode & S_IFMT) {
2164 case S_IFDIR: 2202 case S_IFDIR:
@@ -2210,6 +2248,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2210 } 2248 }
2211out: 2249out:
2212 mnt_drop_write_file(file); 2250 mnt_drop_write_file(file);
2251 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2213 return ret; 2252 return ret;
2214} 2253}
2215 2254
@@ -2221,13 +2260,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2221 if (!capable(CAP_SYS_ADMIN)) 2260 if (!capable(CAP_SYS_ADMIN))
2222 return -EPERM; 2261 return -EPERM;
2223 2262
2224 mutex_lock(&root->fs_info->volume_mutex); 2263 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2225 if (root->fs_info->balance_ctl) { 2264 1)) {
2226 printk(KERN_INFO "btrfs: balance in progress\n"); 2265 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2227 ret = -EINVAL; 2266 return -EINPROGRESS;
2228 goto out;
2229 } 2267 }
2230 2268
2269 mutex_lock(&root->fs_info->volume_mutex);
2231 vol_args = memdup_user(arg, sizeof(*vol_args)); 2270 vol_args = memdup_user(arg, sizeof(*vol_args));
2232 if (IS_ERR(vol_args)) { 2271 if (IS_ERR(vol_args)) {
2233 ret = PTR_ERR(vol_args); 2272 ret = PTR_ERR(vol_args);
@@ -2240,27 +2279,31 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2240 kfree(vol_args); 2279 kfree(vol_args);
2241out: 2280out:
2242 mutex_unlock(&root->fs_info->volume_mutex); 2281 mutex_unlock(&root->fs_info->volume_mutex);
2282 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2243 return ret; 2283 return ret;
2244} 2284}
2245 2285
2246static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) 2286static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2247{ 2287{
2288 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
2248 struct btrfs_ioctl_vol_args *vol_args; 2289 struct btrfs_ioctl_vol_args *vol_args;
2249 int ret; 2290 int ret;
2250 2291
2251 if (!capable(CAP_SYS_ADMIN)) 2292 if (!capable(CAP_SYS_ADMIN))
2252 return -EPERM; 2293 return -EPERM;
2253 2294
2254 if (root->fs_info->sb->s_flags & MS_RDONLY) 2295 ret = mnt_want_write_file(file);
2255 return -EROFS; 2296 if (ret)
2297 return ret;
2256 2298
2257 mutex_lock(&root->fs_info->volume_mutex); 2299 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2258 if (root->fs_info->balance_ctl) { 2300 1)) {
2259 printk(KERN_INFO "btrfs: balance in progress\n"); 2301 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2260 ret = -EINVAL; 2302 mnt_drop_write_file(file);
2261 goto out; 2303 return -EINPROGRESS;
2262 } 2304 }
2263 2305
2306 mutex_lock(&root->fs_info->volume_mutex);
2264 vol_args = memdup_user(arg, sizeof(*vol_args)); 2307 vol_args = memdup_user(arg, sizeof(*vol_args));
2265 if (IS_ERR(vol_args)) { 2308 if (IS_ERR(vol_args)) {
2266 ret = PTR_ERR(vol_args); 2309 ret = PTR_ERR(vol_args);
@@ -2273,6 +2316,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
2273 kfree(vol_args); 2316 kfree(vol_args);
2274out: 2317out:
2275 mutex_unlock(&root->fs_info->volume_mutex); 2318 mutex_unlock(&root->fs_info->volume_mutex);
2319 mnt_drop_write_file(file);
2320 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2276 return ret; 2321 return ret;
2277} 2322}
2278 2323
@@ -2328,7 +2373,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2328 s_uuid = di_args->uuid; 2373 s_uuid = di_args->uuid;
2329 2374
2330 mutex_lock(&fs_devices->device_list_mutex); 2375 mutex_lock(&fs_devices->device_list_mutex);
2331 dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); 2376 dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
2332 mutex_unlock(&fs_devices->device_list_mutex); 2377 mutex_unlock(&fs_devices->device_list_mutex);
2333 2378
2334 if (!dev) { 2379 if (!dev) {
@@ -2821,12 +2866,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2821 struct btrfs_disk_key disk_key; 2866 struct btrfs_disk_key disk_key;
2822 u64 objectid = 0; 2867 u64 objectid = 0;
2823 u64 dir_id; 2868 u64 dir_id;
2869 int ret;
2824 2870
2825 if (!capable(CAP_SYS_ADMIN)) 2871 if (!capable(CAP_SYS_ADMIN))
2826 return -EPERM; 2872 return -EPERM;
2827 2873
2828 if (copy_from_user(&objectid, argp, sizeof(objectid))) 2874 ret = mnt_want_write_file(file);
2829 return -EFAULT; 2875 if (ret)
2876 return ret;
2877
2878 if (copy_from_user(&objectid, argp, sizeof(objectid))) {
2879 ret = -EFAULT;
2880 goto out;
2881 }
2830 2882
2831 if (!objectid) 2883 if (!objectid)
2832 objectid = root->root_key.objectid; 2884 objectid = root->root_key.objectid;
@@ -2836,21 +2888,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2836 location.offset = (u64)-1; 2888 location.offset = (u64)-1;
2837 2889
2838 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 2890 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
2839 if (IS_ERR(new_root)) 2891 if (IS_ERR(new_root)) {
2840 return PTR_ERR(new_root); 2892 ret = PTR_ERR(new_root);
2893 goto out;
2894 }
2841 2895
2842 if (btrfs_root_refs(&new_root->root_item) == 0) 2896 if (btrfs_root_refs(&new_root->root_item) == 0) {
2843 return -ENOENT; 2897 ret = -ENOENT;
2898 goto out;
2899 }
2844 2900
2845 path = btrfs_alloc_path(); 2901 path = btrfs_alloc_path();
2846 if (!path) 2902 if (!path) {
2847 return -ENOMEM; 2903 ret = -ENOMEM;
2904 goto out;
2905 }
2848 path->leave_spinning = 1; 2906 path->leave_spinning = 1;
2849 2907
2850 trans = btrfs_start_transaction(root, 1); 2908 trans = btrfs_start_transaction(root, 1);
2851 if (IS_ERR(trans)) { 2909 if (IS_ERR(trans)) {
2852 btrfs_free_path(path); 2910 btrfs_free_path(path);
2853 return PTR_ERR(trans); 2911 ret = PTR_ERR(trans);
2912 goto out;
2854 } 2913 }
2855 2914
2856 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); 2915 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
@@ -2861,7 +2920,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2861 btrfs_end_transaction(trans, root); 2920 btrfs_end_transaction(trans, root);
2862 printk(KERN_ERR "Umm, you don't have the default dir item, " 2921 printk(KERN_ERR "Umm, you don't have the default dir item, "
2863 "this isn't going to work\n"); 2922 "this isn't going to work\n");
2864 return -ENOENT; 2923 ret = -ENOENT;
2924 goto out;
2865 } 2925 }
2866 2926
2867 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); 2927 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
@@ -2871,8 +2931,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2871 2931
2872 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); 2932 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
2873 btrfs_end_transaction(trans, root); 2933 btrfs_end_transaction(trans, root);
2874 2934out:
2875 return 0; 2935 mnt_drop_write_file(file);
2936 return ret;
2876} 2937}
2877 2938
2878void btrfs_get_block_group_info(struct list_head *groups_list, 2939void btrfs_get_block_group_info(struct list_head *groups_list,
@@ -3036,32 +3097,38 @@ long btrfs_ioctl_trans_end(struct file *file)
3036 return 0; 3097 return 0;
3037} 3098}
3038 3099
3039static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) 3100static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
3101 void __user *argp)
3040{ 3102{
3041 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3042 struct btrfs_trans_handle *trans; 3103 struct btrfs_trans_handle *trans;
3043 u64 transid; 3104 u64 transid;
3044 int ret; 3105 int ret;
3045 3106
3046 trans = btrfs_start_transaction(root, 0); 3107 trans = btrfs_attach_transaction(root);
3047 if (IS_ERR(trans)) 3108 if (IS_ERR(trans)) {
3048 return PTR_ERR(trans); 3109 if (PTR_ERR(trans) != -ENOENT)
3110 return PTR_ERR(trans);
3111
3112 /* No running transaction, don't bother */
3113 transid = root->fs_info->last_trans_committed;
3114 goto out;
3115 }
3049 transid = trans->transid; 3116 transid = trans->transid;
3050 ret = btrfs_commit_transaction_async(trans, root, 0); 3117 ret = btrfs_commit_transaction_async(trans, root, 0);
3051 if (ret) { 3118 if (ret) {
3052 btrfs_end_transaction(trans, root); 3119 btrfs_end_transaction(trans, root);
3053 return ret; 3120 return ret;
3054 } 3121 }
3055 3122out:
3056 if (argp) 3123 if (argp)
3057 if (copy_to_user(argp, &transid, sizeof(transid))) 3124 if (copy_to_user(argp, &transid, sizeof(transid)))
3058 return -EFAULT; 3125 return -EFAULT;
3059 return 0; 3126 return 0;
3060} 3127}
3061 3128
3062static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) 3129static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
3130 void __user *argp)
3063{ 3131{
3064 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3065 u64 transid; 3132 u64 transid;
3066 3133
3067 if (argp) { 3134 if (argp) {
@@ -3073,10 +3140,11 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
3073 return btrfs_wait_for_commit(root, transid); 3140 return btrfs_wait_for_commit(root, transid);
3074} 3141}
3075 3142
3076static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) 3143static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
3077{ 3144{
3078 int ret; 3145 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3079 struct btrfs_ioctl_scrub_args *sa; 3146 struct btrfs_ioctl_scrub_args *sa;
3147 int ret;
3080 3148
3081 if (!capable(CAP_SYS_ADMIN)) 3149 if (!capable(CAP_SYS_ADMIN))
3082 return -EPERM; 3150 return -EPERM;
@@ -3085,12 +3153,22 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
3085 if (IS_ERR(sa)) 3153 if (IS_ERR(sa))
3086 return PTR_ERR(sa); 3154 return PTR_ERR(sa);
3087 3155
3088 ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, 3156 if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
3089 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); 3157 ret = mnt_want_write_file(file);
3158 if (ret)
3159 goto out;
3160 }
3161
3162 ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
3163 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
3164 0);
3090 3165
3091 if (copy_to_user(arg, sa, sizeof(*sa))) 3166 if (copy_to_user(arg, sa, sizeof(*sa)))
3092 ret = -EFAULT; 3167 ret = -EFAULT;
3093 3168
3169 if (!(sa->flags & BTRFS_SCRUB_READONLY))
3170 mnt_drop_write_file(file);
3171out:
3094 kfree(sa); 3172 kfree(sa);
3095 return ret; 3173 return ret;
3096} 3174}
@@ -3100,7 +3178,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
3100 if (!capable(CAP_SYS_ADMIN)) 3178 if (!capable(CAP_SYS_ADMIN))
3101 return -EPERM; 3179 return -EPERM;
3102 3180
3103 return btrfs_scrub_cancel(root); 3181 return btrfs_scrub_cancel(root->fs_info);
3104} 3182}
3105 3183
3106static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, 3184static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
@@ -3149,6 +3227,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
3149 return ret; 3227 return ret;
3150} 3228}
3151 3229
3230static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
3231{
3232 struct btrfs_ioctl_dev_replace_args *p;
3233 int ret;
3234
3235 if (!capable(CAP_SYS_ADMIN))
3236 return -EPERM;
3237
3238 p = memdup_user(arg, sizeof(*p));
3239 if (IS_ERR(p))
3240 return PTR_ERR(p);
3241
3242 switch (p->cmd) {
3243 case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
3244 if (atomic_xchg(
3245 &root->fs_info->mutually_exclusive_operation_running,
3246 1)) {
3247 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3248 ret = -EINPROGRESS;
3249 } else {
3250 ret = btrfs_dev_replace_start(root, p);
3251 atomic_set(
3252 &root->fs_info->mutually_exclusive_operation_running,
3253 0);
3254 }
3255 break;
3256 case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
3257 btrfs_dev_replace_status(root->fs_info, p);
3258 ret = 0;
3259 break;
3260 case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
3261 ret = btrfs_dev_replace_cancel(root->fs_info, p);
3262 break;
3263 default:
3264 ret = -EINVAL;
3265 break;
3266 }
3267
3268 if (copy_to_user(arg, p, sizeof(*p)))
3269 ret = -EFAULT;
3270
3271 kfree(p);
3272 return ret;
3273}
3274
3152static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) 3275static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3153{ 3276{
3154 int ret = 0; 3277 int ret = 0;
@@ -3315,6 +3438,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3315 struct btrfs_ioctl_balance_args *bargs; 3438 struct btrfs_ioctl_balance_args *bargs;
3316 struct btrfs_balance_control *bctl; 3439 struct btrfs_balance_control *bctl;
3317 int ret; 3440 int ret;
3441 int need_to_clear_lock = 0;
3318 3442
3319 if (!capable(CAP_SYS_ADMIN)) 3443 if (!capable(CAP_SYS_ADMIN))
3320 return -EPERM; 3444 return -EPERM;
@@ -3350,10 +3474,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3350 bargs = NULL; 3474 bargs = NULL;
3351 } 3475 }
3352 3476
3353 if (fs_info->balance_ctl) { 3477 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
3478 1)) {
3479 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3354 ret = -EINPROGRESS; 3480 ret = -EINPROGRESS;
3355 goto out_bargs; 3481 goto out_bargs;
3356 } 3482 }
3483 need_to_clear_lock = 1;
3357 3484
3358 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 3485 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3359 if (!bctl) { 3486 if (!bctl) {
@@ -3387,6 +3514,9 @@ do_balance:
3387out_bargs: 3514out_bargs:
3388 kfree(bargs); 3515 kfree(bargs);
3389out: 3516out:
3517 if (need_to_clear_lock)
3518 atomic_set(&root->fs_info->mutually_exclusive_operation_running,
3519 0);
3390 mutex_unlock(&fs_info->balance_mutex); 3520 mutex_unlock(&fs_info->balance_mutex);
3391 mutex_unlock(&fs_info->volume_mutex); 3521 mutex_unlock(&fs_info->volume_mutex);
3392 mnt_drop_write_file(file); 3522 mnt_drop_write_file(file);
@@ -3441,8 +3571,9 @@ out:
3441 return ret; 3571 return ret;
3442} 3572}
3443 3573
3444static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) 3574static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3445{ 3575{
3576 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3446 struct btrfs_ioctl_quota_ctl_args *sa; 3577 struct btrfs_ioctl_quota_ctl_args *sa;
3447 struct btrfs_trans_handle *trans = NULL; 3578 struct btrfs_trans_handle *trans = NULL;
3448 int ret; 3579 int ret;
@@ -3451,12 +3582,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3451 if (!capable(CAP_SYS_ADMIN)) 3582 if (!capable(CAP_SYS_ADMIN))
3452 return -EPERM; 3583 return -EPERM;
3453 3584
3454 if (root->fs_info->sb->s_flags & MS_RDONLY) 3585 ret = mnt_want_write_file(file);
3455 return -EROFS; 3586 if (ret)
3587 return ret;
3456 3588
3457 sa = memdup_user(arg, sizeof(*sa)); 3589 sa = memdup_user(arg, sizeof(*sa));
3458 if (IS_ERR(sa)) 3590 if (IS_ERR(sa)) {
3459 return PTR_ERR(sa); 3591 ret = PTR_ERR(sa);
3592 goto drop_write;
3593 }
3460 3594
3461 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { 3595 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
3462 trans = btrfs_start_transaction(root, 2); 3596 trans = btrfs_start_transaction(root, 2);
@@ -3489,14 +3623,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3489 if (err && !ret) 3623 if (err && !ret)
3490 ret = err; 3624 ret = err;
3491 } 3625 }
3492
3493out: 3626out:
3494 kfree(sa); 3627 kfree(sa);
3628drop_write:
3629 mnt_drop_write_file(file);
3495 return ret; 3630 return ret;
3496} 3631}
3497 3632
3498static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) 3633static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
3499{ 3634{
3635 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3500 struct btrfs_ioctl_qgroup_assign_args *sa; 3636 struct btrfs_ioctl_qgroup_assign_args *sa;
3501 struct btrfs_trans_handle *trans; 3637 struct btrfs_trans_handle *trans;
3502 int ret; 3638 int ret;
@@ -3505,12 +3641,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3505 if (!capable(CAP_SYS_ADMIN)) 3641 if (!capable(CAP_SYS_ADMIN))
3506 return -EPERM; 3642 return -EPERM;
3507 3643
3508 if (root->fs_info->sb->s_flags & MS_RDONLY) 3644 ret = mnt_want_write_file(file);
3509 return -EROFS; 3645 if (ret)
3646 return ret;
3510 3647
3511 sa = memdup_user(arg, sizeof(*sa)); 3648 sa = memdup_user(arg, sizeof(*sa));
3512 if (IS_ERR(sa)) 3649 if (IS_ERR(sa)) {
3513 return PTR_ERR(sa); 3650 ret = PTR_ERR(sa);
3651 goto drop_write;
3652 }
3514 3653
3515 trans = btrfs_join_transaction(root); 3654 trans = btrfs_join_transaction(root);
3516 if (IS_ERR(trans)) { 3655 if (IS_ERR(trans)) {
@@ -3533,11 +3672,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3533 3672
3534out: 3673out:
3535 kfree(sa); 3674 kfree(sa);
3675drop_write:
3676 mnt_drop_write_file(file);
3536 return ret; 3677 return ret;
3537} 3678}
3538 3679
3539static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) 3680static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
3540{ 3681{
3682 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3541 struct btrfs_ioctl_qgroup_create_args *sa; 3683 struct btrfs_ioctl_qgroup_create_args *sa;
3542 struct btrfs_trans_handle *trans; 3684 struct btrfs_trans_handle *trans;
3543 int ret; 3685 int ret;
@@ -3546,12 +3688,15 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3546 if (!capable(CAP_SYS_ADMIN)) 3688 if (!capable(CAP_SYS_ADMIN))
3547 return -EPERM; 3689 return -EPERM;
3548 3690
3549 if (root->fs_info->sb->s_flags & MS_RDONLY) 3691 ret = mnt_want_write_file(file);
3550 return -EROFS; 3692 if (ret)
3693 return ret;
3551 3694
3552 sa = memdup_user(arg, sizeof(*sa)); 3695 sa = memdup_user(arg, sizeof(*sa));
3553 if (IS_ERR(sa)) 3696 if (IS_ERR(sa)) {
3554 return PTR_ERR(sa); 3697 ret = PTR_ERR(sa);
3698 goto drop_write;
3699 }
3555 3700
3556 trans = btrfs_join_transaction(root); 3701 trans = btrfs_join_transaction(root);
3557 if (IS_ERR(trans)) { 3702 if (IS_ERR(trans)) {
@@ -3573,11 +3718,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3573 3718
3574out: 3719out:
3575 kfree(sa); 3720 kfree(sa);
3721drop_write:
3722 mnt_drop_write_file(file);
3576 return ret; 3723 return ret;
3577} 3724}
3578 3725
3579static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) 3726static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
3580{ 3727{
3728 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3581 struct btrfs_ioctl_qgroup_limit_args *sa; 3729 struct btrfs_ioctl_qgroup_limit_args *sa;
3582 struct btrfs_trans_handle *trans; 3730 struct btrfs_trans_handle *trans;
3583 int ret; 3731 int ret;
@@ -3587,12 +3735,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3587 if (!capable(CAP_SYS_ADMIN)) 3735 if (!capable(CAP_SYS_ADMIN))
3588 return -EPERM; 3736 return -EPERM;
3589 3737
3590 if (root->fs_info->sb->s_flags & MS_RDONLY) 3738 ret = mnt_want_write_file(file);
3591 return -EROFS; 3739 if (ret)
3740 return ret;
3592 3741
3593 sa = memdup_user(arg, sizeof(*sa)); 3742 sa = memdup_user(arg, sizeof(*sa));
3594 if (IS_ERR(sa)) 3743 if (IS_ERR(sa)) {
3595 return PTR_ERR(sa); 3744 ret = PTR_ERR(sa);
3745 goto drop_write;
3746 }
3596 3747
3597 trans = btrfs_join_transaction(root); 3748 trans = btrfs_join_transaction(root);
3598 if (IS_ERR(trans)) { 3749 if (IS_ERR(trans)) {
@@ -3615,6 +3766,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3615 3766
3616out: 3767out:
3617 kfree(sa); 3768 kfree(sa);
3769drop_write:
3770 mnt_drop_write_file(file);
3618 return ret; 3771 return ret;
3619} 3772}
3620 3773
@@ -3735,11 +3888,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3735 case BTRFS_IOC_DEFRAG_RANGE: 3888 case BTRFS_IOC_DEFRAG_RANGE:
3736 return btrfs_ioctl_defrag(file, argp); 3889 return btrfs_ioctl_defrag(file, argp);
3737 case BTRFS_IOC_RESIZE: 3890 case BTRFS_IOC_RESIZE:
3738 return btrfs_ioctl_resize(root, argp); 3891 return btrfs_ioctl_resize(file, argp);
3739 case BTRFS_IOC_ADD_DEV: 3892 case BTRFS_IOC_ADD_DEV:
3740 return btrfs_ioctl_add_dev(root, argp); 3893 return btrfs_ioctl_add_dev(root, argp);
3741 case BTRFS_IOC_RM_DEV: 3894 case BTRFS_IOC_RM_DEV:
3742 return btrfs_ioctl_rm_dev(root, argp); 3895 return btrfs_ioctl_rm_dev(file, argp);
3743 case BTRFS_IOC_FS_INFO: 3896 case BTRFS_IOC_FS_INFO:
3744 return btrfs_ioctl_fs_info(root, argp); 3897 return btrfs_ioctl_fs_info(root, argp);
3745 case BTRFS_IOC_DEV_INFO: 3898 case BTRFS_IOC_DEV_INFO:
@@ -3768,11 +3921,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3768 btrfs_sync_fs(file->f_dentry->d_sb, 1); 3921 btrfs_sync_fs(file->f_dentry->d_sb, 1);
3769 return 0; 3922 return 0;
3770 case BTRFS_IOC_START_SYNC: 3923 case BTRFS_IOC_START_SYNC:
3771 return btrfs_ioctl_start_sync(file, argp); 3924 return btrfs_ioctl_start_sync(root, argp);
3772 case BTRFS_IOC_WAIT_SYNC: 3925 case BTRFS_IOC_WAIT_SYNC:
3773 return btrfs_ioctl_wait_sync(file, argp); 3926 return btrfs_ioctl_wait_sync(root, argp);
3774 case BTRFS_IOC_SCRUB: 3927 case BTRFS_IOC_SCRUB:
3775 return btrfs_ioctl_scrub(root, argp); 3928 return btrfs_ioctl_scrub(file, argp);
3776 case BTRFS_IOC_SCRUB_CANCEL: 3929 case BTRFS_IOC_SCRUB_CANCEL:
3777 return btrfs_ioctl_scrub_cancel(root, argp); 3930 return btrfs_ioctl_scrub_cancel(root, argp);
3778 case BTRFS_IOC_SCRUB_PROGRESS: 3931 case BTRFS_IOC_SCRUB_PROGRESS:
@@ -3790,13 +3943,15 @@ long btrfs_ioctl(struct file *file, unsigned int
3790 case BTRFS_IOC_GET_DEV_STATS: 3943 case BTRFS_IOC_GET_DEV_STATS:
3791 return btrfs_ioctl_get_dev_stats(root, argp); 3944 return btrfs_ioctl_get_dev_stats(root, argp);
3792 case BTRFS_IOC_QUOTA_CTL: 3945 case BTRFS_IOC_QUOTA_CTL:
3793 return btrfs_ioctl_quota_ctl(root, argp); 3946 return btrfs_ioctl_quota_ctl(file, argp);
3794 case BTRFS_IOC_QGROUP_ASSIGN: 3947 case BTRFS_IOC_QGROUP_ASSIGN:
3795 return btrfs_ioctl_qgroup_assign(root, argp); 3948 return btrfs_ioctl_qgroup_assign(file, argp);
3796 case BTRFS_IOC_QGROUP_CREATE: 3949 case BTRFS_IOC_QGROUP_CREATE:
3797 return btrfs_ioctl_qgroup_create(root, argp); 3950 return btrfs_ioctl_qgroup_create(file, argp);
3798 case BTRFS_IOC_QGROUP_LIMIT: 3951 case BTRFS_IOC_QGROUP_LIMIT:
3799 return btrfs_ioctl_qgroup_limit(root, argp); 3952 return btrfs_ioctl_qgroup_limit(file, argp);
3953 case BTRFS_IOC_DEV_REPLACE:
3954 return btrfs_ioctl_dev_replace(root, argp);
3800 } 3955 }
3801 3956
3802 return -ENOTTY; 3957 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 731e2875ab93..dabca9cc8c2e 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_DEVICE_PATH_NAME_MAX 1024
34
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 35#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1) 36#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) 37#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
@@ -123,7 +125,48 @@ struct btrfs_ioctl_scrub_args {
123 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; 125 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
124}; 126};
125 127
126#define BTRFS_DEVICE_PATH_NAME_MAX 1024 128#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
129#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
130struct btrfs_ioctl_dev_replace_start_params {
131 __u64 srcdevid; /* in, if 0, use srcdev_name instead */
132 __u64 cont_reading_from_srcdev_mode; /* in, see #define
133 * above */
134 __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
135 __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
136};
137
138#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0
139#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1
140#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2
141#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3
142#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4
143struct btrfs_ioctl_dev_replace_status_params {
144 __u64 replace_state; /* out, see #define above */
145 __u64 progress_1000; /* out, 0 <= x <= 1000 */
146 __u64 time_started; /* out, seconds since 1-Jan-1970 */
147 __u64 time_stopped; /* out, seconds since 1-Jan-1970 */
148 __u64 num_write_errors; /* out */
149 __u64 num_uncorrectable_read_errors; /* out */
150};
151
152#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0
153#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1
154#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2
155#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0
156#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1
157#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2
158struct btrfs_ioctl_dev_replace_args {
159 __u64 cmd; /* in */
160 __u64 result; /* out */
161
162 union {
163 struct btrfs_ioctl_dev_replace_start_params start;
164 struct btrfs_ioctl_dev_replace_status_params status;
165 }; /* in/out */
166
167 __u64 spare[64];
168};
169
127struct btrfs_ioctl_dev_info_args { 170struct btrfs_ioctl_dev_info_args {
128 __u64 devid; /* in/out */ 171 __u64 devid; /* in/out */
129 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ 172 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
@@ -453,4 +496,7 @@ struct btrfs_ioctl_send_args {
453 struct btrfs_ioctl_qgroup_limit_args) 496 struct btrfs_ioctl_qgroup_limit_args)
454#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ 497#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
455 struct btrfs_ioctl_get_dev_stats) 498 struct btrfs_ioctl_get_dev_stats)
499#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
500 struct btrfs_ioctl_dev_replace_args)
501
456#endif 502#endif
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
new file mode 100644
index 000000000000..b7816cefbd13
--- /dev/null
+++ b/fs/btrfs/math.h
@@ -0,0 +1,44 @@
1
2/*
3 * Copyright (C) 2012 Fujitsu. All rights reserved.
4 * Written by Miao Xie <miaox@cn.fujitsu.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License v2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public
16 * License along with this program; if not, write to the
17 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 021110-1307, USA.
19 */
20
21#ifndef __BTRFS_MATH_H
22#define __BTRFS_MATH_H
23
24#include <asm/div64.h>
25
26static inline u64 div_factor(u64 num, int factor)
27{
28 if (factor == 10)
29 return num;
30 num *= factor;
31 do_div(num, 10);
32 return num;
33}
34
35static inline u64 div_factor_fine(u64 num, int factor)
36{
37 if (factor == 100)
38 return num;
39 num *= factor;
40 do_div(num, 100);
41 return num;
42}
43
44#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7772f02ba28e..f10731297040 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
211 init_waitqueue_head(&entry->wait); 211 init_waitqueue_head(&entry->wait);
212 INIT_LIST_HEAD(&entry->list); 212 INIT_LIST_HEAD(&entry->list);
213 INIT_LIST_HEAD(&entry->root_extent_list); 213 INIT_LIST_HEAD(&entry->root_extent_list);
214 INIT_LIST_HEAD(&entry->work_list);
215 init_completion(&entry->completion);
214 216
215 trace_btrfs_ordered_extent_add(inode, entry); 217 trace_btrfs_ordered_extent_add(inode, entry);
216 218
@@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode,
464 wake_up(&entry->wait); 466 wake_up(&entry->wait);
465} 467}
466 468
469static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
470{
471 struct btrfs_ordered_extent *ordered;
472
473 ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
474 btrfs_start_ordered_extent(ordered->inode, ordered, 1);
475 complete(&ordered->completion);
476}
477
467/* 478/*
468 * wait for all the ordered extents in a root. This is done when balancing 479 * wait for all the ordered extents in a root. This is done when balancing
469 * space between drives. 480 * space between drives.
470 */ 481 */
471void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) 482void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
472{ 483{
473 struct list_head splice; 484 struct list_head splice, works;
474 struct list_head *cur; 485 struct list_head *cur;
475 struct btrfs_ordered_extent *ordered; 486 struct btrfs_ordered_extent *ordered, *next;
476 struct inode *inode; 487 struct inode *inode;
477 488
478 INIT_LIST_HEAD(&splice); 489 INIT_LIST_HEAD(&splice);
490 INIT_LIST_HEAD(&works);
479 491
480 spin_lock(&root->fs_info->ordered_extent_lock); 492 spin_lock(&root->fs_info->ordered_extent_lock);
481 list_splice_init(&root->fs_info->ordered_extents, &splice); 493 list_splice_init(&root->fs_info->ordered_extents, &splice);
@@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
494 spin_unlock(&root->fs_info->ordered_extent_lock); 506 spin_unlock(&root->fs_info->ordered_extent_lock);
495 507
496 if (inode) { 508 if (inode) {
497 btrfs_start_ordered_extent(inode, ordered, 1); 509 ordered->flush_work.func = btrfs_run_ordered_extent_work;
498 btrfs_put_ordered_extent(ordered); 510 list_add_tail(&ordered->work_list, &works);
499 if (delay_iput) 511 btrfs_queue_worker(&root->fs_info->flush_workers,
500 btrfs_add_delayed_iput(inode); 512 &ordered->flush_work);
501 else
502 iput(inode);
503 } else { 513 } else {
504 btrfs_put_ordered_extent(ordered); 514 btrfs_put_ordered_extent(ordered);
505 } 515 }
506 516
517 cond_resched();
507 spin_lock(&root->fs_info->ordered_extent_lock); 518 spin_lock(&root->fs_info->ordered_extent_lock);
508 } 519 }
509 spin_unlock(&root->fs_info->ordered_extent_lock); 520 spin_unlock(&root->fs_info->ordered_extent_lock);
521
522 list_for_each_entry_safe(ordered, next, &works, work_list) {
523 list_del_init(&ordered->work_list);
524 wait_for_completion(&ordered->completion);
525
526 inode = ordered->inode;
527 btrfs_put_ordered_extent(ordered);
528 if (delay_iput)
529 btrfs_add_delayed_iput(inode);
530 else
531 iput(inode);
532
533 cond_resched();
534 }
510} 535}
511 536
512/* 537/*
@@ -519,13 +544,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
519 * extra check to make sure the ordered operation list really is empty 544 * extra check to make sure the ordered operation list really is empty
520 * before we return 545 * before we return
521 */ 546 */
522void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) 547int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
523{ 548{
524 struct btrfs_inode *btrfs_inode; 549 struct btrfs_inode *btrfs_inode;
525 struct inode *inode; 550 struct inode *inode;
526 struct list_head splice; 551 struct list_head splice;
552 struct list_head works;
553 struct btrfs_delalloc_work *work, *next;
554 int ret = 0;
527 555
528 INIT_LIST_HEAD(&splice); 556 INIT_LIST_HEAD(&splice);
557 INIT_LIST_HEAD(&works);
529 558
530 mutex_lock(&root->fs_info->ordered_operations_mutex); 559 mutex_lock(&root->fs_info->ordered_operations_mutex);
531 spin_lock(&root->fs_info->ordered_extent_lock); 560 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -533,6 +562,7 @@ again:
533 list_splice_init(&root->fs_info->ordered_operations, &splice); 562 list_splice_init(&root->fs_info->ordered_operations, &splice);
534 563
535 while (!list_empty(&splice)) { 564 while (!list_empty(&splice)) {
565
536 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 566 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
537 ordered_operations); 567 ordered_operations);
538 568
@@ -549,15 +579,26 @@ again:
549 list_add_tail(&BTRFS_I(inode)->ordered_operations, 579 list_add_tail(&BTRFS_I(inode)->ordered_operations,
550 &root->fs_info->ordered_operations); 580 &root->fs_info->ordered_operations);
551 } 581 }
582
583 if (!inode)
584 continue;
552 spin_unlock(&root->fs_info->ordered_extent_lock); 585 spin_unlock(&root->fs_info->ordered_extent_lock);
553 586
554 if (inode) { 587 work = btrfs_alloc_delalloc_work(inode, wait, 1);
555 if (wait) 588 if (!work) {
556 btrfs_wait_ordered_range(inode, 0, (u64)-1); 589 if (list_empty(&BTRFS_I(inode)->ordered_operations))
557 else 590 list_add_tail(&btrfs_inode->ordered_operations,
558 filemap_flush(inode->i_mapping); 591 &splice);
559 btrfs_add_delayed_iput(inode); 592 spin_lock(&root->fs_info->ordered_extent_lock);
593 list_splice_tail(&splice,
594 &root->fs_info->ordered_operations);
595 spin_unlock(&root->fs_info->ordered_extent_lock);
596 ret = -ENOMEM;
597 goto out;
560 } 598 }
599 list_add_tail(&work->list, &works);
600 btrfs_queue_worker(&root->fs_info->flush_workers,
601 &work->work);
561 602
562 cond_resched(); 603 cond_resched();
563 spin_lock(&root->fs_info->ordered_extent_lock); 604 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -566,7 +607,13 @@ again:
566 goto again; 607 goto again;
567 608
568 spin_unlock(&root->fs_info->ordered_extent_lock); 609 spin_unlock(&root->fs_info->ordered_extent_lock);
610out:
611 list_for_each_entry_safe(work, next, &works, list) {
612 list_del_init(&work->list);
613 btrfs_wait_and_free_delalloc_work(work);
614 }
569 mutex_unlock(&root->fs_info->ordered_operations_mutex); 615 mutex_unlock(&root->fs_info->ordered_operations_mutex);
616 return ret;
570} 617}
571 618
572/* 619/*
@@ -606,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
606 u64 end; 653 u64 end;
607 u64 orig_end; 654 u64 orig_end;
608 struct btrfs_ordered_extent *ordered; 655 struct btrfs_ordered_extent *ordered;
609 int found;
610 656
611 if (start + len < start) { 657 if (start + len < start) {
612 orig_end = INT_LIMIT(loff_t); 658 orig_end = INT_LIMIT(loff_t);
@@ -642,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
642 filemap_fdatawait_range(inode->i_mapping, start, orig_end); 688 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
643 689
644 end = orig_end; 690 end = orig_end;
645 found = 0;
646 while (1) { 691 while (1) {
647 ordered = btrfs_lookup_first_ordered_extent(inode, end); 692 ordered = btrfs_lookup_first_ordered_extent(inode, end);
648 if (!ordered) 693 if (!ordered)
@@ -655,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
655 btrfs_put_ordered_extent(ordered); 700 btrfs_put_ordered_extent(ordered);
656 break; 701 break;
657 } 702 }
658 found++;
659 btrfs_start_ordered_extent(inode, ordered, 1); 703 btrfs_start_ordered_extent(inode, ordered, 1);
660 end = ordered->file_offset; 704 end = ordered->file_offset;
661 btrfs_put_ordered_extent(ordered); 705 btrfs_put_ordered_extent(ordered);
@@ -934,15 +978,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
934 if (last_mod < root->fs_info->last_trans_committed) 978 if (last_mod < root->fs_info->last_trans_committed)
935 return; 979 return;
936 980
937 /*
938 * the transaction is already committing. Just start the IO and
939 * don't bother with all of this list nonsense
940 */
941 if (trans && root->fs_info->running_transaction->blocked) {
942 btrfs_wait_ordered_range(inode, 0, (u64)-1);
943 return;
944 }
945
946 spin_lock(&root->fs_info->ordered_extent_lock); 981 spin_lock(&root->fs_info->ordered_extent_lock);
947 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 982 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
948 list_add_tail(&BTRFS_I(inode)->ordered_operations, 983 list_add_tail(&BTRFS_I(inode)->ordered_operations,
@@ -959,6 +994,7 @@ int __init ordered_data_init(void)
959 NULL); 994 NULL);
960 if (!btrfs_ordered_extent_cache) 995 if (!btrfs_ordered_extent_cache)
961 return -ENOMEM; 996 return -ENOMEM;
997
962 return 0; 998 return 0;
963} 999}
964 1000
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 853fc7beedfa..f29d4bf5fbe7 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -128,8 +128,11 @@ struct btrfs_ordered_extent {
128 struct list_head root_extent_list; 128 struct list_head root_extent_list;
129 129
130 struct btrfs_work work; 130 struct btrfs_work work;
131};
132 131
132 struct completion completion;
133 struct btrfs_work flush_work;
134 struct list_head work_list;
135};
133 136
134/* 137/*
135 * calculates the total size you need to allocate for an ordered sum 138 * calculates the total size you need to allocate for an ordered sum
@@ -186,7 +189,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
186int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 189int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
187 struct btrfs_ordered_extent *ordered); 190 struct btrfs_ordered_extent *ordered);
188int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
189void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 192int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
190void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 193void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
191 struct btrfs_root *root, 194 struct btrfs_root *root,
192 struct inode *inode); 195 struct inode *inode);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 5e23684887eb..50d95fd190a5 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
297 case BTRFS_DEV_STATS_KEY: 297 case BTRFS_DEV_STATS_KEY:
298 printk(KERN_INFO "\t\tdevice stats\n"); 298 printk(KERN_INFO "\t\tdevice stats\n");
299 break; 299 break;
300 case BTRFS_DEV_REPLACE_KEY:
301 printk(KERN_INFO "\t\tdev replace\n");
302 break;
300 }; 303 };
301 } 304 }
302} 305}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index a955669519a2..96b93daa0bbb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -27,6 +27,7 @@
27#include "volumes.h" 27#include "volumes.h"
28#include "disk-io.h" 28#include "disk-io.h"
29#include "transaction.h" 29#include "transaction.h"
30#include "dev-replace.h"
30 31
31#undef DEBUG 32#undef DEBUG
32 33
@@ -323,7 +324,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
323 struct reada_extent *re = NULL; 324 struct reada_extent *re = NULL;
324 struct reada_extent *re_exist = NULL; 325 struct reada_extent *re_exist = NULL;
325 struct btrfs_fs_info *fs_info = root->fs_info; 326 struct btrfs_fs_info *fs_info = root->fs_info;
326 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
327 struct btrfs_bio *bbio = NULL; 327 struct btrfs_bio *bbio = NULL;
328 struct btrfs_device *dev; 328 struct btrfs_device *dev;
329 struct btrfs_device *prev_dev; 329 struct btrfs_device *prev_dev;
@@ -332,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
332 int nzones = 0; 332 int nzones = 0;
333 int i; 333 int i;
334 unsigned long index = logical >> PAGE_CACHE_SHIFT; 334 unsigned long index = logical >> PAGE_CACHE_SHIFT;
335 int dev_replace_is_ongoing;
335 336
336 spin_lock(&fs_info->reada_lock); 337 spin_lock(&fs_info->reada_lock);
337 re = radix_tree_lookup(&fs_info->reada_tree, index); 338 re = radix_tree_lookup(&fs_info->reada_tree, index);
@@ -358,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
358 * map block 359 * map block
359 */ 360 */
360 length = blocksize; 361 length = blocksize;
361 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); 362 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
363 &bbio, 0);
362 if (ret || !bbio || length < blocksize) 364 if (ret || !bbio || length < blocksize)
363 goto error; 365 goto error;
364 366
@@ -393,6 +395,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
393 } 395 }
394 396
395 /* insert extent in reada_tree + all per-device trees, all or nothing */ 397 /* insert extent in reada_tree + all per-device trees, all or nothing */
398 btrfs_dev_replace_lock(&fs_info->dev_replace);
396 spin_lock(&fs_info->reada_lock); 399 spin_lock(&fs_info->reada_lock);
397 ret = radix_tree_insert(&fs_info->reada_tree, index, re); 400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
398 if (ret == -EEXIST) { 401 if (ret == -EEXIST) {
@@ -400,13 +403,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
400 BUG_ON(!re_exist); 403 BUG_ON(!re_exist);
401 re_exist->refcnt++; 404 re_exist->refcnt++;
402 spin_unlock(&fs_info->reada_lock); 405 spin_unlock(&fs_info->reada_lock);
406 btrfs_dev_replace_unlock(&fs_info->dev_replace);
403 goto error; 407 goto error;
404 } 408 }
405 if (ret) { 409 if (ret) {
406 spin_unlock(&fs_info->reada_lock); 410 spin_unlock(&fs_info->reada_lock);
411 btrfs_dev_replace_unlock(&fs_info->dev_replace);
407 goto error; 412 goto error;
408 } 413 }
409 prev_dev = NULL; 414 prev_dev = NULL;
415 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
416 &fs_info->dev_replace);
410 for (i = 0; i < nzones; ++i) { 417 for (i = 0; i < nzones; ++i) {
411 dev = bbio->stripes[i].dev; 418 dev = bbio->stripes[i].dev;
412 if (dev == prev_dev) { 419 if (dev == prev_dev) {
@@ -419,21 +426,36 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
419 */ 426 */
420 continue; 427 continue;
421 } 428 }
429 if (!dev->bdev) {
430 /* cannot read ahead on missing device */
431 continue;
432 }
433 if (dev_replace_is_ongoing &&
434 dev == fs_info->dev_replace.tgtdev) {
435 /*
436 * as this device is selected for reading only as
437 * a last resort, skip it for read ahead.
438 */
439 continue;
440 }
422 prev_dev = dev; 441 prev_dev = dev;
423 ret = radix_tree_insert(&dev->reada_extents, index, re); 442 ret = radix_tree_insert(&dev->reada_extents, index, re);
424 if (ret) { 443 if (ret) {
425 while (--i >= 0) { 444 while (--i >= 0) {
426 dev = bbio->stripes[i].dev; 445 dev = bbio->stripes[i].dev;
427 BUG_ON(dev == NULL); 446 BUG_ON(dev == NULL);
447 /* ignore whether the entry was inserted */
428 radix_tree_delete(&dev->reada_extents, index); 448 radix_tree_delete(&dev->reada_extents, index);
429 } 449 }
430 BUG_ON(fs_info == NULL); 450 BUG_ON(fs_info == NULL);
431 radix_tree_delete(&fs_info->reada_tree, index); 451 radix_tree_delete(&fs_info->reada_tree, index);
432 spin_unlock(&fs_info->reada_lock); 452 spin_unlock(&fs_info->reada_lock);
453 btrfs_dev_replace_unlock(&fs_info->dev_replace);
433 goto error; 454 goto error;
434 } 455 }
435 } 456 }
436 spin_unlock(&fs_info->reada_lock); 457 spin_unlock(&fs_info->reada_lock);
458 btrfs_dev_replace_unlock(&fs_info->dev_replace);
437 459
438 kfree(bbio); 460 kfree(bbio);
439 return re; 461 return re;
@@ -915,7 +937,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
915 generation = btrfs_header_generation(node); 937 generation = btrfs_header_generation(node);
916 free_extent_buffer(node); 938 free_extent_buffer(node);
917 939
918 reada_add_block(rc, start, &max_key, level, generation); 940 if (reada_add_block(rc, start, &max_key, level, generation)) {
941 kfree(rc);
942 return ERR_PTR(-ENOMEM);
943 }
919 944
920 reada_start_machine(root->fs_info); 945 reada_start_machine(root->fs_info);
921 946
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 776f0aa128fc..300e09ac3659 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2025 struct btrfs_root_item *root_item; 2025 struct btrfs_root_item *root_item;
2026 struct btrfs_path *path; 2026 struct btrfs_path *path;
2027 struct extent_buffer *leaf; 2027 struct extent_buffer *leaf;
2028 unsigned long nr;
2029 int level; 2028 int level;
2030 int max_level; 2029 int max_level;
2031 int replaced = 0; 2030 int replaced = 0;
@@ -2074,7 +2073,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2074 BUG_ON(IS_ERR(trans)); 2073 BUG_ON(IS_ERR(trans));
2075 trans->block_rsv = rc->block_rsv; 2074 trans->block_rsv = rc->block_rsv;
2076 2075
2077 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); 2076 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2077 BTRFS_RESERVE_FLUSH_ALL);
2078 if (ret) { 2078 if (ret) {
2079 BUG_ON(ret != -EAGAIN); 2079 BUG_ON(ret != -EAGAIN);
2080 ret = btrfs_commit_transaction(trans, root); 2080 ret = btrfs_commit_transaction(trans, root);
@@ -2125,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2125 path->slots[level]); 2125 path->slots[level]);
2126 root_item->drop_level = level; 2126 root_item->drop_level = level;
2127 2127
2128 nr = trans->blocks_used;
2129 btrfs_end_transaction_throttle(trans, root); 2128 btrfs_end_transaction_throttle(trans, root);
2130 2129
2131 btrfs_btree_balance_dirty(root, nr); 2130 btrfs_btree_balance_dirty(root);
2132 2131
2133 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2132 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2134 invalidate_extent_cache(root, &key, &next_key); 2133 invalidate_extent_cache(root, &key, &next_key);
@@ -2155,10 +2154,9 @@ out:
2155 btrfs_update_reloc_root(trans, root); 2154 btrfs_update_reloc_root(trans, root);
2156 } 2155 }
2157 2156
2158 nr = trans->blocks_used;
2159 btrfs_end_transaction_throttle(trans, root); 2157 btrfs_end_transaction_throttle(trans, root);
2160 2158
2161 btrfs_btree_balance_dirty(root, nr); 2159 btrfs_btree_balance_dirty(root);
2162 2160
2163 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2161 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2164 invalidate_extent_cache(root, &key, &next_key); 2162 invalidate_extent_cache(root, &key, &next_key);
@@ -2184,7 +2182,8 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2184again: 2182again:
2185 if (!err) { 2183 if (!err) {
2186 num_bytes = rc->merging_rsv_size; 2184 num_bytes = rc->merging_rsv_size;
2187 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2185 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2186 BTRFS_RESERVE_FLUSH_ALL);
2188 if (ret) 2187 if (ret)
2189 err = ret; 2188 err = ret;
2190 } 2189 }
@@ -2459,7 +2458,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2459 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2458 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2460 2459
2461 trans->block_rsv = rc->block_rsv; 2460 trans->block_rsv = rc->block_rsv;
2462 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2461 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2462 BTRFS_RESERVE_FLUSH_ALL);
2463 if (ret) { 2463 if (ret) {
2464 if (ret == -EAGAIN) 2464 if (ret == -EAGAIN)
2465 rc->commit_transaction = 1; 2465 rc->commit_transaction = 1;
@@ -3259,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3259 struct btrfs_path *path; 3259 struct btrfs_path *path;
3260 struct btrfs_root *root = fs_info->tree_root; 3260 struct btrfs_root *root = fs_info->tree_root;
3261 struct btrfs_trans_handle *trans; 3261 struct btrfs_trans_handle *trans;
3262 unsigned long nr;
3263 int ret = 0; 3262 int ret = 0;
3264 3263
3265 if (inode) 3264 if (inode)
@@ -3293,9 +3292,8 @@ truncate:
3293 ret = btrfs_truncate_free_space_cache(root, trans, path, inode); 3292 ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3294 3293
3295 btrfs_free_path(path); 3294 btrfs_free_path(path);
3296 nr = trans->blocks_used;
3297 btrfs_end_transaction(trans, root); 3295 btrfs_end_transaction(trans, root);
3298 btrfs_btree_balance_dirty(root, nr); 3296 btrfs_btree_balance_dirty(root);
3299out: 3297out:
3300 iput(inode); 3298 iput(inode);
3301 return ret; 3299 return ret;
@@ -3685,7 +3683,8 @@ int prepare_to_relocate(struct reloc_control *rc)
3685 * is no reservation in transaction handle. 3683 * is no reservation in transaction handle.
3686 */ 3684 */
3687 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, 3685 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3688 rc->extent_root->nodesize * 256); 3686 rc->extent_root->nodesize * 256,
3687 BTRFS_RESERVE_FLUSH_ALL);
3689 if (ret) 3688 if (ret)
3690 return ret; 3689 return ret;
3691 3690
@@ -3711,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3711 struct btrfs_trans_handle *trans = NULL; 3710 struct btrfs_trans_handle *trans = NULL;
3712 struct btrfs_path *path; 3711 struct btrfs_path *path;
3713 struct btrfs_extent_item *ei; 3712 struct btrfs_extent_item *ei;
3714 unsigned long nr;
3715 u64 flags; 3713 u64 flags;
3716 u32 item_size; 3714 u32 item_size;
3717 int ret; 3715 int ret;
@@ -3828,9 +3826,8 @@ restart:
3828 ret = btrfs_commit_transaction(trans, rc->extent_root); 3826 ret = btrfs_commit_transaction(trans, rc->extent_root);
3829 BUG_ON(ret); 3827 BUG_ON(ret);
3830 } else { 3828 } else {
3831 nr = trans->blocks_used;
3832 btrfs_end_transaction_throttle(trans, rc->extent_root); 3829 btrfs_end_transaction_throttle(trans, rc->extent_root);
3833 btrfs_btree_balance_dirty(rc->extent_root, nr); 3830 btrfs_btree_balance_dirty(rc->extent_root);
3834 } 3831 }
3835 trans = NULL; 3832 trans = NULL;
3836 3833
@@ -3860,9 +3857,8 @@ restart:
3860 GFP_NOFS); 3857 GFP_NOFS);
3861 3858
3862 if (trans) { 3859 if (trans) {
3863 nr = trans->blocks_used;
3864 btrfs_end_transaction_throttle(trans, rc->extent_root); 3860 btrfs_end_transaction_throttle(trans, rc->extent_root);
3865 btrfs_btree_balance_dirty(rc->extent_root, nr); 3861 btrfs_btree_balance_dirty(rc->extent_root);
3866 } 3862 }
3867 3863
3868 if (!err) { 3864 if (!err) {
@@ -3941,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3941 struct btrfs_trans_handle *trans; 3937 struct btrfs_trans_handle *trans;
3942 struct btrfs_root *root; 3938 struct btrfs_root *root;
3943 struct btrfs_key key; 3939 struct btrfs_key key;
3944 unsigned long nr;
3945 u64 objectid = BTRFS_FIRST_FREE_OBJECTID; 3940 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3946 int err = 0; 3941 int err = 0;
3947 3942
@@ -3969,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3969 3964
3970 err = btrfs_orphan_add(trans, inode); 3965 err = btrfs_orphan_add(trans, inode);
3971out: 3966out:
3972 nr = trans->blocks_used;
3973 btrfs_end_transaction(trans, root); 3967 btrfs_end_transaction(trans, root);
3974 btrfs_btree_balance_dirty(root, nr); 3968 btrfs_btree_balance_dirty(root);
3975 if (err) { 3969 if (err) {
3976 if (inode) 3970 if (inode)
3977 iput(inode); 3971 iput(inode);
@@ -4057,7 +4051,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4057 (unsigned long long)rc->block_group->key.objectid, 4051 (unsigned long long)rc->block_group->key.objectid,
4058 (unsigned long long)rc->block_group->flags); 4052 (unsigned long long)rc->block_group->flags);
4059 4053
4060 btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4054 ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
4055 if (ret < 0) {
4056 err = ret;
4057 goto out;
4058 }
4061 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 4059 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
4062 4060
4063 while (1) { 4061 while (1) {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index eb923d087da7..668af537a3ea 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -548,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
548 struct btrfs_root_item *item = &root->root_item; 548 struct btrfs_root_item *item = &root->root_item;
549 struct timespec ct = CURRENT_TIME; 549 struct timespec ct = CURRENT_TIME;
550 550
551 spin_lock(&root->root_times_lock); 551 spin_lock(&root->root_item_lock);
552 item->ctransid = cpu_to_le64(trans->transid); 552 item->ctransid = cpu_to_le64(trans->transid);
553 item->ctime.sec = cpu_to_le64(ct.tv_sec); 553 item->ctime.sec = cpu_to_le64(ct.tv_sec);
554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec); 554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec);
555 spin_unlock(&root->root_times_lock); 555 spin_unlock(&root->root_item_lock);
556} 556}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 27892f67e69b..bdbb94f245c9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2011 STRATO. All rights reserved. 2 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 5 * modify it under the terms of the GNU General Public
@@ -25,6 +25,7 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "backref.h" 26#include "backref.h"
27#include "extent_io.h" 27#include "extent_io.h"
28#include "dev-replace.h"
28#include "check-integrity.h" 29#include "check-integrity.h"
29#include "rcu-string.h" 30#include "rcu-string.h"
30 31
@@ -42,10 +43,23 @@
42 */ 43 */
43 44
44struct scrub_block; 45struct scrub_block;
45struct scrub_dev; 46struct scrub_ctx;
46 47
47#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ 48/*
48#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ 49 * the following three values only influence the performance.
50 * The last one configures the number of parallel and outstanding I/O
51 * operations. The first two values configure an upper limit for the number
52 * of (dynamically allocated) pages that are added to a bio.
53 */
54#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
55#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
56#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
57
58/*
59 * the following value times PAGE_SIZE needs to be large enough to match the
60 * largest node/leaf/sector size that shall be supported.
61 * Values larger than BTRFS_STRIPE_LEN are not supported.
62 */
49#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ 63#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
50 64
51struct scrub_page { 65struct scrub_page {
@@ -56,6 +70,8 @@ struct scrub_page {
56 u64 generation; 70 u64 generation;
57 u64 logical; 71 u64 logical;
58 u64 physical; 72 u64 physical;
73 u64 physical_for_dev_replace;
74 atomic_t ref_count;
59 struct { 75 struct {
60 unsigned int mirror_num:8; 76 unsigned int mirror_num:8;
61 unsigned int have_csum:1; 77 unsigned int have_csum:1;
@@ -66,23 +82,28 @@ struct scrub_page {
66 82
67struct scrub_bio { 83struct scrub_bio {
68 int index; 84 int index;
69 struct scrub_dev *sdev; 85 struct scrub_ctx *sctx;
86 struct btrfs_device *dev;
70 struct bio *bio; 87 struct bio *bio;
71 int err; 88 int err;
72 u64 logical; 89 u64 logical;
73 u64 physical; 90 u64 physical;
74 struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; 91#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
92 struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
93#else
94 struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
95#endif
75 int page_count; 96 int page_count;
76 int next_free; 97 int next_free;
77 struct btrfs_work work; 98 struct btrfs_work work;
78}; 99};
79 100
80struct scrub_block { 101struct scrub_block {
81 struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; 102 struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
82 int page_count; 103 int page_count;
83 atomic_t outstanding_pages; 104 atomic_t outstanding_pages;
84 atomic_t ref_count; /* free mem on transition to zero */ 105 atomic_t ref_count; /* free mem on transition to zero */
85 struct scrub_dev *sdev; 106 struct scrub_ctx *sctx;
86 struct { 107 struct {
87 unsigned int header_error:1; 108 unsigned int header_error:1;
88 unsigned int checksum_error:1; 109 unsigned int checksum_error:1;
@@ -91,23 +112,35 @@ struct scrub_block {
91 }; 112 };
92}; 113};
93 114
94struct scrub_dev { 115struct scrub_wr_ctx {
95 struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; 116 struct scrub_bio *wr_curr_bio;
96 struct btrfs_device *dev; 117 struct btrfs_device *tgtdev;
118 int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
119 atomic_t flush_all_writes;
120 struct mutex wr_lock;
121};
122
123struct scrub_ctx {
124 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
125 struct btrfs_root *dev_root;
97 int first_free; 126 int first_free;
98 int curr; 127 int curr;
99 atomic_t in_flight; 128 atomic_t bios_in_flight;
100 atomic_t fixup_cnt; 129 atomic_t workers_pending;
101 spinlock_t list_lock; 130 spinlock_t list_lock;
102 wait_queue_head_t list_wait; 131 wait_queue_head_t list_wait;
103 u16 csum_size; 132 u16 csum_size;
104 struct list_head csum_list; 133 struct list_head csum_list;
105 atomic_t cancel_req; 134 atomic_t cancel_req;
106 int readonly; 135 int readonly;
107 int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ 136 int pages_per_rd_bio;
108 u32 sectorsize; 137 u32 sectorsize;
109 u32 nodesize; 138 u32 nodesize;
110 u32 leafsize; 139 u32 leafsize;
140
141 int is_dev_replace;
142 struct scrub_wr_ctx wr_ctx;
143
111 /* 144 /*
112 * statistics 145 * statistics
113 */ 146 */
@@ -116,13 +149,23 @@ struct scrub_dev {
116}; 149};
117 150
118struct scrub_fixup_nodatasum { 151struct scrub_fixup_nodatasum {
119 struct scrub_dev *sdev; 152 struct scrub_ctx *sctx;
153 struct btrfs_device *dev;
120 u64 logical; 154 u64 logical;
121 struct btrfs_root *root; 155 struct btrfs_root *root;
122 struct btrfs_work work; 156 struct btrfs_work work;
123 int mirror_num; 157 int mirror_num;
124}; 158};
125 159
160struct scrub_copy_nocow_ctx {
161 struct scrub_ctx *sctx;
162 u64 logical;
163 u64 len;
164 int mirror_num;
165 u64 physical_for_dev_replace;
166 struct btrfs_work work;
167};
168
126struct scrub_warning { 169struct scrub_warning {
127 struct btrfs_path *path; 170 struct btrfs_path *path;
128 u64 extent_item_size; 171 u64 extent_item_size;
@@ -137,15 +180,20 @@ struct scrub_warning {
137}; 180};
138 181
139 182
183static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
184static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
185static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
186static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
140static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); 187static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
141static int scrub_setup_recheck_block(struct scrub_dev *sdev, 188static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
142 struct btrfs_mapping_tree *map_tree, 189 struct btrfs_fs_info *fs_info,
190 struct scrub_block *original_sblock,
143 u64 length, u64 logical, 191 u64 length, u64 logical,
144 struct scrub_block *sblock); 192 struct scrub_block *sblocks_for_recheck);
145static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 193static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
146 struct scrub_block *sblock, int is_metadata, 194 struct scrub_block *sblock, int is_metadata,
147 int have_csum, u8 *csum, u64 generation, 195 int have_csum, u8 *csum, u64 generation,
148 u16 csum_size); 196 u16 csum_size);
149static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 197static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
150 struct scrub_block *sblock, 198 struct scrub_block *sblock,
151 int is_metadata, int have_csum, 199 int is_metadata, int have_csum,
@@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
158static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 206static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
159 struct scrub_block *sblock_good, 207 struct scrub_block *sblock_good,
160 int page_num, int force_write); 208 int page_num, int force_write);
209static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
210static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
211 int page_num);
161static int scrub_checksum_data(struct scrub_block *sblock); 212static int scrub_checksum_data(struct scrub_block *sblock);
162static int scrub_checksum_tree_block(struct scrub_block *sblock); 213static int scrub_checksum_tree_block(struct scrub_block *sblock);
163static int scrub_checksum_super(struct scrub_block *sblock); 214static int scrub_checksum_super(struct scrub_block *sblock);
164static void scrub_block_get(struct scrub_block *sblock); 215static void scrub_block_get(struct scrub_block *sblock);
165static void scrub_block_put(struct scrub_block *sblock); 216static void scrub_block_put(struct scrub_block *sblock);
166static int scrub_add_page_to_bio(struct scrub_dev *sdev, 217static void scrub_page_get(struct scrub_page *spage);
167 struct scrub_page *spage); 218static void scrub_page_put(struct scrub_page *spage);
168static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 219static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
169 u64 physical, u64 flags, u64 gen, int mirror_num, 220 struct scrub_page *spage);
170 u8 *csum, int force); 221static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
222 u64 physical, struct btrfs_device *dev, u64 flags,
223 u64 gen, int mirror_num, u8 *csum, int force,
224 u64 physical_for_dev_replace);
171static void scrub_bio_end_io(struct bio *bio, int err); 225static void scrub_bio_end_io(struct bio *bio, int err);
172static void scrub_bio_end_io_worker(struct btrfs_work *work); 226static void scrub_bio_end_io_worker(struct btrfs_work *work);
173static void scrub_block_complete(struct scrub_block *sblock); 227static void scrub_block_complete(struct scrub_block *sblock);
228static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
229 u64 extent_logical, u64 extent_len,
230 u64 *extent_physical,
231 struct btrfs_device **extent_dev,
232 int *extent_mirror_num);
233static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
234 struct scrub_wr_ctx *wr_ctx,
235 struct btrfs_fs_info *fs_info,
236 struct btrfs_device *dev,
237 int is_dev_replace);
238static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
239static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
240 struct scrub_page *spage);
241static void scrub_wr_submit(struct scrub_ctx *sctx);
242static void scrub_wr_bio_end_io(struct bio *bio, int err);
243static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
244static int write_page_nocow(struct scrub_ctx *sctx,
245 u64 physical_for_dev_replace, struct page *page);
246static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
247 void *ctx);
248static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
249 int mirror_num, u64 physical_for_dev_replace);
250static void copy_nocow_pages_worker(struct btrfs_work *work);
251
252
253static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
254{
255 atomic_inc(&sctx->bios_in_flight);
256}
257
258static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
259{
260 atomic_dec(&sctx->bios_in_flight);
261 wake_up(&sctx->list_wait);
262}
263
264/*
265 * used for workers that require transaction commits (i.e., for the
266 * NOCOW case)
267 */
268static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
269{
270 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
271
272 /*
273 * increment scrubs_running to prevent cancel requests from
274 * completing as long as a worker is running. we must also
275 * increment scrubs_paused to prevent deadlocking on pause
276 * requests used for transactions commits (as the worker uses a
277 * transaction context). it is safe to regard the worker
278 * as paused for all matters practical. effectively, we only
279 * avoid cancellation requests from completing.
280 */
281 mutex_lock(&fs_info->scrub_lock);
282 atomic_inc(&fs_info->scrubs_running);
283 atomic_inc(&fs_info->scrubs_paused);
284 mutex_unlock(&fs_info->scrub_lock);
285 atomic_inc(&sctx->workers_pending);
286}
174 287
288/* used for workers that require transaction commits */
289static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
290{
291 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
175 292
176static void scrub_free_csums(struct scrub_dev *sdev) 293 /*
294 * see scrub_pending_trans_workers_inc() why we're pretending
295 * to be paused in the scrub counters
296 */
297 mutex_lock(&fs_info->scrub_lock);
298 atomic_dec(&fs_info->scrubs_running);
299 atomic_dec(&fs_info->scrubs_paused);
300 mutex_unlock(&fs_info->scrub_lock);
301 atomic_dec(&sctx->workers_pending);
302 wake_up(&fs_info->scrub_pause_wait);
303 wake_up(&sctx->list_wait);
304}
305
306static void scrub_free_csums(struct scrub_ctx *sctx)
177{ 307{
178 while (!list_empty(&sdev->csum_list)) { 308 while (!list_empty(&sctx->csum_list)) {
179 struct btrfs_ordered_sum *sum; 309 struct btrfs_ordered_sum *sum;
180 sum = list_first_entry(&sdev->csum_list, 310 sum = list_first_entry(&sctx->csum_list,
181 struct btrfs_ordered_sum, list); 311 struct btrfs_ordered_sum, list);
182 list_del(&sum->list); 312 list_del(&sum->list);
183 kfree(sum); 313 kfree(sum);
184 } 314 }
185} 315}
186 316
187static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) 317static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
188{ 318{
189 int i; 319 int i;
190 320
191 if (!sdev) 321 if (!sctx)
192 return; 322 return;
193 323
324 scrub_free_wr_ctx(&sctx->wr_ctx);
325
194 /* this can happen when scrub is cancelled */ 326 /* this can happen when scrub is cancelled */
195 if (sdev->curr != -1) { 327 if (sctx->curr != -1) {
196 struct scrub_bio *sbio = sdev->bios[sdev->curr]; 328 struct scrub_bio *sbio = sctx->bios[sctx->curr];
197 329
198 for (i = 0; i < sbio->page_count; i++) { 330 for (i = 0; i < sbio->page_count; i++) {
199 BUG_ON(!sbio->pagev[i]); 331 WARN_ON(!sbio->pagev[i]->page);
200 BUG_ON(!sbio->pagev[i]->page);
201 scrub_block_put(sbio->pagev[i]->sblock); 332 scrub_block_put(sbio->pagev[i]->sblock);
202 } 333 }
203 bio_put(sbio->bio); 334 bio_put(sbio->bio);
204 } 335 }
205 336
206 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 337 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
207 struct scrub_bio *sbio = sdev->bios[i]; 338 struct scrub_bio *sbio = sctx->bios[i];
208 339
209 if (!sbio) 340 if (!sbio)
210 break; 341 break;
211 kfree(sbio); 342 kfree(sbio);
212 } 343 }
213 344
214 scrub_free_csums(sdev); 345 scrub_free_csums(sctx);
215 kfree(sdev); 346 kfree(sctx);
216} 347}
217 348
218static noinline_for_stack 349static noinline_for_stack
219struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) 350struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
220{ 351{
221 struct scrub_dev *sdev; 352 struct scrub_ctx *sctx;
222 int i; 353 int i;
223 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 354 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
224 int pages_per_bio; 355 int pages_per_rd_bio;
356 int ret;
225 357
226 pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, 358 /*
227 bio_get_nr_vecs(dev->bdev)); 359 * the setting of pages_per_rd_bio is correct for scrub but might
228 sdev = kzalloc(sizeof(*sdev), GFP_NOFS); 360 * be wrong for the dev_replace code where we might read from
229 if (!sdev) 361 * different devices in the initial huge bios. However, that
362 * code is able to correctly handle the case when adding a page
363 * to a bio fails.
364 */
365 if (dev->bdev)
366 pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
367 bio_get_nr_vecs(dev->bdev));
368 else
369 pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
370 sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
371 if (!sctx)
230 goto nomem; 372 goto nomem;
231 sdev->dev = dev; 373 sctx->is_dev_replace = is_dev_replace;
232 sdev->pages_per_bio = pages_per_bio; 374 sctx->pages_per_rd_bio = pages_per_rd_bio;
233 sdev->curr = -1; 375 sctx->curr = -1;
234 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 376 sctx->dev_root = dev->dev_root;
377 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
235 struct scrub_bio *sbio; 378 struct scrub_bio *sbio;
236 379
237 sbio = kzalloc(sizeof(*sbio), GFP_NOFS); 380 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
238 if (!sbio) 381 if (!sbio)
239 goto nomem; 382 goto nomem;
240 sdev->bios[i] = sbio; 383 sctx->bios[i] = sbio;
241 384
242 sbio->index = i; 385 sbio->index = i;
243 sbio->sdev = sdev; 386 sbio->sctx = sctx;
244 sbio->page_count = 0; 387 sbio->page_count = 0;
245 sbio->work.func = scrub_bio_end_io_worker; 388 sbio->work.func = scrub_bio_end_io_worker;
246 389
247 if (i != SCRUB_BIOS_PER_DEV-1) 390 if (i != SCRUB_BIOS_PER_SCTX - 1)
248 sdev->bios[i]->next_free = i + 1; 391 sctx->bios[i]->next_free = i + 1;
249 else 392 else
250 sdev->bios[i]->next_free = -1; 393 sctx->bios[i]->next_free = -1;
251 } 394 }
252 sdev->first_free = 0; 395 sctx->first_free = 0;
253 sdev->nodesize = dev->dev_root->nodesize; 396 sctx->nodesize = dev->dev_root->nodesize;
254 sdev->leafsize = dev->dev_root->leafsize; 397 sctx->leafsize = dev->dev_root->leafsize;
255 sdev->sectorsize = dev->dev_root->sectorsize; 398 sctx->sectorsize = dev->dev_root->sectorsize;
256 atomic_set(&sdev->in_flight, 0); 399 atomic_set(&sctx->bios_in_flight, 0);
257 atomic_set(&sdev->fixup_cnt, 0); 400 atomic_set(&sctx->workers_pending, 0);
258 atomic_set(&sdev->cancel_req, 0); 401 atomic_set(&sctx->cancel_req, 0);
259 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); 402 sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
260 INIT_LIST_HEAD(&sdev->csum_list); 403 INIT_LIST_HEAD(&sctx->csum_list);
261 404
262 spin_lock_init(&sdev->list_lock); 405 spin_lock_init(&sctx->list_lock);
263 spin_lock_init(&sdev->stat_lock); 406 spin_lock_init(&sctx->stat_lock);
264 init_waitqueue_head(&sdev->list_wait); 407 init_waitqueue_head(&sctx->list_wait);
265 return sdev; 408
409 ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
410 fs_info->dev_replace.tgtdev, is_dev_replace);
411 if (ret) {
412 scrub_free_ctx(sctx);
413 return ERR_PTR(ret);
414 }
415 return sctx;
266 416
267nomem: 417nomem:
268 scrub_free_dev(sdev); 418 scrub_free_ctx(sctx);
269 return ERR_PTR(-ENOMEM); 419 return ERR_PTR(-ENOMEM);
270} 420}
271 421
272static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) 422static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
423 void *warn_ctx)
273{ 424{
274 u64 isize; 425 u64 isize;
275 u32 nlink; 426 u32 nlink;
@@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
277 int i; 428 int i;
278 struct extent_buffer *eb; 429 struct extent_buffer *eb;
279 struct btrfs_inode_item *inode_item; 430 struct btrfs_inode_item *inode_item;
280 struct scrub_warning *swarn = ctx; 431 struct scrub_warning *swarn = warn_ctx;
281 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; 432 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
282 struct inode_fs_paths *ipath = NULL; 433 struct inode_fs_paths *ipath = NULL;
283 struct btrfs_root *local_root; 434 struct btrfs_root *local_root;
@@ -345,8 +496,8 @@ err:
345 496
346static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) 497static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
347{ 498{
348 struct btrfs_device *dev = sblock->sdev->dev; 499 struct btrfs_device *dev;
349 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 500 struct btrfs_fs_info *fs_info;
350 struct btrfs_path *path; 501 struct btrfs_path *path;
351 struct btrfs_key found_key; 502 struct btrfs_key found_key;
352 struct extent_buffer *eb; 503 struct extent_buffer *eb;
@@ -361,15 +512,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
361 const int bufsize = 4096; 512 const int bufsize = 4096;
362 int ret; 513 int ret;
363 514
515 WARN_ON(sblock->page_count < 1);
516 dev = sblock->pagev[0]->dev;
517 fs_info = sblock->sctx->dev_root->fs_info;
518
364 path = btrfs_alloc_path(); 519 path = btrfs_alloc_path();
365 520
366 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); 521 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
367 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); 522 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
368 BUG_ON(sblock->page_count < 1); 523 swarn.sector = (sblock->pagev[0]->physical) >> 9;
369 swarn.sector = (sblock->pagev[0].physical) >> 9; 524 swarn.logical = sblock->pagev[0]->logical;
370 swarn.logical = sblock->pagev[0].logical;
371 swarn.errstr = errstr; 525 swarn.errstr = errstr;
372 swarn.dev = dev; 526 swarn.dev = NULL;
373 swarn.msg_bufsize = bufsize; 527 swarn.msg_bufsize = bufsize;
374 swarn.scratch_bufsize = bufsize; 528 swarn.scratch_bufsize = bufsize;
375 529
@@ -405,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
405 } while (ret != 1); 559 } while (ret != 1);
406 } else { 560 } else {
407 swarn.path = path; 561 swarn.path = path;
562 swarn.dev = dev;
408 iterate_extent_inodes(fs_info, found_key.objectid, 563 iterate_extent_inodes(fs_info, found_key.objectid,
409 extent_item_pos, 1, 564 extent_item_pos, 1,
410 scrub_print_warning_inode, &swarn); 565 scrub_print_warning_inode, &swarn);
@@ -416,11 +571,11 @@ out:
416 kfree(swarn.msg_buf); 571 kfree(swarn.msg_buf);
417} 572}
418 573
419static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) 574static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
420{ 575{
421 struct page *page = NULL; 576 struct page *page = NULL;
422 unsigned long index; 577 unsigned long index;
423 struct scrub_fixup_nodatasum *fixup = ctx; 578 struct scrub_fixup_nodatasum *fixup = fixup_ctx;
424 int ret; 579 int ret;
425 int corrected = 0; 580 int corrected = 0;
426 struct btrfs_key key; 581 struct btrfs_key key;
@@ -451,7 +606,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
451 } 606 }
452 607
453 if (PageUptodate(page)) { 608 if (PageUptodate(page)) {
454 struct btrfs_mapping_tree *map_tree; 609 struct btrfs_fs_info *fs_info;
455 if (PageDirty(page)) { 610 if (PageDirty(page)) {
456 /* 611 /*
457 * we need to write the data to the defect sector. the 612 * we need to write the data to the defect sector. the
@@ -472,8 +627,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
472 ret = -EIO; 627 ret = -EIO;
473 goto out; 628 goto out;
474 } 629 }
475 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 630 fs_info = BTRFS_I(inode)->root->fs_info;
476 ret = repair_io_failure(map_tree, offset, PAGE_SIZE, 631 ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
477 fixup->logical, page, 632 fixup->logical, page,
478 fixup->mirror_num); 633 fixup->mirror_num);
479 unlock_page(page); 634 unlock_page(page);
@@ -530,21 +685,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
530{ 685{
531 int ret; 686 int ret;
532 struct scrub_fixup_nodatasum *fixup; 687 struct scrub_fixup_nodatasum *fixup;
533 struct scrub_dev *sdev; 688 struct scrub_ctx *sctx;
534 struct btrfs_trans_handle *trans = NULL; 689 struct btrfs_trans_handle *trans = NULL;
535 struct btrfs_fs_info *fs_info; 690 struct btrfs_fs_info *fs_info;
536 struct btrfs_path *path; 691 struct btrfs_path *path;
537 int uncorrectable = 0; 692 int uncorrectable = 0;
538 693
539 fixup = container_of(work, struct scrub_fixup_nodatasum, work); 694 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
540 sdev = fixup->sdev; 695 sctx = fixup->sctx;
541 fs_info = fixup->root->fs_info; 696 fs_info = fixup->root->fs_info;
542 697
543 path = btrfs_alloc_path(); 698 path = btrfs_alloc_path();
544 if (!path) { 699 if (!path) {
545 spin_lock(&sdev->stat_lock); 700 spin_lock(&sctx->stat_lock);
546 ++sdev->stat.malloc_errors; 701 ++sctx->stat.malloc_errors;
547 spin_unlock(&sdev->stat_lock); 702 spin_unlock(&sctx->stat_lock);
548 uncorrectable = 1; 703 uncorrectable = 1;
549 goto out; 704 goto out;
550 } 705 }
@@ -573,35 +728,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
573 } 728 }
574 WARN_ON(ret != 1); 729 WARN_ON(ret != 1);
575 730
576 spin_lock(&sdev->stat_lock); 731 spin_lock(&sctx->stat_lock);
577 ++sdev->stat.corrected_errors; 732 ++sctx->stat.corrected_errors;
578 spin_unlock(&sdev->stat_lock); 733 spin_unlock(&sctx->stat_lock);
579 734
580out: 735out:
581 if (trans && !IS_ERR(trans)) 736 if (trans && !IS_ERR(trans))
582 btrfs_end_transaction(trans, fixup->root); 737 btrfs_end_transaction(trans, fixup->root);
583 if (uncorrectable) { 738 if (uncorrectable) {
584 spin_lock(&sdev->stat_lock); 739 spin_lock(&sctx->stat_lock);
585 ++sdev->stat.uncorrectable_errors; 740 ++sctx->stat.uncorrectable_errors;
586 spin_unlock(&sdev->stat_lock); 741 spin_unlock(&sctx->stat_lock);
587 742 btrfs_dev_replace_stats_inc(
743 &sctx->dev_root->fs_info->dev_replace.
744 num_uncorrectable_read_errors);
588 printk_ratelimited_in_rcu(KERN_ERR 745 printk_ratelimited_in_rcu(KERN_ERR
589 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", 746 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
590 (unsigned long long)fixup->logical, 747 (unsigned long long)fixup->logical,
591 rcu_str_deref(sdev->dev->name)); 748 rcu_str_deref(fixup->dev->name));
592 } 749 }
593 750
594 btrfs_free_path(path); 751 btrfs_free_path(path);
595 kfree(fixup); 752 kfree(fixup);
596 753
597 /* see caller why we're pretending to be paused in the scrub counters */ 754 scrub_pending_trans_workers_dec(sctx);
598 mutex_lock(&fs_info->scrub_lock);
599 atomic_dec(&fs_info->scrubs_running);
600 atomic_dec(&fs_info->scrubs_paused);
601 mutex_unlock(&fs_info->scrub_lock);
602 atomic_dec(&sdev->fixup_cnt);
603 wake_up(&fs_info->scrub_pause_wait);
604 wake_up(&sdev->list_wait);
605} 755}
606 756
607/* 757/*
@@ -614,7 +764,8 @@ out:
614 */ 764 */
615static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) 765static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
616{ 766{
617 struct scrub_dev *sdev = sblock_to_check->sdev; 767 struct scrub_ctx *sctx = sblock_to_check->sctx;
768 struct btrfs_device *dev;
618 struct btrfs_fs_info *fs_info; 769 struct btrfs_fs_info *fs_info;
619 u64 length; 770 u64 length;
620 u64 logical; 771 u64 logical;
@@ -633,16 +784,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
633 DEFAULT_RATELIMIT_BURST); 784 DEFAULT_RATELIMIT_BURST);
634 785
635 BUG_ON(sblock_to_check->page_count < 1); 786 BUG_ON(sblock_to_check->page_count < 1);
636 fs_info = sdev->dev->dev_root->fs_info; 787 fs_info = sctx->dev_root->fs_info;
788 if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
789 /*
790 * if we find an error in a super block, we just report it.
791 * They will get written with the next transaction commit
792 * anyway
793 */
794 spin_lock(&sctx->stat_lock);
795 ++sctx->stat.super_errors;
796 spin_unlock(&sctx->stat_lock);
797 return 0;
798 }
637 length = sblock_to_check->page_count * PAGE_SIZE; 799 length = sblock_to_check->page_count * PAGE_SIZE;
638 logical = sblock_to_check->pagev[0].logical; 800 logical = sblock_to_check->pagev[0]->logical;
639 generation = sblock_to_check->pagev[0].generation; 801 generation = sblock_to_check->pagev[0]->generation;
640 BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); 802 BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
641 failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; 803 failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
642 is_metadata = !(sblock_to_check->pagev[0].flags & 804 is_metadata = !(sblock_to_check->pagev[0]->flags &
643 BTRFS_EXTENT_FLAG_DATA); 805 BTRFS_EXTENT_FLAG_DATA);
644 have_csum = sblock_to_check->pagev[0].have_csum; 806 have_csum = sblock_to_check->pagev[0]->have_csum;
645 csum = sblock_to_check->pagev[0].csum; 807 csum = sblock_to_check->pagev[0]->csum;
808 dev = sblock_to_check->pagev[0]->dev;
809
810 if (sctx->is_dev_replace && !is_metadata && !have_csum) {
811 sblocks_for_recheck = NULL;
812 goto nodatasum_case;
813 }
646 814
647 /* 815 /*
648 * read all mirrors one after the other. This includes to 816 * read all mirrors one after the other. This includes to
@@ -677,43 +845,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
677 sizeof(*sblocks_for_recheck), 845 sizeof(*sblocks_for_recheck),
678 GFP_NOFS); 846 GFP_NOFS);
679 if (!sblocks_for_recheck) { 847 if (!sblocks_for_recheck) {
680 spin_lock(&sdev->stat_lock); 848 spin_lock(&sctx->stat_lock);
681 sdev->stat.malloc_errors++; 849 sctx->stat.malloc_errors++;
682 sdev->stat.read_errors++; 850 sctx->stat.read_errors++;
683 sdev->stat.uncorrectable_errors++; 851 sctx->stat.uncorrectable_errors++;
684 spin_unlock(&sdev->stat_lock); 852 spin_unlock(&sctx->stat_lock);
685 btrfs_dev_stat_inc_and_print(sdev->dev, 853 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
686 BTRFS_DEV_STAT_READ_ERRS);
687 goto out; 854 goto out;
688 } 855 }
689 856
690 /* setup the context, map the logical blocks and alloc the pages */ 857 /* setup the context, map the logical blocks and alloc the pages */
691 ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, 858 ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
692 logical, sblocks_for_recheck); 859 logical, sblocks_for_recheck);
693 if (ret) { 860 if (ret) {
694 spin_lock(&sdev->stat_lock); 861 spin_lock(&sctx->stat_lock);
695 sdev->stat.read_errors++; 862 sctx->stat.read_errors++;
696 sdev->stat.uncorrectable_errors++; 863 sctx->stat.uncorrectable_errors++;
697 spin_unlock(&sdev->stat_lock); 864 spin_unlock(&sctx->stat_lock);
698 btrfs_dev_stat_inc_and_print(sdev->dev, 865 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
699 BTRFS_DEV_STAT_READ_ERRS);
700 goto out; 866 goto out;
701 } 867 }
702 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); 868 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
703 sblock_bad = sblocks_for_recheck + failed_mirror_index; 869 sblock_bad = sblocks_for_recheck + failed_mirror_index;
704 870
705 /* build and submit the bios for the failed mirror, check checksums */ 871 /* build and submit the bios for the failed mirror, check checksums */
706 ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 872 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
707 csum, generation, sdev->csum_size); 873 csum, generation, sctx->csum_size);
708 if (ret) {
709 spin_lock(&sdev->stat_lock);
710 sdev->stat.read_errors++;
711 sdev->stat.uncorrectable_errors++;
712 spin_unlock(&sdev->stat_lock);
713 btrfs_dev_stat_inc_and_print(sdev->dev,
714 BTRFS_DEV_STAT_READ_ERRS);
715 goto out;
716 }
717 874
718 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 875 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
719 sblock_bad->no_io_error_seen) { 876 sblock_bad->no_io_error_seen) {
@@ -725,50 +882,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
725 * different bio (usually one of the two latter cases is 882 * different bio (usually one of the two latter cases is
726 * the cause) 883 * the cause)
727 */ 884 */
728 spin_lock(&sdev->stat_lock); 885 spin_lock(&sctx->stat_lock);
729 sdev->stat.unverified_errors++; 886 sctx->stat.unverified_errors++;
730 spin_unlock(&sdev->stat_lock); 887 spin_unlock(&sctx->stat_lock);
731 888
889 if (sctx->is_dev_replace)
890 scrub_write_block_to_dev_replace(sblock_bad);
732 goto out; 891 goto out;
733 } 892 }
734 893
735 if (!sblock_bad->no_io_error_seen) { 894 if (!sblock_bad->no_io_error_seen) {
736 spin_lock(&sdev->stat_lock); 895 spin_lock(&sctx->stat_lock);
737 sdev->stat.read_errors++; 896 sctx->stat.read_errors++;
738 spin_unlock(&sdev->stat_lock); 897 spin_unlock(&sctx->stat_lock);
739 if (__ratelimit(&_rs)) 898 if (__ratelimit(&_rs))
740 scrub_print_warning("i/o error", sblock_to_check); 899 scrub_print_warning("i/o error", sblock_to_check);
741 btrfs_dev_stat_inc_and_print(sdev->dev, 900 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
742 BTRFS_DEV_STAT_READ_ERRS);
743 } else if (sblock_bad->checksum_error) { 901 } else if (sblock_bad->checksum_error) {
744 spin_lock(&sdev->stat_lock); 902 spin_lock(&sctx->stat_lock);
745 sdev->stat.csum_errors++; 903 sctx->stat.csum_errors++;
746 spin_unlock(&sdev->stat_lock); 904 spin_unlock(&sctx->stat_lock);
747 if (__ratelimit(&_rs)) 905 if (__ratelimit(&_rs))
748 scrub_print_warning("checksum error", sblock_to_check); 906 scrub_print_warning("checksum error", sblock_to_check);
749 btrfs_dev_stat_inc_and_print(sdev->dev, 907 btrfs_dev_stat_inc_and_print(dev,
750 BTRFS_DEV_STAT_CORRUPTION_ERRS); 908 BTRFS_DEV_STAT_CORRUPTION_ERRS);
751 } else if (sblock_bad->header_error) { 909 } else if (sblock_bad->header_error) {
752 spin_lock(&sdev->stat_lock); 910 spin_lock(&sctx->stat_lock);
753 sdev->stat.verify_errors++; 911 sctx->stat.verify_errors++;
754 spin_unlock(&sdev->stat_lock); 912 spin_unlock(&sctx->stat_lock);
755 if (__ratelimit(&_rs)) 913 if (__ratelimit(&_rs))
756 scrub_print_warning("checksum/header error", 914 scrub_print_warning("checksum/header error",
757 sblock_to_check); 915 sblock_to_check);
758 if (sblock_bad->generation_error) 916 if (sblock_bad->generation_error)
759 btrfs_dev_stat_inc_and_print(sdev->dev, 917 btrfs_dev_stat_inc_and_print(dev,
760 BTRFS_DEV_STAT_GENERATION_ERRS); 918 BTRFS_DEV_STAT_GENERATION_ERRS);
761 else 919 else
762 btrfs_dev_stat_inc_and_print(sdev->dev, 920 btrfs_dev_stat_inc_and_print(dev,
763 BTRFS_DEV_STAT_CORRUPTION_ERRS); 921 BTRFS_DEV_STAT_CORRUPTION_ERRS);
764 } 922 }
765 923
766 if (sdev->readonly) 924 if (sctx->readonly && !sctx->is_dev_replace)
767 goto did_not_correct_error; 925 goto did_not_correct_error;
768 926
769 if (!is_metadata && !have_csum) { 927 if (!is_metadata && !have_csum) {
770 struct scrub_fixup_nodatasum *fixup_nodatasum; 928 struct scrub_fixup_nodatasum *fixup_nodatasum;
771 929
930nodatasum_case:
931 WARN_ON(sctx->is_dev_replace);
932
772 /* 933 /*
773 * !is_metadata and !have_csum, this means that the data 934 * !is_metadata and !have_csum, this means that the data
774 * might not be COW'ed, that it might be modified 935 * might not be COW'ed, that it might be modified
@@ -779,24 +940,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
779 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); 940 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
780 if (!fixup_nodatasum) 941 if (!fixup_nodatasum)
781 goto did_not_correct_error; 942 goto did_not_correct_error;
782 fixup_nodatasum->sdev = sdev; 943 fixup_nodatasum->sctx = sctx;
944 fixup_nodatasum->dev = dev;
783 fixup_nodatasum->logical = logical; 945 fixup_nodatasum->logical = logical;
784 fixup_nodatasum->root = fs_info->extent_root; 946 fixup_nodatasum->root = fs_info->extent_root;
785 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 947 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
786 /* 948 scrub_pending_trans_workers_inc(sctx);
787 * increment scrubs_running to prevent cancel requests from
788 * completing as long as a fixup worker is running. we must also
789 * increment scrubs_paused to prevent deadlocking on pause
790 * requests used for transactions commits (as the worker uses a
791 * transaction context). it is safe to regard the fixup worker
792 * as paused for all matters practical. effectively, we only
793 * avoid cancellation requests from completing.
794 */
795 mutex_lock(&fs_info->scrub_lock);
796 atomic_inc(&fs_info->scrubs_running);
797 atomic_inc(&fs_info->scrubs_paused);
798 mutex_unlock(&fs_info->scrub_lock);
799 atomic_inc(&sdev->fixup_cnt);
800 fixup_nodatasum->work.func = scrub_fixup_nodatasum; 949 fixup_nodatasum->work.func = scrub_fixup_nodatasum;
801 btrfs_queue_worker(&fs_info->scrub_workers, 950 btrfs_queue_worker(&fs_info->scrub_workers,
802 &fixup_nodatasum->work); 951 &fixup_nodatasum->work);
@@ -805,26 +954,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
805 954
806 /* 955 /*
807 * now build and submit the bios for the other mirrors, check 956 * now build and submit the bios for the other mirrors, check
808 * checksums 957 * checksums.
809 */ 958 * First try to pick the mirror which is completely without I/O
810 for (mirror_index = 0;
811 mirror_index < BTRFS_MAX_MIRRORS &&
812 sblocks_for_recheck[mirror_index].page_count > 0;
813 mirror_index++) {
814 if (mirror_index == failed_mirror_index)
815 continue;
816
817 /* build and submit the bios, check checksums */
818 ret = scrub_recheck_block(fs_info,
819 sblocks_for_recheck + mirror_index,
820 is_metadata, have_csum, csum,
821 generation, sdev->csum_size);
822 if (ret)
823 goto did_not_correct_error;
824 }
825
826 /*
827 * first try to pick the mirror which is completely without I/O
828 * errors and also does not have a checksum error. 959 * errors and also does not have a checksum error.
829 * If one is found, and if a checksum is present, the full block 960 * If one is found, and if a checksum is present, the full block
830 * that is known to contain an error is rewritten. Afterwards 961 * that is known to contain an error is rewritten. Afterwards
@@ -840,24 +971,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
840 mirror_index < BTRFS_MAX_MIRRORS && 971 mirror_index < BTRFS_MAX_MIRRORS &&
841 sblocks_for_recheck[mirror_index].page_count > 0; 972 sblocks_for_recheck[mirror_index].page_count > 0;
842 mirror_index++) { 973 mirror_index++) {
843 struct scrub_block *sblock_other = sblocks_for_recheck + 974 struct scrub_block *sblock_other;
844 mirror_index; 975
976 if (mirror_index == failed_mirror_index)
977 continue;
978 sblock_other = sblocks_for_recheck + mirror_index;
979
980 /* build and submit the bios, check checksums */
981 scrub_recheck_block(fs_info, sblock_other, is_metadata,
982 have_csum, csum, generation,
983 sctx->csum_size);
845 984
846 if (!sblock_other->header_error && 985 if (!sblock_other->header_error &&
847 !sblock_other->checksum_error && 986 !sblock_other->checksum_error &&
848 sblock_other->no_io_error_seen) { 987 sblock_other->no_io_error_seen) {
849 int force_write = is_metadata || have_csum; 988 if (sctx->is_dev_replace) {
850 989 scrub_write_block_to_dev_replace(sblock_other);
851 ret = scrub_repair_block_from_good_copy(sblock_bad, 990 } else {
852 sblock_other, 991 int force_write = is_metadata || have_csum;
853 force_write); 992
993 ret = scrub_repair_block_from_good_copy(
994 sblock_bad, sblock_other,
995 force_write);
996 }
854 if (0 == ret) 997 if (0 == ret)
855 goto corrected_error; 998 goto corrected_error;
856 } 999 }
857 } 1000 }
858 1001
859 /* 1002 /*
860 * in case of I/O errors in the area that is supposed to be 1003 * for dev_replace, pick good pages and write to the target device.
1004 */
1005 if (sctx->is_dev_replace) {
1006 success = 1;
1007 for (page_num = 0; page_num < sblock_bad->page_count;
1008 page_num++) {
1009 int sub_success;
1010
1011 sub_success = 0;
1012 for (mirror_index = 0;
1013 mirror_index < BTRFS_MAX_MIRRORS &&
1014 sblocks_for_recheck[mirror_index].page_count > 0;
1015 mirror_index++) {
1016 struct scrub_block *sblock_other =
1017 sblocks_for_recheck + mirror_index;
1018 struct scrub_page *page_other =
1019 sblock_other->pagev[page_num];
1020
1021 if (!page_other->io_error) {
1022 ret = scrub_write_page_to_dev_replace(
1023 sblock_other, page_num);
1024 if (ret == 0) {
1025 /* succeeded for this page */
1026 sub_success = 1;
1027 break;
1028 } else {
1029 btrfs_dev_replace_stats_inc(
1030 &sctx->dev_root->
1031 fs_info->dev_replace.
1032 num_write_errors);
1033 }
1034 }
1035 }
1036
1037 if (!sub_success) {
1038 /*
1039 * did not find a mirror to fetch the page
1040 * from. scrub_write_page_to_dev_replace()
1041 * handles this case (page->io_error), by
1042 * filling the block with zeros before
1043 * submitting the write request
1044 */
1045 success = 0;
1046 ret = scrub_write_page_to_dev_replace(
1047 sblock_bad, page_num);
1048 if (ret)
1049 btrfs_dev_replace_stats_inc(
1050 &sctx->dev_root->fs_info->
1051 dev_replace.num_write_errors);
1052 }
1053 }
1054
1055 goto out;
1056 }
1057
1058 /*
1059 * for regular scrub, repair those pages that are errored.
1060 * In case of I/O errors in the area that is supposed to be
861 * repaired, continue by picking good copies of those pages. 1061 * repaired, continue by picking good copies of those pages.
862 * Select the good pages from mirrors to rewrite bad pages from 1062 * Select the good pages from mirrors to rewrite bad pages from
863 * the area to fix. Afterwards verify the checksum of the block 1063 * the area to fix. Afterwards verify the checksum of the block
@@ -887,7 +1087,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
887 1087
888 success = 1; 1088 success = 1;
889 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { 1089 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
890 struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1090 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
891 1091
892 if (!page_bad->io_error) 1092 if (!page_bad->io_error)
893 continue; 1093 continue;
@@ -898,8 +1098,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
898 mirror_index++) { 1098 mirror_index++) {
899 struct scrub_block *sblock_other = sblocks_for_recheck + 1099 struct scrub_block *sblock_other = sblocks_for_recheck +
900 mirror_index; 1100 mirror_index;
901 struct scrub_page *page_other = sblock_other->pagev + 1101 struct scrub_page *page_other = sblock_other->pagev[
902 page_num; 1102 page_num];
903 1103
904 if (!page_other->io_error) { 1104 if (!page_other->io_error) {
905 ret = scrub_repair_page_from_good_copy( 1105 ret = scrub_repair_page_from_good_copy(
@@ -928,10 +1128,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
928 * is verified, but most likely the data comes out 1128 * is verified, but most likely the data comes out
929 * of the page cache. 1129 * of the page cache.
930 */ 1130 */
931 ret = scrub_recheck_block(fs_info, sblock_bad, 1131 scrub_recheck_block(fs_info, sblock_bad,
932 is_metadata, have_csum, csum, 1132 is_metadata, have_csum, csum,
933 generation, sdev->csum_size); 1133 generation, sctx->csum_size);
934 if (!ret && !sblock_bad->header_error && 1134 if (!sblock_bad->header_error &&
935 !sblock_bad->checksum_error && 1135 !sblock_bad->checksum_error &&
936 sblock_bad->no_io_error_seen) 1136 sblock_bad->no_io_error_seen)
937 goto corrected_error; 1137 goto corrected_error;
@@ -939,23 +1139,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
939 goto did_not_correct_error; 1139 goto did_not_correct_error;
940 } else { 1140 } else {
941corrected_error: 1141corrected_error:
942 spin_lock(&sdev->stat_lock); 1142 spin_lock(&sctx->stat_lock);
943 sdev->stat.corrected_errors++; 1143 sctx->stat.corrected_errors++;
944 spin_unlock(&sdev->stat_lock); 1144 spin_unlock(&sctx->stat_lock);
945 printk_ratelimited_in_rcu(KERN_ERR 1145 printk_ratelimited_in_rcu(KERN_ERR
946 "btrfs: fixed up error at logical %llu on dev %s\n", 1146 "btrfs: fixed up error at logical %llu on dev %s\n",
947 (unsigned long long)logical, 1147 (unsigned long long)logical,
948 rcu_str_deref(sdev->dev->name)); 1148 rcu_str_deref(dev->name));
949 } 1149 }
950 } else { 1150 } else {
951did_not_correct_error: 1151did_not_correct_error:
952 spin_lock(&sdev->stat_lock); 1152 spin_lock(&sctx->stat_lock);
953 sdev->stat.uncorrectable_errors++; 1153 sctx->stat.uncorrectable_errors++;
954 spin_unlock(&sdev->stat_lock); 1154 spin_unlock(&sctx->stat_lock);
955 printk_ratelimited_in_rcu(KERN_ERR 1155 printk_ratelimited_in_rcu(KERN_ERR
956 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", 1156 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
957 (unsigned long long)logical, 1157 (unsigned long long)logical,
958 rcu_str_deref(sdev->dev->name)); 1158 rcu_str_deref(dev->name));
959 } 1159 }
960 1160
961out: 1161out:
@@ -966,11 +1166,11 @@ out:
966 mirror_index; 1166 mirror_index;
967 int page_index; 1167 int page_index;
968 1168
969 for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; 1169 for (page_index = 0; page_index < sblock->page_count;
970 page_index++) 1170 page_index++) {
971 if (sblock->pagev[page_index].page) 1171 sblock->pagev[page_index]->sblock = NULL;
972 __free_page( 1172 scrub_page_put(sblock->pagev[page_index]);
973 sblock->pagev[page_index].page); 1173 }
974 } 1174 }
975 kfree(sblocks_for_recheck); 1175 kfree(sblocks_for_recheck);
976 } 1176 }
@@ -978,8 +1178,9 @@ out:
978 return 0; 1178 return 0;
979} 1179}
980 1180
981static int scrub_setup_recheck_block(struct scrub_dev *sdev, 1181static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
982 struct btrfs_mapping_tree *map_tree, 1182 struct btrfs_fs_info *fs_info,
1183 struct scrub_block *original_sblock,
983 u64 length, u64 logical, 1184 u64 length, u64 logical,
984 struct scrub_block *sblocks_for_recheck) 1185 struct scrub_block *sblocks_for_recheck)
985{ 1186{
@@ -988,7 +1189,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
988 int ret; 1189 int ret;
989 1190
990 /* 1191 /*
991 * note: the three members sdev, ref_count and outstanding_pages 1192 * note: the two members ref_count and outstanding_pages
992 * are not used (and not set) in the blocks that are used for 1193 * are not used (and not set) in the blocks that are used for
993 * the recheck procedure 1194 * the recheck procedure
994 */ 1195 */
@@ -1003,14 +1204,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1003 * with a length of PAGE_SIZE, each returned stripe 1204 * with a length of PAGE_SIZE, each returned stripe
1004 * represents one mirror 1205 * represents one mirror
1005 */ 1206 */
1006 ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, 1207 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
1007 &bbio, 0); 1208 &mapped_length, &bbio, 0);
1008 if (ret || !bbio || mapped_length < sublen) { 1209 if (ret || !bbio || mapped_length < sublen) {
1009 kfree(bbio); 1210 kfree(bbio);
1010 return -EIO; 1211 return -EIO;
1011 } 1212 }
1012 1213
1013 BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); 1214 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1014 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; 1215 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1015 mirror_index++) { 1216 mirror_index++) {
1016 struct scrub_block *sblock; 1217 struct scrub_block *sblock;
@@ -1020,21 +1221,31 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1020 continue; 1221 continue;
1021 1222
1022 sblock = sblocks_for_recheck + mirror_index; 1223 sblock = sblocks_for_recheck + mirror_index;
1023 page = sblock->pagev + page_index; 1224 sblock->sctx = sctx;
1225 page = kzalloc(sizeof(*page), GFP_NOFS);
1226 if (!page) {
1227leave_nomem:
1228 spin_lock(&sctx->stat_lock);
1229 sctx->stat.malloc_errors++;
1230 spin_unlock(&sctx->stat_lock);
1231 kfree(bbio);
1232 return -ENOMEM;
1233 }
1234 scrub_page_get(page);
1235 sblock->pagev[page_index] = page;
1024 page->logical = logical; 1236 page->logical = logical;
1025 page->physical = bbio->stripes[mirror_index].physical; 1237 page->physical = bbio->stripes[mirror_index].physical;
1238 BUG_ON(page_index >= original_sblock->page_count);
1239 page->physical_for_dev_replace =
1240 original_sblock->pagev[page_index]->
1241 physical_for_dev_replace;
1026 /* for missing devices, dev->bdev is NULL */ 1242 /* for missing devices, dev->bdev is NULL */
1027 page->dev = bbio->stripes[mirror_index].dev; 1243 page->dev = bbio->stripes[mirror_index].dev;
1028 page->mirror_num = mirror_index + 1; 1244 page->mirror_num = mirror_index + 1;
1029 page->page = alloc_page(GFP_NOFS);
1030 if (!page->page) {
1031 spin_lock(&sdev->stat_lock);
1032 sdev->stat.malloc_errors++;
1033 spin_unlock(&sdev->stat_lock);
1034 kfree(bbio);
1035 return -ENOMEM;
1036 }
1037 sblock->page_count++; 1245 sblock->page_count++;
1246 page->page = alloc_page(GFP_NOFS);
1247 if (!page->page)
1248 goto leave_nomem;
1038 } 1249 }
1039 kfree(bbio); 1250 kfree(bbio);
1040 length -= sublen; 1251 length -= sublen;
@@ -1052,10 +1263,10 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1052 * to take those pages that are not errored from all the mirrors so that 1263 * to take those pages that are not errored from all the mirrors so that
1053 * the pages that are errored in the just handled mirror can be repaired. 1264 * the pages that are errored in the just handled mirror can be repaired.
1054 */ 1265 */
1055static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 1266static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1056 struct scrub_block *sblock, int is_metadata, 1267 struct scrub_block *sblock, int is_metadata,
1057 int have_csum, u8 *csum, u64 generation, 1268 int have_csum, u8 *csum, u64 generation,
1058 u16 csum_size) 1269 u16 csum_size)
1059{ 1270{
1060 int page_num; 1271 int page_num;
1061 1272
@@ -1065,8 +1276,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1065 1276
1066 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1277 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1067 struct bio *bio; 1278 struct bio *bio;
1068 int ret; 1279 struct scrub_page *page = sblock->pagev[page_num];
1069 struct scrub_page *page = sblock->pagev + page_num;
1070 DECLARE_COMPLETION_ONSTACK(complete); 1280 DECLARE_COMPLETION_ONSTACK(complete);
1071 1281
1072 if (page->dev->bdev == NULL) { 1282 if (page->dev->bdev == NULL) {
@@ -1075,20 +1285,19 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1075 continue; 1285 continue;
1076 } 1286 }
1077 1287
1078 BUG_ON(!page->page); 1288 WARN_ON(!page->page);
1079 bio = bio_alloc(GFP_NOFS, 1); 1289 bio = bio_alloc(GFP_NOFS, 1);
1080 if (!bio) 1290 if (!bio) {
1081 return -EIO; 1291 page->io_error = 1;
1292 sblock->no_io_error_seen = 0;
1293 continue;
1294 }
1082 bio->bi_bdev = page->dev->bdev; 1295 bio->bi_bdev = page->dev->bdev;
1083 bio->bi_sector = page->physical >> 9; 1296 bio->bi_sector = page->physical >> 9;
1084 bio->bi_end_io = scrub_complete_bio_end_io; 1297 bio->bi_end_io = scrub_complete_bio_end_io;
1085 bio->bi_private = &complete; 1298 bio->bi_private = &complete;
1086 1299
1087 ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); 1300 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1088 if (PAGE_SIZE != ret) {
1089 bio_put(bio);
1090 return -EIO;
1091 }
1092 btrfsic_submit_bio(READ, bio); 1301 btrfsic_submit_bio(READ, bio);
1093 1302
1094 /* this will also unplug the queue */ 1303 /* this will also unplug the queue */
@@ -1105,7 +1314,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1105 have_csum, csum, generation, 1314 have_csum, csum, generation,
1106 csum_size); 1315 csum_size);
1107 1316
1108 return 0; 1317 return;
1109} 1318}
1110 1319
1111static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 1320static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
@@ -1120,14 +1329,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1120 struct btrfs_root *root = fs_info->extent_root; 1329 struct btrfs_root *root = fs_info->extent_root;
1121 void *mapped_buffer; 1330 void *mapped_buffer;
1122 1331
1123 BUG_ON(!sblock->pagev[0].page); 1332 WARN_ON(!sblock->pagev[0]->page);
1124 if (is_metadata) { 1333 if (is_metadata) {
1125 struct btrfs_header *h; 1334 struct btrfs_header *h;
1126 1335
1127 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1336 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1128 h = (struct btrfs_header *)mapped_buffer; 1337 h = (struct btrfs_header *)mapped_buffer;
1129 1338
1130 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || 1339 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
1131 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1340 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1132 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1341 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1133 BTRFS_UUID_SIZE)) { 1342 BTRFS_UUID_SIZE)) {
@@ -1141,7 +1350,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1141 if (!have_csum) 1350 if (!have_csum)
1142 return; 1351 return;
1143 1352
1144 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1353 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1145 } 1354 }
1146 1355
1147 for (page_num = 0;;) { 1356 for (page_num = 0;;) {
@@ -1157,9 +1366,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1157 page_num++; 1366 page_num++;
1158 if (page_num >= sblock->page_count) 1367 if (page_num >= sblock->page_count)
1159 break; 1368 break;
1160 BUG_ON(!sblock->pagev[page_num].page); 1369 WARN_ON(!sblock->pagev[page_num]->page);
1161 1370
1162 mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); 1371 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1163 } 1372 }
1164 1373
1165 btrfs_csum_final(crc, calculated_csum); 1374 btrfs_csum_final(crc, calculated_csum);
@@ -1197,17 +1406,23 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1197 struct scrub_block *sblock_good, 1406 struct scrub_block *sblock_good,
1198 int page_num, int force_write) 1407 int page_num, int force_write)
1199{ 1408{
1200 struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1409 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1201 struct scrub_page *page_good = sblock_good->pagev + page_num; 1410 struct scrub_page *page_good = sblock_good->pagev[page_num];
1202 1411
1203 BUG_ON(sblock_bad->pagev[page_num].page == NULL); 1412 BUG_ON(page_bad->page == NULL);
1204 BUG_ON(sblock_good->pagev[page_num].page == NULL); 1413 BUG_ON(page_good->page == NULL);
1205 if (force_write || sblock_bad->header_error || 1414 if (force_write || sblock_bad->header_error ||
1206 sblock_bad->checksum_error || page_bad->io_error) { 1415 sblock_bad->checksum_error || page_bad->io_error) {
1207 struct bio *bio; 1416 struct bio *bio;
1208 int ret; 1417 int ret;
1209 DECLARE_COMPLETION_ONSTACK(complete); 1418 DECLARE_COMPLETION_ONSTACK(complete);
1210 1419
1420 if (!page_bad->dev->bdev) {
1421 printk_ratelimited(KERN_WARNING
1422 "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
1423 return -EIO;
1424 }
1425
1211 bio = bio_alloc(GFP_NOFS, 1); 1426 bio = bio_alloc(GFP_NOFS, 1);
1212 if (!bio) 1427 if (!bio)
1213 return -EIO; 1428 return -EIO;
@@ -1228,6 +1443,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1228 if (!bio_flagged(bio, BIO_UPTODATE)) { 1443 if (!bio_flagged(bio, BIO_UPTODATE)) {
1229 btrfs_dev_stat_inc_and_print(page_bad->dev, 1444 btrfs_dev_stat_inc_and_print(page_bad->dev,
1230 BTRFS_DEV_STAT_WRITE_ERRS); 1445 BTRFS_DEV_STAT_WRITE_ERRS);
1446 btrfs_dev_replace_stats_inc(
1447 &sblock_bad->sctx->dev_root->fs_info->
1448 dev_replace.num_write_errors);
1231 bio_put(bio); 1449 bio_put(bio);
1232 return -EIO; 1450 return -EIO;
1233 } 1451 }
@@ -1237,13 +1455,174 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1237 return 0; 1455 return 0;
1238} 1456}
1239 1457
1240static void scrub_checksum(struct scrub_block *sblock) 1458static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1459{
1460 int page_num;
1461
1462 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1463 int ret;
1464
1465 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1466 if (ret)
1467 btrfs_dev_replace_stats_inc(
1468 &sblock->sctx->dev_root->fs_info->dev_replace.
1469 num_write_errors);
1470 }
1471}
1472
1473static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1474 int page_num)
1475{
1476 struct scrub_page *spage = sblock->pagev[page_num];
1477
1478 BUG_ON(spage->page == NULL);
1479 if (spage->io_error) {
1480 void *mapped_buffer = kmap_atomic(spage->page);
1481
1482 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1483 flush_dcache_page(spage->page);
1484 kunmap_atomic(mapped_buffer);
1485 }
1486 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1487}
1488
1489static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1490 struct scrub_page *spage)
1491{
1492 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1493 struct scrub_bio *sbio;
1494 int ret;
1495
1496 mutex_lock(&wr_ctx->wr_lock);
1497again:
1498 if (!wr_ctx->wr_curr_bio) {
1499 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1500 GFP_NOFS);
1501 if (!wr_ctx->wr_curr_bio) {
1502 mutex_unlock(&wr_ctx->wr_lock);
1503 return -ENOMEM;
1504 }
1505 wr_ctx->wr_curr_bio->sctx = sctx;
1506 wr_ctx->wr_curr_bio->page_count = 0;
1507 }
1508 sbio = wr_ctx->wr_curr_bio;
1509 if (sbio->page_count == 0) {
1510 struct bio *bio;
1511
1512 sbio->physical = spage->physical_for_dev_replace;
1513 sbio->logical = spage->logical;
1514 sbio->dev = wr_ctx->tgtdev;
1515 bio = sbio->bio;
1516 if (!bio) {
1517 bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1518 if (!bio) {
1519 mutex_unlock(&wr_ctx->wr_lock);
1520 return -ENOMEM;
1521 }
1522 sbio->bio = bio;
1523 }
1524
1525 bio->bi_private = sbio;
1526 bio->bi_end_io = scrub_wr_bio_end_io;
1527 bio->bi_bdev = sbio->dev->bdev;
1528 bio->bi_sector = sbio->physical >> 9;
1529 sbio->err = 0;
1530 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1531 spage->physical_for_dev_replace ||
1532 sbio->logical + sbio->page_count * PAGE_SIZE !=
1533 spage->logical) {
1534 scrub_wr_submit(sctx);
1535 goto again;
1536 }
1537
1538 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1539 if (ret != PAGE_SIZE) {
1540 if (sbio->page_count < 1) {
1541 bio_put(sbio->bio);
1542 sbio->bio = NULL;
1543 mutex_unlock(&wr_ctx->wr_lock);
1544 return -EIO;
1545 }
1546 scrub_wr_submit(sctx);
1547 goto again;
1548 }
1549
1550 sbio->pagev[sbio->page_count] = spage;
1551 scrub_page_get(spage);
1552 sbio->page_count++;
1553 if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1554 scrub_wr_submit(sctx);
1555 mutex_unlock(&wr_ctx->wr_lock);
1556
1557 return 0;
1558}
1559
1560static void scrub_wr_submit(struct scrub_ctx *sctx)
1561{
1562 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1563 struct scrub_bio *sbio;
1564
1565 if (!wr_ctx->wr_curr_bio)
1566 return;
1567
1568 sbio = wr_ctx->wr_curr_bio;
1569 wr_ctx->wr_curr_bio = NULL;
1570 WARN_ON(!sbio->bio->bi_bdev);
1571 scrub_pending_bio_inc(sctx);
1572 /* process all writes in a single worker thread. Then the block layer
1573 * orders the requests before sending them to the driver which
1574 * doubled the write performance on spinning disks when measured
1575 * with Linux 3.5 */
1576 btrfsic_submit_bio(WRITE, sbio->bio);
1577}
1578
1579static void scrub_wr_bio_end_io(struct bio *bio, int err)
1580{
1581 struct scrub_bio *sbio = bio->bi_private;
1582 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1583
1584 sbio->err = err;
1585 sbio->bio = bio;
1586
1587 sbio->work.func = scrub_wr_bio_end_io_worker;
1588 btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
1589}
1590
1591static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1592{
1593 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1594 struct scrub_ctx *sctx = sbio->sctx;
1595 int i;
1596
1597 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1598 if (sbio->err) {
1599 struct btrfs_dev_replace *dev_replace =
1600 &sbio->sctx->dev_root->fs_info->dev_replace;
1601
1602 for (i = 0; i < sbio->page_count; i++) {
1603 struct scrub_page *spage = sbio->pagev[i];
1604
1605 spage->io_error = 1;
1606 btrfs_dev_replace_stats_inc(&dev_replace->
1607 num_write_errors);
1608 }
1609 }
1610
1611 for (i = 0; i < sbio->page_count; i++)
1612 scrub_page_put(sbio->pagev[i]);
1613
1614 bio_put(sbio->bio);
1615 kfree(sbio);
1616 scrub_pending_bio_dec(sctx);
1617}
1618
1619static int scrub_checksum(struct scrub_block *sblock)
1241{ 1620{
1242 u64 flags; 1621 u64 flags;
1243 int ret; 1622 int ret;
1244 1623
1245 BUG_ON(sblock->page_count < 1); 1624 WARN_ON(sblock->page_count < 1);
1246 flags = sblock->pagev[0].flags; 1625 flags = sblock->pagev[0]->flags;
1247 ret = 0; 1626 ret = 0;
1248 if (flags & BTRFS_EXTENT_FLAG_DATA) 1627 if (flags & BTRFS_EXTENT_FLAG_DATA)
1249 ret = scrub_checksum_data(sblock); 1628 ret = scrub_checksum_data(sblock);
@@ -1255,30 +1634,32 @@ static void scrub_checksum(struct scrub_block *sblock)
1255 WARN_ON(1); 1634 WARN_ON(1);
1256 if (ret) 1635 if (ret)
1257 scrub_handle_errored_block(sblock); 1636 scrub_handle_errored_block(sblock);
1637
1638 return ret;
1258} 1639}
1259 1640
1260static int scrub_checksum_data(struct scrub_block *sblock) 1641static int scrub_checksum_data(struct scrub_block *sblock)
1261{ 1642{
1262 struct scrub_dev *sdev = sblock->sdev; 1643 struct scrub_ctx *sctx = sblock->sctx;
1263 u8 csum[BTRFS_CSUM_SIZE]; 1644 u8 csum[BTRFS_CSUM_SIZE];
1264 u8 *on_disk_csum; 1645 u8 *on_disk_csum;
1265 struct page *page; 1646 struct page *page;
1266 void *buffer; 1647 void *buffer;
1267 u32 crc = ~(u32)0; 1648 u32 crc = ~(u32)0;
1268 int fail = 0; 1649 int fail = 0;
1269 struct btrfs_root *root = sdev->dev->dev_root; 1650 struct btrfs_root *root = sctx->dev_root;
1270 u64 len; 1651 u64 len;
1271 int index; 1652 int index;
1272 1653
1273 BUG_ON(sblock->page_count < 1); 1654 BUG_ON(sblock->page_count < 1);
1274 if (!sblock->pagev[0].have_csum) 1655 if (!sblock->pagev[0]->have_csum)
1275 return 0; 1656 return 0;
1276 1657
1277 on_disk_csum = sblock->pagev[0].csum; 1658 on_disk_csum = sblock->pagev[0]->csum;
1278 page = sblock->pagev[0].page; 1659 page = sblock->pagev[0]->page;
1279 buffer = kmap_atomic(page); 1660 buffer = kmap_atomic(page);
1280 1661
1281 len = sdev->sectorsize; 1662 len = sctx->sectorsize;
1282 index = 0; 1663 index = 0;
1283 for (;;) { 1664 for (;;) {
1284 u64 l = min_t(u64, len, PAGE_SIZE); 1665 u64 l = min_t(u64, len, PAGE_SIZE);
@@ -1290,13 +1671,13 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1290 break; 1671 break;
1291 index++; 1672 index++;
1292 BUG_ON(index >= sblock->page_count); 1673 BUG_ON(index >= sblock->page_count);
1293 BUG_ON(!sblock->pagev[index].page); 1674 BUG_ON(!sblock->pagev[index]->page);
1294 page = sblock->pagev[index].page; 1675 page = sblock->pagev[index]->page;
1295 buffer = kmap_atomic(page); 1676 buffer = kmap_atomic(page);
1296 } 1677 }
1297 1678
1298 btrfs_csum_final(crc, csum); 1679 btrfs_csum_final(crc, csum);
1299 if (memcmp(csum, on_disk_csum, sdev->csum_size)) 1680 if (memcmp(csum, on_disk_csum, sctx->csum_size))
1300 fail = 1; 1681 fail = 1;
1301 1682
1302 return fail; 1683 return fail;
@@ -1304,9 +1685,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1304 1685
1305static int scrub_checksum_tree_block(struct scrub_block *sblock) 1686static int scrub_checksum_tree_block(struct scrub_block *sblock)
1306{ 1687{
1307 struct scrub_dev *sdev = sblock->sdev; 1688 struct scrub_ctx *sctx = sblock->sctx;
1308 struct btrfs_header *h; 1689 struct btrfs_header *h;
1309 struct btrfs_root *root = sdev->dev->dev_root; 1690 struct btrfs_root *root = sctx->dev_root;
1310 struct btrfs_fs_info *fs_info = root->fs_info; 1691 struct btrfs_fs_info *fs_info = root->fs_info;
1311 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1692 u8 calculated_csum[BTRFS_CSUM_SIZE];
1312 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1693 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1321,10 +1702,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1321 int index; 1702 int index;
1322 1703
1323 BUG_ON(sblock->page_count < 1); 1704 BUG_ON(sblock->page_count < 1);
1324 page = sblock->pagev[0].page; 1705 page = sblock->pagev[0]->page;
1325 mapped_buffer = kmap_atomic(page); 1706 mapped_buffer = kmap_atomic(page);
1326 h = (struct btrfs_header *)mapped_buffer; 1707 h = (struct btrfs_header *)mapped_buffer;
1327 memcpy(on_disk_csum, h->csum, sdev->csum_size); 1708 memcpy(on_disk_csum, h->csum, sctx->csum_size);
1328 1709
1329 /* 1710 /*
1330 * we don't use the getter functions here, as we 1711 * we don't use the getter functions here, as we
@@ -1332,10 +1713,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1332 * b) the page is already kmapped 1713 * b) the page is already kmapped
1333 */ 1714 */
1334 1715
1335 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) 1716 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
1336 ++fail; 1717 ++fail;
1337 1718
1338 if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) 1719 if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
1339 ++fail; 1720 ++fail;
1340 1721
1341 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1722 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1345,8 +1726,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1345 BTRFS_UUID_SIZE)) 1726 BTRFS_UUID_SIZE))
1346 ++fail; 1727 ++fail;
1347 1728
1348 BUG_ON(sdev->nodesize != sdev->leafsize); 1729 WARN_ON(sctx->nodesize != sctx->leafsize);
1349 len = sdev->nodesize - BTRFS_CSUM_SIZE; 1730 len = sctx->nodesize - BTRFS_CSUM_SIZE;
1350 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1731 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1351 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1732 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1352 index = 0; 1733 index = 0;
@@ -1360,15 +1741,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1360 break; 1741 break;
1361 index++; 1742 index++;
1362 BUG_ON(index >= sblock->page_count); 1743 BUG_ON(index >= sblock->page_count);
1363 BUG_ON(!sblock->pagev[index].page); 1744 BUG_ON(!sblock->pagev[index]->page);
1364 page = sblock->pagev[index].page; 1745 page = sblock->pagev[index]->page;
1365 mapped_buffer = kmap_atomic(page); 1746 mapped_buffer = kmap_atomic(page);
1366 mapped_size = PAGE_SIZE; 1747 mapped_size = PAGE_SIZE;
1367 p = mapped_buffer; 1748 p = mapped_buffer;
1368 } 1749 }
1369 1750
1370 btrfs_csum_final(crc, calculated_csum); 1751 btrfs_csum_final(crc, calculated_csum);
1371 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1752 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1372 ++crc_fail; 1753 ++crc_fail;
1373 1754
1374 return fail || crc_fail; 1755 return fail || crc_fail;
@@ -1377,8 +1758,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1377static int scrub_checksum_super(struct scrub_block *sblock) 1758static int scrub_checksum_super(struct scrub_block *sblock)
1378{ 1759{
1379 struct btrfs_super_block *s; 1760 struct btrfs_super_block *s;
1380 struct scrub_dev *sdev = sblock->sdev; 1761 struct scrub_ctx *sctx = sblock->sctx;
1381 struct btrfs_root *root = sdev->dev->dev_root; 1762 struct btrfs_root *root = sctx->dev_root;
1382 struct btrfs_fs_info *fs_info = root->fs_info; 1763 struct btrfs_fs_info *fs_info = root->fs_info;
1383 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1764 u8 calculated_csum[BTRFS_CSUM_SIZE];
1384 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1765 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1393,15 +1774,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1393 int index; 1774 int index;
1394 1775
1395 BUG_ON(sblock->page_count < 1); 1776 BUG_ON(sblock->page_count < 1);
1396 page = sblock->pagev[0].page; 1777 page = sblock->pagev[0]->page;
1397 mapped_buffer = kmap_atomic(page); 1778 mapped_buffer = kmap_atomic(page);
1398 s = (struct btrfs_super_block *)mapped_buffer; 1779 s = (struct btrfs_super_block *)mapped_buffer;
1399 memcpy(on_disk_csum, s->csum, sdev->csum_size); 1780 memcpy(on_disk_csum, s->csum, sctx->csum_size);
1400 1781
1401 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1782 if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
1402 ++fail_cor; 1783 ++fail_cor;
1403 1784
1404 if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1785 if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
1405 ++fail_gen; 1786 ++fail_gen;
1406 1787
1407 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1788 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1421,15 +1802,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1421 break; 1802 break;
1422 index++; 1803 index++;
1423 BUG_ON(index >= sblock->page_count); 1804 BUG_ON(index >= sblock->page_count);
1424 BUG_ON(!sblock->pagev[index].page); 1805 BUG_ON(!sblock->pagev[index]->page);
1425 page = sblock->pagev[index].page; 1806 page = sblock->pagev[index]->page;
1426 mapped_buffer = kmap_atomic(page); 1807 mapped_buffer = kmap_atomic(page);
1427 mapped_size = PAGE_SIZE; 1808 mapped_size = PAGE_SIZE;
1428 p = mapped_buffer; 1809 p = mapped_buffer;
1429 } 1810 }
1430 1811
1431 btrfs_csum_final(crc, calculated_csum); 1812 btrfs_csum_final(crc, calculated_csum);
1432 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1813 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1433 ++fail_cor; 1814 ++fail_cor;
1434 1815
1435 if (fail_cor + fail_gen) { 1816 if (fail_cor + fail_gen) {
@@ -1438,14 +1819,14 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1438 * They will get written with the next transaction commit 1819 * They will get written with the next transaction commit
1439 * anyway 1820 * anyway
1440 */ 1821 */
1441 spin_lock(&sdev->stat_lock); 1822 spin_lock(&sctx->stat_lock);
1442 ++sdev->stat.super_errors; 1823 ++sctx->stat.super_errors;
1443 spin_unlock(&sdev->stat_lock); 1824 spin_unlock(&sctx->stat_lock);
1444 if (fail_cor) 1825 if (fail_cor)
1445 btrfs_dev_stat_inc_and_print(sdev->dev, 1826 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1446 BTRFS_DEV_STAT_CORRUPTION_ERRS); 1827 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1447 else 1828 else
1448 btrfs_dev_stat_inc_and_print(sdev->dev, 1829 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1449 BTRFS_DEV_STAT_GENERATION_ERRS); 1830 BTRFS_DEV_STAT_GENERATION_ERRS);
1450 } 1831 }
1451 1832
@@ -1463,28 +1844,54 @@ static void scrub_block_put(struct scrub_block *sblock)
1463 int i; 1844 int i;
1464 1845
1465 for (i = 0; i < sblock->page_count; i++) 1846 for (i = 0; i < sblock->page_count; i++)
1466 if (sblock->pagev[i].page) 1847 scrub_page_put(sblock->pagev[i]);
1467 __free_page(sblock->pagev[i].page);
1468 kfree(sblock); 1848 kfree(sblock);
1469 } 1849 }
1470} 1850}
1471 1851
1472static void scrub_submit(struct scrub_dev *sdev) 1852static void scrub_page_get(struct scrub_page *spage)
1853{
1854 atomic_inc(&spage->ref_count);
1855}
1856
1857static void scrub_page_put(struct scrub_page *spage)
1858{
1859 if (atomic_dec_and_test(&spage->ref_count)) {
1860 if (spage->page)
1861 __free_page(spage->page);
1862 kfree(spage);
1863 }
1864}
1865
1866static void scrub_submit(struct scrub_ctx *sctx)
1473{ 1867{
1474 struct scrub_bio *sbio; 1868 struct scrub_bio *sbio;
1475 1869
1476 if (sdev->curr == -1) 1870 if (sctx->curr == -1)
1477 return; 1871 return;
1478 1872
1479 sbio = sdev->bios[sdev->curr]; 1873 sbio = sctx->bios[sctx->curr];
1480 sdev->curr = -1; 1874 sctx->curr = -1;
1481 atomic_inc(&sdev->in_flight); 1875 scrub_pending_bio_inc(sctx);
1482 1876
1483 btrfsic_submit_bio(READ, sbio->bio); 1877 if (!sbio->bio->bi_bdev) {
1878 /*
1879 * this case should not happen. If btrfs_map_block() is
1880 * wrong, it could happen for dev-replace operations on
1881 * missing devices when no mirrors are available, but in
1882 * this case it should already fail the mount.
1883 * This case is handled correctly (but _very_ slowly).
1884 */
1885 printk_ratelimited(KERN_WARNING
1886 "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
1887 bio_endio(sbio->bio, -EIO);
1888 } else {
1889 btrfsic_submit_bio(READ, sbio->bio);
1890 }
1484} 1891}
1485 1892
1486static int scrub_add_page_to_bio(struct scrub_dev *sdev, 1893static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1487 struct scrub_page *spage) 1894 struct scrub_page *spage)
1488{ 1895{
1489 struct scrub_block *sblock = spage->sblock; 1896 struct scrub_block *sblock = spage->sblock;
1490 struct scrub_bio *sbio; 1897 struct scrub_bio *sbio;
@@ -1494,28 +1901,29 @@ again:
1494 /* 1901 /*
1495 * grab a fresh bio or wait for one to become available 1902 * grab a fresh bio or wait for one to become available
1496 */ 1903 */
1497 while (sdev->curr == -1) { 1904 while (sctx->curr == -1) {
1498 spin_lock(&sdev->list_lock); 1905 spin_lock(&sctx->list_lock);
1499 sdev->curr = sdev->first_free; 1906 sctx->curr = sctx->first_free;
1500 if (sdev->curr != -1) { 1907 if (sctx->curr != -1) {
1501 sdev->first_free = sdev->bios[sdev->curr]->next_free; 1908 sctx->first_free = sctx->bios[sctx->curr]->next_free;
1502 sdev->bios[sdev->curr]->next_free = -1; 1909 sctx->bios[sctx->curr]->next_free = -1;
1503 sdev->bios[sdev->curr]->page_count = 0; 1910 sctx->bios[sctx->curr]->page_count = 0;
1504 spin_unlock(&sdev->list_lock); 1911 spin_unlock(&sctx->list_lock);
1505 } else { 1912 } else {
1506 spin_unlock(&sdev->list_lock); 1913 spin_unlock(&sctx->list_lock);
1507 wait_event(sdev->list_wait, sdev->first_free != -1); 1914 wait_event(sctx->list_wait, sctx->first_free != -1);
1508 } 1915 }
1509 } 1916 }
1510 sbio = sdev->bios[sdev->curr]; 1917 sbio = sctx->bios[sctx->curr];
1511 if (sbio->page_count == 0) { 1918 if (sbio->page_count == 0) {
1512 struct bio *bio; 1919 struct bio *bio;
1513 1920
1514 sbio->physical = spage->physical; 1921 sbio->physical = spage->physical;
1515 sbio->logical = spage->logical; 1922 sbio->logical = spage->logical;
1923 sbio->dev = spage->dev;
1516 bio = sbio->bio; 1924 bio = sbio->bio;
1517 if (!bio) { 1925 if (!bio) {
1518 bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); 1926 bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1519 if (!bio) 1927 if (!bio)
1520 return -ENOMEM; 1928 return -ENOMEM;
1521 sbio->bio = bio; 1929 sbio->bio = bio;
@@ -1523,14 +1931,15 @@ again:
1523 1931
1524 bio->bi_private = sbio; 1932 bio->bi_private = sbio;
1525 bio->bi_end_io = scrub_bio_end_io; 1933 bio->bi_end_io = scrub_bio_end_io;
1526 bio->bi_bdev = sdev->dev->bdev; 1934 bio->bi_bdev = sbio->dev->bdev;
1527 bio->bi_sector = spage->physical >> 9; 1935 bio->bi_sector = sbio->physical >> 9;
1528 sbio->err = 0; 1936 sbio->err = 0;
1529 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1937 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1530 spage->physical || 1938 spage->physical ||
1531 sbio->logical + sbio->page_count * PAGE_SIZE != 1939 sbio->logical + sbio->page_count * PAGE_SIZE !=
1532 spage->logical) { 1940 spage->logical ||
1533 scrub_submit(sdev); 1941 sbio->dev != spage->dev) {
1942 scrub_submit(sctx);
1534 goto again; 1943 goto again;
1535 } 1944 }
1536 1945
@@ -1542,81 +1951,87 @@ again:
1542 sbio->bio = NULL; 1951 sbio->bio = NULL;
1543 return -EIO; 1952 return -EIO;
1544 } 1953 }
1545 scrub_submit(sdev); 1954 scrub_submit(sctx);
1546 goto again; 1955 goto again;
1547 } 1956 }
1548 1957
1549 scrub_block_get(sblock); /* one for the added page */ 1958 scrub_block_get(sblock); /* one for the page added to the bio */
1550 atomic_inc(&sblock->outstanding_pages); 1959 atomic_inc(&sblock->outstanding_pages);
1551 sbio->page_count++; 1960 sbio->page_count++;
1552 if (sbio->page_count == sdev->pages_per_bio) 1961 if (sbio->page_count == sctx->pages_per_rd_bio)
1553 scrub_submit(sdev); 1962 scrub_submit(sctx);
1554 1963
1555 return 0; 1964 return 0;
1556} 1965}
1557 1966
1558static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 1967static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1559 u64 physical, u64 flags, u64 gen, int mirror_num, 1968 u64 physical, struct btrfs_device *dev, u64 flags,
1560 u8 *csum, int force) 1969 u64 gen, int mirror_num, u8 *csum, int force,
1970 u64 physical_for_dev_replace)
1561{ 1971{
1562 struct scrub_block *sblock; 1972 struct scrub_block *sblock;
1563 int index; 1973 int index;
1564 1974
1565 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 1975 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1566 if (!sblock) { 1976 if (!sblock) {
1567 spin_lock(&sdev->stat_lock); 1977 spin_lock(&sctx->stat_lock);
1568 sdev->stat.malloc_errors++; 1978 sctx->stat.malloc_errors++;
1569 spin_unlock(&sdev->stat_lock); 1979 spin_unlock(&sctx->stat_lock);
1570 return -ENOMEM; 1980 return -ENOMEM;
1571 } 1981 }
1572 1982
1573 /* one ref inside this function, plus one for each page later on */ 1983 /* one ref inside this function, plus one for each page added to
1984 * a bio later on */
1574 atomic_set(&sblock->ref_count, 1); 1985 atomic_set(&sblock->ref_count, 1);
1575 sblock->sdev = sdev; 1986 sblock->sctx = sctx;
1576 sblock->no_io_error_seen = 1; 1987 sblock->no_io_error_seen = 1;
1577 1988
1578 for (index = 0; len > 0; index++) { 1989 for (index = 0; len > 0; index++) {
1579 struct scrub_page *spage = sblock->pagev + index; 1990 struct scrub_page *spage;
1580 u64 l = min_t(u64, len, PAGE_SIZE); 1991 u64 l = min_t(u64, len, PAGE_SIZE);
1581 1992
1582 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 1993 spage = kzalloc(sizeof(*spage), GFP_NOFS);
1583 spage->page = alloc_page(GFP_NOFS); 1994 if (!spage) {
1584 if (!spage->page) { 1995leave_nomem:
1585 spin_lock(&sdev->stat_lock); 1996 spin_lock(&sctx->stat_lock);
1586 sdev->stat.malloc_errors++; 1997 sctx->stat.malloc_errors++;
1587 spin_unlock(&sdev->stat_lock); 1998 spin_unlock(&sctx->stat_lock);
1588 while (index > 0) { 1999 scrub_block_put(sblock);
1589 index--;
1590 __free_page(sblock->pagev[index].page);
1591 }
1592 kfree(sblock);
1593 return -ENOMEM; 2000 return -ENOMEM;
1594 } 2001 }
2002 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2003 scrub_page_get(spage);
2004 sblock->pagev[index] = spage;
1595 spage->sblock = sblock; 2005 spage->sblock = sblock;
1596 spage->dev = sdev->dev; 2006 spage->dev = dev;
1597 spage->flags = flags; 2007 spage->flags = flags;
1598 spage->generation = gen; 2008 spage->generation = gen;
1599 spage->logical = logical; 2009 spage->logical = logical;
1600 spage->physical = physical; 2010 spage->physical = physical;
2011 spage->physical_for_dev_replace = physical_for_dev_replace;
1601 spage->mirror_num = mirror_num; 2012 spage->mirror_num = mirror_num;
1602 if (csum) { 2013 if (csum) {
1603 spage->have_csum = 1; 2014 spage->have_csum = 1;
1604 memcpy(spage->csum, csum, sdev->csum_size); 2015 memcpy(spage->csum, csum, sctx->csum_size);
1605 } else { 2016 } else {
1606 spage->have_csum = 0; 2017 spage->have_csum = 0;
1607 } 2018 }
1608 sblock->page_count++; 2019 sblock->page_count++;
2020 spage->page = alloc_page(GFP_NOFS);
2021 if (!spage->page)
2022 goto leave_nomem;
1609 len -= l; 2023 len -= l;
1610 logical += l; 2024 logical += l;
1611 physical += l; 2025 physical += l;
2026 physical_for_dev_replace += l;
1612 } 2027 }
1613 2028
1614 BUG_ON(sblock->page_count == 0); 2029 WARN_ON(sblock->page_count == 0);
1615 for (index = 0; index < sblock->page_count; index++) { 2030 for (index = 0; index < sblock->page_count; index++) {
1616 struct scrub_page *spage = sblock->pagev + index; 2031 struct scrub_page *spage = sblock->pagev[index];
1617 int ret; 2032 int ret;
1618 2033
1619 ret = scrub_add_page_to_bio(sdev, spage); 2034 ret = scrub_add_page_to_rd_bio(sctx, spage);
1620 if (ret) { 2035 if (ret) {
1621 scrub_block_put(sblock); 2036 scrub_block_put(sblock);
1622 return ret; 2037 return ret;
@@ -1624,7 +2039,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1624 } 2039 }
1625 2040
1626 if (force) 2041 if (force)
1627 scrub_submit(sdev); 2042 scrub_submit(sctx);
1628 2043
1629 /* last one frees, either here or in bio completion for last page */ 2044 /* last one frees, either here or in bio completion for last page */
1630 scrub_block_put(sblock); 2045 scrub_block_put(sblock);
@@ -1634,8 +2049,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1634static void scrub_bio_end_io(struct bio *bio, int err) 2049static void scrub_bio_end_io(struct bio *bio, int err)
1635{ 2050{
1636 struct scrub_bio *sbio = bio->bi_private; 2051 struct scrub_bio *sbio = bio->bi_private;
1637 struct scrub_dev *sdev = sbio->sdev; 2052 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1638 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
1639 2053
1640 sbio->err = err; 2054 sbio->err = err;
1641 sbio->bio = bio; 2055 sbio->bio = bio;
@@ -1646,10 +2060,10 @@ static void scrub_bio_end_io(struct bio *bio, int err)
1646static void scrub_bio_end_io_worker(struct btrfs_work *work) 2060static void scrub_bio_end_io_worker(struct btrfs_work *work)
1647{ 2061{
1648 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2062 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1649 struct scrub_dev *sdev = sbio->sdev; 2063 struct scrub_ctx *sctx = sbio->sctx;
1650 int i; 2064 int i;
1651 2065
1652 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); 2066 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
1653 if (sbio->err) { 2067 if (sbio->err) {
1654 for (i = 0; i < sbio->page_count; i++) { 2068 for (i = 0; i < sbio->page_count; i++) {
1655 struct scrub_page *spage = sbio->pagev[i]; 2069 struct scrub_page *spage = sbio->pagev[i];
@@ -1671,23 +2085,37 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
1671 2085
1672 bio_put(sbio->bio); 2086 bio_put(sbio->bio);
1673 sbio->bio = NULL; 2087 sbio->bio = NULL;
1674 spin_lock(&sdev->list_lock); 2088 spin_lock(&sctx->list_lock);
1675 sbio->next_free = sdev->first_free; 2089 sbio->next_free = sctx->first_free;
1676 sdev->first_free = sbio->index; 2090 sctx->first_free = sbio->index;
1677 spin_unlock(&sdev->list_lock); 2091 spin_unlock(&sctx->list_lock);
1678 atomic_dec(&sdev->in_flight); 2092
1679 wake_up(&sdev->list_wait); 2093 if (sctx->is_dev_replace &&
2094 atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2095 mutex_lock(&sctx->wr_ctx.wr_lock);
2096 scrub_wr_submit(sctx);
2097 mutex_unlock(&sctx->wr_ctx.wr_lock);
2098 }
2099
2100 scrub_pending_bio_dec(sctx);
1680} 2101}
1681 2102
1682static void scrub_block_complete(struct scrub_block *sblock) 2103static void scrub_block_complete(struct scrub_block *sblock)
1683{ 2104{
1684 if (!sblock->no_io_error_seen) 2105 if (!sblock->no_io_error_seen) {
1685 scrub_handle_errored_block(sblock); 2106 scrub_handle_errored_block(sblock);
1686 else 2107 } else {
1687 scrub_checksum(sblock); 2108 /*
2109 * if has checksum error, write via repair mechanism in
2110 * dev replace case, otherwise write here in dev replace
2111 * case.
2112 */
2113 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2114 scrub_write_block_to_dev_replace(sblock);
2115 }
1688} 2116}
1689 2117
1690static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, 2118static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
1691 u8 *csum) 2119 u8 *csum)
1692{ 2120{
1693 struct btrfs_ordered_sum *sum = NULL; 2121 struct btrfs_ordered_sum *sum = NULL;
@@ -1695,15 +2123,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1695 unsigned long i; 2123 unsigned long i;
1696 unsigned long num_sectors; 2124 unsigned long num_sectors;
1697 2125
1698 while (!list_empty(&sdev->csum_list)) { 2126 while (!list_empty(&sctx->csum_list)) {
1699 sum = list_first_entry(&sdev->csum_list, 2127 sum = list_first_entry(&sctx->csum_list,
1700 struct btrfs_ordered_sum, list); 2128 struct btrfs_ordered_sum, list);
1701 if (sum->bytenr > logical) 2129 if (sum->bytenr > logical)
1702 return 0; 2130 return 0;
1703 if (sum->bytenr + sum->len > logical) 2131 if (sum->bytenr + sum->len > logical)
1704 break; 2132 break;
1705 2133
1706 ++sdev->stat.csum_discards; 2134 ++sctx->stat.csum_discards;
1707 list_del(&sum->list); 2135 list_del(&sum->list);
1708 kfree(sum); 2136 kfree(sum);
1709 sum = NULL; 2137 sum = NULL;
@@ -1711,10 +2139,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1711 if (!sum) 2139 if (!sum)
1712 return 0; 2140 return 0;
1713 2141
1714 num_sectors = sum->len / sdev->sectorsize; 2142 num_sectors = sum->len / sctx->sectorsize;
1715 for (i = 0; i < num_sectors; ++i) { 2143 for (i = 0; i < num_sectors; ++i) {
1716 if (sum->sums[i].bytenr == logical) { 2144 if (sum->sums[i].bytenr == logical) {
1717 memcpy(csum, &sum->sums[i].sum, sdev->csum_size); 2145 memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
1718 ret = 1; 2146 ret = 1;
1719 break; 2147 break;
1720 } 2148 }
@@ -1727,29 +2155,30 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1727} 2155}
1728 2156
1729/* scrub extent tries to collect up to 64 kB for each bio */ 2157/* scrub extent tries to collect up to 64 kB for each bio */
1730static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 2158static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
1731 u64 physical, u64 flags, u64 gen, int mirror_num) 2159 u64 physical, struct btrfs_device *dev, u64 flags,
2160 u64 gen, int mirror_num, u64 physical_for_dev_replace)
1732{ 2161{
1733 int ret; 2162 int ret;
1734 u8 csum[BTRFS_CSUM_SIZE]; 2163 u8 csum[BTRFS_CSUM_SIZE];
1735 u32 blocksize; 2164 u32 blocksize;
1736 2165
1737 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2166 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1738 blocksize = sdev->sectorsize; 2167 blocksize = sctx->sectorsize;
1739 spin_lock(&sdev->stat_lock); 2168 spin_lock(&sctx->stat_lock);
1740 sdev->stat.data_extents_scrubbed++; 2169 sctx->stat.data_extents_scrubbed++;
1741 sdev->stat.data_bytes_scrubbed += len; 2170 sctx->stat.data_bytes_scrubbed += len;
1742 spin_unlock(&sdev->stat_lock); 2171 spin_unlock(&sctx->stat_lock);
1743 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2172 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1744 BUG_ON(sdev->nodesize != sdev->leafsize); 2173 WARN_ON(sctx->nodesize != sctx->leafsize);
1745 blocksize = sdev->nodesize; 2174 blocksize = sctx->nodesize;
1746 spin_lock(&sdev->stat_lock); 2175 spin_lock(&sctx->stat_lock);
1747 sdev->stat.tree_extents_scrubbed++; 2176 sctx->stat.tree_extents_scrubbed++;
1748 sdev->stat.tree_bytes_scrubbed += len; 2177 sctx->stat.tree_bytes_scrubbed += len;
1749 spin_unlock(&sdev->stat_lock); 2178 spin_unlock(&sctx->stat_lock);
1750 } else { 2179 } else {
1751 blocksize = sdev->sectorsize; 2180 blocksize = sctx->sectorsize;
1752 BUG_ON(1); 2181 WARN_ON(1);
1753 } 2182 }
1754 2183
1755 while (len) { 2184 while (len) {
@@ -1758,26 +2187,38 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
1758 2187
1759 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2188 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1760 /* push csums to sbio */ 2189 /* push csums to sbio */
1761 have_csum = scrub_find_csum(sdev, logical, l, csum); 2190 have_csum = scrub_find_csum(sctx, logical, l, csum);
1762 if (have_csum == 0) 2191 if (have_csum == 0)
1763 ++sdev->stat.no_csum; 2192 ++sctx->stat.no_csum;
2193 if (sctx->is_dev_replace && !have_csum) {
2194 ret = copy_nocow_pages(sctx, logical, l,
2195 mirror_num,
2196 physical_for_dev_replace);
2197 goto behind_scrub_pages;
2198 }
1764 } 2199 }
1765 ret = scrub_pages(sdev, logical, l, physical, flags, gen, 2200 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
1766 mirror_num, have_csum ? csum : NULL, 0); 2201 mirror_num, have_csum ? csum : NULL, 0,
2202 physical_for_dev_replace);
2203behind_scrub_pages:
1767 if (ret) 2204 if (ret)
1768 return ret; 2205 return ret;
1769 len -= l; 2206 len -= l;
1770 logical += l; 2207 logical += l;
1771 physical += l; 2208 physical += l;
2209 physical_for_dev_replace += l;
1772 } 2210 }
1773 return 0; 2211 return 0;
1774} 2212}
1775 2213
1776static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, 2214static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
1777 struct map_lookup *map, int num, u64 base, u64 length) 2215 struct map_lookup *map,
2216 struct btrfs_device *scrub_dev,
2217 int num, u64 base, u64 length,
2218 int is_dev_replace)
1778{ 2219{
1779 struct btrfs_path *path; 2220 struct btrfs_path *path;
1780 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 2221 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1781 struct btrfs_root *root = fs_info->extent_root; 2222 struct btrfs_root *root = fs_info->extent_root;
1782 struct btrfs_root *csum_root = fs_info->csum_root; 2223 struct btrfs_root *csum_root = fs_info->csum_root;
1783 struct btrfs_extent_item *extent; 2224 struct btrfs_extent_item *extent;
@@ -1797,9 +2238,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1797 struct reada_control *reada2; 2238 struct reada_control *reada2;
1798 struct btrfs_key key_start; 2239 struct btrfs_key key_start;
1799 struct btrfs_key key_end; 2240 struct btrfs_key key_end;
1800
1801 u64 increment = map->stripe_len; 2241 u64 increment = map->stripe_len;
1802 u64 offset; 2242 u64 offset;
2243 u64 extent_logical;
2244 u64 extent_physical;
2245 u64 extent_len;
2246 struct btrfs_device *extent_dev;
2247 int extent_mirror_num;
1803 2248
1804 nstripes = length; 2249 nstripes = length;
1805 offset = 0; 2250 offset = 0;
@@ -1843,8 +2288,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1843 */ 2288 */
1844 logical = base + offset; 2289 logical = base + offset;
1845 2290
1846 wait_event(sdev->list_wait, 2291 wait_event(sctx->list_wait,
1847 atomic_read(&sdev->in_flight) == 0); 2292 atomic_read(&sctx->bios_in_flight) == 0);
1848 atomic_inc(&fs_info->scrubs_paused); 2293 atomic_inc(&fs_info->scrubs_paused);
1849 wake_up(&fs_info->scrub_pause_wait); 2294 wake_up(&fs_info->scrub_pause_wait);
1850 2295
@@ -1898,7 +2343,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1898 * canceled? 2343 * canceled?
1899 */ 2344 */
1900 if (atomic_read(&fs_info->scrub_cancel_req) || 2345 if (atomic_read(&fs_info->scrub_cancel_req) ||
1901 atomic_read(&sdev->cancel_req)) { 2346 atomic_read(&sctx->cancel_req)) {
1902 ret = -ECANCELED; 2347 ret = -ECANCELED;
1903 goto out; 2348 goto out;
1904 } 2349 }
@@ -1907,9 +2352,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1907 */ 2352 */
1908 if (atomic_read(&fs_info->scrub_pause_req)) { 2353 if (atomic_read(&fs_info->scrub_pause_req)) {
1909 /* push queued extents */ 2354 /* push queued extents */
1910 scrub_submit(sdev); 2355 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
1911 wait_event(sdev->list_wait, 2356 scrub_submit(sctx);
1912 atomic_read(&sdev->in_flight) == 0); 2357 mutex_lock(&sctx->wr_ctx.wr_lock);
2358 scrub_wr_submit(sctx);
2359 mutex_unlock(&sctx->wr_ctx.wr_lock);
2360 wait_event(sctx->list_wait,
2361 atomic_read(&sctx->bios_in_flight) == 0);
2362 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
1913 atomic_inc(&fs_info->scrubs_paused); 2363 atomic_inc(&fs_info->scrubs_paused);
1914 wake_up(&fs_info->scrub_pause_wait); 2364 wake_up(&fs_info->scrub_pause_wait);
1915 mutex_lock(&fs_info->scrub_lock); 2365 mutex_lock(&fs_info->scrub_lock);
@@ -1926,7 +2376,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1926 2376
1927 ret = btrfs_lookup_csums_range(csum_root, logical, 2377 ret = btrfs_lookup_csums_range(csum_root, logical,
1928 logical + map->stripe_len - 1, 2378 logical + map->stripe_len - 1,
1929 &sdev->csum_list, 1); 2379 &sctx->csum_list, 1);
1930 if (ret) 2380 if (ret)
1931 goto out; 2381 goto out;
1932 2382
@@ -2004,9 +2454,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
2004 key.objectid; 2454 key.objectid;
2005 } 2455 }
2006 2456
2007 ret = scrub_extent(sdev, key.objectid, key.offset, 2457 extent_logical = key.objectid;
2008 key.objectid - logical + physical, 2458 extent_physical = key.objectid - logical + physical;
2009 flags, generation, mirror_num); 2459 extent_len = key.offset;
2460 extent_dev = scrub_dev;
2461 extent_mirror_num = mirror_num;
2462 if (is_dev_replace)
2463 scrub_remap_extent(fs_info, extent_logical,
2464 extent_len, &extent_physical,
2465 &extent_dev,
2466 &extent_mirror_num);
2467 ret = scrub_extent(sctx, extent_logical, extent_len,
2468 extent_physical, extent_dev, flags,
2469 generation, extent_mirror_num,
2470 key.objectid - logical + physical);
2010 if (ret) 2471 if (ret)
2011 goto out; 2472 goto out;
2012 2473
@@ -2016,29 +2477,34 @@ next:
2016 btrfs_release_path(path); 2477 btrfs_release_path(path);
2017 logical += increment; 2478 logical += increment;
2018 physical += map->stripe_len; 2479 physical += map->stripe_len;
2019 spin_lock(&sdev->stat_lock); 2480 spin_lock(&sctx->stat_lock);
2020 sdev->stat.last_physical = physical; 2481 sctx->stat.last_physical = physical;
2021 spin_unlock(&sdev->stat_lock); 2482 spin_unlock(&sctx->stat_lock);
2022 } 2483 }
2484out:
2023 /* push queued extents */ 2485 /* push queued extents */
2024 scrub_submit(sdev); 2486 scrub_submit(sctx);
2487 mutex_lock(&sctx->wr_ctx.wr_lock);
2488 scrub_wr_submit(sctx);
2489 mutex_unlock(&sctx->wr_ctx.wr_lock);
2025 2490
2026out:
2027 blk_finish_plug(&plug); 2491 blk_finish_plug(&plug);
2028 btrfs_free_path(path); 2492 btrfs_free_path(path);
2029 return ret < 0 ? ret : 0; 2493 return ret < 0 ? ret : 0;
2030} 2494}
2031 2495
2032static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, 2496static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2033 u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, 2497 struct btrfs_device *scrub_dev,
2034 u64 dev_offset) 2498 u64 chunk_tree, u64 chunk_objectid,
2499 u64 chunk_offset, u64 length,
2500 u64 dev_offset, int is_dev_replace)
2035{ 2501{
2036 struct btrfs_mapping_tree *map_tree = 2502 struct btrfs_mapping_tree *map_tree =
2037 &sdev->dev->dev_root->fs_info->mapping_tree; 2503 &sctx->dev_root->fs_info->mapping_tree;
2038 struct map_lookup *map; 2504 struct map_lookup *map;
2039 struct extent_map *em; 2505 struct extent_map *em;
2040 int i; 2506 int i;
2041 int ret = -EINVAL; 2507 int ret = 0;
2042 2508
2043 read_lock(&map_tree->map_tree.lock); 2509 read_lock(&map_tree->map_tree.lock);
2044 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2510 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
@@ -2055,9 +2521,11 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
2055 goto out; 2521 goto out;
2056 2522
2057 for (i = 0; i < map->num_stripes; ++i) { 2523 for (i = 0; i < map->num_stripes; ++i) {
2058 if (map->stripes[i].dev == sdev->dev && 2524 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2059 map->stripes[i].physical == dev_offset) { 2525 map->stripes[i].physical == dev_offset) {
2060 ret = scrub_stripe(sdev, map, i, chunk_offset, length); 2526 ret = scrub_stripe(sctx, map, scrub_dev, i,
2527 chunk_offset, length,
2528 is_dev_replace);
2061 if (ret) 2529 if (ret)
2062 goto out; 2530 goto out;
2063 } 2531 }
@@ -2069,11 +2537,13 @@ out:
2069} 2537}
2070 2538
2071static noinline_for_stack 2539static noinline_for_stack
2072int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) 2540int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2541 struct btrfs_device *scrub_dev, u64 start, u64 end,
2542 int is_dev_replace)
2073{ 2543{
2074 struct btrfs_dev_extent *dev_extent = NULL; 2544 struct btrfs_dev_extent *dev_extent = NULL;
2075 struct btrfs_path *path; 2545 struct btrfs_path *path;
2076 struct btrfs_root *root = sdev->dev->dev_root; 2546 struct btrfs_root *root = sctx->dev_root;
2077 struct btrfs_fs_info *fs_info = root->fs_info; 2547 struct btrfs_fs_info *fs_info = root->fs_info;
2078 u64 length; 2548 u64 length;
2079 u64 chunk_tree; 2549 u64 chunk_tree;
@@ -2085,6 +2555,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2085 struct btrfs_key key; 2555 struct btrfs_key key;
2086 struct btrfs_key found_key; 2556 struct btrfs_key found_key;
2087 struct btrfs_block_group_cache *cache; 2557 struct btrfs_block_group_cache *cache;
2558 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2088 2559
2089 path = btrfs_alloc_path(); 2560 path = btrfs_alloc_path();
2090 if (!path) 2561 if (!path)
@@ -2094,11 +2565,10 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2094 path->search_commit_root = 1; 2565 path->search_commit_root = 1;
2095 path->skip_locking = 1; 2566 path->skip_locking = 1;
2096 2567
2097 key.objectid = sdev->dev->devid; 2568 key.objectid = scrub_dev->devid;
2098 key.offset = 0ull; 2569 key.offset = 0ull;
2099 key.type = BTRFS_DEV_EXTENT_KEY; 2570 key.type = BTRFS_DEV_EXTENT_KEY;
2100 2571
2101
2102 while (1) { 2572 while (1) {
2103 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2573 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2104 if (ret < 0) 2574 if (ret < 0)
@@ -2117,7 +2587,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2117 2587
2118 btrfs_item_key_to_cpu(l, &found_key, slot); 2588 btrfs_item_key_to_cpu(l, &found_key, slot);
2119 2589
2120 if (found_key.objectid != sdev->dev->devid) 2590 if (found_key.objectid != scrub_dev->devid)
2121 break; 2591 break;
2122 2592
2123 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) 2593 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
@@ -2151,11 +2621,62 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2151 ret = -ENOENT; 2621 ret = -ENOENT;
2152 break; 2622 break;
2153 } 2623 }
2154 ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, 2624 dev_replace->cursor_right = found_key.offset + length;
2155 chunk_offset, length, found_key.offset); 2625 dev_replace->cursor_left = found_key.offset;
2626 dev_replace->item_needs_writeback = 1;
2627 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2628 chunk_offset, length, found_key.offset,
2629 is_dev_replace);
2630
2631 /*
2632 * flush, submit all pending read and write bios, afterwards
2633 * wait for them.
2634 * Note that in the dev replace case, a read request causes
2635 * write requests that are submitted in the read completion
2636 * worker. Therefore in the current situation, it is required
2637 * that all write requests are flushed, so that all read and
2638 * write requests are really completed when bios_in_flight
2639 * changes to 0.
2640 */
2641 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2642 scrub_submit(sctx);
2643 mutex_lock(&sctx->wr_ctx.wr_lock);
2644 scrub_wr_submit(sctx);
2645 mutex_unlock(&sctx->wr_ctx.wr_lock);
2646
2647 wait_event(sctx->list_wait,
2648 atomic_read(&sctx->bios_in_flight) == 0);
2649 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2650 atomic_inc(&fs_info->scrubs_paused);
2651 wake_up(&fs_info->scrub_pause_wait);
2652 wait_event(sctx->list_wait,
2653 atomic_read(&sctx->workers_pending) == 0);
2654
2655 mutex_lock(&fs_info->scrub_lock);
2656 while (atomic_read(&fs_info->scrub_pause_req)) {
2657 mutex_unlock(&fs_info->scrub_lock);
2658 wait_event(fs_info->scrub_pause_wait,
2659 atomic_read(&fs_info->scrub_pause_req) == 0);
2660 mutex_lock(&fs_info->scrub_lock);
2661 }
2662 atomic_dec(&fs_info->scrubs_paused);
2663 mutex_unlock(&fs_info->scrub_lock);
2664 wake_up(&fs_info->scrub_pause_wait);
2665
2666 dev_replace->cursor_left = dev_replace->cursor_right;
2667 dev_replace->item_needs_writeback = 1;
2156 btrfs_put_block_group(cache); 2668 btrfs_put_block_group(cache);
2157 if (ret) 2669 if (ret)
2158 break; 2670 break;
2671 if (is_dev_replace &&
2672 atomic64_read(&dev_replace->num_write_errors) > 0) {
2673 ret = -EIO;
2674 break;
2675 }
2676 if (sctx->stat.malloc_errors > 0) {
2677 ret = -ENOMEM;
2678 break;
2679 }
2159 2680
2160 key.offset = found_key.offset + length; 2681 key.offset = found_key.offset + length;
2161 btrfs_release_path(path); 2682 btrfs_release_path(path);
@@ -2170,14 +2691,14 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2170 return ret < 0 ? ret : 0; 2691 return ret < 0 ? ret : 0;
2171} 2692}
2172 2693
2173static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) 2694static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2695 struct btrfs_device *scrub_dev)
2174{ 2696{
2175 int i; 2697 int i;
2176 u64 bytenr; 2698 u64 bytenr;
2177 u64 gen; 2699 u64 gen;
2178 int ret; 2700 int ret;
2179 struct btrfs_device *device = sdev->dev; 2701 struct btrfs_root *root = sctx->dev_root;
2180 struct btrfs_root *root = device->dev_root;
2181 2702
2182 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2703 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2183 return -EIO; 2704 return -EIO;
@@ -2186,15 +2707,16 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2186 2707
2187 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2708 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2188 bytenr = btrfs_sb_offset(i); 2709 bytenr = btrfs_sb_offset(i);
2189 if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) 2710 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2190 break; 2711 break;
2191 2712
2192 ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2713 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2193 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); 2714 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2715 NULL, 1, bytenr);
2194 if (ret) 2716 if (ret)
2195 return ret; 2717 return ret;
2196 } 2718 }
2197 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2719 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2198 2720
2199 return 0; 2721 return 0;
2200} 2722}
@@ -2202,19 +2724,38 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2202/* 2724/*
2203 * get a reference count on fs_info->scrub_workers. start worker if necessary 2725 * get a reference count on fs_info->scrub_workers. start worker if necessary
2204 */ 2726 */
2205static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 2727static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2728 int is_dev_replace)
2206{ 2729{
2207 struct btrfs_fs_info *fs_info = root->fs_info;
2208 int ret = 0; 2730 int ret = 0;
2209 2731
2210 mutex_lock(&fs_info->scrub_lock); 2732 mutex_lock(&fs_info->scrub_lock);
2211 if (fs_info->scrub_workers_refcnt == 0) { 2733 if (fs_info->scrub_workers_refcnt == 0) {
2212 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2734 if (is_dev_replace)
2213 fs_info->thread_pool_size, &fs_info->generic_worker); 2735 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2736 &fs_info->generic_worker);
2737 else
2738 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2739 fs_info->thread_pool_size,
2740 &fs_info->generic_worker);
2214 fs_info->scrub_workers.idle_thresh = 4; 2741 fs_info->scrub_workers.idle_thresh = 4;
2215 ret = btrfs_start_workers(&fs_info->scrub_workers); 2742 ret = btrfs_start_workers(&fs_info->scrub_workers);
2216 if (ret) 2743 if (ret)
2217 goto out; 2744 goto out;
2745 btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2746 "scrubwrc",
2747 fs_info->thread_pool_size,
2748 &fs_info->generic_worker);
2749 fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2750 ret = btrfs_start_workers(
2751 &fs_info->scrub_wr_completion_workers);
2752 if (ret)
2753 goto out;
2754 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2755 &fs_info->generic_worker);
2756 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2757 if (ret)
2758 goto out;
2218 } 2759 }
2219 ++fs_info->scrub_workers_refcnt; 2760 ++fs_info->scrub_workers_refcnt;
2220out: 2761out:
@@ -2223,40 +2764,41 @@ out:
2223 return ret; 2764 return ret;
2224} 2765}
2225 2766
2226static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 2767static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2227{ 2768{
2228 struct btrfs_fs_info *fs_info = root->fs_info;
2229
2230 mutex_lock(&fs_info->scrub_lock); 2769 mutex_lock(&fs_info->scrub_lock);
2231 if (--fs_info->scrub_workers_refcnt == 0) 2770 if (--fs_info->scrub_workers_refcnt == 0) {
2232 btrfs_stop_workers(&fs_info->scrub_workers); 2771 btrfs_stop_workers(&fs_info->scrub_workers);
2772 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2773 btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2774 }
2233 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2775 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2234 mutex_unlock(&fs_info->scrub_lock); 2776 mutex_unlock(&fs_info->scrub_lock);
2235} 2777}
2236 2778
2237 2779int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2238int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 2780 u64 end, struct btrfs_scrub_progress *progress,
2239 struct btrfs_scrub_progress *progress, int readonly) 2781 int readonly, int is_dev_replace)
2240{ 2782{
2241 struct scrub_dev *sdev; 2783 struct scrub_ctx *sctx;
2242 struct btrfs_fs_info *fs_info = root->fs_info;
2243 int ret; 2784 int ret;
2244 struct btrfs_device *dev; 2785 struct btrfs_device *dev;
2245 2786
2246 if (btrfs_fs_closing(root->fs_info)) 2787 if (btrfs_fs_closing(fs_info))
2247 return -EINVAL; 2788 return -EINVAL;
2248 2789
2249 /* 2790 /*
2250 * check some assumptions 2791 * check some assumptions
2251 */ 2792 */
2252 if (root->nodesize != root->leafsize) { 2793 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2253 printk(KERN_ERR 2794 printk(KERN_ERR
2254 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", 2795 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2255 root->nodesize, root->leafsize); 2796 fs_info->chunk_root->nodesize,
2797 fs_info->chunk_root->leafsize);
2256 return -EINVAL; 2798 return -EINVAL;
2257 } 2799 }
2258 2800
2259 if (root->nodesize > BTRFS_STRIPE_LEN) { 2801 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2260 /* 2802 /*
2261 * in this case scrub is unable to calculate the checksum 2803 * in this case scrub is unable to calculate the checksum
2262 * the way scrub is implemented. Do not handle this 2804 * the way scrub is implemented. Do not handle this
@@ -2264,80 +2806,105 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
2264 */ 2806 */
2265 printk(KERN_ERR 2807 printk(KERN_ERR
2266 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", 2808 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2267 root->nodesize, BTRFS_STRIPE_LEN); 2809 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2268 return -EINVAL; 2810 return -EINVAL;
2269 } 2811 }
2270 2812
2271 if (root->sectorsize != PAGE_SIZE) { 2813 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2272 /* not supported for data w/o checksums */ 2814 /* not supported for data w/o checksums */
2273 printk(KERN_ERR 2815 printk(KERN_ERR
2274 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", 2816 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
2275 root->sectorsize, (unsigned long long)PAGE_SIZE); 2817 fs_info->chunk_root->sectorsize,
2818 (unsigned long long)PAGE_SIZE);
2276 return -EINVAL; 2819 return -EINVAL;
2277 } 2820 }
2278 2821
2279 ret = scrub_workers_get(root); 2822 if (fs_info->chunk_root->nodesize >
2823 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2824 fs_info->chunk_root->sectorsize >
2825 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2826 /*
2827 * would exhaust the array bounds of pagev member in
2828 * struct scrub_block
2829 */
2830 pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
2831 fs_info->chunk_root->nodesize,
2832 SCRUB_MAX_PAGES_PER_BLOCK,
2833 fs_info->chunk_root->sectorsize,
2834 SCRUB_MAX_PAGES_PER_BLOCK);
2835 return -EINVAL;
2836 }
2837
2838 ret = scrub_workers_get(fs_info, is_dev_replace);
2280 if (ret) 2839 if (ret)
2281 return ret; 2840 return ret;
2282 2841
2283 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2842 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2284 dev = btrfs_find_device(root, devid, NULL, NULL); 2843 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2285 if (!dev || dev->missing) { 2844 if (!dev || (dev->missing && !is_dev_replace)) {
2286 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2845 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2287 scrub_workers_put(root); 2846 scrub_workers_put(fs_info);
2288 return -ENODEV; 2847 return -ENODEV;
2289 } 2848 }
2290 mutex_lock(&fs_info->scrub_lock); 2849 mutex_lock(&fs_info->scrub_lock);
2291 2850
2292 if (!dev->in_fs_metadata) { 2851 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2293 mutex_unlock(&fs_info->scrub_lock); 2852 mutex_unlock(&fs_info->scrub_lock);
2294 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2853 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2295 scrub_workers_put(root); 2854 scrub_workers_put(fs_info);
2296 return -ENODEV; 2855 return -EIO;
2297 } 2856 }
2298 2857
2299 if (dev->scrub_device) { 2858 btrfs_dev_replace_lock(&fs_info->dev_replace);
2859 if (dev->scrub_device ||
2860 (!is_dev_replace &&
2861 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2862 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2300 mutex_unlock(&fs_info->scrub_lock); 2863 mutex_unlock(&fs_info->scrub_lock);
2301 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2864 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2302 scrub_workers_put(root); 2865 scrub_workers_put(fs_info);
2303 return -EINPROGRESS; 2866 return -EINPROGRESS;
2304 } 2867 }
2305 sdev = scrub_setup_dev(dev); 2868 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2306 if (IS_ERR(sdev)) { 2869 sctx = scrub_setup_ctx(dev, is_dev_replace);
2870 if (IS_ERR(sctx)) {
2307 mutex_unlock(&fs_info->scrub_lock); 2871 mutex_unlock(&fs_info->scrub_lock);
2308 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2872 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2309 scrub_workers_put(root); 2873 scrub_workers_put(fs_info);
2310 return PTR_ERR(sdev); 2874 return PTR_ERR(sctx);
2311 } 2875 }
2312 sdev->readonly = readonly; 2876 sctx->readonly = readonly;
2313 dev->scrub_device = sdev; 2877 dev->scrub_device = sctx;
2314 2878
2315 atomic_inc(&fs_info->scrubs_running); 2879 atomic_inc(&fs_info->scrubs_running);
2316 mutex_unlock(&fs_info->scrub_lock); 2880 mutex_unlock(&fs_info->scrub_lock);
2317 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2881 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2318 2882
2319 down_read(&fs_info->scrub_super_lock); 2883 if (!is_dev_replace) {
2320 ret = scrub_supers(sdev); 2884 down_read(&fs_info->scrub_super_lock);
2321 up_read(&fs_info->scrub_super_lock); 2885 ret = scrub_supers(sctx, dev);
2886 up_read(&fs_info->scrub_super_lock);
2887 }
2322 2888
2323 if (!ret) 2889 if (!ret)
2324 ret = scrub_enumerate_chunks(sdev, start, end); 2890 ret = scrub_enumerate_chunks(sctx, dev, start, end,
2891 is_dev_replace);
2325 2892
2326 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2893 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2327 atomic_dec(&fs_info->scrubs_running); 2894 atomic_dec(&fs_info->scrubs_running);
2328 wake_up(&fs_info->scrub_pause_wait); 2895 wake_up(&fs_info->scrub_pause_wait);
2329 2896
2330 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); 2897 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2331 2898
2332 if (progress) 2899 if (progress)
2333 memcpy(progress, &sdev->stat, sizeof(*progress)); 2900 memcpy(progress, &sctx->stat, sizeof(*progress));
2334 2901
2335 mutex_lock(&fs_info->scrub_lock); 2902 mutex_lock(&fs_info->scrub_lock);
2336 dev->scrub_device = NULL; 2903 dev->scrub_device = NULL;
2337 mutex_unlock(&fs_info->scrub_lock); 2904 mutex_unlock(&fs_info->scrub_lock);
2338 2905
2339 scrub_free_dev(sdev); 2906 scrub_free_ctx(sctx);
2340 scrub_workers_put(root); 2907 scrub_workers_put(fs_info);
2341 2908
2342 return ret; 2909 return ret;
2343} 2910}
@@ -2377,9 +2944,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root)
2377 up_write(&root->fs_info->scrub_super_lock); 2944 up_write(&root->fs_info->scrub_super_lock);
2378} 2945}
2379 2946
2380int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 2947int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2381{ 2948{
2382
2383 mutex_lock(&fs_info->scrub_lock); 2949 mutex_lock(&fs_info->scrub_lock);
2384 if (!atomic_read(&fs_info->scrubs_running)) { 2950 if (!atomic_read(&fs_info->scrubs_running)) {
2385 mutex_unlock(&fs_info->scrub_lock); 2951 mutex_unlock(&fs_info->scrub_lock);
@@ -2399,23 +2965,18 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2399 return 0; 2965 return 0;
2400} 2966}
2401 2967
2402int btrfs_scrub_cancel(struct btrfs_root *root) 2968int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
2969 struct btrfs_device *dev)
2403{ 2970{
2404 return __btrfs_scrub_cancel(root->fs_info); 2971 struct scrub_ctx *sctx;
2405}
2406
2407int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
2408{
2409 struct btrfs_fs_info *fs_info = root->fs_info;
2410 struct scrub_dev *sdev;
2411 2972
2412 mutex_lock(&fs_info->scrub_lock); 2973 mutex_lock(&fs_info->scrub_lock);
2413 sdev = dev->scrub_device; 2974 sctx = dev->scrub_device;
2414 if (!sdev) { 2975 if (!sctx) {
2415 mutex_unlock(&fs_info->scrub_lock); 2976 mutex_unlock(&fs_info->scrub_lock);
2416 return -ENOTCONN; 2977 return -ENOTCONN;
2417 } 2978 }
2418 atomic_inc(&sdev->cancel_req); 2979 atomic_inc(&sctx->cancel_req);
2419 while (dev->scrub_device) { 2980 while (dev->scrub_device) {
2420 mutex_unlock(&fs_info->scrub_lock); 2981 mutex_unlock(&fs_info->scrub_lock);
2421 wait_event(fs_info->scrub_pause_wait, 2982 wait_event(fs_info->scrub_pause_wait,
@@ -2438,12 +2999,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
2438 * does not go away in cancel_dev. FIXME: find a better solution 2999 * does not go away in cancel_dev. FIXME: find a better solution
2439 */ 3000 */
2440 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3001 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2441 dev = btrfs_find_device(root, devid, NULL, NULL); 3002 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2442 if (!dev) { 3003 if (!dev) {
2443 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3004 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2444 return -ENODEV; 3005 return -ENODEV;
2445 } 3006 }
2446 ret = btrfs_scrub_cancel_dev(root, dev); 3007 ret = btrfs_scrub_cancel_dev(fs_info, dev);
2447 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3008 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2448 3009
2449 return ret; 3010 return ret;
@@ -2453,15 +3014,284 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2453 struct btrfs_scrub_progress *progress) 3014 struct btrfs_scrub_progress *progress)
2454{ 3015{
2455 struct btrfs_device *dev; 3016 struct btrfs_device *dev;
2456 struct scrub_dev *sdev = NULL; 3017 struct scrub_ctx *sctx = NULL;
2457 3018
2458 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3019 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2459 dev = btrfs_find_device(root, devid, NULL, NULL); 3020 dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
2460 if (dev) 3021 if (dev)
2461 sdev = dev->scrub_device; 3022 sctx = dev->scrub_device;
2462 if (sdev) 3023 if (sctx)
2463 memcpy(progress, &sdev->stat, sizeof(*progress)); 3024 memcpy(progress, &sctx->stat, sizeof(*progress));
2464 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3025 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2465 3026
2466 return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; 3027 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3028}
3029
3030static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3031 u64 extent_logical, u64 extent_len,
3032 u64 *extent_physical,
3033 struct btrfs_device **extent_dev,
3034 int *extent_mirror_num)
3035{
3036 u64 mapped_length;
3037 struct btrfs_bio *bbio = NULL;
3038 int ret;
3039
3040 mapped_length = extent_len;
3041 ret = btrfs_map_block(fs_info, READ, extent_logical,
3042 &mapped_length, &bbio, 0);
3043 if (ret || !bbio || mapped_length < extent_len ||
3044 !bbio->stripes[0].dev->bdev) {
3045 kfree(bbio);
3046 return;
3047 }
3048
3049 *extent_physical = bbio->stripes[0].physical;
3050 *extent_mirror_num = bbio->mirror_num;
3051 *extent_dev = bbio->stripes[0].dev;
3052 kfree(bbio);
3053}
3054
3055static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3056 struct scrub_wr_ctx *wr_ctx,
3057 struct btrfs_fs_info *fs_info,
3058 struct btrfs_device *dev,
3059 int is_dev_replace)
3060{
3061 WARN_ON(wr_ctx->wr_curr_bio != NULL);
3062
3063 mutex_init(&wr_ctx->wr_lock);
3064 wr_ctx->wr_curr_bio = NULL;
3065 if (!is_dev_replace)
3066 return 0;
3067
3068 WARN_ON(!dev->bdev);
3069 wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3070 bio_get_nr_vecs(dev->bdev));
3071 wr_ctx->tgtdev = dev;
3072 atomic_set(&wr_ctx->flush_all_writes, 0);
3073 return 0;
3074}
3075
3076static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3077{
3078 mutex_lock(&wr_ctx->wr_lock);
3079 kfree(wr_ctx->wr_curr_bio);
3080 wr_ctx->wr_curr_bio = NULL;
3081 mutex_unlock(&wr_ctx->wr_lock);
3082}
3083
3084static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3085 int mirror_num, u64 physical_for_dev_replace)
3086{
3087 struct scrub_copy_nocow_ctx *nocow_ctx;
3088 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3089
3090 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3091 if (!nocow_ctx) {
3092 spin_lock(&sctx->stat_lock);
3093 sctx->stat.malloc_errors++;
3094 spin_unlock(&sctx->stat_lock);
3095 return -ENOMEM;
3096 }
3097
3098 scrub_pending_trans_workers_inc(sctx);
3099
3100 nocow_ctx->sctx = sctx;
3101 nocow_ctx->logical = logical;
3102 nocow_ctx->len = len;
3103 nocow_ctx->mirror_num = mirror_num;
3104 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3105 nocow_ctx->work.func = copy_nocow_pages_worker;
3106 btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3107 &nocow_ctx->work);
3108
3109 return 0;
3110}
3111
3112static void copy_nocow_pages_worker(struct btrfs_work *work)
3113{
3114 struct scrub_copy_nocow_ctx *nocow_ctx =
3115 container_of(work, struct scrub_copy_nocow_ctx, work);
3116 struct scrub_ctx *sctx = nocow_ctx->sctx;
3117 u64 logical = nocow_ctx->logical;
3118 u64 len = nocow_ctx->len;
3119 int mirror_num = nocow_ctx->mirror_num;
3120 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3121 int ret;
3122 struct btrfs_trans_handle *trans = NULL;
3123 struct btrfs_fs_info *fs_info;
3124 struct btrfs_path *path;
3125 struct btrfs_root *root;
3126 int not_written = 0;
3127
3128 fs_info = sctx->dev_root->fs_info;
3129 root = fs_info->extent_root;
3130
3131 path = btrfs_alloc_path();
3132 if (!path) {
3133 spin_lock(&sctx->stat_lock);
3134 sctx->stat.malloc_errors++;
3135 spin_unlock(&sctx->stat_lock);
3136 not_written = 1;
3137 goto out;
3138 }
3139
3140 trans = btrfs_join_transaction(root);
3141 if (IS_ERR(trans)) {
3142 not_written = 1;
3143 goto out;
3144 }
3145
3146 ret = iterate_inodes_from_logical(logical, fs_info, path,
3147 copy_nocow_pages_for_inode,
3148 nocow_ctx);
3149 if (ret != 0 && ret != -ENOENT) {
3150 pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
3151 (unsigned long long)logical,
3152 (unsigned long long)physical_for_dev_replace,
3153 (unsigned long long)len,
3154 (unsigned long long)mirror_num, ret);
3155 not_written = 1;
3156 goto out;
3157 }
3158
3159out:
3160 if (trans && !IS_ERR(trans))
3161 btrfs_end_transaction(trans, root);
3162 if (not_written)
3163 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3164 num_uncorrectable_read_errors);
3165
3166 btrfs_free_path(path);
3167 kfree(nocow_ctx);
3168
3169 scrub_pending_trans_workers_dec(sctx);
3170}
3171
3172static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3173{
3174 unsigned long index;
3175 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3176 int ret = 0;
3177 struct btrfs_key key;
3178 struct inode *inode = NULL;
3179 struct btrfs_root *local_root;
3180 u64 physical_for_dev_replace;
3181 u64 len;
3182 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3183
3184 key.objectid = root;
3185 key.type = BTRFS_ROOT_ITEM_KEY;
3186 key.offset = (u64)-1;
3187 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3188 if (IS_ERR(local_root))
3189 return PTR_ERR(local_root);
3190
3191 key.type = BTRFS_INODE_ITEM_KEY;
3192 key.objectid = inum;
3193 key.offset = 0;
3194 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3195 if (IS_ERR(inode))
3196 return PTR_ERR(inode);
3197
3198 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3199 len = nocow_ctx->len;
3200 while (len >= PAGE_CACHE_SIZE) {
3201 struct page *page = NULL;
3202 int ret_sub;
3203
3204 index = offset >> PAGE_CACHE_SHIFT;
3205
3206 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3207 if (!page) {
3208 pr_err("find_or_create_page() failed\n");
3209 ret = -ENOMEM;
3210 goto next_page;
3211 }
3212
3213 if (PageUptodate(page)) {
3214 if (PageDirty(page))
3215 goto next_page;
3216 } else {
3217 ClearPageError(page);
3218 ret_sub = extent_read_full_page(&BTRFS_I(inode)->
3219 io_tree,
3220 page, btrfs_get_extent,
3221 nocow_ctx->mirror_num);
3222 if (ret_sub) {
3223 ret = ret_sub;
3224 goto next_page;
3225 }
3226 wait_on_page_locked(page);
3227 if (!PageUptodate(page)) {
3228 ret = -EIO;
3229 goto next_page;
3230 }
3231 }
3232 ret_sub = write_page_nocow(nocow_ctx->sctx,
3233 physical_for_dev_replace, page);
3234 if (ret_sub) {
3235 ret = ret_sub;
3236 goto next_page;
3237 }
3238
3239next_page:
3240 if (page) {
3241 unlock_page(page);
3242 put_page(page);
3243 }
3244 offset += PAGE_CACHE_SIZE;
3245 physical_for_dev_replace += PAGE_CACHE_SIZE;
3246 len -= PAGE_CACHE_SIZE;
3247 }
3248
3249 if (inode)
3250 iput(inode);
3251 return ret;
3252}
3253
3254static int write_page_nocow(struct scrub_ctx *sctx,
3255 u64 physical_for_dev_replace, struct page *page)
3256{
3257 struct bio *bio;
3258 struct btrfs_device *dev;
3259 int ret;
3260 DECLARE_COMPLETION_ONSTACK(compl);
3261
3262 dev = sctx->wr_ctx.tgtdev;
3263 if (!dev)
3264 return -EIO;
3265 if (!dev->bdev) {
3266 printk_ratelimited(KERN_WARNING
3267 "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3268 return -EIO;
3269 }
3270 bio = bio_alloc(GFP_NOFS, 1);
3271 if (!bio) {
3272 spin_lock(&sctx->stat_lock);
3273 sctx->stat.malloc_errors++;
3274 spin_unlock(&sctx->stat_lock);
3275 return -ENOMEM;
3276 }
3277 bio->bi_private = &compl;
3278 bio->bi_end_io = scrub_complete_bio_end_io;
3279 bio->bi_size = 0;
3280 bio->bi_sector = physical_for_dev_replace >> 9;
3281 bio->bi_bdev = dev->bdev;
3282 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3283 if (ret != PAGE_CACHE_SIZE) {
3284leave_with_eio:
3285 bio_put(bio);
3286 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3287 return -EIO;
3288 }
3289 btrfsic_submit_bio(WRITE_SYNC, bio);
3290 wait_for_completion(&compl);
3291
3292 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
3293 goto leave_with_eio;
3294
3295 bio_put(bio);
3296 return 0;
2467} 3297}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e78b297b0b00..54454542ad40 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4397,9 +4397,9 @@ static int full_send_tree(struct send_ctx *sctx)
4397 if (!path) 4397 if (!path)
4398 return -ENOMEM; 4398 return -ENOMEM;
4399 4399
4400 spin_lock(&send_root->root_times_lock); 4400 spin_lock(&send_root->root_item_lock);
4401 start_ctransid = btrfs_root_ctransid(&send_root->root_item); 4401 start_ctransid = btrfs_root_ctransid(&send_root->root_item);
4402 spin_unlock(&send_root->root_times_lock); 4402 spin_unlock(&send_root->root_item_lock);
4403 4403
4404 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 4404 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
4405 key.type = BTRFS_INODE_ITEM_KEY; 4405 key.type = BTRFS_INODE_ITEM_KEY;
@@ -4422,9 +4422,9 @@ join_trans:
4422 * Make sure the tree has not changed after re-joining. We detect this 4422 * Make sure the tree has not changed after re-joining. We detect this
4423 * by comparing start_ctransid and ctransid. They should always match. 4423 * by comparing start_ctransid and ctransid. They should always match.
4424 */ 4424 */
4425 spin_lock(&send_root->root_times_lock); 4425 spin_lock(&send_root->root_item_lock);
4426 ctransid = btrfs_root_ctransid(&send_root->root_item); 4426 ctransid = btrfs_root_ctransid(&send_root->root_item);
4427 spin_unlock(&send_root->root_times_lock); 4427 spin_unlock(&send_root->root_item_lock);
4428 4428
4429 if (ctransid != start_ctransid) { 4429 if (ctransid != start_ctransid) {
4430 WARN(1, KERN_WARNING "btrfs: the root that you're trying to " 4430 WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 915ac14c2064..99545df1b86c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -55,6 +55,7 @@
55#include "export.h" 55#include "export.h"
56#include "compression.h" 56#include "compression.h"
57#include "rcu-string.h" 57#include "rcu-string.h"
58#include "dev-replace.h"
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/btrfs.h> 61#include <trace/events/btrfs.h>
@@ -116,7 +117,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
116 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 117 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
117 sb->s_flags |= MS_RDONLY; 118 sb->s_flags |= MS_RDONLY;
118 printk(KERN_INFO "btrfs is forced readonly\n"); 119 printk(KERN_INFO "btrfs is forced readonly\n");
119 __btrfs_scrub_cancel(fs_info); 120 /*
121 * Note that a running device replace operation is not
122 * canceled here although there is no way to update
123 * the progress. It would add the risk of a deadlock,
124 * therefore the canceling is ommited. The only penalty
125 * is that some I/O remains active until the procedure
126 * completes. The next time when the filesystem is
127 * mounted writeable again, the device replace
128 * operation continues.
129 */
120// WARN_ON(1); 130// WARN_ON(1);
121 } 131 }
122} 132}
@@ -1186,7 +1196,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1186 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1196 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
1187 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1197 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
1188 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1198 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
1189 btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); 1199 btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
1200 new_pool_size);
1190} 1201}
1191 1202
1192static int btrfs_remount(struct super_block *sb, int *flags, char *data) 1203static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -1215,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1215 return 0; 1226 return 0;
1216 1227
1217 if (*flags & MS_RDONLY) { 1228 if (*flags & MS_RDONLY) {
1229 /*
1230 * this also happens on 'umount -rf' or on shutdown, when
1231 * the filesystem is busy.
1232 */
1218 sb->s_flags |= MS_RDONLY; 1233 sb->s_flags |= MS_RDONLY;
1219 1234
1235 btrfs_dev_replace_suspend_for_unmount(fs_info);
1236 btrfs_scrub_cancel(fs_info);
1237
1220 ret = btrfs_commit_super(root); 1238 ret = btrfs_commit_super(root);
1221 if (ret) 1239 if (ret)
1222 goto restore; 1240 goto restore;
@@ -1226,6 +1244,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1226 goto restore; 1244 goto restore;
1227 } 1245 }
1228 1246
1247 if (fs_info->fs_devices->missing_devices >
1248 fs_info->num_tolerated_disk_barrier_failures &&
1249 !(*flags & MS_RDONLY)) {
1250 printk(KERN_WARNING
1251 "Btrfs: too many missing devices, writeable remount is not allowed\n");
1252 ret = -EACCES;
1253 goto restore;
1254 }
1255
1229 if (btrfs_super_log_root(fs_info->super_copy) != 0) { 1256 if (btrfs_super_log_root(fs_info->super_copy) != 0) {
1230 ret = -EINVAL; 1257 ret = -EINVAL;
1231 goto restore; 1258 goto restore;
@@ -1244,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1244 if (ret) 1271 if (ret)
1245 goto restore; 1272 goto restore;
1246 1273
1274 ret = btrfs_resume_dev_replace_async(fs_info);
1275 if (ret) {
1276 pr_warn("btrfs: failed to resume dev_replace\n");
1277 goto restore;
1278 }
1247 sb->s_flags &= ~MS_RDONLY; 1279 sb->s_flags &= ~MS_RDONLY;
1248 } 1280 }
1249 1281
@@ -1336,7 +1368,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1336 min_stripe_size = BTRFS_STRIPE_LEN; 1368 min_stripe_size = BTRFS_STRIPE_LEN;
1337 1369
1338 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1370 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1339 if (!device->in_fs_metadata || !device->bdev) 1371 if (!device->in_fs_metadata || !device->bdev ||
1372 device->is_tgtdev_for_dev_replace)
1340 continue; 1373 continue;
1341 1374
1342 avail_space = device->total_bytes - device->bytes_used; 1375 avail_space = device->total_bytes - device->bytes_used;
@@ -1647,10 +1680,14 @@ static int __init init_btrfs_fs(void)
1647 if (err) 1680 if (err)
1648 goto free_ordered_data; 1681 goto free_ordered_data;
1649 1682
1650 err = btrfs_interface_init(); 1683 err = btrfs_auto_defrag_init();
1651 if (err) 1684 if (err)
1652 goto free_delayed_inode; 1685 goto free_delayed_inode;
1653 1686
1687 err = btrfs_interface_init();
1688 if (err)
1689 goto free_auto_defrag;
1690
1654 err = register_filesystem(&btrfs_fs_type); 1691 err = register_filesystem(&btrfs_fs_type);
1655 if (err) 1692 if (err)
1656 goto unregister_ioctl; 1693 goto unregister_ioctl;
@@ -1662,6 +1699,8 @@ static int __init init_btrfs_fs(void)
1662 1699
1663unregister_ioctl: 1700unregister_ioctl:
1664 btrfs_interface_exit(); 1701 btrfs_interface_exit();
1702free_auto_defrag:
1703 btrfs_auto_defrag_exit();
1665free_delayed_inode: 1704free_delayed_inode:
1666 btrfs_delayed_inode_exit(); 1705 btrfs_delayed_inode_exit();
1667free_ordered_data: 1706free_ordered_data:
@@ -1681,6 +1720,7 @@ free_compress:
1681static void __exit exit_btrfs_fs(void) 1720static void __exit exit_btrfs_fs(void)
1682{ 1721{
1683 btrfs_destroy_cachep(); 1722 btrfs_destroy_cachep();
1723 btrfs_auto_defrag_exit();
1684 btrfs_delayed_inode_exit(); 1724 btrfs_delayed_inode_exit();
1685 ordered_data_exit(); 1725 ordered_data_exit();
1686 extent_map_exit(); 1726 extent_map_exit();
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 04bbfb1052eb..87fac9a21ea5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -30,6 +30,7 @@
30#include "tree-log.h" 30#include "tree-log.h"
31#include "inode-map.h" 31#include "inode-map.h"
32#include "volumes.h" 32#include "volumes.h"
33#include "dev-replace.h"
33 34
34#define BTRFS_ROOT_TRANS_TAG 0 35#define BTRFS_ROOT_TRANS_TAG 0
35 36
@@ -145,16 +146,12 @@ loop:
145 * the log must never go across transaction boundaries. 146 * the log must never go across transaction boundaries.
146 */ 147 */
147 smp_mb(); 148 smp_mb();
148 if (!list_empty(&fs_info->tree_mod_seq_list)) { 149 if (!list_empty(&fs_info->tree_mod_seq_list))
149 printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " 150 WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when "
150 "creating a fresh transaction\n"); 151 "creating a fresh transaction\n");
151 WARN_ON(1); 152 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
152 } 153 WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
153 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
154 printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
155 "creating a fresh transaction\n"); 154 "creating a fresh transaction\n");
156 WARN_ON(1);
157 }
158 atomic_set(&fs_info->tree_mod_seq, 0); 155 atomic_set(&fs_info->tree_mod_seq, 0);
159 156
160 spin_lock_init(&cur_trans->commit_lock); 157 spin_lock_init(&cur_trans->commit_lock);
@@ -295,9 +292,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
295 return 0; 292 return 0;
296} 293}
297 294
298static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 295static struct btrfs_trans_handle *
299 u64 num_items, int type, 296start_transaction(struct btrfs_root *root, u64 num_items, int type,
300 int noflush) 297 enum btrfs_reserve_flush_enum flush)
301{ 298{
302 struct btrfs_trans_handle *h; 299 struct btrfs_trans_handle *h;
303 struct btrfs_transaction *cur_trans; 300 struct btrfs_transaction *cur_trans;
@@ -312,6 +309,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
312 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 309 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
313 h = current->journal_info; 310 h = current->journal_info;
314 h->use_count++; 311 h->use_count++;
312 WARN_ON(h->use_count > 2);
315 h->orig_rsv = h->block_rsv; 313 h->orig_rsv = h->block_rsv;
316 h->block_rsv = NULL; 314 h->block_rsv = NULL;
317 goto got_it; 315 goto got_it;
@@ -331,14 +329,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
331 } 329 }
332 330
333 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 331 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
334 if (noflush) 332 ret = btrfs_block_rsv_add(root,
335 ret = btrfs_block_rsv_add_noflush(root, 333 &root->fs_info->trans_block_rsv,
336 &root->fs_info->trans_block_rsv, 334 num_bytes, flush);
337 num_bytes);
338 else
339 ret = btrfs_block_rsv_add(root,
340 &root->fs_info->trans_block_rsv,
341 num_bytes);
342 if (ret) 335 if (ret)
343 return ERR_PTR(ret); 336 return ERR_PTR(ret);
344 } 337 }
@@ -422,13 +415,15 @@ got_it:
422struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 415struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
423 int num_items) 416 int num_items)
424{ 417{
425 return start_transaction(root, num_items, TRANS_START, 0); 418 return start_transaction(root, num_items, TRANS_START,
419 BTRFS_RESERVE_FLUSH_ALL);
426} 420}
427 421
428struct btrfs_trans_handle *btrfs_start_transaction_noflush( 422struct btrfs_trans_handle *btrfs_start_transaction_lflush(
429 struct btrfs_root *root, int num_items) 423 struct btrfs_root *root, int num_items)
430{ 424{
431 return start_transaction(root, num_items, TRANS_START, 1); 425 return start_transaction(root, num_items, TRANS_START,
426 BTRFS_RESERVE_FLUSH_LIMIT);
432} 427}
433 428
434struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) 429struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
@@ -461,28 +456,31 @@ static noinline void wait_for_commit(struct btrfs_root *root,
461int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 456int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
462{ 457{
463 struct btrfs_transaction *cur_trans = NULL, *t; 458 struct btrfs_transaction *cur_trans = NULL, *t;
464 int ret; 459 int ret = 0;
465 460
466 ret = 0;
467 if (transid) { 461 if (transid) {
468 if (transid <= root->fs_info->last_trans_committed) 462 if (transid <= root->fs_info->last_trans_committed)
469 goto out; 463 goto out;
470 464
465 ret = -EINVAL;
471 /* find specified transaction */ 466 /* find specified transaction */
472 spin_lock(&root->fs_info->trans_lock); 467 spin_lock(&root->fs_info->trans_lock);
473 list_for_each_entry(t, &root->fs_info->trans_list, list) { 468 list_for_each_entry(t, &root->fs_info->trans_list, list) {
474 if (t->transid == transid) { 469 if (t->transid == transid) {
475 cur_trans = t; 470 cur_trans = t;
476 atomic_inc(&cur_trans->use_count); 471 atomic_inc(&cur_trans->use_count);
472 ret = 0;
477 break; 473 break;
478 } 474 }
479 if (t->transid > transid) 475 if (t->transid > transid) {
476 ret = 0;
480 break; 477 break;
478 }
481 } 479 }
482 spin_unlock(&root->fs_info->trans_lock); 480 spin_unlock(&root->fs_info->trans_lock);
483 ret = -EINVAL; 481 /* The specified transaction doesn't exist */
484 if (!cur_trans) 482 if (!cur_trans)
485 goto out; /* bad transid */ 483 goto out;
486 } else { 484 } else {
487 /* find newest transaction that is committing | committed */ 485 /* find newest transaction that is committing | committed */
488 spin_lock(&root->fs_info->trans_lock); 486 spin_lock(&root->fs_info->trans_lock);
@@ -502,9 +500,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
502 } 500 }
503 501
504 wait_for_commit(root, cur_trans); 502 wait_for_commit(root, cur_trans);
505
506 put_transaction(cur_trans); 503 put_transaction(cur_trans);
507 ret = 0;
508out: 504out:
509 return ret; 505 return ret;
510} 506}
@@ -851,7 +847,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
851 return ret; 847 return ret;
852 848
853 ret = btrfs_run_dev_stats(trans, root->fs_info); 849 ret = btrfs_run_dev_stats(trans, root->fs_info);
854 BUG_ON(ret); 850 WARN_ON(ret);
851 ret = btrfs_run_dev_replace(trans, root->fs_info);
852 WARN_ON(ret);
855 853
856 ret = btrfs_run_qgroups(trans, root->fs_info); 854 ret = btrfs_run_qgroups(trans, root->fs_info);
857 BUG_ON(ret); 855 BUG_ON(ret);
@@ -874,6 +872,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
874 switch_commit_root(fs_info->extent_root); 872 switch_commit_root(fs_info->extent_root);
875 up_write(&fs_info->extent_commit_sem); 873 up_write(&fs_info->extent_commit_sem);
876 874
875 btrfs_after_dev_replace_commit(fs_info);
876
877 return 0; 877 return 0;
878} 878}
879 879
@@ -958,7 +958,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
958 struct btrfs_fs_info *info = root->fs_info; 958 struct btrfs_fs_info *info = root->fs_info;
959 struct btrfs_trans_handle *trans; 959 struct btrfs_trans_handle *trans;
960 int ret; 960 int ret;
961 unsigned long nr;
962 961
963 if (xchg(&root->defrag_running, 1)) 962 if (xchg(&root->defrag_running, 1))
964 return 0; 963 return 0;
@@ -970,9 +969,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
970 969
971 ret = btrfs_defrag_leaves(trans, root, cacheonly); 970 ret = btrfs_defrag_leaves(trans, root, cacheonly);
972 971
973 nr = trans->blocks_used;
974 btrfs_end_transaction(trans, root); 972 btrfs_end_transaction(trans, root);
975 btrfs_btree_balance_dirty(info->tree_root, nr); 973 btrfs_btree_balance_dirty(info->tree_root);
976 cond_resched(); 974 cond_resched();
977 975
978 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) 976 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
@@ -1032,8 +1030,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1032 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1030 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
1033 1031
1034 if (to_reserve > 0) { 1032 if (to_reserve > 0) {
1035 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, 1033 ret = btrfs_block_rsv_add(root, &pending->block_rsv,
1036 to_reserve); 1034 to_reserve,
1035 BTRFS_RESERVE_NO_FLUSH);
1037 if (ret) { 1036 if (ret) {
1038 pending->error = ret; 1037 pending->error = ret;
1039 goto no_free_objectid; 1038 goto no_free_objectid;
@@ -1191,7 +1190,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1191 parent_inode, &key, 1190 parent_inode, &key,
1192 BTRFS_FT_DIR, index); 1191 BTRFS_FT_DIR, index);
1193 /* We have check then name at the beginning, so it is impossible. */ 1192 /* We have check then name at the beginning, so it is impossible. */
1194 BUG_ON(ret == -EEXIST); 1193 BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
1195 if (ret) { 1194 if (ret) {
1196 btrfs_abort_transaction(trans, root, ret); 1195 btrfs_abort_transaction(trans, root, ret);
1197 goto fail; 1196 goto fail;
@@ -1309,9 +1308,10 @@ static void do_async_commit(struct work_struct *work)
1309 * We've got freeze protection passed with the transaction. 1308 * We've got freeze protection passed with the transaction.
1310 * Tell lockdep about it. 1309 * Tell lockdep about it.
1311 */ 1310 */
1312 rwsem_acquire_read( 1311 if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1312 rwsem_acquire_read(
1314 0, 1, _THIS_IP_); 1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1314 0, 1, _THIS_IP_);
1315 1315
1316 current->journal_info = ac->newtrans; 1316 current->journal_info = ac->newtrans;
1317 1317
@@ -1349,8 +1349,10 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1349 * Tell lockdep we've released the freeze rwsem, since the 1349 * Tell lockdep we've released the freeze rwsem, since the
1350 * async commit thread will be the one to unlock it. 1350 * async commit thread will be the one to unlock it.
1351 */ 1351 */
1352 rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1352 if (trans->type < TRANS_JOIN_NOLOCK)
1353 1, _THIS_IP_); 1353 rwsem_release(
1354 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1355 1, _THIS_IP_);
1354 1356
1355 schedule_delayed_work(&ac->work, 0); 1357 schedule_delayed_work(&ac->work, 0);
1356 1358
@@ -1400,6 +1402,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1400 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1402 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1401} 1403}
1402 1404
1405static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1406 struct btrfs_root *root)
1407{
1408 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1409 int snap_pending = 0;
1410 int ret;
1411
1412 if (!flush_on_commit) {
1413 spin_lock(&root->fs_info->trans_lock);
1414 if (!list_empty(&trans->transaction->pending_snapshots))
1415 snap_pending = 1;
1416 spin_unlock(&root->fs_info->trans_lock);
1417 }
1418
1419 if (flush_on_commit || snap_pending) {
1420 btrfs_start_delalloc_inodes(root, 1);
1421 btrfs_wait_ordered_extents(root, 1);
1422 }
1423
1424 ret = btrfs_run_delayed_items(trans, root);
1425 if (ret)
1426 return ret;
1427
1428 /*
1429 * running the delayed items may have added new refs. account
1430 * them now so that they hinder processing of more delayed refs
1431 * as little as possible.
1432 */
1433 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1434
1435 /*
1436 * rename don't use btrfs_join_transaction, so, once we
1437 * set the transaction to blocked above, we aren't going
1438 * to get any new ordered operations. We can safely run
1439 * it here and no for sure that nothing new will be added
1440 * to the list
1441 */
1442 btrfs_run_ordered_operations(root, 1);
1443
1444 return 0;
1445}
1446
1403/* 1447/*
1404 * btrfs_transaction state sequence: 1448 * btrfs_transaction state sequence:
1405 * in_commit = 0, blocked = 0 (initial) 1449 * in_commit = 0, blocked = 0 (initial)
@@ -1414,15 +1458,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1414 struct btrfs_transaction *cur_trans = trans->transaction; 1458 struct btrfs_transaction *cur_trans = trans->transaction;
1415 struct btrfs_transaction *prev_trans = NULL; 1459 struct btrfs_transaction *prev_trans = NULL;
1416 DEFINE_WAIT(wait); 1460 DEFINE_WAIT(wait);
1417 int ret = -EIO; 1461 int ret;
1418 int should_grow = 0; 1462 int should_grow = 0;
1419 unsigned long now = get_seconds(); 1463 unsigned long now = get_seconds();
1420 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1421 1464
1422 btrfs_run_ordered_operations(root, 0); 1465 ret = btrfs_run_ordered_operations(root, 0);
1466 if (ret) {
1467 btrfs_abort_transaction(trans, root, ret);
1468 goto cleanup_transaction;
1469 }
1423 1470
1424 if (cur_trans->aborted) 1471 if (cur_trans->aborted) {
1472 ret = cur_trans->aborted;
1425 goto cleanup_transaction; 1473 goto cleanup_transaction;
1474 }
1426 1475
1427 /* make a pass through all the delayed refs we have so far 1476 /* make a pass through all the delayed refs we have so far
1428 * any runnings procs may add more while we are here 1477 * any runnings procs may add more while we are here
@@ -1490,39 +1539,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1490 should_grow = 1; 1539 should_grow = 1;
1491 1540
1492 do { 1541 do {
1493 int snap_pending = 0;
1494
1495 joined = cur_trans->num_joined; 1542 joined = cur_trans->num_joined;
1496 if (!list_empty(&trans->transaction->pending_snapshots))
1497 snap_pending = 1;
1498 1543
1499 WARN_ON(cur_trans != trans->transaction); 1544 WARN_ON(cur_trans != trans->transaction);
1500 1545
1501 if (flush_on_commit || snap_pending) { 1546 ret = btrfs_flush_all_pending_stuffs(trans, root);
1502 btrfs_start_delalloc_inodes(root, 1);
1503 btrfs_wait_ordered_extents(root, 1);
1504 }
1505
1506 ret = btrfs_run_delayed_items(trans, root);
1507 if (ret) 1547 if (ret)
1508 goto cleanup_transaction; 1548 goto cleanup_transaction;
1509 1549
1510 /*
1511 * running the delayed items may have added new refs. account
1512 * them now so that they hinder processing of more delayed refs
1513 * as little as possible.
1514 */
1515 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1516
1517 /*
1518 * rename don't use btrfs_join_transaction, so, once we
1519 * set the transaction to blocked above, we aren't going
1520 * to get any new ordered operations. We can safely run
1521 * it here and no for sure that nothing new will be added
1522 * to the list
1523 */
1524 btrfs_run_ordered_operations(root, 1);
1525
1526 prepare_to_wait(&cur_trans->writer_wait, &wait, 1550 prepare_to_wait(&cur_trans->writer_wait, &wait,
1527 TASK_UNINTERRUPTIBLE); 1551 TASK_UNINTERRUPTIBLE);
1528 1552
@@ -1535,6 +1559,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1535 } while (atomic_read(&cur_trans->num_writers) > 1 || 1559 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1536 (should_grow && cur_trans->num_joined != joined)); 1560 (should_grow && cur_trans->num_joined != joined));
1537 1561
1562 ret = btrfs_flush_all_pending_stuffs(trans, root);
1563 if (ret)
1564 goto cleanup_transaction;
1565
1538 /* 1566 /*
1539 * Ok now we need to make sure to block out any other joins while we 1567 * Ok now we need to make sure to block out any other joins while we
1540 * commit the transaction. We could have started a join before setting 1568 * commit the transaction. We could have started a join before setting
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 80961947a6b2..0e8aa1e6c287 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -105,7 +105,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 105 struct btrfs_root *root);
106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
107 int num_items); 107 int num_items);
108struct btrfs_trans_handle *btrfs_start_transaction_noflush( 108struct btrfs_trans_handle *btrfs_start_transaction_lflush(
109 struct btrfs_root *root, int num_items); 109 struct btrfs_root *root, int num_items);
110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 81e407d9677a..83186c7e45d4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2952 struct btrfs_inode_item *item, 2952 struct btrfs_inode_item *item,
2953 struct inode *inode, int log_inode_only) 2953 struct inode *inode, int log_inode_only)
2954{ 2954{
2955 btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); 2955 struct btrfs_map_token token;
2956 btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); 2956
2957 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2957 btrfs_init_map_token(&token);
2958 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2959
2960 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2961 inode->i_atime.tv_sec);
2962 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2963 inode->i_atime.tv_nsec);
2964
2965 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2966 inode->i_mtime.tv_sec);
2967 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2968 inode->i_mtime.tv_nsec);
2969
2970 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2971 inode->i_ctime.tv_sec);
2972 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2973 inode->i_ctime.tv_nsec);
2974
2975 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2976
2977 btrfs_set_inode_sequence(leaf, item, inode->i_version);
2978 btrfs_set_inode_transid(leaf, item, trans->transid);
2979 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2980 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2981 btrfs_set_inode_block_group(leaf, item, 0);
2982 2958
2983 if (log_inode_only) { 2959 if (log_inode_only) {
2984 /* set the generation to zero so the recover code 2960 /* set the generation to zero so the recover code
@@ -2986,14 +2962,63 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2986 * just to say 'this inode exists' and a logging 2962 * just to say 'this inode exists' and a logging
2987 * to say 'update this inode with these values' 2963 * to say 'update this inode with these values'
2988 */ 2964 */
2989 btrfs_set_inode_generation(leaf, item, 0); 2965 btrfs_set_token_inode_generation(leaf, item, 0, &token);
2990 btrfs_set_inode_size(leaf, item, 0); 2966 btrfs_set_token_inode_size(leaf, item, 0, &token);
2991 } else { 2967 } else {
2992 btrfs_set_inode_generation(leaf, item, 2968 btrfs_set_token_inode_generation(leaf, item,
2993 BTRFS_I(inode)->generation); 2969 BTRFS_I(inode)->generation,
2994 btrfs_set_inode_size(leaf, item, inode->i_size); 2970 &token);
2995 } 2971 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
2972 }
2973
2974 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
2975 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
2976 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
2977 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
2978
2979 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
2980 inode->i_atime.tv_sec, &token);
2981 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
2982 inode->i_atime.tv_nsec, &token);
2983
2984 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
2985 inode->i_mtime.tv_sec, &token);
2986 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
2987 inode->i_mtime.tv_nsec, &token);
2988
2989 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
2990 inode->i_ctime.tv_sec, &token);
2991 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
2992 inode->i_ctime.tv_nsec, &token);
2993
2994 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
2995 &token);
2996
2997 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
2998 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
2999 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3000 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3001 btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3002}
2996 3003
3004static int log_inode_item(struct btrfs_trans_handle *trans,
3005 struct btrfs_root *log, struct btrfs_path *path,
3006 struct inode *inode)
3007{
3008 struct btrfs_inode_item *inode_item;
3009 struct btrfs_key key;
3010 int ret;
3011
3012 memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
3013 ret = btrfs_insert_empty_item(trans, log, path, &key,
3014 sizeof(*inode_item));
3015 if (ret && ret != -EEXIST)
3016 return ret;
3017 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3018 struct btrfs_inode_item);
3019 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
3020 btrfs_release_path(path);
3021 return 0;
2997} 3022}
2998 3023
2999static noinline int copy_items(struct btrfs_trans_handle *trans, 3024static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -3130,151 +3155,234 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3130 return 0; 3155 return 0;
3131} 3156}
3132 3157
3133struct log_args { 3158static int drop_adjacent_extents(struct btrfs_trans_handle *trans,
3134 struct extent_buffer *src; 3159 struct btrfs_root *root, struct inode *inode,
3135 u64 next_offset; 3160 struct extent_map *em,
3136 int start_slot; 3161 struct btrfs_path *path)
3137 int nr; 3162{
3138}; 3163 struct btrfs_file_extent_item *fi;
3164 struct extent_buffer *leaf;
3165 struct btrfs_key key, new_key;
3166 struct btrfs_map_token token;
3167 u64 extent_end;
3168 u64 extent_offset = 0;
3169 int extent_type;
3170 int del_slot = 0;
3171 int del_nr = 0;
3172 int ret = 0;
3173
3174 while (1) {
3175 btrfs_init_map_token(&token);
3176 leaf = path->nodes[0];
3177 path->slots[0]++;
3178 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3179 if (del_nr) {
3180 ret = btrfs_del_items(trans, root, path,
3181 del_slot, del_nr);
3182 if (ret)
3183 return ret;
3184 del_nr = 0;
3185 }
3186
3187 ret = btrfs_next_leaf_write(trans, root, path, 1);
3188 if (ret < 0)
3189 return ret;
3190 if (ret > 0)
3191 return 0;
3192 leaf = path->nodes[0];
3193 }
3194
3195 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3196 if (key.objectid != btrfs_ino(inode) ||
3197 key.type != BTRFS_EXTENT_DATA_KEY ||
3198 key.offset >= em->start + em->len)
3199 break;
3200
3201 fi = btrfs_item_ptr(leaf, path->slots[0],
3202 struct btrfs_file_extent_item);
3203 extent_type = btrfs_token_file_extent_type(leaf, fi, &token);
3204 if (extent_type == BTRFS_FILE_EXTENT_REG ||
3205 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
3206 extent_offset = btrfs_token_file_extent_offset(leaf,
3207 fi, &token);
3208 extent_end = key.offset +
3209 btrfs_token_file_extent_num_bytes(leaf, fi,
3210 &token);
3211 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
3212 extent_end = key.offset +
3213 btrfs_file_extent_inline_len(leaf, fi);
3214 } else {
3215 BUG();
3216 }
3217
3218 if (extent_end <= em->len + em->start) {
3219 if (!del_nr) {
3220 del_slot = path->slots[0];
3221 }
3222 del_nr++;
3223 continue;
3224 }
3225
3226 /*
3227 * Ok so we'll ignore previous items if we log a new extent,
3228 * which can lead to overlapping extents, so if we have an
3229 * existing extent we want to adjust we _have_ to check the next
3230 * guy to make sure we even need this extent anymore, this keeps
3231 * us from panicing in set_item_key_safe.
3232 */
3233 if (path->slots[0] < btrfs_header_nritems(leaf) - 1) {
3234 struct btrfs_key tmp_key;
3235
3236 btrfs_item_key_to_cpu(leaf, &tmp_key,
3237 path->slots[0] + 1);
3238 if (tmp_key.objectid == btrfs_ino(inode) &&
3239 tmp_key.type == BTRFS_EXTENT_DATA_KEY &&
3240 tmp_key.offset <= em->start + em->len) {
3241 if (!del_nr)
3242 del_slot = path->slots[0];
3243 del_nr++;
3244 continue;
3245 }
3246 }
3247
3248 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
3249 memcpy(&new_key, &key, sizeof(new_key));
3250 new_key.offset = em->start + em->len;
3251 btrfs_set_item_key_safe(trans, root, path, &new_key);
3252 extent_offset += em->start + em->len - key.offset;
3253 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset,
3254 &token);
3255 btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end -
3256 (em->start + em->len),
3257 &token);
3258 btrfs_mark_buffer_dirty(leaf);
3259 }
3260
3261 if (del_nr)
3262 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
3263
3264 return ret;
3265}
3139 3266
3140static int log_one_extent(struct btrfs_trans_handle *trans, 3267static int log_one_extent(struct btrfs_trans_handle *trans,
3141 struct inode *inode, struct btrfs_root *root, 3268 struct inode *inode, struct btrfs_root *root,
3142 struct extent_map *em, struct btrfs_path *path, 3269 struct extent_map *em, struct btrfs_path *path)
3143 struct btrfs_path *dst_path, struct log_args *args)
3144{ 3270{
3145 struct btrfs_root *log = root->log_root; 3271 struct btrfs_root *log = root->log_root;
3146 struct btrfs_file_extent_item *fi; 3272 struct btrfs_file_extent_item *fi;
3273 struct extent_buffer *leaf;
3274 struct list_head ordered_sums;
3275 struct btrfs_map_token token;
3147 struct btrfs_key key; 3276 struct btrfs_key key;
3148 u64 start = em->mod_start; 3277 u64 csum_offset = em->mod_start - em->start;
3149 u64 search_start = start; 3278 u64 csum_len = em->mod_len;
3150 u64 len = em->mod_len; 3279 u64 extent_offset = em->start - em->orig_start;
3151 u64 num_bytes; 3280 u64 block_len;
3152 int nritems;
3153 int ret; 3281 int ret;
3282 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3154 3283
3155 if (BTRFS_I(inode)->logged_trans == trans->transid) { 3284 INIT_LIST_HEAD(&ordered_sums);
3156 ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, 3285 btrfs_init_map_token(&token);
3157 start + len, NULL, 0); 3286 key.objectid = btrfs_ino(inode);
3158 if (ret) 3287 key.type = BTRFS_EXTENT_DATA_KEY;
3159 return ret; 3288 key.offset = em->start;
3289 path->really_keep_locks = 1;
3290
3291 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
3292 if (ret && ret != -EEXIST) {
3293 path->really_keep_locks = 0;
3294 return ret;
3160 } 3295 }
3296 leaf = path->nodes[0];
3297 fi = btrfs_item_ptr(leaf, path->slots[0],
3298 struct btrfs_file_extent_item);
3299 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3300 &token);
3301 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3302 skip_csum = true;
3303 btrfs_set_token_file_extent_type(leaf, fi,
3304 BTRFS_FILE_EXTENT_PREALLOC,
3305 &token);
3306 } else {
3307 btrfs_set_token_file_extent_type(leaf, fi,
3308 BTRFS_FILE_EXTENT_REG,
3309 &token);
3310 if (em->block_start == 0)
3311 skip_csum = true;
3312 }
3313
3314 block_len = max(em->block_len, em->orig_block_len);
3315 if (em->compress_type != BTRFS_COMPRESS_NONE) {
3316 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3317 em->block_start,
3318 &token);
3319 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3320 &token);
3321 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3322 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3323 em->block_start -
3324 extent_offset, &token);
3325 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3326 &token);
3327 } else {
3328 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3329 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3330 &token);
3331 }
3332
3333 btrfs_set_token_file_extent_offset(leaf, fi,
3334 em->start - em->orig_start,
3335 &token);
3336 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3337 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token);
3338 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3339 &token);
3340 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3341 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3342 btrfs_mark_buffer_dirty(leaf);
3161 3343
3162 while (len) { 3344 /*
3163 if (args->nr) 3345 * Have to check the extent to the right of us to make sure it doesn't
3164 goto next_slot; 3346 * fall in our current range. We're ok if the previous extent is in our
3165again: 3347 * range since the recovery stuff will run us in key order and thus just
3166 key.objectid = btrfs_ino(inode); 3348 * drop the part we overwrote.
3167 key.type = BTRFS_EXTENT_DATA_KEY; 3349 */
3168 key.offset = search_start; 3350 ret = drop_adjacent_extents(trans, log, inode, em, path);
3169 3351 btrfs_release_path(path);
3170 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3352 path->really_keep_locks = 0;
3171 if (ret < 0) 3353 if (ret) {
3172 return ret; 3354 return ret;
3173 3355 }
3174 if (ret) {
3175 /*
3176 * A rare case were we can have an em for a section of a
3177 * larger extent so we need to make sure that this em
3178 * falls within the extent we've found. If not we just
3179 * bail and go back to ye-olde way of doing things but
3180 * it happens often enough in testing that we need to do
3181 * this dance to make sure.
3182 */
3183 do {
3184 if (path->slots[0] == 0) {
3185 btrfs_release_path(path);
3186 if (search_start == 0)
3187 return -ENOENT;
3188 search_start--;
3189 goto again;
3190 }
3191
3192 path->slots[0]--;
3193 btrfs_item_key_to_cpu(path->nodes[0], &key,
3194 path->slots[0]);
3195 if (key.objectid != btrfs_ino(inode) ||
3196 key.type != BTRFS_EXTENT_DATA_KEY) {
3197 btrfs_release_path(path);
3198 return -ENOENT;
3199 }
3200 } while (key.offset > start);
3201 3356
3202 fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 3357 if (skip_csum)
3203 struct btrfs_file_extent_item); 3358 return 0;
3204 num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
3205 fi);
3206 if (key.offset + num_bytes <= start) {
3207 btrfs_release_path(path);
3208 return -ENOENT;
3209 }
3210 }
3211 args->src = path->nodes[0];
3212next_slot:
3213 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3214 fi = btrfs_item_ptr(args->src, path->slots[0],
3215 struct btrfs_file_extent_item);
3216 if (args->nr &&
3217 args->start_slot + args->nr == path->slots[0]) {
3218 args->nr++;
3219 } else if (args->nr) {
3220 ret = copy_items(trans, inode, dst_path, args->src,
3221 args->start_slot, args->nr,
3222 LOG_INODE_ALL);
3223 if (ret)
3224 return ret;
3225 args->nr = 1;
3226 args->start_slot = path->slots[0];
3227 } else if (!args->nr) {
3228 args->nr = 1;
3229 args->start_slot = path->slots[0];
3230 }
3231 nritems = btrfs_header_nritems(path->nodes[0]);
3232 path->slots[0]++;
3233 num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
3234 if (len < num_bytes) {
3235 /* I _think_ this is ok, envision we write to a
3236 * preallocated space that is adjacent to a previously
3237 * written preallocated space that gets merged when we
3238 * mark this preallocated space written. If we do not
3239 * have the adjacent extent in cache then when we copy
3240 * this extent it could end up being larger than our EM
3241 * thinks it is, which is a-ok, so just set len to 0.
3242 */
3243 len = 0;
3244 } else {
3245 len -= num_bytes;
3246 }
3247 start = key.offset + num_bytes;
3248 args->next_offset = start;
3249 search_start = start;
3250 3359
3251 if (path->slots[0] < nritems) { 3360 /* block start is already adjusted for the file extent offset. */
3252 if (len) 3361 ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3253 goto next_slot; 3362 em->block_start + csum_offset,
3254 break; 3363 em->block_start + csum_offset +
3255 } 3364 csum_len - 1, &ordered_sums, 0);
3365 if (ret)
3366 return ret;
3256 3367
3257 if (args->nr) { 3368 while (!list_empty(&ordered_sums)) {
3258 ret = copy_items(trans, inode, dst_path, args->src, 3369 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3259 args->start_slot, args->nr, 3370 struct btrfs_ordered_sum,
3260 LOG_INODE_ALL); 3371 list);
3261 if (ret) 3372 if (!ret)
3262 return ret; 3373 ret = btrfs_csum_file_blocks(trans, log, sums);
3263 args->nr = 0; 3374 list_del(&sums->list);
3264 btrfs_release_path(path); 3375 kfree(sums);
3265 }
3266 } 3376 }
3267 3377
3268 return 0; 3378 return ret;
3269} 3379}
3270 3380
3271static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3381static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3272 struct btrfs_root *root, 3382 struct btrfs_root *root,
3273 struct inode *inode, 3383 struct inode *inode,
3274 struct btrfs_path *path, 3384 struct btrfs_path *path)
3275 struct btrfs_path *dst_path)
3276{ 3385{
3277 struct log_args args;
3278 struct extent_map *em, *n; 3386 struct extent_map *em, *n;
3279 struct list_head extents; 3387 struct list_head extents;
3280 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3388 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
@@ -3283,8 +3391,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3283 3391
3284 INIT_LIST_HEAD(&extents); 3392 INIT_LIST_HEAD(&extents);
3285 3393
3286 memset(&args, 0, sizeof(args));
3287
3288 write_lock(&tree->lock); 3394 write_lock(&tree->lock);
3289 test_gen = root->fs_info->last_trans_committed; 3395 test_gen = root->fs_info->last_trans_committed;
3290 3396
@@ -3317,34 +3423,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3317 3423
3318 write_unlock(&tree->lock); 3424 write_unlock(&tree->lock);
3319 3425
3320 /* 3426 ret = log_one_extent(trans, inode, root, em, path);
3321 * If the previous EM and the last extent we left off on aren't
3322 * sequential then we need to copy the items we have and redo
3323 * our search
3324 */
3325 if (args.nr && em->mod_start != args.next_offset) {
3326 ret = copy_items(trans, inode, dst_path, args.src,
3327 args.start_slot, args.nr,
3328 LOG_INODE_ALL);
3329 if (ret) {
3330 free_extent_map(em);
3331 write_lock(&tree->lock);
3332 continue;
3333 }
3334 btrfs_release_path(path);
3335 args.nr = 0;
3336 }
3337
3338 ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
3339 free_extent_map(em); 3427 free_extent_map(em);
3340 write_lock(&tree->lock); 3428 write_lock(&tree->lock);
3341 } 3429 }
3342 WARN_ON(!list_empty(&extents)); 3430 WARN_ON(!list_empty(&extents));
3343 write_unlock(&tree->lock); 3431 write_unlock(&tree->lock);
3344 3432
3345 if (!ret && args.nr)
3346 ret = copy_items(trans, inode, dst_path, args.src,
3347 args.start_slot, args.nr, LOG_INODE_ALL);
3348 btrfs_release_path(path); 3433 btrfs_release_path(path);
3349 return ret; 3434 return ret;
3350} 3435}
@@ -3400,7 +3485,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3400 3485
3401 3486
3402 /* today the code can only do partial logging of directories */ 3487 /* today the code can only do partial logging of directories */
3403 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3488 if (S_ISDIR(inode->i_mode) ||
3489 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3490 &BTRFS_I(inode)->runtime_flags) &&
3491 inode_only == LOG_INODE_EXISTS))
3404 max_key.type = BTRFS_XATTR_ITEM_KEY; 3492 max_key.type = BTRFS_XATTR_ITEM_KEY;
3405 else 3493 else
3406 max_key.type = (u8)-1; 3494 max_key.type = (u8)-1;
@@ -3432,14 +3520,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3432 } else { 3520 } else {
3433 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3521 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3434 &BTRFS_I(inode)->runtime_flags)) { 3522 &BTRFS_I(inode)->runtime_flags)) {
3523 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3524 &BTRFS_I(inode)->runtime_flags);
3435 ret = btrfs_truncate_inode_items(trans, log, 3525 ret = btrfs_truncate_inode_items(trans, log,
3436 inode, 0, 0); 3526 inode, 0, 0);
3437 } else { 3527 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3438 fast_search = true; 3528 &BTRFS_I(inode)->runtime_flags)) {
3529 if (inode_only == LOG_INODE_ALL)
3530 fast_search = true;
3439 max_key.type = BTRFS_XATTR_ITEM_KEY; 3531 max_key.type = BTRFS_XATTR_ITEM_KEY;
3440 ret = drop_objectid_items(trans, log, path, ino, 3532 ret = drop_objectid_items(trans, log, path, ino,
3441 BTRFS_XATTR_ITEM_KEY); 3533 max_key.type);
3534 } else {
3535 if (inode_only == LOG_INODE_ALL)
3536 fast_search = true;
3537 ret = log_inode_item(trans, log, dst_path, inode);
3538 if (ret) {
3539 err = ret;
3540 goto out_unlock;
3541 }
3542 goto log_extents;
3442 } 3543 }
3544
3443 } 3545 }
3444 if (ret) { 3546 if (ret) {
3445 err = ret; 3547 err = ret;
@@ -3518,11 +3620,10 @@ next_slot:
3518 ins_nr = 0; 3620 ins_nr = 0;
3519 } 3621 }
3520 3622
3623log_extents:
3521 if (fast_search) { 3624 if (fast_search) {
3522 btrfs_release_path(path);
3523 btrfs_release_path(dst_path); 3625 btrfs_release_path(dst_path);
3524 ret = btrfs_log_changed_extents(trans, root, inode, path, 3626 ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
3525 dst_path);
3526 if (ret) { 3627 if (ret) {
3527 err = ret; 3628 err = ret;
3528 goto out_unlock; 3629 goto out_unlock;
@@ -3531,8 +3632,10 @@ next_slot:
3531 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3632 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3532 struct extent_map *em, *n; 3633 struct extent_map *em, *n;
3533 3634
3635 write_lock(&tree->lock);
3534 list_for_each_entry_safe(em, n, &tree->modified_extents, list) 3636 list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3535 list_del_init(&em->list); 3637 list_del_init(&em->list);
3638 write_unlock(&tree->lock);
3536 } 3639 }
3537 3640
3538 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3641 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e3c6ee3cc2ba..5cce6aa74012 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,7 +25,6 @@
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <asm/div64.h>
29#include "compat.h" 28#include "compat.h"
30#include "ctree.h" 29#include "ctree.h"
31#include "extent_map.h" 30#include "extent_map.h"
@@ -36,6 +35,8 @@
36#include "async-thread.h" 35#include "async-thread.h"
37#include "check-integrity.h" 36#include "check-integrity.h"
38#include "rcu-string.h" 37#include "rcu-string.h"
38#include "math.h"
39#include "dev-replace.h"
39 40
40static int init_first_rw_device(struct btrfs_trans_handle *trans, 41static int init_first_rw_device(struct btrfs_trans_handle *trans,
41 struct btrfs_root *root, 42 struct btrfs_root *root,
@@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
71 kfree(fs_devices); 72 kfree(fs_devices);
72} 73}
73 74
75static void btrfs_kobject_uevent(struct block_device *bdev,
76 enum kobject_action action)
77{
78 int ret;
79
80 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
81 if (ret)
82 pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
83 action,
84 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
85 &disk_to_dev(bdev->bd_disk)->kobj);
86}
87
74void btrfs_cleanup_fs_uuids(void) 88void btrfs_cleanup_fs_uuids(void)
75{ 89{
76 struct btrfs_fs_devices *fs_devices; 90 struct btrfs_fs_devices *fs_devices;
@@ -108,6 +122,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
108 return NULL; 122 return NULL;
109} 123}
110 124
125static int
126btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
127 int flush, struct block_device **bdev,
128 struct buffer_head **bh)
129{
130 int ret;
131
132 *bdev = blkdev_get_by_path(device_path, flags, holder);
133
134 if (IS_ERR(*bdev)) {
135 ret = PTR_ERR(*bdev);
136 printk(KERN_INFO "btrfs: open %s failed\n", device_path);
137 goto error;
138 }
139
140 if (flush)
141 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
142 ret = set_blocksize(*bdev, 4096);
143 if (ret) {
144 blkdev_put(*bdev, flags);
145 goto error;
146 }
147 invalidate_bdev(*bdev);
148 *bh = btrfs_read_dev_super(*bdev);
149 if (!*bh) {
150 ret = -EINVAL;
151 blkdev_put(*bdev, flags);
152 goto error;
153 }
154
155 return 0;
156
157error:
158 *bdev = NULL;
159 *bh = NULL;
160 return ret;
161}
162
111static void requeue_list(struct btrfs_pending_bios *pending_bios, 163static void requeue_list(struct btrfs_pending_bios *pending_bios,
112 struct bio *head, struct bio *tail) 164 struct bio *head, struct bio *tail)
113{ 165{
@@ -467,7 +519,8 @@ error:
467 return ERR_PTR(-ENOMEM); 519 return ERR_PTR(-ENOMEM);
468} 520}
469 521
470void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 522void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
523 struct btrfs_fs_devices *fs_devices, int step)
471{ 524{
472 struct btrfs_device *device, *next; 525 struct btrfs_device *device, *next;
473 526
@@ -480,8 +533,9 @@ again:
480 /* This is the initialized path, it is safe to release the devices. */ 533 /* This is the initialized path, it is safe to release the devices. */
481 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 534 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
482 if (device->in_fs_metadata) { 535 if (device->in_fs_metadata) {
483 if (!latest_transid || 536 if (!device->is_tgtdev_for_dev_replace &&
484 device->generation > latest_transid) { 537 (!latest_transid ||
538 device->generation > latest_transid)) {
485 latest_devid = device->devid; 539 latest_devid = device->devid;
486 latest_transid = device->generation; 540 latest_transid = device->generation;
487 latest_bdev = device->bdev; 541 latest_bdev = device->bdev;
@@ -489,6 +543,21 @@ again:
489 continue; 543 continue;
490 } 544 }
491 545
546 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
547 /*
548 * In the first step, keep the device which has
549 * the correct fsid and the devid that is used
550 * for the dev_replace procedure.
551 * In the second step, the dev_replace state is
552 * read from the device tree and it is known
553 * whether the procedure is really active or
554 * not, which means whether this device is
555 * used or whether it should be removed.
556 */
557 if (step == 0 || device->is_tgtdev_for_dev_replace) {
558 continue;
559 }
560 }
492 if (device->bdev) { 561 if (device->bdev) {
493 blkdev_put(device->bdev, device->mode); 562 blkdev_put(device->bdev, device->mode);
494 device->bdev = NULL; 563 device->bdev = NULL;
@@ -497,7 +566,8 @@ again:
497 if (device->writeable) { 566 if (device->writeable) {
498 list_del_init(&device->dev_alloc_list); 567 list_del_init(&device->dev_alloc_list);
499 device->writeable = 0; 568 device->writeable = 0;
500 fs_devices->rw_devices--; 569 if (!device->is_tgtdev_for_dev_replace)
570 fs_devices->rw_devices--;
501 } 571 }
502 list_del_init(&device->dev_list); 572 list_del_init(&device->dev_list);
503 fs_devices->num_devices--; 573 fs_devices->num_devices--;
@@ -555,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
555 if (device->bdev) 625 if (device->bdev)
556 fs_devices->open_devices--; 626 fs_devices->open_devices--;
557 627
558 if (device->writeable) { 628 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
559 list_del_init(&device->dev_alloc_list); 629 list_del_init(&device->dev_alloc_list);
560 fs_devices->rw_devices--; 630 fs_devices->rw_devices--;
561 } 631 }
@@ -637,18 +707,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
637 if (!device->name) 707 if (!device->name)
638 continue; 708 continue;
639 709
640 bdev = blkdev_get_by_path(device->name->str, flags, holder); 710 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
641 if (IS_ERR(bdev)) { 711 &bdev, &bh);
642 printk(KERN_INFO "btrfs: open %s failed\n", device->name->str); 712 if (ret)
643 goto error; 713 continue;
644 }
645 filemap_write_and_wait(bdev->bd_inode->i_mapping);
646 invalidate_bdev(bdev);
647 set_blocksize(bdev, 4096);
648
649 bh = btrfs_read_dev_super(bdev);
650 if (!bh)
651 goto error_close;
652 714
653 disk_super = (struct btrfs_super_block *)bh->b_data; 715 disk_super = (struct btrfs_super_block *)bh->b_data;
654 devid = btrfs_stack_device_id(&disk_super->dev_item); 716 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -687,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
687 fs_devices->rotating = 1; 749 fs_devices->rotating = 1;
688 750
689 fs_devices->open_devices++; 751 fs_devices->open_devices++;
690 if (device->writeable) { 752 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
691 fs_devices->rw_devices++; 753 fs_devices->rw_devices++;
692 list_add(&device->dev_alloc_list, 754 list_add(&device->dev_alloc_list,
693 &fs_devices->alloc_list); 755 &fs_devices->alloc_list);
@@ -697,9 +759,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
697 759
698error_brelse: 760error_brelse:
699 brelse(bh); 761 brelse(bh);
700error_close:
701 blkdev_put(bdev, flags); 762 blkdev_put(bdev, flags);
702error:
703 continue; 763 continue;
704 } 764 }
705 if (fs_devices->open_devices == 0) { 765 if (fs_devices->open_devices == 0) {
@@ -744,40 +804,30 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
744 u64 total_devices; 804 u64 total_devices;
745 805
746 flags |= FMODE_EXCL; 806 flags |= FMODE_EXCL;
747 bdev = blkdev_get_by_path(path, flags, holder);
748
749 if (IS_ERR(bdev)) {
750 ret = PTR_ERR(bdev);
751 goto error;
752 }
753
754 mutex_lock(&uuid_mutex); 807 mutex_lock(&uuid_mutex);
755 ret = set_blocksize(bdev, 4096); 808 ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
756 if (ret) 809 if (ret)
757 goto error_close; 810 goto error;
758 bh = btrfs_read_dev_super(bdev);
759 if (!bh) {
760 ret = -EINVAL;
761 goto error_close;
762 }
763 disk_super = (struct btrfs_super_block *)bh->b_data; 811 disk_super = (struct btrfs_super_block *)bh->b_data;
764 devid = btrfs_stack_device_id(&disk_super->dev_item); 812 devid = btrfs_stack_device_id(&disk_super->dev_item);
765 transid = btrfs_super_generation(disk_super); 813 transid = btrfs_super_generation(disk_super);
766 total_devices = btrfs_super_num_devices(disk_super); 814 total_devices = btrfs_super_num_devices(disk_super);
767 if (disk_super->label[0]) 815 if (disk_super->label[0]) {
816 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
817 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
768 printk(KERN_INFO "device label %s ", disk_super->label); 818 printk(KERN_INFO "device label %s ", disk_super->label);
769 else 819 } else {
770 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 820 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
821 }
771 printk(KERN_CONT "devid %llu transid %llu %s\n", 822 printk(KERN_CONT "devid %llu transid %llu %s\n",
772 (unsigned long long)devid, (unsigned long long)transid, path); 823 (unsigned long long)devid, (unsigned long long)transid, path);
773 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 824 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
774 if (!ret && fs_devices_ret) 825 if (!ret && fs_devices_ret)
775 (*fs_devices_ret)->total_devices = total_devices; 826 (*fs_devices_ret)->total_devices = total_devices;
776 brelse(bh); 827 brelse(bh);
777error_close:
778 mutex_unlock(&uuid_mutex);
779 blkdev_put(bdev, flags); 828 blkdev_put(bdev, flags);
780error: 829error:
830 mutex_unlock(&uuid_mutex);
781 return ret; 831 return ret;
782} 832}
783 833
@@ -796,7 +846,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
796 846
797 *length = 0; 847 *length = 0;
798 848
799 if (start >= device->total_bytes) 849 if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
800 return 0; 850 return 0;
801 851
802 path = btrfs_alloc_path(); 852 path = btrfs_alloc_path();
@@ -913,7 +963,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
913 max_hole_size = 0; 963 max_hole_size = 0;
914 hole_size = 0; 964 hole_size = 0;
915 965
916 if (search_start >= search_end) { 966 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
917 ret = -ENOSPC; 967 ret = -ENOSPC;
918 goto error; 968 goto error;
919 } 969 }
@@ -1096,6 +1146,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1096 struct btrfs_key key; 1146 struct btrfs_key key;
1097 1147
1098 WARN_ON(!device->in_fs_metadata); 1148 WARN_ON(!device->in_fs_metadata);
1149 WARN_ON(device->is_tgtdev_for_dev_replace);
1099 path = btrfs_alloc_path(); 1150 path = btrfs_alloc_path();
1100 if (!path) 1151 if (!path)
1101 return -ENOMEM; 1152 return -ENOMEM;
@@ -1330,16 +1381,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1330 root->fs_info->avail_system_alloc_bits | 1381 root->fs_info->avail_system_alloc_bits |
1331 root->fs_info->avail_metadata_alloc_bits; 1382 root->fs_info->avail_metadata_alloc_bits;
1332 1383
1333 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1384 num_devices = root->fs_info->fs_devices->num_devices;
1334 root->fs_info->fs_devices->num_devices <= 4) { 1385 btrfs_dev_replace_lock(&root->fs_info->dev_replace);
1386 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1387 WARN_ON(num_devices < 1);
1388 num_devices--;
1389 }
1390 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1391
1392 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1335 printk(KERN_ERR "btrfs: unable to go below four devices " 1393 printk(KERN_ERR "btrfs: unable to go below four devices "
1336 "on raid10\n"); 1394 "on raid10\n");
1337 ret = -EINVAL; 1395 ret = -EINVAL;
1338 goto out; 1396 goto out;
1339 } 1397 }
1340 1398
1341 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1399 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1342 root->fs_info->fs_devices->num_devices <= 2) {
1343 printk(KERN_ERR "btrfs: unable to go below two " 1400 printk(KERN_ERR "btrfs: unable to go below two "
1344 "devices on raid1\n"); 1401 "devices on raid1\n");
1345 ret = -EINVAL; 1402 ret = -EINVAL;
@@ -1357,7 +1414,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1357 * is held. 1414 * is held.
1358 */ 1415 */
1359 list_for_each_entry(tmp, devices, dev_list) { 1416 list_for_each_entry(tmp, devices, dev_list) {
1360 if (tmp->in_fs_metadata && !tmp->bdev) { 1417 if (tmp->in_fs_metadata &&
1418 !tmp->is_tgtdev_for_dev_replace &&
1419 !tmp->bdev) {
1361 device = tmp; 1420 device = tmp;
1362 break; 1421 break;
1363 } 1422 }
@@ -1371,24 +1430,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1371 goto out; 1430 goto out;
1372 } 1431 }
1373 } else { 1432 } else {
1374 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, 1433 ret = btrfs_get_bdev_and_sb(device_path,
1375 root->fs_info->bdev_holder); 1434 FMODE_READ | FMODE_EXCL,
1376 if (IS_ERR(bdev)) { 1435 root->fs_info->bdev_holder, 0,
1377 ret = PTR_ERR(bdev); 1436 &bdev, &bh);
1437 if (ret)
1378 goto out; 1438 goto out;
1379 }
1380
1381 set_blocksize(bdev, 4096);
1382 invalidate_bdev(bdev);
1383 bh = btrfs_read_dev_super(bdev);
1384 if (!bh) {
1385 ret = -EINVAL;
1386 goto error_close;
1387 }
1388 disk_super = (struct btrfs_super_block *)bh->b_data; 1439 disk_super = (struct btrfs_super_block *)bh->b_data;
1389 devid = btrfs_stack_device_id(&disk_super->dev_item); 1440 devid = btrfs_stack_device_id(&disk_super->dev_item);
1390 dev_uuid = disk_super->dev_item.uuid; 1441 dev_uuid = disk_super->dev_item.uuid;
1391 device = btrfs_find_device(root, devid, dev_uuid, 1442 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1392 disk_super->fsid); 1443 disk_super->fsid);
1393 if (!device) { 1444 if (!device) {
1394 ret = -ENOENT; 1445 ret = -ENOENT;
@@ -1396,6 +1447,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1396 } 1447 }
1397 } 1448 }
1398 1449
1450 if (device->is_tgtdev_for_dev_replace) {
1451 pr_err("btrfs: unable to remove the dev_replace target dev\n");
1452 ret = -EINVAL;
1453 goto error_brelse;
1454 }
1455
1399 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1456 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1400 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1457 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1401 "device\n"); 1458 "device\n");
@@ -1415,6 +1472,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1415 if (ret) 1472 if (ret)
1416 goto error_undo; 1473 goto error_undo;
1417 1474
1475 /*
1476 * TODO: the superblock still includes this device in its num_devices
1477 * counter although write_all_supers() is not locked out. This
1478 * could give a filesystem state which requires a degraded mount.
1479 */
1418 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1480 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1419 if (ret) 1481 if (ret)
1420 goto error_undo; 1482 goto error_undo;
@@ -1425,7 +1487,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1425 spin_unlock(&root->fs_info->free_chunk_lock); 1487 spin_unlock(&root->fs_info->free_chunk_lock);
1426 1488
1427 device->in_fs_metadata = 0; 1489 device->in_fs_metadata = 0;
1428 btrfs_scrub_cancel_dev(root, device); 1490 btrfs_scrub_cancel_dev(root->fs_info, device);
1429 1491
1430 /* 1492 /*
1431 * the device list mutex makes sure that we don't change 1493 * the device list mutex makes sure that we don't change
@@ -1482,7 +1544,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1482 * at this point, the device is zero sized. We want to 1544 * at this point, the device is zero sized. We want to
1483 * remove it from the devices list and zero out the old super 1545 * remove it from the devices list and zero out the old super
1484 */ 1546 */
1485 if (clear_super) { 1547 if (clear_super && disk_super) {
1486 /* make sure this device isn't detected as part of 1548 /* make sure this device isn't detected as part of
1487 * the FS anymore 1549 * the FS anymore
1488 */ 1550 */
@@ -1493,9 +1555,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1493 1555
1494 ret = 0; 1556 ret = 0;
1495 1557
1558 /* Notify udev that device has changed */
1559 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1560
1496error_brelse: 1561error_brelse:
1497 brelse(bh); 1562 brelse(bh);
1498error_close:
1499 if (bdev) 1563 if (bdev)
1500 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1564 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1501out: 1565out:
@@ -1512,6 +1576,112 @@ error_undo:
1512 goto error_brelse; 1576 goto error_brelse;
1513} 1577}
1514 1578
1579void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1580 struct btrfs_device *srcdev)
1581{
1582 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1583 list_del_rcu(&srcdev->dev_list);
1584 list_del_rcu(&srcdev->dev_alloc_list);
1585 fs_info->fs_devices->num_devices--;
1586 if (srcdev->missing) {
1587 fs_info->fs_devices->missing_devices--;
1588 fs_info->fs_devices->rw_devices++;
1589 }
1590 if (srcdev->can_discard)
1591 fs_info->fs_devices->num_can_discard--;
1592 if (srcdev->bdev)
1593 fs_info->fs_devices->open_devices--;
1594
1595 call_rcu(&srcdev->rcu, free_device);
1596}
1597
1598void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1599 struct btrfs_device *tgtdev)
1600{
1601 struct btrfs_device *next_device;
1602
1603 WARN_ON(!tgtdev);
1604 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1605 if (tgtdev->bdev) {
1606 btrfs_scratch_superblock(tgtdev);
1607 fs_info->fs_devices->open_devices--;
1608 }
1609 fs_info->fs_devices->num_devices--;
1610 if (tgtdev->can_discard)
1611 fs_info->fs_devices->num_can_discard++;
1612
1613 next_device = list_entry(fs_info->fs_devices->devices.next,
1614 struct btrfs_device, dev_list);
1615 if (tgtdev->bdev == fs_info->sb->s_bdev)
1616 fs_info->sb->s_bdev = next_device->bdev;
1617 if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
1618 fs_info->fs_devices->latest_bdev = next_device->bdev;
1619 list_del_rcu(&tgtdev->dev_list);
1620
1621 call_rcu(&tgtdev->rcu, free_device);
1622
1623 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1624}
1625
1626int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
1627 struct btrfs_device **device)
1628{
1629 int ret = 0;
1630 struct btrfs_super_block *disk_super;
1631 u64 devid;
1632 u8 *dev_uuid;
1633 struct block_device *bdev;
1634 struct buffer_head *bh;
1635
1636 *device = NULL;
1637 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
1638 root->fs_info->bdev_holder, 0, &bdev, &bh);
1639 if (ret)
1640 return ret;
1641 disk_super = (struct btrfs_super_block *)bh->b_data;
1642 devid = btrfs_stack_device_id(&disk_super->dev_item);
1643 dev_uuid = disk_super->dev_item.uuid;
1644 *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1645 disk_super->fsid);
1646 brelse(bh);
1647 if (!*device)
1648 ret = -ENOENT;
1649 blkdev_put(bdev, FMODE_READ);
1650 return ret;
1651}
1652
1653int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1654 char *device_path,
1655 struct btrfs_device **device)
1656{
1657 *device = NULL;
1658 if (strcmp(device_path, "missing") == 0) {
1659 struct list_head *devices;
1660 struct btrfs_device *tmp;
1661
1662 devices = &root->fs_info->fs_devices->devices;
1663 /*
1664 * It is safe to read the devices since the volume_mutex
1665 * is held by the caller.
1666 */
1667 list_for_each_entry(tmp, devices, dev_list) {
1668 if (tmp->in_fs_metadata && !tmp->bdev) {
1669 *device = tmp;
1670 break;
1671 }
1672 }
1673
1674 if (!*device) {
1675 pr_err("btrfs: no missing device found\n");
1676 return -ENOENT;
1677 }
1678
1679 return 0;
1680 } else {
1681 return btrfs_find_device_by_path(root, device_path, device);
1682 }
1683}
1684
1515/* 1685/*
1516 * does all the dirty work required for changing file system's UUID. 1686 * does all the dirty work required for changing file system's UUID.
1517 */ 1687 */
@@ -1630,7 +1800,8 @@ next_slot:
1630 read_extent_buffer(leaf, fs_uuid, 1800 read_extent_buffer(leaf, fs_uuid,
1631 (unsigned long)btrfs_device_fsid(dev_item), 1801 (unsigned long)btrfs_device_fsid(dev_item),
1632 BTRFS_UUID_SIZE); 1802 BTRFS_UUID_SIZE);
1633 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1803 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1804 fs_uuid);
1634 BUG_ON(!device); /* Logic error */ 1805 BUG_ON(!device); /* Logic error */
1635 1806
1636 if (device->fs_devices->seeding) { 1807 if (device->fs_devices->seeding) {
@@ -1678,16 +1849,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1678 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1849 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1679 1850
1680 devices = &root->fs_info->fs_devices->devices; 1851 devices = &root->fs_info->fs_devices->devices;
1681 /* 1852
1682 * we have the volume lock, so we don't need the extra 1853 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1683 * device list mutex while reading the list here.
1684 */
1685 list_for_each_entry(device, devices, dev_list) { 1854 list_for_each_entry(device, devices, dev_list) {
1686 if (device->bdev == bdev) { 1855 if (device->bdev == bdev) {
1687 ret = -EEXIST; 1856 ret = -EEXIST;
1857 mutex_unlock(
1858 &root->fs_info->fs_devices->device_list_mutex);
1688 goto error; 1859 goto error;
1689 } 1860 }
1690 } 1861 }
1862 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1691 1863
1692 device = kzalloc(sizeof(*device), GFP_NOFS); 1864 device = kzalloc(sizeof(*device), GFP_NOFS);
1693 if (!device) { 1865 if (!device) {
@@ -1737,6 +1909,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1737 device->dev_root = root->fs_info->dev_root; 1909 device->dev_root = root->fs_info->dev_root;
1738 device->bdev = bdev; 1910 device->bdev = bdev;
1739 device->in_fs_metadata = 1; 1911 device->in_fs_metadata = 1;
1912 device->is_tgtdev_for_dev_replace = 0;
1740 device->mode = FMODE_EXCL; 1913 device->mode = FMODE_EXCL;
1741 set_blocksize(device->bdev, 4096); 1914 set_blocksize(device->bdev, 4096);
1742 1915
@@ -1844,6 +2017,98 @@ error:
1844 return ret; 2017 return ret;
1845} 2018}
1846 2019
2020int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2021 struct btrfs_device **device_out)
2022{
2023 struct request_queue *q;
2024 struct btrfs_device *device;
2025 struct block_device *bdev;
2026 struct btrfs_fs_info *fs_info = root->fs_info;
2027 struct list_head *devices;
2028 struct rcu_string *name;
2029 int ret = 0;
2030
2031 *device_out = NULL;
2032 if (fs_info->fs_devices->seeding)
2033 return -EINVAL;
2034
2035 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2036 fs_info->bdev_holder);
2037 if (IS_ERR(bdev))
2038 return PTR_ERR(bdev);
2039
2040 filemap_write_and_wait(bdev->bd_inode->i_mapping);
2041
2042 devices = &fs_info->fs_devices->devices;
2043 list_for_each_entry(device, devices, dev_list) {
2044 if (device->bdev == bdev) {
2045 ret = -EEXIST;
2046 goto error;
2047 }
2048 }
2049
2050 device = kzalloc(sizeof(*device), GFP_NOFS);
2051 if (!device) {
2052 ret = -ENOMEM;
2053 goto error;
2054 }
2055
2056 name = rcu_string_strdup(device_path, GFP_NOFS);
2057 if (!name) {
2058 kfree(device);
2059 ret = -ENOMEM;
2060 goto error;
2061 }
2062 rcu_assign_pointer(device->name, name);
2063
2064 q = bdev_get_queue(bdev);
2065 if (blk_queue_discard(q))
2066 device->can_discard = 1;
2067 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2068 device->writeable = 1;
2069 device->work.func = pending_bios_fn;
2070 generate_random_uuid(device->uuid);
2071 device->devid = BTRFS_DEV_REPLACE_DEVID;
2072 spin_lock_init(&device->io_lock);
2073 device->generation = 0;
2074 device->io_width = root->sectorsize;
2075 device->io_align = root->sectorsize;
2076 device->sector_size = root->sectorsize;
2077 device->total_bytes = i_size_read(bdev->bd_inode);
2078 device->disk_total_bytes = device->total_bytes;
2079 device->dev_root = fs_info->dev_root;
2080 device->bdev = bdev;
2081 device->in_fs_metadata = 1;
2082 device->is_tgtdev_for_dev_replace = 1;
2083 device->mode = FMODE_EXCL;
2084 set_blocksize(device->bdev, 4096);
2085 device->fs_devices = fs_info->fs_devices;
2086 list_add(&device->dev_list, &fs_info->fs_devices->devices);
2087 fs_info->fs_devices->num_devices++;
2088 fs_info->fs_devices->open_devices++;
2089 if (device->can_discard)
2090 fs_info->fs_devices->num_can_discard++;
2091 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2092
2093 *device_out = device;
2094 return ret;
2095
2096error:
2097 blkdev_put(bdev, FMODE_EXCL);
2098 return ret;
2099}
2100
2101void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2102 struct btrfs_device *tgtdev)
2103{
2104 WARN_ON(fs_info->fs_devices->rw_devices == 0);
2105 tgtdev->io_width = fs_info->dev_root->sectorsize;
2106 tgtdev->io_align = fs_info->dev_root->sectorsize;
2107 tgtdev->sector_size = fs_info->dev_root->sectorsize;
2108 tgtdev->dev_root = fs_info->dev_root;
2109 tgtdev->in_fs_metadata = 1;
2110}
2111
1847static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2112static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1848 struct btrfs_device *device) 2113 struct btrfs_device *device)
1849{ 2114{
@@ -1900,7 +2165,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1900 2165
1901 if (!device->writeable) 2166 if (!device->writeable)
1902 return -EACCES; 2167 return -EACCES;
1903 if (new_size <= device->total_bytes) 2168 if (new_size <= device->total_bytes ||
2169 device->is_tgtdev_for_dev_replace)
1904 return -EINVAL; 2170 return -EINVAL;
1905 2171
1906 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2172 btrfs_set_super_total_bytes(super_copy, old_total + diff);
@@ -2338,18 +2604,6 @@ static int chunk_profiles_filter(u64 chunk_type,
2338 return 1; 2604 return 1;
2339} 2605}
2340 2606
2341static u64 div_factor_fine(u64 num, int factor)
2342{
2343 if (factor <= 0)
2344 return 0;
2345 if (factor >= 100)
2346 return num;
2347
2348 num *= factor;
2349 do_div(num, 100);
2350 return num;
2351}
2352
2353static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2607static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2354 struct btrfs_balance_args *bargs) 2608 struct btrfs_balance_args *bargs)
2355{ 2609{
@@ -2514,15 +2768,6 @@ static int should_balance_chunk(struct btrfs_root *root,
2514 return 1; 2768 return 1;
2515} 2769}
2516 2770
2517static u64 div_factor(u64 num, int factor)
2518{
2519 if (factor == 10)
2520 return num;
2521 num *= factor;
2522 do_div(num, 10);
2523 return num;
2524}
2525
2526static int __btrfs_balance(struct btrfs_fs_info *fs_info) 2771static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2527{ 2772{
2528 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2773 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -2550,7 +2795,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2550 size_to_free = div_factor(old_size, 1); 2795 size_to_free = div_factor(old_size, 1);
2551 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2796 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2552 if (!device->writeable || 2797 if (!device->writeable ||
2553 device->total_bytes - device->bytes_used > size_to_free) 2798 device->total_bytes - device->bytes_used > size_to_free ||
2799 device->is_tgtdev_for_dev_replace)
2554 continue; 2800 continue;
2555 2801
2556 ret = btrfs_shrink_device(device, old_size - size_to_free); 2802 ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -2728,6 +2974,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2728 u64 allowed; 2974 u64 allowed;
2729 int mixed = 0; 2975 int mixed = 0;
2730 int ret; 2976 int ret;
2977 u64 num_devices;
2731 2978
2732 if (btrfs_fs_closing(fs_info) || 2979 if (btrfs_fs_closing(fs_info) ||
2733 atomic_read(&fs_info->balance_pause_req) || 2980 atomic_read(&fs_info->balance_pause_req) ||
@@ -2756,10 +3003,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2756 } 3003 }
2757 } 3004 }
2758 3005
3006 num_devices = fs_info->fs_devices->num_devices;
3007 btrfs_dev_replace_lock(&fs_info->dev_replace);
3008 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3009 BUG_ON(num_devices < 1);
3010 num_devices--;
3011 }
3012 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2759 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3013 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2760 if (fs_info->fs_devices->num_devices == 1) 3014 if (num_devices == 1)
2761 allowed |= BTRFS_BLOCK_GROUP_DUP; 3015 allowed |= BTRFS_BLOCK_GROUP_DUP;
2762 else if (fs_info->fs_devices->num_devices < 4) 3016 else if (num_devices < 4)
2763 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3017 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2764 else 3018 else
2765 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3019 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
@@ -2902,6 +3156,7 @@ static int balance_kthread(void *data)
2902 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3156 ret = btrfs_balance(fs_info->balance_ctl, NULL);
2903 } 3157 }
2904 3158
3159 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
2905 mutex_unlock(&fs_info->balance_mutex); 3160 mutex_unlock(&fs_info->balance_mutex);
2906 mutex_unlock(&fs_info->volume_mutex); 3161 mutex_unlock(&fs_info->volume_mutex);
2907 3162
@@ -2924,6 +3179,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
2924 return 0; 3179 return 0;
2925 } 3180 }
2926 3181
3182 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
2927 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3183 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
2928 if (IS_ERR(tsk)) 3184 if (IS_ERR(tsk))
2929 return PTR_ERR(tsk); 3185 return PTR_ERR(tsk);
@@ -3080,7 +3336,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3080 u64 old_size = device->total_bytes; 3336 u64 old_size = device->total_bytes;
3081 u64 diff = device->total_bytes - new_size; 3337 u64 diff = device->total_bytes - new_size;
3082 3338
3083 if (new_size >= device->total_bytes) 3339 if (device->is_tgtdev_for_dev_replace)
3084 return -EINVAL; 3340 return -EINVAL;
3085 3341
3086 path = btrfs_alloc_path(); 3342 path = btrfs_alloc_path();
@@ -3235,6 +3491,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
3235 return 0; 3491 return 0;
3236} 3492}
3237 3493
3494struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3495 { 2, 1, 0, 4, 2, 2 /* raid10 */ },
3496 { 1, 1, 2, 2, 2, 2 /* raid1 */ },
3497 { 1, 2, 1, 1, 1, 2 /* dup */ },
3498 { 1, 1, 0, 2, 1, 1 /* raid0 */ },
3499 { 1, 1, 0, 1, 1, 1 /* single */ },
3500};
3501
3238static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3502static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3239 struct btrfs_root *extent_root, 3503 struct btrfs_root *extent_root,
3240 struct map_lookup **map_ret, 3504 struct map_lookup **map_ret,
@@ -3264,43 +3528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3264 int ndevs; 3528 int ndevs;
3265 int i; 3529 int i;
3266 int j; 3530 int j;
3531 int index;
3267 3532
3268 BUG_ON(!alloc_profile_is_valid(type, 0)); 3533 BUG_ON(!alloc_profile_is_valid(type, 0));
3269 3534
3270 if (list_empty(&fs_devices->alloc_list)) 3535 if (list_empty(&fs_devices->alloc_list))
3271 return -ENOSPC; 3536 return -ENOSPC;
3272 3537
3273 sub_stripes = 1; 3538 index = __get_raid_index(type);
3274 dev_stripes = 1;
3275 devs_increment = 1;
3276 ncopies = 1;
3277 devs_max = 0; /* 0 == as many as possible */
3278 devs_min = 1;
3279 3539
3280 /* 3540 sub_stripes = btrfs_raid_array[index].sub_stripes;
3281 * define the properties of each RAID type. 3541 dev_stripes = btrfs_raid_array[index].dev_stripes;
3282 * FIXME: move this to a global table and use it in all RAID 3542 devs_max = btrfs_raid_array[index].devs_max;
3283 * calculation code 3543 devs_min = btrfs_raid_array[index].devs_min;
3284 */ 3544 devs_increment = btrfs_raid_array[index].devs_increment;
3285 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 3545 ncopies = btrfs_raid_array[index].ncopies;
3286 dev_stripes = 2;
3287 ncopies = 2;
3288 devs_max = 1;
3289 } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3290 devs_min = 2;
3291 } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3292 devs_increment = 2;
3293 ncopies = 2;
3294 devs_max = 2;
3295 devs_min = 2;
3296 } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3297 sub_stripes = 2;
3298 devs_increment = 2;
3299 ncopies = 2;
3300 devs_min = 4;
3301 } else {
3302 devs_max = 1;
3303 }
3304 3546
3305 if (type & BTRFS_BLOCK_GROUP_DATA) { 3547 if (type & BTRFS_BLOCK_GROUP_DATA) {
3306 max_stripe_size = 1024 * 1024 * 1024; 3548 max_stripe_size = 1024 * 1024 * 1024;
@@ -3347,13 +3589,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3347 cur = cur->next; 3589 cur = cur->next;
3348 3590
3349 if (!device->writeable) { 3591 if (!device->writeable) {
3350 printk(KERN_ERR 3592 WARN(1, KERN_ERR
3351 "btrfs: read-only device in alloc_list\n"); 3593 "btrfs: read-only device in alloc_list\n");
3352 WARN_ON(1);
3353 continue; 3594 continue;
3354 } 3595 }
3355 3596
3356 if (!device->in_fs_metadata) 3597 if (!device->in_fs_metadata ||
3598 device->is_tgtdev_for_dev_replace)
3357 continue; 3599 continue;
3358 3600
3359 if (device->total_bytes > device->bytes_used) 3601 if (device->total_bytes > device->bytes_used)
@@ -3382,6 +3624,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3382 devices_info[ndevs].total_avail = total_avail; 3624 devices_info[ndevs].total_avail = total_avail;
3383 devices_info[ndevs].dev = device; 3625 devices_info[ndevs].dev = device;
3384 ++ndevs; 3626 ++ndevs;
3627 WARN_ON(ndevs > fs_devices->rw_devices);
3385 } 3628 }
3386 3629
3387 /* 3630 /*
@@ -3740,8 +3983,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
3740 } 3983 }
3741} 3984}
3742 3985
3743int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 3986int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
3744{ 3987{
3988 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3745 struct extent_map *em; 3989 struct extent_map *em;
3746 struct map_lookup *map; 3990 struct map_lookup *map;
3747 struct extent_map_tree *em_tree = &map_tree->map_tree; 3991 struct extent_map_tree *em_tree = &map_tree->map_tree;
@@ -3761,32 +4005,60 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
3761 else 4005 else
3762 ret = 1; 4006 ret = 1;
3763 free_extent_map(em); 4007 free_extent_map(em);
4008
4009 btrfs_dev_replace_lock(&fs_info->dev_replace);
4010 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
4011 ret++;
4012 btrfs_dev_replace_unlock(&fs_info->dev_replace);
4013
3764 return ret; 4014 return ret;
3765} 4015}
3766 4016
3767static int find_live_mirror(struct map_lookup *map, int first, int num, 4017static int find_live_mirror(struct btrfs_fs_info *fs_info,
3768 int optimal) 4018 struct map_lookup *map, int first, int num,
4019 int optimal, int dev_replace_is_ongoing)
3769{ 4020{
3770 int i; 4021 int i;
3771 if (map->stripes[optimal].dev->bdev) 4022 int tolerance;
3772 return optimal; 4023 struct btrfs_device *srcdev;
3773 for (i = first; i < first + num; i++) { 4024
3774 if (map->stripes[i].dev->bdev) 4025 if (dev_replace_is_ongoing &&
3775 return i; 4026 fs_info->dev_replace.cont_reading_from_srcdev_mode ==
4027 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
4028 srcdev = fs_info->dev_replace.srcdev;
4029 else
4030 srcdev = NULL;
4031
4032 /*
4033 * try to avoid the drive that is the source drive for a
4034 * dev-replace procedure, only choose it if no other non-missing
4035 * mirror is available
4036 */
4037 for (tolerance = 0; tolerance < 2; tolerance++) {
4038 if (map->stripes[optimal].dev->bdev &&
4039 (tolerance || map->stripes[optimal].dev != srcdev))
4040 return optimal;
4041 for (i = first; i < first + num; i++) {
4042 if (map->stripes[i].dev->bdev &&
4043 (tolerance || map->stripes[i].dev != srcdev))
4044 return i;
4045 }
3776 } 4046 }
4047
3777 /* we couldn't find one that doesn't fail. Just return something 4048 /* we couldn't find one that doesn't fail. Just return something
3778 * and the io error handling code will clean up eventually 4049 * and the io error handling code will clean up eventually
3779 */ 4050 */
3780 return optimal; 4051 return optimal;
3781} 4052}
3782 4053
3783static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4054static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
3784 u64 logical, u64 *length, 4055 u64 logical, u64 *length,
3785 struct btrfs_bio **bbio_ret, 4056 struct btrfs_bio **bbio_ret,
3786 int mirror_num) 4057 int mirror_num)
3787{ 4058{
3788 struct extent_map *em; 4059 struct extent_map *em;
3789 struct map_lookup *map; 4060 struct map_lookup *map;
4061 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3790 struct extent_map_tree *em_tree = &map_tree->map_tree; 4062 struct extent_map_tree *em_tree = &map_tree->map_tree;
3791 u64 offset; 4063 u64 offset;
3792 u64 stripe_offset; 4064 u64 stripe_offset;
@@ -3800,6 +4072,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3800 int num_stripes; 4072 int num_stripes;
3801 int max_errors = 0; 4073 int max_errors = 0;
3802 struct btrfs_bio *bbio = NULL; 4074 struct btrfs_bio *bbio = NULL;
4075 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4076 int dev_replace_is_ongoing = 0;
4077 int num_alloc_stripes;
4078 int patch_the_first_stripe_for_dev_replace = 0;
4079 u64 physical_to_patch_in_first_stripe = 0;
3803 4080
3804 read_lock(&em_tree->lock); 4081 read_lock(&em_tree->lock);
3805 em = lookup_extent_mapping(em_tree, logical, *length); 4082 em = lookup_extent_mapping(em_tree, logical, *length);
@@ -3816,9 +4093,6 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3816 map = (struct map_lookup *)em->bdev; 4093 map = (struct map_lookup *)em->bdev;
3817 offset = logical - em->start; 4094 offset = logical - em->start;
3818 4095
3819 if (mirror_num > map->num_stripes)
3820 mirror_num = 0;
3821
3822 stripe_nr = offset; 4096 stripe_nr = offset;
3823 /* 4097 /*
3824 * stripe_nr counts the total number of stripes we have to stride 4098 * stripe_nr counts the total number of stripes we have to stride
@@ -3845,6 +4119,93 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3845 if (!bbio_ret) 4119 if (!bbio_ret)
3846 goto out; 4120 goto out;
3847 4121
4122 btrfs_dev_replace_lock(dev_replace);
4123 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
4124 if (!dev_replace_is_ongoing)
4125 btrfs_dev_replace_unlock(dev_replace);
4126
4127 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
4128 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
4129 dev_replace->tgtdev != NULL) {
4130 /*
4131 * in dev-replace case, for repair case (that's the only
4132 * case where the mirror is selected explicitly when
4133 * calling btrfs_map_block), blocks left of the left cursor
4134 * can also be read from the target drive.
4135 * For REQ_GET_READ_MIRRORS, the target drive is added as
4136 * the last one to the array of stripes. For READ, it also
4137 * needs to be supported using the same mirror number.
4138 * If the requested block is not left of the left cursor,
4139 * EIO is returned. This can happen because btrfs_num_copies()
4140 * returns one more in the dev-replace case.
4141 */
4142 u64 tmp_length = *length;
4143 struct btrfs_bio *tmp_bbio = NULL;
4144 int tmp_num_stripes;
4145 u64 srcdev_devid = dev_replace->srcdev->devid;
4146 int index_srcdev = 0;
4147 int found = 0;
4148 u64 physical_of_found = 0;
4149
4150 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4151 logical, &tmp_length, &tmp_bbio, 0);
4152 if (ret) {
4153 WARN_ON(tmp_bbio != NULL);
4154 goto out;
4155 }
4156
4157 tmp_num_stripes = tmp_bbio->num_stripes;
4158 if (mirror_num > tmp_num_stripes) {
4159 /*
4160 * REQ_GET_READ_MIRRORS does not contain this
4161 * mirror, that means that the requested area
4162 * is not left of the left cursor
4163 */
4164 ret = -EIO;
4165 kfree(tmp_bbio);
4166 goto out;
4167 }
4168
4169 /*
4170 * process the rest of the function using the mirror_num
4171 * of the source drive. Therefore look it up first.
4172 * At the end, patch the device pointer to the one of the
4173 * target drive.
4174 */
4175 for (i = 0; i < tmp_num_stripes; i++) {
4176 if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
4177 /*
4178 * In case of DUP, in order to keep it
4179 * simple, only add the mirror with the
4180 * lowest physical address
4181 */
4182 if (found &&
4183 physical_of_found <=
4184 tmp_bbio->stripes[i].physical)
4185 continue;
4186 index_srcdev = i;
4187 found = 1;
4188 physical_of_found =
4189 tmp_bbio->stripes[i].physical;
4190 }
4191 }
4192
4193 if (found) {
4194 mirror_num = index_srcdev + 1;
4195 patch_the_first_stripe_for_dev_replace = 1;
4196 physical_to_patch_in_first_stripe = physical_of_found;
4197 } else {
4198 WARN_ON(1);
4199 ret = -EIO;
4200 kfree(tmp_bbio);
4201 goto out;
4202 }
4203
4204 kfree(tmp_bbio);
4205 } else if (mirror_num > map->num_stripes) {
4206 mirror_num = 0;
4207 }
4208
3848 num_stripes = 1; 4209 num_stripes = 1;
3849 stripe_index = 0; 4210 stripe_index = 0;
3850 stripe_nr_orig = stripe_nr; 4211 stripe_nr_orig = stripe_nr;
@@ -3859,19 +4220,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3859 stripe_nr_end - stripe_nr_orig); 4220 stripe_nr_end - stripe_nr_orig);
3860 stripe_index = do_div(stripe_nr, map->num_stripes); 4221 stripe_index = do_div(stripe_nr, map->num_stripes);
3861 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 4222 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3862 if (rw & (REQ_WRITE | REQ_DISCARD)) 4223 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
3863 num_stripes = map->num_stripes; 4224 num_stripes = map->num_stripes;
3864 else if (mirror_num) 4225 else if (mirror_num)
3865 stripe_index = mirror_num - 1; 4226 stripe_index = mirror_num - 1;
3866 else { 4227 else {
3867 stripe_index = find_live_mirror(map, 0, 4228 stripe_index = find_live_mirror(fs_info, map, 0,
3868 map->num_stripes, 4229 map->num_stripes,
3869 current->pid % map->num_stripes); 4230 current->pid % map->num_stripes,
4231 dev_replace_is_ongoing);
3870 mirror_num = stripe_index + 1; 4232 mirror_num = stripe_index + 1;
3871 } 4233 }
3872 4234
3873 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 4235 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3874 if (rw & (REQ_WRITE | REQ_DISCARD)) { 4236 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
3875 num_stripes = map->num_stripes; 4237 num_stripes = map->num_stripes;
3876 } else if (mirror_num) { 4238 } else if (mirror_num) {
3877 stripe_index = mirror_num - 1; 4239 stripe_index = mirror_num - 1;
@@ -3885,7 +4247,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3885 stripe_index = do_div(stripe_nr, factor); 4247 stripe_index = do_div(stripe_nr, factor);
3886 stripe_index *= map->sub_stripes; 4248 stripe_index *= map->sub_stripes;
3887 4249
3888 if (rw & REQ_WRITE) 4250 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
3889 num_stripes = map->sub_stripes; 4251 num_stripes = map->sub_stripes;
3890 else if (rw & REQ_DISCARD) 4252 else if (rw & REQ_DISCARD)
3891 num_stripes = min_t(u64, map->sub_stripes * 4253 num_stripes = min_t(u64, map->sub_stripes *
@@ -3895,9 +4257,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3895 stripe_index += mirror_num - 1; 4257 stripe_index += mirror_num - 1;
3896 else { 4258 else {
3897 int old_stripe_index = stripe_index; 4259 int old_stripe_index = stripe_index;
3898 stripe_index = find_live_mirror(map, stripe_index, 4260 stripe_index = find_live_mirror(fs_info, map,
4261 stripe_index,
3899 map->sub_stripes, stripe_index + 4262 map->sub_stripes, stripe_index +
3900 current->pid % map->sub_stripes); 4263 current->pid % map->sub_stripes,
4264 dev_replace_is_ongoing);
3901 mirror_num = stripe_index - old_stripe_index + 1; 4265 mirror_num = stripe_index - old_stripe_index + 1;
3902 } 4266 }
3903 } else { 4267 } else {
@@ -3911,7 +4275,14 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3911 } 4275 }
3912 BUG_ON(stripe_index >= map->num_stripes); 4276 BUG_ON(stripe_index >= map->num_stripes);
3913 4277
3914 bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); 4278 num_alloc_stripes = num_stripes;
4279 if (dev_replace_is_ongoing) {
4280 if (rw & (REQ_WRITE | REQ_DISCARD))
4281 num_alloc_stripes <<= 1;
4282 if (rw & REQ_GET_READ_MIRRORS)
4283 num_alloc_stripes++;
4284 }
4285 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
3915 if (!bbio) { 4286 if (!bbio) {
3916 ret = -ENOMEM; 4287 ret = -ENOMEM;
3917 goto out; 4288 goto out;
@@ -3998,7 +4369,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3998 } 4369 }
3999 } 4370 }
4000 4371
4001 if (rw & REQ_WRITE) { 4372 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
4002 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4373 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4003 BTRFS_BLOCK_GROUP_RAID10 | 4374 BTRFS_BLOCK_GROUP_RAID10 |
4004 BTRFS_BLOCK_GROUP_DUP)) { 4375 BTRFS_BLOCK_GROUP_DUP)) {
@@ -4006,20 +4377,115 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
4006 } 4377 }
4007 } 4378 }
4008 4379
4380 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
4381 dev_replace->tgtdev != NULL) {
4382 int index_where_to_add;
4383 u64 srcdev_devid = dev_replace->srcdev->devid;
4384
4385 /*
4386 * duplicate the write operations while the dev replace
4387 * procedure is running. Since the copying of the old disk
4388 * to the new disk takes place at run time while the
4389 * filesystem is mounted writable, the regular write
4390 * operations to the old disk have to be duplicated to go
4391 * to the new disk as well.
4392 * Note that device->missing is handled by the caller, and
4393 * that the write to the old disk is already set up in the
4394 * stripes array.
4395 */
4396 index_where_to_add = num_stripes;
4397 for (i = 0; i < num_stripes; i++) {
4398 if (bbio->stripes[i].dev->devid == srcdev_devid) {
4399 /* write to new disk, too */
4400 struct btrfs_bio_stripe *new =
4401 bbio->stripes + index_where_to_add;
4402 struct btrfs_bio_stripe *old =
4403 bbio->stripes + i;
4404
4405 new->physical = old->physical;
4406 new->length = old->length;
4407 new->dev = dev_replace->tgtdev;
4408 index_where_to_add++;
4409 max_errors++;
4410 }
4411 }
4412 num_stripes = index_where_to_add;
4413 } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
4414 dev_replace->tgtdev != NULL) {
4415 u64 srcdev_devid = dev_replace->srcdev->devid;
4416 int index_srcdev = 0;
4417 int found = 0;
4418 u64 physical_of_found = 0;
4419
4420 /*
4421 * During the dev-replace procedure, the target drive can
4422 * also be used to read data in case it is needed to repair
4423 * a corrupt block elsewhere. This is possible if the
4424 * requested area is left of the left cursor. In this area,
4425 * the target drive is a full copy of the source drive.
4426 */
4427 for (i = 0; i < num_stripes; i++) {
4428 if (bbio->stripes[i].dev->devid == srcdev_devid) {
4429 /*
4430 * In case of DUP, in order to keep it
4431 * simple, only add the mirror with the
4432 * lowest physical address
4433 */
4434 if (found &&
4435 physical_of_found <=
4436 bbio->stripes[i].physical)
4437 continue;
4438 index_srcdev = i;
4439 found = 1;
4440 physical_of_found = bbio->stripes[i].physical;
4441 }
4442 }
4443 if (found) {
4444 u64 length = map->stripe_len;
4445
4446 if (physical_of_found + length <=
4447 dev_replace->cursor_left) {
4448 struct btrfs_bio_stripe *tgtdev_stripe =
4449 bbio->stripes + num_stripes;
4450
4451 tgtdev_stripe->physical = physical_of_found;
4452 tgtdev_stripe->length =
4453 bbio->stripes[index_srcdev].length;
4454 tgtdev_stripe->dev = dev_replace->tgtdev;
4455
4456 num_stripes++;
4457 }
4458 }
4459 }
4460
4009 *bbio_ret = bbio; 4461 *bbio_ret = bbio;
4010 bbio->num_stripes = num_stripes; 4462 bbio->num_stripes = num_stripes;
4011 bbio->max_errors = max_errors; 4463 bbio->max_errors = max_errors;
4012 bbio->mirror_num = mirror_num; 4464 bbio->mirror_num = mirror_num;
4465
4466 /*
4467 * this is the case that REQ_READ && dev_replace_is_ongoing &&
4468 * mirror_num == num_stripes + 1 && dev_replace target drive is
4469 * available as a mirror
4470 */
4471 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
4472 WARN_ON(num_stripes > 1);
4473 bbio->stripes[0].dev = dev_replace->tgtdev;
4474 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4475 bbio->mirror_num = map->num_stripes + 1;
4476 }
4013out: 4477out:
4478 if (dev_replace_is_ongoing)
4479 btrfs_dev_replace_unlock(dev_replace);
4014 free_extent_map(em); 4480 free_extent_map(em);
4015 return ret; 4481 return ret;
4016} 4482}
4017 4483
4018int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4484int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4019 u64 logical, u64 *length, 4485 u64 logical, u64 *length,
4020 struct btrfs_bio **bbio_ret, int mirror_num) 4486 struct btrfs_bio **bbio_ret, int mirror_num)
4021{ 4487{
4022 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, 4488 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
4023 mirror_num); 4489 mirror_num);
4024} 4490}
4025 4491
@@ -4238,10 +4704,116 @@ static noinline void schedule_bio(struct btrfs_root *root,
4238 &device->work); 4704 &device->work);
4239} 4705}
4240 4706
4707static int bio_size_ok(struct block_device *bdev, struct bio *bio,
4708 sector_t sector)
4709{
4710 struct bio_vec *prev;
4711 struct request_queue *q = bdev_get_queue(bdev);
4712 unsigned short max_sectors = queue_max_sectors(q);
4713 struct bvec_merge_data bvm = {
4714 .bi_bdev = bdev,
4715 .bi_sector = sector,
4716 .bi_rw = bio->bi_rw,
4717 };
4718
4719 if (bio->bi_vcnt == 0) {
4720 WARN_ON(1);
4721 return 1;
4722 }
4723
4724 prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
4725 if ((bio->bi_size >> 9) > max_sectors)
4726 return 0;
4727
4728 if (!q->merge_bvec_fn)
4729 return 1;
4730
4731 bvm.bi_size = bio->bi_size - prev->bv_len;
4732 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
4733 return 0;
4734 return 1;
4735}
4736
4737static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4738 struct bio *bio, u64 physical, int dev_nr,
4739 int rw, int async)
4740{
4741 struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
4742
4743 bio->bi_private = bbio;
4744 bio->bi_private = merge_stripe_index_into_bio_private(
4745 bio->bi_private, (unsigned int)dev_nr);
4746 bio->bi_end_io = btrfs_end_bio;
4747 bio->bi_sector = physical >> 9;
4748#ifdef DEBUG
4749 {
4750 struct rcu_string *name;
4751
4752 rcu_read_lock();
4753 name = rcu_dereference(dev->name);
4754 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4755 "(%s id %llu), size=%u\n", rw,
4756 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4757 name->str, dev->devid, bio->bi_size);
4758 rcu_read_unlock();
4759 }
4760#endif
4761 bio->bi_bdev = dev->bdev;
4762 if (async)
4763 schedule_bio(root, dev, rw, bio);
4764 else
4765 btrfsic_submit_bio(rw, bio);
4766}
4767
4768static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4769 struct bio *first_bio, struct btrfs_device *dev,
4770 int dev_nr, int rw, int async)
4771{
4772 struct bio_vec *bvec = first_bio->bi_io_vec;
4773 struct bio *bio;
4774 int nr_vecs = bio_get_nr_vecs(dev->bdev);
4775 u64 physical = bbio->stripes[dev_nr].physical;
4776
4777again:
4778 bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
4779 if (!bio)
4780 return -ENOMEM;
4781
4782 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
4783 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
4784 bvec->bv_offset) < bvec->bv_len) {
4785 u64 len = bio->bi_size;
4786
4787 atomic_inc(&bbio->stripes_pending);
4788 submit_stripe_bio(root, bbio, bio, physical, dev_nr,
4789 rw, async);
4790 physical += len;
4791 goto again;
4792 }
4793 bvec++;
4794 }
4795
4796 submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
4797 return 0;
4798}
4799
4800static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
4801{
4802 atomic_inc(&bbio->error);
4803 if (atomic_dec_and_test(&bbio->stripes_pending)) {
4804 bio->bi_private = bbio->private;
4805 bio->bi_end_io = bbio->end_io;
4806 bio->bi_bdev = (struct block_device *)
4807 (unsigned long)bbio->mirror_num;
4808 bio->bi_sector = logical >> 9;
4809 kfree(bbio);
4810 bio_endio(bio, -EIO);
4811 }
4812}
4813
4241int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 4814int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4242 int mirror_num, int async_submit) 4815 int mirror_num, int async_submit)
4243{ 4816{
4244 struct btrfs_mapping_tree *map_tree;
4245 struct btrfs_device *dev; 4817 struct btrfs_device *dev;
4246 struct bio *first_bio = bio; 4818 struct bio *first_bio = bio;
4247 u64 logical = (u64)bio->bi_sector << 9; 4819 u64 logical = (u64)bio->bi_sector << 9;
@@ -4253,12 +4825,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4253 struct btrfs_bio *bbio = NULL; 4825 struct btrfs_bio *bbio = NULL;
4254 4826
4255 length = bio->bi_size; 4827 length = bio->bi_size;
4256 map_tree = &root->fs_info->mapping_tree;
4257 map_length = length; 4828 map_length = length;
4258 4829
4259 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, 4830 ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
4260 mirror_num); 4831 mirror_num);
4261 if (ret) /* -ENOMEM */ 4832 if (ret)
4262 return ret; 4833 return ret;
4263 4834
4264 total_devs = bbio->num_stripes; 4835 total_devs = bbio->num_stripes;
@@ -4276,52 +4847,48 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4276 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 4847 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4277 4848
4278 while (dev_nr < total_devs) { 4849 while (dev_nr < total_devs) {
4850 dev = bbio->stripes[dev_nr].dev;
4851 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
4852 bbio_error(bbio, first_bio, logical);
4853 dev_nr++;
4854 continue;
4855 }
4856
4857 /*
4858 * Check and see if we're ok with this bio based on its size
4859 * and offset with the given device.
4860 */
4861 if (!bio_size_ok(dev->bdev, first_bio,
4862 bbio->stripes[dev_nr].physical >> 9)) {
4863 ret = breakup_stripe_bio(root, bbio, first_bio, dev,
4864 dev_nr, rw, async_submit);
4865 BUG_ON(ret);
4866 dev_nr++;
4867 continue;
4868 }
4869
4279 if (dev_nr < total_devs - 1) { 4870 if (dev_nr < total_devs - 1) {
4280 bio = bio_clone(first_bio, GFP_NOFS); 4871 bio = bio_clone(first_bio, GFP_NOFS);
4281 BUG_ON(!bio); /* -ENOMEM */ 4872 BUG_ON(!bio); /* -ENOMEM */
4282 } else { 4873 } else {
4283 bio = first_bio; 4874 bio = first_bio;
4284 } 4875 }
4285 bio->bi_private = bbio; 4876
4286 bio->bi_private = merge_stripe_index_into_bio_private( 4877 submit_stripe_bio(root, bbio, bio,
4287 bio->bi_private, (unsigned int)dev_nr); 4878 bbio->stripes[dev_nr].physical, dev_nr, rw,
4288 bio->bi_end_io = btrfs_end_bio; 4879 async_submit);
4289 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4290 dev = bbio->stripes[dev_nr].dev;
4291 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4292#ifdef DEBUG
4293 struct rcu_string *name;
4294
4295 rcu_read_lock();
4296 name = rcu_dereference(dev->name);
4297 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4298 "(%s id %llu), size=%u\n", rw,
4299 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4300 name->str, dev->devid, bio->bi_size);
4301 rcu_read_unlock();
4302#endif
4303 bio->bi_bdev = dev->bdev;
4304 if (async_submit)
4305 schedule_bio(root, dev, rw, bio);
4306 else
4307 btrfsic_submit_bio(rw, bio);
4308 } else {
4309 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4310 bio->bi_sector = logical >> 9;
4311 bio_endio(bio, -EIO);
4312 }
4313 dev_nr++; 4880 dev_nr++;
4314 } 4881 }
4315 return 0; 4882 return 0;
4316} 4883}
4317 4884
4318struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 4885struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
4319 u8 *uuid, u8 *fsid) 4886 u8 *uuid, u8 *fsid)
4320{ 4887{
4321 struct btrfs_device *device; 4888 struct btrfs_device *device;
4322 struct btrfs_fs_devices *cur_devices; 4889 struct btrfs_fs_devices *cur_devices;
4323 4890
4324 cur_devices = root->fs_info->fs_devices; 4891 cur_devices = fs_info->fs_devices;
4325 while (cur_devices) { 4892 while (cur_devices) {
4326 if (!fsid || 4893 if (!fsid ||
4327 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 4894 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
@@ -4402,6 +4969,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4402 em->bdev = (struct block_device *)map; 4969 em->bdev = (struct block_device *)map;
4403 em->start = logical; 4970 em->start = logical;
4404 em->len = length; 4971 em->len = length;
4972 em->orig_start = 0;
4405 em->block_start = 0; 4973 em->block_start = 0;
4406 em->block_len = em->len; 4974 em->block_len = em->len;
4407 4975
@@ -4419,8 +4987,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4419 read_extent_buffer(leaf, uuid, (unsigned long) 4987 read_extent_buffer(leaf, uuid, (unsigned long)
4420 btrfs_stripe_dev_uuid_nr(chunk, i), 4988 btrfs_stripe_dev_uuid_nr(chunk, i),
4421 BTRFS_UUID_SIZE); 4989 BTRFS_UUID_SIZE);
4422 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 4990 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
4423 NULL); 4991 uuid, NULL);
4424 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 4992 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
4425 kfree(map); 4993 kfree(map);
4426 free_extent_map(em); 4994 free_extent_map(em);
@@ -4461,6 +5029,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
4461 device->io_align = btrfs_device_io_align(leaf, dev_item); 5029 device->io_align = btrfs_device_io_align(leaf, dev_item);
4462 device->io_width = btrfs_device_io_width(leaf, dev_item); 5030 device->io_width = btrfs_device_io_width(leaf, dev_item);
4463 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 5031 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
5032 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
5033 device->is_tgtdev_for_dev_replace = 0;
4464 5034
4465 ptr = (unsigned long)btrfs_device_uuid(dev_item); 5035 ptr = (unsigned long)btrfs_device_uuid(dev_item);
4466 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 5036 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -4538,7 +5108,7 @@ static int read_one_dev(struct btrfs_root *root,
4538 return ret; 5108 return ret;
4539 } 5109 }
4540 5110
4541 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 5111 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
4542 if (!device || !device->bdev) { 5112 if (!device || !device->bdev) {
4543 if (!btrfs_test_opt(root, DEGRADED)) 5113 if (!btrfs_test_opt(root, DEGRADED))
4544 return -EIO; 5114 return -EIO;
@@ -4571,7 +5141,7 @@ static int read_one_dev(struct btrfs_root *root,
4571 fill_device_from_item(leaf, dev_item, device); 5141 fill_device_from_item(leaf, dev_item, device);
4572 device->dev_root = root->fs_info->dev_root; 5142 device->dev_root = root->fs_info->dev_root;
4573 device->in_fs_metadata = 1; 5143 device->in_fs_metadata = 1;
4574 if (device->writeable) { 5144 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
4575 device->fs_devices->total_rw_bytes += device->total_bytes; 5145 device->fs_devices->total_rw_bytes += device->total_bytes;
4576 spin_lock(&root->fs_info->free_chunk_lock); 5146 spin_lock(&root->fs_info->free_chunk_lock);
4577 root->fs_info->free_chunk_space += device->total_bytes - 5147 root->fs_info->free_chunk_space += device->total_bytes -
@@ -4930,7 +5500,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4930 int i; 5500 int i;
4931 5501
4932 mutex_lock(&fs_devices->device_list_mutex); 5502 mutex_lock(&fs_devices->device_list_mutex);
4933 dev = btrfs_find_device(root, stats->devid, NULL, NULL); 5503 dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
4934 mutex_unlock(&fs_devices->device_list_mutex); 5504 mutex_unlock(&fs_devices->device_list_mutex);
4935 5505
4936 if (!dev) { 5506 if (!dev) {
@@ -4958,3 +5528,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4958 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 5528 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
4959 return 0; 5529 return 0;
4960} 5530}
5531
/*
 * Wipe the btrfs superblock magic on @device so the device is no longer
 * recognized as a btrfs member (used when a device leaves the filesystem,
 * e.g. after dev-replace).  The rest of the superblock is left intact.
 *
 * Returns 0 on success, -EINVAL if the superblock could not be read.
 */
5532int btrfs_scratch_superblock(struct btrfs_device *device)
5533{
5534 struct buffer_head *bh;
5535 struct btrfs_super_block *disk_super;
5536
5537 bh = btrfs_read_dev_super(device->bdev);
5538 if (!bh)
5539 return -EINVAL;
5540 disk_super = (struct btrfs_super_block *)bh->b_data;
5541
/* zero only the magic, then write the buffer back synchronously */
5542 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
5543 set_buffer_dirty(bh);
5544 sync_dirty_buffer(bh);
5545 brelse(bh);
5546
5547 return 0;
5548}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 53c06af92e8d..d3c3939ac751 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -50,6 +50,7 @@ struct btrfs_device {
50 int in_fs_metadata; 50 int in_fs_metadata;
51 int missing; 51 int missing;
52 int can_discard; 52 int can_discard;
53 int is_tgtdev_for_dev_replace;
53 54
54 spinlock_t io_lock; 55 spinlock_t io_lock;
55 56
@@ -88,7 +89,7 @@ struct btrfs_device {
88 u8 uuid[BTRFS_UUID_SIZE]; 89 u8 uuid[BTRFS_UUID_SIZE];
89 90
90 /* per-device scrub information */ 91 /* per-device scrub information */
91 struct scrub_dev *scrub_device; 92 struct scrub_ctx *scrub_device;
92 93
93 struct btrfs_work work; 94 struct btrfs_work work;
94 struct rcu_head rcu; 95 struct rcu_head rcu;
@@ -179,6 +180,15 @@ struct btrfs_device_info {
179 u64 total_avail; 180 u64 total_avail;
180}; 181};
181 182
183struct btrfs_raid_attr {
184 int sub_stripes; /* sub_stripes info for map */
185 int dev_stripes; /* stripes per dev */
186 int devs_max; /* max devs to use */
187 int devs_min; /* min devs needed */
188 int devs_increment; /* ndevs has to be a multiple of this */
189 int ncopies; /* how many copies the data has */
190};
191
182struct map_lookup { 192struct map_lookup {
183 u64 type; 193 u64 type;
184 int io_align; 194 int io_align;
@@ -248,7 +258,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
248 struct btrfs_device *device, 258 struct btrfs_device *device,
249 u64 chunk_tree, u64 chunk_objectid, 259 u64 chunk_tree, u64 chunk_objectid,
250 u64 chunk_offset, u64 start, u64 num_bytes); 260 u64 chunk_offset, u64 start, u64 num_bytes);
251int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 261int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
252 u64 logical, u64 *length, 262 u64 logical, u64 *length,
253 struct btrfs_bio **bbio_ret, int mirror_num); 263 struct btrfs_bio **bbio_ret, int mirror_num);
254int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 264int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -267,19 +277,27 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
267int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 277int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
268 struct btrfs_fs_devices **fs_devices_ret); 278 struct btrfs_fs_devices **fs_devices_ret);
269int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); 279int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
270void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); 280void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
281 struct btrfs_fs_devices *fs_devices, int step);
282int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
283 char *device_path,
284 struct btrfs_device **device);
285int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
286 struct btrfs_device **device);
271int btrfs_add_device(struct btrfs_trans_handle *trans, 287int btrfs_add_device(struct btrfs_trans_handle *trans,
272 struct btrfs_root *root, 288 struct btrfs_root *root,
273 struct btrfs_device *device); 289 struct btrfs_device *device);
274int btrfs_rm_device(struct btrfs_root *root, char *device_path); 290int btrfs_rm_device(struct btrfs_root *root, char *device_path);
275void btrfs_cleanup_fs_uuids(void); 291void btrfs_cleanup_fs_uuids(void);
276int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); 292int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
277int btrfs_grow_device(struct btrfs_trans_handle *trans, 293int btrfs_grow_device(struct btrfs_trans_handle *trans,
278 struct btrfs_device *device, u64 new_size); 294 struct btrfs_device *device, u64 new_size);
279struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 295struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
280 u8 *uuid, u8 *fsid); 296 u8 *uuid, u8 *fsid);
281int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 297int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
282int btrfs_init_new_device(struct btrfs_root *root, char *path); 298int btrfs_init_new_device(struct btrfs_root *root, char *path);
299int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
300 struct btrfs_device **device_out);
283int btrfs_balance(struct btrfs_balance_control *bctl, 301int btrfs_balance(struct btrfs_balance_control *bctl,
284 struct btrfs_ioctl_balance_args *bargs); 302 struct btrfs_ioctl_balance_args *bargs);
285int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); 303int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
@@ -296,6 +314,13 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
296int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 314int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
297int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 315int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
298 struct btrfs_fs_info *fs_info); 316 struct btrfs_fs_info *fs_info);
317void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
318 struct btrfs_device *srcdev);
319void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
320 struct btrfs_device *tgtdev);
321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
322 struct btrfs_device *tgtdev);
323int btrfs_scratch_superblock(struct btrfs_device *device);
299 324
300static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 325static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
301 int index) 326 int index)
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3f4e2d69e83a..446a6848c554 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
122 */ 122 */
123 if (!value) 123 if (!value)
124 goto out; 124 goto out;
125 } else {
126 di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
127 name, name_len, 0);
128 if (IS_ERR(di)) {
129 ret = PTR_ERR(di);
130 goto out;
131 }
132 if (!di && !value)
133 goto out;
134 btrfs_release_path(path);
125 } 135 }
126 136
127again: 137again:
@@ -198,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
198 208
199 inode_inc_iversion(inode); 209 inode_inc_iversion(inode);
200 inode->i_ctime = CURRENT_TIME; 210 inode->i_ctime = CURRENT_TIME;
211 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
201 ret = btrfs_update_inode(trans, root, inode); 212 ret = btrfs_update_inode(trans, root, inode);
202 BUG_ON(ret); 213 BUG_ON(ret);
203out: 214out:
@@ -265,7 +276,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
265 276
266 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 277 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
267 if (verify_dir_item(root, leaf, di)) 278 if (verify_dir_item(root, leaf, di))
268 continue; 279 goto next;
269 280
270 name_len = btrfs_dir_name_len(leaf, di); 281 name_len = btrfs_dir_name_len(leaf, di);
271 total_size += name_len + 1; 282 total_size += name_len + 1;