author     Linus Torvalds <torvalds@linux-foundation.org>   2013-03-02 19:41:54 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-03-02 19:41:54 -0500
commit     b695188dd39162a1a6bff11fdbcc4c0b65b933ab (patch)
tree       a3df7c052d38b5bfaf335fbf3130abcc5c6ca577 /fs
parent     48476df99894492a0f7239f2f3c9a2dde4ff38e2 (diff)
parent     180e001cd5fc2950dc6a7997dde5b65c954d0e79 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs update from Chris Mason:
 "The biggest feature in the pull is the new (and still experimental)
  raid56 code that David Woodhouse started long ago.  I'm still working
  on the parity logging setup that will avoid inconsistent parity after
  a crash, so this is only for testing right now.  But, I'd really like
  to get it out to a broader audience to hammer out any performance
  issues or other problems.  scrub does not yet correct errors on
  raid5/6 either.

  Josef has another pass at fsync performance.  The big change here is
  to combine waiting for metadata with waiting for data, which is a big
  latency win.  It is also step one toward using atomics from the
  hardware during a commit.

  Mark Fasheh has a new way to use btrfs send/receive to send only the
  metadata changes.  SUSE is using this to make snapper more efficient
  at finding changes between snapshots.

  Snapshot-aware defrag is also included.

  Otherwise we have a large number of fixes and cleanups.  Eric Sandeen
  wins the award for removing the most lines, and I'm hoping we steal
  this idea from XFS over and over again."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (118 commits)
  btrfs: fixup/remove module.h usage as required
  Btrfs: delete inline extents when we find them during logging
  btrfs: try harder to allocate raid56 stripe cache
  Btrfs: cleanup to make the function btrfs_delalloc_reserve_metadata more logic
  Btrfs: don't call btrfs_qgroup_free if just btrfs_qgroup_reserve fails
  Btrfs: remove reduplicate check about root in the function btrfs_clean_quota_tree
  Btrfs: return ENOMEM rather than use BUG_ON when btrfs_alloc_path fails
  Btrfs: fix missing deleted items in btrfs_clean_quota_tree
  btrfs: use only inline_pages from extent buffer
  Btrfs: fix wrong reserved space when deleting a snapshot/subvolume
  Btrfs: fix wrong reserved space in qgroup during snap/subv creation
  Btrfs: remove unnecessary dget_parent/dput when creating the pending snapshot
  btrfs: remove a printk from scan_one_device
  Btrfs: fix NULL pointer after aborting a transaction
  Btrfs: fix memory leak of log roots
  Btrfs: copy everything if we've created an inline extent
  btrfs: cleanup for open-coded alignment
  Btrfs: do not change inode flags in rename
  Btrfs: use reserved space for creating a snapshot
  clear chunk_alloc flag on retryable failure
  ...
Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/Kconfig            |    3
-rw-r--r--  fs/btrfs/Makefile           |    2
-rw-r--r--  fs/btrfs/backref.c          |    5
-rw-r--r--  fs/btrfs/backref.h          |    2
-rw-r--r--  fs/btrfs/btrfs_inode.h      |   20
-rw-r--r--  fs/btrfs/check-integrity.c  |    3
-rw-r--r--  fs/btrfs/compression.c      |    4
-rw-r--r--  fs/btrfs/ctree.c            |   68
-rw-r--r--  fs/btrfs/ctree.h            |  150
-rw-r--r--  fs/btrfs/delayed-inode.c    |  147
-rw-r--r--  fs/btrfs/delayed-inode.h    |    1
-rw-r--r--  fs/btrfs/delayed-ref.c      |   82
-rw-r--r--  fs/btrfs/delayed-ref.h      |   52
-rw-r--r--  fs/btrfs/dev-replace.c      |    6
-rw-r--r--  fs/btrfs/disk-io.c          |  227
-rw-r--r--  fs/btrfs/disk-io.h          |    7
-rw-r--r--  fs/btrfs/extent-tree.c      |  578
-rw-r--r--  fs/btrfs/extent_io.c        |  138
-rw-r--r--  fs/btrfs/extent_io.h        |    8
-rw-r--r--  fs/btrfs/extent_map.c       |    1
-rw-r--r--  fs/btrfs/file-item.c        |   67
-rw-r--r--  fs/btrfs/file.c             |   57
-rw-r--r--  fs/btrfs/free-space-cache.c |   62
-rw-r--r--  fs/btrfs/inode.c            | 1064
-rw-r--r--  fs/btrfs/ioctl.c            |  211
-rw-r--r--  fs/btrfs/ioctl.h            |  502
-rw-r--r--  fs/btrfs/locking.c          |    5
-rw-r--r--  fs/btrfs/ordered-data.c     |   98
-rw-r--r--  fs/btrfs/ordered-data.h     |   14
-rw-r--r--  fs/btrfs/print-tree.c       |    1
-rw-r--r--  fs/btrfs/qgroup.c           |   55
-rw-r--r--  fs/btrfs/raid56.c           | 2099
-rw-r--r--  fs/btrfs/raid56.h           |   51
-rw-r--r--  fs/btrfs/relocation.c       |    2
-rw-r--r--  fs/btrfs/scrub.c            |   10
-rw-r--r--  fs/btrfs/send.c             |   53
-rw-r--r--  fs/btrfs/send.h             |    1
-rw-r--r--  fs/btrfs/super.c            |   89
-rw-r--r--  fs/btrfs/sysfs.c            |    1
-rw-r--r--  fs/btrfs/transaction.c      |  151
-rw-r--r--  fs/btrfs/transaction.h      |    8
-rw-r--r--  fs/btrfs/tree-defrag.c      |   19
-rw-r--r--  fs/btrfs/tree-log.c         |  166
-rw-r--r--  fs/btrfs/ulist.c            |    2
-rw-r--r--  fs/btrfs/volumes.c          |  636
-rw-r--r--  fs/btrfs/volumes.h          |   11
46 files changed, 5421 insertions, 1518 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ccd25ba7a9ac..9a8622a5b867 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -5,6 +5,9 @@ config BTRFS_FS
5 select ZLIB_DEFLATE 5 select ZLIB_DEFLATE
6 select LZO_COMPRESS 6 select LZO_COMPRESS
7 select LZO_DECOMPRESS 7 select LZO_DECOMPRESS
8 select RAID6_PQ
9 select XOR_BLOCKS
10
8 help 11 help
9 Btrfs is a new filesystem with extents, writable snapshotting, 12 Btrfs is a new filesystem with extents, writable snapshotting,
10 support for multiple devices and many more features. 13 support for multiple devices and many more features.
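The two new selects pull in the kernel's shared RAID6 syndrome library (RAID6_PQ) and XOR helpers (XOR_BLOCKS), which the new raid56.c builds on. As a rough illustration of the parity math those libraries provide, here is a minimal userspace sketch, not the kernel implementation; the stripe count and length are made up. RAID5's P parity is the plain XOR of the data stripes, and RAID6's Q syndrome is a weighted sum over GF(2^8).

#include <stdio.h>

#define NDATA      3        /* hypothetical number of data stripes */
#define STRIPE_LEN 8        /* hypothetical stripe length in bytes */

/* multiply by the generator x in GF(2^8), RAID6 polynomial 0x11d */
static unsigned char gf_mul2(unsigned char v)
{
        return (unsigned char)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

int main(void)
{
        unsigned char data[NDATA][STRIPE_LEN] = { "stripe0", "stripe1", "stripe2" };
        unsigned char p[STRIPE_LEN], q[STRIPE_LEN], rebuilt[STRIPE_LEN];
        int i, d;

        for (i = 0; i < STRIPE_LEN; i++) {
                p[i] = 0;               /* P: XOR of all data stripes */
                for (d = 0; d < NDATA; d++)
                        p[i] ^= data[d][i];

                q[i] = 0;               /* Q: sum of g^d * D_d over GF(2^8), Horner form */
                for (d = NDATA - 1; d >= 0; d--)
                        q[i] = gf_mul2(q[i]) ^ data[d][i];
        }

        /* one data stripe lost: rebuild it from P and the survivors */
        for (i = 0; i < STRIPE_LEN; i++)
                rebuilt[i] = p[i] ^ data[0][i] ^ data[2][i];

        printf("rebuilt: %.7s, q[0] = 0x%02x\n", rebuilt, q[0]);
        return 0;
}

Losing two stripes is where Q earns its keep: with P alone only one failure is recoverable, while P and Q together let any two missing stripes be solved for, which is why scrub on raid5/6 is called out above as not yet correcting errors.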
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7df3e0f0ee51..3932224f99e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o 11 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
12 12
13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o 14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 04edf69be875..bd605c87adfd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -352,11 +352,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
352 err = __resolve_indirect_ref(fs_info, search_commit_root, 352 err = __resolve_indirect_ref(fs_info, search_commit_root,
353 time_seq, ref, parents, 353 time_seq, ref, parents,
354 extent_item_pos); 354 extent_item_pos);
355 if (err) { 355 if (err)
356 if (ret == 0)
357 ret = err;
358 continue; 356 continue;
359 }
360 357
361 /* we put the first parent into the ref at hand */ 358 /* we put the first parent into the ref at hand */
362 ULIST_ITER_INIT(&uiter); 359 ULIST_ITER_INIT(&uiter);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index d61feca79455..310a7f6d09b1 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -19,7 +19,7 @@
19#ifndef __BTRFS_BACKREF__ 19#ifndef __BTRFS_BACKREF__
20#define __BTRFS_BACKREF__ 20#define __BTRFS_BACKREF__
21 21
22#include "ioctl.h" 22#include <linux/btrfs.h>
23#include "ulist.h" 23#include "ulist.h"
24#include "extent_io.h" 24#include "extent_io.h"
25 25
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2a8c242bc4f5..d9b97d4960e6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -40,6 +40,8 @@
40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
41#define BTRFS_INODE_NEEDS_FULL_SYNC 7 41#define BTRFS_INODE_NEEDS_FULL_SYNC 7
42#define BTRFS_INODE_COPY_EVERYTHING 8 42#define BTRFS_INODE_COPY_EVERYTHING 8
43#define BTRFS_INODE_IN_DELALLOC_LIST 9
44#define BTRFS_INODE_READDIO_NEED_LOCK 10
43 45
44/* in memory btrfs inode */ 46/* in memory btrfs inode */
45struct btrfs_inode { 47struct btrfs_inode {
@@ -216,4 +218,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
216 return 0; 218 return 0;
217} 219}
218 220
221/*
222 * Disable DIO read nolock optimization, so new dio readers will be forced
223 * to grab i_mutex. It is used to avoid the endless truncate due to
224 * nonlocked dio read.
225 */
226static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
227{
228 set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
229 smp_mb();
230}
231
232static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
233{
234 smp_mb__before_clear_bit();
235 clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
236 &BTRFS_I(inode)->runtime_flags);
237}
238
219#endif 239#endif
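The two helpers above publish and clear the new BTRFS_INODE_READDIO_NEED_LOCK bit with memory barriers so that a truncate-like path can push new direct-IO readers off the lock-free fast path and then wait out the readers already in flight. The btrfs call sites are not part of this hunk; the following is only a userspace analogue of the flag-plus-barrier pattern, with made-up names.

#include <stdatomic.h>
#include <stdio.h>

static atomic_bool need_lock;        /* stands in for BTRFS_INODE_READDIO_NEED_LOCK */
static atomic_int  unlocked_readers; /* direct-IO readers on the lock-free path */

static void dio_read(void)
{
        if (!atomic_load(&need_lock)) {
                atomic_fetch_add(&unlocked_readers, 1);
                /* ... lock-free read ... */
                atomic_fetch_sub(&unlocked_readers, 1);
        } else {
                /* ... fall back to the locked (i_mutex-style) read ... */
        }
}

static void truncate_like_path(void)
{
        atomic_store(&need_lock, 1);                 /* block_unlocked_dio + barrier */
        while (atomic_load(&unlocked_readers) != 0)  /* wait out in-flight readers  */
                ;
        /* ... change the size safely ... */
        atomic_store(&need_lock, 0);                 /* resume_unlocked_dio */
}

int main(void)
{
        dio_read();
        truncate_like_path();
        printf("ok\n");
        return 0;
}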
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 11d47bfb62b4..18af6f48781a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -813,8 +813,7 @@ static int btrfsic_process_superblock_dev_mirror(
813 (bh->b_data + (dev_bytenr & 4095)); 813 (bh->b_data + (dev_bytenr & 4095));
814 814
815 if (btrfs_super_bytenr(super_tmp) != dev_bytenr || 815 if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
816 strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, 816 super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) ||
817 sizeof(super_tmp->magic)) ||
818 memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || 817 memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
819 btrfs_super_nodesize(super_tmp) != state->metablock_size || 818 btrfs_super_nodesize(super_tmp) != state->metablock_size ||
820 btrfs_super_leafsize(super_tmp) != state->metablock_size || 819 btrfs_super_leafsize(super_tmp) != state->metablock_size ||
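With BTRFS_MAGIC redefined as a 64-bit constant (see the ctree.h hunk below), the superblock check becomes a single integer compare against cpu_to_le64(BTRFS_MAGIC) instead of a strncmp. A quick userspace check that the constant really is the ASCII string "_BHRfS_M" read as a little-endian u64 (assumes a little-endian host):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        const char magic[8] = { '_', 'B', 'H', 'R', 'f', 'S', '_', 'M' };
        uint64_t v;

        memcpy(&v, magic, sizeof(v));
        printf("0x%016llx\n", (unsigned long long)v); /* prints 0x4d5f53665248425f */
        return v == 0x4D5F53665248425FULL ? 0 : 1;
}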
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 94ab2f80e7e3..15b94089abc4 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -372,7 +372,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
372 page = compressed_pages[pg_index]; 372 page = compressed_pages[pg_index];
373 page->mapping = inode->i_mapping; 373 page->mapping = inode->i_mapping;
374 if (bio->bi_size) 374 if (bio->bi_size)
375 ret = io_tree->ops->merge_bio_hook(page, 0, 375 ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
376 PAGE_CACHE_SIZE, 376 PAGE_CACHE_SIZE,
377 bio, 0); 377 bio, 0);
378 else 378 else
@@ -655,7 +655,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
655 page->index = em_start >> PAGE_CACHE_SHIFT; 655 page->index = em_start >> PAGE_CACHE_SHIFT;
656 656
657 if (comp_bio->bi_size) 657 if (comp_bio->bi_size)
658 ret = tree->ops->merge_bio_hook(page, 0, 658 ret = tree->ops->merge_bio_hook(READ, page, 0,
659 PAGE_CACHE_SIZE, 659 PAGE_CACHE_SIZE,
660 comp_bio, 0); 660 comp_bio, 0);
661 else 661 else
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index eea5da7a2b9a..ecd25a1b4e51 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1138,6 +1138,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1138 switch (tm->op) { 1138 switch (tm->op) {
1139 case MOD_LOG_KEY_REMOVE_WHILE_FREEING: 1139 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1140 BUG_ON(tm->slot < n); 1140 BUG_ON(tm->slot < n);
1141 /* Fallthrough */
1141 case MOD_LOG_KEY_REMOVE_WHILE_MOVING: 1142 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1142 case MOD_LOG_KEY_REMOVE: 1143 case MOD_LOG_KEY_REMOVE:
1143 btrfs_set_node_key(eb, &tm->key, tm->slot); 1144 btrfs_set_node_key(eb, &tm->key, tm->slot);
@@ -1222,7 +1223,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1222 1223
1223 __tree_mod_log_rewind(eb_rewin, time_seq, tm); 1224 __tree_mod_log_rewind(eb_rewin, time_seq, tm);
1224 WARN_ON(btrfs_header_nritems(eb_rewin) > 1225 WARN_ON(btrfs_header_nritems(eb_rewin) >
1225 BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root)); 1226 BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
1226 1227
1227 return eb_rewin; 1228 return eb_rewin;
1228} 1229}
@@ -1441,7 +1442,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1441 */ 1442 */
1442int btrfs_realloc_node(struct btrfs_trans_handle *trans, 1443int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1443 struct btrfs_root *root, struct extent_buffer *parent, 1444 struct btrfs_root *root, struct extent_buffer *parent,
1444 int start_slot, int cache_only, u64 *last_ret, 1445 int start_slot, u64 *last_ret,
1445 struct btrfs_key *progress) 1446 struct btrfs_key *progress)
1446{ 1447{
1447 struct extent_buffer *cur; 1448 struct extent_buffer *cur;
@@ -1461,8 +1462,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1461 struct btrfs_disk_key disk_key; 1462 struct btrfs_disk_key disk_key;
1462 1463
1463 parent_level = btrfs_header_level(parent); 1464 parent_level = btrfs_header_level(parent);
1464 if (cache_only && parent_level != 1)
1465 return 0;
1466 1465
1467 WARN_ON(trans->transaction != root->fs_info->running_transaction); 1466 WARN_ON(trans->transaction != root->fs_info->running_transaction);
1468 WARN_ON(trans->transid != root->fs_info->generation); 1467 WARN_ON(trans->transid != root->fs_info->generation);
@@ -1508,10 +1507,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1508 else 1507 else
1509 uptodate = 0; 1508 uptodate = 0;
1510 if (!cur || !uptodate) { 1509 if (!cur || !uptodate) {
1511 if (cache_only) {
1512 free_extent_buffer(cur);
1513 continue;
1514 }
1515 if (!cur) { 1510 if (!cur) {
1516 cur = read_tree_block(root, blocknr, 1511 cur = read_tree_block(root, blocknr,
1517 blocksize, gen); 1512 blocksize, gen);
@@ -4825,8 +4820,8 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4825 4820
4826/* 4821/*
4827 * A helper function to walk down the tree starting at min_key, and looking 4822 * A helper function to walk down the tree starting at min_key, and looking
4828 * for nodes or leaves that are either in cache or have a minimum 4823 * for nodes or leaves that are have a minimum transaction id.
4829 * transaction id. This is used by the btree defrag code, and tree logging 4824 * This is used by the btree defrag code, and tree logging
4830 * 4825 *
4831 * This does not cow, but it does stuff the starting key it finds back 4826 * This does not cow, but it does stuff the starting key it finds back
4832 * into min_key, so you can call btrfs_search_slot with cow=1 on the 4827 * into min_key, so you can call btrfs_search_slot with cow=1 on the
@@ -4847,7 +4842,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4847 */ 4842 */
4848int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, 4843int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4849 struct btrfs_key *max_key, 4844 struct btrfs_key *max_key,
4850 struct btrfs_path *path, int cache_only, 4845 struct btrfs_path *path,
4851 u64 min_trans) 4846 u64 min_trans)
4852{ 4847{
4853 struct extent_buffer *cur; 4848 struct extent_buffer *cur;
@@ -4887,15 +4882,12 @@ again:
4887 if (sret && slot > 0) 4882 if (sret && slot > 0)
4888 slot--; 4883 slot--;
4889 /* 4884 /*
4890 * check this node pointer against the cache_only and 4885 * check this node pointer against the min_trans parameters.
4891 * min_trans parameters. If it isn't in cache or is too 4886 * If it is too old, old, skip to the next one.
4892 * old, skip to the next one.
4893 */ 4887 */
4894 while (slot < nritems) { 4888 while (slot < nritems) {
4895 u64 blockptr; 4889 u64 blockptr;
4896 u64 gen; 4890 u64 gen;
4897 struct extent_buffer *tmp;
4898 struct btrfs_disk_key disk_key;
4899 4891
4900 blockptr = btrfs_node_blockptr(cur, slot); 4892 blockptr = btrfs_node_blockptr(cur, slot);
4901 gen = btrfs_node_ptr_generation(cur, slot); 4893 gen = btrfs_node_ptr_generation(cur, slot);
@@ -4903,27 +4895,7 @@ again:
4903 slot++; 4895 slot++;
4904 continue; 4896 continue;
4905 } 4897 }
4906 if (!cache_only) 4898 break;
4907 break;
4908
4909 if (max_key) {
4910 btrfs_node_key(cur, &disk_key, slot);
4911 if (comp_keys(&disk_key, max_key) >= 0) {
4912 ret = 1;
4913 goto out;
4914 }
4915 }
4916
4917 tmp = btrfs_find_tree_block(root, blockptr,
4918 btrfs_level_size(root, level - 1));
4919
4920 if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
4921 free_extent_buffer(tmp);
4922 break;
4923 }
4924 if (tmp)
4925 free_extent_buffer(tmp);
4926 slot++;
4927 } 4899 }
4928find_next_key: 4900find_next_key:
4929 /* 4901 /*
@@ -4934,7 +4906,7 @@ find_next_key:
4934 path->slots[level] = slot; 4906 path->slots[level] = slot;
4935 btrfs_set_path_blocking(path); 4907 btrfs_set_path_blocking(path);
4936 sret = btrfs_find_next_key(root, path, min_key, level, 4908 sret = btrfs_find_next_key(root, path, min_key, level,
4937 cache_only, min_trans); 4909 min_trans);
4938 if (sret == 0) { 4910 if (sret == 0) {
4939 btrfs_release_path(path); 4911 btrfs_release_path(path);
4940 goto again; 4912 goto again;
@@ -5399,8 +5371,7 @@ out:
5399/* 5371/*
5400 * this is similar to btrfs_next_leaf, but does not try to preserve 5372 * this is similar to btrfs_next_leaf, but does not try to preserve
5401 * and fixup the path. It looks for and returns the next key in the 5373 * and fixup the path. It looks for and returns the next key in the
5402 * tree based on the current path and the cache_only and min_trans 5374 * tree based on the current path and the min_trans parameters.
5403 * parameters.
5404 * 5375 *
5405 * 0 is returned if another key is found, < 0 if there are any errors 5376 * 0 is returned if another key is found, < 0 if there are any errors
5406 * and 1 is returned if there are no higher keys in the tree 5377 * and 1 is returned if there are no higher keys in the tree
@@ -5409,8 +5380,7 @@ out:
5409 * calling this function. 5380 * calling this function.
5410 */ 5381 */
5411int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 5382int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5412 struct btrfs_key *key, int level, 5383 struct btrfs_key *key, int level, u64 min_trans)
5413 int cache_only, u64 min_trans)
5414{ 5384{
5415 int slot; 5385 int slot;
5416 struct extent_buffer *c; 5386 struct extent_buffer *c;
@@ -5461,22 +5431,8 @@ next:
5461 if (level == 0) 5431 if (level == 0)
5462 btrfs_item_key_to_cpu(c, key, slot); 5432 btrfs_item_key_to_cpu(c, key, slot);
5463 else { 5433 else {
5464 u64 blockptr = btrfs_node_blockptr(c, slot);
5465 u64 gen = btrfs_node_ptr_generation(c, slot); 5434 u64 gen = btrfs_node_ptr_generation(c, slot);
5466 5435
5467 if (cache_only) {
5468 struct extent_buffer *cur;
5469 cur = btrfs_find_tree_block(root, blockptr,
5470 btrfs_level_size(root, level - 1));
5471 if (!cur ||
5472 btrfs_buffer_uptodate(cur, gen, 1) <= 0) {
5473 slot++;
5474 if (cur)
5475 free_extent_buffer(cur);
5476 goto next;
5477 }
5478 free_extent_buffer(cur);
5479 }
5480 if (gen < min_trans) { 5436 if (gen < min_trans) {
5481 slot++; 5437 slot++;
5482 goto next; 5438 goto next;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 547b7b05727f..0d82922179db 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -31,10 +31,10 @@
31#include <trace/events/btrfs.h> 31#include <trace/events/btrfs.h>
32#include <asm/kmap_types.h> 32#include <asm/kmap_types.h>
33#include <linux/pagemap.h> 33#include <linux/pagemap.h>
34#include <linux/btrfs.h>
34#include "extent_io.h" 35#include "extent_io.h"
35#include "extent_map.h" 36#include "extent_map.h"
36#include "async-thread.h" 37#include "async-thread.h"
37#include "ioctl.h"
38 38
39struct btrfs_trans_handle; 39struct btrfs_trans_handle;
40struct btrfs_transaction; 40struct btrfs_transaction;
@@ -46,7 +46,7 @@ extern struct kmem_cache *btrfs_path_cachep;
46extern struct kmem_cache *btrfs_free_space_cachep; 46extern struct kmem_cache *btrfs_free_space_cachep;
47struct btrfs_ordered_sum; 47struct btrfs_ordered_sum;
48 48
49#define BTRFS_MAGIC "_BHRfS_M" 49#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
50 50
51#define BTRFS_MAX_MIRRORS 3 51#define BTRFS_MAX_MIRRORS 3
52 52
@@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
191/* ioprio of readahead is set to idle */ 191/* ioprio of readahead is set to idle */
192#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) 192#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
193 193
194#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
195
194/* 196/*
195 * The key defines the order in the tree, and so it also defines (optimal) 197 * The key defines the order in the tree, and so it also defines (optimal)
196 * block layout. 198 * block layout.
@@ -336,7 +338,10 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
336/* 338/*
337 * File system states 339 * File system states
338 */ 340 */
341#define BTRFS_FS_STATE_ERROR 0
342#define BTRFS_FS_STATE_REMOUNTING 1
339 343
344/* Super block flags */
340/* Errors detected */ 345/* Errors detected */
341#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) 346#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
342 347
@@ -502,6 +507,7 @@ struct btrfs_super_block {
502#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) 507#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
503 508
504#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) 509#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
510#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
505 511
506#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 512#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
507#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 513#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
@@ -511,6 +517,7 @@ struct btrfs_super_block {
511 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ 517 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
512 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ 518 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
513 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ 519 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
520 BTRFS_FEATURE_INCOMPAT_RAID56 | \
514 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) 521 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
515 522
516/* 523/*
@@ -952,8 +959,20 @@ struct btrfs_dev_replace_item {
952#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) 959#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
953#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) 960#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
954#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) 961#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
962#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7)
963#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8)
955#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE 964#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
956#define BTRFS_NR_RAID_TYPES 5 965
966enum btrfs_raid_types {
967 BTRFS_RAID_RAID10,
968 BTRFS_RAID_RAID1,
969 BTRFS_RAID_DUP,
970 BTRFS_RAID_RAID0,
971 BTRFS_RAID_SINGLE,
972 BTRFS_RAID_RAID5,
973 BTRFS_RAID_RAID6,
974 BTRFS_NR_RAID_TYPES
975};
957 976
958#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ 977#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
959 BTRFS_BLOCK_GROUP_SYSTEM | \ 978 BTRFS_BLOCK_GROUP_SYSTEM | \
@@ -961,6 +980,8 @@ struct btrfs_dev_replace_item {
961 980
962#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ 981#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
963 BTRFS_BLOCK_GROUP_RAID1 | \ 982 BTRFS_BLOCK_GROUP_RAID1 | \
983 BTRFS_BLOCK_GROUP_RAID5 | \
984 BTRFS_BLOCK_GROUP_RAID6 | \
964 BTRFS_BLOCK_GROUP_DUP | \ 985 BTRFS_BLOCK_GROUP_DUP | \
965 BTRFS_BLOCK_GROUP_RAID10) 986 BTRFS_BLOCK_GROUP_RAID10)
966/* 987/*
@@ -1185,6 +1206,10 @@ struct btrfs_block_group_cache {
1185 u64 flags; 1206 u64 flags;
1186 u64 sectorsize; 1207 u64 sectorsize;
1187 u64 cache_generation; 1208 u64 cache_generation;
1209
1210 /* for raid56, this is a full stripe, without parity */
1211 unsigned long full_stripe_len;
1212
1188 unsigned int ro:1; 1213 unsigned int ro:1;
1189 unsigned int dirty:1; 1214 unsigned int dirty:1;
1190 unsigned int iref:1; 1215 unsigned int iref:1;
@@ -1225,6 +1250,28 @@ struct seq_list {
1225 u64 seq; 1250 u64 seq;
1226}; 1251};
1227 1252
1253enum btrfs_orphan_cleanup_state {
1254 ORPHAN_CLEANUP_STARTED = 1,
1255 ORPHAN_CLEANUP_DONE = 2,
1256};
1257
1258/* used by the raid56 code to lock stripes for read/modify/write */
1259struct btrfs_stripe_hash {
1260 struct list_head hash_list;
1261 wait_queue_head_t wait;
1262 spinlock_t lock;
1263};
1264
1265/* used by the raid56 code to lock stripes for read/modify/write */
1266struct btrfs_stripe_hash_table {
1267 struct list_head stripe_cache;
1268 spinlock_t cache_lock;
1269 int cache_size;
1270 struct btrfs_stripe_hash table[];
1271};
1272
1273#define BTRFS_STRIPE_HASH_TABLE_BITS 11
1274
1228/* fs_info */ 1275/* fs_info */
1229struct reloc_control; 1276struct reloc_control;
1230struct btrfs_device; 1277struct btrfs_device;
@@ -1250,6 +1297,7 @@ struct btrfs_fs_info {
1250 1297
1251 /* block group cache stuff */ 1298 /* block group cache stuff */
1252 spinlock_t block_group_cache_lock; 1299 spinlock_t block_group_cache_lock;
1300 u64 first_logical_byte;
1253 struct rb_root block_group_cache_tree; 1301 struct rb_root block_group_cache_tree;
1254 1302
1255 /* keep track of unallocated space */ 1303 /* keep track of unallocated space */
@@ -1288,7 +1336,23 @@ struct btrfs_fs_info {
1288 u64 last_trans_log_full_commit; 1336 u64 last_trans_log_full_commit;
1289 unsigned long mount_opt; 1337 unsigned long mount_opt;
1290 unsigned long compress_type:4; 1338 unsigned long compress_type:4;
1339 /*
1340 * It is a suggestive number, the read side is safe even it gets a
1341 * wrong number because we will write out the data into a regular
1342 * extent. The write side(mount/remount) is under ->s_umount lock,
1343 * so it is also safe.
1344 */
1291 u64 max_inline; 1345 u64 max_inline;
1346 /*
1347 * Protected by ->chunk_mutex and sb->s_umount.
1348 *
1349 * The reason that we use two lock to protect it is because only
1350 * remount and mount operations can change it and these two operations
1351 * are under sb->s_umount, but the read side (chunk allocation) can not
1352 * acquire sb->s_umount or the deadlock would happen. So we use two
1353 * locks to protect it. On the write side, we must acquire two locks,
1354 * and on the read side, we just need acquire one of them.
1355 */
1292 u64 alloc_start; 1356 u64 alloc_start;
1293 struct btrfs_transaction *running_transaction; 1357 struct btrfs_transaction *running_transaction;
1294 wait_queue_head_t transaction_throttle; 1358 wait_queue_head_t transaction_throttle;
@@ -1307,6 +1371,13 @@ struct btrfs_fs_info {
1307 struct mutex cleaner_mutex; 1371 struct mutex cleaner_mutex;
1308 struct mutex chunk_mutex; 1372 struct mutex chunk_mutex;
1309 struct mutex volume_mutex; 1373 struct mutex volume_mutex;
1374
1375 /* this is used during read/modify/write to make sure
1376 * no two ios are trying to mod the same stripe at the same
1377 * time
1378 */
1379 struct btrfs_stripe_hash_table *stripe_hash_table;
1380
1310 /* 1381 /*
1311 * this protects the ordered operations list only while we are 1382 * this protects the ordered operations list only while we are
1312 * processing all of the entries on it. This way we make 1383 * processing all of the entries on it. This way we make
@@ -1365,6 +1436,7 @@ struct btrfs_fs_info {
1365 */ 1436 */
1366 struct list_head ordered_extents; 1437 struct list_head ordered_extents;
1367 1438
1439 spinlock_t delalloc_lock;
1368 /* 1440 /*
1369 * all of the inodes that have delalloc bytes. It is possible for 1441 * all of the inodes that have delalloc bytes. It is possible for
1370 * this list to be empty even when there is still dirty data=ordered 1442 * this list to be empty even when there is still dirty data=ordered
@@ -1373,13 +1445,6 @@ struct btrfs_fs_info {
1373 struct list_head delalloc_inodes; 1445 struct list_head delalloc_inodes;
1374 1446
1375 /* 1447 /*
1376 * special rename and truncate targets that must be on disk before
1377 * we're allowed to commit. This is basically the ext3 style
1378 * data=ordered list.
1379 */
1380 struct list_head ordered_operations;
1381
1382 /*
1383 * there is a pool of worker threads for checksumming during writes 1448 * there is a pool of worker threads for checksumming during writes
1384 * and a pool for checksumming after reads. This is because readers 1449 * and a pool for checksumming after reads. This is because readers
1385 * can run with FS locks held, and the writers may be waiting for 1450 * can run with FS locks held, and the writers may be waiting for
@@ -1395,6 +1460,8 @@ struct btrfs_fs_info {
1395 struct btrfs_workers flush_workers; 1460 struct btrfs_workers flush_workers;
1396 struct btrfs_workers endio_workers; 1461 struct btrfs_workers endio_workers;
1397 struct btrfs_workers endio_meta_workers; 1462 struct btrfs_workers endio_meta_workers;
1463 struct btrfs_workers endio_raid56_workers;
1464 struct btrfs_workers rmw_workers;
1398 struct btrfs_workers endio_meta_write_workers; 1465 struct btrfs_workers endio_meta_write_workers;
1399 struct btrfs_workers endio_write_workers; 1466 struct btrfs_workers endio_write_workers;
1400 struct btrfs_workers endio_freespace_worker; 1467 struct btrfs_workers endio_freespace_worker;
@@ -1423,10 +1490,12 @@ struct btrfs_fs_info {
1423 1490
1424 u64 total_pinned; 1491 u64 total_pinned;
1425 1492
1426 /* protected by the delalloc lock, used to keep from writing 1493 /* used to keep from writing metadata until there is a nice batch */
1427 * metadata until there is a nice batch 1494 struct percpu_counter dirty_metadata_bytes;
1428 */ 1495 struct percpu_counter delalloc_bytes;
1429 u64 dirty_metadata_bytes; 1496 s32 dirty_metadata_batch;
1497 s32 delalloc_batch;
1498
1430 struct list_head dirty_cowonly_roots; 1499 struct list_head dirty_cowonly_roots;
1431 1500
1432 struct btrfs_fs_devices *fs_devices; 1501 struct btrfs_fs_devices *fs_devices;
@@ -1442,9 +1511,6 @@ struct btrfs_fs_info {
1442 1511
1443 struct reloc_control *reloc_ctl; 1512 struct reloc_control *reloc_ctl;
1444 1513
1445 spinlock_t delalloc_lock;
1446 u64 delalloc_bytes;
1447
1448 /* data_alloc_cluster is only used in ssd mode */ 1514 /* data_alloc_cluster is only used in ssd mode */
1449 struct btrfs_free_cluster data_alloc_cluster; 1515 struct btrfs_free_cluster data_alloc_cluster;
1450 1516
@@ -1456,6 +1522,8 @@ struct btrfs_fs_info {
1456 struct rb_root defrag_inodes; 1522 struct rb_root defrag_inodes;
1457 atomic_t defrag_running; 1523 atomic_t defrag_running;
1458 1524
1525 /* Used to protect avail_{data, metadata, system}_alloc_bits */
1526 seqlock_t profiles_lock;
1459 /* 1527 /*
1460 * these three are in extended format (availability of single 1528 * these three are in extended format (availability of single
1461 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other 1529 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1520,7 +1588,7 @@ struct btrfs_fs_info {
1520 u64 qgroup_seq; 1588 u64 qgroup_seq;
1521 1589
1522 /* filesystem state */ 1590 /* filesystem state */
1523 u64 fs_state; 1591 unsigned long fs_state;
1524 1592
1525 struct btrfs_delayed_root *delayed_root; 1593 struct btrfs_delayed_root *delayed_root;
1526 1594
@@ -1623,6 +1691,9 @@ struct btrfs_root {
1623 1691
1624 struct list_head root_list; 1692 struct list_head root_list;
1625 1693
1694 spinlock_t log_extents_lock[2];
1695 struct list_head logged_list[2];
1696
1626 spinlock_t orphan_lock; 1697 spinlock_t orphan_lock;
1627 atomic_t orphan_inodes; 1698 atomic_t orphan_inodes;
1628 struct btrfs_block_rsv *orphan_block_rsv; 1699 struct btrfs_block_rsv *orphan_block_rsv;
@@ -1832,6 +1903,7 @@ struct btrfs_ioctl_defrag_range_args {
1832 1903
1833#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1904#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1834#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1905#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
1906#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
1835#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ 1907#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
1836 BTRFS_MOUNT_##opt) 1908 BTRFS_MOUNT_##opt)
1837/* 1909/*
@@ -2936,8 +3008,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
2936 u64 num_bytes, u64 *refs, u64 *flags); 3008 u64 num_bytes, u64 *refs, u64 *flags);
2937int btrfs_pin_extent(struct btrfs_root *root, 3009int btrfs_pin_extent(struct btrfs_root *root,
2938 u64 bytenr, u64 num, int reserved); 3010 u64 bytenr, u64 num, int reserved);
2939int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, 3011int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
2940 struct btrfs_root *root,
2941 u64 bytenr, u64 num_bytes); 3012 u64 bytenr, u64 num_bytes);
2942int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 3013int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2943 struct btrfs_root *root, 3014 struct btrfs_root *root,
@@ -3035,8 +3106,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3035int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 3106int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3036 struct inode *inode); 3107 struct inode *inode);
3037void btrfs_orphan_release_metadata(struct inode *inode); 3108void btrfs_orphan_release_metadata(struct inode *inode);
3038int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, 3109int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
3039 struct btrfs_pending_snapshot *pending); 3110 struct btrfs_block_rsv *rsv,
3111 int nitems,
3112 u64 *qgroup_reserved);
3113void btrfs_subvolume_release_metadata(struct btrfs_root *root,
3114 struct btrfs_block_rsv *rsv,
3115 u64 qgroup_reserved);
3040int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); 3116int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
3041void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); 3117void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
3042int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); 3118int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
@@ -3092,10 +3168,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
3092struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); 3168struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
3093int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 3169int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3094 struct btrfs_key *key, int lowest_level, 3170 struct btrfs_key *key, int lowest_level,
3095 int cache_only, u64 min_trans); 3171 u64 min_trans);
3096int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, 3172int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3097 struct btrfs_key *max_key, 3173 struct btrfs_key *max_key,
3098 struct btrfs_path *path, int cache_only, 3174 struct btrfs_path *path,
3099 u64 min_trans); 3175 u64 min_trans);
3100enum btrfs_compare_tree_result { 3176enum btrfs_compare_tree_result {
3101 BTRFS_COMPARE_TREE_NEW, 3177 BTRFS_COMPARE_TREE_NEW,
@@ -3148,7 +3224,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
3148 int find_higher, int return_any); 3224 int find_higher, int return_any);
3149int btrfs_realloc_node(struct btrfs_trans_handle *trans, 3225int btrfs_realloc_node(struct btrfs_trans_handle *trans,
3150 struct btrfs_root *root, struct extent_buffer *parent, 3226 struct btrfs_root *root, struct extent_buffer *parent,
3151 int start_slot, int cache_only, u64 *last_ret, 3227 int start_slot, u64 *last_ret,
3152 struct btrfs_key *progress); 3228 struct btrfs_key *progress);
3153void btrfs_release_path(struct btrfs_path *p); 3229void btrfs_release_path(struct btrfs_path *p);
3154struct btrfs_path *btrfs_alloc_path(void); 3230struct btrfs_path *btrfs_alloc_path(void);
@@ -3459,9 +3535,9 @@ int btrfs_writepages(struct address_space *mapping,
3459 struct writeback_control *wbc); 3535 struct writeback_control *wbc);
3460int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 3536int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
3461 struct btrfs_root *new_root, u64 new_dirid); 3537 struct btrfs_root *new_root, u64 new_dirid);
3462int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 3538int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
3463 size_t size, struct bio *bio, unsigned long bio_flags); 3539 size_t size, struct bio *bio,
3464 3540 unsigned long bio_flags);
3465int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 3541int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
3466int btrfs_readpage(struct file *file, struct page *page); 3542int btrfs_readpage(struct file *file, struct page *page);
3467void btrfs_evict_inode(struct inode *inode); 3543void btrfs_evict_inode(struct inode *inode);
@@ -3543,7 +3619,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
3543 3619
3544/* tree-defrag.c */ 3620/* tree-defrag.c */
3545int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, 3621int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
3546 struct btrfs_root *root, int cache_only); 3622 struct btrfs_root *root);
3547 3623
3548/* sysfs.c */ 3624/* sysfs.c */
3549int btrfs_init_sysfs(void); 3625int btrfs_init_sysfs(void);
@@ -3620,11 +3696,14 @@ __printf(5, 6)
3620void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, 3696void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
3621 unsigned int line, int errno, const char *fmt, ...); 3697 unsigned int line, int errno, const char *fmt, ...);
3622 3698
3699/*
3700 * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
3701 * will panic(). Otherwise we BUG() here.
3702 */
3623#define btrfs_panic(fs_info, errno, fmt, args...) \ 3703#define btrfs_panic(fs_info, errno, fmt, args...) \
3624do { \ 3704do { \
3625 struct btrfs_fs_info *_i = (fs_info); \ 3705 __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
3626 __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args); \ 3706 BUG(); \
3627 BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)); \
3628} while (0) 3707} while (0)
3629 3708
3630/* acl.c */ 3709/* acl.c */
@@ -3745,4 +3824,11 @@ static inline int is_fstree(u64 rootid)
3745 return 1; 3824 return 1;
3746 return 0; 3825 return 0;
3747} 3826}
3827
3828static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
3829{
3830 return signal_pending(current);
3831}
3832
3833
3748#endif 3834#endif
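Among the ctree.h additions above are btrfs_stripe_hash and btrfs_stripe_hash_table, which the raid56 code uses so that no two read/modify/write operations touch the same full stripe at once: the stripe's logical start hashes to one of 1 << BTRFS_STRIPE_HASH_TABLE_BITS buckets, and callers serialize on that bucket's lock and wait queue. A minimal userspace sketch of the idea follows; the hash function and the example geometry are assumptions, not the raid56.c code.

#include <stdint.h>
#include <stdio.h>

#define STRIPE_HASH_TABLE_BITS 11                 /* mirrors BTRFS_STRIPE_HASH_TABLE_BITS */
#define NR_BUCKETS (1UL << STRIPE_HASH_TABLE_BITS)

/* hypothetical mix: any function works as long as one full stripe always
 * maps to the same bucket, so two r/m/w ops on that stripe hit one lock */
static unsigned long stripe_hash(uint64_t full_stripe_start)
{
        return (unsigned long)((full_stripe_start * 0x9E3779B97F4A7C15ULL) >>
                               (64 - STRIPE_HASH_TABLE_BITS));
}

int main(void)
{
        uint64_t full_stripe_len = 2 * 64 * 1024;  /* e.g. two 64K data stripes, parity excluded */
        uint64_t logical = 5 * full_stripe_len + 12345;
        uint64_t start = logical - (logical % full_stripe_len);

        printf("logical %llu -> full stripe %llu -> bucket %lu of %lu\n",
               (unsigned long long)logical, (unsigned long long)start,
               stripe_hash(start), NR_BUCKETS);
        return 0;
}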
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 34836036f01b..0b278b117cbe 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -875,7 +875,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
875 struct btrfs_delayed_item *delayed_item) 875 struct btrfs_delayed_item *delayed_item)
876{ 876{
877 struct extent_buffer *leaf; 877 struct extent_buffer *leaf;
878 struct btrfs_item *item;
879 char *ptr; 878 char *ptr;
880 int ret; 879 int ret;
881 880
@@ -886,7 +885,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
886 885
887 leaf = path->nodes[0]; 886 leaf = path->nodes[0];
888 887
889 item = btrfs_item_nr(leaf, path->slots[0]);
890 ptr = btrfs_item_ptr(leaf, path->slots[0], char); 888 ptr = btrfs_item_ptr(leaf, path->slots[0], char);
891 889
892 write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr, 890 write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
@@ -1065,32 +1063,25 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
1065 } 1063 }
1066} 1064}
1067 1065
1068static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, 1066static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1069 struct btrfs_root *root, 1067 struct btrfs_root *root,
1070 struct btrfs_path *path, 1068 struct btrfs_path *path,
1071 struct btrfs_delayed_node *node) 1069 struct btrfs_delayed_node *node)
1072{ 1070{
1073 struct btrfs_key key; 1071 struct btrfs_key key;
1074 struct btrfs_inode_item *inode_item; 1072 struct btrfs_inode_item *inode_item;
1075 struct extent_buffer *leaf; 1073 struct extent_buffer *leaf;
1076 int ret; 1074 int ret;
1077 1075
1078 mutex_lock(&node->mutex);
1079 if (!node->inode_dirty) {
1080 mutex_unlock(&node->mutex);
1081 return 0;
1082 }
1083
1084 key.objectid = node->inode_id; 1076 key.objectid = node->inode_id;
1085 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 1077 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
1086 key.offset = 0; 1078 key.offset = 0;
1079
1087 ret = btrfs_lookup_inode(trans, root, path, &key, 1); 1080 ret = btrfs_lookup_inode(trans, root, path, &key, 1);
1088 if (ret > 0) { 1081 if (ret > 0) {
1089 btrfs_release_path(path); 1082 btrfs_release_path(path);
1090 mutex_unlock(&node->mutex);
1091 return -ENOENT; 1083 return -ENOENT;
1092 } else if (ret < 0) { 1084 } else if (ret < 0) {
1093 mutex_unlock(&node->mutex);
1094 return ret; 1085 return ret;
1095 } 1086 }
1096 1087
@@ -1105,11 +1096,47 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1105 1096
1106 btrfs_delayed_inode_release_metadata(root, node); 1097 btrfs_delayed_inode_release_metadata(root, node);
1107 btrfs_release_delayed_inode(node); 1098 btrfs_release_delayed_inode(node);
1108 mutex_unlock(&node->mutex);
1109 1099
1110 return 0; 1100 return 0;
1111} 1101}
1112 1102
1103static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1104 struct btrfs_root *root,
1105 struct btrfs_path *path,
1106 struct btrfs_delayed_node *node)
1107{
1108 int ret;
1109
1110 mutex_lock(&node->mutex);
1111 if (!node->inode_dirty) {
1112 mutex_unlock(&node->mutex);
1113 return 0;
1114 }
1115
1116 ret = __btrfs_update_delayed_inode(trans, root, path, node);
1117 mutex_unlock(&node->mutex);
1118 return ret;
1119}
1120
1121static inline int
1122__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1123 struct btrfs_path *path,
1124 struct btrfs_delayed_node *node)
1125{
1126 int ret;
1127
1128 ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1129 if (ret)
1130 return ret;
1131
1132 ret = btrfs_delete_delayed_items(trans, path, node->root, node);
1133 if (ret)
1134 return ret;
1135
1136 ret = btrfs_update_delayed_inode(trans, node->root, path, node);
1137 return ret;
1138}
1139
1113/* 1140/*
1114 * Called when committing the transaction. 1141 * Called when committing the transaction.
1115 * Returns 0 on success. 1142 * Returns 0 on success.
@@ -1119,7 +1146,6 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1119static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, 1146static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1120 struct btrfs_root *root, int nr) 1147 struct btrfs_root *root, int nr)
1121{ 1148{
1122 struct btrfs_root *curr_root = root;
1123 struct btrfs_delayed_root *delayed_root; 1149 struct btrfs_delayed_root *delayed_root;
1124 struct btrfs_delayed_node *curr_node, *prev_node; 1150 struct btrfs_delayed_node *curr_node, *prev_node;
1125 struct btrfs_path *path; 1151 struct btrfs_path *path;
@@ -1142,15 +1168,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1142 1168
1143 curr_node = btrfs_first_delayed_node(delayed_root); 1169 curr_node = btrfs_first_delayed_node(delayed_root);
1144 while (curr_node && (!count || (count && nr--))) { 1170 while (curr_node && (!count || (count && nr--))) {
1145 curr_root = curr_node->root; 1171 ret = __btrfs_commit_inode_delayed_items(trans, path,
1146 ret = btrfs_insert_delayed_items(trans, path, curr_root, 1172 curr_node);
1147 curr_node);
1148 if (!ret)
1149 ret = btrfs_delete_delayed_items(trans, path,
1150 curr_root, curr_node);
1151 if (!ret)
1152 ret = btrfs_update_delayed_inode(trans, curr_root,
1153 path, curr_node);
1154 if (ret) { 1173 if (ret) {
1155 btrfs_release_delayed_node(curr_node); 1174 btrfs_release_delayed_node(curr_node);
1156 curr_node = NULL; 1175 curr_node = NULL;
@@ -1183,51 +1202,93 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
1183 return __btrfs_run_delayed_items(trans, root, nr); 1202 return __btrfs_run_delayed_items(trans, root, nr);
1184} 1203}
1185 1204
1186static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, 1205int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1187 struct btrfs_delayed_node *node) 1206 struct inode *inode)
1188{ 1207{
1208 struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
1189 struct btrfs_path *path; 1209 struct btrfs_path *path;
1190 struct btrfs_block_rsv *block_rsv; 1210 struct btrfs_block_rsv *block_rsv;
1191 int ret; 1211 int ret;
1192 1212
1213 if (!delayed_node)
1214 return 0;
1215
1216 mutex_lock(&delayed_node->mutex);
1217 if (!delayed_node->count) {
1218 mutex_unlock(&delayed_node->mutex);
1219 btrfs_release_delayed_node(delayed_node);
1220 return 0;
1221 }
1222 mutex_unlock(&delayed_node->mutex);
1223
1193 path = btrfs_alloc_path(); 1224 path = btrfs_alloc_path();
1194 if (!path) 1225 if (!path)
1195 return -ENOMEM; 1226 return -ENOMEM;
1196 path->leave_spinning = 1; 1227 path->leave_spinning = 1;
1197 1228
1198 block_rsv = trans->block_rsv; 1229 block_rsv = trans->block_rsv;
1199 trans->block_rsv = &node->root->fs_info->delayed_block_rsv; 1230 trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
1200 1231
1201 ret = btrfs_insert_delayed_items(trans, path, node->root, node); 1232 ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
1202 if (!ret)
1203 ret = btrfs_delete_delayed_items(trans, path, node->root, node);
1204 if (!ret)
1205 ret = btrfs_update_delayed_inode(trans, node->root, path, node);
1206 btrfs_free_path(path);
1207 1233
1234 btrfs_release_delayed_node(delayed_node);
1235 btrfs_free_path(path);
1208 trans->block_rsv = block_rsv; 1236 trans->block_rsv = block_rsv;
1237
1209 return ret; 1238 return ret;
1210} 1239}
1211 1240
1212int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, 1241int btrfs_commit_inode_delayed_inode(struct inode *inode)
1213 struct inode *inode)
1214{ 1242{
1243 struct btrfs_trans_handle *trans;
1215 struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); 1244 struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
1245 struct btrfs_path *path;
1246 struct btrfs_block_rsv *block_rsv;
1216 int ret; 1247 int ret;
1217 1248
1218 if (!delayed_node) 1249 if (!delayed_node)
1219 return 0; 1250 return 0;
1220 1251
1221 mutex_lock(&delayed_node->mutex); 1252 mutex_lock(&delayed_node->mutex);
1222 if (!delayed_node->count) { 1253 if (!delayed_node->inode_dirty) {
1223 mutex_unlock(&delayed_node->mutex); 1254 mutex_unlock(&delayed_node->mutex);
1224 btrfs_release_delayed_node(delayed_node); 1255 btrfs_release_delayed_node(delayed_node);
1225 return 0; 1256 return 0;
1226 } 1257 }
1227 mutex_unlock(&delayed_node->mutex); 1258 mutex_unlock(&delayed_node->mutex);
1228 1259
1229 ret = __btrfs_commit_inode_delayed_items(trans, delayed_node); 1260 trans = btrfs_join_transaction(delayed_node->root);
1261 if (IS_ERR(trans)) {
1262 ret = PTR_ERR(trans);
1263 goto out;
1264 }
1265
1266 path = btrfs_alloc_path();
1267 if (!path) {
1268 ret = -ENOMEM;
1269 goto trans_out;
1270 }
1271 path->leave_spinning = 1;
1272
1273 block_rsv = trans->block_rsv;
1274 trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
1275
1276 mutex_lock(&delayed_node->mutex);
1277 if (delayed_node->inode_dirty)
1278 ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
1279 path, delayed_node);
1280 else
1281 ret = 0;
1282 mutex_unlock(&delayed_node->mutex);
1283
1284 btrfs_free_path(path);
1285 trans->block_rsv = block_rsv;
1286trans_out:
1287 btrfs_end_transaction(trans, delayed_node->root);
1288 btrfs_btree_balance_dirty(delayed_node->root);
1289out:
1230 btrfs_release_delayed_node(delayed_node); 1290 btrfs_release_delayed_node(delayed_node);
1291
1231 return ret; 1292 return ret;
1232} 1293}
1233 1294
@@ -1258,7 +1319,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1258 struct btrfs_root *root; 1319 struct btrfs_root *root;
1259 struct btrfs_block_rsv *block_rsv; 1320 struct btrfs_block_rsv *block_rsv;
1260 int need_requeue = 0; 1321 int need_requeue = 0;
1261 int ret;
1262 1322
1263 async_node = container_of(work, struct btrfs_async_delayed_node, work); 1323 async_node = container_of(work, struct btrfs_async_delayed_node, work);
1264 1324
@@ -1277,14 +1337,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1277 block_rsv = trans->block_rsv; 1337 block_rsv = trans->block_rsv;
1278 trans->block_rsv = &root->fs_info->delayed_block_rsv; 1338 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1279 1339
1280 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); 1340 __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
1281 if (!ret)
1282 ret = btrfs_delete_delayed_items(trans, path, root,
1283 delayed_node);
1284
1285 if (!ret)
1286 btrfs_update_delayed_inode(trans, root, path, delayed_node);
1287
1288 /* 1341 /*
1289 * Maybe new delayed items have been inserted, so we need requeue 1342 * Maybe new delayed items have been inserted, so we need requeue
1290 * the work. Besides that, we must dequeue the empty delayed nodes 1343 * the work. Besides that, we must dequeue the empty delayed nodes
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 4f808e1baeed..78b6ad0fc669 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -117,6 +117,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
117/* Used for evicting the inode. */ 117/* Used for evicting the inode. */
118void btrfs_remove_delayed_node(struct inode *inode); 118void btrfs_remove_delayed_node(struct inode *inode);
119void btrfs_kill_delayed_inode_items(struct inode *inode); 119void btrfs_kill_delayed_inode_items(struct inode *inode);
120int btrfs_commit_inode_delayed_inode(struct inode *inode);
120 121
121 122
122int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, 123int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ae9411773397..b7a0641ead77 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -23,6 +23,10 @@
23#include "delayed-ref.h" 23#include "delayed-ref.h"
24#include "transaction.h" 24#include "transaction.h"
25 25
26struct kmem_cache *btrfs_delayed_ref_head_cachep;
27struct kmem_cache *btrfs_delayed_tree_ref_cachep;
28struct kmem_cache *btrfs_delayed_data_ref_cachep;
29struct kmem_cache *btrfs_delayed_extent_op_cachep;
26/* 30/*
27 * delayed back reference update tracking. For subvolume trees 31 * delayed back reference update tracking. For subvolume trees
28 * we queue up extent allocations and backref maintenance for 32 * we queue up extent allocations and backref maintenance for
@@ -422,6 +426,14 @@ again:
422 return 1; 426 return 1;
423} 427}
424 428
429void btrfs_release_ref_cluster(struct list_head *cluster)
430{
431 struct list_head *pos, *q;
432
433 list_for_each_safe(pos, q, cluster)
434 list_del_init(pos);
435}
436
425/* 437/*
426 * helper function to update an extent delayed ref in the 438 * helper function to update an extent delayed ref in the
427 * rbtree. existing and update must both have the same 439 * rbtree. existing and update must both have the same
@@ -511,7 +523,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
511 ref->extent_op->flags_to_set; 523 ref->extent_op->flags_to_set;
512 existing_ref->extent_op->update_flags = 1; 524 existing_ref->extent_op->update_flags = 1;
513 } 525 }
514 kfree(ref->extent_op); 526 btrfs_free_delayed_extent_op(ref->extent_op);
515 } 527 }
516 } 528 }
517 /* 529 /*
@@ -592,7 +604,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
592 * we've updated the existing ref, free the newly 604 * we've updated the existing ref, free the newly
593 * allocated ref 605 * allocated ref
594 */ 606 */
595 kfree(head_ref); 607 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
596 } else { 608 } else {
597 delayed_refs->num_heads++; 609 delayed_refs->num_heads++;
598 delayed_refs->num_heads_ready++; 610 delayed_refs->num_heads_ready++;
@@ -653,7 +665,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
653 * we've updated the existing ref, free the newly 665 * we've updated the existing ref, free the newly
654 * allocated ref 666 * allocated ref
655 */ 667 */
656 kfree(full_ref); 668 kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
657 } else { 669 } else {
658 delayed_refs->num_entries++; 670 delayed_refs->num_entries++;
659 trans->delayed_ref_updates++; 671 trans->delayed_ref_updates++;
@@ -714,7 +726,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
714 * we've updated the existing ref, free the newly 726 * we've updated the existing ref, free the newly
715 * allocated ref 727 * allocated ref
716 */ 728 */
717 kfree(full_ref); 729 kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
718 } else { 730 } else {
719 delayed_refs->num_entries++; 731 delayed_refs->num_entries++;
720 trans->delayed_ref_updates++; 732 trans->delayed_ref_updates++;
@@ -738,13 +750,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
738 struct btrfs_delayed_ref_root *delayed_refs; 750 struct btrfs_delayed_ref_root *delayed_refs;
739 751
740 BUG_ON(extent_op && extent_op->is_data); 752 BUG_ON(extent_op && extent_op->is_data);
741 ref = kmalloc(sizeof(*ref), GFP_NOFS); 753 ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
742 if (!ref) 754 if (!ref)
743 return -ENOMEM; 755 return -ENOMEM;
744 756
745 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); 757 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
746 if (!head_ref) { 758 if (!head_ref) {
747 kfree(ref); 759 kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
748 return -ENOMEM; 760 return -ENOMEM;
749 } 761 }
750 762
@@ -786,13 +798,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
786 struct btrfs_delayed_ref_root *delayed_refs; 798 struct btrfs_delayed_ref_root *delayed_refs;
787 799
788 BUG_ON(extent_op && !extent_op->is_data); 800 BUG_ON(extent_op && !extent_op->is_data);
789 ref = kmalloc(sizeof(*ref), GFP_NOFS); 801 ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
790 if (!ref) 802 if (!ref)
791 return -ENOMEM; 803 return -ENOMEM;
792 804
793 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); 805 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
794 if (!head_ref) { 806 if (!head_ref) {
795 kfree(ref); 807 kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
796 return -ENOMEM; 808 return -ENOMEM;
797 } 809 }
798 810
@@ -826,7 +838,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
826 struct btrfs_delayed_ref_head *head_ref; 838 struct btrfs_delayed_ref_head *head_ref;
827 struct btrfs_delayed_ref_root *delayed_refs; 839 struct btrfs_delayed_ref_root *delayed_refs;
828 840
829 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); 841 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
830 if (!head_ref) 842 if (!head_ref)
831 return -ENOMEM; 843 return -ENOMEM;
832 844
@@ -860,3 +872,51 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
860 return btrfs_delayed_node_to_head(ref); 872 return btrfs_delayed_node_to_head(ref);
861 return NULL; 873 return NULL;
862} 874}
875
876void btrfs_delayed_ref_exit(void)
877{
878 if (btrfs_delayed_ref_head_cachep)
879 kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
880 if (btrfs_delayed_tree_ref_cachep)
881 kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
882 if (btrfs_delayed_data_ref_cachep)
883 kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
884 if (btrfs_delayed_extent_op_cachep)
885 kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
886}
887
888int btrfs_delayed_ref_init(void)
889{
890 btrfs_delayed_ref_head_cachep = kmem_cache_create(
891 "btrfs_delayed_ref_head",
892 sizeof(struct btrfs_delayed_ref_head), 0,
893 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
894 if (!btrfs_delayed_ref_head_cachep)
895 goto fail;
896
897 btrfs_delayed_tree_ref_cachep = kmem_cache_create(
898 "btrfs_delayed_tree_ref",
899 sizeof(struct btrfs_delayed_tree_ref), 0,
900 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
901 if (!btrfs_delayed_tree_ref_cachep)
902 goto fail;
903
904 btrfs_delayed_data_ref_cachep = kmem_cache_create(
905 "btrfs_delayed_data_ref",
906 sizeof(struct btrfs_delayed_data_ref), 0,
907 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
908 if (!btrfs_delayed_data_ref_cachep)
909 goto fail;
910
911 btrfs_delayed_extent_op_cachep = kmem_cache_create(
912 "btrfs_delayed_extent_op",
913 sizeof(struct btrfs_delayed_extent_op), 0,
914 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
915 if (!btrfs_delayed_extent_op_cachep)
916 goto fail;
917
918 return 0;
919fail:
920 btrfs_delayed_ref_exit();
921 return -ENOMEM;
922}
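
Note: the two functions added above give each delayed-ref structure its own slab cache, created once at module load and destroyed at unload, replacing the kmalloc/kfree calls patched out earlier in this file. A minimal sketch of the same pattern using the generic kmem_cache_* API (the struct and cache names here are made up for illustration, not the btrfs ones):

        #include <linux/module.h>
        #include <linux/slab.h>

        struct example_ref {
                u64 bytenr;
                u64 num_bytes;
        };

        static struct kmem_cache *example_ref_cachep;

        static int __init example_ref_init(void)
        {
                example_ref_cachep = kmem_cache_create("example_ref",
                                sizeof(struct example_ref), 0,
                                SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
                if (!example_ref_cachep)
                        return -ENOMEM;
                return 0;
        }

        /* hot path: exact-size objects handed out by the dedicated cache */
        static struct example_ref *example_ref_alloc(void)
        {
                return kmem_cache_alloc(example_ref_cachep, GFP_NOFS);
        }

        static void example_ref_free(struct example_ref *ref)
        {
                kmem_cache_free(example_ref_cachep, ref);
        }

        static void __exit example_ref_exit(void)
        {
                kmem_cache_destroy(example_ref_cachep);
        }

        module_init(example_ref_init);
        module_exit(example_ref_exit);
        MODULE_LICENSE("GPL");

Dedicated caches return exact-size objects, show up separately in /proc/slabinfo, and SLAB_RECLAIM_ACCOUNT lets the VM account them as reclaimable, which matters for something allocated as heavily as delayed refs.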
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index c9d703693df0..f75fcaf79aeb 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -132,6 +132,15 @@ struct btrfs_delayed_ref_root {
132 unsigned long num_heads_ready; 132 unsigned long num_heads_ready;
133 133
134 /* 134 /*
135 * bumped when someone is making progress on the delayed
136 * refs, so that other procs know they are just adding to
137	 * contention instead of helping
138 */
139 atomic_t procs_running_refs;
140 atomic_t ref_seq;
141 wait_queue_head_t wait;
142
143 /*
135 * set when the tree is flushing before a transaction commit, 144 * set when the tree is flushing before a transaction commit,
136 * used by the throttling code to decide if new updates need 145 * used by the throttling code to decide if new updates need
137 * to be run right away 146 * to be run right away
@@ -141,12 +150,47 @@ struct btrfs_delayed_ref_root {
141 u64 run_delayed_start; 150 u64 run_delayed_start;
142}; 151};
143 152
153extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
154extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
155extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
156extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
157
158int btrfs_delayed_ref_init(void);
159void btrfs_delayed_ref_exit(void);
160
161static inline struct btrfs_delayed_extent_op *
162btrfs_alloc_delayed_extent_op(void)
163{
164 return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS);
165}
166
167static inline void
168btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
169{
170 if (op)
171 kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
172}
173
144static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) 174static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
145{ 175{
146 WARN_ON(atomic_read(&ref->refs) == 0); 176 WARN_ON(atomic_read(&ref->refs) == 0);
147 if (atomic_dec_and_test(&ref->refs)) { 177 if (atomic_dec_and_test(&ref->refs)) {
148 WARN_ON(ref->in_tree); 178 WARN_ON(ref->in_tree);
149 kfree(ref); 179 switch (ref->type) {
180 case BTRFS_TREE_BLOCK_REF_KEY:
181 case BTRFS_SHARED_BLOCK_REF_KEY:
182 kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
183 break;
184 case BTRFS_EXTENT_DATA_REF_KEY:
185 case BTRFS_SHARED_DATA_REF_KEY:
186 kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
187 break;
188 case 0:
189 kmem_cache_free(btrfs_delayed_ref_head_cachep, ref);
190 break;
191 default:
192 BUG();
193 }
150 } 194 }
151} 195}
152 196
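
Note: the switch just above frees the object through the generic btrfs_delayed_ref_node pointer. That is only safe because every concrete ref type (head, tree ref, data ref) embeds the node as its first member, so the node address is the allocation address the old kfree(ref) relied on. A reduced sketch of that layout; the types, type codes and caches below are illustrative stand-ins:

        #include <linux/atomic.h>
        #include <linux/bug.h>
        #include <linux/slab.h>
        #include <linux/types.h>

        /* generic node embedded at offset 0 of every concrete ref type */
        struct ref_node {
                atomic_t refs;
                unsigned int type;
        };

        struct tree_ref {
                struct ref_node node;   /* must stay the first member */
                u64 root;
        };

        struct data_ref {
                struct ref_node node;   /* must stay the first member */
                u64 objectid;
        };

        static struct kmem_cache *tree_ref_cachep;      /* set up elsewhere */
        static struct kmem_cache *data_ref_cachep;      /* set up elsewhere */

        static void put_ref(struct ref_node *ref)
        {
                if (!atomic_dec_and_test(&ref->refs))
                        return;

                /*
                 * Because the node is the first member, the node pointer and
                 * the containing object share the same address, so it can be
                 * handed straight back to the cache that matches ref->type.
                 */
                switch (ref->type) {
                case 1: /* stands in for the tree-block ref keys */
                        kmem_cache_free(tree_ref_cachep, ref);
                        break;
                case 2: /* stands in for the data ref keys */
                        kmem_cache_free(data_ref_cachep, ref);
                        break;
                default:
                        BUG();
                }
        }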
@@ -176,8 +220,14 @@ struct btrfs_delayed_ref_head *
176btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); 220btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
177int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, 221int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
178 struct btrfs_delayed_ref_head *head); 222 struct btrfs_delayed_ref_head *head);
223static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
224{
225 mutex_unlock(&head->mutex);
226}
227
179int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 228int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
180 struct list_head *cluster, u64 search_start); 229 struct list_head *cluster, u64 search_start);
230void btrfs_release_ref_cluster(struct list_head *cluster);
181 231
182int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, 232int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
183 struct btrfs_delayed_ref_root *delayed_refs, 233 struct btrfs_delayed_ref_root *delayed_refs,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 66dbc8dbddf7..7ba7b3900cb8 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -465,7 +465,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
465 * flush all outstanding I/O and inode extent mappings before the 465 * flush all outstanding I/O and inode extent mappings before the
466 * copy operation is declared as being finished 466 * copy operation is declared as being finished
467 */ 467 */
468 btrfs_start_delalloc_inodes(root, 0); 468 ret = btrfs_start_delalloc_inodes(root, 0);
469 if (ret) {
470 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
471 return ret;
472 }
469 btrfs_wait_ordered_extents(root, 0); 473 btrfs_wait_ordered_extents(root, 0);
470 474
471 trans = btrfs_start_transaction(root, 0); 475 trans = btrfs_start_transaction(root, 0);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a8f652dc940b..02369a3c162e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,7 @@
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h" 48#include "dev-replace.h"
49#include "raid56.h"
49 50
50#ifdef CONFIG_X86 51#ifdef CONFIG_X86
51#include <asm/cpufeature.h> 52#include <asm/cpufeature.h>
@@ -56,7 +57,8 @@ static void end_workqueue_fn(struct btrfs_work *work);
56static void free_fs_root(struct btrfs_root *root); 57static void free_fs_root(struct btrfs_root *root);
57static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, 58static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
58 int read_only); 59 int read_only);
59static void btrfs_destroy_ordered_operations(struct btrfs_root *root); 60static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
61 struct btrfs_root *root);
60static void btrfs_destroy_ordered_extents(struct btrfs_root *root); 62static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
61static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, 63static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
62 struct btrfs_root *root); 64 struct btrfs_root *root);
@@ -420,7 +422,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
420static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) 422static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
421{ 423{
422 struct extent_io_tree *tree; 424 struct extent_io_tree *tree;
423 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 425 u64 start = page_offset(page);
424 u64 found_start; 426 u64 found_start;
425 struct extent_buffer *eb; 427 struct extent_buffer *eb;
426 428
@@ -639,8 +641,15 @@ err:
639 btree_readahead_hook(root, eb, eb->start, ret); 641 btree_readahead_hook(root, eb, eb->start, ret);
640 } 642 }
641 643
642 if (ret) 644 if (ret) {
645 /*
646 * our io error hook is going to dec the io pages
647 * again, we have to make sure it has something
648 * to decrement
649 */
650 atomic_inc(&eb->io_pages);
643 clear_extent_buffer_uptodate(eb); 651 clear_extent_buffer_uptodate(eb);
652 }
644 free_extent_buffer(eb); 653 free_extent_buffer(eb);
645out: 654out:
646 return ret; 655 return ret;
@@ -654,6 +663,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
654 eb = (struct extent_buffer *)page->private; 663 eb = (struct extent_buffer *)page->private;
655 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 664 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
656 eb->read_mirror = failed_mirror; 665 eb->read_mirror = failed_mirror;
666 atomic_dec(&eb->io_pages);
657 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 667 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
658 btree_readahead_hook(root, eb, eb->start, -EIO); 668 btree_readahead_hook(root, eb, eb->start, -EIO);
659 return -EIO; /* we fixed nothing */ 669 return -EIO; /* we fixed nothing */
@@ -670,17 +680,23 @@ static void end_workqueue_bio(struct bio *bio, int err)
670 end_io_wq->work.flags = 0; 680 end_io_wq->work.flags = 0;
671 681
672 if (bio->bi_rw & REQ_WRITE) { 682 if (bio->bi_rw & REQ_WRITE) {
673 if (end_io_wq->metadata == 1) 683 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
674 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 684 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
675 &end_io_wq->work); 685 &end_io_wq->work);
676 else if (end_io_wq->metadata == 2) 686 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
677 btrfs_queue_worker(&fs_info->endio_freespace_worker, 687 btrfs_queue_worker(&fs_info->endio_freespace_worker,
678 &end_io_wq->work); 688 &end_io_wq->work);
689 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
690 btrfs_queue_worker(&fs_info->endio_raid56_workers,
691 &end_io_wq->work);
679 else 692 else
680 btrfs_queue_worker(&fs_info->endio_write_workers, 693 btrfs_queue_worker(&fs_info->endio_write_workers,
681 &end_io_wq->work); 694 &end_io_wq->work);
682 } else { 695 } else {
683 if (end_io_wq->metadata) 696 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
697 btrfs_queue_worker(&fs_info->endio_raid56_workers,
698 &end_io_wq->work);
699 else if (end_io_wq->metadata)
684 btrfs_queue_worker(&fs_info->endio_meta_workers, 700 btrfs_queue_worker(&fs_info->endio_meta_workers,
685 &end_io_wq->work); 701 &end_io_wq->work);
686 else 702 else
@@ -695,6 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
695 * 0 - if data 711 * 0 - if data
696 * 1 - if normal metadata 712 * 1 - if normal metadata
697 * 2 - if writing to the free space cache area 713 * 2 - if writing to the free space cache area
714 * 3 - raid parity work
698 */ 715 */
699int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 716int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
700 int metadata) 717 int metadata)
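
Note: end_workqueue_bio now dispatches on the named BTRFS_WQ_ENDIO_* values (the enum is added to disk-io.h further down) instead of bare 0/1/2, with a new case routing raid56 parity completions to endio_raid56_workers. A generic sketch of the same routing, written against the stock workqueue API rather than btrfs's own btrfs_queue_worker pools:

        #include <linux/workqueue.h>

        enum endio_type {
                ENDIO_DATA = 0,
                ENDIO_METADATA,
                ENDIO_FREE_SPACE,
                ENDIO_RAID56,
        };

        struct endio_work {
                struct work_struct work;
                enum endio_type type;
        };

        /* assumed to exist: one queue per completion class */
        static struct workqueue_struct *data_wq, *meta_wq, *free_space_wq, *raid56_wq;

        static void queue_endio(struct endio_work *w)
        {
                /* route the completion to the pool that matches its class */
                switch (w->type) {
                case ENDIO_RAID56:
                        queue_work(raid56_wq, &w->work);
                        break;
                case ENDIO_FREE_SPACE:
                        queue_work(free_space_wq, &w->work);
                        break;
                case ENDIO_METADATA:
                        queue_work(meta_wq, &w->work);
                        break;
                default:
                        queue_work(data_wq, &w->work);
                        break;
                }
        }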
@@ -946,18 +963,20 @@ static int btree_writepages(struct address_space *mapping,
946 struct writeback_control *wbc) 963 struct writeback_control *wbc)
947{ 964{
948 struct extent_io_tree *tree; 965 struct extent_io_tree *tree;
966 struct btrfs_fs_info *fs_info;
967 int ret;
968
949 tree = &BTRFS_I(mapping->host)->io_tree; 969 tree = &BTRFS_I(mapping->host)->io_tree;
950 if (wbc->sync_mode == WB_SYNC_NONE) { 970 if (wbc->sync_mode == WB_SYNC_NONE) {
951 struct btrfs_root *root = BTRFS_I(mapping->host)->root;
952 u64 num_dirty;
953 unsigned long thresh = 32 * 1024 * 1024;
954 971
955 if (wbc->for_kupdate) 972 if (wbc->for_kupdate)
956 return 0; 973 return 0;
957 974
975 fs_info = BTRFS_I(mapping->host)->root->fs_info;
958 /* this is a bit racy, but that's ok */ 976 /* this is a bit racy, but that's ok */
959 num_dirty = root->fs_info->dirty_metadata_bytes; 977 ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
960 if (num_dirty < thresh) 978 BTRFS_DIRTY_METADATA_THRESH);
979 if (ret < 0)
961 return 0; 980 return 0;
962 } 981 }
963 return btree_write_cache_pages(mapping, wbc); 982 return btree_write_cache_pages(mapping, wbc);
@@ -1125,24 +1144,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
1125void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1144void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1126 struct extent_buffer *buf) 1145 struct extent_buffer *buf)
1127{ 1146{
1147 struct btrfs_fs_info *fs_info = root->fs_info;
1148
1128 if (btrfs_header_generation(buf) == 1149 if (btrfs_header_generation(buf) ==
1129 root->fs_info->running_transaction->transid) { 1150 fs_info->running_transaction->transid) {
1130 btrfs_assert_tree_locked(buf); 1151 btrfs_assert_tree_locked(buf);
1131 1152
1132 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { 1153 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
1133 spin_lock(&root->fs_info->delalloc_lock); 1154 __percpu_counter_add(&fs_info->dirty_metadata_bytes,
1134 if (root->fs_info->dirty_metadata_bytes >= buf->len) 1155 -buf->len,
1135 root->fs_info->dirty_metadata_bytes -= buf->len; 1156 fs_info->dirty_metadata_batch);
1136 else {
1137 spin_unlock(&root->fs_info->delalloc_lock);
1138 btrfs_panic(root->fs_info, -EOVERFLOW,
1139 "Can't clear %lu bytes from "
1140 " dirty_mdatadata_bytes (%llu)",
1141 buf->len,
1142 root->fs_info->dirty_metadata_bytes);
1143 }
1144 spin_unlock(&root->fs_info->delalloc_lock);
1145
1146 /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1157 /* ugh, clear_extent_buffer_dirty needs to lock the page */
1147 btrfs_set_lock_blocking(buf); 1158 btrfs_set_lock_blocking(buf);
1148 clear_extent_buffer_dirty(buf); 1159 clear_extent_buffer_dirty(buf);
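
Note: dirty_metadata_bytes becomes a percpu_counter in this series. btrfs_mark_buffer_dirty and clean_tree_block fold deltas in with __percpu_counter_add using a per-fs batch, and the writeback paths test it with percpu_counter_compare instead of reading a u64 under delalloc_lock (which is why the old spin_lock/btrfs_panic bookkeeping above disappears). A minimal sketch of that lifecycle, assuming the 3.9-era percpu_counter API used in these hunks; the names and threshold are illustrative:

        #include <linux/cpumask.h>
        #include <linux/log2.h>
        #include <linux/mm.h>
        #include <linux/percpu_counter.h>

        #define EX_DIRTY_THRESH         (32 * 1024 * 1024)

        static struct percpu_counter dirty_bytes;
        static s32 dirty_batch;

        static int dirty_counter_init(void)
        {
                int ret;

                ret = percpu_counter_init(&dirty_bytes, 0);
                if (ret)
                        return ret;
                /* bigger batch on bigger machines, as open_ctree does below */
                dirty_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids));
                return 0;
        }

        /* called when a buffer is dirtied or cleaned; len may be negative */
        static void dirty_counter_add(s64 len)
        {
                __percpu_counter_add(&dirty_bytes, len, dirty_batch);
        }

        /* cheap check: only sums all CPUs when the count is near the threshold */
        static bool dirty_over_thresh(void)
        {
                return percpu_counter_compare(&dirty_bytes, EX_DIRTY_THRESH) > 0;
        }

        static void dirty_counter_destroy(void)
        {
                percpu_counter_destroy(&dirty_bytes);
        }

Each CPU accumulates up to the batch locally before folding into the shared count, so the hot paths stop bouncing a spinlock cacheline, at the cost of the count being approximate between folds.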
@@ -1178,9 +1189,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1178 1189
1179 INIT_LIST_HEAD(&root->dirty_list); 1190 INIT_LIST_HEAD(&root->dirty_list);
1180 INIT_LIST_HEAD(&root->root_list); 1191 INIT_LIST_HEAD(&root->root_list);
1192 INIT_LIST_HEAD(&root->logged_list[0]);
1193 INIT_LIST_HEAD(&root->logged_list[1]);
1181 spin_lock_init(&root->orphan_lock); 1194 spin_lock_init(&root->orphan_lock);
1182 spin_lock_init(&root->inode_lock); 1195 spin_lock_init(&root->inode_lock);
1183 spin_lock_init(&root->accounting_lock); 1196 spin_lock_init(&root->accounting_lock);
1197 spin_lock_init(&root->log_extents_lock[0]);
1198 spin_lock_init(&root->log_extents_lock[1]);
1184 mutex_init(&root->objectid_mutex); 1199 mutex_init(&root->objectid_mutex);
1185 mutex_init(&root->log_mutex); 1200 mutex_init(&root->log_mutex);
1186 init_waitqueue_head(&root->log_writer_wait); 1201 init_waitqueue_head(&root->log_writer_wait);
@@ -2004,10 +2019,24 @@ int open_ctree(struct super_block *sb,
2004 goto fail_srcu; 2019 goto fail_srcu;
2005 } 2020 }
2006 2021
2022 ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
2023 if (ret) {
2024 err = ret;
2025 goto fail_bdi;
2026 }
2027 fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
2028 (1 + ilog2(nr_cpu_ids));
2029
2030 ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
2031 if (ret) {
2032 err = ret;
2033 goto fail_dirty_metadata_bytes;
2034 }
2035
2007 fs_info->btree_inode = new_inode(sb); 2036 fs_info->btree_inode = new_inode(sb);
2008 if (!fs_info->btree_inode) { 2037 if (!fs_info->btree_inode) {
2009 err = -ENOMEM; 2038 err = -ENOMEM;
2010 goto fail_bdi; 2039 goto fail_delalloc_bytes;
2011 } 2040 }
2012 2041
2013 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 2042 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2017,7 +2046,6 @@ int open_ctree(struct super_block *sb,
2017 INIT_LIST_HEAD(&fs_info->dead_roots); 2046 INIT_LIST_HEAD(&fs_info->dead_roots);
2018 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2047 INIT_LIST_HEAD(&fs_info->delayed_iputs);
2019 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 2048 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
2020 INIT_LIST_HEAD(&fs_info->ordered_operations);
2021 INIT_LIST_HEAD(&fs_info->caching_block_groups); 2049 INIT_LIST_HEAD(&fs_info->caching_block_groups);
2022 spin_lock_init(&fs_info->delalloc_lock); 2050 spin_lock_init(&fs_info->delalloc_lock);
2023 spin_lock_init(&fs_info->trans_lock); 2051 spin_lock_init(&fs_info->trans_lock);
@@ -2028,6 +2056,7 @@ int open_ctree(struct super_block *sb,
2028 spin_lock_init(&fs_info->tree_mod_seq_lock); 2056 spin_lock_init(&fs_info->tree_mod_seq_lock);
2029 rwlock_init(&fs_info->tree_mod_log_lock); 2057 rwlock_init(&fs_info->tree_mod_log_lock);
2030 mutex_init(&fs_info->reloc_mutex); 2058 mutex_init(&fs_info->reloc_mutex);
2059 seqlock_init(&fs_info->profiles_lock);
2031 2060
2032 init_completion(&fs_info->kobj_unregister); 2061 init_completion(&fs_info->kobj_unregister);
2033 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 2062 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2126,6 +2155,7 @@ int open_ctree(struct super_block *sb,
2126 2155
2127 spin_lock_init(&fs_info->block_group_cache_lock); 2156 spin_lock_init(&fs_info->block_group_cache_lock);
2128 fs_info->block_group_cache_tree = RB_ROOT; 2157 fs_info->block_group_cache_tree = RB_ROOT;
2158 fs_info->first_logical_byte = (u64)-1;
2129 2159
2130 extent_io_tree_init(&fs_info->freed_extents[0], 2160 extent_io_tree_init(&fs_info->freed_extents[0],
2131 fs_info->btree_inode->i_mapping); 2161 fs_info->btree_inode->i_mapping);
@@ -2165,6 +2195,12 @@ int open_ctree(struct super_block *sb,
2165 init_waitqueue_head(&fs_info->transaction_blocked_wait); 2195 init_waitqueue_head(&fs_info->transaction_blocked_wait);
2166 init_waitqueue_head(&fs_info->async_submit_wait); 2196 init_waitqueue_head(&fs_info->async_submit_wait);
2167 2197
2198 ret = btrfs_alloc_stripe_hash_table(fs_info);
2199 if (ret) {
2200 err = ret;
2201 goto fail_alloc;
2202 }
2203
2168 __setup_root(4096, 4096, 4096, 4096, tree_root, 2204 __setup_root(4096, 4096, 4096, 4096, tree_root,
2169 fs_info, BTRFS_ROOT_TREE_OBJECTID); 2205 fs_info, BTRFS_ROOT_TREE_OBJECTID);
2170 2206
@@ -2187,7 +2223,8 @@ int open_ctree(struct super_block *sb,
2187 goto fail_alloc; 2223 goto fail_alloc;
2188 2224
2189 /* check FS state, whether FS is broken. */ 2225 /* check FS state, whether FS is broken. */
2190 fs_info->fs_state |= btrfs_super_flags(disk_super); 2226 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
2227 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
2191 2228
2192 ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); 2229 ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
2193 if (ret) { 2230 if (ret) {
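
Note: fs_state stops mirroring the on-disk superblock flags directly. open_ctree translates BTRFS_SUPER_FLAG_ERROR into a BTRFS_FS_STATE_ERROR bit, and later checks (close_ctree further down) use test_bit. The atomic bitops pattern in isolation, with an illustrative bit name:

        #include <linux/bitops.h>
        #include <linux/types.h>

        #define EX_STATE_ERROR  0       /* bit index, stands in for BTRFS_FS_STATE_ERROR */

        static unsigned long fs_state;

        static void note_super_error(u64 super_flags, u64 error_flag)
        {
                /* translate the on-disk flag into an in-memory state bit */
                if (super_flags & error_flag)
                        set_bit(EX_STATE_ERROR, &fs_state);
        }

        static bool fs_errored(void)
        {
                return test_bit(EX_STATE_ERROR, &fs_state);
        }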
@@ -2261,6 +2298,8 @@ int open_ctree(struct super_block *sb,
2261 leafsize = btrfs_super_leafsize(disk_super); 2298 leafsize = btrfs_super_leafsize(disk_super);
2262 sectorsize = btrfs_super_sectorsize(disk_super); 2299 sectorsize = btrfs_super_sectorsize(disk_super);
2263 stripesize = btrfs_super_stripesize(disk_super); 2300 stripesize = btrfs_super_stripesize(disk_super);
2301 fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
2302 fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
2264 2303
2265 /* 2304 /*
2266 * mixed block groups end up with duplicate but slightly offset 2305 * mixed block groups end up with duplicate but slightly offset
@@ -2332,6 +2371,12 @@ int open_ctree(struct super_block *sb,
2332 btrfs_init_workers(&fs_info->endio_meta_write_workers, 2371 btrfs_init_workers(&fs_info->endio_meta_write_workers,
2333 "endio-meta-write", fs_info->thread_pool_size, 2372 "endio-meta-write", fs_info->thread_pool_size,
2334 &fs_info->generic_worker); 2373 &fs_info->generic_worker);
2374 btrfs_init_workers(&fs_info->endio_raid56_workers,
2375 "endio-raid56", fs_info->thread_pool_size,
2376 &fs_info->generic_worker);
2377 btrfs_init_workers(&fs_info->rmw_workers,
2378 "rmw", fs_info->thread_pool_size,
2379 &fs_info->generic_worker);
2335 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 2380 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
2336 fs_info->thread_pool_size, 2381 fs_info->thread_pool_size,
2337 &fs_info->generic_worker); 2382 &fs_info->generic_worker);
@@ -2350,6 +2395,8 @@ int open_ctree(struct super_block *sb,
2350 */ 2395 */
2351 fs_info->endio_workers.idle_thresh = 4; 2396 fs_info->endio_workers.idle_thresh = 4;
2352 fs_info->endio_meta_workers.idle_thresh = 4; 2397 fs_info->endio_meta_workers.idle_thresh = 4;
2398 fs_info->endio_raid56_workers.idle_thresh = 4;
2399 fs_info->rmw_workers.idle_thresh = 2;
2353 2400
2354 fs_info->endio_write_workers.idle_thresh = 2; 2401 fs_info->endio_write_workers.idle_thresh = 2;
2355 fs_info->endio_meta_write_workers.idle_thresh = 2; 2402 fs_info->endio_meta_write_workers.idle_thresh = 2;
@@ -2366,6 +2413,8 @@ int open_ctree(struct super_block *sb,
2366 ret |= btrfs_start_workers(&fs_info->fixup_workers); 2413 ret |= btrfs_start_workers(&fs_info->fixup_workers);
2367 ret |= btrfs_start_workers(&fs_info->endio_workers); 2414 ret |= btrfs_start_workers(&fs_info->endio_workers);
2368 ret |= btrfs_start_workers(&fs_info->endio_meta_workers); 2415 ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
2416 ret |= btrfs_start_workers(&fs_info->rmw_workers);
2417 ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
2369 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); 2418 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
2370 ret |= btrfs_start_workers(&fs_info->endio_write_workers); 2419 ret |= btrfs_start_workers(&fs_info->endio_write_workers);
2371 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); 2420 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
@@ -2390,8 +2439,7 @@ int open_ctree(struct super_block *sb,
2390 sb->s_blocksize = sectorsize; 2439 sb->s_blocksize = sectorsize;
2391 sb->s_blocksize_bits = blksize_bits(sectorsize); 2440 sb->s_blocksize_bits = blksize_bits(sectorsize);
2392 2441
2393 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, 2442 if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) {
2394 sizeof(disk_super->magic))) {
2395 printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); 2443 printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
2396 goto fail_sb_buffer; 2444 goto fail_sb_buffer;
2397 } 2445 }
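
Note: the superblock magic is no longer treated as a string and compared with strncmp; it is an on-disk little-endian u64, so the constant is converted once with cpu_to_le64 and compared directly, here and again in btrfs_read_dev_super below. The idea in isolation (the struct name is illustrative; the constant is the value btrfs defines for the ascii "_BHRfS_M"):

        #include <asm/byteorder.h>
        #include <linux/types.h>

        #define EX_MAGIC 0x4D5F53665248425FULL  /* ascii "_BHRfS_M", no NUL */

        struct ex_super {
                __le64 magic;           /* stored on disk in little-endian */
        };

        static bool super_magic_ok(const struct ex_super *sb)
        {
                /* convert the CPU constant once; the on-disk field is already __le64 */
                return sb->magic == cpu_to_le64(EX_MAGIC);
        }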
@@ -2694,13 +2742,13 @@ fail_cleaner:
2694 * kthreads 2742 * kthreads
2695 */ 2743 */
2696 filemap_write_and_wait(fs_info->btree_inode->i_mapping); 2744 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
2697 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2698 2745
2699fail_block_groups: 2746fail_block_groups:
2700 btrfs_free_block_groups(fs_info); 2747 btrfs_free_block_groups(fs_info);
2701 2748
2702fail_tree_roots: 2749fail_tree_roots:
2703 free_root_pointers(fs_info, 1); 2750 free_root_pointers(fs_info, 1);
2751 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2704 2752
2705fail_sb_buffer: 2753fail_sb_buffer:
2706 btrfs_stop_workers(&fs_info->generic_worker); 2754 btrfs_stop_workers(&fs_info->generic_worker);
@@ -2710,6 +2758,8 @@ fail_sb_buffer:
2710 btrfs_stop_workers(&fs_info->workers); 2758 btrfs_stop_workers(&fs_info->workers);
2711 btrfs_stop_workers(&fs_info->endio_workers); 2759 btrfs_stop_workers(&fs_info->endio_workers);
2712 btrfs_stop_workers(&fs_info->endio_meta_workers); 2760 btrfs_stop_workers(&fs_info->endio_meta_workers);
2761 btrfs_stop_workers(&fs_info->endio_raid56_workers);
2762 btrfs_stop_workers(&fs_info->rmw_workers);
2713 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2763 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2714 btrfs_stop_workers(&fs_info->endio_write_workers); 2764 btrfs_stop_workers(&fs_info->endio_write_workers);
2715 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2765 btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -2721,13 +2771,17 @@ fail_alloc:
2721fail_iput: 2771fail_iput:
2722 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2772 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2723 2773
2724 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2725 iput(fs_info->btree_inode); 2774 iput(fs_info->btree_inode);
2775fail_delalloc_bytes:
2776 percpu_counter_destroy(&fs_info->delalloc_bytes);
2777fail_dirty_metadata_bytes:
2778 percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
2726fail_bdi: 2779fail_bdi:
2727 bdi_destroy(&fs_info->bdi); 2780 bdi_destroy(&fs_info->bdi);
2728fail_srcu: 2781fail_srcu:
2729 cleanup_srcu_struct(&fs_info->subvol_srcu); 2782 cleanup_srcu_struct(&fs_info->subvol_srcu);
2730fail: 2783fail:
2784 btrfs_free_stripe_hash_table(fs_info);
2731 btrfs_close_devices(fs_info->fs_devices); 2785 btrfs_close_devices(fs_info->fs_devices);
2732 return err; 2786 return err;
2733 2787
@@ -2795,8 +2849,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
2795 2849
2796 super = (struct btrfs_super_block *)bh->b_data; 2850 super = (struct btrfs_super_block *)bh->b_data;
2797 if (btrfs_super_bytenr(super) != bytenr || 2851 if (btrfs_super_bytenr(super) != bytenr ||
2798 strncmp((char *)(&super->magic), BTRFS_MAGIC, 2852 super->magic != cpu_to_le64(BTRFS_MAGIC)) {
2799 sizeof(super->magic))) {
2800 brelse(bh); 2853 brelse(bh);
2801 continue; 2854 continue;
2802 } 2855 }
@@ -3076,11 +3129,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
3076 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) 3129 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
3077 == 0))) 3130 == 0)))
3078 num_tolerated_disk_barrier_failures = 0; 3131 num_tolerated_disk_barrier_failures = 0;
3079 else if (num_tolerated_disk_barrier_failures > 1 3132 else if (num_tolerated_disk_barrier_failures > 1) {
3080 && 3133 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3081 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3134 BTRFS_BLOCK_GROUP_RAID5 |
3082 BTRFS_BLOCK_GROUP_RAID10))) 3135 BTRFS_BLOCK_GROUP_RAID10)) {
3083 num_tolerated_disk_barrier_failures = 1; 3136 num_tolerated_disk_barrier_failures = 1;
3137 } else if (flags &
3139 BTRFS_BLOCK_GROUP_RAID6) {
3139 num_tolerated_disk_barrier_failures = 2;
3140 }
3141 }
3084 } 3142 }
3085 } 3143 }
3086 up_read(&sinfo->groups_sem); 3144 up_read(&sinfo->groups_sem);
@@ -3195,6 +3253,11 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
3195 if (btrfs_root_refs(&root->root_item) == 0) 3253 if (btrfs_root_refs(&root->root_item) == 0)
3196 synchronize_srcu(&fs_info->subvol_srcu); 3254 synchronize_srcu(&fs_info->subvol_srcu);
3197 3255
3256 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
3257 btrfs_free_log(NULL, root);
3258 btrfs_free_log_root_tree(NULL, fs_info);
3259 }
3260
3198 __btrfs_remove_free_space_cache(root->free_ino_pinned); 3261 __btrfs_remove_free_space_cache(root->free_ino_pinned);
3199 __btrfs_remove_free_space_cache(root->free_ino_ctl); 3262 __btrfs_remove_free_space_cache(root->free_ino_ctl);
3200 free_fs_root(root); 3263 free_fs_root(root);
@@ -3339,7 +3402,7 @@ int close_ctree(struct btrfs_root *root)
3339 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 3402 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
3340 } 3403 }
3341 3404
3342 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 3405 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3343 btrfs_error_commit_super(root); 3406 btrfs_error_commit_super(root);
3344 3407
3345 btrfs_put_block_group_cache(fs_info); 3408 btrfs_put_block_group_cache(fs_info);
@@ -3352,9 +3415,9 @@ int close_ctree(struct btrfs_root *root)
3352 3415
3353 btrfs_free_qgroup_config(root->fs_info); 3416 btrfs_free_qgroup_config(root->fs_info);
3354 3417
3355 if (fs_info->delalloc_bytes) { 3418 if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
3356 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 3419 printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n",
3357 (unsigned long long)fs_info->delalloc_bytes); 3420 percpu_counter_sum(&fs_info->delalloc_bytes));
3358 } 3421 }
3359 3422
3360 free_extent_buffer(fs_info->extent_root->node); 3423 free_extent_buffer(fs_info->extent_root->node);
@@ -3384,6 +3447,8 @@ int close_ctree(struct btrfs_root *root)
3384 btrfs_stop_workers(&fs_info->workers); 3447 btrfs_stop_workers(&fs_info->workers);
3385 btrfs_stop_workers(&fs_info->endio_workers); 3448 btrfs_stop_workers(&fs_info->endio_workers);
3386 btrfs_stop_workers(&fs_info->endio_meta_workers); 3449 btrfs_stop_workers(&fs_info->endio_meta_workers);
3450 btrfs_stop_workers(&fs_info->endio_raid56_workers);
3451 btrfs_stop_workers(&fs_info->rmw_workers);
3387 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 3452 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
3388 btrfs_stop_workers(&fs_info->endio_write_workers); 3453 btrfs_stop_workers(&fs_info->endio_write_workers);
3389 btrfs_stop_workers(&fs_info->endio_freespace_worker); 3454 btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -3401,9 +3466,13 @@ int close_ctree(struct btrfs_root *root)
3401 btrfs_close_devices(fs_info->fs_devices); 3466 btrfs_close_devices(fs_info->fs_devices);
3402 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3467 btrfs_mapping_tree_free(&fs_info->mapping_tree);
3403 3468
3469 percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
3470 percpu_counter_destroy(&fs_info->delalloc_bytes);
3404 bdi_destroy(&fs_info->bdi); 3471 bdi_destroy(&fs_info->bdi);
3405 cleanup_srcu_struct(&fs_info->subvol_srcu); 3472 cleanup_srcu_struct(&fs_info->subvol_srcu);
3406 3473
3474 btrfs_free_stripe_hash_table(fs_info);
3475
3407 return 0; 3476 return 0;
3408} 3477}
3409 3478
@@ -3443,11 +3512,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3443 (unsigned long long)transid, 3512 (unsigned long long)transid,
3444 (unsigned long long)root->fs_info->generation); 3513 (unsigned long long)root->fs_info->generation);
3445 was_dirty = set_extent_buffer_dirty(buf); 3514 was_dirty = set_extent_buffer_dirty(buf);
3446 if (!was_dirty) { 3515 if (!was_dirty)
3447 spin_lock(&root->fs_info->delalloc_lock); 3516 __percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
3448 root->fs_info->dirty_metadata_bytes += buf->len; 3517 buf->len,
3449 spin_unlock(&root->fs_info->delalloc_lock); 3518 root->fs_info->dirty_metadata_batch);
3450 }
3451} 3519}
3452 3520
3453static void __btrfs_btree_balance_dirty(struct btrfs_root *root, 3521static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
@@ -3457,8 +3525,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3457 * looks as though older kernels can get into trouble with 3525 * looks as though older kernels can get into trouble with
3458 * this code, they end up stuck in balance_dirty_pages forever 3526 * this code, they end up stuck in balance_dirty_pages forever
3459 */ 3527 */
3460 u64 num_dirty; 3528 int ret;
3461 unsigned long thresh = 32 * 1024 * 1024;
3462 3529
3463 if (current->flags & PF_MEMALLOC) 3530 if (current->flags & PF_MEMALLOC)
3464 return; 3531 return;
@@ -3466,9 +3533,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3466 if (flush_delayed) 3533 if (flush_delayed)
3467 btrfs_balance_delayed_items(root); 3534 btrfs_balance_delayed_items(root);
3468 3535
3469 num_dirty = root->fs_info->dirty_metadata_bytes; 3536 ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
3470 3537 BTRFS_DIRTY_METADATA_THRESH);
3471 if (num_dirty > thresh) { 3538 if (ret > 0) {
3472 balance_dirty_pages_ratelimited( 3539 balance_dirty_pages_ratelimited(
3473 root->fs_info->btree_inode->i_mapping); 3540 root->fs_info->btree_inode->i_mapping);
3474 } 3541 }
@@ -3518,7 +3585,8 @@ void btrfs_error_commit_super(struct btrfs_root *root)
3518 btrfs_cleanup_transaction(root); 3585 btrfs_cleanup_transaction(root);
3519} 3586}
3520 3587
3521static void btrfs_destroy_ordered_operations(struct btrfs_root *root) 3588static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3589 struct btrfs_root *root)
3522{ 3590{
3523 struct btrfs_inode *btrfs_inode; 3591 struct btrfs_inode *btrfs_inode;
3524 struct list_head splice; 3592 struct list_head splice;
@@ -3528,7 +3596,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
3528 mutex_lock(&root->fs_info->ordered_operations_mutex); 3596 mutex_lock(&root->fs_info->ordered_operations_mutex);
3529 spin_lock(&root->fs_info->ordered_extent_lock); 3597 spin_lock(&root->fs_info->ordered_extent_lock);
3530 3598
3531 list_splice_init(&root->fs_info->ordered_operations, &splice); 3599 list_splice_init(&t->ordered_operations, &splice);
3532 while (!list_empty(&splice)) { 3600 while (!list_empty(&splice)) {
3533 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 3601 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
3534 ordered_operations); 3602 ordered_operations);
@@ -3544,35 +3612,16 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
3544 3612
3545static void btrfs_destroy_ordered_extents(struct btrfs_root *root) 3613static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
3546{ 3614{
3547 struct list_head splice;
3548 struct btrfs_ordered_extent *ordered; 3615 struct btrfs_ordered_extent *ordered;
3549 struct inode *inode;
3550
3551 INIT_LIST_HEAD(&splice);
3552 3616
3553 spin_lock(&root->fs_info->ordered_extent_lock); 3617 spin_lock(&root->fs_info->ordered_extent_lock);
3554 3618 /*
3555 list_splice_init(&root->fs_info->ordered_extents, &splice); 3619 * This will just short circuit the ordered completion stuff which will
3556 while (!list_empty(&splice)) { 3620 * make sure the ordered extent gets properly cleaned up.
3557 ordered = list_entry(splice.next, struct btrfs_ordered_extent, 3621 */
3558 root_extent_list); 3622 list_for_each_entry(ordered, &root->fs_info->ordered_extents,
3559 3623 root_extent_list)
3560 list_del_init(&ordered->root_extent_list); 3624 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
3561 atomic_inc(&ordered->refs);
3562
3563 /* the inode may be getting freed (in sys_unlink path). */
3564 inode = igrab(ordered->inode);
3565
3566 spin_unlock(&root->fs_info->ordered_extent_lock);
3567 if (inode)
3568 iput(inode);
3569
3570 atomic_set(&ordered->refs, 1);
3571 btrfs_put_ordered_extent(ordered);
3572
3573 spin_lock(&root->fs_info->ordered_extent_lock);
3574 }
3575
3576 spin_unlock(&root->fs_info->ordered_extent_lock); 3625 spin_unlock(&root->fs_info->ordered_extent_lock);
3577} 3626}
3578 3627
@@ -3594,11 +3643,11 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3594 } 3643 }
3595 3644
3596 while ((node = rb_first(&delayed_refs->root)) != NULL) { 3645 while ((node = rb_first(&delayed_refs->root)) != NULL) {
3597 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 3646 struct btrfs_delayed_ref_head *head = NULL;
3598 3647
3648 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3599 atomic_set(&ref->refs, 1); 3649 atomic_set(&ref->refs, 1);
3600 if (btrfs_delayed_ref_is_head(ref)) { 3650 if (btrfs_delayed_ref_is_head(ref)) {
3601 struct btrfs_delayed_ref_head *head;
3602 3651
3603 head = btrfs_delayed_node_to_head(ref); 3652 head = btrfs_delayed_node_to_head(ref);
3604 if (!mutex_trylock(&head->mutex)) { 3653 if (!mutex_trylock(&head->mutex)) {
@@ -3614,16 +3663,18 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3614 continue; 3663 continue;
3615 } 3664 }
3616 3665
3617 kfree(head->extent_op); 3666 btrfs_free_delayed_extent_op(head->extent_op);
3618 delayed_refs->num_heads--; 3667 delayed_refs->num_heads--;
3619 if (list_empty(&head->cluster)) 3668 if (list_empty(&head->cluster))
3620 delayed_refs->num_heads_ready--; 3669 delayed_refs->num_heads_ready--;
3621 list_del_init(&head->cluster); 3670 list_del_init(&head->cluster);
3622 } 3671 }
3672
3623 ref->in_tree = 0; 3673 ref->in_tree = 0;
3624 rb_erase(&ref->rb_node, &delayed_refs->root); 3674 rb_erase(&ref->rb_node, &delayed_refs->root);
3625 delayed_refs->num_entries--; 3675 delayed_refs->num_entries--;
3626 3676 if (head)
3677 mutex_unlock(&head->mutex);
3627 spin_unlock(&delayed_refs->lock); 3678 spin_unlock(&delayed_refs->lock);
3628 btrfs_put_delayed_ref(ref); 3679 btrfs_put_delayed_ref(ref);
3629 3680
@@ -3671,6 +3722,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
3671 delalloc_inodes); 3722 delalloc_inodes);
3672 3723
3673 list_del_init(&btrfs_inode->delalloc_inodes); 3724 list_del_init(&btrfs_inode->delalloc_inodes);
3725 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
3726 &btrfs_inode->runtime_flags);
3674 3727
3675 btrfs_invalidate_inodes(btrfs_inode->root); 3728 btrfs_invalidate_inodes(btrfs_inode->root);
3676 } 3729 }
@@ -3823,10 +3876,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
3823 3876
3824 while (!list_empty(&list)) { 3877 while (!list_empty(&list)) {
3825 t = list_entry(list.next, struct btrfs_transaction, list); 3878 t = list_entry(list.next, struct btrfs_transaction, list);
3826 if (!t)
3827 break;
3828 3879
3829 btrfs_destroy_ordered_operations(root); 3880 btrfs_destroy_ordered_operations(t, root);
3830 3881
3831 btrfs_destroy_ordered_extents(root); 3882 btrfs_destroy_ordered_extents(root);
3832 3883
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 305c33efb0e3..034d7dc552b2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,6 +25,13 @@
25#define BTRFS_SUPER_MIRROR_MAX 3 25#define BTRFS_SUPER_MIRROR_MAX 3
26#define BTRFS_SUPER_MIRROR_SHIFT 12 26#define BTRFS_SUPER_MIRROR_SHIFT 12
27 27
28enum {
29 BTRFS_WQ_ENDIO_DATA = 0,
30 BTRFS_WQ_ENDIO_METADATA = 1,
31 BTRFS_WQ_ENDIO_FREE_SPACE = 2,
32 BTRFS_WQ_ENDIO_RAID56 = 3,
33};
34
28static inline u64 btrfs_sb_offset(int mirror) 35static inline u64 btrfs_sb_offset(int mirror)
29{ 36{
30 u64 start = 16 * 1024; 37 u64 start = 16 * 1024;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cf54bdfee334..3e074dab2d57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
31#include "print-tree.h" 31#include "print-tree.h"
32#include "transaction.h" 32#include "transaction.h"
33#include "volumes.h" 33#include "volumes.h"
34#include "raid56.h"
34#include "locking.h" 35#include "locking.h"
35#include "free-space-cache.h" 36#include "free-space-cache.h"
36#include "math.h" 37#include "math.h"
@@ -72,8 +73,7 @@ enum {
72 RESERVE_ALLOC_NO_ACCOUNT = 2, 73 RESERVE_ALLOC_NO_ACCOUNT = 2,
73}; 74};
74 75
75static int update_block_group(struct btrfs_trans_handle *trans, 76static int update_block_group(struct btrfs_root *root,
76 struct btrfs_root *root,
77 u64 bytenr, u64 num_bytes, int alloc); 77 u64 bytenr, u64 num_bytes, int alloc);
78static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 78static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
79 struct btrfs_root *root, 79 struct btrfs_root *root,
@@ -103,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
103 int dump_block_groups); 103 int dump_block_groups);
104static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 104static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
105 u64 num_bytes, int reserve); 105 u64 num_bytes, int reserve);
106static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
107 u64 num_bytes);
106 108
107static noinline int 109static noinline int
108block_group_cache_done(struct btrfs_block_group_cache *cache) 110block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -162,6 +164,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
162 rb_link_node(&block_group->cache_node, parent, p); 164 rb_link_node(&block_group->cache_node, parent, p);
163 rb_insert_color(&block_group->cache_node, 165 rb_insert_color(&block_group->cache_node,
164 &info->block_group_cache_tree); 166 &info->block_group_cache_tree);
167
168 if (info->first_logical_byte > block_group->key.objectid)
169 info->first_logical_byte = block_group->key.objectid;
170
165 spin_unlock(&info->block_group_cache_lock); 171 spin_unlock(&info->block_group_cache_lock);
166 172
167 return 0; 173 return 0;
@@ -203,8 +209,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
203 break; 209 break;
204 } 210 }
205 } 211 }
206 if (ret) 212 if (ret) {
207 btrfs_get_block_group(ret); 213 btrfs_get_block_group(ret);
214 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
215 info->first_logical_byte = ret->key.objectid;
216 }
208 spin_unlock(&info->block_group_cache_lock); 217 spin_unlock(&info->block_group_cache_lock);
209 218
210 return ret; 219 return ret;
@@ -468,8 +477,6 @@ out:
468} 477}
469 478
470static int cache_block_group(struct btrfs_block_group_cache *cache, 479static int cache_block_group(struct btrfs_block_group_cache *cache,
471 struct btrfs_trans_handle *trans,
472 struct btrfs_root *root,
473 int load_cache_only) 480 int load_cache_only)
474{ 481{
475 DEFINE_WAIT(wait); 482 DEFINE_WAIT(wait);
@@ -527,12 +534,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
527 cache->cached = BTRFS_CACHE_FAST; 534 cache->cached = BTRFS_CACHE_FAST;
528 spin_unlock(&cache->lock); 535 spin_unlock(&cache->lock);
529 536
530 /*
531 * We can't do the read from on-disk cache during a commit since we need
532 * to have the normal tree locking. Also if we are currently trying to
533 * allocate blocks for the tree root we can't do the fast caching since
534 * we likely hold important locks.
535 */
536 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 537 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
537 ret = load_free_space_cache(fs_info, cache); 538 ret = load_free_space_cache(fs_info, cache);
538 539
@@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1852 *actual_bytes = discarded_bytes; 1853 *actual_bytes = discarded_bytes;
1853 1854
1854 1855
1856 if (ret == -EOPNOTSUPP)
1857 ret = 0;
1855 return ret; 1858 return ret;
1856} 1859}
1857 1860
@@ -2143,7 +2146,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2143 node->num_bytes); 2146 node->num_bytes);
2144 } 2147 }
2145 } 2148 }
2146 mutex_unlock(&head->mutex);
2147 return ret; 2149 return ret;
2148 } 2150 }
2149 2151
@@ -2258,7 +2260,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2258 * process of being added. Don't run this ref yet. 2260 * process of being added. Don't run this ref yet.
2259 */ 2261 */
2260 list_del_init(&locked_ref->cluster); 2262 list_del_init(&locked_ref->cluster);
2261 mutex_unlock(&locked_ref->mutex); 2263 btrfs_delayed_ref_unlock(locked_ref);
2262 locked_ref = NULL; 2264 locked_ref = NULL;
2263 delayed_refs->num_heads_ready++; 2265 delayed_refs->num_heads_ready++;
2264 spin_unlock(&delayed_refs->lock); 2266 spin_unlock(&delayed_refs->lock);
@@ -2285,7 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2285 ref = &locked_ref->node; 2287 ref = &locked_ref->node;
2286 2288
2287 if (extent_op && must_insert_reserved) { 2289 if (extent_op && must_insert_reserved) {
2288 kfree(extent_op); 2290 btrfs_free_delayed_extent_op(extent_op);
2289 extent_op = NULL; 2291 extent_op = NULL;
2290 } 2292 }
2291 2293
@@ -2294,28 +2296,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2294 2296
2295 ret = run_delayed_extent_op(trans, root, 2297 ret = run_delayed_extent_op(trans, root,
2296 ref, extent_op); 2298 ref, extent_op);
2297 kfree(extent_op); 2299 btrfs_free_delayed_extent_op(extent_op);
2298 2300
2299 if (ret) { 2301 if (ret) {
2300 list_del_init(&locked_ref->cluster); 2302 printk(KERN_DEBUG
2301 mutex_unlock(&locked_ref->mutex); 2303 "btrfs: run_delayed_extent_op "
2302 2304 "returned %d\n", ret);
2303 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2304 spin_lock(&delayed_refs->lock); 2305 spin_lock(&delayed_refs->lock);
2306 btrfs_delayed_ref_unlock(locked_ref);
2305 return ret; 2307 return ret;
2306 } 2308 }
2307 2309
2308 goto next; 2310 goto next;
2309 } 2311 }
2310
2311 list_del_init(&locked_ref->cluster);
2312 locked_ref = NULL;
2313 } 2312 }
2314 2313
2315 ref->in_tree = 0; 2314 ref->in_tree = 0;
2316 rb_erase(&ref->rb_node, &delayed_refs->root); 2315 rb_erase(&ref->rb_node, &delayed_refs->root);
2317 delayed_refs->num_entries--; 2316 delayed_refs->num_entries--;
2318 if (locked_ref) { 2317 if (!btrfs_delayed_ref_is_head(ref)) {
2319 /* 2318 /*
2320 * when we play the delayed ref, also correct the 2319 * when we play the delayed ref, also correct the
2321 * ref_mod on head 2320 * ref_mod on head
@@ -2337,20 +2336,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2337 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2336 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2338 must_insert_reserved); 2337 must_insert_reserved);
2339 2338
2340 btrfs_put_delayed_ref(ref); 2339 btrfs_free_delayed_extent_op(extent_op);
2341 kfree(extent_op);
2342 count++;
2343
2344 if (ret) { 2340 if (ret) {
2345 if (locked_ref) { 2341 btrfs_delayed_ref_unlock(locked_ref);
2346 list_del_init(&locked_ref->cluster); 2342 btrfs_put_delayed_ref(ref);
2347 mutex_unlock(&locked_ref->mutex); 2343 printk(KERN_DEBUG
2348 } 2344 "btrfs: run_one_delayed_ref returned %d\n", ret);
2349 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2350 spin_lock(&delayed_refs->lock); 2345 spin_lock(&delayed_refs->lock);
2351 return ret; 2346 return ret;
2352 } 2347 }
2353 2348
2349 /*
2350 * If this node is a head, that means all the refs in this head
2351 * have been dealt with, and we will pick the next head to deal
2352 * with, so we must unlock the head and drop it from the cluster
2353 * list before we release it.
2354 */
2355 if (btrfs_delayed_ref_is_head(ref)) {
2356 list_del_init(&locked_ref->cluster);
2357 btrfs_delayed_ref_unlock(locked_ref);
2358 locked_ref = NULL;
2359 }
2360 btrfs_put_delayed_ref(ref);
2361 count++;
2354next: 2362next:
2355 cond_resched(); 2363 cond_resched();
2356 spin_lock(&delayed_refs->lock); 2364 spin_lock(&delayed_refs->lock);
@@ -2435,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2435 return ret; 2443 return ret;
2436} 2444}
2437 2445
2446static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2447 int count)
2448{
2449 int val = atomic_read(&delayed_refs->ref_seq);
2450
2451 if (val < seq || val >= seq + count)
2452 return 1;
2453 return 0;
2454}
2455
2438/* 2456/*
2439 * this starts processing the delayed reference count updates and 2457 * this starts processing the delayed reference count updates and
2440 * extent insertions we have queued up so far. count can be 2458 * extent insertions we have queued up so far. count can be
@@ -2469,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2469 2487
2470 delayed_refs = &trans->transaction->delayed_refs; 2488 delayed_refs = &trans->transaction->delayed_refs;
2471 INIT_LIST_HEAD(&cluster); 2489 INIT_LIST_HEAD(&cluster);
2490 if (count == 0) {
2491 count = delayed_refs->num_entries * 2;
2492 run_most = 1;
2493 }
2494
2495 if (!run_all && !run_most) {
2496 int old;
2497 int seq = atomic_read(&delayed_refs->ref_seq);
2498
2499progress:
2500 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2501 if (old) {
2502 DEFINE_WAIT(__wait);
2503 if (delayed_refs->num_entries < 16348)
2504 return 0;
2505
2506 prepare_to_wait(&delayed_refs->wait, &__wait,
2507 TASK_UNINTERRUPTIBLE);
2508
2509 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2510 if (old) {
2511 schedule();
2512 finish_wait(&delayed_refs->wait, &__wait);
2513
2514 if (!refs_newer(delayed_refs, seq, 256))
2515 goto progress;
2516 else
2517 return 0;
2518 } else {
2519 finish_wait(&delayed_refs->wait, &__wait);
2520 goto again;
2521 }
2522 }
2523
2524 } else {
2525 atomic_inc(&delayed_refs->procs_running_refs);
2526 }
2527
2472again: 2528again:
2473 loops = 0; 2529 loops = 0;
2474 spin_lock(&delayed_refs->lock); 2530 spin_lock(&delayed_refs->lock);
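
Note: the block added above elects runners for the delayed refs. A caller that is not asked to run everything tries to claim procs_running_refs with atomic_cmpxchg; if someone else already holds it, the caller either returns immediately (small backlog) or sleeps on the wait queue until ref_seq shows enough progress was made on its behalf. A condensed, single-runner version of that claim-or-wait pattern; the real code keeps a count so the run-all paths can stack up, and the names and thresholds here are illustrative:

        #include <linux/atomic.h>
        #include <linux/sched.h>
        #include <linux/wait.h>

        static atomic_t runner_claimed = ATOMIC_INIT(0);
        static atomic_t work_seq = ATOMIC_INIT(0);
        static DECLARE_WAIT_QUEUE_HEAD(runner_wait);

        /* true once the sequence counter has moved at least 'want' past 'seq' */
        static int made_progress(int seq, int want)
        {
                int val = atomic_read(&work_seq);

                return val < seq || val >= seq + want;
        }

        static int run_work(unsigned long backlog)
        {
                int seq = atomic_read(&work_seq);

                while (atomic_cmpxchg(&runner_claimed, 0, 1)) {
                        DEFINE_WAIT(wait);

                        /* small backlog: another runner just adds contention */
                        if (backlog < 16384)
                                return 0;

                        prepare_to_wait(&runner_wait, &wait, TASK_UNINTERRUPTIBLE);
                        if (atomic_cmpxchg(&runner_claimed, 0, 1) == 0) {
                                finish_wait(&runner_wait, &wait);
                                break;          /* we became the runner after all */
                        }
                        schedule();
                        finish_wait(&runner_wait, &wait);
                        if (made_progress(seq, 256))
                                return 0;       /* someone else covered our share */
                }

                /* ... run the queued work here, bumping work_seq as items finish ... */

                atomic_set(&runner_claimed, 0);
                smp_mb();
                if (waitqueue_active(&runner_wait))
                        wake_up(&runner_wait);
                return 0;
        }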
@@ -2477,10 +2533,6 @@ again:
2477 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2533 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2478#endif 2534#endif
2479 2535
2480 if (count == 0) {
2481 count = delayed_refs->num_entries * 2;
2482 run_most = 1;
2483 }
2484 while (1) { 2536 while (1) {
2485 if (!(run_all || run_most) && 2537 if (!(run_all || run_most) &&
2486 delayed_refs->num_heads_ready < 64) 2538 delayed_refs->num_heads_ready < 64)
@@ -2500,11 +2552,15 @@ again:
2500 2552
2501 ret = run_clustered_refs(trans, root, &cluster); 2553 ret = run_clustered_refs(trans, root, &cluster);
2502 if (ret < 0) { 2554 if (ret < 0) {
2555 btrfs_release_ref_cluster(&cluster);
2503 spin_unlock(&delayed_refs->lock); 2556 spin_unlock(&delayed_refs->lock);
2504 btrfs_abort_transaction(trans, root, ret); 2557 btrfs_abort_transaction(trans, root, ret);
2558 atomic_dec(&delayed_refs->procs_running_refs);
2505 return ret; 2559 return ret;
2506 } 2560 }
2507 2561
2562 atomic_add(ret, &delayed_refs->ref_seq);
2563
2508 count -= min_t(unsigned long, ret, count); 2564 count -= min_t(unsigned long, ret, count);
2509 2565
2510 if (count == 0) 2566 if (count == 0)
@@ -2573,6 +2629,11 @@ again:
2573 goto again; 2629 goto again;
2574 } 2630 }
2575out: 2631out:
2632 atomic_dec(&delayed_refs->procs_running_refs);
2633 smp_mb();
2634 if (waitqueue_active(&delayed_refs->wait))
2635 wake_up(&delayed_refs->wait);
2636
2576 spin_unlock(&delayed_refs->lock); 2637 spin_unlock(&delayed_refs->lock);
2577 assert_qgroups_uptodate(trans); 2638 assert_qgroups_uptodate(trans);
2578 return 0; 2639 return 0;
@@ -2586,7 +2647,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2586 struct btrfs_delayed_extent_op *extent_op; 2647 struct btrfs_delayed_extent_op *extent_op;
2587 int ret; 2648 int ret;
2588 2649
2589 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 2650 extent_op = btrfs_alloc_delayed_extent_op();
2590 if (!extent_op) 2651 if (!extent_op)
2591 return -ENOMEM; 2652 return -ENOMEM;
2592 2653
@@ -2598,7 +2659,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2598 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 2659 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2599 num_bytes, extent_op); 2660 num_bytes, extent_op);
2600 if (ret) 2661 if (ret)
2601 kfree(extent_op); 2662 btrfs_free_delayed_extent_op(extent_op);
2602 return ret; 2663 return ret;
2603} 2664}
2604 2665
@@ -3223,12 +3284,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3223 u64 extra_flags = chunk_to_extended(flags) & 3284 u64 extra_flags = chunk_to_extended(flags) &
3224 BTRFS_EXTENDED_PROFILE_MASK; 3285 BTRFS_EXTENDED_PROFILE_MASK;
3225 3286
3287 write_seqlock(&fs_info->profiles_lock);
3226 if (flags & BTRFS_BLOCK_GROUP_DATA) 3288 if (flags & BTRFS_BLOCK_GROUP_DATA)
3227 fs_info->avail_data_alloc_bits |= extra_flags; 3289 fs_info->avail_data_alloc_bits |= extra_flags;
3228 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3290 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3229 fs_info->avail_metadata_alloc_bits |= extra_flags; 3291 fs_info->avail_metadata_alloc_bits |= extra_flags;
3230 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3292 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3231 fs_info->avail_system_alloc_bits |= extra_flags; 3293 fs_info->avail_system_alloc_bits |= extra_flags;
3294 write_sequnlock(&fs_info->profiles_lock);
3232} 3295}
3233 3296
3234/* 3297/*
@@ -3276,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3276 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3339 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3277 root->fs_info->fs_devices->missing_devices; 3340 root->fs_info->fs_devices->missing_devices;
3278 u64 target; 3341 u64 target;
3342 u64 tmp;
3279 3343
3280 /* 3344 /*
3281 * see if restripe for this chunk_type is in progress, if so 3345 * see if restripe for this chunk_type is in progress, if so
@@ -3292,40 +3356,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3292 } 3356 }
3293 spin_unlock(&root->fs_info->balance_lock); 3357 spin_unlock(&root->fs_info->balance_lock);
3294 3358
3359 /* First, mask out the RAID levels which aren't possible */
3295 if (num_devices == 1) 3360 if (num_devices == 1)
3296 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3361 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3362 BTRFS_BLOCK_GROUP_RAID5);
3363 if (num_devices < 3)
3364 flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3297 if (num_devices < 4) 3365 if (num_devices < 4)
3298 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3366 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3299 3367
3300 if ((flags & BTRFS_BLOCK_GROUP_DUP) && 3368 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3301 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3369 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3302 BTRFS_BLOCK_GROUP_RAID10))) { 3370 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3303 flags &= ~BTRFS_BLOCK_GROUP_DUP; 3371 flags &= ~tmp;
3304 }
3305
3306 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3307 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3308 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3309 }
3310 3372
3311 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3373 if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3312 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3374 tmp = BTRFS_BLOCK_GROUP_RAID6;
3313 (flags & BTRFS_BLOCK_GROUP_RAID10) | 3375 else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3314 (flags & BTRFS_BLOCK_GROUP_DUP))) { 3376 tmp = BTRFS_BLOCK_GROUP_RAID5;
3315 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3377 else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3316 } 3378 tmp = BTRFS_BLOCK_GROUP_RAID10;
3379 else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3380 tmp = BTRFS_BLOCK_GROUP_RAID1;
3381 else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3382 tmp = BTRFS_BLOCK_GROUP_RAID0;
3317 3383
3318 return extended_to_chunk(flags); 3384 return extended_to_chunk(flags | tmp);
3319} 3385}
3320 3386
3321static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3387static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3322{ 3388{
3323 if (flags & BTRFS_BLOCK_GROUP_DATA) 3389 unsigned seq;
3324 flags |= root->fs_info->avail_data_alloc_bits; 3390
3325 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3391 do {
3326 flags |= root->fs_info->avail_system_alloc_bits; 3392 seq = read_seqbegin(&root->fs_info->profiles_lock);
3327 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3393
3328 flags |= root->fs_info->avail_metadata_alloc_bits; 3394 if (flags & BTRFS_BLOCK_GROUP_DATA)
3395 flags |= root->fs_info->avail_data_alloc_bits;
3396 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3397 flags |= root->fs_info->avail_system_alloc_bits;
3398 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3399 flags |= root->fs_info->avail_metadata_alloc_bits;
3400 } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3329 3401
3330 return btrfs_reduce_alloc_profile(root, flags); 3402 return btrfs_reduce_alloc_profile(root, flags);
3331} 3403}
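
Note: avail_*_alloc_bits are now published under profiles_lock, a seqlock. set_avail_alloc_bits (and its clearing counterpart) takes write_seqlock around the brief update, while get_alloc_profile re-reads in a read_seqbegin/read_seqretry loop, so readers never see a torn set of bits and never block the writer. The core of the pattern:

        #include <linux/seqlock.h>
        #include <linux/types.h>

        static DEFINE_SEQLOCK(profile_lock);
        static u64 avail_data_bits, avail_meta_bits;

        /* writer: short exclusive section while the bits change */
        static void publish_bits(u64 data, u64 meta)
        {
                write_seqlock(&profile_lock);
                avail_data_bits |= data;
                avail_meta_bits |= meta;
                write_sequnlock(&profile_lock);
        }

        /* reader: lockless, retries if a writer slipped in mid-read */
        static u64 read_data_bits(void)
        {
                unsigned seq;
                u64 bits;

                do {
                        seq = read_seqbegin(&profile_lock);
                        bits = avail_data_bits;
                } while (read_seqretry(&profile_lock, seq));

                return bits;
        }

Seqlocks suit this kind of rarely-written, frequently-read state: readers pay only two sequence reads in the common case instead of a shared lock.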
@@ -3333,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3333u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3405u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3334{ 3406{
3335 u64 flags; 3407 u64 flags;
3408 u64 ret;
3336 3409
3337 if (data) 3410 if (data)
3338 flags = BTRFS_BLOCK_GROUP_DATA; 3411 flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3341,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3341 else 3414 else
3342 flags = BTRFS_BLOCK_GROUP_METADATA; 3415 flags = BTRFS_BLOCK_GROUP_METADATA;
3343 3416
3344 return get_alloc_profile(root, flags); 3417 ret = get_alloc_profile(root, flags);
3418 return ret;
3345} 3419}
3346 3420
3347/* 3421/*
@@ -3357,7 +3431,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3357 int ret = 0, committed = 0, alloc_chunk = 1; 3431 int ret = 0, committed = 0, alloc_chunk = 1;
3358 3432
3359 /* make sure bytes are sectorsize aligned */ 3433 /* make sure bytes are sectorsize aligned */
3360 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3434 bytes = ALIGN(bytes, root->sectorsize);
3361 3435
3362 if (root == root->fs_info->tree_root || 3436 if (root == root->fs_info->tree_root ||
3363 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { 3437 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
@@ -3452,7 +3526,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3452 struct btrfs_space_info *data_sinfo; 3526 struct btrfs_space_info *data_sinfo;
3453 3527
3454 /* make sure bytes are sectorsize aligned */ 3528 /* make sure bytes are sectorsize aligned */
3455 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3529 bytes = ALIGN(bytes, root->sectorsize);
3456 3530
3457 data_sinfo = root->fs_info->data_sinfo; 3531 data_sinfo = root->fs_info->data_sinfo;
3458 spin_lock(&data_sinfo->lock); 3532 spin_lock(&data_sinfo->lock);
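
Note: both call sites above swap the open-coded round-up (bytes + sectorsize - 1) & ~(sectorsize - 1) for ALIGN(), which computes the same result for power-of-two alignments but states the intent. A tiny check of the equivalence:

        #include <linux/bug.h>
        #include <linux/kernel.h>
        #include <linux/types.h>

        static u64 round_to_sector(u64 bytes, u32 sectorsize)
        {
                /* identical results; ALIGN is just the idiomatic spelling */
                u64 open_coded = (bytes + sectorsize - 1) & ~((u64)sectorsize - 1);
                u64 aligned    = ALIGN(bytes, sectorsize);

                WARN_ON(open_coded != aligned);
                return aligned;
        }

For example, round_to_sector(5000, 4096) returns 8192 either way.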
@@ -3516,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3516{ 3590{
3517 u64 num_dev; 3591 u64 num_dev;
3518 3592
3519 if (type & BTRFS_BLOCK_GROUP_RAID10 || 3593 if (type & (BTRFS_BLOCK_GROUP_RAID10 |
3520 type & BTRFS_BLOCK_GROUP_RAID0) 3594 BTRFS_BLOCK_GROUP_RAID0 |
3595 BTRFS_BLOCK_GROUP_RAID5 |
3596 BTRFS_BLOCK_GROUP_RAID6))
3521 num_dev = root->fs_info->fs_devices->rw_devices; 3597 num_dev = root->fs_info->fs_devices->rw_devices;
3522 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3598 else if (type & BTRFS_BLOCK_GROUP_RAID1)
3523 num_dev = 2; 3599 num_dev = 2;
@@ -3564,6 +3640,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3564 int wait_for_alloc = 0; 3640 int wait_for_alloc = 0;
3565 int ret = 0; 3641 int ret = 0;
3566 3642
3643 /* Don't re-enter if we're already allocating a chunk */
3644 if (trans->allocating_chunk)
3645 return -ENOSPC;
3646
3567 space_info = __find_space_info(extent_root->fs_info, flags); 3647 space_info = __find_space_info(extent_root->fs_info, flags);
3568 if (!space_info) { 3648 if (!space_info) {
3569 ret = update_space_info(extent_root->fs_info, flags, 3649 ret = update_space_info(extent_root->fs_info, flags,
@@ -3606,6 +3686,8 @@ again:
3606 goto again; 3686 goto again;
3607 } 3687 }
3608 3688
3689 trans->allocating_chunk = true;
3690
3609 /* 3691 /*
3610 * If we have mixed data/metadata chunks we want to make sure we keep 3692 * If we have mixed data/metadata chunks we want to make sure we keep
3611 * allocating mixed chunks instead of individual chunks. 3693 * allocating mixed chunks instead of individual chunks.
@@ -3632,19 +3714,20 @@ again:
3632 check_system_chunk(trans, extent_root, flags); 3714 check_system_chunk(trans, extent_root, flags);
3633 3715
3634 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3716 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3635 if (ret < 0 && ret != -ENOSPC) 3717 trans->allocating_chunk = false;
3636 goto out;
3637 3718
3638 spin_lock(&space_info->lock); 3719 spin_lock(&space_info->lock);
3720 if (ret < 0 && ret != -ENOSPC)
3721 goto out;
3639 if (ret) 3722 if (ret)
3640 space_info->full = 1; 3723 space_info->full = 1;
3641 else 3724 else
3642 ret = 1; 3725 ret = 1;
3643 3726
3644 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3727 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3728out:
3645 space_info->chunk_alloc = 0; 3729 space_info->chunk_alloc = 0;
3646 spin_unlock(&space_info->lock); 3730 spin_unlock(&space_info->lock);
3647out:
3648 mutex_unlock(&fs_info->chunk_mutex); 3731 mutex_unlock(&fs_info->chunk_mutex);
3649 return ret; 3732 return ret;
3650} 3733}
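The allocating_chunk flag introduced in these hunks keeps do_chunk_alloc() from re-entering itself while a chunk allocation is already in flight. A rough user-space sketch of that guard pattern, with invented names and a trivial body, assuming the flag lives on a per-transaction context:

#include <stdbool.h>
#include <errno.h>

struct trans_ctx {
        bool allocating_chunk;  /* set while a chunk allocation is in flight */
};

/* Returns 0 on success, -ENOSPC instead of recursing into itself. */
static int alloc_chunk_sketch(struct trans_ctx *trans)
{
        if (trans->allocating_chunk)
                return -ENOSPC;

        trans->allocating_chunk = true;
        /* ... allocation work that may itself try to reserve space ... */
        trans->allocating_chunk = false;
        return 0;
}

int main(void)
{
        struct trans_ctx trans = { .allocating_chunk = false };
        return alloc_chunk_sketch(&trans) ? 1 : 0;
}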
@@ -3653,13 +3736,31 @@ static int can_overcommit(struct btrfs_root *root,
3653 struct btrfs_space_info *space_info, u64 bytes, 3736 struct btrfs_space_info *space_info, u64 bytes,
3654 enum btrfs_reserve_flush_enum flush) 3737 enum btrfs_reserve_flush_enum flush)
3655{ 3738{
3739 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3656 u64 profile = btrfs_get_alloc_profile(root, 0); 3740 u64 profile = btrfs_get_alloc_profile(root, 0);
3741 u64 rsv_size = 0;
3657 u64 avail; 3742 u64 avail;
3658 u64 used; 3743 u64 used;
3744 u64 to_add;
3659 3745
3660 used = space_info->bytes_used + space_info->bytes_reserved + 3746 used = space_info->bytes_used + space_info->bytes_reserved +
3661 space_info->bytes_pinned + space_info->bytes_readonly + 3747 space_info->bytes_pinned + space_info->bytes_readonly;
3662 space_info->bytes_may_use; 3748
3749 spin_lock(&global_rsv->lock);
3750 rsv_size = global_rsv->size;
3751 spin_unlock(&global_rsv->lock);
3752
3753 /*
3753 /*
3754 * We only want to allow overcommitting if we have lots of actual space
3755 * free, but if we don't have enough space to handle the global reserve
3756 * space then we could end up with a real ENOSPC problem when trying
3757 * to allocate a chunk or some other important allocation.
3758 */
3759 rsv_size <<= 1;
3760 if (used + rsv_size >= space_info->total_bytes)
3761 return 0;
3762
3763 used += space_info->bytes_may_use;
3663 3764
3664 spin_lock(&root->fs_info->free_chunk_lock); 3765 spin_lock(&root->fs_info->free_chunk_lock);
3665 avail = root->fs_info->free_chunk_space; 3766 avail = root->fs_info->free_chunk_space;
@@ -3667,28 +3768,60 @@ static int can_overcommit(struct btrfs_root *root,
3667 3768
3668 /* 3769 /*
3669 * If we have dup, raid1 or raid10 then only half of the free 3770 * If we have dup, raid1 or raid10 then only half of the free
3670 * space is actually useable. 3771 * space is actually useable. For raid56, the space info used
3772 * doesn't include the parity drive, so we don't have to
3773 * change the math
3671 */ 3774 */
3672 if (profile & (BTRFS_BLOCK_GROUP_DUP | 3775 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3673 BTRFS_BLOCK_GROUP_RAID1 | 3776 BTRFS_BLOCK_GROUP_RAID1 |
3674 BTRFS_BLOCK_GROUP_RAID10)) 3777 BTRFS_BLOCK_GROUP_RAID10))
3675 avail >>= 1; 3778 avail >>= 1;
3676 3779
3780 to_add = space_info->total_bytes;
3781
3677 /* 3782 /*
3678 * If we aren't flushing all things, let us overcommit up to 3783 * If we aren't flushing all things, let us overcommit up to
3679 * 1/2th of the space. If we can flush, don't let us overcommit 3784 * 1/2th of the space. If we can flush, don't let us overcommit
3680 * too much, let it overcommit up to 1/8 of the space. 3785 * too much, let it overcommit up to 1/8 of the space.
3681 */ 3786 */
3682 if (flush == BTRFS_RESERVE_FLUSH_ALL) 3787 if (flush == BTRFS_RESERVE_FLUSH_ALL)
3683 avail >>= 3; 3788 to_add >>= 3;
3684 else 3789 else
3685 avail >>= 1; 3790 to_add >>= 1;
3686 3791
3687 if (used + bytes < space_info->total_bytes + avail) 3792 /*
3793 * Limit the overcommit to the amount of free space we could possibly
3794 * allocate for chunks.
3795 */
3796 to_add = min(avail, to_add);
3797
3798 if (used + bytes < space_info->total_bytes + to_add)
3688 return 1; 3799 return 1;
3689 return 0; 3800 return 0;
3690} 3801}
3691 3802
3803void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3804 unsigned long nr_pages)
3805{
3806 struct super_block *sb = root->fs_info->sb;
3807 int started;
3808
 3809 /* If we cannot start writeback, just sync all the delalloc files. */
3810 started = try_to_writeback_inodes_sb_nr(sb, nr_pages,
3811 WB_REASON_FS_FREE_SPACE);
3812 if (!started) {
3813 /*
 3814 * We needn't worry about the filesystem going from r/w to r/o even
 3815 * though we don't acquire the ->s_umount mutex, because the filesystem
 3816 * should guarantee that the delalloc inode list is empty after the
 3817 * filesystem becomes read-only (all dirty pages have been written to
 3818 * the disk).
3819 */
3820 btrfs_start_delalloc_inodes(root, 0);
3821 btrfs_wait_ordered_extents(root, 0);
3822 }
3823}
3824
3692/* 3825/*
3693 * shrink metadata reservation for delalloc 3826 * shrink metadata reservation for delalloc
3694 */ 3827 */
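Taken together, the can_overcommit() changes above amount to this decision: refuse outright if used space plus twice the global reserve already covers the space info, otherwise allow overcommit of up to 1/8 (when flushing is allowed) or 1/2 of the space, bounded by the free chunk space. A simplified stand-alone sketch (invented names; the RAID1/DUP/RAID10 halving of the free chunk space is omitted):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool can_overcommit_sketch(uint64_t used, uint64_t bytes_may_use,
                                  uint64_t total_bytes, uint64_t global_rsv,
                                  uint64_t free_chunk_space,
                                  uint64_t bytes, bool flush_all)
{
        /* Keep room for twice the global reserve before overcommitting. */
        if (used + (global_rsv << 1) >= total_bytes)
                return false;

        used += bytes_may_use;

        /* Overcommit at most 1/8 (flushing) or 1/2 (no flushing) of the
         * space, bounded by what could still be allocated as chunks. */
        uint64_t to_add = total_bytes >> (flush_all ? 3 : 1);
        if (to_add > free_chunk_space)
                to_add = free_chunk_space;

        return used + bytes < total_bytes + to_add;
}

int main(void)
{
        /* 10 GiB total, 1 GiB used, 512 MiB global reserve, 8 GiB unallocated. */
        printf("%d\n", can_overcommit_sketch(1ULL << 30, 0, 10ULL << 30,
                                             512ULL << 20, 8ULL << 30,
                                             1ULL << 30, true)); /* prints 1 */
        return 0;
}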
@@ -3710,7 +3843,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3710 space_info = block_rsv->space_info; 3843 space_info = block_rsv->space_info;
3711 3844
3712 smp_mb(); 3845 smp_mb();
3713 delalloc_bytes = root->fs_info->delalloc_bytes; 3846 delalloc_bytes = percpu_counter_sum_positive(
3847 &root->fs_info->delalloc_bytes);
3714 if (delalloc_bytes == 0) { 3848 if (delalloc_bytes == 0) {
3715 if (trans) 3849 if (trans)
3716 return; 3850 return;
@@ -3721,10 +3855,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3721 while (delalloc_bytes && loops < 3) { 3855 while (delalloc_bytes && loops < 3) {
3722 max_reclaim = min(delalloc_bytes, to_reclaim); 3856 max_reclaim = min(delalloc_bytes, to_reclaim);
3723 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3857 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3724 try_to_writeback_inodes_sb_nr(root->fs_info->sb, 3858 btrfs_writeback_inodes_sb_nr(root, nr_pages);
3725 nr_pages,
3726 WB_REASON_FS_FREE_SPACE);
3727
3728 /* 3859 /*
3729 * We need to wait for the async pages to actually start before 3860 * We need to wait for the async pages to actually start before
3730 * we do anything. 3861 * we do anything.
@@ -3752,7 +3883,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3752 break; 3883 break;
3753 } 3884 }
3754 smp_mb(); 3885 smp_mb();
3755 delalloc_bytes = root->fs_info->delalloc_bytes; 3886 delalloc_bytes = percpu_counter_sum_positive(
3887 &root->fs_info->delalloc_bytes);
3756 } 3888 }
3757} 3889}
3758 3890
@@ -4016,6 +4148,15 @@ again:
4016 goto again; 4148 goto again;
4017 4149
4018out: 4150out:
4151 if (ret == -ENOSPC &&
4152 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
4153 struct btrfs_block_rsv *global_rsv =
4154 &root->fs_info->global_block_rsv;
4155
4156 if (block_rsv != global_rsv &&
4157 !block_rsv_use_bytes(global_rsv, orig_bytes))
4158 ret = 0;
4159 }
4019 if (flushing) { 4160 if (flushing) {
4020 spin_lock(&space_info->lock); 4161 spin_lock(&space_info->lock);
4021 space_info->flush = 0; 4162 space_info->flush = 0;
@@ -4402,19 +4543,60 @@ void btrfs_orphan_release_metadata(struct inode *inode)
4402 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4543 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4403} 4544}
4404 4545
4405int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, 4546/*
4406 struct btrfs_pending_snapshot *pending) 4547 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
4548 * root: the root of the parent directory
4549 * rsv: block reservation
 4550 * items: the number of items that we need to reserve
 4551 * qgroup_reserved: used to return the reserved size in qgroup
 4552 *
 4553 * This function reserves the space needed for snapshot/subvolume
 4554 * creation and deletion. Unlike common file/directory operations,
 4555 * these operations change two fs/file trees and the root tree, so
 4556 * the number of items that the qgroup reserves differs from the
 4557 * free space reservation. That is why we cannot use the space
 4558 * reservation mechanism in start_transaction().
4559 */
4560int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
4561 struct btrfs_block_rsv *rsv,
4562 int items,
4563 u64 *qgroup_reserved)
4407{ 4564{
4408 struct btrfs_root *root = pending->root; 4565 u64 num_bytes;
4409 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4566 int ret;
4410 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; 4567
4411 /* 4568 if (root->fs_info->quota_enabled) {
4412 * two for root back/forward refs, two for directory entries, 4569 /* One for parent inode, two for dir entries */
4413 * one for root of the snapshot and one for parent inode. 4570 num_bytes = 3 * root->leafsize;
4414 */ 4571 ret = btrfs_qgroup_reserve(root, num_bytes);
4415 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); 4572 if (ret)
4416 dst_rsv->space_info = src_rsv->space_info; 4573 return ret;
4417 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4574 } else {
4575 num_bytes = 0;
4576 }
4577
4578 *qgroup_reserved = num_bytes;
4579
4580 num_bytes = btrfs_calc_trans_metadata_size(root, items);
4581 rsv->space_info = __find_space_info(root->fs_info,
4582 BTRFS_BLOCK_GROUP_METADATA);
4583 ret = btrfs_block_rsv_add(root, rsv, num_bytes,
4584 BTRFS_RESERVE_FLUSH_ALL);
4585 if (ret) {
4586 if (*qgroup_reserved)
4587 btrfs_qgroup_free(root, *qgroup_reserved);
4588 }
4589
4590 return ret;
4591}
4592
4593void btrfs_subvolume_release_metadata(struct btrfs_root *root,
4594 struct btrfs_block_rsv *rsv,
4595 u64 qgroup_reserved)
4596{
4597 btrfs_block_rsv_release(root, rsv, (u64)-1);
4598 if (qgroup_reserved)
4599 btrfs_qgroup_free(root, qgroup_reserved);
4418} 4600}
4419 4601
4420/** 4602/**
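btrfs_subvolume_reserve_metadata() and btrfs_subvolume_release_metadata() above follow a two-stage reservation pattern: take the qgroup reservation first, then the block reservation, and give the qgroup bytes back if the second stage fails. A self-contained sketch of that pattern (all names, sizes and the item-size stand-in are invented for illustration; this is not the kernel API):

#include <stdint.h>
#include <stdio.h>

static uint64_t qgroup_reserved, rsv_reserved;

static int qgroup_reserve(uint64_t n) { qgroup_reserved += n; return 0; }
static void qgroup_free(uint64_t n)   { qgroup_reserved -= n; }
static int rsv_add(uint64_t n)        { rsv_reserved += n; return 0; }

static int subvolume_reserve_sketch(int quota_enabled, uint64_t items,
                                    uint64_t leafsize, uint64_t item_size,
                                    uint64_t *out_qgroup)
{
        /* One leaf for the parent inode, two for the dir entries. */
        uint64_t qbytes = quota_enabled ? 3 * leafsize : 0;
        int ret;

        if (qbytes) {
                ret = qgroup_reserve(qbytes);
                if (ret)
                        return ret;
        }
        *out_qgroup = qbytes;

        ret = rsv_add(items * item_size);
        if (ret && qbytes)
                qgroup_free(qbytes);    /* undo stage one if stage two fails */
        return ret;
}

int main(void)
{
        uint64_t q = 0;
        int ret = subvolume_reserve_sketch(1, 3, 16384, 4096, &q);

        printf("ret=%d qgroup=%llu rsv=%llu\n", ret,
               (unsigned long long)q, (unsigned long long)rsv_reserved);
        return 0;
}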
@@ -4522,6 +4704,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4522 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 4704 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4523 int ret = 0; 4705 int ret = 0;
4524 bool delalloc_lock = true; 4706 bool delalloc_lock = true;
4707 u64 to_free = 0;
4708 unsigned dropped;
4525 4709
4526 /* If we are a free space inode we need to not flush since we will be in 4710 /* If we are a free space inode we need to not flush since we will be in
4527 * the middle of a transaction commit. We also don't need the delalloc 4711 * the middle of a transaction commit. We also don't need the delalloc
@@ -4565,54 +4749,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4565 csum_bytes = BTRFS_I(inode)->csum_bytes; 4749 csum_bytes = BTRFS_I(inode)->csum_bytes;
4566 spin_unlock(&BTRFS_I(inode)->lock); 4750 spin_unlock(&BTRFS_I(inode)->lock);
4567 4751
4568 if (root->fs_info->quota_enabled) 4752 if (root->fs_info->quota_enabled) {
4569 ret = btrfs_qgroup_reserve(root, num_bytes + 4753 ret = btrfs_qgroup_reserve(root, num_bytes +
4570 nr_extents * root->leafsize); 4754 nr_extents * root->leafsize);
4755 if (ret)
4756 goto out_fail;
4757 }
4571 4758
4572 /* 4759 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4573 * ret != 0 here means the qgroup reservation failed, we go straight to 4760 if (unlikely(ret)) {
4574 * the shared error handling then. 4761 if (root->fs_info->quota_enabled)
4575 */
4576 if (ret == 0)
4577 ret = reserve_metadata_bytes(root, block_rsv,
4578 to_reserve, flush);
4579
4580 if (ret) {
4581 u64 to_free = 0;
4582 unsigned dropped;
4583
4584 spin_lock(&BTRFS_I(inode)->lock);
4585 dropped = drop_outstanding_extent(inode);
4586 /*
4587 * If the inodes csum_bytes is the same as the original
4588 * csum_bytes then we know we haven't raced with any free()ers
4589 * so we can just reduce our inodes csum bytes and carry on.
4590 * Otherwise we have to do the normal free thing to account for
4591 * the case that the free side didn't free up its reserve
4592 * because of this outstanding reservation.
4593 */
4594 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4595 calc_csum_metadata_size(inode, num_bytes, 0);
4596 else
4597 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4598 spin_unlock(&BTRFS_I(inode)->lock);
4599 if (dropped)
4600 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4601
4602 if (to_free) {
4603 btrfs_block_rsv_release(root, block_rsv, to_free);
4604 trace_btrfs_space_reservation(root->fs_info,
4605 "delalloc",
4606 btrfs_ino(inode),
4607 to_free, 0);
4608 }
4609 if (root->fs_info->quota_enabled) {
4610 btrfs_qgroup_free(root, num_bytes + 4762 btrfs_qgroup_free(root, num_bytes +
4611 nr_extents * root->leafsize); 4763 nr_extents * root->leafsize);
4612 } 4764 goto out_fail;
4613 if (delalloc_lock)
4614 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4615 return ret;
4616 } 4765 }
4617 4766
4618 spin_lock(&BTRFS_I(inode)->lock); 4767 spin_lock(&BTRFS_I(inode)->lock);
@@ -4633,6 +4782,34 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4633 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4782 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4634 4783
4635 return 0; 4784 return 0;
4785
4786out_fail:
4787 spin_lock(&BTRFS_I(inode)->lock);
4788 dropped = drop_outstanding_extent(inode);
4789 /*
 4790 * If the inode's csum_bytes is the same as the original
 4791 * csum_bytes then we know we haven't raced with any free()ers,
 4792 * so we can just reduce our inode's csum bytes and carry on.
4793 * Otherwise we have to do the normal free thing to account for
4794 * the case that the free side didn't free up its reserve
4795 * because of this outstanding reservation.
4796 */
4797 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4798 calc_csum_metadata_size(inode, num_bytes, 0);
4799 else
4800 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4801 spin_unlock(&BTRFS_I(inode)->lock);
4802 if (dropped)
4803 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4804
4805 if (to_free) {
4806 btrfs_block_rsv_release(root, block_rsv, to_free);
4807 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4808 btrfs_ino(inode), to_free, 0);
4809 }
4810 if (delalloc_lock)
4811 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4812 return ret;
4636} 4813}
4637 4814
4638/** 4815/**
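The reserve-metadata rewrite above also converts the inlined failure branch into a single out_fail label at the end of the function, the usual kernel style when several failure points share the same unwinding. A minimal sketch of the pattern with invented step functions:

#include <stdio.h>

static int step1(void) { return 0; }
static int step2(void) { return 0; }
static int step3(void) { return -1; }   /* pretend the last step fails */
static void undo_step1(void) { puts("undoing step1"); }

static int do_work_sketch(void)
{
        int ret;

        ret = step1();
        if (ret)
                return ret;             /* nothing to unwind yet */

        ret = step2();
        if (ret)
                goto out_fail;

        ret = step3();
        if (ret)
                goto out_fail;

        return 0;

out_fail:
        undo_step1();                   /* step1 is known to have succeeded */
        return ret;
}

int main(void) { return do_work_sketch() ? 1 : 0; }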
@@ -4654,7 +4831,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4654 spin_lock(&BTRFS_I(inode)->lock); 4831 spin_lock(&BTRFS_I(inode)->lock);
4655 dropped = drop_outstanding_extent(inode); 4832 dropped = drop_outstanding_extent(inode);
4656 4833
4657 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 4834 if (num_bytes)
4835 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4658 spin_unlock(&BTRFS_I(inode)->lock); 4836 spin_unlock(&BTRFS_I(inode)->lock);
4659 if (dropped > 0) 4837 if (dropped > 0)
4660 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4838 to_free += btrfs_calc_trans_metadata_size(root, dropped);
@@ -4721,8 +4899,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4721 btrfs_free_reserved_data_space(inode, num_bytes); 4899 btrfs_free_reserved_data_space(inode, num_bytes);
4722} 4900}
4723 4901
4724static int update_block_group(struct btrfs_trans_handle *trans, 4902static int update_block_group(struct btrfs_root *root,
4725 struct btrfs_root *root,
4726 u64 bytenr, u64 num_bytes, int alloc) 4903 u64 bytenr, u64 num_bytes, int alloc)
4727{ 4904{
4728 struct btrfs_block_group_cache *cache = NULL; 4905 struct btrfs_block_group_cache *cache = NULL;
@@ -4759,7 +4936,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4759 * space back to the block group, otherwise we will leak space. 4936 * space back to the block group, otherwise we will leak space.
4760 */ 4937 */
4761 if (!alloc && cache->cached == BTRFS_CACHE_NO) 4938 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4762 cache_block_group(cache, trans, NULL, 1); 4939 cache_block_group(cache, 1);
4763 4940
4764 byte_in_group = bytenr - cache->key.objectid; 4941 byte_in_group = bytenr - cache->key.objectid;
4765 WARN_ON(byte_in_group > cache->key.offset); 4942 WARN_ON(byte_in_group > cache->key.offset);
@@ -4809,6 +4986,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4809 struct btrfs_block_group_cache *cache; 4986 struct btrfs_block_group_cache *cache;
4810 u64 bytenr; 4987 u64 bytenr;
4811 4988
4989 spin_lock(&root->fs_info->block_group_cache_lock);
4990 bytenr = root->fs_info->first_logical_byte;
4991 spin_unlock(&root->fs_info->block_group_cache_lock);
4992
4993 if (bytenr < (u64)-1)
4994 return bytenr;
4995
4812 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 4996 cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4813 if (!cache) 4997 if (!cache)
4814 return 0; 4998 return 0;
@@ -4859,8 +5043,7 @@ int btrfs_pin_extent(struct btrfs_root *root,
4859/* 5043/*
4860 * this function must be called within transaction 5044 * this function must be called within transaction
4861 */ 5045 */
4862int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, 5046int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
4863 struct btrfs_root *root,
4864 u64 bytenr, u64 num_bytes) 5047 u64 bytenr, u64 num_bytes)
4865{ 5048{
4866 struct btrfs_block_group_cache *cache; 5049 struct btrfs_block_group_cache *cache;
@@ -4874,7 +5057,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4874 * to one because the slow code to read in the free extents does check 5057 * to one because the slow code to read in the free extents does check
4875 * the pinned extents. 5058 * the pinned extents.
4876 */ 5059 */
4877 cache_block_group(cache, trans, root, 1); 5060 cache_block_group(cache, 1);
4878 5061
4879 pin_down_extent(root, cache, bytenr, num_bytes, 0); 5062 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4880 5063
@@ -5271,7 +5454,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5271 } 5454 }
5272 } 5455 }
5273 5456
5274 ret = update_block_group(trans, root, bytenr, num_bytes, 0); 5457 ret = update_block_group(root, bytenr, num_bytes, 0);
5275 if (ret) { 5458 if (ret) {
5276 btrfs_abort_transaction(trans, extent_root, ret); 5459 btrfs_abort_transaction(trans, extent_root, ret);
5277 goto out; 5460 goto out;
@@ -5316,7 +5499,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5316 if (head->extent_op) { 5499 if (head->extent_op) {
5317 if (!head->must_insert_reserved) 5500 if (!head->must_insert_reserved)
5318 goto out; 5501 goto out;
5319 kfree(head->extent_op); 5502 btrfs_free_delayed_extent_op(head->extent_op);
5320 head->extent_op = NULL; 5503 head->extent_op = NULL;
5321 } 5504 }
5322 5505
@@ -5439,10 +5622,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5439 return ret; 5622 return ret;
5440} 5623}
5441 5624
5442static u64 stripe_align(struct btrfs_root *root, u64 val) 5625static u64 stripe_align(struct btrfs_root *root,
5626 struct btrfs_block_group_cache *cache,
5627 u64 val, u64 num_bytes)
5443{ 5628{
5444 u64 mask = ((u64)root->stripesize - 1); 5629 u64 ret = ALIGN(val, root->stripesize);
5445 u64 ret = (val + mask) & ~mask;
5446 return ret; 5630 return ret;
5447} 5631}
5448 5632
@@ -5462,7 +5646,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5462 u64 num_bytes) 5646 u64 num_bytes)
5463{ 5647{
5464 struct btrfs_caching_control *caching_ctl; 5648 struct btrfs_caching_control *caching_ctl;
5465 DEFINE_WAIT(wait);
5466 5649
5467 caching_ctl = get_caching_control(cache); 5650 caching_ctl = get_caching_control(cache);
5468 if (!caching_ctl) 5651 if (!caching_ctl)
@@ -5479,7 +5662,6 @@ static noinline int
5479wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 5662wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5480{ 5663{
5481 struct btrfs_caching_control *caching_ctl; 5664 struct btrfs_caching_control *caching_ctl;
5482 DEFINE_WAIT(wait);
5483 5665
5484 caching_ctl = get_caching_control(cache); 5666 caching_ctl = get_caching_control(cache);
5485 if (!caching_ctl) 5667 if (!caching_ctl)
@@ -5493,20 +5675,20 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5493 5675
5494int __get_raid_index(u64 flags) 5676int __get_raid_index(u64 flags)
5495{ 5677{
5496 int index;
5497
5498 if (flags & BTRFS_BLOCK_GROUP_RAID10) 5678 if (flags & BTRFS_BLOCK_GROUP_RAID10)
5499 index = 0; 5679 return BTRFS_RAID_RAID10;
5500 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 5680 else if (flags & BTRFS_BLOCK_GROUP_RAID1)
5501 index = 1; 5681 return BTRFS_RAID_RAID1;
5502 else if (flags & BTRFS_BLOCK_GROUP_DUP) 5682 else if (flags & BTRFS_BLOCK_GROUP_DUP)
5503 index = 2; 5683 return BTRFS_RAID_DUP;
5504 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 5684 else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5505 index = 3; 5685 return BTRFS_RAID_RAID0;
5506 else 5686 else if (flags & BTRFS_BLOCK_GROUP_RAID5)
5507 index = 4; 5687 return BTRFS_RAID_RAID5;
5688 else if (flags & BTRFS_BLOCK_GROUP_RAID6)
5689 return BTRFS_RAID_RAID6;
5508 5690
5509 return index; 5691 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
5510} 5692}
5511 5693
5512static int get_block_group_index(struct btrfs_block_group_cache *cache) 5694static int get_block_group_index(struct btrfs_block_group_cache *cache)
@@ -5649,6 +5831,8 @@ search:
5649 if (!block_group_bits(block_group, data)) { 5831 if (!block_group_bits(block_group, data)) {
5650 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5832 u64 extra = BTRFS_BLOCK_GROUP_DUP |
5651 BTRFS_BLOCK_GROUP_RAID1 | 5833 BTRFS_BLOCK_GROUP_RAID1 |
5834 BTRFS_BLOCK_GROUP_RAID5 |
5835 BTRFS_BLOCK_GROUP_RAID6 |
5652 BTRFS_BLOCK_GROUP_RAID10; 5836 BTRFS_BLOCK_GROUP_RAID10;
5653 5837
5654 /* 5838 /*
@@ -5664,8 +5848,7 @@ have_block_group:
5664 cached = block_group_cache_done(block_group); 5848 cached = block_group_cache_done(block_group);
5665 if (unlikely(!cached)) { 5849 if (unlikely(!cached)) {
5666 found_uncached_bg = true; 5850 found_uncached_bg = true;
5667 ret = cache_block_group(block_group, trans, 5851 ret = cache_block_group(block_group, 0);
5668 orig_root, 0);
5669 BUG_ON(ret < 0); 5852 BUG_ON(ret < 0);
5670 ret = 0; 5853 ret = 0;
5671 } 5854 }
@@ -5678,6 +5861,7 @@ have_block_group:
5678 * lets look there 5861 * lets look there
5679 */ 5862 */
5680 if (last_ptr) { 5863 if (last_ptr) {
5864 unsigned long aligned_cluster;
5681 /* 5865 /*
5682 * the refill lock keeps out other 5866 * the refill lock keeps out other
5683 * people trying to start a new cluster 5867 * people trying to start a new cluster
@@ -5744,11 +5928,15 @@ refill_cluster:
5744 goto unclustered_alloc; 5928 goto unclustered_alloc;
5745 } 5929 }
5746 5930
5931 aligned_cluster = max_t(unsigned long,
5932 empty_cluster + empty_size,
5933 block_group->full_stripe_len);
5934
5747 /* allocate a cluster in this block group */ 5935 /* allocate a cluster in this block group */
5748 ret = btrfs_find_space_cluster(trans, root, 5936 ret = btrfs_find_space_cluster(trans, root,
5749 block_group, last_ptr, 5937 block_group, last_ptr,
5750 search_start, num_bytes, 5938 search_start, num_bytes,
5751 empty_cluster + empty_size); 5939 aligned_cluster);
5752 if (ret == 0) { 5940 if (ret == 0) {
5753 /* 5941 /*
5754 * now pull our allocation out of this 5942 * now pull our allocation out of this
@@ -5819,7 +6007,8 @@ unclustered_alloc:
5819 goto loop; 6007 goto loop;
5820 } 6008 }
5821checks: 6009checks:
5822 search_start = stripe_align(root, offset); 6010 search_start = stripe_align(root, used_block_group,
6011 offset, num_bytes);
5823 6012
5824 /* move on to the next group */ 6013 /* move on to the next group */
5825 if (search_start + num_bytes > 6014 if (search_start + num_bytes >
@@ -5970,7 +6159,7 @@ again:
5970 if (ret == -ENOSPC) { 6159 if (ret == -ENOSPC) {
5971 if (!final_tried) { 6160 if (!final_tried) {
5972 num_bytes = num_bytes >> 1; 6161 num_bytes = num_bytes >> 1;
5973 num_bytes = num_bytes & ~(root->sectorsize - 1); 6162 num_bytes = round_down(num_bytes, root->sectorsize);
5974 num_bytes = max(num_bytes, min_alloc_size); 6163 num_bytes = max(num_bytes, min_alloc_size);
5975 if (num_bytes == min_alloc_size) 6164 if (num_bytes == min_alloc_size)
5976 final_tried = true; 6165 final_tried = true;
@@ -6094,7 +6283,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6094 btrfs_mark_buffer_dirty(path->nodes[0]); 6283 btrfs_mark_buffer_dirty(path->nodes[0]);
6095 btrfs_free_path(path); 6284 btrfs_free_path(path);
6096 6285
6097 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 6286 ret = update_block_group(root, ins->objectid, ins->offset, 1);
6098 if (ret) { /* -ENOENT, logic error */ 6287 if (ret) { /* -ENOENT, logic error */
6099 printk(KERN_ERR "btrfs update block group failed for %llu " 6288 printk(KERN_ERR "btrfs update block group failed for %llu "
6100 "%llu\n", (unsigned long long)ins->objectid, 6289 "%llu\n", (unsigned long long)ins->objectid,
@@ -6158,7 +6347,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6158 btrfs_mark_buffer_dirty(leaf); 6347 btrfs_mark_buffer_dirty(leaf);
6159 btrfs_free_path(path); 6348 btrfs_free_path(path);
6160 6349
6161 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 6350 ret = update_block_group(root, ins->objectid, ins->offset, 1);
6162 if (ret) { /* -ENOENT, logic error */ 6351 if (ret) { /* -ENOENT, logic error */
6163 printk(KERN_ERR "btrfs update block group failed for %llu " 6352 printk(KERN_ERR "btrfs update block group failed for %llu "
6164 "%llu\n", (unsigned long long)ins->objectid, 6353 "%llu\n", (unsigned long long)ins->objectid,
@@ -6201,7 +6390,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6201 u64 num_bytes = ins->offset; 6390 u64 num_bytes = ins->offset;
6202 6391
6203 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 6392 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6204 cache_block_group(block_group, trans, NULL, 0); 6393 cache_block_group(block_group, 0);
6205 caching_ctl = get_caching_control(block_group); 6394 caching_ctl = get_caching_control(block_group);
6206 6395
6207 if (!caching_ctl) { 6396 if (!caching_ctl) {
@@ -6315,12 +6504,14 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6315 if (!ret) 6504 if (!ret)
6316 return block_rsv; 6505 return block_rsv;
6317 if (ret && !block_rsv->failfast) { 6506 if (ret && !block_rsv->failfast) {
6318 static DEFINE_RATELIMIT_STATE(_rs, 6507 if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6319 DEFAULT_RATELIMIT_INTERVAL, 6508 static DEFINE_RATELIMIT_STATE(_rs,
6320 /*DEFAULT_RATELIMIT_BURST*/ 2); 6509 DEFAULT_RATELIMIT_INTERVAL * 10,
6321 if (__ratelimit(&_rs)) 6510 /*DEFAULT_RATELIMIT_BURST*/ 1);
6322 WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", 6511 if (__ratelimit(&_rs))
6323 ret); 6512 WARN(1, KERN_DEBUG
6513 "btrfs: block rsv returned %d\n", ret);
6514 }
6324 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6515 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6325 BTRFS_RESERVE_NO_FLUSH); 6516 BTRFS_RESERVE_NO_FLUSH);
6326 if (!ret) { 6517 if (!ret) {
@@ -6386,7 +6577,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6386 6577
6387 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 6578 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6388 struct btrfs_delayed_extent_op *extent_op; 6579 struct btrfs_delayed_extent_op *extent_op;
6389 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 6580 extent_op = btrfs_alloc_delayed_extent_op();
6390 BUG_ON(!extent_op); /* -ENOMEM */ 6581 BUG_ON(!extent_op); /* -ENOMEM */
6391 if (key) 6582 if (key)
6392 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 6583 memcpy(&extent_op->key, key, sizeof(extent_op->key));
@@ -7189,6 +7380,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7189 root->fs_info->fs_devices->missing_devices; 7380 root->fs_info->fs_devices->missing_devices;
7190 7381
7191 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7382 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7383 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
7192 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7384 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7193 7385
7194 if (num_devices == 1) { 7386 if (num_devices == 1) {
@@ -7467,16 +7659,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7467 index = get_block_group_index(block_group); 7659 index = get_block_group_index(block_group);
7468 } 7660 }
7469 7661
7470 if (index == 0) { 7662 if (index == BTRFS_RAID_RAID10) {
7471 dev_min = 4; 7663 dev_min = 4;
7472 /* Divide by 2 */ 7664 /* Divide by 2 */
7473 min_free >>= 1; 7665 min_free >>= 1;
7474 } else if (index == 1) { 7666 } else if (index == BTRFS_RAID_RAID1) {
7475 dev_min = 2; 7667 dev_min = 2;
7476 } else if (index == 2) { 7668 } else if (index == BTRFS_RAID_DUP) {
7477 /* Multiply by 2 */ 7669 /* Multiply by 2 */
7478 min_free <<= 1; 7670 min_free <<= 1;
7479 } else if (index == 3) { 7671 } else if (index == BTRFS_RAID_RAID0) {
7480 dev_min = fs_devices->rw_devices; 7672 dev_min = fs_devices->rw_devices;
7481 do_div(min_free, dev_min); 7673 do_div(min_free, dev_min);
7482 } 7674 }
@@ -7637,11 +7829,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7637 space_info = list_entry(info->space_info.next, 7829 space_info = list_entry(info->space_info.next,
7638 struct btrfs_space_info, 7830 struct btrfs_space_info,
7639 list); 7831 list);
7640 if (space_info->bytes_pinned > 0 || 7832 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
7641 space_info->bytes_reserved > 0 || 7833 if (space_info->bytes_pinned > 0 ||
7642 space_info->bytes_may_use > 0) { 7834 space_info->bytes_reserved > 0 ||
7643 WARN_ON(1); 7835 space_info->bytes_may_use > 0) {
7644 dump_space_info(space_info, 0, 0); 7836 WARN_ON(1);
7837 dump_space_info(space_info, 0, 0);
7838 }
7645 } 7839 }
7646 list_del(&space_info->list); 7840 list_del(&space_info->list);
7647 kfree(space_info); 7841 kfree(space_info);
@@ -7740,7 +7934,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7740 btrfs_release_path(path); 7934 btrfs_release_path(path);
7741 cache->flags = btrfs_block_group_flags(&cache->item); 7935 cache->flags = btrfs_block_group_flags(&cache->item);
7742 cache->sectorsize = root->sectorsize; 7936 cache->sectorsize = root->sectorsize;
7743 7937 cache->full_stripe_len = btrfs_full_stripe_len(root,
7938 &root->fs_info->mapping_tree,
7939 found_key.objectid);
7744 btrfs_init_free_space_ctl(cache); 7940 btrfs_init_free_space_ctl(cache);
7745 7941
7746 /* 7942 /*
@@ -7794,6 +7990,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7794 if (!(get_alloc_profile(root, space_info->flags) & 7990 if (!(get_alloc_profile(root, space_info->flags) &
7795 (BTRFS_BLOCK_GROUP_RAID10 | 7991 (BTRFS_BLOCK_GROUP_RAID10 |
7796 BTRFS_BLOCK_GROUP_RAID1 | 7992 BTRFS_BLOCK_GROUP_RAID1 |
7993 BTRFS_BLOCK_GROUP_RAID5 |
7994 BTRFS_BLOCK_GROUP_RAID6 |
7797 BTRFS_BLOCK_GROUP_DUP))) 7995 BTRFS_BLOCK_GROUP_DUP)))
7798 continue; 7996 continue;
7799 /* 7997 /*
@@ -7869,6 +8067,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7869 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8067 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7870 cache->sectorsize = root->sectorsize; 8068 cache->sectorsize = root->sectorsize;
7871 cache->fs_info = root->fs_info; 8069 cache->fs_info = root->fs_info;
8070 cache->full_stripe_len = btrfs_full_stripe_len(root,
8071 &root->fs_info->mapping_tree,
8072 chunk_offset);
7872 8073
7873 atomic_set(&cache->count, 1); 8074 atomic_set(&cache->count, 1);
7874 spin_lock_init(&cache->lock); 8075 spin_lock_init(&cache->lock);
@@ -7918,12 +8119,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7918 u64 extra_flags = chunk_to_extended(flags) & 8119 u64 extra_flags = chunk_to_extended(flags) &
7919 BTRFS_EXTENDED_PROFILE_MASK; 8120 BTRFS_EXTENDED_PROFILE_MASK;
7920 8121
8122 write_seqlock(&fs_info->profiles_lock);
7921 if (flags & BTRFS_BLOCK_GROUP_DATA) 8123 if (flags & BTRFS_BLOCK_GROUP_DATA)
7922 fs_info->avail_data_alloc_bits &= ~extra_flags; 8124 fs_info->avail_data_alloc_bits &= ~extra_flags;
7923 if (flags & BTRFS_BLOCK_GROUP_METADATA) 8125 if (flags & BTRFS_BLOCK_GROUP_METADATA)
7924 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 8126 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7925 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 8127 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7926 fs_info->avail_system_alloc_bits &= ~extra_flags; 8128 fs_info->avail_system_alloc_bits &= ~extra_flags;
8129 write_sequnlock(&fs_info->profiles_lock);
7927} 8130}
7928 8131
7929int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 8132int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
@@ -8022,6 +8225,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8022 spin_lock(&root->fs_info->block_group_cache_lock); 8225 spin_lock(&root->fs_info->block_group_cache_lock);
8023 rb_erase(&block_group->cache_node, 8226 rb_erase(&block_group->cache_node,
8024 &root->fs_info->block_group_cache_tree); 8227 &root->fs_info->block_group_cache_tree);
8228
8229 if (root->fs_info->first_logical_byte == block_group->key.objectid)
8230 root->fs_info->first_logical_byte = (u64)-1;
8025 spin_unlock(&root->fs_info->block_group_cache_lock); 8231 spin_unlock(&root->fs_info->block_group_cache_lock);
8026 8232
8027 down_write(&block_group->space_info->groups_sem); 8233 down_write(&block_group->space_info->groups_sem);
@@ -8144,7 +8350,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8144 8350
8145 if (end - start >= range->minlen) { 8351 if (end - start >= range->minlen) {
8146 if (!block_group_cache_done(cache)) { 8352 if (!block_group_cache_done(cache)) {
8147 ret = cache_block_group(cache, NULL, root, 0); 8353 ret = cache_block_group(cache, 0);
8148 if (!ret) 8354 if (!ret)
8149 wait_block_group_cache_done(cache); 8355 wait_block_group_cache_done(cache);
8150 } 8356 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1b319df29eee..f173c5af6461 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4,7 +4,6 @@
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/pagemap.h> 5#include <linux/pagemap.h>
6#include <linux/page-flags.h> 6#include <linux/page-flags.h>
7#include <linux/module.h>
8#include <linux/spinlock.h> 7#include <linux/spinlock.h>
9#include <linux/blkdev.h> 8#include <linux/blkdev.h>
10#include <linux/swap.h> 9#include <linux/swap.h>
@@ -1834,7 +1833,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1834 */ 1833 */
1835static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 1834static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1836{ 1835{
1837 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1836 u64 start = page_offset(page);
1838 u64 end = start + PAGE_CACHE_SIZE - 1; 1837 u64 end = start + PAGE_CACHE_SIZE - 1;
1839 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1838 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1840 SetPageUptodate(page); 1839 SetPageUptodate(page);
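Several hunks in this file replace the open-coded (u64)page->index << PAGE_CACHE_SHIFT with page_offset(page); both give the byte offset of the page within its mapping. A quick arithmetic check in user space (the 4096-byte page size and the names below are assumptions of this sketch, not the kernel definitions):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT_SKETCH 12
#define PAGE_SIZE_SKETCH  (1ULL << PAGE_SHIFT_SKETCH)

int main(void)
{
        uint64_t index = 37;    /* page number within the file mapping */

        uint64_t shifted    = index << PAGE_SHIFT_SKETCH;
        uint64_t multiplied = index * PAGE_SIZE_SKETCH;

        /* Both print 151552: page 37 starts at byte 37 * 4096. */
        printf("%llu %llu\n", (unsigned long long)shifted,
               (unsigned long long)multiplied);
        return 0;
}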
@@ -1846,7 +1845,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1846 */ 1845 */
1847static void check_page_locked(struct extent_io_tree *tree, struct page *page) 1846static void check_page_locked(struct extent_io_tree *tree, struct page *page)
1848{ 1847{
1849 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1848 u64 start = page_offset(page);
1850 u64 end = start + PAGE_CACHE_SIZE - 1; 1849 u64 end = start + PAGE_CACHE_SIZE - 1;
1851 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) 1850 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1852 unlock_page(page); 1851 unlock_page(page);
@@ -1895,13 +1894,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1895 if (ret) 1894 if (ret)
1896 err = ret; 1895 err = ret;
1897 1896
1898 if (did_repair) { 1897 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1899 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 1898 rec->start + rec->len - 1,
1900 rec->start + rec->len - 1, 1899 EXTENT_DAMAGED, GFP_NOFS);
1901 EXTENT_DAMAGED, GFP_NOFS); 1900 if (ret && !err)
1902 if (ret && !err) 1901 err = ret;
1903 err = ret;
1904 }
1905 1902
1906 kfree(rec); 1903 kfree(rec);
1907 return err; 1904 return err;
@@ -1932,10 +1929,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1932 u64 map_length = 0; 1929 u64 map_length = 0;
1933 u64 sector; 1930 u64 sector;
1934 struct btrfs_bio *bbio = NULL; 1931 struct btrfs_bio *bbio = NULL;
1932 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
1935 int ret; 1933 int ret;
1936 1934
1937 BUG_ON(!mirror_num); 1935 BUG_ON(!mirror_num);
1938 1936
1937 /* we can't repair anything in raid56 yet */
1938 if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
1939 return 0;
1940
1939 bio = bio_alloc(GFP_NOFS, 1); 1941 bio = bio_alloc(GFP_NOFS, 1);
1940 if (!bio) 1942 if (!bio)
1941 return -EIO; 1943 return -EIO;
@@ -1960,7 +1962,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1960 return -EIO; 1962 return -EIO;
1961 } 1963 }
1962 bio->bi_bdev = dev->bdev; 1964 bio->bi_bdev = dev->bdev;
1963 bio_add_page(bio, page, length, start-page_offset(page)); 1965 bio_add_page(bio, page, length, start - page_offset(page));
1964 btrfsic_submit_bio(WRITE_SYNC, bio); 1966 btrfsic_submit_bio(WRITE_SYNC, bio);
1965 wait_for_completion(&compl); 1967 wait_for_completion(&compl);
1966 1968
@@ -2052,6 +2054,7 @@ static int clean_io_failure(u64 start, struct page *page)
2052 failrec->failed_mirror); 2054 failrec->failed_mirror);
2053 did_repair = !ret; 2055 did_repair = !ret;
2054 } 2056 }
2057 ret = 0;
2055 } 2058 }
2056 2059
2057out: 2060out:
@@ -2293,8 +2296,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
2293 struct page *page = bvec->bv_page; 2296 struct page *page = bvec->bv_page;
2294 tree = &BTRFS_I(page->mapping->host)->io_tree; 2297 tree = &BTRFS_I(page->mapping->host)->io_tree;
2295 2298
2296 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2299 start = page_offset(page) + bvec->bv_offset;
2297 bvec->bv_offset;
2298 end = start + bvec->bv_len - 1; 2300 end = start + bvec->bv_len - 1;
2299 2301
2300 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 2302 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -2353,8 +2355,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2353 (long int)bio->bi_bdev); 2355 (long int)bio->bi_bdev);
2354 tree = &BTRFS_I(page->mapping->host)->io_tree; 2356 tree = &BTRFS_I(page->mapping->host)->io_tree;
2355 2357
2356 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2358 start = page_offset(page) + bvec->bv_offset;
2357 bvec->bv_offset;
2358 end = start + bvec->bv_len - 1; 2359 end = start + bvec->bv_len - 1;
2359 2360
2360 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 2361 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -2471,7 +2472,7 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
2471 struct extent_io_tree *tree = bio->bi_private; 2472 struct extent_io_tree *tree = bio->bi_private;
2472 u64 start; 2473 u64 start;
2473 2474
2474 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 2475 start = page_offset(page) + bvec->bv_offset;
2475 2476
2476 bio->bi_private = NULL; 2477 bio->bi_private = NULL;
2477 2478
@@ -2489,13 +2490,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
2489 return ret; 2490 return ret;
2490} 2491}
2491 2492
2492static int merge_bio(struct extent_io_tree *tree, struct page *page, 2493static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
2493 unsigned long offset, size_t size, struct bio *bio, 2494 unsigned long offset, size_t size, struct bio *bio,
2494 unsigned long bio_flags) 2495 unsigned long bio_flags)
2495{ 2496{
2496 int ret = 0; 2497 int ret = 0;
2497 if (tree->ops && tree->ops->merge_bio_hook) 2498 if (tree->ops && tree->ops->merge_bio_hook)
2498 ret = tree->ops->merge_bio_hook(page, offset, size, bio, 2499 ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
2499 bio_flags); 2500 bio_flags);
2500 BUG_ON(ret < 0); 2501 BUG_ON(ret < 0);
2501 return ret; 2502 return ret;
@@ -2530,7 +2531,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
2530 sector; 2531 sector;
2531 2532
2532 if (prev_bio_flags != bio_flags || !contig || 2533 if (prev_bio_flags != bio_flags || !contig ||
2533 merge_bio(tree, page, offset, page_size, bio, bio_flags) || 2534 merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
2534 bio_add_page(bio, page, page_size, offset) < page_size) { 2535 bio_add_page(bio, page, page_size, offset) < page_size) {
2535 ret = submit_one_bio(rw, bio, mirror_num, 2536 ret = submit_one_bio(rw, bio, mirror_num,
2536 prev_bio_flags); 2537 prev_bio_flags);
@@ -2595,7 +2596,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2595 unsigned long *bio_flags) 2596 unsigned long *bio_flags)
2596{ 2597{
2597 struct inode *inode = page->mapping->host; 2598 struct inode *inode = page->mapping->host;
2598 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2599 u64 start = page_offset(page);
2599 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2600 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2600 u64 end; 2601 u64 end;
2601 u64 cur = start; 2602 u64 cur = start;
@@ -2648,6 +2649,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2648 } 2649 }
2649 } 2650 }
2650 while (cur <= end) { 2651 while (cur <= end) {
2652 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2653
2651 if (cur >= last_byte) { 2654 if (cur >= last_byte) {
2652 char *userpage; 2655 char *userpage;
2653 struct extent_state *cached = NULL; 2656 struct extent_state *cached = NULL;
@@ -2682,7 +2685,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2682 2685
2683 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2686 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2684 cur_end = min(extent_map_end(em) - 1, end); 2687 cur_end = min(extent_map_end(em) - 1, end);
2685 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2688 iosize = ALIGN(iosize, blocksize);
2686 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2689 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2687 disk_io_size = em->block_len; 2690 disk_io_size = em->block_len;
2688 sector = em->block_start >> 9; 2691 sector = em->block_start >> 9;
@@ -2735,26 +2738,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2735 continue; 2738 continue;
2736 } 2739 }
2737 2740
2738 ret = 0; 2741 pnr -= page->index;
2739 if (tree->ops && tree->ops->readpage_io_hook) { 2742 ret = submit_extent_page(READ, tree, page,
2740 ret = tree->ops->readpage_io_hook(page, cur,
2741 cur + iosize - 1);
2742 }
2743 if (!ret) {
2744 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2745 pnr -= page->index;
2746 ret = submit_extent_page(READ, tree, page,
2747 sector, disk_io_size, pg_offset, 2743 sector, disk_io_size, pg_offset,
2748 bdev, bio, pnr, 2744 bdev, bio, pnr,
2749 end_bio_extent_readpage, mirror_num, 2745 end_bio_extent_readpage, mirror_num,
2750 *bio_flags, 2746 *bio_flags,
2751 this_bio_flag); 2747 this_bio_flag);
2752 if (!ret) { 2748 if (!ret) {
2753 nr++; 2749 nr++;
2754 *bio_flags = this_bio_flag; 2750 *bio_flags = this_bio_flag;
2755 } 2751 } else {
2756 }
2757 if (ret) {
2758 SetPageError(page); 2752 SetPageError(page);
2759 unlock_extent(tree, cur, cur + iosize - 1); 2753 unlock_extent(tree, cur, cur + iosize - 1);
2760 } 2754 }
@@ -2806,7 +2800,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2806 struct inode *inode = page->mapping->host; 2800 struct inode *inode = page->mapping->host;
2807 struct extent_page_data *epd = data; 2801 struct extent_page_data *epd = data;
2808 struct extent_io_tree *tree = epd->tree; 2802 struct extent_io_tree *tree = epd->tree;
2809 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2803 u64 start = page_offset(page);
2810 u64 delalloc_start; 2804 u64 delalloc_start;
2811 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2805 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2812 u64 end; 2806 u64 end;
@@ -2982,7 +2976,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2982 BUG_ON(extent_map_end(em) <= cur); 2976 BUG_ON(extent_map_end(em) <= cur);
2983 BUG_ON(end < cur); 2977 BUG_ON(end < cur);
2984 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2978 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2985 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2979 iosize = ALIGN(iosize, blocksize);
2986 sector = (em->block_start + extent_offset) >> 9; 2980 sector = (em->block_start + extent_offset) >> 9;
2987 bdev = em->bdev; 2981 bdev = em->bdev;
2988 block_start = em->block_start; 2982 block_start = em->block_start;
@@ -3124,12 +3118,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3124 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3118 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3125 spin_unlock(&eb->refs_lock); 3119 spin_unlock(&eb->refs_lock);
3126 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3120 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3127 spin_lock(&fs_info->delalloc_lock); 3121 __percpu_counter_add(&fs_info->dirty_metadata_bytes,
3128 if (fs_info->dirty_metadata_bytes >= eb->len) 3122 -eb->len,
3129 fs_info->dirty_metadata_bytes -= eb->len; 3123 fs_info->dirty_metadata_batch);
3130 else
3131 WARN_ON(1);
3132 spin_unlock(&fs_info->delalloc_lock);
3133 ret = 1; 3124 ret = 1;
3134 } else { 3125 } else {
3135 spin_unlock(&eb->refs_lock); 3126 spin_unlock(&eb->refs_lock);
@@ -3446,15 +3437,9 @@ retry:
3446 * swizzled back from swapper_space to tmpfs file 3437 * swizzled back from swapper_space to tmpfs file
3447 * mapping 3438 * mapping
3448 */ 3439 */
3449 if (tree->ops && 3440 if (!trylock_page(page)) {
3450 tree->ops->write_cache_pages_lock_hook) { 3441 flush_fn(data);
3451 tree->ops->write_cache_pages_lock_hook(page, 3442 lock_page(page);
3452 data, flush_fn);
3453 } else {
3454 if (!trylock_page(page)) {
3455 flush_fn(data);
3456 lock_page(page);
3457 }
3458 } 3443 }
3459 3444
3460 if (unlikely(page->mapping != mapping)) { 3445 if (unlikely(page->mapping != mapping)) {
@@ -3674,11 +3659,11 @@ int extent_invalidatepage(struct extent_io_tree *tree,
3674 struct page *page, unsigned long offset) 3659 struct page *page, unsigned long offset)
3675{ 3660{
3676 struct extent_state *cached_state = NULL; 3661 struct extent_state *cached_state = NULL;
3677 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 3662 u64 start = page_offset(page);
3678 u64 end = start + PAGE_CACHE_SIZE - 1; 3663 u64 end = start + PAGE_CACHE_SIZE - 1;
3679 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 3664 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
3680 3665
3681 start += (offset + blocksize - 1) & ~(blocksize - 1); 3666 start += ALIGN(offset, blocksize);
3682 if (start > end) 3667 if (start > end)
3683 return 0; 3668 return 0;
3684 3669
@@ -3700,7 +3685,7 @@ int try_release_extent_state(struct extent_map_tree *map,
3700 struct extent_io_tree *tree, struct page *page, 3685 struct extent_io_tree *tree, struct page *page,
3701 gfp_t mask) 3686 gfp_t mask)
3702{ 3687{
3703 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 3688 u64 start = page_offset(page);
3704 u64 end = start + PAGE_CACHE_SIZE - 1; 3689 u64 end = start + PAGE_CACHE_SIZE - 1;
3705 int ret = 1; 3690 int ret = 1;
3706 3691
@@ -3739,7 +3724,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
3739 gfp_t mask) 3724 gfp_t mask)
3740{ 3725{
3741 struct extent_map *em; 3726 struct extent_map *em;
3742 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 3727 u64 start = page_offset(page);
3743 u64 end = start + PAGE_CACHE_SIZE - 1; 3728 u64 end = start + PAGE_CACHE_SIZE - 1;
3744 3729
3745 if ((mask & __GFP_WAIT) && 3730 if ((mask & __GFP_WAIT) &&
@@ -3797,7 +3782,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
3797 len = last - offset; 3782 len = last - offset;
3798 if (len == 0) 3783 if (len == 0)
3799 break; 3784 break;
3800 len = (len + sectorsize - 1) & ~(sectorsize - 1); 3785 len = ALIGN(len, sectorsize);
3801 em = get_extent(inode, NULL, 0, offset, len, 0); 3786 em = get_extent(inode, NULL, 0, offset, len, 0);
3802 if (IS_ERR_OR_NULL(em)) 3787 if (IS_ERR_OR_NULL(em))
3803 return em; 3788 return em;
@@ -3995,8 +3980,6 @@ static void __free_extent_buffer(struct extent_buffer *eb)
3995 list_del(&eb->leak_list); 3980 list_del(&eb->leak_list);
3996 spin_unlock_irqrestore(&leak_lock, flags); 3981 spin_unlock_irqrestore(&leak_lock, flags);
3997#endif 3982#endif
3998 if (eb->pages && eb->pages != eb->inline_pages)
3999 kfree(eb->pages);
4000 kmem_cache_free(extent_buffer_cache, eb); 3983 kmem_cache_free(extent_buffer_cache, eb);
4001} 3984}
4002 3985
@@ -4037,19 +4020,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
4037 atomic_set(&eb->refs, 1); 4020 atomic_set(&eb->refs, 1);
4038 atomic_set(&eb->io_pages, 0); 4021 atomic_set(&eb->io_pages, 0);
4039 4022
4040 if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { 4023 /*
4041 struct page **pages; 4024 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
4042 int num_pages = (len + PAGE_CACHE_SIZE - 1) >> 4025 */
4043 PAGE_CACHE_SHIFT; 4026 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
4044 pages = kzalloc(num_pages, mask); 4027 > MAX_INLINE_EXTENT_BUFFER_SIZE);
4045 if (!pages) { 4028 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
4046 __free_extent_buffer(eb);
4047 return NULL;
4048 }
4049 eb->pages = pages;
4050 } else {
4051 eb->pages = eb->inline_pages;
4052 }
4053 4029
4054 return eb; 4030 return eb;
4055} 4031}
@@ -4180,6 +4156,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4180 4156
4181static void check_buffer_tree_ref(struct extent_buffer *eb) 4157static void check_buffer_tree_ref(struct extent_buffer *eb)
4182{ 4158{
4159 int refs;
4183 /* the ref bit is tricky. We have to make sure it is set 4160 /* the ref bit is tricky. We have to make sure it is set
4184 * if we have the buffer dirty. Otherwise the 4161 * if we have the buffer dirty. Otherwise the
4185 * code to free a buffer can end up dropping a dirty 4162 * code to free a buffer can end up dropping a dirty
@@ -4200,6 +4177,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
4200 * So bump the ref count first, then set the bit. If someone 4177 * So bump the ref count first, then set the bit. If someone
4201 * beat us to it, drop the ref we added. 4178 * beat us to it, drop the ref we added.
4202 */ 4179 */
4180 refs = atomic_read(&eb->refs);
4181 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4182 return;
4183
4203 spin_lock(&eb->refs_lock); 4184 spin_lock(&eb->refs_lock);
4204 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4185 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4205 atomic_inc(&eb->refs); 4186 atomic_inc(&eb->refs);
@@ -4401,9 +4382,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4401 4382
4402void free_extent_buffer(struct extent_buffer *eb) 4383void free_extent_buffer(struct extent_buffer *eb)
4403{ 4384{
4385 int refs;
4386 int old;
4404 if (!eb) 4387 if (!eb)
4405 return; 4388 return;
4406 4389
4390 while (1) {
4391 refs = atomic_read(&eb->refs);
4392 if (refs <= 3)
4393 break;
4394 old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
4395 if (old == refs)
4396 return;
4397 }
4398
4407 spin_lock(&eb->refs_lock); 4399 spin_lock(&eb->refs_lock);
4408 if (atomic_read(&eb->refs) == 2 && 4400 if (atomic_read(&eb->refs) == 2 &&
4409 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) 4401 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
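The free_extent_buffer() hunk above adds a lock-free fast path: while the reference count stays above the threshold of 3, the count is dropped with atomic_cmpxchg() and refs_lock is never taken. A user-space sketch of the same compare-and-swap loop using C11 atomics (the threshold mirrors the hunk, everything else is invented):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Returns true if the count was dropped on the lock-free fast path,
 * false if the caller must fall back to a locked slow path. */
static bool put_ref_fast(atomic_int *refs)
{
        int old = atomic_load(refs);

        while (old > 3) {
                /* On failure, 'old' is reloaded with the current value. */
                if (atomic_compare_exchange_weak(refs, &old, old - 1))
                        return true;
        }
        return false;
}

int main(void)
{
        atomic_int refs = 5;

        while (put_ref_fast(&refs))
                ;
        printf("left for slow path: %d\n", atomic_load(&refs)); /* 3 */
        return 0;
}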
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2eacfabd3263..6068a1985560 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -72,10 +72,9 @@ struct extent_io_ops {
72 int (*writepage_start_hook)(struct page *page, u64 start, u64 end); 72 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
73 int (*writepage_io_hook)(struct page *page, u64 start, u64 end); 73 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
74 extent_submit_bio_hook_t *submit_bio_hook; 74 extent_submit_bio_hook_t *submit_bio_hook;
75 int (*merge_bio_hook)(struct page *page, unsigned long offset, 75 int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
76 size_t size, struct bio *bio, 76 size_t size, struct bio *bio,
77 unsigned long bio_flags); 77 unsigned long bio_flags);
78 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
79 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); 78 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
80 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, 79 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
81 struct extent_state *state, int mirror); 80 struct extent_state *state, int mirror);
@@ -90,8 +89,6 @@ struct extent_io_ops {
90 struct extent_state *other); 89 struct extent_state *other);
91 void (*split_extent_hook)(struct inode *inode, 90 void (*split_extent_hook)(struct inode *inode,
92 struct extent_state *orig, u64 split); 91 struct extent_state *orig, u64 split);
93 int (*write_cache_pages_lock_hook)(struct page *page, void *data,
94 void (*flush_fn)(void *));
95}; 92};
96 93
97struct extent_io_tree { 94struct extent_io_tree {
@@ -161,8 +158,7 @@ struct extent_buffer {
161 */ 158 */
162 wait_queue_head_t read_lock_wq; 159 wait_queue_head_t read_lock_wq;
163 wait_queue_head_t lock_wq; 160 wait_queue_head_t lock_wq;
164 struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; 161 struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
165 struct page **pages;
166}; 162};
167 163
168static inline void extent_set_compress_type(unsigned long *bio_flags, 164static inline void extent_set_compress_type(unsigned long *bio_flags,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index fdb7a8db3b57..2834ca5768ea 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,6 +1,5 @@
1#include <linux/err.h> 1#include <linux/err.h>
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/module.h>
4#include <linux/spinlock.h> 3#include <linux/spinlock.h>
5#include <linux/hardirq.h> 4#include <linux/hardirq.h>
6#include "ctree.h" 5#include "ctree.h"
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 94aa53b38721..ec160202be3e 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -684,6 +684,24 @@ out:
684 return ret; 684 return ret;
685} 685}
686 686
687static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
688 struct btrfs_sector_sum *sector_sum,
689 u64 total_bytes, u64 sectorsize)
690{
691 u64 tmp = sectorsize;
692 u64 next_sector = sector_sum->bytenr;
693 struct btrfs_sector_sum *next = sector_sum + 1;
694
695 while ((tmp + total_bytes) < sums->len) {
696 if (next_sector + sectorsize != next->bytenr)
697 break;
698 tmp += sectorsize;
699 next_sector = next->bytenr;
700 next++;
701 }
702 return tmp;
703}
704
687int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 705int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
688 struct btrfs_root *root, 706 struct btrfs_root *root,
689 struct btrfs_ordered_sum *sums) 707 struct btrfs_ordered_sum *sums)
@@ -789,20 +807,32 @@ again:
789 goto insert; 807 goto insert;
790 } 808 }
791 809
792 if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / 810 if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
793 csum_size) { 811 csum_size) {
794 u32 diff = (csum_offset + 1) * csum_size; 812 int extend_nr;
813 u64 tmp;
814 u32 diff;
815 u32 free_space;
795 816
796 /* 817 if (btrfs_leaf_free_space(root, leaf) <
797 * is the item big enough already? we dropped our lock 818 sizeof(struct btrfs_item) + csum_size * 2)
798 * before and need to recheck 819 goto insert;
799 */ 820
800 if (diff < btrfs_item_size_nr(leaf, path->slots[0])) 821 free_space = btrfs_leaf_free_space(root, leaf) -
801 goto csum; 822 sizeof(struct btrfs_item) - csum_size;
823 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
824 root->sectorsize);
825 tmp >>= root->fs_info->sb->s_blocksize_bits;
826 WARN_ON(tmp < 1);
827
828 extend_nr = max_t(int, 1, (int)tmp);
829 diff = (csum_offset + extend_nr) * csum_size;
830 diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size);
802 831
803 diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); 832 diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
804 if (diff != csum_size) 833 diff = min(free_space, diff);
805 goto insert; 834 diff /= csum_size;
835 diff *= csum_size;
806 836
807 btrfs_extend_item(trans, root, path, diff); 837 btrfs_extend_item(trans, root, path, diff);
808 goto csum; 838 goto csum;
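
The new sizing logic above is dense in diff form: grow the existing checksum item by enough whole checksums for the remaining contiguous sectors, capped by the leaf's free space and the per-item maximum, and rounded down to a multiple of csum_size. A minimal standalone sketch of that arithmetic (the constants and the simplified free-space handling are stand-ins, not the kernel's values):

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-ins for values the kernel derives at run time. */
#define CSUM_SIZE      4u      /* bytes per crc32c checksum               */
#define MAX_ITEM_BYTES 3500u   /* stand-in for MAX_CSUM_ITEMS * csum_size */

static uint32_t csum_item_growth(uint32_t csum_offset,  /* csums already in the item */
                                 uint32_t item_size,    /* current item size, bytes  */
                                 uint32_t leaf_free,    /* usable free bytes in leaf */
                                 uint32_t extend_nr)    /* contiguous sectors left   */
{
        uint32_t diff;

        /* how large the item would like to become */
        diff = (csum_offset + extend_nr) * CSUM_SIZE;
        if (diff > MAX_ITEM_BYTES)
                diff = MAX_ITEM_BYTES;

        /* only the growth beyond the current size matters */
        diff -= item_size;

        /* cap by what the leaf can still hold ... */
        if (diff > leaf_free)
                diff = leaf_free;
        /* ... and round down to whole checksums */
        diff -= diff % CSUM_SIZE;

        return diff;
}

int main(void)
{
        /* item holds 100 csums; 57 contiguous sectors still need one */
        printf("grow item by %u bytes\n",
               (unsigned)csum_item_growth(100, 100 * CSUM_SIZE, 120, 57));
        return 0;
}

Rounding down to a whole number of checksums is what keeps btrfs_extend_item() from leaving a partial checksum slot at the end of the item.
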
@@ -812,19 +842,14 @@ insert:
812 btrfs_release_path(path); 842 btrfs_release_path(path);
813 csum_offset = 0; 843 csum_offset = 0;
814 if (found_next) { 844 if (found_next) {
815 u64 tmp = total_bytes + root->sectorsize; 845 u64 tmp;
816 u64 next_sector = sector_sum->bytenr;
817 struct btrfs_sector_sum *next = sector_sum + 1;
818 846
819 while (tmp < sums->len) { 847 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
820 if (next_sector + root->sectorsize != next->bytenr) 848 root->sectorsize);
821 break;
822 tmp += root->sectorsize;
823 next_sector = next->bytenr;
824 next++;
825 }
826 tmp = min(tmp, next_offset - file_key.offset);
827 tmp >>= root->fs_info->sb->s_blocksize_bits; 849 tmp >>= root->fs_info->sb->s_blocksize_bits;
850 tmp = min(tmp, (next_offset - file_key.offset) >>
851 root->fs_info->sb->s_blocksize_bits);
852
828 tmp = max((u64)1, tmp); 853 tmp = max((u64)1, tmp);
829 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); 854 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
830 ins_size = csum_size * tmp; 855 ins_size = csum_size * tmp;
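
For reference, btrfs_sector_sum_left(), added at the top of this file, factors out the loop that both call sites used to open-code: count how many bytes of checksums still describe physically contiguous sectors. A rough userspace model of that walk, using a made-up sector_sum record:

#include <stdio.h>
#include <stdint.h>

struct sector_sum {            /* simplified stand-in for btrfs_sector_sum */
        uint64_t bytenr;       /* disk address the checksum covers */
};

/*
 * Starting at sums[idx], count the bytes covered by checksums whose
 * sectors sit contiguously on disk, without running past total_len
 * (the length of the whole ordered sum).
 */
static uint64_t contiguous_csum_bytes(const struct sector_sum *sums,
                                      size_t idx, size_t nr,
                                      uint64_t done, uint64_t total_len,
                                      uint64_t sectorsize)
{
        uint64_t bytes = sectorsize;

        while (idx + 1 < nr && done + bytes < total_len) {
                if (sums[idx].bytenr + sectorsize != sums[idx + 1].bytenr)
                        break;
                bytes += sectorsize;
                idx++;
        }
        return bytes;
}

int main(void)
{
        /* three contiguous 4K sectors, then a gap */
        struct sector_sum sums[] = {
                { 4096 * 100 }, { 4096 * 101 }, { 4096 * 102 }, { 4096 * 500 },
        };

        printf("%llu contiguous bytes\n",
               (unsigned long long)contiguous_csum_bytes(sums, 0, 4, 0,
                                                         16 * 4096, 4096));
        return 0;
}
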
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4b241fe9d2fe..af1d0605a5c1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -30,11 +30,11 @@
30#include <linux/statfs.h> 30#include <linux/statfs.h>
31#include <linux/compat.h> 31#include <linux/compat.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/btrfs.h>
33#include "ctree.h" 34#include "ctree.h"
34#include "disk-io.h" 35#include "disk-io.h"
35#include "transaction.h" 36#include "transaction.h"
36#include "btrfs_inode.h" 37#include "btrfs_inode.h"
37#include "ioctl.h"
38#include "print-tree.h" 38#include "print-tree.h"
39#include "tree-log.h" 39#include "tree-log.h"
40#include "locking.h" 40#include "locking.h"
@@ -374,6 +374,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
374 374
375 atomic_inc(&fs_info->defrag_running); 375 atomic_inc(&fs_info->defrag_running);
376 while(1) { 376 while(1) {
377 /* Pause the auto defragger. */
378 if (test_bit(BTRFS_FS_STATE_REMOUNTING,
379 &fs_info->fs_state))
380 break;
381
377 if (!__need_auto_defrag(fs_info->tree_root)) 382 if (!__need_auto_defrag(fs_info->tree_root))
378 break; 383 break;
379 384
@@ -505,8 +510,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
505 loff_t isize = i_size_read(inode); 510 loff_t isize = i_size_read(inode);
506 511
507 start_pos = pos & ~((u64)root->sectorsize - 1); 512 start_pos = pos & ~((u64)root->sectorsize - 1);
508 num_bytes = (write_bytes + pos - start_pos + 513 num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
509 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
510 514
511 end_of_last_block = start_pos + num_bytes - 1; 515 end_of_last_block = start_pos + num_bytes - 1;
512 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 516 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
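
This is the first of many hunks in this pull that replace the open-coded round-up, (x + size - 1) & ~(size - 1), with the kernel's ALIGN() macro. For power-of-two sizes the two forms are equivalent; a quick standalone check (ALIGN is written out locally here instead of coming from linux/kernel.h):

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

/* what the kernel macro boils down to for these arguments */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
        uint64_t sectorsize = 4096;
        uint64_t pos = 5000, write_bytes = 9000;
        uint64_t start_pos, num_bytes, open_coded;

        start_pos = pos & ~(sectorsize - 1);
        num_bytes = ALIGN(write_bytes + pos - start_pos, sectorsize);

        /* the expression the patch removes */
        open_coded = (write_bytes + pos - start_pos + sectorsize - 1) &
                     ~(sectorsize - 1);

        assert(num_bytes == open_coded);
        printf("start_pos=%llu num_bytes=%llu\n",
               (unsigned long long)start_pos, (unsigned long long)num_bytes);
        return 0;
}
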
@@ -1544,7 +1548,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1544 * although we have opened a file as writable, we have 1548 * although we have opened a file as writable, we have
1545 * to stop this write operation to ensure FS consistency. 1549 * to stop this write operation to ensure FS consistency.
1546 */ 1550 */
1547 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 1551 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
1548 mutex_unlock(&inode->i_mutex); 1552 mutex_unlock(&inode->i_mutex);
1549 err = -EROFS; 1553 err = -EROFS;
1550 goto out; 1554 goto out;
@@ -1627,7 +1631,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1627 */ 1631 */
1628 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 1632 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1629 &BTRFS_I(inode)->runtime_flags)) { 1633 &BTRFS_I(inode)->runtime_flags)) {
1630 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); 1634 struct btrfs_trans_handle *trans;
1635 struct btrfs_root *root = BTRFS_I(inode)->root;
1636
1637 /*
1638 * We need to block on a committing transaction to keep us from
 1639 * throwing an ordered operation onto the list and causing
1640 * something like sync to deadlock trying to flush out this
1641 * inode.
1642 */
1643 trans = btrfs_start_transaction(root, 0);
1644 if (IS_ERR(trans))
1645 return PTR_ERR(trans);
1646 btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
1647 btrfs_end_transaction(trans, root);
1631 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 1648 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1632 filemap_flush(inode->i_mapping); 1649 filemap_flush(inode->i_mapping);
1633 } 1650 }
@@ -1654,16 +1671,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1654 struct btrfs_root *root = BTRFS_I(inode)->root; 1671 struct btrfs_root *root = BTRFS_I(inode)->root;
1655 int ret = 0; 1672 int ret = 0;
1656 struct btrfs_trans_handle *trans; 1673 struct btrfs_trans_handle *trans;
1674 bool full_sync = 0;
1657 1675
1658 trace_btrfs_sync_file(file, datasync); 1676 trace_btrfs_sync_file(file, datasync);
1659 1677
1660 /* 1678 /*
1661 * We write the dirty pages in the range and wait until they complete 1679 * We write the dirty pages in the range and wait until they complete
 1662 * outside of the ->i_mutex, so multiple tasks can flush dirty pages 1680 * outside of the ->i_mutex, so multiple tasks can flush dirty pages
 1663 * concurrently, which improves performance. 1681 * concurrently, which improves performance. See
1682 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1664 */ 1683 */
1665 atomic_inc(&BTRFS_I(inode)->sync_writers); 1684 atomic_inc(&BTRFS_I(inode)->sync_writers);
1666 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1685 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1686 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1687 &BTRFS_I(inode)->runtime_flags))
1688 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1667 atomic_dec(&BTRFS_I(inode)->sync_writers); 1689 atomic_dec(&BTRFS_I(inode)->sync_writers);
1668 if (ret) 1690 if (ret)
1669 return ret; 1691 return ret;
@@ -1675,7 +1697,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1675 * range being left. 1697 * range being left.
1676 */ 1698 */
1677 atomic_inc(&root->log_batch); 1699 atomic_inc(&root->log_batch);
1678 btrfs_wait_ordered_range(inode, start, end - start + 1); 1700 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1701 &BTRFS_I(inode)->runtime_flags);
1702 if (full_sync)
1703 btrfs_wait_ordered_range(inode, start, end - start + 1);
1679 atomic_inc(&root->log_batch); 1704 atomic_inc(&root->log_batch);
1680 1705
1681 /* 1706 /*
@@ -1742,13 +1767,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1742 1767
1743 if (ret != BTRFS_NO_LOG_SYNC) { 1768 if (ret != BTRFS_NO_LOG_SYNC) {
1744 if (ret > 0) { 1769 if (ret > 0) {
1770 /*
1771 * If we didn't already wait for ordered extents we need
1772 * to do that now.
1773 */
1774 if (!full_sync)
1775 btrfs_wait_ordered_range(inode, start,
1776 end - start + 1);
1745 ret = btrfs_commit_transaction(trans, root); 1777 ret = btrfs_commit_transaction(trans, root);
1746 } else { 1778 } else {
1747 ret = btrfs_sync_log(trans, root); 1779 ret = btrfs_sync_log(trans, root);
1748 if (ret == 0) 1780 if (ret == 0) {
1749 ret = btrfs_end_transaction(trans, root); 1781 ret = btrfs_end_transaction(trans, root);
1750 else 1782 } else {
1783 if (!full_sync)
1784 btrfs_wait_ordered_range(inode, start,
1785 end -
1786 start + 1);
1751 ret = btrfs_commit_transaction(trans, root); 1787 ret = btrfs_commit_transaction(trans, root);
1788 }
1752 } 1789 }
1753 } else { 1790 } else {
1754 ret = btrfs_end_transaction(trans, root); 1791 ret = btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0be7a8742a43..1f84fc09c1a8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1356,6 +1356,8 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1356 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 1356 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1357 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1357 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1358 1358
1359 max_bitmaps = max(max_bitmaps, 1);
1360
1359 BUG_ON(ctl->total_bitmaps > max_bitmaps); 1361 BUG_ON(ctl->total_bitmaps > max_bitmaps);
1360 1362
1361 /* 1363 /*
@@ -1463,10 +1465,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1463} 1465}
1464 1466
1465static struct btrfs_free_space * 1467static struct btrfs_free_space *
1466find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) 1468find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1469 unsigned long align)
1467{ 1470{
1468 struct btrfs_free_space *entry; 1471 struct btrfs_free_space *entry;
1469 struct rb_node *node; 1472 struct rb_node *node;
1473 u64 ctl_off;
1474 u64 tmp;
1475 u64 align_off;
1470 int ret; 1476 int ret;
1471 1477
1472 if (!ctl->free_space_offset.rb_node) 1478 if (!ctl->free_space_offset.rb_node)
@@ -1481,15 +1487,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
1481 if (entry->bytes < *bytes) 1487 if (entry->bytes < *bytes)
1482 continue; 1488 continue;
1483 1489
1490 /* make sure the space returned is big enough
1491 * to match our requested alignment
1492 */
1493 if (*bytes >= align) {
1494 ctl_off = entry->offset - ctl->start;
 1495 tmp = ctl_off + align - 1;
1496 do_div(tmp, align);
1497 tmp = tmp * align + ctl->start;
1498 align_off = tmp - entry->offset;
1499 } else {
1500 align_off = 0;
1501 tmp = entry->offset;
1502 }
1503
1504 if (entry->bytes < *bytes + align_off)
1505 continue;
1506
1484 if (entry->bitmap) { 1507 if (entry->bitmap) {
1485 ret = search_bitmap(ctl, entry, offset, bytes); 1508 ret = search_bitmap(ctl, entry, &tmp, bytes);
1486 if (!ret) 1509 if (!ret) {
1510 *offset = tmp;
1487 return entry; 1511 return entry;
1512 }
1488 continue; 1513 continue;
1489 } 1514 }
1490 1515
1491 *offset = entry->offset; 1516 *offset = tmp;
1492 *bytes = entry->bytes; 1517 *bytes = entry->bytes - align_off;
1493 return entry; 1518 return entry;
1494 } 1519 }
1495 1520
@@ -1636,10 +1661,14 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1636 } 1661 }
1637 1662
1638 /* 1663 /*
1639 * some block groups are so tiny they can't be enveloped by a bitmap, so 1664 * The original block groups from mkfs can be really small, like 8
1640 * don't even bother to create a bitmap for this 1665 * megabytes, so don't bother with a bitmap for those entries. However
1666 * some block groups can be smaller than what a bitmap would cover but
1667 * are still large enough that they could overflow the 32k memory limit,
 1668 * so still allow those block groups to have a bitmap
 1669 * entry.
1641 */ 1670 */
1642 if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) 1671 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
1643 return false; 1672 return false;
1644 1673
1645 return true; 1674 return true;
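
To put numbers on the relaxed check: with 4K pages BITS_PER_BITMAP is 32768 bits, so with a 4K ctl->unit one bitmap covers 32768 * 4096 bytes = 128MiB. The old test refused a bitmap for any block group smaller than that; the shifted test only refuses groups under 64MiB, so mid-sized block groups that could still overflow the 32k extent-entry memory limit are again allowed to fall back to bitmap entries. (Figures assume 4K pages and a 4K unit.)
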
@@ -2095,9 +2124,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2095 struct btrfs_free_space *entry = NULL; 2124 struct btrfs_free_space *entry = NULL;
2096 u64 bytes_search = bytes + empty_size; 2125 u64 bytes_search = bytes + empty_size;
2097 u64 ret = 0; 2126 u64 ret = 0;
2127 u64 align_gap = 0;
2128 u64 align_gap_len = 0;
2098 2129
2099 spin_lock(&ctl->tree_lock); 2130 spin_lock(&ctl->tree_lock);
2100 entry = find_free_space(ctl, &offset, &bytes_search); 2131 entry = find_free_space(ctl, &offset, &bytes_search,
2132 block_group->full_stripe_len);
2101 if (!entry) 2133 if (!entry)
2102 goto out; 2134 goto out;
2103 2135
@@ -2107,9 +2139,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2107 if (!entry->bytes) 2139 if (!entry->bytes)
2108 free_bitmap(ctl, entry); 2140 free_bitmap(ctl, entry);
2109 } else { 2141 } else {
2142
2110 unlink_free_space(ctl, entry); 2143 unlink_free_space(ctl, entry);
2111 entry->offset += bytes; 2144 align_gap_len = offset - entry->offset;
2112 entry->bytes -= bytes; 2145 align_gap = entry->offset;
2146
2147 entry->offset = offset + bytes;
2148 WARN_ON(entry->bytes < bytes + align_gap_len);
2149
2150 entry->bytes -= bytes + align_gap_len;
2113 if (!entry->bytes) 2151 if (!entry->bytes)
2114 kmem_cache_free(btrfs_free_space_cachep, entry); 2152 kmem_cache_free(btrfs_free_space_cachep, entry);
2115 else 2153 else
@@ -2119,6 +2157,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2119out: 2157out:
2120 spin_unlock(&ctl->tree_lock); 2158 spin_unlock(&ctl->tree_lock);
2121 2159
2160 if (align_gap_len)
2161 __btrfs_add_free_space(ctl, align_gap, align_gap_len);
2122 return ret; 2162 return ret;
2123} 2163}
2124 2164
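
The free-space-cache hunks above work as a pair: find_free_space() rounds the candidate offset up to the caller's alignment (the block group's full stripe length, which matters for the new raid5/6 code), and btrfs_find_space_for_alloc() returns the skipped bytes to the free-space tree through __btrfs_add_free_space() rather than leaking them. A self-contained sketch of the carve-out arithmetic, with a toy entry standing in for the real rbtree node:

#include <stdio.h>
#include <stdint.h>

struct free_entry {            /* toy stand-in for a free-space extent entry */
        uint64_t offset;
        uint64_t bytes;
};

/*
 * Try to carve 'want' bytes out of 'e', aligned to 'align' relative to
 * 'ctl_start'.  Returns the aligned offset, or 0 on failure, and reports
 * the gap that must be handed back to the free-space tree.
 */
static uint64_t carve_aligned(struct free_entry *e, uint64_t ctl_start,
                              uint64_t want, uint64_t align,
                              uint64_t *gap_start, uint64_t *gap_len)
{
        uint64_t ctl_off = e->offset - ctl_start;
        uint64_t aligned = ((ctl_off + align - 1) / align) * align + ctl_start;
        uint64_t align_off = aligned - e->offset;

        if (e->bytes < want + align_off)
                return 0;

        *gap_start = e->offset;
        *gap_len = align_off;

        e->offset = aligned + want;
        e->bytes -= want + align_off;
        return aligned;
}

int main(void)
{
        struct free_entry e = { .offset = 1 << 20, .bytes = 8 << 20 };
        uint64_t gap_start, gap_len;
        uint64_t got = carve_aligned(&e, 0, 1 << 20, 3 << 20,  /* 3M stripe */
                                     &gap_start, &gap_len);

        printf("alloc at %llu, gap [%llu, +%llu), entry left %llu@%llu\n",
               (unsigned long long)got, (unsigned long long)gap_start,
               (unsigned long long)gap_len, (unsigned long long)e.bytes,
               (unsigned long long)e.offset);
        return 0;
}

The kernel version also skips the rounding when the request itself is smaller than the alignment and lets bitmap entries take the unaligned path; the sketch only covers the plain extent-entry case.
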
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 55c07b650378..c226daefd65d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -39,12 +39,13 @@
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/ratelimit.h> 40#include <linux/ratelimit.h>
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/btrfs.h>
43#include <linux/blkdev.h>
42#include "compat.h" 44#include "compat.h"
43#include "ctree.h" 45#include "ctree.h"
44#include "disk-io.h" 46#include "disk-io.h"
45#include "transaction.h" 47#include "transaction.h"
46#include "btrfs_inode.h" 48#include "btrfs_inode.h"
47#include "ioctl.h"
48#include "print-tree.h" 49#include "print-tree.h"
49#include "ordered-data.h" 50#include "ordered-data.h"
50#include "xattr.h" 51#include "xattr.h"
@@ -54,6 +55,7 @@
54#include "locking.h" 55#include "locking.h"
55#include "free-space-cache.h" 56#include "free-space-cache.h"
56#include "inode-map.h" 57#include "inode-map.h"
58#include "backref.h"
57 59
58struct btrfs_iget_args { 60struct btrfs_iget_args {
59 u64 ino; 61 u64 ino;
@@ -231,8 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
231 u64 isize = i_size_read(inode); 233 u64 isize = i_size_read(inode);
232 u64 actual_end = min(end + 1, isize); 234 u64 actual_end = min(end + 1, isize);
233 u64 inline_len = actual_end - start; 235 u64 inline_len = actual_end - start;
234 u64 aligned_end = (end + root->sectorsize - 1) & 236 u64 aligned_end = ALIGN(end, root->sectorsize);
235 ~((u64)root->sectorsize - 1);
236 u64 data_len = inline_len; 237 u64 data_len = inline_len;
237 int ret; 238 int ret;
238 239
@@ -265,6 +266,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
265 return 1; 266 return 1;
266 } 267 }
267 268
269 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
268 btrfs_delalloc_release_metadata(inode, end + 1 - start); 270 btrfs_delalloc_release_metadata(inode, end + 1 - start);
269 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 271 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
270 return 0; 272 return 0;
@@ -389,7 +391,7 @@ again:
389 * a compressed extent to 128k. 391 * a compressed extent to 128k.
390 */ 392 */
391 total_compressed = min(total_compressed, max_uncompressed); 393 total_compressed = min(total_compressed, max_uncompressed);
392 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 394 num_bytes = ALIGN(end - start + 1, blocksize);
393 num_bytes = max(blocksize, num_bytes); 395 num_bytes = max(blocksize, num_bytes);
394 total_in = 0; 396 total_in = 0;
395 ret = 0; 397 ret = 0;
@@ -488,15 +490,13 @@ cont:
488 * up to a block size boundary so the allocator does sane 490 * up to a block size boundary so the allocator does sane
489 * things 491 * things
490 */ 492 */
491 total_compressed = (total_compressed + blocksize - 1) & 493 total_compressed = ALIGN(total_compressed, blocksize);
492 ~(blocksize - 1);
493 494
494 /* 495 /*
495 * one last check to make sure the compression is really a 496 * one last check to make sure the compression is really a
496 * win, compare the page count read with the blocks on disk 497 * win, compare the page count read with the blocks on disk
497 */ 498 */
498 total_in = (total_in + PAGE_CACHE_SIZE - 1) & 499 total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
499 ~(PAGE_CACHE_SIZE - 1);
500 if (total_compressed >= total_in) { 500 if (total_compressed >= total_in) {
501 will_compress = 0; 501 will_compress = 0;
502 } else { 502 } else {
@@ -608,7 +608,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
608 if (list_empty(&async_cow->extents)) 608 if (list_empty(&async_cow->extents))
609 return 0; 609 return 0;
610 610
611 611again:
612 while (!list_empty(&async_cow->extents)) { 612 while (!list_empty(&async_cow->extents)) {
613 async_extent = list_entry(async_cow->extents.next, 613 async_extent = list_entry(async_cow->extents.next,
614 struct async_extent, list); 614 struct async_extent, list);
@@ -648,6 +648,8 @@ retry:
648 async_extent->ram_size - 1, 648 async_extent->ram_size - 1,
649 btrfs_get_extent, 649 btrfs_get_extent,
650 WB_SYNC_ALL); 650 WB_SYNC_ALL);
651 else if (ret)
652 unlock_page(async_cow->locked_page);
651 kfree(async_extent); 653 kfree(async_extent);
652 cond_resched(); 654 cond_resched();
653 continue; 655 continue;
@@ -672,6 +674,7 @@ retry:
672 674
673 if (ret) { 675 if (ret) {
674 int i; 676 int i;
677
675 for (i = 0; i < async_extent->nr_pages; i++) { 678 for (i = 0; i < async_extent->nr_pages; i++) {
676 WARN_ON(async_extent->pages[i]->mapping); 679 WARN_ON(async_extent->pages[i]->mapping);
677 page_cache_release(async_extent->pages[i]); 680 page_cache_release(async_extent->pages[i]);
@@ -679,12 +682,10 @@ retry:
679 kfree(async_extent->pages); 682 kfree(async_extent->pages);
680 async_extent->nr_pages = 0; 683 async_extent->nr_pages = 0;
681 async_extent->pages = NULL; 684 async_extent->pages = NULL;
682 unlock_extent(io_tree, async_extent->start, 685
683 async_extent->start +
684 async_extent->ram_size - 1);
685 if (ret == -ENOSPC) 686 if (ret == -ENOSPC)
686 goto retry; 687 goto retry;
687 goto out_free; /* JDM: Requeue? */ 688 goto out_free;
688 } 689 }
689 690
690 /* 691 /*
@@ -696,10 +697,13 @@ retry:
696 async_extent->ram_size - 1, 0); 697 async_extent->ram_size - 1, 0);
697 698
698 em = alloc_extent_map(); 699 em = alloc_extent_map();
699 BUG_ON(!em); /* -ENOMEM */ 700 if (!em)
701 goto out_free_reserve;
700 em->start = async_extent->start; 702 em->start = async_extent->start;
701 em->len = async_extent->ram_size; 703 em->len = async_extent->ram_size;
702 em->orig_start = em->start; 704 em->orig_start = em->start;
705 em->mod_start = em->start;
706 em->mod_len = em->len;
703 707
704 em->block_start = ins.objectid; 708 em->block_start = ins.objectid;
705 em->block_len = ins.offset; 709 em->block_len = ins.offset;
@@ -726,6 +730,9 @@ retry:
726 async_extent->ram_size - 1, 0); 730 async_extent->ram_size - 1, 0);
727 } 731 }
728 732
733 if (ret)
734 goto out_free_reserve;
735
729 ret = btrfs_add_ordered_extent_compress(inode, 736 ret = btrfs_add_ordered_extent_compress(inode,
730 async_extent->start, 737 async_extent->start,
731 ins.objectid, 738 ins.objectid,
@@ -733,7 +740,8 @@ retry:
733 ins.offset, 740 ins.offset,
734 BTRFS_ORDERED_COMPRESSED, 741 BTRFS_ORDERED_COMPRESSED,
735 async_extent->compress_type); 742 async_extent->compress_type);
736 BUG_ON(ret); /* -ENOMEM */ 743 if (ret)
744 goto out_free_reserve;
737 745
738 /* 746 /*
739 * clear dirty, set writeback and unlock the pages. 747 * clear dirty, set writeback and unlock the pages.
@@ -754,18 +762,30 @@ retry:
754 ins.objectid, 762 ins.objectid,
755 ins.offset, async_extent->pages, 763 ins.offset, async_extent->pages,
756 async_extent->nr_pages); 764 async_extent->nr_pages);
757
758 BUG_ON(ret); /* -ENOMEM */
759 alloc_hint = ins.objectid + ins.offset; 765 alloc_hint = ins.objectid + ins.offset;
760 kfree(async_extent); 766 kfree(async_extent);
767 if (ret)
768 goto out;
761 cond_resched(); 769 cond_resched();
762 } 770 }
763 ret = 0; 771 ret = 0;
764out: 772out:
765 return ret; 773 return ret;
774out_free_reserve:
775 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
766out_free: 776out_free:
777 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
778 async_extent->start,
779 async_extent->start +
780 async_extent->ram_size - 1,
781 NULL, EXTENT_CLEAR_UNLOCK_PAGE |
782 EXTENT_CLEAR_UNLOCK |
783 EXTENT_CLEAR_DELALLOC |
784 EXTENT_CLEAR_DIRTY |
785 EXTENT_SET_WRITEBACK |
786 EXTENT_END_WRITEBACK);
767 kfree(async_extent); 787 kfree(async_extent);
768 goto out; 788 goto again;
769} 789}
770 790
771static u64 get_extent_allocation_hint(struct inode *inode, u64 start, 791static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
@@ -834,7 +854,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
834 854
835 BUG_ON(btrfs_is_free_space_inode(inode)); 855 BUG_ON(btrfs_is_free_space_inode(inode));
836 856
837 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 857 num_bytes = ALIGN(end - start + 1, blocksize);
838 num_bytes = max(blocksize, num_bytes); 858 num_bytes = max(blocksize, num_bytes);
839 disk_num_bytes = num_bytes; 859 disk_num_bytes = num_bytes;
840 860
@@ -892,6 +912,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
892 em->orig_start = em->start; 912 em->orig_start = em->start;
893 ram_size = ins.offset; 913 ram_size = ins.offset;
894 em->len = ins.offset; 914 em->len = ins.offset;
915 em->mod_start = em->start;
916 em->mod_len = em->len;
895 917
896 em->block_start = ins.objectid; 918 em->block_start = ins.objectid;
897 em->block_len = ins.offset; 919 em->block_len = ins.offset;
@@ -1338,6 +1360,8 @@ out_check:
1338 em->block_start = disk_bytenr; 1360 em->block_start = disk_bytenr;
1339 em->orig_block_len = disk_num_bytes; 1361 em->orig_block_len = disk_num_bytes;
1340 em->bdev = root->fs_info->fs_devices->latest_bdev; 1362 em->bdev = root->fs_info->fs_devices->latest_bdev;
1363 em->mod_start = em->start;
1364 em->mod_len = em->len;
1341 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1365 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1342 set_bit(EXTENT_FLAG_FILLING, &em->flags); 1366 set_bit(EXTENT_FLAG_FILLING, &em->flags);
1343 em->generation = -1; 1367 em->generation = -1;
@@ -1508,14 +1532,22 @@ static void btrfs_set_bit_hook(struct inode *inode,
1508 spin_unlock(&BTRFS_I(inode)->lock); 1532 spin_unlock(&BTRFS_I(inode)->lock);
1509 } 1533 }
1510 1534
1511 spin_lock(&root->fs_info->delalloc_lock); 1535 __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1536 root->fs_info->delalloc_batch);
1537 spin_lock(&BTRFS_I(inode)->lock);
1512 BTRFS_I(inode)->delalloc_bytes += len; 1538 BTRFS_I(inode)->delalloc_bytes += len;
1513 root->fs_info->delalloc_bytes += len; 1539 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1514 if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1540 &BTRFS_I(inode)->runtime_flags)) {
1515 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1541 spin_lock(&root->fs_info->delalloc_lock);
1516 &root->fs_info->delalloc_inodes); 1542 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1543 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1544 &root->fs_info->delalloc_inodes);
1545 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1546 &BTRFS_I(inode)->runtime_flags);
1547 }
1548 spin_unlock(&root->fs_info->delalloc_lock);
1517 } 1549 }
1518 spin_unlock(&root->fs_info->delalloc_lock); 1550 spin_unlock(&BTRFS_I(inode)->lock);
1519 } 1551 }
1520} 1552}
1521 1553
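
The delalloc accounting above stops funnelling every byte through one fs-wide spinlocked counter: delalloc_bytes becomes a per-cpu counter updated with a batch (delalloc_batch), and the global delalloc_lock is only taken when an inode actually joins or leaves the delalloc list. The per-cpu idea is roughly that each CPU accumulates a small local delta and folds it into the shared total once it crosses the batch, so reading the exact value means summing everything. A toy single-threaded model of that behaviour (this is not the kernel's percpu_counter API, just the shape of it):

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct batched_counter {
        long long total;            /* shared, folded-in value      */
        long long local[NR_CPUS];   /* per-cpu not-yet-folded delta */
        long long batch;
};

static void counter_add(struct batched_counter *c, int cpu, long long amount)
{
        c->local[cpu] += amount;
        if (llabs(c->local[cpu]) >= c->batch) {
                /* this fold is the only update that needs to be shared */
                c->total += c->local[cpu];
                c->local[cpu] = 0;
        }
}

static long long counter_sum(const struct batched_counter *c)
{
        long long sum = c->total;
        for (int i = 0; i < NR_CPUS; i++)
                sum += c->local[i];
        return sum;
}

int main(void)
{
        struct batched_counter delalloc = { .batch = 4096 };

        counter_add(&delalloc, 0, 1024);    /* stays local      */
        counter_add(&delalloc, 1, 8192);    /* folds into total */
        counter_add(&delalloc, 2, 512);     /* stays local      */

        printf("total=%lld exact=%lld\n", delalloc.total, counter_sum(&delalloc));
        return 0;
}
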
@@ -1550,15 +1582,22 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1550 && do_list) 1582 && do_list)
1551 btrfs_free_reserved_data_space(inode, len); 1583 btrfs_free_reserved_data_space(inode, len);
1552 1584
1553 spin_lock(&root->fs_info->delalloc_lock); 1585 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1554 root->fs_info->delalloc_bytes -= len; 1586 root->fs_info->delalloc_batch);
1587 spin_lock(&BTRFS_I(inode)->lock);
1555 BTRFS_I(inode)->delalloc_bytes -= len; 1588 BTRFS_I(inode)->delalloc_bytes -= len;
1556
1557 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && 1589 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1558 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1590 test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1559 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1591 &BTRFS_I(inode)->runtime_flags)) {
1592 spin_lock(&root->fs_info->delalloc_lock);
1593 if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1594 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1595 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1596 &BTRFS_I(inode)->runtime_flags);
1597 }
1598 spin_unlock(&root->fs_info->delalloc_lock);
1560 } 1599 }
1561 spin_unlock(&root->fs_info->delalloc_lock); 1600 spin_unlock(&BTRFS_I(inode)->lock);
1562 } 1601 }
1563} 1602}
1564 1603
@@ -1566,7 +1605,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1566 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure 1605 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1567 * we don't create bios that span stripes or chunks 1606 * we don't create bios that span stripes or chunks
1568 */ 1607 */
1569int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 1608int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
1570 size_t size, struct bio *bio, 1609 size_t size, struct bio *bio,
1571 unsigned long bio_flags) 1610 unsigned long bio_flags)
1572{ 1611{
@@ -1581,7 +1620,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1581 1620
1582 length = bio->bi_size; 1621 length = bio->bi_size;
1583 map_length = length; 1622 map_length = length;
1584 ret = btrfs_map_block(root->fs_info, READ, logical, 1623 ret = btrfs_map_block(root->fs_info, rw, logical,
1585 &map_length, NULL, 0); 1624 &map_length, NULL, 0);
1586 /* Will always return 0 with map_multi == NULL */ 1625 /* Will always return 0 with map_multi == NULL */
1587 BUG_ON(ret < 0); 1626 BUG_ON(ret < 0);
@@ -1892,6 +1931,640 @@ out:
1892 return ret; 1931 return ret;
1893} 1932}
1894 1933
1934/* snapshot-aware defrag */
1935struct sa_defrag_extent_backref {
1936 struct rb_node node;
1937 struct old_sa_defrag_extent *old;
1938 u64 root_id;
1939 u64 inum;
1940 u64 file_pos;
1941 u64 extent_offset;
1942 u64 num_bytes;
1943 u64 generation;
1944};
1945
1946struct old_sa_defrag_extent {
1947 struct list_head list;
1948 struct new_sa_defrag_extent *new;
1949
1950 u64 extent_offset;
1951 u64 bytenr;
1952 u64 offset;
1953 u64 len;
1954 int count;
1955};
1956
1957struct new_sa_defrag_extent {
1958 struct rb_root root;
1959 struct list_head head;
1960 struct btrfs_path *path;
1961 struct inode *inode;
1962 u64 file_pos;
1963 u64 len;
1964 u64 bytenr;
1965 u64 disk_len;
1966 u8 compress_type;
1967};
1968
1969static int backref_comp(struct sa_defrag_extent_backref *b1,
1970 struct sa_defrag_extent_backref *b2)
1971{
1972 if (b1->root_id < b2->root_id)
1973 return -1;
1974 else if (b1->root_id > b2->root_id)
1975 return 1;
1976
1977 if (b1->inum < b2->inum)
1978 return -1;
1979 else if (b1->inum > b2->inum)
1980 return 1;
1981
1982 if (b1->file_pos < b2->file_pos)
1983 return -1;
1984 else if (b1->file_pos > b2->file_pos)
1985 return 1;
1986
1987 /*
1988 * [------------------------------] ===> (a range of space)
1989 * |<--->| |<---->| =============> (fs/file tree A)
1990 * |<---------------------------->| ===> (fs/file tree B)
1991 *
1992 * A range of space can refer to two file extents in one tree while
1993 * refer to only one file extent in another tree.
1994 *
 1995 * So we may process a disk offset more than once (two extents in A)
 1996 * and land in the same extent (one extent in B), then insert two
 1997 * identical backrefs (both referring to the extent in B).
1998 */
1999 return 0;
2000}
2001
2002static void backref_insert(struct rb_root *root,
2003 struct sa_defrag_extent_backref *backref)
2004{
2005 struct rb_node **p = &root->rb_node;
2006 struct rb_node *parent = NULL;
2007 struct sa_defrag_extent_backref *entry;
2008 int ret;
2009
2010 while (*p) {
2011 parent = *p;
2012 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2013
2014 ret = backref_comp(backref, entry);
2015 if (ret < 0)
2016 p = &(*p)->rb_left;
2017 else
2018 p = &(*p)->rb_right;
2019 }
2020
2021 rb_link_node(&backref->node, parent, p);
2022 rb_insert_color(&backref->node, root);
2023}
2024
2025/*
 2026 * Note the backref might have changed, in which case we just return 0.
2027 */
2028static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2029 void *ctx)
2030{
2031 struct btrfs_file_extent_item *extent;
2032 struct btrfs_fs_info *fs_info;
2033 struct old_sa_defrag_extent *old = ctx;
2034 struct new_sa_defrag_extent *new = old->new;
2035 struct btrfs_path *path = new->path;
2036 struct btrfs_key key;
2037 struct btrfs_root *root;
2038 struct sa_defrag_extent_backref *backref;
2039 struct extent_buffer *leaf;
2040 struct inode *inode = new->inode;
2041 int slot;
2042 int ret;
2043 u64 extent_offset;
2044 u64 num_bytes;
2045
2046 if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2047 inum == btrfs_ino(inode))
2048 return 0;
2049
2050 key.objectid = root_id;
2051 key.type = BTRFS_ROOT_ITEM_KEY;
2052 key.offset = (u64)-1;
2053
2054 fs_info = BTRFS_I(inode)->root->fs_info;
2055 root = btrfs_read_fs_root_no_name(fs_info, &key);
2056 if (IS_ERR(root)) {
2057 if (PTR_ERR(root) == -ENOENT)
2058 return 0;
2059 WARN_ON(1);
2060 pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
2061 inum, offset, root_id);
2062 return PTR_ERR(root);
2063 }
2064
2065 key.objectid = inum;
2066 key.type = BTRFS_EXTENT_DATA_KEY;
2067 if (offset > (u64)-1 << 32)
2068 key.offset = 0;
2069 else
2070 key.offset = offset;
2071
2072 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2073 if (ret < 0) {
2074 WARN_ON(1);
2075 return ret;
2076 }
2077
2078 while (1) {
2079 cond_resched();
2080
2081 leaf = path->nodes[0];
2082 slot = path->slots[0];
2083
2084 if (slot >= btrfs_header_nritems(leaf)) {
2085 ret = btrfs_next_leaf(root, path);
2086 if (ret < 0) {
2087 goto out;
2088 } else if (ret > 0) {
2089 ret = 0;
2090 goto out;
2091 }
2092 continue;
2093 }
2094
2095 path->slots[0]++;
2096
2097 btrfs_item_key_to_cpu(leaf, &key, slot);
2098
2099 if (key.objectid > inum)
2100 goto out;
2101
2102 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2103 continue;
2104
2105 extent = btrfs_item_ptr(leaf, slot,
2106 struct btrfs_file_extent_item);
2107
2108 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2109 continue;
2110
2111 extent_offset = btrfs_file_extent_offset(leaf, extent);
2112 if (key.offset - extent_offset != offset)
2113 continue;
2114
2115 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2116 if (extent_offset >= old->extent_offset + old->offset +
2117 old->len || extent_offset + num_bytes <=
2118 old->extent_offset + old->offset)
2119 continue;
2120
2121 break;
2122 }
2123
2124 backref = kmalloc(sizeof(*backref), GFP_NOFS);
2125 if (!backref) {
 2126 ret = -ENOMEM;
2127 goto out;
2128 }
2129
2130 backref->root_id = root_id;
2131 backref->inum = inum;
2132 backref->file_pos = offset + extent_offset;
2133 backref->num_bytes = num_bytes;
2134 backref->extent_offset = extent_offset;
2135 backref->generation = btrfs_file_extent_generation(leaf, extent);
2136 backref->old = old;
2137 backref_insert(&new->root, backref);
2138 old->count++;
2139out:
2140 btrfs_release_path(path);
2141 WARN_ON(ret);
2142 return ret;
2143}
2144
2145static noinline bool record_extent_backrefs(struct btrfs_path *path,
2146 struct new_sa_defrag_extent *new)
2147{
2148 struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2149 struct old_sa_defrag_extent *old, *tmp;
2150 int ret;
2151
2152 new->path = path;
2153
2154 list_for_each_entry_safe(old, tmp, &new->head, list) {
2155 ret = iterate_inodes_from_logical(old->bytenr, fs_info,
2156 path, record_one_backref,
2157 old);
2158 BUG_ON(ret < 0 && ret != -ENOENT);
2159
2160 /* no backref to be processed for this extent */
2161 if (!old->count) {
2162 list_del(&old->list);
2163 kfree(old);
2164 }
2165 }
2166
2167 if (list_empty(&new->head))
2168 return false;
2169
2170 return true;
2171}
2172
2173static int relink_is_mergable(struct extent_buffer *leaf,
2174 struct btrfs_file_extent_item *fi,
2175 u64 disk_bytenr)
2176{
2177 if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr)
2178 return 0;
2179
2180 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2181 return 0;
2182
2183 if (btrfs_file_extent_compression(leaf, fi) ||
2184 btrfs_file_extent_encryption(leaf, fi) ||
2185 btrfs_file_extent_other_encoding(leaf, fi))
2186 return 0;
2187
2188 return 1;
2189}
2190
2191/*
 2192 * Note the backref might have changed, in which case we just return 0.
2193 */
2194static noinline int relink_extent_backref(struct btrfs_path *path,
2195 struct sa_defrag_extent_backref *prev,
2196 struct sa_defrag_extent_backref *backref)
2197{
2198 struct btrfs_file_extent_item *extent;
2199 struct btrfs_file_extent_item *item;
2200 struct btrfs_ordered_extent *ordered;
2201 struct btrfs_trans_handle *trans;
2202 struct btrfs_fs_info *fs_info;
2203 struct btrfs_root *root;
2204 struct btrfs_key key;
2205 struct extent_buffer *leaf;
2206 struct old_sa_defrag_extent *old = backref->old;
2207 struct new_sa_defrag_extent *new = old->new;
2208 struct inode *src_inode = new->inode;
2209 struct inode *inode;
2210 struct extent_state *cached = NULL;
2211 int ret = 0;
2212 u64 start;
2213 u64 len;
2214 u64 lock_start;
2215 u64 lock_end;
2216 bool merge = false;
2217 int index;
2218
2219 if (prev && prev->root_id == backref->root_id &&
2220 prev->inum == backref->inum &&
2221 prev->file_pos + prev->num_bytes == backref->file_pos)
2222 merge = true;
2223
2224 /* step 1: get root */
2225 key.objectid = backref->root_id;
2226 key.type = BTRFS_ROOT_ITEM_KEY;
2227 key.offset = (u64)-1;
2228
2229 fs_info = BTRFS_I(src_inode)->root->fs_info;
2230 index = srcu_read_lock(&fs_info->subvol_srcu);
2231
2232 root = btrfs_read_fs_root_no_name(fs_info, &key);
2233 if (IS_ERR(root)) {
2234 srcu_read_unlock(&fs_info->subvol_srcu, index);
2235 if (PTR_ERR(root) == -ENOENT)
2236 return 0;
2237 return PTR_ERR(root);
2238 }
2239 if (btrfs_root_refs(&root->root_item) == 0) {
2240 srcu_read_unlock(&fs_info->subvol_srcu, index);
 2241 /* treat ENOENT as 0 */
2242 return 0;
2243 }
2244
2245 /* step 2: get inode */
2246 key.objectid = backref->inum;
2247 key.type = BTRFS_INODE_ITEM_KEY;
2248 key.offset = 0;
2249
2250 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2251 if (IS_ERR(inode)) {
2252 srcu_read_unlock(&fs_info->subvol_srcu, index);
2253 return 0;
2254 }
2255
2256 srcu_read_unlock(&fs_info->subvol_srcu, index);
2257
2258 /* step 3: relink backref */
2259 lock_start = backref->file_pos;
2260 lock_end = backref->file_pos + backref->num_bytes - 1;
2261 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2262 0, &cached);
2263
2264 ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2265 if (ordered) {
2266 btrfs_put_ordered_extent(ordered);
2267 goto out_unlock;
2268 }
2269
2270 trans = btrfs_join_transaction(root);
2271 if (IS_ERR(trans)) {
2272 ret = PTR_ERR(trans);
2273 goto out_unlock;
2274 }
2275
2276 key.objectid = backref->inum;
2277 key.type = BTRFS_EXTENT_DATA_KEY;
2278 key.offset = backref->file_pos;
2279
2280 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2281 if (ret < 0) {
2282 goto out_free_path;
2283 } else if (ret > 0) {
2284 ret = 0;
2285 goto out_free_path;
2286 }
2287
2288 extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2289 struct btrfs_file_extent_item);
2290
2291 if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2292 backref->generation)
2293 goto out_free_path;
2294
2295 btrfs_release_path(path);
2296
2297 start = backref->file_pos;
2298 if (backref->extent_offset < old->extent_offset + old->offset)
2299 start += old->extent_offset + old->offset -
2300 backref->extent_offset;
2301
2302 len = min(backref->extent_offset + backref->num_bytes,
2303 old->extent_offset + old->offset + old->len);
2304 len -= max(backref->extent_offset, old->extent_offset + old->offset);
2305
2306 ret = btrfs_drop_extents(trans, root, inode, start,
2307 start + len, 1);
2308 if (ret)
2309 goto out_free_path;
2310again:
2311 key.objectid = btrfs_ino(inode);
2312 key.type = BTRFS_EXTENT_DATA_KEY;
2313 key.offset = start;
2314
2315 if (merge) {
2316 struct btrfs_file_extent_item *fi;
2317 u64 extent_len;
2318 struct btrfs_key found_key;
2319
2320 ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
2321 if (ret < 0)
2322 goto out_free_path;
2323
2324 path->slots[0]--;
2325 leaf = path->nodes[0];
2326 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2327
2328 fi = btrfs_item_ptr(leaf, path->slots[0],
2329 struct btrfs_file_extent_item);
2330 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2331
2332 if (relink_is_mergable(leaf, fi, new->bytenr) &&
2333 extent_len + found_key.offset == start) {
2334 btrfs_set_file_extent_num_bytes(leaf, fi,
2335 extent_len + len);
2336 btrfs_mark_buffer_dirty(leaf);
2337 inode_add_bytes(inode, len);
2338
2339 ret = 1;
2340 goto out_free_path;
2341 } else {
2342 merge = false;
2343 btrfs_release_path(path);
2344 goto again;
2345 }
2346 }
2347
2348 ret = btrfs_insert_empty_item(trans, root, path, &key,
2349 sizeof(*extent));
2350 if (ret) {
2351 btrfs_abort_transaction(trans, root, ret);
2352 goto out_free_path;
2353 }
2354
2355 leaf = path->nodes[0];
2356 item = btrfs_item_ptr(leaf, path->slots[0],
2357 struct btrfs_file_extent_item);
2358 btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2359 btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2360 btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2361 btrfs_set_file_extent_num_bytes(leaf, item, len);
2362 btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2363 btrfs_set_file_extent_generation(leaf, item, trans->transid);
2364 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2365 btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2366 btrfs_set_file_extent_encryption(leaf, item, 0);
2367 btrfs_set_file_extent_other_encoding(leaf, item, 0);
2368
2369 btrfs_mark_buffer_dirty(leaf);
2370 inode_add_bytes(inode, len);
2371
2372 ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2373 new->disk_len, 0,
2374 backref->root_id, backref->inum,
2375 new->file_pos, 0); /* start - extent_offset */
2376 if (ret) {
2377 btrfs_abort_transaction(trans, root, ret);
2378 goto out_free_path;
2379 }
2380
2381 ret = 1;
2382out_free_path:
2383 btrfs_release_path(path);
2384 btrfs_end_transaction(trans, root);
2385out_unlock:
2386 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2387 &cached, GFP_NOFS);
2388 iput(inode);
2389 return ret;
2390}
2391
2392static void relink_file_extents(struct new_sa_defrag_extent *new)
2393{
2394 struct btrfs_path *path;
2395 struct old_sa_defrag_extent *old, *tmp;
2396 struct sa_defrag_extent_backref *backref;
2397 struct sa_defrag_extent_backref *prev = NULL;
2398 struct inode *inode;
2399 struct btrfs_root *root;
2400 struct rb_node *node;
2401 int ret;
2402
2403 inode = new->inode;
2404 root = BTRFS_I(inode)->root;
2405
2406 path = btrfs_alloc_path();
2407 if (!path)
2408 return;
2409
2410 if (!record_extent_backrefs(path, new)) {
2411 btrfs_free_path(path);
2412 goto out;
2413 }
2414 btrfs_release_path(path);
2415
2416 while (1) {
2417 node = rb_first(&new->root);
2418 if (!node)
2419 break;
2420 rb_erase(node, &new->root);
2421
2422 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2423
2424 ret = relink_extent_backref(path, prev, backref);
2425 WARN_ON(ret < 0);
2426
2427 kfree(prev);
2428
2429 if (ret == 1)
2430 prev = backref;
2431 else
2432 prev = NULL;
2433 cond_resched();
2434 }
2435 kfree(prev);
2436
2437 btrfs_free_path(path);
2438
2439 list_for_each_entry_safe(old, tmp, &new->head, list) {
2440 list_del(&old->list);
2441 kfree(old);
2442 }
2443out:
2444 atomic_dec(&root->fs_info->defrag_running);
2445 wake_up(&root->fs_info->transaction_wait);
2446
2447 kfree(new);
2448}
2449
2450static struct new_sa_defrag_extent *
2451record_old_file_extents(struct inode *inode,
2452 struct btrfs_ordered_extent *ordered)
2453{
2454 struct btrfs_root *root = BTRFS_I(inode)->root;
2455 struct btrfs_path *path;
2456 struct btrfs_key key;
2457 struct old_sa_defrag_extent *old, *tmp;
2458 struct new_sa_defrag_extent *new;
2459 int ret;
2460
2461 new = kmalloc(sizeof(*new), GFP_NOFS);
2462 if (!new)
2463 return NULL;
2464
2465 new->inode = inode;
2466 new->file_pos = ordered->file_offset;
2467 new->len = ordered->len;
2468 new->bytenr = ordered->start;
2469 new->disk_len = ordered->disk_len;
2470 new->compress_type = ordered->compress_type;
2471 new->root = RB_ROOT;
2472 INIT_LIST_HEAD(&new->head);
2473
2474 path = btrfs_alloc_path();
2475 if (!path)
2476 goto out_kfree;
2477
2478 key.objectid = btrfs_ino(inode);
2479 key.type = BTRFS_EXTENT_DATA_KEY;
2480 key.offset = new->file_pos;
2481
2482 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2483 if (ret < 0)
2484 goto out_free_path;
2485 if (ret > 0 && path->slots[0] > 0)
2486 path->slots[0]--;
2487
2488 /* find out all the old extents for the file range */
2489 while (1) {
2490 struct btrfs_file_extent_item *extent;
2491 struct extent_buffer *l;
2492 int slot;
2493 u64 num_bytes;
2494 u64 offset;
2495 u64 end;
2496 u64 disk_bytenr;
2497 u64 extent_offset;
2498
2499 l = path->nodes[0];
2500 slot = path->slots[0];
2501
2502 if (slot >= btrfs_header_nritems(l)) {
2503 ret = btrfs_next_leaf(root, path);
2504 if (ret < 0)
2505 goto out_free_list;
2506 else if (ret > 0)
2507 break;
2508 continue;
2509 }
2510
2511 btrfs_item_key_to_cpu(l, &key, slot);
2512
2513 if (key.objectid != btrfs_ino(inode))
2514 break;
2515 if (key.type != BTRFS_EXTENT_DATA_KEY)
2516 break;
2517 if (key.offset >= new->file_pos + new->len)
2518 break;
2519
2520 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2521
2522 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2523 if (key.offset + num_bytes < new->file_pos)
2524 goto next;
2525
2526 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2527 if (!disk_bytenr)
2528 goto next;
2529
2530 extent_offset = btrfs_file_extent_offset(l, extent);
2531
2532 old = kmalloc(sizeof(*old), GFP_NOFS);
2533 if (!old)
2534 goto out_free_list;
2535
2536 offset = max(new->file_pos, key.offset);
2537 end = min(new->file_pos + new->len, key.offset + num_bytes);
2538
2539 old->bytenr = disk_bytenr;
2540 old->extent_offset = extent_offset;
2541 old->offset = offset - key.offset;
2542 old->len = end - offset;
2543 old->new = new;
2544 old->count = 0;
2545 list_add_tail(&old->list, &new->head);
2546next:
2547 path->slots[0]++;
2548 cond_resched();
2549 }
2550
2551 btrfs_free_path(path);
2552 atomic_inc(&root->fs_info->defrag_running);
2553
2554 return new;
2555
2556out_free_list:
2557 list_for_each_entry_safe(old, tmp, &new->head, list) {
2558 list_del(&old->list);
2559 kfree(old);
2560 }
2561out_free_path:
2562 btrfs_free_path(path);
2563out_kfree:
2564 kfree(new);
2565 return NULL;
2566}
2567
1895/* 2568/*
1896 * helper function for btrfs_finish_ordered_io, this 2569 * helper function for btrfs_finish_ordered_io, this
1897 * just reads in some of the csum leaves to prime them into ram 2570 * just reads in some of the csum leaves to prime them into ram
@@ -1909,6 +2582,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1909 struct btrfs_trans_handle *trans = NULL; 2582 struct btrfs_trans_handle *trans = NULL;
1910 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2583 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1911 struct extent_state *cached_state = NULL; 2584 struct extent_state *cached_state = NULL;
2585 struct new_sa_defrag_extent *new = NULL;
1912 int compress_type = 0; 2586 int compress_type = 0;
1913 int ret; 2587 int ret;
1914 bool nolock; 2588 bool nolock;
@@ -1943,6 +2617,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1943 ordered_extent->file_offset + ordered_extent->len - 1, 2617 ordered_extent->file_offset + ordered_extent->len - 1,
1944 0, &cached_state); 2618 0, &cached_state);
1945 2619
2620 ret = test_range_bit(io_tree, ordered_extent->file_offset,
2621 ordered_extent->file_offset + ordered_extent->len - 1,
2622 EXTENT_DEFRAG, 1, cached_state);
2623 if (ret) {
2624 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2625 if (last_snapshot >= BTRFS_I(inode)->generation)
2626 /* the inode is shared */
2627 new = record_old_file_extents(inode, ordered_extent);
2628
2629 clear_extent_bit(io_tree, ordered_extent->file_offset,
2630 ordered_extent->file_offset + ordered_extent->len - 1,
2631 EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2632 }
2633
1946 if (nolock) 2634 if (nolock)
1947 trans = btrfs_join_transaction_nolock(root); 2635 trans = btrfs_join_transaction_nolock(root);
1948 else 2636 else
@@ -2001,17 +2689,33 @@ out:
2001 if (trans) 2689 if (trans)
2002 btrfs_end_transaction(trans, root); 2690 btrfs_end_transaction(trans, root);
2003 2691
2004 if (ret) 2692 if (ret) {
2005 clear_extent_uptodate(io_tree, ordered_extent->file_offset, 2693 clear_extent_uptodate(io_tree, ordered_extent->file_offset,
2006 ordered_extent->file_offset + 2694 ordered_extent->file_offset +
2007 ordered_extent->len - 1, NULL, GFP_NOFS); 2695 ordered_extent->len - 1, NULL, GFP_NOFS);
2008 2696
2697 /*
2698 * If the ordered extent had an IOERR or something else went
2699 * wrong we need to return the space for this ordered extent
2700 * back to the allocator.
2701 */
2702 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2703 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2704 btrfs_free_reserved_extent(root, ordered_extent->start,
2705 ordered_extent->disk_len);
2706 }
2707
2708
2009 /* 2709 /*
2010 * This needs to be done to make sure anybody waiting knows we are done 2710 * This needs to be done to make sure anybody waiting knows we are done
2011 * updating everything for this ordered extent. 2711 * updating everything for this ordered extent.
2012 */ 2712 */
2013 btrfs_remove_ordered_extent(inode, ordered_extent); 2713 btrfs_remove_ordered_extent(inode, ordered_extent);
2014 2714
2715 /* for snapshot-aware defrag */
2716 if (new)
2717 relink_file_extents(new);
2718
2015 /* once for us */ 2719 /* once for us */
2016 btrfs_put_ordered_extent(ordered_extent); 2720 btrfs_put_ordered_extent(ordered_extent);
2017 /* once for the tree */ 2721 /* once for the tree */
@@ -2062,7 +2766,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2062static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 2766static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
2063 struct extent_state *state, int mirror) 2767 struct extent_state *state, int mirror)
2064{ 2768{
2065 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); 2769 size_t offset = start - page_offset(page);
2066 struct inode *inode = page->mapping->host; 2770 struct inode *inode = page->mapping->host;
2067 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2771 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2068 char *kaddr; 2772 char *kaddr;
@@ -2167,11 +2871,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2167 } 2871 }
2168} 2872}
2169 2873
2170enum btrfs_orphan_cleanup_state {
2171 ORPHAN_CLEANUP_STARTED = 1,
2172 ORPHAN_CLEANUP_DONE = 2,
2173};
2174
2175/* 2874/*
2176 * This is called in transaction commit time. If there are no orphan 2875 * This is called in transaction commit time. If there are no orphan
2177 * files in the subvolume, it removes orphan item and frees block_rsv 2876 * files in the subvolume, it removes orphan item and frees block_rsv
@@ -2469,6 +3168,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2469 */ 3168 */
2470 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3169 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2471 &BTRFS_I(inode)->runtime_flags); 3170 &BTRFS_I(inode)->runtime_flags);
3171 atomic_inc(&root->orphan_inodes);
2472 3172
2473 /* if we have links, this was a truncate, lets do that */ 3173 /* if we have links, this was a truncate, lets do that */
2474 if (inode->i_nlink) { 3174 if (inode->i_nlink) {
@@ -2491,6 +3191,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2491 goto out; 3191 goto out;
2492 3192
2493 ret = btrfs_truncate(inode); 3193 ret = btrfs_truncate(inode);
3194 if (ret)
3195 btrfs_orphan_del(NULL, inode);
2494 } else { 3196 } else {
2495 nr_unlink++; 3197 nr_unlink++;
2496 } 3198 }
@@ -2709,34 +3411,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2709 struct btrfs_inode_item *item, 3411 struct btrfs_inode_item *item,
2710 struct inode *inode) 3412 struct inode *inode)
2711{ 3413{
2712 btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); 3414 struct btrfs_map_token token;
2713 btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); 3415
2714 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 3416 btrfs_init_map_token(&token);
2715 btrfs_set_inode_mode(leaf, item, inode->i_mode); 3417
2716 btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 3418 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3419 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3420 btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3421 &token);
3422 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3423 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
2717 3424
2718 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), 3425 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
2719 inode->i_atime.tv_sec); 3426 inode->i_atime.tv_sec, &token);
2720 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), 3427 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
2721 inode->i_atime.tv_nsec); 3428 inode->i_atime.tv_nsec, &token);
2722 3429
2723 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), 3430 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
2724 inode->i_mtime.tv_sec); 3431 inode->i_mtime.tv_sec, &token);
2725 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), 3432 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
2726 inode->i_mtime.tv_nsec); 3433 inode->i_mtime.tv_nsec, &token);
2727 3434
2728 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), 3435 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
2729 inode->i_ctime.tv_sec); 3436 inode->i_ctime.tv_sec, &token);
2730 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), 3437 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
2731 inode->i_ctime.tv_nsec); 3438 inode->i_ctime.tv_nsec, &token);
2732 3439
2733 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 3440 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
2734 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 3441 &token);
2735 btrfs_set_inode_sequence(leaf, item, inode->i_version); 3442 btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
2736 btrfs_set_inode_transid(leaf, item, trans->transid); 3443 &token);
2737 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 3444 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
2738 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 3445 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
2739 btrfs_set_inode_block_group(leaf, item, 0); 3446 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3447 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3448 btrfs_set_token_inode_block_group(leaf, item, 0, &token);
2740} 3449}
2741 3450
2742/* 3451/*
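
fill_inode_item() now uses the btrfs_set_token_* accessors, which thread a btrfs_map_token through a run of stores so that consecutive fields in the same metadata block reuse one cached mapping instead of re-resolving the page for every set. A plain C model of that caching idea (the struct and helpers below are invented for illustration, not the real btrfs accessor code):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define BLOCK_SIZE 4096

/* toy "metadata block" we pretend has to be mapped before each access */
struct buffer {
        unsigned char data[2 * BLOCK_SIZE];
};

struct map_token {
        struct buffer *buf;
        unsigned char *kaddr;     /* cached mapping               */
        size_t start;             /* offset the mapping begins at */
};

static int map_count;             /* how often we had to (re)map  */

static unsigned char *map_block(struct buffer *buf, size_t offset)
{
        map_count++;
        return buf->data + (offset & ~(size_t)(BLOCK_SIZE - 1));
}

/* store a u32 at 'offset', reusing the cached mapping when it still fits */
static void set_u32_token(struct map_token *tok, struct buffer *buf,
                          size_t offset, uint32_t val)
{
        if (tok->buf != buf || offset < tok->start ||
            offset + sizeof(val) > tok->start + BLOCK_SIZE) {
                tok->buf = buf;
                tok->start = offset & ~(size_t)(BLOCK_SIZE - 1);
                tok->kaddr = map_block(buf, offset);
        }
        memcpy(tok->kaddr + (offset - tok->start), &val, sizeof(val));
}

int main(void)
{
        struct buffer eb = { { 0 } };
        struct map_token tok = { 0 };

        /* a run of neighbouring fields, like fill_inode_item() */
        for (size_t off = 0; off < 40; off += 4)
                set_u32_token(&tok, &eb, off, (uint32_t)off);

        printf("10 stores, %d mapping(s)\n", map_count);
        return 0;
}
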
@@ -3304,7 +4013,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3304 u64 extent_num_bytes = 0; 4013 u64 extent_num_bytes = 0;
3305 u64 extent_offset = 0; 4014 u64 extent_offset = 0;
3306 u64 item_end = 0; 4015 u64 item_end = 0;
3307 u64 mask = root->sectorsize - 1;
3308 u32 found_type = (u8)-1; 4016 u32 found_type = (u8)-1;
3309 int found_extent; 4017 int found_extent;
3310 int del_item; 4018 int del_item;
@@ -3328,7 +4036,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3328 * extent just the way it is. 4036 * extent just the way it is.
3329 */ 4037 */
3330 if (root->ref_cows || root == root->fs_info->tree_root) 4038 if (root->ref_cows || root == root->fs_info->tree_root)
3331 btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); 4039 btrfs_drop_extent_cache(inode, ALIGN(new_size,
4040 root->sectorsize), (u64)-1, 0);
3332 4041
3333 /* 4042 /*
3334 * This function is also used to drop the items in the log tree before 4043 * This function is also used to drop the items in the log tree before
@@ -3407,10 +4116,9 @@ search_again:
3407 if (!del_item) { 4116 if (!del_item) {
3408 u64 orig_num_bytes = 4117 u64 orig_num_bytes =
3409 btrfs_file_extent_num_bytes(leaf, fi); 4118 btrfs_file_extent_num_bytes(leaf, fi);
3410 extent_num_bytes = new_size - 4119 extent_num_bytes = ALIGN(new_size -
3411 found_key.offset + root->sectorsize - 1; 4120 found_key.offset,
3412 extent_num_bytes = extent_num_bytes & 4121 root->sectorsize);
3413 ~((u64)root->sectorsize - 1);
3414 btrfs_set_file_extent_num_bytes(leaf, fi, 4122 btrfs_set_file_extent_num_bytes(leaf, fi,
3415 extent_num_bytes); 4123 extent_num_bytes);
3416 num_dec = (orig_num_bytes - 4124 num_dec = (orig_num_bytes -
@@ -3646,9 +4354,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3646 struct extent_map *em = NULL; 4354 struct extent_map *em = NULL;
3647 struct extent_state *cached_state = NULL; 4355 struct extent_state *cached_state = NULL;
3648 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4356 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3649 u64 mask = root->sectorsize - 1; 4357 u64 hole_start = ALIGN(oldsize, root->sectorsize);
3650 u64 hole_start = (oldsize + mask) & ~mask; 4358 u64 block_end = ALIGN(size, root->sectorsize);
3651 u64 block_end = (size + mask) & ~mask;
3652 u64 last_byte; 4359 u64 last_byte;
3653 u64 cur_offset; 4360 u64 cur_offset;
3654 u64 hole_size; 4361 u64 hole_size;
@@ -3681,7 +4388,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3681 break; 4388 break;
3682 } 4389 }
3683 last_byte = min(extent_map_end(em), block_end); 4390 last_byte = min(extent_map_end(em), block_end);
3684 last_byte = (last_byte + mask) & ~mask; 4391 last_byte = ALIGN(last_byte , root->sectorsize);
3685 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 4392 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3686 struct extent_map *hole_em; 4393 struct extent_map *hole_em;
3687 hole_size = last_byte - cur_offset; 4394 hole_size = last_byte - cur_offset;
@@ -3832,6 +4539,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
3832 4539
3833 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 4540 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3834 truncate_setsize(inode, newsize); 4541 truncate_setsize(inode, newsize);
4542
 4543 /* Disable nonlocked read DIO to avoid the endless truncate */
4544 btrfs_inode_block_unlocked_dio(inode);
4545 inode_dio_wait(inode);
4546 btrfs_inode_resume_unlocked_dio(inode);
4547
3835 ret = btrfs_truncate(inode); 4548 ret = btrfs_truncate(inode);
3836 if (ret && inode->i_nlink) 4549 if (ret && inode->i_nlink)
3837 btrfs_orphan_del(NULL, inode); 4550 btrfs_orphan_del(NULL, inode);
@@ -3904,6 +4617,12 @@ void btrfs_evict_inode(struct inode *inode)
3904 goto no_delete; 4617 goto no_delete;
3905 } 4618 }
3906 4619
4620 ret = btrfs_commit_inode_delayed_inode(inode);
4621 if (ret) {
4622 btrfs_orphan_del(NULL, inode);
4623 goto no_delete;
4624 }
4625
3907 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 4626 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3908 if (!rsv) { 4627 if (!rsv) {
3909 btrfs_orphan_del(NULL, inode); 4628 btrfs_orphan_del(NULL, inode);
@@ -3941,7 +4660,7 @@ void btrfs_evict_inode(struct inode *inode)
3941 goto no_delete; 4660 goto no_delete;
3942 } 4661 }
3943 4662
3944 trans = btrfs_start_transaction_lflush(root, 1); 4663 trans = btrfs_join_transaction(root);
3945 if (IS_ERR(trans)) { 4664 if (IS_ERR(trans)) {
3946 btrfs_orphan_del(NULL, inode); 4665 btrfs_orphan_del(NULL, inode);
3947 btrfs_free_block_rsv(root, rsv); 4666 btrfs_free_block_rsv(root, rsv);
@@ -3955,9 +4674,6 @@ void btrfs_evict_inode(struct inode *inode)
3955 break; 4674 break;
3956 4675
3957 trans->block_rsv = &root->fs_info->trans_block_rsv; 4676 trans->block_rsv = &root->fs_info->trans_block_rsv;
3958 ret = btrfs_update_inode(trans, root, inode);
3959 BUG_ON(ret);
3960
3961 btrfs_end_transaction(trans, root); 4677 btrfs_end_transaction(trans, root);
3962 trans = NULL; 4678 trans = NULL;
3963 btrfs_btree_balance_dirty(root); 4679 btrfs_btree_balance_dirty(root);
@@ -4854,7 +5570,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4854 if (btrfs_test_opt(root, NODATASUM)) 5570 if (btrfs_test_opt(root, NODATASUM))
4855 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 5571 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4856 if (btrfs_test_opt(root, NODATACOW)) 5572 if (btrfs_test_opt(root, NODATACOW))
4857 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 5573 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
5574 BTRFS_INODE_NODATASUM;
4858 } 5575 }
4859 5576
4860 insert_inode_hash(inode); 5577 insert_inode_hash(inode);
@@ -5006,12 +5723,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5006 goto out_unlock; 5723 goto out_unlock;
5007 } 5724 }
5008 5725
5009 err = btrfs_update_inode(trans, root, inode);
5010 if (err) {
5011 drop_inode = 1;
5012 goto out_unlock;
5013 }
5014
5015 /* 5726 /*
5016 * If the active LSM wants to access the inode during 5727 * If the active LSM wants to access the inode during
5017 * d_instantiate it needs these. Smack checks to see 5728 * d_instantiate it needs these. Smack checks to see
@@ -5396,8 +6107,7 @@ again:
5396 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6107 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
5397 size_t size; 6108 size_t size;
5398 size = btrfs_file_extent_inline_len(leaf, item); 6109 size = btrfs_file_extent_inline_len(leaf, item);
5399 extent_end = (extent_start + size + root->sectorsize - 1) & 6110 extent_end = ALIGN(extent_start + size, root->sectorsize);
5400 ~((u64)root->sectorsize - 1);
5401 } 6111 }
5402 6112
5403 if (start >= extent_end) { 6113 if (start >= extent_end) {
@@ -5469,8 +6179,7 @@ again:
5469 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 6179 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
5470 size - extent_offset); 6180 size - extent_offset);
5471 em->start = extent_start + extent_offset; 6181 em->start = extent_start + extent_offset;
5472 em->len = (copy_size + root->sectorsize - 1) & 6182 em->len = ALIGN(copy_size, root->sectorsize);
5473 ~((u64)root->sectorsize - 1);
5474 em->orig_block_len = em->len; 6183 em->orig_block_len = em->len;
5475 em->orig_start = em->start; 6184 em->orig_start = em->start;
5476 if (compress_type) { 6185 if (compress_type) {
@@ -5949,6 +6658,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5949 6658
5950 em->start = start; 6659 em->start = start;
5951 em->orig_start = orig_start; 6660 em->orig_start = orig_start;
6661 em->mod_start = start;
6662 em->mod_len = len;
5952 em->len = len; 6663 em->len = len;
5953 em->block_len = block_len; 6664 em->block_len = block_len;
5954 em->block_start = block_start; 6665 em->block_start = block_start;
@@ -5990,16 +6701,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5990 u64 len = bh_result->b_size; 6701 u64 len = bh_result->b_size;
5991 struct btrfs_trans_handle *trans; 6702 struct btrfs_trans_handle *trans;
5992 int unlock_bits = EXTENT_LOCKED; 6703 int unlock_bits = EXTENT_LOCKED;
5993 int ret; 6704 int ret = 0;
5994 6705
5995 if (create) { 6706 if (create)
5996 ret = btrfs_delalloc_reserve_space(inode, len);
5997 if (ret)
5998 return ret;
5999 unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; 6707 unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
6000 } else { 6708 else
6001 len = min_t(u64, len, root->sectorsize); 6709 len = min_t(u64, len, root->sectorsize);
6002 }
6003 6710
6004 lockstart = start; 6711 lockstart = start;
6005 lockend = start + len - 1; 6712 lockend = start + len - 1;
@@ -6011,14 +6718,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6011 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) 6718 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
6012 return -ENOTBLK; 6719 return -ENOTBLK;
6013 6720
6014 if (create) {
6015 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6016 lockend, EXTENT_DELALLOC, NULL,
6017 &cached_state, GFP_NOFS);
6018 if (ret)
6019 goto unlock_err;
6020 }
6021
6022 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 6721 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
6023 if (IS_ERR(em)) { 6722 if (IS_ERR(em)) {
6024 ret = PTR_ERR(em); 6723 ret = PTR_ERR(em);
@@ -6050,7 +6749,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6050 if (!create && (em->block_start == EXTENT_MAP_HOLE || 6749 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
6051 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 6750 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
6052 free_extent_map(em); 6751 free_extent_map(em);
6053 ret = 0;
6054 goto unlock_err; 6752 goto unlock_err;
6055 } 6753 }
6056 6754
@@ -6148,6 +6846,15 @@ unlock:
6148 */ 6846 */
6149 if (start + len > i_size_read(inode)) 6847 if (start + len > i_size_read(inode))
6150 i_size_write(inode, start + len); 6848 i_size_write(inode, start + len);
6849
6850 spin_lock(&BTRFS_I(inode)->lock);
6851 BTRFS_I(inode)->outstanding_extents++;
6852 spin_unlock(&BTRFS_I(inode)->lock);
6853
6854 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6855 lockstart + len - 1, EXTENT_DELALLOC, NULL,
6856 &cached_state, GFP_NOFS);
6857 BUG_ON(ret);
6151 } 6858 }
6152 6859
6153 /* 6860 /*
@@ -6156,24 +6863,9 @@ unlock:
6156 * aren't using if there is any left over space. 6863 * aren't using if there is any left over space.
6157 */ 6864 */
6158 if (lockstart < lockend) { 6865 if (lockstart < lockend) {
6159 if (create && len < lockend - lockstart) { 6866 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6160 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6867 lockend, unlock_bits, 1, 0,
6161 lockstart + len - 1, 6868 &cached_state, GFP_NOFS);
6162 unlock_bits | EXTENT_DEFRAG, 1, 0,
6163 &cached_state, GFP_NOFS);
6164 /*
6165 * Beside unlock, we also need to cleanup reserved space
6166 * for the left range by attaching EXTENT_DO_ACCOUNTING.
6167 */
6168 clear_extent_bit(&BTRFS_I(inode)->io_tree,
6169 lockstart + len, lockend,
6170 unlock_bits | EXTENT_DO_ACCOUNTING |
6171 EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
6172 } else {
6173 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6174 lockend, unlock_bits, 1, 0,
6175 &cached_state, GFP_NOFS);
6176 }
6177 } else { 6869 } else {
6178 free_extent_state(cached_state); 6870 free_extent_state(cached_state);
6179 } 6871 }
@@ -6183,9 +6875,6 @@ unlock:
6183 return 0; 6875 return 0;
6184 6876
6185unlock_err: 6877unlock_err:
6186 if (create)
6187 unlock_bits |= EXTENT_DO_ACCOUNTING;
6188
6189 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6878 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6190 unlock_bits, 1, 0, &cached_state, GFP_NOFS); 6879 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
6191 return ret; 6880 return ret;
@@ -6426,19 +7115,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6426 int async_submit = 0; 7115 int async_submit = 0;
6427 7116
6428 map_length = orig_bio->bi_size; 7117 map_length = orig_bio->bi_size;
6429 ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, 7118 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
6430 &map_length, NULL, 0); 7119 &map_length, NULL, 0);
6431 if (ret) { 7120 if (ret) {
6432 bio_put(orig_bio); 7121 bio_put(orig_bio);
6433 return -EIO; 7122 return -EIO;
6434 } 7123 }
6435
6436 if (map_length >= orig_bio->bi_size) { 7124 if (map_length >= orig_bio->bi_size) {
6437 bio = orig_bio; 7125 bio = orig_bio;
6438 goto submit; 7126 goto submit;
6439 } 7127 }
6440 7128
6441 async_submit = 1; 7129 /* async crcs make it difficult to collect full stripe writes. */
7130 if (btrfs_get_alloc_profile(root, 1) &
7131 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
7132 async_submit = 0;
7133 else
7134 async_submit = 1;
7135
6442 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 7136 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
6443 if (!bio) 7137 if (!bio)
6444 return -ENOMEM; 7138 return -ENOMEM;
@@ -6480,7 +7174,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6480 bio->bi_end_io = btrfs_end_dio_bio; 7174 bio->bi_end_io = btrfs_end_dio_bio;
6481 7175
6482 map_length = orig_bio->bi_size; 7176 map_length = orig_bio->bi_size;
6483 ret = btrfs_map_block(root->fs_info, READ, 7177 ret = btrfs_map_block(root->fs_info, rw,
6484 start_sector << 9, 7178 start_sector << 9,
6485 &map_length, NULL, 0); 7179 &map_length, NULL, 0);
6486 if (ret) { 7180 if (ret) {
@@ -6623,15 +7317,60 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
6623{ 7317{
6624 struct file *file = iocb->ki_filp; 7318 struct file *file = iocb->ki_filp;
6625 struct inode *inode = file->f_mapping->host; 7319 struct inode *inode = file->f_mapping->host;
7320 size_t count = 0;
7321 int flags = 0;
7322 bool wakeup = true;
7323 bool relock = false;
7324 ssize_t ret;
6626 7325
6627 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, 7326 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
6628 offset, nr_segs)) 7327 offset, nr_segs))
6629 return 0; 7328 return 0;
6630 7329
6631 return __blockdev_direct_IO(rw, iocb, inode, 7330 atomic_inc(&inode->i_dio_count);
6632 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, 7331 smp_mb__after_atomic_inc();
6633 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, 7332
6634 btrfs_submit_direct, 0); 7333 if (rw & WRITE) {
7334 count = iov_length(iov, nr_segs);
7335 /*
 7336	 * If the write DIO is beyond the EOF, we need to update
 7337	 * the isize, but it is protected by i_mutex. So we can
 7338	 * not unlock the i_mutex in that case.
7339 */
7340 if (offset + count <= inode->i_size) {
7341 mutex_unlock(&inode->i_mutex);
7342 relock = true;
7343 }
7344 ret = btrfs_delalloc_reserve_space(inode, count);
7345 if (ret)
7346 goto out;
7347 } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
7348 &BTRFS_I(inode)->runtime_flags))) {
7349 inode_dio_done(inode);
7350 flags = DIO_LOCKING | DIO_SKIP_HOLES;
7351 wakeup = false;
7352 }
7353
7354 ret = __blockdev_direct_IO(rw, iocb, inode,
7355 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
7356 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
7357 btrfs_submit_direct, flags);
7358 if (rw & WRITE) {
7359 if (ret < 0 && ret != -EIOCBQUEUED)
7360 btrfs_delalloc_release_space(inode, count);
7361 else if (ret >= 0 && (size_t)ret < count)
7362 btrfs_delalloc_release_space(inode,
7363 count - (size_t)ret);
7364 else
7365 btrfs_delalloc_release_metadata(inode, 0);
7366 }
7367out:
7368 if (wakeup)
7369 inode_dio_done(inode);
7370 if (relock)
7371 mutex_lock(&inode->i_mutex);
7372
7373 return ret;
6635} 7374}
6636 7375
6637#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) 7376#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
@@ -6735,8 +7474,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6735 return; 7474 return;
6736 } 7475 }
6737 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 7476 lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
6738 ordered = btrfs_lookup_ordered_extent(inode, 7477 ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
6739 page_offset(page));
6740 if (ordered) { 7478 if (ordered) {
6741 /* 7479 /*
6742 * IO on this page will never be started, so we need 7480 * IO on this page will never be started, so we need
@@ -7216,8 +7954,9 @@ int btrfs_drop_inode(struct inode *inode)
7216{ 7954{
7217 struct btrfs_root *root = BTRFS_I(inode)->root; 7955 struct btrfs_root *root = BTRFS_I(inode)->root;
7218 7956
 7957	/* the snap/subvol tree is being deleted */
7219 if (btrfs_root_refs(&root->root_item) == 0 && 7958 if (btrfs_root_refs(&root->root_item) == 0 &&
7220 !btrfs_is_free_space_inode(inode)) 7959 root != root->fs_info->tree_root)
7221 return 1; 7960 return 1;
7222 else 7961 else
7223 return generic_drop_inode(inode); 7962 return generic_drop_inode(inode);
@@ -7299,40 +8038,22 @@ fail:
7299static int btrfs_getattr(struct vfsmount *mnt, 8038static int btrfs_getattr(struct vfsmount *mnt,
7300 struct dentry *dentry, struct kstat *stat) 8039 struct dentry *dentry, struct kstat *stat)
7301{ 8040{
8041 u64 delalloc_bytes;
7302 struct inode *inode = dentry->d_inode; 8042 struct inode *inode = dentry->d_inode;
7303 u32 blocksize = inode->i_sb->s_blocksize; 8043 u32 blocksize = inode->i_sb->s_blocksize;
7304 8044
7305 generic_fillattr(inode, stat); 8045 generic_fillattr(inode, stat);
7306 stat->dev = BTRFS_I(inode)->root->anon_dev; 8046 stat->dev = BTRFS_I(inode)->root->anon_dev;
7307 stat->blksize = PAGE_CACHE_SIZE; 8047 stat->blksize = PAGE_CACHE_SIZE;
8048
8049 spin_lock(&BTRFS_I(inode)->lock);
8050 delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
8051 spin_unlock(&BTRFS_I(inode)->lock);
7308 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + 8052 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
7309 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; 8053 ALIGN(delalloc_bytes, blocksize)) >> 9;
7310 return 0; 8054 return 0;
7311} 8055}
7312 8056
7313/*
7314 * If a file is moved, it will inherit the cow and compression flags of the new
7315 * directory.
7316 */
7317static void fixup_inode_flags(struct inode *dir, struct inode *inode)
7318{
7319 struct btrfs_inode *b_dir = BTRFS_I(dir);
7320 struct btrfs_inode *b_inode = BTRFS_I(inode);
7321
7322 if (b_dir->flags & BTRFS_INODE_NODATACOW)
7323 b_inode->flags |= BTRFS_INODE_NODATACOW;
7324 else
7325 b_inode->flags &= ~BTRFS_INODE_NODATACOW;
7326
7327 if (b_dir->flags & BTRFS_INODE_COMPRESS) {
7328 b_inode->flags |= BTRFS_INODE_COMPRESS;
7329 b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
7330 } else {
7331 b_inode->flags &= ~(BTRFS_INODE_COMPRESS |
7332 BTRFS_INODE_NOCOMPRESS);
7333 }
7334}
7335
7336static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 8057static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7337 struct inode *new_dir, struct dentry *new_dentry) 8058 struct inode *new_dir, struct dentry *new_dentry)
7338{ 8059{
@@ -7498,8 +8219,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7498 } 8219 }
7499 } 8220 }
7500 8221
7501 fixup_inode_flags(new_dir, old_inode);
7502
7503 ret = btrfs_add_link(trans, new_dir, old_inode, 8222 ret = btrfs_add_link(trans, new_dir, old_inode,
7504 new_dentry->d_name.name, 8223 new_dentry->d_name.name,
7505 new_dentry->d_name.len, 0, index); 8224 new_dentry->d_name.len, 0, index);
@@ -7583,7 +8302,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7583 8302
7584 INIT_LIST_HEAD(&works); 8303 INIT_LIST_HEAD(&works);
7585 INIT_LIST_HEAD(&splice); 8304 INIT_LIST_HEAD(&splice);
7586again: 8305
7587 spin_lock(&root->fs_info->delalloc_lock); 8306 spin_lock(&root->fs_info->delalloc_lock);
7588 list_splice_init(&root->fs_info->delalloc_inodes, &splice); 8307 list_splice_init(&root->fs_info->delalloc_inodes, &splice);
7589 while (!list_empty(&splice)) { 8308 while (!list_empty(&splice)) {
@@ -7593,8 +8312,11 @@ again:
7593 list_del_init(&binode->delalloc_inodes); 8312 list_del_init(&binode->delalloc_inodes);
7594 8313
7595 inode = igrab(&binode->vfs_inode); 8314 inode = igrab(&binode->vfs_inode);
7596 if (!inode) 8315 if (!inode) {
8316 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
8317 &binode->runtime_flags);
7597 continue; 8318 continue;
8319 }
7598 8320
7599 list_add_tail(&binode->delalloc_inodes, 8321 list_add_tail(&binode->delalloc_inodes,
7600 &root->fs_info->delalloc_inodes); 8322 &root->fs_info->delalloc_inodes);
@@ -7619,13 +8341,6 @@ again:
7619 btrfs_wait_and_free_delalloc_work(work); 8341 btrfs_wait_and_free_delalloc_work(work);
7620 } 8342 }
7621 8343
7622 spin_lock(&root->fs_info->delalloc_lock);
7623 if (!list_empty(&root->fs_info->delalloc_inodes)) {
7624 spin_unlock(&root->fs_info->delalloc_lock);
7625 goto again;
7626 }
7627 spin_unlock(&root->fs_info->delalloc_lock);
7628
7629 /* the filemap_flush will queue IO into the worker threads, but 8344 /* the filemap_flush will queue IO into the worker threads, but
7630 * we have to make sure the IO is actually started and that 8345 * we have to make sure the IO is actually started and that
7631 * ordered extents get created before we return 8346 * ordered extents get created before we return
@@ -7801,8 +8516,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7801 } 8516 }
7802 } 8517 }
7803 8518
7804 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, 8519 ret = btrfs_reserve_extent(trans, root,
7805 0, *alloc_hint, &ins, 1); 8520 min(num_bytes, 256ULL * 1024 * 1024),
8521 min_size, 0, *alloc_hint, &ins, 1);
7806 if (ret) { 8522 if (ret) {
7807 if (own_trans) 8523 if (own_trans)
7808 btrfs_end_transaction(trans, root); 8524 btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c3f09f71bedd..c83086fdda05 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -42,12 +42,12 @@
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/blkdev.h> 43#include <linux/blkdev.h>
44#include <linux/uuid.h> 44#include <linux/uuid.h>
45#include <linux/btrfs.h>
45#include "compat.h" 46#include "compat.h"
46#include "ctree.h" 47#include "ctree.h"
47#include "disk-io.h" 48#include "disk-io.h"
48#include "transaction.h" 49#include "transaction.h"
49#include "btrfs_inode.h" 50#include "btrfs_inode.h"
50#include "ioctl.h"
51#include "print-tree.h" 51#include "print-tree.h"
52#include "volumes.h" 52#include "volumes.h"
53#include "locking.h" 53#include "locking.h"
@@ -363,46 +363,52 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
363 return 0; 363 return 0;
364} 364}
365 365
366static noinline int create_subvol(struct btrfs_root *root, 366static noinline int create_subvol(struct inode *dir,
367 struct dentry *dentry, 367 struct dentry *dentry,
368 char *name, int namelen, 368 char *name, int namelen,
369 u64 *async_transid, 369 u64 *async_transid,
370 struct btrfs_qgroup_inherit **inherit) 370 struct btrfs_qgroup_inherit *inherit)
371{ 371{
372 struct btrfs_trans_handle *trans; 372 struct btrfs_trans_handle *trans;
373 struct btrfs_key key; 373 struct btrfs_key key;
374 struct btrfs_root_item root_item; 374 struct btrfs_root_item root_item;
375 struct btrfs_inode_item *inode_item; 375 struct btrfs_inode_item *inode_item;
376 struct extent_buffer *leaf; 376 struct extent_buffer *leaf;
377 struct btrfs_root *root = BTRFS_I(dir)->root;
377 struct btrfs_root *new_root; 378 struct btrfs_root *new_root;
378 struct dentry *parent = dentry->d_parent; 379 struct btrfs_block_rsv block_rsv;
379 struct inode *dir;
380 struct timespec cur_time = CURRENT_TIME; 380 struct timespec cur_time = CURRENT_TIME;
381 int ret; 381 int ret;
382 int err; 382 int err;
383 u64 objectid; 383 u64 objectid;
384 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 384 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
385 u64 index = 0; 385 u64 index = 0;
386 u64 qgroup_reserved;
386 uuid_le new_uuid; 387 uuid_le new_uuid;
387 388
388 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); 389 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
389 if (ret) 390 if (ret)
390 return ret; 391 return ret;
391 392
392 dir = parent->d_inode; 393 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
393
394 /* 394 /*
 395	 * 1 - inode item 395	 * This is the same as snapshot creation; please see the comment
 396	 * 2 - refs 396	 * in create_snapshot().
397 * 1 - root item
398 * 2 - dir items
399 */ 397 */
400 trans = btrfs_start_transaction(root, 6); 398 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
401 if (IS_ERR(trans)) 399 7, &qgroup_reserved);
402 return PTR_ERR(trans); 400 if (ret)
401 return ret;
402
403 trans = btrfs_start_transaction(root, 0);
404 if (IS_ERR(trans)) {
405 ret = PTR_ERR(trans);
406 goto out;
407 }
408 trans->block_rsv = &block_rsv;
409 trans->bytes_reserved = block_rsv.size;
403 410
404 ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, 411 ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
405 inherit ? *inherit : NULL);
406 if (ret) 412 if (ret)
407 goto fail; 413 goto fail;
408 414
@@ -516,6 +522,8 @@ static noinline int create_subvol(struct btrfs_root *root,
516 BUG_ON(ret); 522 BUG_ON(ret);
517 523
518fail: 524fail:
525 trans->block_rsv = NULL;
526 trans->bytes_reserved = 0;
519 if (async_transid) { 527 if (async_transid) {
520 *async_transid = trans->transid; 528 *async_transid = trans->transid;
521 err = btrfs_commit_transaction_async(trans, root, 1); 529 err = btrfs_commit_transaction_async(trans, root, 1);
@@ -527,13 +535,15 @@ fail:
527 535
528 if (!ret) 536 if (!ret)
529 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 537 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
530 538out:
539 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
531 return ret; 540 return ret;
532} 541}
533 542
534static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 543static int create_snapshot(struct btrfs_root *root, struct inode *dir,
535 char *name, int namelen, u64 *async_transid, 544 struct dentry *dentry, char *name, int namelen,
536 bool readonly, struct btrfs_qgroup_inherit **inherit) 545 u64 *async_transid, bool readonly,
546 struct btrfs_qgroup_inherit *inherit)
537{ 547{
538 struct inode *inode; 548 struct inode *inode;
539 struct btrfs_pending_snapshot *pending_snapshot; 549 struct btrfs_pending_snapshot *pending_snapshot;
@@ -549,23 +559,31 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
549 559
550 btrfs_init_block_rsv(&pending_snapshot->block_rsv, 560 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
551 BTRFS_BLOCK_RSV_TEMP); 561 BTRFS_BLOCK_RSV_TEMP);
562 /*
563 * 1 - parent dir inode
564 * 2 - dir entries
565 * 1 - root item
566 * 2 - root ref/backref
567 * 1 - root of snapshot
568 */
569 ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
570 &pending_snapshot->block_rsv, 7,
571 &pending_snapshot->qgroup_reserved);
572 if (ret)
573 goto out;
574
552 pending_snapshot->dentry = dentry; 575 pending_snapshot->dentry = dentry;
553 pending_snapshot->root = root; 576 pending_snapshot->root = root;
554 pending_snapshot->readonly = readonly; 577 pending_snapshot->readonly = readonly;
555 if (inherit) { 578 pending_snapshot->dir = dir;
556 pending_snapshot->inherit = *inherit; 579 pending_snapshot->inherit = inherit;
557 *inherit = NULL; /* take responsibility to free it */
558 }
559 580
560 trans = btrfs_start_transaction(root->fs_info->extent_root, 6); 581 trans = btrfs_start_transaction(root, 0);
561 if (IS_ERR(trans)) { 582 if (IS_ERR(trans)) {
562 ret = PTR_ERR(trans); 583 ret = PTR_ERR(trans);
563 goto fail; 584 goto fail;
564 } 585 }
565 586
566 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
567 BUG_ON(ret);
568
569 spin_lock(&root->fs_info->trans_lock); 587 spin_lock(&root->fs_info->trans_lock);
570 list_add(&pending_snapshot->list, 588 list_add(&pending_snapshot->list,
571 &trans->transaction->pending_snapshots); 589 &trans->transaction->pending_snapshots);
@@ -602,6 +620,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
602 d_instantiate(dentry, inode); 620 d_instantiate(dentry, inode);
603 ret = 0; 621 ret = 0;
604fail: 622fail:
623 btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
624 &pending_snapshot->block_rsv,
625 pending_snapshot->qgroup_reserved);
626out:
605 kfree(pending_snapshot); 627 kfree(pending_snapshot);
606 return ret; 628 return ret;
607} 629}
@@ -695,7 +717,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
695 char *name, int namelen, 717 char *name, int namelen,
696 struct btrfs_root *snap_src, 718 struct btrfs_root *snap_src,
697 u64 *async_transid, bool readonly, 719 u64 *async_transid, bool readonly,
698 struct btrfs_qgroup_inherit **inherit) 720 struct btrfs_qgroup_inherit *inherit)
699{ 721{
700 struct inode *dir = parent->dentry->d_inode; 722 struct inode *dir = parent->dentry->d_inode;
701 struct dentry *dentry; 723 struct dentry *dentry;
@@ -732,11 +754,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
732 goto out_up_read; 754 goto out_up_read;
733 755
734 if (snap_src) { 756 if (snap_src) {
735 error = create_snapshot(snap_src, dentry, name, namelen, 757 error = create_snapshot(snap_src, dir, dentry, name, namelen,
736 async_transid, readonly, inherit); 758 async_transid, readonly, inherit);
737 } else { 759 } else {
738 error = create_subvol(BTRFS_I(dir)->root, dentry, 760 error = create_subvol(dir, dentry, name, namelen,
739 name, namelen, async_transid, inherit); 761 async_transid, inherit);
740 } 762 }
741 if (!error) 763 if (!error)
742 fsnotify_mkdir(dir, dentry); 764 fsnotify_mkdir(dir, dentry);
@@ -818,7 +840,7 @@ static int find_new_extents(struct btrfs_root *root,
818 840
819 while(1) { 841 while(1) {
820 ret = btrfs_search_forward(root, &min_key, &max_key, 842 ret = btrfs_search_forward(root, &min_key, &max_key,
821 path, 0, newer_than); 843 path, newer_than);
822 if (ret != 0) 844 if (ret != 0)
823 goto none; 845 goto none;
824 if (min_key.objectid != ino) 846 if (min_key.objectid != ino)
@@ -1206,6 +1228,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1206 if (!(inode->i_sb->s_flags & MS_ACTIVE)) 1228 if (!(inode->i_sb->s_flags & MS_ACTIVE))
1207 break; 1229 break;
1208 1230
1231 if (btrfs_defrag_cancelled(root->fs_info)) {
1232 printk(KERN_DEBUG "btrfs: defrag_file cancelled\n");
1233 ret = -EAGAIN;
1234 break;
1235 }
1236
1209 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, 1237 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
1210 extent_thresh, &last_len, &skip, 1238 extent_thresh, &last_len, &skip,
1211 &defrag_end, range->flags & 1239 &defrag_end, range->flags &
@@ -1329,9 +1357,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1329 int ret = 0; 1357 int ret = 0;
1330 int mod = 0; 1358 int mod = 0;
1331 1359
1332 if (root->fs_info->sb->s_flags & MS_RDONLY)
1333 return -EROFS;
1334
1335 if (!capable(CAP_SYS_ADMIN)) 1360 if (!capable(CAP_SYS_ADMIN))
1336 return -EPERM; 1361 return -EPERM;
1337 1362
@@ -1363,6 +1388,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1363 *devstr = '\0'; 1388 *devstr = '\0';
1364 devstr = vol_args->name; 1389 devstr = vol_args->name;
1365 devid = simple_strtoull(devstr, &end, 10); 1390 devid = simple_strtoull(devstr, &end, 10);
1391 if (!devid) {
1392 ret = -EINVAL;
1393 goto out_free;
1394 }
1366 printk(KERN_INFO "btrfs: resizing devid %llu\n", 1395 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1367 (unsigned long long)devid); 1396 (unsigned long long)devid);
1368 } 1397 }
@@ -1371,7 +1400,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1371 if (!device) { 1400 if (!device) {
1372 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1401 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1373 (unsigned long long)devid); 1402 (unsigned long long)devid);
1374 ret = -EINVAL; 1403 ret = -ENODEV;
1375 goto out_free; 1404 goto out_free;
1376 } 1405 }
1377 1406
@@ -1379,7 +1408,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1379 printk(KERN_INFO "btrfs: resizer unable to apply on " 1408 printk(KERN_INFO "btrfs: resizer unable to apply on "
1380 "readonly device %llu\n", 1409 "readonly device %llu\n",
1381 (unsigned long long)devid); 1410 (unsigned long long)devid);
1382 ret = -EINVAL; 1411 ret = -EPERM;
1383 goto out_free; 1412 goto out_free;
1384 } 1413 }
1385 1414
@@ -1401,7 +1430,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1401 } 1430 }
1402 1431
1403 if (device->is_tgtdev_for_dev_replace) { 1432 if (device->is_tgtdev_for_dev_replace) {
1404 ret = -EINVAL; 1433 ret = -EPERM;
1405 goto out_free; 1434 goto out_free;
1406 } 1435 }
1407 1436
@@ -1457,7 +1486,7 @@ out:
1457static noinline int btrfs_ioctl_snap_create_transid(struct file *file, 1486static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1458 char *name, unsigned long fd, int subvol, 1487 char *name, unsigned long fd, int subvol,
1459 u64 *transid, bool readonly, 1488 u64 *transid, bool readonly,
1460 struct btrfs_qgroup_inherit **inherit) 1489 struct btrfs_qgroup_inherit *inherit)
1461{ 1490{
1462 int namelen; 1491 int namelen;
1463 int ret = 0; 1492 int ret = 0;
@@ -1566,7 +1595,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1566 1595
1567 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1596 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1568 vol_args->fd, subvol, ptr, 1597 vol_args->fd, subvol, ptr,
1569 readonly, &inherit); 1598 readonly, inherit);
1570 1599
1571 if (ret == 0 && ptr && 1600 if (ret == 0 && ptr &&
1572 copy_to_user(arg + 1601 copy_to_user(arg +
@@ -1863,7 +1892,7 @@ static noinline int search_ioctl(struct inode *inode,
1863 path->keep_locks = 1; 1892 path->keep_locks = 1;
1864 1893
1865 while(1) { 1894 while(1) {
1866 ret = btrfs_search_forward(root, &key, &max_key, path, 0, 1895 ret = btrfs_search_forward(root, &key, &max_key, path,
1867 sk->min_transid); 1896 sk->min_transid);
1868 if (ret != 0) { 1897 if (ret != 0) {
1869 if (ret > 0) 1898 if (ret > 0)
@@ -2035,6 +2064,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2035 struct btrfs_root *dest = NULL; 2064 struct btrfs_root *dest = NULL;
2036 struct btrfs_ioctl_vol_args *vol_args; 2065 struct btrfs_ioctl_vol_args *vol_args;
2037 struct btrfs_trans_handle *trans; 2066 struct btrfs_trans_handle *trans;
2067 struct btrfs_block_rsv block_rsv;
2068 u64 qgroup_reserved;
2038 int namelen; 2069 int namelen;
2039 int ret; 2070 int ret;
2040 int err = 0; 2071 int err = 0;
@@ -2124,12 +2155,23 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2124 if (err) 2155 if (err)
2125 goto out_up_write; 2156 goto out_up_write;
2126 2157
2158 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
2159 /*
2160 * One for dir inode, two for dir entries, two for root
2161 * ref/backref.
2162 */
2163 err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
2164 5, &qgroup_reserved);
2165 if (err)
2166 goto out_up_write;
2167
2127 trans = btrfs_start_transaction(root, 0); 2168 trans = btrfs_start_transaction(root, 0);
2128 if (IS_ERR(trans)) { 2169 if (IS_ERR(trans)) {
2129 err = PTR_ERR(trans); 2170 err = PTR_ERR(trans);
2130 goto out_up_write; 2171 goto out_release;
2131 } 2172 }
2132 trans->block_rsv = &root->fs_info->global_block_rsv; 2173 trans->block_rsv = &block_rsv;
2174 trans->bytes_reserved = block_rsv.size;
2133 2175
2134 ret = btrfs_unlink_subvol(trans, root, dir, 2176 ret = btrfs_unlink_subvol(trans, root, dir,
2135 dest->root_key.objectid, 2177 dest->root_key.objectid,
@@ -2159,10 +2201,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2159 } 2201 }
2160 } 2202 }
2161out_end_trans: 2203out_end_trans:
2204 trans->block_rsv = NULL;
2205 trans->bytes_reserved = 0;
2162 ret = btrfs_end_transaction(trans, root); 2206 ret = btrfs_end_transaction(trans, root);
2163 if (ret && !err) 2207 if (ret && !err)
2164 err = ret; 2208 err = ret;
2165 inode->i_flags |= S_DEAD; 2209 inode->i_flags |= S_DEAD;
2210out_release:
2211 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
2166out_up_write: 2212out_up_write:
2167 up_write(&root->fs_info->subvol_sem); 2213 up_write(&root->fs_info->subvol_sem);
2168out_unlock: 2214out_unlock:
@@ -2171,6 +2217,12 @@ out_unlock:
2171 shrink_dcache_sb(root->fs_info->sb); 2217 shrink_dcache_sb(root->fs_info->sb);
2172 btrfs_invalidate_inodes(dest); 2218 btrfs_invalidate_inodes(dest);
2173 d_delete(dentry); 2219 d_delete(dentry);
2220
2221 /* the last ref */
2222 if (dest->cache_inode) {
2223 iput(dest->cache_inode);
2224 dest->cache_inode = NULL;
2225 }
2174 } 2226 }
2175out_dput: 2227out_dput:
2176 dput(dentry); 2228 dput(dentry);
@@ -2211,10 +2263,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2211 ret = -EPERM; 2263 ret = -EPERM;
2212 goto out; 2264 goto out;
2213 } 2265 }
2214 ret = btrfs_defrag_root(root, 0); 2266 ret = btrfs_defrag_root(root);
2215 if (ret) 2267 if (ret)
2216 goto out; 2268 goto out;
2217 ret = btrfs_defrag_root(root->fs_info->extent_root, 0); 2269 ret = btrfs_defrag_root(root->fs_info->extent_root);
2218 break; 2270 break;
2219 case S_IFREG: 2271 case S_IFREG:
2220 if (!(file->f_mode & FMODE_WRITE)) { 2272 if (!(file->f_mode & FMODE_WRITE)) {
@@ -3111,7 +3163,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
3111 u64 transid; 3163 u64 transid;
3112 int ret; 3164 int ret;
3113 3165
3114 trans = btrfs_attach_transaction(root); 3166 trans = btrfs_attach_transaction_barrier(root);
3115 if (IS_ERR(trans)) { 3167 if (IS_ERR(trans)) {
3116 if (PTR_ERR(trans) != -ENOENT) 3168 if (PTR_ERR(trans) != -ENOENT)
3117 return PTR_ERR(trans); 3169 return PTR_ERR(trans);
@@ -3289,7 +3341,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3289 struct inode_fs_paths *ipath = NULL; 3341 struct inode_fs_paths *ipath = NULL;
3290 struct btrfs_path *path; 3342 struct btrfs_path *path;
3291 3343
3292 if (!capable(CAP_SYS_ADMIN)) 3344 if (!capable(CAP_DAC_READ_SEARCH))
3293 return -EPERM; 3345 return -EPERM;
3294 3346
3295 path = btrfs_alloc_path(); 3347 path = btrfs_alloc_path();
@@ -3914,6 +3966,65 @@ out:
3914 return ret; 3966 return ret;
3915} 3967}
3916 3968
3969static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
3970{
3971 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3972 const char *label = root->fs_info->super_copy->label;
3973 size_t len = strnlen(label, BTRFS_LABEL_SIZE);
3974 int ret;
3975
3976 if (len == BTRFS_LABEL_SIZE) {
3977 pr_warn("btrfs: label is too long, return the first %zu bytes\n",
3978 --len);
3979 }
3980
3981 mutex_lock(&root->fs_info->volume_mutex);
3982 ret = copy_to_user(arg, label, len);
3983 mutex_unlock(&root->fs_info->volume_mutex);
3984
3985 return ret ? -EFAULT : 0;
3986}
3987
3988static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
3989{
3990 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3991 struct btrfs_super_block *super_block = root->fs_info->super_copy;
3992 struct btrfs_trans_handle *trans;
3993 char label[BTRFS_LABEL_SIZE];
3994 int ret;
3995
3996 if (!capable(CAP_SYS_ADMIN))
3997 return -EPERM;
3998
3999 if (copy_from_user(label, arg, sizeof(label)))
4000 return -EFAULT;
4001
4002 if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
4003 pr_err("btrfs: unable to set label with more than %d bytes\n",
4004 BTRFS_LABEL_SIZE - 1);
4005 return -EINVAL;
4006 }
4007
4008 ret = mnt_want_write_file(file);
4009 if (ret)
4010 return ret;
4011
4012 mutex_lock(&root->fs_info->volume_mutex);
4013 trans = btrfs_start_transaction(root, 0);
4014 if (IS_ERR(trans)) {
4015 ret = PTR_ERR(trans);
4016 goto out_unlock;
4017 }
4018
4019 strcpy(super_block->label, label);
4020 ret = btrfs_end_transaction(trans, root);
4021
4022out_unlock:
4023 mutex_unlock(&root->fs_info->volume_mutex);
4024 mnt_drop_write_file(file);
4025 return ret;
4026}
4027
3917long btrfs_ioctl(struct file *file, unsigned int 4028long btrfs_ioctl(struct file *file, unsigned int
3918 cmd, unsigned long arg) 4029 cmd, unsigned long arg)
3919{ 4030{
@@ -4014,6 +4125,10 @@ long btrfs_ioctl(struct file *file, unsigned int
4014 return btrfs_ioctl_qgroup_limit(file, argp); 4125 return btrfs_ioctl_qgroup_limit(file, argp);
4015 case BTRFS_IOC_DEV_REPLACE: 4126 case BTRFS_IOC_DEV_REPLACE:
4016 return btrfs_ioctl_dev_replace(root, argp); 4127 return btrfs_ioctl_dev_replace(root, argp);
4128 case BTRFS_IOC_GET_FSLABEL:
4129 return btrfs_ioctl_get_fslabel(file, argp);
4130 case BTRFS_IOC_SET_FSLABEL:
4131 return btrfs_ioctl_set_fslabel(file, argp);
4017 } 4132 }
4018 4133
4019 return -ENOTTY; 4134 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
deleted file mode 100644
index dabca9cc8c2e..000000000000
--- a/fs/btrfs/ioctl.h
+++ /dev/null
@@ -1,502 +0,0 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __IOCTL_
20#define __IOCTL_
21#include <linux/ioctl.h>
22
23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255
25
26/* this should be 4k */
27#define BTRFS_PATH_NAME_MAX 4087
28struct btrfs_ioctl_vol_args {
29 __s64 fd;
30 char name[BTRFS_PATH_NAME_MAX + 1];
31};
32
33#define BTRFS_DEVICE_PATH_NAME_MAX 1024
34
35#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
36#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
37#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
38#define BTRFS_FSID_SIZE 16
39#define BTRFS_UUID_SIZE 16
40
41#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0)
42
43struct btrfs_qgroup_limit {
44 __u64 flags;
45 __u64 max_rfer;
46 __u64 max_excl;
47 __u64 rsv_rfer;
48 __u64 rsv_excl;
49};
50
51struct btrfs_qgroup_inherit {
52 __u64 flags;
53 __u64 num_qgroups;
54 __u64 num_ref_copies;
55 __u64 num_excl_copies;
56 struct btrfs_qgroup_limit lim;
57 __u64 qgroups[0];
58};
59
60struct btrfs_ioctl_qgroup_limit_args {
61 __u64 qgroupid;
62 struct btrfs_qgroup_limit lim;
63};
64
65#define BTRFS_SUBVOL_NAME_MAX 4039
66struct btrfs_ioctl_vol_args_v2 {
67 __s64 fd;
68 __u64 transid;
69 __u64 flags;
70 union {
71 struct {
72 __u64 size;
73 struct btrfs_qgroup_inherit __user *qgroup_inherit;
74 };
75 __u64 unused[4];
76 };
77 char name[BTRFS_SUBVOL_NAME_MAX + 1];
78};
79
80/*
81 * structure to report errors and progress to userspace, either as a
82 * result of a finished scrub, a canceled scrub or a progress inquiry
83 */
84struct btrfs_scrub_progress {
85 __u64 data_extents_scrubbed; /* # of data extents scrubbed */
86 __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */
87 __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */
88 __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */
89 __u64 read_errors; /* # of read errors encountered (EIO) */
90 __u64 csum_errors; /* # of failed csum checks */
 91	 __u64 verify_errors;	 /* # of occurrences where the metadata
92 * of a tree block did not match the
93 * expected values, like generation or
94 * logical */
 95	 __u64 no_csum;	 /* # of 4k data blocks for which no csum
96 * is present, probably the result of
97 * data written with nodatasum */
98 __u64 csum_discards; /* # of csum for which no data was found
99 * in the extent tree. */
100 __u64 super_errors; /* # of bad super blocks encountered */
101 __u64 malloc_errors; /* # of internal kmalloc errors. These
102 * will likely cause an incomplete
103 * scrub */
104 __u64 uncorrectable_errors; /* # of errors where either no intact
105 * copy was found or the writeback
106 * failed */
107 __u64 corrected_errors; /* # of errors corrected */
108 __u64 last_physical; /* last physical address scrubbed. In
109 * case a scrub was aborted, this can
110 * be used to restart the scrub */
 111	 __u64 unverified_errors;	 /* # of occurrences where a read for a
112 * full (64k) bio failed, but the re-
113 * check succeeded for each 4k piece.
114 * Intermittent error. */
115};
116
117#define BTRFS_SCRUB_READONLY 1
118struct btrfs_ioctl_scrub_args {
119 __u64 devid; /* in */
120 __u64 start; /* in */
121 __u64 end; /* in */
122 __u64 flags; /* in */
123 struct btrfs_scrub_progress progress; /* out */
124 /* pad to 1k */
125 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
126};
127
128#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
129#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
130struct btrfs_ioctl_dev_replace_start_params {
131 __u64 srcdevid; /* in, if 0, use srcdev_name instead */
132 __u64 cont_reading_from_srcdev_mode; /* in, see #define
133 * above */
134 __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
135 __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
136};
137
138#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0
139#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1
140#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2
141#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3
142#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4
143struct btrfs_ioctl_dev_replace_status_params {
144 __u64 replace_state; /* out, see #define above */
145 __u64 progress_1000; /* out, 0 <= x <= 1000 */
146 __u64 time_started; /* out, seconds since 1-Jan-1970 */
147 __u64 time_stopped; /* out, seconds since 1-Jan-1970 */
148 __u64 num_write_errors; /* out */
149 __u64 num_uncorrectable_read_errors; /* out */
150};
151
152#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0
153#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1
154#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2
155#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0
156#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1
157#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2
158struct btrfs_ioctl_dev_replace_args {
159 __u64 cmd; /* in */
160 __u64 result; /* out */
161
162 union {
163 struct btrfs_ioctl_dev_replace_start_params start;
164 struct btrfs_ioctl_dev_replace_status_params status;
165 }; /* in/out */
166
167 __u64 spare[64];
168};
169
170struct btrfs_ioctl_dev_info_args {
171 __u64 devid; /* in/out */
172 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
173 __u64 bytes_used; /* out */
174 __u64 total_bytes; /* out */
175 __u64 unused[379]; /* pad to 4k */
176 __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */
177};
178
179struct btrfs_ioctl_fs_info_args {
180 __u64 max_id; /* out */
181 __u64 num_devices; /* out */
182 __u8 fsid[BTRFS_FSID_SIZE]; /* out */
183 __u64 reserved[124]; /* pad to 1k */
184};
185
186/* balance control ioctl modes */
187#define BTRFS_BALANCE_CTL_PAUSE 1
188#define BTRFS_BALANCE_CTL_CANCEL 2
189
190/*
191 * this is packed, because it should be exactly the same as its disk
192 * byte order counterpart (struct btrfs_disk_balance_args)
193 */
194struct btrfs_balance_args {
195 __u64 profiles;
196 __u64 usage;
197 __u64 devid;
198 __u64 pstart;
199 __u64 pend;
200 __u64 vstart;
201 __u64 vend;
202
203 __u64 target;
204
205 __u64 flags;
206
207 __u64 unused[8];
208} __attribute__ ((__packed__));
209
210/* report balance progress to userspace */
211struct btrfs_balance_progress {
212 __u64 expected; /* estimated # of chunks that will be
213 * relocated to fulfill the request */
214 __u64 considered; /* # of chunks we have considered so far */
215 __u64 completed; /* # of chunks relocated so far */
216};
217
218#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0)
219#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1)
220#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
221
222struct btrfs_ioctl_balance_args {
223 __u64 flags; /* in/out */
224 __u64 state; /* out */
225
226 struct btrfs_balance_args data; /* in/out */
227 struct btrfs_balance_args meta; /* in/out */
228 struct btrfs_balance_args sys; /* in/out */
229
230 struct btrfs_balance_progress stat; /* out */
231
232 __u64 unused[72]; /* pad to 1k */
233};
234
235#define BTRFS_INO_LOOKUP_PATH_MAX 4080
236struct btrfs_ioctl_ino_lookup_args {
237 __u64 treeid;
238 __u64 objectid;
239 char name[BTRFS_INO_LOOKUP_PATH_MAX];
240};
241
242struct btrfs_ioctl_search_key {
243 /* which root are we searching. 0 is the tree of tree roots */
244 __u64 tree_id;
245
246 /* keys returned will be >= min and <= max */
247 __u64 min_objectid;
248 __u64 max_objectid;
249
250 /* keys returned will be >= min and <= max */
251 __u64 min_offset;
252 __u64 max_offset;
253
254 /* max and min transids to search for */
255 __u64 min_transid;
256 __u64 max_transid;
257
258 /* keys returned will be >= min and <= max */
259 __u32 min_type;
260 __u32 max_type;
261
262 /*
263 * how many items did userland ask for, and how many are we
264 * returning
265 */
266 __u32 nr_items;
267
268 /* align to 64 bits */
269 __u32 unused;
270
271 /* some extra for later */
272 __u64 unused1;
273 __u64 unused2;
274 __u64 unused3;
275 __u64 unused4;
276};
277
278struct btrfs_ioctl_search_header {
279 __u64 transid;
280 __u64 objectid;
281 __u64 offset;
282 __u32 type;
283 __u32 len;
284};
285
286#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
287/*
288 * the buf is an array of search headers where
289 * each header is followed by the actual item
290 * the type field is expanded to 32 bits for alignment
291 */
292struct btrfs_ioctl_search_args {
293 struct btrfs_ioctl_search_key key;
294 char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
295};
296
297struct btrfs_ioctl_clone_range_args {
298 __s64 src_fd;
299 __u64 src_offset, src_length;
300 __u64 dest_offset;
301};
302
303/* flags for the defrag range ioctl */
304#define BTRFS_DEFRAG_RANGE_COMPRESS 1
305#define BTRFS_DEFRAG_RANGE_START_IO 2
306
307struct btrfs_ioctl_space_info {
308 __u64 flags;
309 __u64 total_bytes;
310 __u64 used_bytes;
311};
312
313struct btrfs_ioctl_space_args {
314 __u64 space_slots;
315 __u64 total_spaces;
316 struct btrfs_ioctl_space_info spaces[0];
317};
318
319struct btrfs_data_container {
320 __u32 bytes_left; /* out -- bytes not needed to deliver output */
321 __u32 bytes_missing; /* out -- additional bytes needed for result */
322 __u32 elem_cnt; /* out */
323 __u32 elem_missed; /* out */
324 __u64 val[0]; /* out */
325};
326
327struct btrfs_ioctl_ino_path_args {
328 __u64 inum; /* in */
329 __u64 size; /* in */
330 __u64 reserved[4];
331 /* struct btrfs_data_container *fspath; out */
332 __u64 fspath; /* out */
333};
334
335struct btrfs_ioctl_logical_ino_args {
336 __u64 logical; /* in */
337 __u64 size; /* in */
338 __u64 reserved[4];
339 /* struct btrfs_data_container *inodes; out */
340 __u64 inodes;
341};
342
343enum btrfs_dev_stat_values {
344 /* disk I/O failure stats */
345 BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */
346 BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */
347 BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */
348
349 /* stats for indirect indications for I/O failures */
350 BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or
351 * contents is illegal: this is an
352 * indication that the block was damaged
353 * during read or write, or written to
354 * wrong location or read from wrong
355 * location */
356 BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not
357 * been written */
358
359 BTRFS_DEV_STAT_VALUES_MAX
360};
361
362/* Reset statistics after reading; needs SYS_ADMIN capability */
363#define BTRFS_DEV_STATS_RESET (1ULL << 0)
364
365struct btrfs_ioctl_get_dev_stats {
366 __u64 devid; /* in */
367 __u64 nr_items; /* in/out */
368 __u64 flags; /* in/out */
369
370 /* out values: */
371 __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
372
373 __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
374};
375
376#define BTRFS_QUOTA_CTL_ENABLE 1
377#define BTRFS_QUOTA_CTL_DISABLE 2
378#define BTRFS_QUOTA_CTL_RESCAN 3
379struct btrfs_ioctl_quota_ctl_args {
380 __u64 cmd;
381 __u64 status;
382};
383
384struct btrfs_ioctl_qgroup_assign_args {
385 __u64 assign;
386 __u64 src;
387 __u64 dst;
388};
389
390struct btrfs_ioctl_qgroup_create_args {
391 __u64 create;
392 __u64 qgroupid;
393};
394struct btrfs_ioctl_timespec {
395 __u64 sec;
396 __u32 nsec;
397};
398
399struct btrfs_ioctl_received_subvol_args {
400 char uuid[BTRFS_UUID_SIZE]; /* in */
401 __u64 stransid; /* in */
402 __u64 rtransid; /* out */
403 struct btrfs_ioctl_timespec stime; /* in */
404 struct btrfs_ioctl_timespec rtime; /* out */
405 __u64 flags; /* in */
406 __u64 reserved[16]; /* in */
407};
408
409struct btrfs_ioctl_send_args {
410 __s64 send_fd; /* in */
411 __u64 clone_sources_count; /* in */
412 __u64 __user *clone_sources; /* in */
413 __u64 parent_root; /* in */
414 __u64 flags; /* in */
415 __u64 reserved[4]; /* in */
416};
417
418#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
419 struct btrfs_ioctl_vol_args)
420#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
421 struct btrfs_ioctl_vol_args)
422#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
423 struct btrfs_ioctl_vol_args)
424#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
425 struct btrfs_ioctl_vol_args)
426/* trans start and trans end are dangerous, and only for
427 * use by applications that know how to avoid the
428 * resulting deadlocks
429 */
430#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
431#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
432#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
433
434#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
435#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
436 struct btrfs_ioctl_vol_args)
437#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
438 struct btrfs_ioctl_vol_args)
439#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
440 struct btrfs_ioctl_vol_args)
441
442#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
443 struct btrfs_ioctl_clone_range_args)
444
445#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
446 struct btrfs_ioctl_vol_args)
447#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
448 struct btrfs_ioctl_vol_args)
449#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
450 struct btrfs_ioctl_defrag_range_args)
451#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
452 struct btrfs_ioctl_search_args)
453#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
454 struct btrfs_ioctl_ino_lookup_args)
455#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
456#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
457 struct btrfs_ioctl_space_args)
458#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
459#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
460#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
461 struct btrfs_ioctl_vol_args_v2)
462#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
463 struct btrfs_ioctl_vol_args_v2)
464#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
465#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
466#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
467 struct btrfs_ioctl_scrub_args)
468#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28)
469#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \
470 struct btrfs_ioctl_scrub_args)
471#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \
472 struct btrfs_ioctl_dev_info_args)
473#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
474 struct btrfs_ioctl_fs_info_args)
475#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
476 struct btrfs_ioctl_balance_args)
477#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
478#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
479 struct btrfs_ioctl_balance_args)
480#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
481 struct btrfs_ioctl_ino_path_args)
482#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
483 struct btrfs_ioctl_ino_path_args)
484#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \
485 struct btrfs_ioctl_received_subvol_args)
486#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args)
487#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \
488 struct btrfs_ioctl_vol_args)
489#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \
490 struct btrfs_ioctl_quota_ctl_args)
491#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \
492 struct btrfs_ioctl_qgroup_assign_args)
493#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \
494 struct btrfs_ioctl_qgroup_create_args)
495#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \
496 struct btrfs_ioctl_qgroup_limit_args)
497#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
498 struct btrfs_ioctl_get_dev_stats)
499#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
500 struct btrfs_ioctl_dev_replace_args)
501
502#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 2a1762c66041..e95df435d897 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -113,11 +113,10 @@ again:
113 read_unlock(&eb->lock); 113 read_unlock(&eb->lock);
114 return; 114 return;
115 } 115 }
116 read_unlock(&eb->lock);
117 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
118 read_lock(&eb->lock);
119 if (atomic_read(&eb->blocking_writers)) { 116 if (atomic_read(&eb->blocking_writers)) {
120 read_unlock(&eb->lock); 117 read_unlock(&eb->lock);
118 wait_event(eb->write_lock_wq,
119 atomic_read(&eb->blocking_writers) == 0);
121 goto again; 120 goto again;
122 } 121 }
123 atomic_inc(&eb->read_locks); 122 atomic_inc(&eb->read_locks);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e5ed56729607..dc08d77b717e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -196,6 +196,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
196 entry->file_offset = file_offset; 196 entry->file_offset = file_offset;
197 entry->start = start; 197 entry->start = start;
198 entry->len = len; 198 entry->len = len;
199 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
200 !(type == BTRFS_ORDERED_NOCOW))
201 entry->csum_bytes_left = disk_len;
199 entry->disk_len = disk_len; 202 entry->disk_len = disk_len;
200 entry->bytes_left = len; 203 entry->bytes_left = len;
201 entry->inode = igrab(inode); 204 entry->inode = igrab(inode);
@@ -213,6 +216,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
213 INIT_LIST_HEAD(&entry->root_extent_list); 216 INIT_LIST_HEAD(&entry->root_extent_list);
214 INIT_LIST_HEAD(&entry->work_list); 217 INIT_LIST_HEAD(&entry->work_list);
215 init_completion(&entry->completion); 218 init_completion(&entry->completion);
219 INIT_LIST_HEAD(&entry->log_list);
216 220
217 trace_btrfs_ordered_extent_add(inode, entry); 221 trace_btrfs_ordered_extent_add(inode, entry);
218 222
@@ -270,6 +274,10 @@ void btrfs_add_ordered_sum(struct inode *inode,
270 tree = &BTRFS_I(inode)->ordered_tree; 274 tree = &BTRFS_I(inode)->ordered_tree;
271 spin_lock_irq(&tree->lock); 275 spin_lock_irq(&tree->lock);
272 list_add_tail(&sum->list, &entry->list); 276 list_add_tail(&sum->list, &entry->list);
277 WARN_ON(entry->csum_bytes_left < sum->len);
278 entry->csum_bytes_left -= sum->len;
279 if (entry->csum_bytes_left == 0)
280 wake_up(&entry->wait);
273 spin_unlock_irq(&tree->lock); 281 spin_unlock_irq(&tree->lock);
274} 282}
275 283
@@ -405,6 +413,66 @@ out:
405 return ret == 0; 413 return ret == 0;
406} 414}
407 415
416/* Needs to either be called under a log transaction or the log_mutex */
417void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
418{
419 struct btrfs_ordered_inode_tree *tree;
420 struct btrfs_ordered_extent *ordered;
421 struct rb_node *n;
422 int index = log->log_transid % 2;
423
424 tree = &BTRFS_I(inode)->ordered_tree;
425 spin_lock_irq(&tree->lock);
426 for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
427 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
428 spin_lock(&log->log_extents_lock[index]);
429 if (list_empty(&ordered->log_list)) {
430 list_add_tail(&ordered->log_list, &log->logged_list[index]);
431 atomic_inc(&ordered->refs);
432 }
433 spin_unlock(&log->log_extents_lock[index]);
434 }
435 spin_unlock_irq(&tree->lock);
436}
437
438void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
439{
440 struct btrfs_ordered_extent *ordered;
441 int index = transid % 2;
442
443 spin_lock_irq(&log->log_extents_lock[index]);
444 while (!list_empty(&log->logged_list[index])) {
445 ordered = list_first_entry(&log->logged_list[index],
446 struct btrfs_ordered_extent,
447 log_list);
448 list_del_init(&ordered->log_list);
449 spin_unlock_irq(&log->log_extents_lock[index]);
450 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
451 &ordered->flags));
452 btrfs_put_ordered_extent(ordered);
453 spin_lock_irq(&log->log_extents_lock[index]);
454 }
455 spin_unlock_irq(&log->log_extents_lock[index]);
456}
457
458void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
459{
460 struct btrfs_ordered_extent *ordered;
461 int index = transid % 2;
462
463 spin_lock_irq(&log->log_extents_lock[index]);
464 while (!list_empty(&log->logged_list[index])) {
465 ordered = list_first_entry(&log->logged_list[index],
466 struct btrfs_ordered_extent,
467 log_list);
468 list_del_init(&ordered->log_list);
469 spin_unlock_irq(&log->log_extents_lock[index]);
470 btrfs_put_ordered_extent(ordered);
471 spin_lock_irq(&log->log_extents_lock[index]);
472 }
473 spin_unlock_irq(&log->log_extents_lock[index]);
474}
475
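The three new helpers above collect, wait on, and drop logged ordered extents. The log root keeps two logged_list heads and selects one by log transaction parity, so extents logged for transid N never mix with those being collected for transid N+1. A tiny sketch of just that index choice (the struct is hypothetical, not the kernel layout):

/* sketch only: double-buffered list selection by transid parity */
struct logged_extent;   /* opaque; stands in for the log_list nodes */

struct log_root_lists {
	struct logged_extent *logged_list[2];
	unsigned long long log_transid;
};

static struct logged_extent **pick_logged_list(struct log_root_lists *log,
					       unsigned long long transid)
{
	return &log->logged_list[transid % 2]; /* same rule as index above */
}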
408/* 476/*
409 * used to drop a reference on an ordered extent. This will free 477 * used to drop a reference on an ordered extent. This will free
410 * the extent if the last reference is dropped 478 * the extent if the last reference is dropped
@@ -544,10 +612,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
544 * extra check to make sure the ordered operation list really is empty 612 * extra check to make sure the ordered operation list really is empty
545 * before we return 613 * before we return
546 */ 614 */
547int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) 615int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
616 struct btrfs_root *root, int wait)
548{ 617{
549 struct btrfs_inode *btrfs_inode; 618 struct btrfs_inode *btrfs_inode;
550 struct inode *inode; 619 struct inode *inode;
620 struct btrfs_transaction *cur_trans = trans->transaction;
551 struct list_head splice; 621 struct list_head splice;
552 struct list_head works; 622 struct list_head works;
553 struct btrfs_delalloc_work *work, *next; 623 struct btrfs_delalloc_work *work, *next;
@@ -558,14 +628,10 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
558 628
559 mutex_lock(&root->fs_info->ordered_operations_mutex); 629 mutex_lock(&root->fs_info->ordered_operations_mutex);
560 spin_lock(&root->fs_info->ordered_extent_lock); 630 spin_lock(&root->fs_info->ordered_extent_lock);
561again: 631 list_splice_init(&cur_trans->ordered_operations, &splice);
562 list_splice_init(&root->fs_info->ordered_operations, &splice);
563
564 while (!list_empty(&splice)) { 632 while (!list_empty(&splice)) {
565
566 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 633 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
567 ordered_operations); 634 ordered_operations);
568
569 inode = &btrfs_inode->vfs_inode; 635 inode = &btrfs_inode->vfs_inode;
570 636
571 list_del_init(&btrfs_inode->ordered_operations); 637 list_del_init(&btrfs_inode->ordered_operations);
@@ -574,24 +640,22 @@ again:
574 * the inode may be getting freed (in sys_unlink path). 640 * the inode may be getting freed (in sys_unlink path).
575 */ 641 */
576 inode = igrab(inode); 642 inode = igrab(inode);
577
578 if (!wait && inode) {
579 list_add_tail(&BTRFS_I(inode)->ordered_operations,
580 &root->fs_info->ordered_operations);
581 }
582
583 if (!inode) 643 if (!inode)
584 continue; 644 continue;
645
646 if (!wait)
647 list_add_tail(&BTRFS_I(inode)->ordered_operations,
648 &cur_trans->ordered_operations);
585 spin_unlock(&root->fs_info->ordered_extent_lock); 649 spin_unlock(&root->fs_info->ordered_extent_lock);
586 650
587 work = btrfs_alloc_delalloc_work(inode, wait, 1); 651 work = btrfs_alloc_delalloc_work(inode, wait, 1);
588 if (!work) { 652 if (!work) {
653 spin_lock(&root->fs_info->ordered_extent_lock);
589 if (list_empty(&BTRFS_I(inode)->ordered_operations)) 654 if (list_empty(&BTRFS_I(inode)->ordered_operations))
590 list_add_tail(&btrfs_inode->ordered_operations, 655 list_add_tail(&btrfs_inode->ordered_operations,
591 &splice); 656 &splice);
592 spin_lock(&root->fs_info->ordered_extent_lock);
593 list_splice_tail(&splice, 657 list_splice_tail(&splice,
594 &root->fs_info->ordered_operations); 658 &cur_trans->ordered_operations);
595 spin_unlock(&root->fs_info->ordered_extent_lock); 659 spin_unlock(&root->fs_info->ordered_extent_lock);
596 ret = -ENOMEM; 660 ret = -ENOMEM;
597 goto out; 661 goto out;
@@ -603,9 +667,6 @@ again:
603 cond_resched(); 667 cond_resched();
604 spin_lock(&root->fs_info->ordered_extent_lock); 668 spin_lock(&root->fs_info->ordered_extent_lock);
605 } 669 }
606 if (wait && !list_empty(&root->fs_info->ordered_operations))
607 goto again;
608
609 spin_unlock(&root->fs_info->ordered_extent_lock); 670 spin_unlock(&root->fs_info->ordered_extent_lock);
610out: 671out:
611 list_for_each_entry_safe(work, next, &works, list) { 672 list_for_each_entry_safe(work, next, &works, list) {
@@ -974,6 +1035,7 @@ out:
974void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 1035void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
975 struct btrfs_root *root, struct inode *inode) 1036 struct btrfs_root *root, struct inode *inode)
976{ 1037{
1038 struct btrfs_transaction *cur_trans = trans->transaction;
977 u64 last_mod; 1039 u64 last_mod;
978 1040
979 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); 1041 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
@@ -988,7 +1050,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
988 spin_lock(&root->fs_info->ordered_extent_lock); 1050 spin_lock(&root->fs_info->ordered_extent_lock);
989 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 1051 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
990 list_add_tail(&BTRFS_I(inode)->ordered_operations, 1052 list_add_tail(&BTRFS_I(inode)->ordered_operations,
991 &root->fs_info->ordered_operations); 1053 &cur_trans->ordered_operations);
992 } 1054 }
993 spin_unlock(&root->fs_info->ordered_extent_lock); 1055 spin_unlock(&root->fs_info->ordered_extent_lock);
994} 1056}
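The ordered-data.c changes above move the ordered_operations list from fs_info onto the transaction, so btrfs_run_ordered_operations() drains the list belonging to the current transaction rather than a global one, and the -ENOMEM path now retakes the lock before splicing entries back. The underlying shape is the usual splice-under-lock pattern; a compact user-space sketch with a singly linked list (all types hypothetical):

/* sketch only: splice the per-transaction list under the lock,
 * then process entries outside the lock */
#include <pthread.h>
#include <stddef.h>

struct op { struct op *next; };

struct txn {
	pthread_mutex_t lock;
	struct op *ordered_operations;  /* per-transaction list head */
};

static void run_ordered_operations(struct txn *t,
				   void (*process)(struct op *))
{
	struct op *splice, *cur;

	pthread_mutex_lock(&t->lock);
	splice = t->ordered_operations;  /* list_splice_init() analogue */
	t->ordered_operations = NULL;
	pthread_mutex_unlock(&t->lock);

	while ((cur = splice)) {
		splice = cur->next;
		process(cur);            /* work done without the lock held */
	}
}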
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f29d4bf5fbe7..8eadfe406cdd 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -79,6 +79,8 @@ struct btrfs_ordered_sum {
79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent 79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
80 * has done its due diligence in updating 80 * has done its due diligence in updating
81 * the isize. */ 81 * the isize. */
 82#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this
 83 ordered extent */

82 84
83struct btrfs_ordered_extent { 85struct btrfs_ordered_extent {
84 /* logical offset in the file */ 86 /* logical offset in the file */
@@ -96,6 +98,9 @@ struct btrfs_ordered_extent {
96 /* number of bytes that still need writing */ 98 /* number of bytes that still need writing */
97 u64 bytes_left; 99 u64 bytes_left;
98 100
101 /* number of bytes that still need csumming */
102 u64 csum_bytes_left;
103
99 /* 104 /*
100 * the end of the ordered extent which is behind it but 105 * the end of the ordered extent which is behind it but
101 * didn't update disk_i_size. Please see the comment of 106 * didn't update disk_i_size. Please see the comment of
@@ -118,6 +123,9 @@ struct btrfs_ordered_extent {
118 /* list of checksums for insertion when the extent io is done */ 123 /* list of checksums for insertion when the extent io is done */
119 struct list_head list; 124 struct list_head list;
120 125
126 /* If we need to wait on this to be done */
127 struct list_head log_list;
128
121 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ 129 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
122 wait_queue_head_t wait; 130 wait_queue_head_t wait;
123 131
@@ -189,11 +197,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
189int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 197int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
190 struct btrfs_ordered_extent *ordered); 198 struct btrfs_ordered_extent *ordered);
191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 199int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
192int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 200int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
201 struct btrfs_root *root, int wait);
193void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 202void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
194 struct btrfs_root *root, 203 struct btrfs_root *root,
195 struct inode *inode); 204 struct inode *inode);
196void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); 205void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
206void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
207void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
208void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
197int __init ordered_data_init(void); 209int __init ordered_data_init(void);
198void ordered_data_exit(void); 210void ordered_data_exit(void);
199#endif 211#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 50d95fd190a5..920957ecb27e 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -294,6 +294,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
294 btrfs_dev_extent_chunk_offset(l, dev_extent), 294 btrfs_dev_extent_chunk_offset(l, dev_extent),
295 (unsigned long long) 295 (unsigned long long)
296 btrfs_dev_extent_length(l, dev_extent)); 296 btrfs_dev_extent_length(l, dev_extent));
297 break;
297 case BTRFS_DEV_STATS_KEY: 298 case BTRFS_DEV_STATS_KEY:
298 printk(KERN_INFO "\t\tdevice stats\n"); 299 printk(KERN_INFO "\t\tdevice stats\n");
299 break; 300 break;
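The print-tree.c fix above adds the break that was missing after the dev-extent case, so printing a dev extent no longer falls through into the dev-stats case. The same fallthrough bug in miniature:

/* sketch only: without the first break, both lines print for case 1 */
#include <stdio.h>

static void describe(int key_type)
{
	switch (key_type) {
	case 1:
		printf("dev extent\n");
		break;          /* the added break prevents fallthrough */
	case 2:
		printf("device stats\n");
		break;
	}
}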
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a5c856234323..aee4b1cc3d98 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -23,13 +23,13 @@
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/workqueue.h> 25#include <linux/workqueue.h>
26#include <linux/btrfs.h>
26 27
27#include "ctree.h" 28#include "ctree.h"
28#include "transaction.h" 29#include "transaction.h"
29#include "disk-io.h" 30#include "disk-io.h"
30#include "locking.h" 31#include "locking.h"
31#include "ulist.h" 32#include "ulist.h"
32#include "ioctl.h"
33#include "backref.h" 33#include "backref.h"
34 34
35/* TODO XXX FIXME 35/* TODO XXX FIXME
@@ -620,7 +620,9 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
620 key.offset = qgroupid; 620 key.offset = qgroupid;
621 621
622 path = btrfs_alloc_path(); 622 path = btrfs_alloc_path();
623 BUG_ON(!path); 623 if (!path)
624 return -ENOMEM;
625
624 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 626 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
625 if (ret > 0) 627 if (ret > 0)
626 ret = -ENOENT; 628 ret = -ENOENT;
@@ -661,7 +663,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
661 key.offset = qgroup->qgroupid; 663 key.offset = qgroup->qgroupid;
662 664
663 path = btrfs_alloc_path(); 665 path = btrfs_alloc_path();
664 BUG_ON(!path); 666 if (!path)
667 return -ENOMEM;
668
665 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 669 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
666 if (ret > 0) 670 if (ret > 0)
667 ret = -ENOENT; 671 ret = -ENOENT;
@@ -702,7 +706,9 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
702 key.offset = 0; 706 key.offset = 0;
703 707
704 path = btrfs_alloc_path(); 708 path = btrfs_alloc_path();
705 BUG_ON(!path); 709 if (!path)
710 return -ENOMEM;
711
706 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 712 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
707 if (ret > 0) 713 if (ret > 0)
708 ret = -ENOENT; 714 ret = -ENOENT;
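The three qgroup hunks above replace BUG_ON(!path) with a normal -ENOMEM return, so an allocation failure is reported to the caller instead of crashing the machine. Sketch of the pattern in plain C (alloc_path() here is a hypothetical stand-in for btrfs_alloc_path()):

/* sketch only: report allocation failure instead of asserting */
#include <errno.h>
#include <stdlib.h>

struct path { int slot; };

static struct path *alloc_path(void)
{
	return calloc(1, sizeof(struct path)); /* may fail under pressure */
}

static int update_item(void)
{
	struct path *path = alloc_path();

	if (!path)
		return -ENOMEM;   /* propagate, don't BUG_ON() */
	/* ... search the tree and update the item here ... */
	free(path);
	return 0;
}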
@@ -732,33 +738,38 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
732{ 738{
733 struct btrfs_path *path; 739 struct btrfs_path *path;
734 struct btrfs_key key; 740 struct btrfs_key key;
741 struct extent_buffer *leaf = NULL;
735 int ret; 742 int ret;
736 743 int nr = 0;
737 if (!root)
738 return -EINVAL;
739 744
740 path = btrfs_alloc_path(); 745 path = btrfs_alloc_path();
741 if (!path) 746 if (!path)
742 return -ENOMEM; 747 return -ENOMEM;
743 748
744 while (1) { 749 path->leave_spinning = 1;
745 key.objectid = 0;
746 key.offset = 0;
747 key.type = 0;
748 750
749 path->leave_spinning = 1; 751 key.objectid = 0;
752 key.offset = 0;
753 key.type = 0;
754
755 while (1) {
750 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 756 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
751 if (ret > 0) { 757 if (ret < 0)
752 if (path->slots[0] == 0) 758 goto out;
753 break; 759 leaf = path->nodes[0];
754 path->slots[0]--; 760 nr = btrfs_header_nritems(leaf);
755 } else if (ret < 0) { 761 if (!nr)
756 break; 762 break;
757 } 763 /*
758 764 * delete the leaf one by one
759 ret = btrfs_del_item(trans, root, path); 765 * since the whole tree is going
766 * to be deleted.
767 */
768 path->slots[0] = 0;
769 ret = btrfs_del_items(trans, root, path, 0, nr);
760 if (ret) 770 if (ret)
761 goto out; 771 goto out;
772
762 btrfs_release_path(path); 773 btrfs_release_path(path);
763 } 774 }
764 ret = 0; 775 ret = 0;
@@ -847,6 +858,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
847 int ret = 0; 858 int ret = 0;
848 859
849 spin_lock(&fs_info->qgroup_lock); 860 spin_lock(&fs_info->qgroup_lock);
861 if (!fs_info->quota_root) {
862 spin_unlock(&fs_info->qgroup_lock);
863 return 0;
864 }
850 fs_info->quota_enabled = 0; 865 fs_info->quota_enabled = 0;
851 fs_info->pending_quota_state = 0; 866 fs_info->pending_quota_state = 0;
852 quota_root = fs_info->quota_root; 867 quota_root = fs_info->quota_root;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
new file mode 100644
index 000000000000..07222053c7d8
--- /dev/null
+++ b/fs/btrfs/raid56.c
@@ -0,0 +1,2099 @@
1/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19#include <linux/sched.h>
20#include <linux/wait.h>
21#include <linux/bio.h>
22#include <linux/slab.h>
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/random.h>
26#include <linux/iocontext.h>
27#include <linux/capability.h>
28#include <linux/ratelimit.h>
29#include <linux/kthread.h>
30#include <linux/raid/pq.h>
31#include <linux/hash.h>
32#include <linux/list_sort.h>
33#include <linux/raid/xor.h>
34#include <asm/div64.h>
35#include "compat.h"
36#include "ctree.h"
37#include "extent_map.h"
38#include "disk-io.h"
39#include "transaction.h"
40#include "print-tree.h"
41#include "volumes.h"
42#include "raid56.h"
43#include "async-thread.h"
44#include "check-integrity.h"
45#include "rcu-string.h"
46
47/* set when additional merges to this rbio are not allowed */
48#define RBIO_RMW_LOCKED_BIT 1
49
50/*
51 * set when this rbio is sitting in the hash, but it is just a cache
52 * of past RMW
53 */
54#define RBIO_CACHE_BIT 2
55
56/*
57 * set when it is safe to trust the stripe_pages for caching
58 */
59#define RBIO_CACHE_READY_BIT 3
60
61
62#define RBIO_CACHE_SIZE 1024
63
64struct btrfs_raid_bio {
65 struct btrfs_fs_info *fs_info;
66 struct btrfs_bio *bbio;
67
68 /*
69 * logical block numbers for the start of each stripe
70 * The last one or two are p/q. These are sorted,
71 * so raid_map[0] is the start of our full stripe
72 */
73 u64 *raid_map;
74
75 /* while we're doing rmw on a stripe
76 * we put it into a hash table so we can
77 * lock the stripe and merge more rbios
78 * into it.
79 */
80 struct list_head hash_list;
81
82 /*
83 * LRU list for the stripe cache
84 */
85 struct list_head stripe_cache;
86
87 /*
88 * for scheduling work in the helper threads
89 */
90 struct btrfs_work work;
91
92 /*
93 * bio list and bio_list_lock are used
94 * to add more bios into the stripe
95 * in hopes of avoiding the full rmw
96 */
97 struct bio_list bio_list;
98 spinlock_t bio_list_lock;
99
100 /* also protected by the bio_list_lock, the
101 * plug list is used by the plugging code
102 * to collect partial bios while plugged. The
103 * stripe locking code also uses it to hand off
104 * the stripe lock to the next pending IO
105 */
106 struct list_head plug_list;
107
108 /*
109 * flags that tell us if it is safe to
110 * merge with this bio
111 */
112 unsigned long flags;
113
114 /* size of each individual stripe on disk */
115 int stripe_len;
116
117 /* number of data stripes (no p/q) */
118 int nr_data;
119
120 /*
121 * set if we're doing a parity rebuild
122 * for a read from higher up, which is handled
123 * differently from a parity rebuild as part of
124 * rmw
125 */
126 int read_rebuild;
127
128 /* first bad stripe */
129 int faila;
130
131 /* second bad stripe (for raid6 use) */
132 int failb;
133
134 /*
135 * number of pages needed to represent the full
136 * stripe
137 */
138 int nr_pages;
139
140 /*
141 * size of all the bios in the bio_list. This
142 * helps us decide if the rbio maps to a full
143 * stripe or not
144 */
145 int bio_list_bytes;
146
147 atomic_t refs;
148
149 /*
150 * these are two arrays of pointers. We allocate the
151 * rbio big enough to hold them both and setup their
152 * locations when the rbio is allocated
153 */
154
155 /* pointers to pages that we allocated for
156 * reading/writing stripes directly from the disk (including P/Q)
157 */
158 struct page **stripe_pages;
159
160 /*
161 * pointers to the pages in the bio_list. Stored
162 * here for faster lookup
163 */
164 struct page **bio_pages;
165};
166
167static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
168static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
169static void rmw_work(struct btrfs_work *work);
170static void read_rebuild_work(struct btrfs_work *work);
171static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
172static void async_read_rebuild(struct btrfs_raid_bio *rbio);
173static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
174static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
175static void __free_raid_bio(struct btrfs_raid_bio *rbio);
176static void index_rbio_pages(struct btrfs_raid_bio *rbio);
177static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
178
179/*
180 * the stripe hash table is used for locking, and to collect
181 * bios in hopes of making a full stripe
182 */
183int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
184{
185 struct btrfs_stripe_hash_table *table;
186 struct btrfs_stripe_hash_table *x;
187 struct btrfs_stripe_hash *cur;
188 struct btrfs_stripe_hash *h;
189 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
190 int i;
191 int table_size;
192
193 if (info->stripe_hash_table)
194 return 0;
195
196 /*
197 * The table is large, starting with order 4 and can go as high as
198 * order 7 in case lock debugging is turned on.
199 *
200 * Try harder to allocate and fallback to vmalloc to lower the chance
201 * of a failing mount.
202 */
203 table_size = sizeof(*table) + sizeof(*h) * num_entries;
204 table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
205 if (!table) {
206 table = vzalloc(table_size);
207 if (!table)
208 return -ENOMEM;
209 }
210
211 spin_lock_init(&table->cache_lock);
212 INIT_LIST_HEAD(&table->stripe_cache);
213
214 h = table->table;
215
216 for (i = 0; i < num_entries; i++) {
217 cur = h + i;
218 INIT_LIST_HEAD(&cur->hash_list);
219 spin_lock_init(&cur->lock);
220 init_waitqueue_head(&cur->wait);
221 }
222
223 x = cmpxchg(&info->stripe_hash_table, NULL, table);
224 if (x) {
225 if (is_vmalloc_addr(x))
226 vfree(x);
227 else
228 kfree(x);
229 }
230 return 0;
231}
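btrfs_alloc_stripe_hash_table() above tries kzalloc first, falls back to vzalloc for the large table, and then publishes the result with cmpxchg so a racing caller does not leak a second copy. A user-space sketch of the publish step using C11 atomics (plain calloc stands in for both allocators):

/* sketch only: install a singleton table with compare-and-swap */
#include <stdatomic.h>
#include <stdlib.h>

struct table { int nr_buckets; };

static _Atomic(struct table *) installed;

static int install_table(void)
{
	struct table *t = calloc(1, sizeof(*t));
	struct table *expected = NULL;

	if (!t)
		return -1;
	/* cmpxchg(&info->stripe_hash_table, NULL, table) analogue */
	if (!atomic_compare_exchange_strong(&installed, &expected, t))
		free(t);   /* somebody else won the race; drop our copy */
	return 0;
}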
232
233/*
234 * caching an rbio means to copy anything from the
235 * bio_pages array into the stripe_pages array. We
236 * use the page uptodate bit in the stripe cache array
237 * to indicate if it has valid data
238 *
239 * once the caching is done, we set the cache ready
240 * bit.
241 */
242static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
243{
244 int i;
245 char *s;
246 char *d;
247 int ret;
248
249 ret = alloc_rbio_pages(rbio);
250 if (ret)
251 return;
252
253 for (i = 0; i < rbio->nr_pages; i++) {
254 if (!rbio->bio_pages[i])
255 continue;
256
257 s = kmap(rbio->bio_pages[i]);
258 d = kmap(rbio->stripe_pages[i]);
259
260 memcpy(d, s, PAGE_CACHE_SIZE);
261
262 kunmap(rbio->bio_pages[i]);
263 kunmap(rbio->stripe_pages[i]);
264 SetPageUptodate(rbio->stripe_pages[i]);
265 }
266 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
267}
268
269/*
270 * we hash on the first logical address of the stripe
271 */
272static int rbio_bucket(struct btrfs_raid_bio *rbio)
273{
274 u64 num = rbio->raid_map[0];
275
276 /*
277 * we shift down quite a bit. We're using byte
278 * addressing, and most of the lower bits are zeros.
279 * This tends to upset hash_64, and it consistently
280 * returns just one or two different values.
281 *
282 * shifting off the lower bits fixes things.
283 */
284 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
285}
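rbio_bucket() hashes the first logical address of the full stripe; because the addresses are byte-granular and heavily aligned, the low bits are nearly always zero and are shifted off first so the hash spreads well. A sketch with a 64-bit multiplicative hash standing in for the kernel's hash_64() (the table-size constant here is an assumption):

/* sketch only: bucket selection from an aligned 64-bit address */
#include <stdint.h>

#define HASH_TABLE_BITS 12   /* assumed: 1 << 12 buckets */

static unsigned int bucket_for(uint64_t logical)
{
	/* shift off the always-zero low bits, then Fibonacci hashing */
	uint64_t h = (logical >> 16) * 0x9E3779B97F4A7C15ULL;

	return (unsigned int)(h >> (64 - HASH_TABLE_BITS));
}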
286
287/*
288 * stealing an rbio means taking all the uptodate pages from the stripe
289 * array in the source rbio and putting them into the destination rbio
290 */
291static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
292{
293 int i;
294 struct page *s;
295 struct page *d;
296
297 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
298 return;
299
300 for (i = 0; i < dest->nr_pages; i++) {
301 s = src->stripe_pages[i];
302 if (!s || !PageUptodate(s)) {
303 continue;
304 }
305
306 d = dest->stripe_pages[i];
307 if (d)
308 __free_page(d);
309
310 dest->stripe_pages[i] = s;
311 src->stripe_pages[i] = NULL;
312 }
313}
314
315/*
316 * merging means we take the bio_list from the victim and
317 * splice it into the destination. The victim should
318 * be discarded afterwards.
319 *
320 * must be called with dest->rbio_list_lock held
321 */
322static void merge_rbio(struct btrfs_raid_bio *dest,
323 struct btrfs_raid_bio *victim)
324{
325 bio_list_merge(&dest->bio_list, &victim->bio_list);
326 dest->bio_list_bytes += victim->bio_list_bytes;
327 bio_list_init(&victim->bio_list);
328}
329
330/*
331 * used to prune items that are in the cache. The caller
332 * must hold the hash table lock.
333 */
334static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
335{
336 int bucket = rbio_bucket(rbio);
337 struct btrfs_stripe_hash_table *table;
338 struct btrfs_stripe_hash *h;
339 int freeit = 0;
340
341 /*
342 * check the bit again under the hash table lock.
343 */
344 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
345 return;
346
347 table = rbio->fs_info->stripe_hash_table;
348 h = table->table + bucket;
349
350 /* hold the lock for the bucket because we may be
351 * removing it from the hash table
352 */
353 spin_lock(&h->lock);
354
355 /*
356 * hold the lock for the bio list because we need
357 * to make sure the bio list is empty
358 */
359 spin_lock(&rbio->bio_list_lock);
360
361 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
362 list_del_init(&rbio->stripe_cache);
363 table->cache_size -= 1;
364 freeit = 1;
365
366 /* if the bio list isn't empty, this rbio is
367 * still involved in an IO. We take it out
368 * of the cache list, and drop the ref that
369 * was held for the list.
370 *
371 * If the bio_list was empty, we also remove
372 * the rbio from the hash_table, and drop
373 * the corresponding ref
374 */
375 if (bio_list_empty(&rbio->bio_list)) {
376 if (!list_empty(&rbio->hash_list)) {
377 list_del_init(&rbio->hash_list);
378 atomic_dec(&rbio->refs);
379 BUG_ON(!list_empty(&rbio->plug_list));
380 }
381 }
382 }
383
384 spin_unlock(&rbio->bio_list_lock);
385 spin_unlock(&h->lock);
386
387 if (freeit)
388 __free_raid_bio(rbio);
389}
390
391/*
392 * prune a given rbio from the cache
393 */
394static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
395{
396 struct btrfs_stripe_hash_table *table;
397 unsigned long flags;
398
399 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
400 return;
401
402 table = rbio->fs_info->stripe_hash_table;
403
404 spin_lock_irqsave(&table->cache_lock, flags);
405 __remove_rbio_from_cache(rbio);
406 spin_unlock_irqrestore(&table->cache_lock, flags);
407}
408
409/*
410 * remove everything in the cache
411 */
412void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
413{
414 struct btrfs_stripe_hash_table *table;
415 unsigned long flags;
416 struct btrfs_raid_bio *rbio;
417
418 table = info->stripe_hash_table;
419
420 spin_lock_irqsave(&table->cache_lock, flags);
421 while (!list_empty(&table->stripe_cache)) {
422 rbio = list_entry(table->stripe_cache.next,
423 struct btrfs_raid_bio,
424 stripe_cache);
425 __remove_rbio_from_cache(rbio);
426 }
427 spin_unlock_irqrestore(&table->cache_lock, flags);
428}
429
430/*
431 * remove all cached entries and free the hash table
432 * used by unmount
433 */
434void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
435{
436 if (!info->stripe_hash_table)
437 return;
438 btrfs_clear_rbio_cache(info);
439 if (is_vmalloc_addr(info->stripe_hash_table))
440 vfree(info->stripe_hash_table);
441 else
442 kfree(info->stripe_hash_table);
443 info->stripe_hash_table = NULL;
444}
445
446/*
447 * insert an rbio into the stripe cache. It
448 * must have already been prepared by calling
449 * cache_rbio_pages
450 *
451 * If this rbio was already cached, it gets
452 * moved to the front of the lru.
453 *
454 * If the size of the rbio cache is too big, we
455 * prune an item.
456 */
457static void cache_rbio(struct btrfs_raid_bio *rbio)
458{
459 struct btrfs_stripe_hash_table *table;
460 unsigned long flags;
461
462 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
463 return;
464
465 table = rbio->fs_info->stripe_hash_table;
466
467 spin_lock_irqsave(&table->cache_lock, flags);
468 spin_lock(&rbio->bio_list_lock);
469
470 /* bump our ref if we were not in the list before */
471 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
472 atomic_inc(&rbio->refs);
473
474 if (!list_empty(&rbio->stripe_cache)){
475 list_move(&rbio->stripe_cache, &table->stripe_cache);
476 } else {
477 list_add(&rbio->stripe_cache, &table->stripe_cache);
478 table->cache_size += 1;
479 }
480
481 spin_unlock(&rbio->bio_list_lock);
482
483 if (table->cache_size > RBIO_CACHE_SIZE) {
484 struct btrfs_raid_bio *found;
485
486 found = list_entry(table->stripe_cache.prev,
487 struct btrfs_raid_bio,
488 stripe_cache);
489
490 if (found != rbio)
491 __remove_rbio_from_cache(found);
492 }
493
494 spin_unlock_irqrestore(&table->cache_lock, flags);
495 return;
496}
497
498/*
499 * helper function to run the xor_blocks api. It is only
500 * able to do MAX_XOR_BLOCKS at a time, so we need to
501 * loop through.
502 */
503static void run_xor(void **pages, int src_cnt, ssize_t len)
504{
505 int src_off = 0;
506 int xor_src_cnt = 0;
507 void *dest = pages[src_cnt];
508
509 while(src_cnt > 0) {
510 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
511 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
512
513 src_cnt -= xor_src_cnt;
514 src_off += xor_src_cnt;
515 }
516}
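run_xor() feeds its sources to xor_blocks() at most MAX_XOR_BLOCKS at a time, accumulating everything into the destination page. Ignoring the chunking, what one pass computes is a plain byte-wise XOR of each source into the destination; a minimal sketch:

/* sketch only: dest ^= every source buffer, byte by byte */
#include <stddef.h>

static void xor_into(unsigned char *dest, unsigned char * const *srcs,
		     int src_cnt, size_t len)
{
	for (int i = 0; i < src_cnt; i++)
		for (size_t j = 0; j < len; j++)
			dest[j] ^= srcs[i][j];
}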
517
518/*
519 * returns true if the bio list inside this rbio
520 * covers an entire stripe (no rmw required).
521 * Must be called with the bio list lock held, or
522 * at a time when you know it is impossible to add
523 * new bios into the list
524 */
525static int __rbio_is_full(struct btrfs_raid_bio *rbio)
526{
527 unsigned long size = rbio->bio_list_bytes;
528 int ret = 1;
529
530 if (size != rbio->nr_data * rbio->stripe_len)
531 ret = 0;
532
533 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
534 return ret;
535}
536
537static int rbio_is_full(struct btrfs_raid_bio *rbio)
538{
539 unsigned long flags;
540 int ret;
541
542 spin_lock_irqsave(&rbio->bio_list_lock, flags);
543 ret = __rbio_is_full(rbio);
544 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
545 return ret;
546}
547
548/*
549 * returns 1 if it is safe to merge two rbios together.
550 * The merging is safe if the two rbios correspond to
551 * the same stripe and if they are both going in the same
552 * direction (read vs write), and if neither one is
553 * locked for final IO
554 *
555 * The caller is responsible for locking such that
556 * rmw_locked is safe to test
557 */
558static int rbio_can_merge(struct btrfs_raid_bio *last,
559 struct btrfs_raid_bio *cur)
560{
561 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
562 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
563 return 0;
564
565 /*
566 * we can't merge with cached rbios, since the
567 * idea is that when we merge the destination
568 * rbio is going to run our IO for us. We can
569 * steal from cached rbio's though, other functions
570 * handle that.
571 */
572 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
573 test_bit(RBIO_CACHE_BIT, &cur->flags))
574 return 0;
575
576 if (last->raid_map[0] !=
577 cur->raid_map[0])
578 return 0;
579
580 /* reads can't merge with writes */
581 if (last->read_rebuild !=
582 cur->read_rebuild) {
583 return 0;
584 }
585
586 return 1;
587}
588
589/*
590 * helper to index into the pstripe
591 */
592static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
593{
594 index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
595 return rbio->stripe_pages[index];
596}
597
598/*
599 * helper to index into the qstripe, returns null
600 * if there is no qstripe
601 */
602static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
603{
604 if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
605 return NULL;
606
607 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
608 PAGE_CACHE_SHIFT;
609 return rbio->stripe_pages[index];
610}
611
612/*
613 * The first stripe in the table for a logical address
614 * has the lock. rbios are added in one of three ways:
615 *
616 * 1) Nobody has the stripe locked yet. The rbio is given
617 * the lock and 0 is returned. The caller must start the IO
618 * themselves.
619 *
620 * 2) Someone has the stripe locked, but we're able to merge
621 * with the lock owner. The rbio is freed and the IO will
622 * start automatically along with the existing rbio. 1 is returned.
623 *
624 * 3) Someone has the stripe locked, but we're not able to merge.
625 * The rbio is added to the lock owner's plug list, or merged into
626 * an rbio already on the plug list. When the lock owner unlocks,
627 * the next rbio on the list is run and the IO is started automatically.
628 * 1 is returned
629 *
630 * If we return 0, the caller still owns the rbio and must continue with
631 * IO submission. If we return 1, the caller must assume the rbio has
632 * already been freed.
633 */
634static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
635{
636 int bucket = rbio_bucket(rbio);
637 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
638 struct btrfs_raid_bio *cur;
639 struct btrfs_raid_bio *pending;
640 unsigned long flags;
641 DEFINE_WAIT(wait);
642 struct btrfs_raid_bio *freeit = NULL;
643 struct btrfs_raid_bio *cache_drop = NULL;
644 int ret = 0;
645 int walk = 0;
646
647 spin_lock_irqsave(&h->lock, flags);
648 list_for_each_entry(cur, &h->hash_list, hash_list) {
649 walk++;
650 if (cur->raid_map[0] == rbio->raid_map[0]) {
651 spin_lock(&cur->bio_list_lock);
652
653 /* can we steal this cached rbio's pages? */
654 if (bio_list_empty(&cur->bio_list) &&
655 list_empty(&cur->plug_list) &&
656 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
657 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
658 list_del_init(&cur->hash_list);
659 atomic_dec(&cur->refs);
660
661 steal_rbio(cur, rbio);
662 cache_drop = cur;
663 spin_unlock(&cur->bio_list_lock);
664
665 goto lockit;
666 }
667
668 /* can we merge into the lock owner? */
669 if (rbio_can_merge(cur, rbio)) {
670 merge_rbio(cur, rbio);
671 spin_unlock(&cur->bio_list_lock);
672 freeit = rbio;
673 ret = 1;
674 goto out;
675 }
676
677
678 /*
679 * we couldn't merge with the running
680 * rbio, see if we can merge with the
681 * pending ones. We don't have to
682 * check for rmw_locked because there
683 * is no way they are inside finish_rmw
684 * right now
685 */
686 list_for_each_entry(pending, &cur->plug_list,
687 plug_list) {
688 if (rbio_can_merge(pending, rbio)) {
689 merge_rbio(pending, rbio);
690 spin_unlock(&cur->bio_list_lock);
691 freeit = rbio;
692 ret = 1;
693 goto out;
694 }
695 }
696
697 /* no merging, put us on the tail of the plug list,
698 * our rbio will be started with the currently
699 * running rbio unlocks
700 */
701 list_add_tail(&rbio->plug_list, &cur->plug_list);
702 spin_unlock(&cur->bio_list_lock);
703 ret = 1;
704 goto out;
705 }
706 }
707lockit:
708 atomic_inc(&rbio->refs);
709 list_add(&rbio->hash_list, &h->hash_list);
710out:
711 spin_unlock_irqrestore(&h->lock, flags);
712 if (cache_drop)
713 remove_rbio_from_cache(cache_drop);
714 if (freeit)
715 __free_raid_bio(freeit);
716 return ret;
717}
718
719/*
720 * called as rmw or parity rebuild is completed. If the plug list has more
721 * rbios waiting for this stripe, the next one on the list will be started
722 */
723static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
724{
725 int bucket;
726 struct btrfs_stripe_hash *h;
727 unsigned long flags;
728 int keep_cache = 0;
729
730 bucket = rbio_bucket(rbio);
731 h = rbio->fs_info->stripe_hash_table->table + bucket;
732
733 if (list_empty(&rbio->plug_list))
734 cache_rbio(rbio);
735
736 spin_lock_irqsave(&h->lock, flags);
737 spin_lock(&rbio->bio_list_lock);
738
739 if (!list_empty(&rbio->hash_list)) {
740 /*
741 * if we're still cached and there is no other IO
742 * to perform, just leave this rbio here for others
743 * to steal from later
744 */
745 if (list_empty(&rbio->plug_list) &&
746 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
747 keep_cache = 1;
748 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
749 BUG_ON(!bio_list_empty(&rbio->bio_list));
750 goto done;
751 }
752
753 list_del_init(&rbio->hash_list);
754 atomic_dec(&rbio->refs);
755
756 /*
757 * we use the plug list to hold all the rbios
758 * waiting for the chance to lock this stripe.
759 * hand the lock over to one of them.
760 */
761 if (!list_empty(&rbio->plug_list)) {
762 struct btrfs_raid_bio *next;
763 struct list_head *head = rbio->plug_list.next;
764
765 next = list_entry(head, struct btrfs_raid_bio,
766 plug_list);
767
768 list_del_init(&rbio->plug_list);
769
770 list_add(&next->hash_list, &h->hash_list);
771 atomic_inc(&next->refs);
772 spin_unlock(&rbio->bio_list_lock);
773 spin_unlock_irqrestore(&h->lock, flags);
774
775 if (next->read_rebuild)
776 async_read_rebuild(next);
777 else {
778 steal_rbio(rbio, next);
779 async_rmw_stripe(next);
780 }
781
782 goto done_nolock;
783 } else if (waitqueue_active(&h->wait)) {
784 spin_unlock(&rbio->bio_list_lock);
785 spin_unlock_irqrestore(&h->lock, flags);
786 wake_up(&h->wait);
787 goto done_nolock;
788 }
789 }
790done:
791 spin_unlock(&rbio->bio_list_lock);
792 spin_unlock_irqrestore(&h->lock, flags);
793
794done_nolock:
795 if (!keep_cache)
796 remove_rbio_from_cache(rbio);
797}
798
799static void __free_raid_bio(struct btrfs_raid_bio *rbio)
800{
801 int i;
802
803 WARN_ON(atomic_read(&rbio->refs) < 0);
804 if (!atomic_dec_and_test(&rbio->refs))
805 return;
806
807 WARN_ON(!list_empty(&rbio->stripe_cache));
808 WARN_ON(!list_empty(&rbio->hash_list));
809 WARN_ON(!bio_list_empty(&rbio->bio_list));
810
811 for (i = 0; i < rbio->nr_pages; i++) {
812 if (rbio->stripe_pages[i]) {
813 __free_page(rbio->stripe_pages[i]);
814 rbio->stripe_pages[i] = NULL;
815 }
816 }
817 kfree(rbio->raid_map);
818 kfree(rbio->bbio);
819 kfree(rbio);
820}
821
822static void free_raid_bio(struct btrfs_raid_bio *rbio)
823{
824 unlock_stripe(rbio);
825 __free_raid_bio(rbio);
826}
827
828/*
829 * this frees the rbio and runs through all the bios in the
830 * bio_list and calls end_io on them
831 */
832static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
833{
834 struct bio *cur = bio_list_get(&rbio->bio_list);
835 struct bio *next;
836 free_raid_bio(rbio);
837
838 while (cur) {
839 next = cur->bi_next;
840 cur->bi_next = NULL;
841 if (uptodate)
842 set_bit(BIO_UPTODATE, &cur->bi_flags);
843 bio_endio(cur, err);
844 cur = next;
845 }
846}
847
848/*
849 * end io function used by finish_rmw. When we finally
850 * get here, we've written a full stripe
851 */
852static void raid_write_end_io(struct bio *bio, int err)
853{
854 struct btrfs_raid_bio *rbio = bio->bi_private;
855
856 if (err)
857 fail_bio_stripe(rbio, bio);
858
859 bio_put(bio);
860
861 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
862 return;
863
864 err = 0;
865
866 /* OK, we have read all the stripes we need to. */
867 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
868 err = -EIO;
869
870 rbio_orig_end_io(rbio, err, 0);
871 return;
872}
873
874/*
875 * the read/modify/write code wants to use the original bio for
876 * any pages it included, and then use the rbio for everything
877 * else. This function decides if a given index (stripe number)
878 * and page number in that stripe fall inside the original bio
879 * or the rbio.
880 *
881 * if you set bio_list_only, you'll get a NULL back for any ranges
882 * that are outside the bio_list
883 *
884 * This doesn't take any refs on anything, you get a bare page pointer
885 * and the caller must bump refs as required.
886 *
887 * You must call index_rbio_pages once before you can trust
888 * the answers from this function.
889 */
890static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
891 int index, int pagenr, int bio_list_only)
892{
893 int chunk_page;
894 struct page *p = NULL;
895
896 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
897
898 spin_lock_irq(&rbio->bio_list_lock);
899 p = rbio->bio_pages[chunk_page];
900 spin_unlock_irq(&rbio->bio_list_lock);
901
902 if (p || bio_list_only)
903 return p;
904
905 return rbio->stripe_pages[chunk_page];
906}
907
908/*
909 * number of pages we need for the entire stripe across all the
910 * drives
911 */
912static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
913{
914 unsigned long nr = stripe_len * nr_stripes;
915 return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
916}
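rbio_nr_pages() is a round-up: total bytes across all stripes divided by the page size, rounded up to whole pages. The equivalent arithmetic, spelled out (4096 is assumed as the typical PAGE_CACHE_SIZE):

/* sketch only: pages needed to cover stripe_len * nr_stripes bytes */
#include <stdint.h>

#define PAGE_SIZE_BYTES 4096ULL   /* assumed page size */

static uint64_t nr_pages(uint64_t stripe_len, uint64_t nr_stripes)
{
	uint64_t bytes = stripe_len * nr_stripes;

	return (bytes + PAGE_SIZE_BYTES - 1) / PAGE_SIZE_BYTES;
}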
917
918/*
 919 * allocation and initial setup for the btrfs_raid_bio. Note that
 920 * this does not allocate any pages for rbio->pages.
921 */
922static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
923 struct btrfs_bio *bbio, u64 *raid_map,
924 u64 stripe_len)
925{
926 struct btrfs_raid_bio *rbio;
927 int nr_data = 0;
928 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
929 void *p;
930
931 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
932 GFP_NOFS);
933 if (!rbio) {
934 kfree(raid_map);
935 kfree(bbio);
936 return ERR_PTR(-ENOMEM);
937 }
938
939 bio_list_init(&rbio->bio_list);
940 INIT_LIST_HEAD(&rbio->plug_list);
941 spin_lock_init(&rbio->bio_list_lock);
942 INIT_LIST_HEAD(&rbio->stripe_cache);
943 INIT_LIST_HEAD(&rbio->hash_list);
944 rbio->bbio = bbio;
945 rbio->raid_map = raid_map;
946 rbio->fs_info = root->fs_info;
947 rbio->stripe_len = stripe_len;
948 rbio->nr_pages = num_pages;
949 rbio->faila = -1;
950 rbio->failb = -1;
951 atomic_set(&rbio->refs, 1);
952
953 /*
954 * the stripe_pages and bio_pages array point to the extra
955 * memory we allocated past the end of the rbio
956 */
957 p = rbio + 1;
958 rbio->stripe_pages = p;
959 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
960
961 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
962 nr_data = bbio->num_stripes - 2;
963 else
964 nr_data = bbio->num_stripes - 1;
965
966 rbio->nr_data = nr_data;
967 return rbio;
968}
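alloc_rbio() above sizes a single allocation to hold the struct plus both page-pointer arrays, then points stripe_pages and bio_pages into the tail (the "p = rbio + 1" trick). A self-contained sketch of the same layout idea (hypothetical struct, not the kernel one):

/* sketch only: one allocation, two pointer arrays appended at the end */
#include <stdlib.h>

struct rbio_like {
	int nr_pages;
	void **stripe_pages;
	void **bio_pages;
};

static struct rbio_like *alloc_rbio_like(int num_pages)
{
	struct rbio_like *r;

	r = calloc(1, sizeof(*r) + 2 * num_pages * sizeof(void *));
	if (!r)
		return NULL;
	r->nr_pages = num_pages;
	r->stripe_pages = (void **)(r + 1);          /* first array */
	r->bio_pages = r->stripe_pages + num_pages;  /* second array */
	return r;
}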
969
970/* allocate pages for all the stripes in the bio, including parity */
971static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
972{
973 int i;
974 struct page *page;
975
976 for (i = 0; i < rbio->nr_pages; i++) {
977 if (rbio->stripe_pages[i])
978 continue;
979 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
980 if (!page)
981 return -ENOMEM;
982 rbio->stripe_pages[i] = page;
983 ClearPageUptodate(page);
984 }
985 return 0;
986}
987
988/* allocate pages for just the p/q stripes */
989static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
990{
991 int i;
992 struct page *page;
993
994 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
995
996 for (; i < rbio->nr_pages; i++) {
997 if (rbio->stripe_pages[i])
998 continue;
999 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1000 if (!page)
1001 return -ENOMEM;
1002 rbio->stripe_pages[i] = page;
1003 }
1004 return 0;
1005}
1006
1007/*
1008 * add a single page from a specific stripe into our list of bios for IO
1009 * this will try to merge into existing bios if possible, and returns
1010 * zero if all went well.
1011 */
1012int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1013 struct bio_list *bio_list,
1014 struct page *page,
1015 int stripe_nr,
1016 unsigned long page_index,
1017 unsigned long bio_max_len)
1018{
1019 struct bio *last = bio_list->tail;
1020 u64 last_end = 0;
1021 int ret;
1022 struct bio *bio;
1023 struct btrfs_bio_stripe *stripe;
1024 u64 disk_start;
1025
1026 stripe = &rbio->bbio->stripes[stripe_nr];
1027 disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1028
1029 /* if the device is missing, just fail this stripe */
1030 if (!stripe->dev->bdev)
1031 return fail_rbio_index(rbio, stripe_nr);
1032
1033 /* see if we can add this page onto our existing bio */
1034 if (last) {
1035 last_end = (u64)last->bi_sector << 9;
1036 last_end += last->bi_size;
1037
1038 /*
1039 * we can't merge these if they are from different
1040 * devices or if they are not contiguous
1041 */
1042 if (last_end == disk_start && stripe->dev->bdev &&
1043 test_bit(BIO_UPTODATE, &last->bi_flags) &&
1044 last->bi_bdev == stripe->dev->bdev) {
1045 ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1046 if (ret == PAGE_CACHE_SIZE)
1047 return 0;
1048 }
1049 }
1050
1051 /* put a new bio on the list */
1052 bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
1053 if (!bio)
1054 return -ENOMEM;
1055
1056 bio->bi_size = 0;
1057 bio->bi_bdev = stripe->dev->bdev;
1058 bio->bi_sector = disk_start >> 9;
1059 set_bit(BIO_UPTODATE, &bio->bi_flags);
1060
1061 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1062 bio_list_add(bio_list, bio);
1063 return 0;
1064}
1065
1066/*
1067 * while we're doing the read/modify/write cycle, we could
1068 * have errors in reading pages off the disk. This checks
1069 * for errors and if we're not able to read the page it'll
1070 * trigger parity reconstruction. The rmw will be finished
1071 * after we've reconstructed the failed stripes
1072 */
1073static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1074{
1075 if (rbio->faila >= 0 || rbio->failb >= 0) {
1076 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
1077 __raid56_parity_recover(rbio);
1078 } else {
1079 finish_rmw(rbio);
1080 }
1081}
1082
1083/*
1084 * these are just the pages from the rbio array, not from anything
1085 * the FS sent down to us
1086 */
1087static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1088{
1089 int index;
1090 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1091 index += page;
1092 return rbio->stripe_pages[index];
1093}
1094
1095/*
1096 * helper function to walk our bio list and populate the bio_pages array with
1097 * the result. This seems expensive, but it is faster than constantly
1098 * searching through the bio list as we setup the IO in finish_rmw or stripe
1099 * reconstruction.
1100 *
1101 * This must be called before you trust the answers from page_in_rbio
1102 */
1103static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1104{
1105 struct bio *bio;
1106 u64 start;
1107 unsigned long stripe_offset;
1108 unsigned long page_index;
1109 struct page *p;
1110 int i;
1111
1112 spin_lock_irq(&rbio->bio_list_lock);
1113 bio_list_for_each(bio, &rbio->bio_list) {
1114 start = (u64)bio->bi_sector << 9;
1115 stripe_offset = start - rbio->raid_map[0];
1116 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1117
1118 for (i = 0; i < bio->bi_vcnt; i++) {
1119 p = bio->bi_io_vec[i].bv_page;
1120 rbio->bio_pages[page_index + i] = p;
1121 }
1122 }
1123 spin_unlock_irq(&rbio->bio_list_lock);
1124}
1125
1126/*
1127 * this is called from one of two situations. We either
1128 * have a full stripe from the higher layers, or we've read all
1129 * the missing bits off disk.
1130 *
1131 * This will calculate the parity and then send down any
1132 * changed blocks.
1133 */
1134static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1135{
1136 struct btrfs_bio *bbio = rbio->bbio;
1137 void *pointers[bbio->num_stripes];
1138 int stripe_len = rbio->stripe_len;
1139 int nr_data = rbio->nr_data;
1140 int stripe;
1141 int pagenr;
1142 int p_stripe = -1;
1143 int q_stripe = -1;
1144 struct bio_list bio_list;
1145 struct bio *bio;
1146 int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1147 int ret;
1148
1149 bio_list_init(&bio_list);
1150
1151 if (bbio->num_stripes - rbio->nr_data == 1) {
1152 p_stripe = bbio->num_stripes - 1;
1153 } else if (bbio->num_stripes - rbio->nr_data == 2) {
1154 p_stripe = bbio->num_stripes - 2;
1155 q_stripe = bbio->num_stripes - 1;
1156 } else {
1157 BUG();
1158 }
1159
1160 /* at this point we either have a full stripe,
1161 * or we've read the full stripe from the drive.
1162 * recalculate the parity and write the new results.
1163 *
1164 * We're not allowed to add any new bios to the
1165 * bio list here, anyone else that wants to
1166 * change this stripe needs to do their own rmw.
1167 */
1168 spin_lock_irq(&rbio->bio_list_lock);
1169 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1170 spin_unlock_irq(&rbio->bio_list_lock);
1171
1172 atomic_set(&rbio->bbio->error, 0);
1173
1174 /*
1175 * now that we've set rmw_locked, run through the
1176 * bio list one last time and map the page pointers
1177 *
1178 * We don't cache full rbios because we're assuming
1179 * the higher layers are unlikely to use this area of
1180 * the disk again soon. If they do use it again,
1181 * hopefully they will send another full bio.
1182 */
1183 index_rbio_pages(rbio);
1184 if (!rbio_is_full(rbio))
1185 cache_rbio_pages(rbio);
1186 else
1187 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1188
1189 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1190 struct page *p;
1191 /* first collect one page from each data stripe */
1192 for (stripe = 0; stripe < nr_data; stripe++) {
1193 p = page_in_rbio(rbio, stripe, pagenr, 0);
1194 pointers[stripe] = kmap(p);
1195 }
1196
1197 /* then add the parity stripe */
1198 p = rbio_pstripe_page(rbio, pagenr);
1199 SetPageUptodate(p);
1200 pointers[stripe++] = kmap(p);
1201
1202 if (q_stripe != -1) {
1203
1204 /*
1205 * raid6, add the qstripe and call the
1206 * library function to fill in our p/q
1207 */
1208 p = rbio_qstripe_page(rbio, pagenr);
1209 SetPageUptodate(p);
1210 pointers[stripe++] = kmap(p);
1211
1212 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
1213 pointers);
1214 } else {
1215 /* raid5 */
1216 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1217 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1218 }
1219
1220
1221 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
1222 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1223 }
1224
1225 /*
1226 * time to start writing. Make bios for everything from the
1227 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1228 * everything else.
1229 */
1230 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1231 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1232 struct page *page;
1233 if (stripe < rbio->nr_data) {
1234 page = page_in_rbio(rbio, stripe, pagenr, 1);
1235 if (!page)
1236 continue;
1237 } else {
1238 page = rbio_stripe_page(rbio, stripe, pagenr);
1239 }
1240
1241 ret = rbio_add_io_page(rbio, &bio_list,
1242 page, stripe, pagenr, rbio->stripe_len);
1243 if (ret)
1244 goto cleanup;
1245 }
1246 }
1247
1248 atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
1249 BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
1250
1251 while (1) {
1252 bio = bio_list_pop(&bio_list);
1253 if (!bio)
1254 break;
1255
1256 bio->bi_private = rbio;
1257 bio->bi_end_io = raid_write_end_io;
1258 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1259 submit_bio(WRITE, bio);
1260 }
1261 return;
1262
1263cleanup:
1264 rbio_orig_end_io(rbio, -EIO, 0);
1265}
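In finish_rmw() above the raid5 parity page is simply the XOR of the data pages, while the raid6 path hands all the pages to raid6_call.gen_syndrome() to produce P and Q. A minimal sketch of the raid5 parity computation over in-memory buffers (not the kernel's kmap-based loop):

/* sketch only: raid5 parity is the XOR of the data blocks */
#include <stddef.h>

static void compute_parity(unsigned char *parity,
			   unsigned char * const *data,
			   int nr_data, size_t len)
{
	for (size_t j = 0; j < len; j++) {
		unsigned char p = 0;

		for (int i = 0; i < nr_data; i++)
			p ^= data[i][j];
		parity[j] = p;
	}
}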
1266
1267/*
1268 * helper to find the stripe number for a given bio. Used to figure out which
1269 * stripe has failed. This expects the bio to correspond to a physical disk,
1270 * so it looks up based on physical sector numbers.
1271 */
1272static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1273 struct bio *bio)
1274{
1275 u64 physical = bio->bi_sector;
1276 u64 stripe_start;
1277 int i;
1278 struct btrfs_bio_stripe *stripe;
1279
1280 physical <<= 9;
1281
1282 for (i = 0; i < rbio->bbio->num_stripes; i++) {
1283 stripe = &rbio->bbio->stripes[i];
1284 stripe_start = stripe->physical;
1285 if (physical >= stripe_start &&
1286 physical < stripe_start + rbio->stripe_len) {
1287 return i;
1288 }
1289 }
1290 return -1;
1291}
1292
1293/*
1294 * helper to find the stripe number for a given
1295 * bio (before mapping). Used to figure out which stripe has
1296 * failed. This looks up based on logical block numbers.
1297 */
1298static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1299 struct bio *bio)
1300{
1301 u64 logical = bio->bi_sector;
1302 u64 stripe_start;
1303 int i;
1304
1305 logical <<= 9;
1306
1307 for (i = 0; i < rbio->nr_data; i++) {
1308 stripe_start = rbio->raid_map[i];
1309 if (logical >= stripe_start &&
1310 logical < stripe_start + rbio->stripe_len) {
1311 return i;
1312 }
1313 }
1314 return -1;
1315}
1316
1317/*
1318 * returns -EIO if we had too many failures
1319 */
1320static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1321{
1322 unsigned long flags;
1323 int ret = 0;
1324
1325 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1326
1327 /* we already know this stripe is bad, move on */
1328 if (rbio->faila == failed || rbio->failb == failed)
1329 goto out;
1330
1331 if (rbio->faila == -1) {
1332 /* first failure on this rbio */
1333 rbio->faila = failed;
1334 atomic_inc(&rbio->bbio->error);
1335 } else if (rbio->failb == -1) {
1336 /* second failure on this rbio */
1337 rbio->failb = failed;
1338 atomic_inc(&rbio->bbio->error);
1339 } else {
1340 ret = -EIO;
1341 }
1342out:
1343 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1344
1345 return ret;
1346}
1347
1348/*
1349 * helper to fail a stripe based on a physical disk
1350 * bio.
1351 */
1352static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1353 struct bio *bio)
1354{
1355 int failed = find_bio_stripe(rbio, bio);
1356
1357 if (failed < 0)
1358 return -EIO;
1359
1360 return fail_rbio_index(rbio, failed);
1361}
1362
1363/*
1364 * this sets each page in the bio uptodate. It should only be used on private
1365 * rbio pages, nothing that comes in from the higher layers
1366 */
1367static void set_bio_pages_uptodate(struct bio *bio)
1368{
1369 int i;
1370 struct page *p;
1371
1372 for (i = 0; i < bio->bi_vcnt; i++) {
1373 p = bio->bi_io_vec[i].bv_page;
1374 SetPageUptodate(p);
1375 }
1376}
1377
1378/*
1379 * end io for the read phase of the rmw cycle. All the bios here are physical
1380 * stripe bios we've read from the disk so we can recalculate the parity of the
1381 * stripe.
1382 *
1383 * This will usually kick off finish_rmw once all the bios are read in, but it
1384 * may trigger parity reconstruction if we had any errors along the way
1385 */
1386static void raid_rmw_end_io(struct bio *bio, int err)
1387{
1388 struct btrfs_raid_bio *rbio = bio->bi_private;
1389
1390 if (err)
1391 fail_bio_stripe(rbio, bio);
1392 else
1393 set_bio_pages_uptodate(bio);
1394
1395 bio_put(bio);
1396
1397 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1398 return;
1399
1400 err = 0;
1401 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1402 goto cleanup;
1403
1404 /*
1405 * this will normally call finish_rmw to start our write
1406 * but if there are any failed stripes we'll reconstruct
1407 * from parity first
1408 */
1409 validate_rbio_for_rmw(rbio);
1410 return;
1411
1412cleanup:
1413
1414 rbio_orig_end_io(rbio, -EIO, 0);
1415}
1416
1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1418{
1419 rbio->work.flags = 0;
1420 rbio->work.func = rmw_work;
1421
1422 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1423 &rbio->work);
1424}
1425
1426static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1427{
1428 rbio->work.flags = 0;
1429 rbio->work.func = read_rebuild_work;
1430
1431 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1432 &rbio->work);
1433}
1434
1435/*
1436 * the stripe must be locked by the caller. It will
1437 * unlock after all the writes are done
1438 */
1439static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1440{
1441 int bios_to_read = 0;
1442 struct btrfs_bio *bbio = rbio->bbio;
1443 struct bio_list bio_list;
1444 int ret;
1445 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1446 int pagenr;
1447 int stripe;
1448 struct bio *bio;
1449
1450 bio_list_init(&bio_list);
1451
1452 ret = alloc_rbio_pages(rbio);
1453 if (ret)
1454 goto cleanup;
1455
1456 index_rbio_pages(rbio);
1457
1458 atomic_set(&rbio->bbio->error, 0);
1459 /*
1460 * build a list of bios to read all the missing parts of this
1461 * stripe
1462 */
1463 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1464 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1465 struct page *page;
1466 /*
1467 * we want to find all the pages missing from
1468 * the rbio and read them from the disk. If
1469 * page_in_rbio finds a page in the bio list
1470 * we don't need to read it off the stripe.
1471 */
1472 page = page_in_rbio(rbio, stripe, pagenr, 1);
1473 if (page)
1474 continue;
1475
1476 page = rbio_stripe_page(rbio, stripe, pagenr);
1477 /*
1478 * the bio cache may have handed us an uptodate
1479 * page. If so, be happy and use it
1480 */
1481 if (PageUptodate(page))
1482 continue;
1483
1484 ret = rbio_add_io_page(rbio, &bio_list, page,
1485 stripe, pagenr, rbio->stripe_len);
1486 if (ret)
1487 goto cleanup;
1488 }
1489 }
1490
1491 bios_to_read = bio_list_size(&bio_list);
1492 if (!bios_to_read) {
1493 /*
1494 * this can happen if others have merged with
1495 * us, it means there is nothing left to read.
1496 * But if there are missing devices it may not be
1497 * safe to do the full stripe write yet.
1498 */
1499 goto finish;
1500 }
1501
1502 /*
1503 * the bbio may be freed once we submit the last bio. Make sure
1504 * not to touch it after that
1505 */
1506 atomic_set(&bbio->stripes_pending, bios_to_read);
1507 while (1) {
1508 bio = bio_list_pop(&bio_list);
1509 if (!bio)
1510 break;
1511
1512 bio->bi_private = rbio;
1513 bio->bi_end_io = raid_rmw_end_io;
1514
1515 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1516 BTRFS_WQ_ENDIO_RAID56);
1517
1518 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1519 submit_bio(READ, bio);
1520 }
1521 /* the actual write will happen once the reads are done */
1522 return 0;
1523
1524cleanup:
1525 rbio_orig_end_io(rbio, -EIO, 0);
1526 return -EIO;
1527
1528finish:
1529 validate_rbio_for_rmw(rbio);
1530 return 0;
1531}
1532
1533/*
1534 * if the upper layers pass in a full stripe, we thank them by only allocating
1535 * enough pages to hold the parity, and sending it all down quickly.
1536 */
1537static int full_stripe_write(struct btrfs_raid_bio *rbio)
1538{
1539 int ret;
1540
1541 ret = alloc_rbio_parity_pages(rbio);
1542 if (ret)
1543 return ret;
1544
1545 ret = lock_stripe_add(rbio);
1546 if (ret == 0)
1547 finish_rmw(rbio);
1548 return 0;
1549}
1550
1551/*
1552 * partial stripe writes get handed over to async helpers.
1553 * We're really hoping to merge a few more writes into this
1554 * rbio before calculating new parity
1555 */
1556static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1557{
1558 int ret;
1559
1560 ret = lock_stripe_add(rbio);
1561 if (ret == 0)
1562 async_rmw_stripe(rbio);
1563 return 0;
1564}
1565
1566/*
1567 * sometimes while we were reading from the drive to
1568 * recalculate parity, enough new bios come in to create
1569 * a full stripe. So we do a check here to see if we can
1570 * go directly to finish_rmw
1571 */
1572static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1573{
1574 /* head off into rmw land if we don't have a full stripe */
1575 if (!rbio_is_full(rbio))
1576 return partial_stripe_write(rbio);
1577 return full_stripe_write(rbio);
1578}
1579
1580/*
1581 * We use plugging callbacks to collect full stripes.
1582 * Any time we get a partial stripe write while plugged
1583 * we collect it into a list. When the unplug comes down,
1584 * we sort the list by logical block number and merge
1585 * everything we can into the same rbios
1586 */
1587struct btrfs_plug_cb {
1588 struct blk_plug_cb cb;
1589 struct btrfs_fs_info *info;
1590 struct list_head rbio_list;
1591 struct btrfs_work work;
1592};
1593
1594/*
1595 * rbios on the plug list are sorted for easier merging.
1596 */
1597static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1598{
1599 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1600 plug_list);
1601 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1602 plug_list);
1603 u64 a_sector = ra->bio_list.head->bi_sector;
1604 u64 b_sector = rb->bio_list.head->bi_sector;
1605
1606 if (a_sector < b_sector)
1607 return -1;
1608 if (a_sector > b_sector)
1609 return 1;
1610 return 0;
1611}
1612
1613static void run_plug(struct btrfs_plug_cb *plug)
1614{
1615 struct btrfs_raid_bio *cur;
1616 struct btrfs_raid_bio *last = NULL;
1617
1618 /*
1619 * sort our plug list then try to merge
1620 * everything we can in hopes of creating full
1621 * stripes.
1622 */
1623 list_sort(NULL, &plug->rbio_list, plug_cmp);
1624 while (!list_empty(&plug->rbio_list)) {
1625 cur = list_entry(plug->rbio_list.next,
1626 struct btrfs_raid_bio, plug_list);
1627 list_del_init(&cur->plug_list);
1628
1629 if (rbio_is_full(cur)) {
1630 /* we have a full stripe, send it down */
1631 full_stripe_write(cur);
1632 continue;
1633 }
1634 if (last) {
1635 if (rbio_can_merge(last, cur)) {
1636 merge_rbio(last, cur);
1637 __free_raid_bio(cur);
1638 continue;
1639
1640 }
1641 __raid56_parity_write(last);
1642 }
1643 last = cur;
1644 }
1645 if (last) {
1646 __raid56_parity_write(last);
1647 }
1648 kfree(plug);
1649}
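
run_plug() above is the usual sort-then-coalesce pattern: order the pending rbios by starting sector, then merge neighbours into larger writes before parity is computed. A self-contained userspace sketch of that idea follows; it uses qsort over an array instead of list_sort over a list, merges on simple physical contiguity (a stand-in for what rbio_can_merge() really checks), and all sector numbers are invented for the demo.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct pending_write {
	uint64_t sector;	/* starting sector of the queued write */
	uint64_t len;		/* length in sectors */
};

/* same ordering rule as plug_cmp(): sort by starting sector */
static int cmp_sector(const void *a, const void *b)
{
	const struct pending_write *wa = a, *wb = b;

	if (wa->sector < wb->sector)
		return -1;
	if (wa->sector > wb->sector)
		return 1;
	return 0;
}

int main(void)
{
	struct pending_write w[] = {
		{ 128, 64 }, { 0, 64 }, { 64, 64 }, { 512, 64 },
	};
	int n = sizeof(w) / sizeof(w[0]);
	int out = 0;

	qsort(w, n, sizeof(w[0]), cmp_sector);

	/* coalesce runs that turned out to be contiguous after sorting */
	for (int i = 1; i < n; i++) {
		if (w[out].sector + w[out].len == w[i].sector)
			w[out].len += w[i].len;		/* merge into previous */
		else
			w[++out] = w[i];
	}

	for (int i = 0; i <= out; i++)
		printf("write at sector %llu, %llu sectors\n",
		       (unsigned long long)w[i].sector,
		       (unsigned long long)w[i].len);
	return 0;
}
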
1650
1651/*
1652 * if the unplug comes from schedule, we have to push the
1653 * work off to a helper thread
1654 */
1655static void unplug_work(struct btrfs_work *work)
1656{
1657 struct btrfs_plug_cb *plug;
1658 plug = container_of(work, struct btrfs_plug_cb, work);
1659 run_plug(plug);
1660}
1661
1662static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1663{
1664 struct btrfs_plug_cb *plug;
1665 plug = container_of(cb, struct btrfs_plug_cb, cb);
1666
1667 if (from_schedule) {
1668 plug->work.flags = 0;
1669 plug->work.func = unplug_work;
1670 btrfs_queue_worker(&plug->info->rmw_workers,
1671 &plug->work);
1672 return;
1673 }
1674 run_plug(plug);
1675}
1676
1677/*
1678 * our main entry point for writes from the rest of the FS.
1679 */
1680int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1681 struct btrfs_bio *bbio, u64 *raid_map,
1682 u64 stripe_len)
1683{
1684 struct btrfs_raid_bio *rbio;
1685 struct btrfs_plug_cb *plug = NULL;
1686 struct blk_plug_cb *cb;
1687
1688 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1689 if (IS_ERR(rbio)) {
1690 kfree(raid_map);
1691 kfree(bbio);
1692 return PTR_ERR(rbio);
1693 }
1694 bio_list_add(&rbio->bio_list, bio);
1695 rbio->bio_list_bytes = bio->bi_size;
1696
1697 /*
1698 * don't plug on full rbios, just get them out the door
1699 * as quickly as we can
1700 */
1701 if (rbio_is_full(rbio))
1702 return full_stripe_write(rbio);
1703
1704 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1705 sizeof(*plug));
1706 if (cb) {
1707 plug = container_of(cb, struct btrfs_plug_cb, cb);
1708 if (!plug->info) {
1709 plug->info = root->fs_info;
1710 INIT_LIST_HEAD(&plug->rbio_list);
1711 }
1712 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1713 } else {
1714 return __raid56_parity_write(rbio);
1715 }
1716 return 0;
1717}
1718
1719/*
1720 * all parity reconstruction happens here. We've read in everything
1721 * we can find from the drives and this does the heavy lifting of
1722 * sorting the good from the bad.
1723 */
1724static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1725{
1726 int pagenr, stripe;
1727 void **pointers;
1728 int faila = -1, failb = -1;
1729 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1730 struct page *page;
1731 int err;
1732 int i;
1733
1734 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1735 GFP_NOFS);
1736 if (!pointers) {
1737 err = -ENOMEM;
1738 goto cleanup_io;
1739 }
1740
1741 faila = rbio->faila;
1742 failb = rbio->failb;
1743
1744 if (rbio->read_rebuild) {
1745 spin_lock_irq(&rbio->bio_list_lock);
1746 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1747 spin_unlock_irq(&rbio->bio_list_lock);
1748 }
1749
1750 index_rbio_pages(rbio);
1751
1752 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1753 /* setup our array of pointers with pages
1754 * from each stripe
1755 */
1756 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1757 /*
1758 * if we're rebuilding a read, we have to use
1759 * pages from the bio list
1760 */
1761 if (rbio->read_rebuild &&
1762 (stripe == faila || stripe == failb)) {
1763 page = page_in_rbio(rbio, stripe, pagenr, 0);
1764 } else {
1765 page = rbio_stripe_page(rbio, stripe, pagenr);
1766 }
1767 pointers[stripe] = kmap(page);
1768 }
1769
1770 /* all raid6 handling here */
1771 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1772 RAID6_Q_STRIPE) {
1773
1774 /*
1775 * single failure, rebuild from parity raid5
1776 * style
1777 */
1778 if (failb < 0) {
1779 if (faila == rbio->nr_data) {
1780 /*
1781 * Just the P stripe has failed, without
1782 * a bad data or Q stripe.
1783 * TODO, we should redo the xor here.
1784 */
1785 err = -EIO;
1786 goto cleanup;
1787 }
1788 /*
1789 * a single failure in raid6 is rebuilt
1790 * in the pstripe code below
1791 */
1792 goto pstripe;
1793 }
1794
1795 /* make sure our ps and qs are in order */
1796 if (faila > failb) {
1797 int tmp = failb;
1798 failb = faila;
1799 faila = tmp;
1800 }
1801
1802			/* if the q stripe has failed, do a pstripe reconstruction
1803			 * from the xors.
1804			 * If both the q stripe and the P stripe have failed, we're
1805 * here due to a crc mismatch and we can't give them the
1806 * data they want
1807 */
1808 if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1809 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1810 err = -EIO;
1811 goto cleanup;
1812 }
1813 /*
1814 * otherwise we have one bad data stripe and
1815 * a good P stripe. raid5!
1816 */
1817 goto pstripe;
1818 }
1819
1820 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1821 raid6_datap_recov(rbio->bbio->num_stripes,
1822 PAGE_SIZE, faila, pointers);
1823 } else {
1824 raid6_2data_recov(rbio->bbio->num_stripes,
1825 PAGE_SIZE, faila, failb,
1826 pointers);
1827 }
1828 } else {
1829 void *p;
1830
1831 /* rebuild from P stripe here (raid5 or raid6) */
1832 BUG_ON(failb != -1);
1833pstripe:
1834 /* Copy parity block into failed block to start with */
1835 memcpy(pointers[faila],
1836 pointers[rbio->nr_data],
1837 PAGE_CACHE_SIZE);
1838
1839 /* rearrange the pointer array */
1840 p = pointers[faila];
1841 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1842 pointers[stripe] = pointers[stripe + 1];
1843 pointers[rbio->nr_data - 1] = p;
1844
1845 /* xor in the rest */
1846 run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1847 }
1848 /* if we're doing this rebuild as part of an rmw, go through
1849 * and set all of our private rbio pages in the
1850 * failed stripes as uptodate. This way finish_rmw will
1851 * know they can be trusted. If this was a read reconstruction,
1852 * other endio functions will fiddle the uptodate bits
1853 */
1854 if (!rbio->read_rebuild) {
1855 for (i = 0; i < nr_pages; i++) {
1856 if (faila != -1) {
1857 page = rbio_stripe_page(rbio, faila, i);
1858 SetPageUptodate(page);
1859 }
1860 if (failb != -1) {
1861 page = rbio_stripe_page(rbio, failb, i);
1862 SetPageUptodate(page);
1863 }
1864 }
1865 }
1866 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1867 /*
1868 * if we're rebuilding a read, we have to use
1869 * pages from the bio list
1870 */
1871 if (rbio->read_rebuild &&
1872 (stripe == faila || stripe == failb)) {
1873 page = page_in_rbio(rbio, stripe, pagenr, 0);
1874 } else {
1875 page = rbio_stripe_page(rbio, stripe, pagenr);
1876 }
1877 kunmap(page);
1878 }
1879 }
1880
1881 err = 0;
1882cleanup:
1883 kfree(pointers);
1884
1885cleanup_io:
1886
1887 if (rbio->read_rebuild) {
1888 if (err == 0)
1889 cache_rbio_pages(rbio);
1890 else
1891 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1892
1893 rbio_orig_end_io(rbio, err, err == 0);
1894 } else if (err == 0) {
1895 rbio->faila = -1;
1896 rbio->failb = -1;
1897 finish_rmw(rbio);
1898 } else {
1899 rbio_orig_end_io(rbio, err, 0);
1900 }
1901}
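
The pstripe: path above is plain RAID5 recovery: copy the parity block over the failed slot, then XOR every surviving data block back in, which is what the pointer shuffle plus run_xor() accomplish on the kmapped pages. A minimal userspace sketch of the same arithmetic; the block count and block size are made up and stand in for nr_data and PAGE_CACHE_SIZE.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define NBLOCKS 4	/* data blocks per stripe (hypothetical) */
#define BLKSZ   16	/* bytes per block (PAGE_CACHE_SIZE stand-in) */

static void xor_into(uint8_t *dst, const uint8_t *src, size_t len)
{
	for (size_t i = 0; i < len; i++)
		dst[i] ^= src[i];
}

int main(void)
{
	uint8_t data[NBLOCKS][BLKSZ];
	uint8_t parity[BLKSZ] = { 0 };
	uint8_t rebuilt[BLKSZ];
	int failed = 2;		/* pretend data block 2 was lost */

	/* fill the stripe with recognizable bytes and compute P */
	for (int b = 0; b < NBLOCKS; b++) {
		memset(data[b], 'A' + b, BLKSZ);
		xor_into(parity, data[b], BLKSZ);
	}

	/* recovery: start from P, then XOR in every surviving data block */
	memcpy(rebuilt, parity, BLKSZ);
	for (int b = 0; b < NBLOCKS; b++)
		if (b != failed)
			xor_into(rebuilt, data[b], BLKSZ);

	printf("recovered block matches original: %s\n",
	       memcmp(rebuilt, data[failed], BLKSZ) == 0 ? "yes" : "no");
	return 0;
}
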
1902
1903/*
1904 * This is called only for stripes we've read from disk to
1905 * reconstruct the parity.
1906 */
1907static void raid_recover_end_io(struct bio *bio, int err)
1908{
1909 struct btrfs_raid_bio *rbio = bio->bi_private;
1910
1911 /*
1912 * we only read stripe pages off the disk, set them
1913 * up to date if there were no errors
1914 */
1915 if (err)
1916 fail_bio_stripe(rbio, bio);
1917 else
1918 set_bio_pages_uptodate(bio);
1919 bio_put(bio);
1920
1921 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1922 return;
1923
1924 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1925 rbio_orig_end_io(rbio, -EIO, 0);
1926 else
1927 __raid_recover_end_io(rbio);
1928}
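
raid_recover_end_io() is a countdown completion: every read bio decrements stripes_pending, and whichever completion reaches zero either gives up (error count above max_errors) or kicks off __raid_recover_end_io(). A stripped-down C11/pthreads sketch of that shape, where threads stand in for bio completions and the bio count and error threshold are invented for the demo.

#include <stdio.h>
#include <pthread.h>
#include <stdatomic.h>

#define NR_BIOS    4
#define MAX_ERRORS 1	/* bbio->max_errors stand-in */

static atomic_int pending = NR_BIOS;
static atomic_int errors;

static void finish(void)
{
	/* runs exactly once, in whichever completion got here last */
	if (atomic_load(&errors) > MAX_ERRORS)
		printf("too many failed stripes, ending the rbio with -EIO\n");
	else
		printf("all reads finished, reconstructing\n");
}

static void *completion(void *arg)
{
	if (arg)		/* non-NULL means this "bio" failed */
		atomic_fetch_add(&errors, 1);

	/* whoever drops the count to zero does the final step */
	if (atomic_fetch_sub(&pending, 1) == 1)
		finish();
	return NULL;
}

int main(void)
{
	pthread_t t[NR_BIOS];

	for (int i = 0; i < NR_BIOS; i++)
		pthread_create(&t[i], NULL, completion,
			       i == 2 ? (void *)1 : NULL);	/* one read fails */
	for (int i = 0; i < NR_BIOS; i++)
		pthread_join(t[i], NULL);
	return 0;
}
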
1929
1930/*
1931 * reads everything we need off the disk to reconstruct
1932 * the parity. endio handlers trigger final reconstruction
1933 * when the IO is done.
1934 *
1935 * This is used both for reads from the higher layers and for
1936 * parity construction required to finish an rmw cycle.
1937 */
1938static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1939{
1940 int bios_to_read = 0;
1941 struct btrfs_bio *bbio = rbio->bbio;
1942 struct bio_list bio_list;
1943 int ret;
1944 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1945 int pagenr;
1946 int stripe;
1947 struct bio *bio;
1948
1949 bio_list_init(&bio_list);
1950
1951 ret = alloc_rbio_pages(rbio);
1952 if (ret)
1953 goto cleanup;
1954
1955 atomic_set(&rbio->bbio->error, 0);
1956
1957 /*
1958 * read everything that hasn't failed. Thanks to the
1959 * stripe cache, it is possible that some or all of these
1960 * pages are going to be uptodate.
1961 */
1962 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1963 if (rbio->faila == stripe ||
1964 rbio->failb == stripe)
1965 continue;
1966
1967 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1968 struct page *p;
1969
1970 /*
1971 * the rmw code may have already read this
1972 * page in
1973 */
1974 p = rbio_stripe_page(rbio, stripe, pagenr);
1975 if (PageUptodate(p))
1976 continue;
1977
1978 ret = rbio_add_io_page(rbio, &bio_list,
1979 rbio_stripe_page(rbio, stripe, pagenr),
1980 stripe, pagenr, rbio->stripe_len);
1981 if (ret < 0)
1982 goto cleanup;
1983 }
1984 }
1985
1986 bios_to_read = bio_list_size(&bio_list);
1987 if (!bios_to_read) {
1988 /*
1989 * we might have no bios to read just because the pages
1990 * were up to date, or we might have no bios to read because
1991 * the devices were gone.
1992 */
1993 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
1994 __raid_recover_end_io(rbio);
1995 goto out;
1996 } else {
1997 goto cleanup;
1998 }
1999 }
2000
2001 /*
2002 * the bbio may be freed once we submit the last bio. Make sure
2003 * not to touch it after that
2004 */
2005 atomic_set(&bbio->stripes_pending, bios_to_read);
2006 while (1) {
2007 bio = bio_list_pop(&bio_list);
2008 if (!bio)
2009 break;
2010
2011 bio->bi_private = rbio;
2012 bio->bi_end_io = raid_recover_end_io;
2013
2014 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2015 BTRFS_WQ_ENDIO_RAID56);
2016
2017 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2018 submit_bio(READ, bio);
2019 }
2020out:
2021 return 0;
2022
2023cleanup:
2024 if (rbio->read_rebuild)
2025 rbio_orig_end_io(rbio, -EIO, 0);
2026 return -EIO;
2027}
2028
2029/*
2030 * the main entry point for reads from the higher layers. This
2031 * is really only called when the normal read path had a failure,
2032 * so we assume the bio they send down corresponds to a failed part
2033 * of the drive.
2034 */
2035int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2036 struct btrfs_bio *bbio, u64 *raid_map,
2037 u64 stripe_len, int mirror_num)
2038{
2039 struct btrfs_raid_bio *rbio;
2040 int ret;
2041
2042 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2043 if (IS_ERR(rbio)) {
2044 return PTR_ERR(rbio);
2045 }
2046
2047 rbio->read_rebuild = 1;
2048 bio_list_add(&rbio->bio_list, bio);
2049 rbio->bio_list_bytes = bio->bi_size;
2050
2051 rbio->faila = find_logical_bio_stripe(rbio, bio);
2052 if (rbio->faila == -1) {
2053 BUG();
2054 kfree(rbio);
2055 return -EIO;
2056 }
2057
2058 /*
2059 * reconstruct from the q stripe if they are
2060 * asking for mirror 3
2061 */
2062 if (mirror_num == 3)
2063 rbio->failb = bbio->num_stripes - 2;
2064
2065 ret = lock_stripe_add(rbio);
2066
2067 /*
2068 * __raid56_parity_recover will end the bio with
2069 * any errors it hits. We don't want to return
2070 * its error value up the stack because our caller
2071 * will end up calling bio_endio with any nonzero
2072 * return
2073 */
2074 if (ret == 0)
2075 __raid56_parity_recover(rbio);
2076 /*
2077 * our rbio has been added to the list of
2078 * rbios that will be handled after the
2079	 * current lock owner is done
2080 */
2081 return 0;
2082
2083}
2084
2085static void rmw_work(struct btrfs_work *work)
2086{
2087 struct btrfs_raid_bio *rbio;
2088
2089 rbio = container_of(work, struct btrfs_raid_bio, work);
2090 raid56_rmw_stripe(rbio);
2091}
2092
2093static void read_rebuild_work(struct btrfs_work *work)
2094{
2095 struct btrfs_raid_bio *rbio;
2096
2097 rbio = container_of(work, struct btrfs_raid_bio, work);
2098 __raid56_parity_recover(rbio);
2099}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
new file mode 100644
index 000000000000..ea5d73bfdfbe
--- /dev/null
+++ b/fs/btrfs/raid56.h
@@ -0,0 +1,51 @@
1/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19
20#ifndef __BTRFS_RAID56__
21#define __BTRFS_RAID56__
22static inline int nr_parity_stripes(struct map_lookup *map)
23{
24 if (map->type & BTRFS_BLOCK_GROUP_RAID5)
25 return 1;
26 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
27 return 2;
28 else
29 return 0;
30}
31
32static inline int nr_data_stripes(struct map_lookup *map)
33{
34 return map->num_stripes - nr_parity_stripes(map);
35}
36#define RAID5_P_STRIPE ((u64)-2)
37#define RAID6_Q_STRIPE ((u64)-1)
38
39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
40 ((x) == RAID6_Q_STRIPE))
41
42int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
43 struct btrfs_bio *bbio, u64 *raid_map,
44 u64 stripe_len, int mirror_num);
45int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
46 struct btrfs_bio *bbio, u64 *raid_map,
47 u64 stripe_len);
48
49int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
50void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
51#endif
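
These helpers are all the scrub change below needs: a stripe index at or beyond nr_data_stripes() must be P or Q, and scrub skips it for now. A tiny standalone illustration of how the chunk type splits stripes into data and parity; struct map_lookup is cut down to the two fields used here, and the block-group flag values are assumed to match their definitions in ctree.h.

#include <stdio.h>

/* assumed to mirror the RAID5/6 block group flags in ctree.h */
#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)

/* cut-down stand-in for struct map_lookup from volumes.h */
struct map_lookup {
	unsigned long long type;
	int num_stripes;
};

static int nr_parity_stripes(const struct map_lookup *map)
{
	if (map->type & BTRFS_BLOCK_GROUP_RAID5)
		return 1;
	if (map->type & BTRFS_BLOCK_GROUP_RAID6)
		return 2;
	return 0;
}

static int nr_data_stripes(const struct map_lookup *map)
{
	return map->num_stripes - nr_parity_stripes(map);
}

int main(void)
{
	/* a six-device RAID6 chunk: four data stripes plus P and Q */
	struct map_lookup raid6 = { BTRFS_BLOCK_GROUP_RAID6, 6 };

	for (int num = 0; num < raid6.num_stripes; num++)
		printf("stripe %d: %s\n", num,
		       num >= nr_data_stripes(&raid6) ?
		       "parity (skipped by scrub)" : "data");
	return 0;
}
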
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 17c306bf177a..50695dc5e2ab 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3017,7 +3017,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
3017 } 3017 }
3018 } 3018 }
3019 3019
3020 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 3020 page_start = page_offset(page);
3021 page_end = page_start + PAGE_CACHE_SIZE - 1; 3021 page_end = page_start + PAGE_CACHE_SIZE - 1;
3022 3022
3023 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); 3023 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 67783e03d121..53c3501fa4ca 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -28,6 +28,7 @@
28#include "dev-replace.h" 28#include "dev-replace.h"
29#include "check-integrity.h" 29#include "check-integrity.h"
30#include "rcu-string.h" 30#include "rcu-string.h"
31#include "raid56.h"
31 32
32/* 33/*
33 * This is only the first step towards a full-features scrub. It reads all 34 * This is only the first step towards a full-features scrub. It reads all
@@ -2254,6 +2255,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2254 struct btrfs_device *extent_dev; 2255 struct btrfs_device *extent_dev;
2255 int extent_mirror_num; 2256 int extent_mirror_num;
2256 2257
2258 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2259 BTRFS_BLOCK_GROUP_RAID6)) {
2260 if (num >= nr_data_stripes(map)) {
2261 return 0;
2262 }
2263 }
2264
2257 nstripes = length; 2265 nstripes = length;
2258 offset = 0; 2266 offset = 0;
2259 do_div(nstripes, map->stripe_len); 2267 do_div(nstripes, map->stripe_len);
@@ -2708,7 +2716,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2708 int ret; 2716 int ret;
2709 struct btrfs_root *root = sctx->dev_root; 2717 struct btrfs_root *root = sctx->dev_root;
2710 2718
2711 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2719 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
2712 return -EIO; 2720 return -EIO;
2713 2721
2714 gen = root->fs_info->last_trans_committed; 2722 gen = root->fs_info->last_trans_committed;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f4ab7a9260eb..f7a8b861058b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -85,6 +85,7 @@ struct send_ctx {
85 u32 send_max_size; 85 u32 send_max_size;
86 u64 total_send_size; 86 u64 total_send_size;
87 u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; 87 u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
88 u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
88 89
89 struct vfsmount *mnt; 90 struct vfsmount *mnt;
90 91
@@ -3709,6 +3710,39 @@ out:
3709 return ret; 3710 return ret;
3710} 3711}
3711 3712
3713/*
3714 * Send an update extent command to user space.
3715 */
3716static int send_update_extent(struct send_ctx *sctx,
3717 u64 offset, u32 len)
3718{
3719 int ret = 0;
3720 struct fs_path *p;
3721
3722 p = fs_path_alloc(sctx);
3723 if (!p)
3724 return -ENOMEM;
3725
3726 ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
3727 if (ret < 0)
3728 goto out;
3729
3730 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
3731 if (ret < 0)
3732 goto out;
3733
3734 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
3735 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
3736 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
3737
3738 ret = send_cmd(sctx);
3739
3740tlv_put_failure:
3741out:
3742 fs_path_free(sctx, p);
3743 return ret;
3744}
3745
3712static int send_write_or_clone(struct send_ctx *sctx, 3746static int send_write_or_clone(struct send_ctx *sctx,
3713 struct btrfs_path *path, 3747 struct btrfs_path *path,
3714 struct btrfs_key *key, 3748 struct btrfs_key *key,
@@ -3744,7 +3778,11 @@ static int send_write_or_clone(struct send_ctx *sctx,
3744 goto out; 3778 goto out;
3745 } 3779 }
3746 3780
3747 if (!clone_root) { 3781 if (clone_root) {
3782 ret = send_clone(sctx, offset, len, clone_root);
3783 } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
3784 ret = send_update_extent(sctx, offset, len);
3785 } else {
3748 while (pos < len) { 3786 while (pos < len) {
3749 l = len - pos; 3787 l = len - pos;
3750 if (l > BTRFS_SEND_READ_SIZE) 3788 if (l > BTRFS_SEND_READ_SIZE)
@@ -3757,10 +3795,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
3757 pos += ret; 3795 pos += ret;
3758 } 3796 }
3759 ret = 0; 3797 ret = 0;
3760 } else {
3761 ret = send_clone(sctx, offset, len, clone_root);
3762 } 3798 }
3763
3764out: 3799out:
3765 return ret; 3800 return ret;
3766} 3801}
@@ -4536,7 +4571,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4536 struct btrfs_fs_info *fs_info; 4571 struct btrfs_fs_info *fs_info;
4537 struct btrfs_ioctl_send_args *arg = NULL; 4572 struct btrfs_ioctl_send_args *arg = NULL;
4538 struct btrfs_key key; 4573 struct btrfs_key key;
4539 struct file *filp = NULL;
4540 struct send_ctx *sctx = NULL; 4574 struct send_ctx *sctx = NULL;
4541 u32 i; 4575 u32 i;
4542 u64 *clone_sources_tmp = NULL; 4576 u64 *clone_sources_tmp = NULL;
@@ -4561,6 +4595,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4561 goto out; 4595 goto out;
4562 } 4596 }
4563 4597
4598 if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) {
4599 ret = -EINVAL;
4600 goto out;
4601 }
4602
4564 sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); 4603 sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
4565 if (!sctx) { 4604 if (!sctx) {
4566 ret = -ENOMEM; 4605 ret = -ENOMEM;
@@ -4572,6 +4611,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4572 INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); 4611 INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
4573 INIT_LIST_HEAD(&sctx->name_cache_list); 4612 INIT_LIST_HEAD(&sctx->name_cache_list);
4574 4613
4614 sctx->flags = arg->flags;
4615
4575 sctx->send_filp = fget(arg->send_fd); 4616 sctx->send_filp = fget(arg->send_fd);
4576 if (IS_ERR(sctx->send_filp)) { 4617 if (IS_ERR(sctx->send_filp)) {
4577 ret = PTR_ERR(sctx->send_filp); 4618 ret = PTR_ERR(sctx->send_filp);
@@ -4673,8 +4714,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4673 goto out; 4714 goto out;
4674 4715
4675out: 4716out:
4676 if (filp)
4677 fput(filp);
4678 kfree(arg); 4717 kfree(arg);
4679 vfree(clone_sources_tmp); 4718 vfree(clone_sources_tmp);
4680 4719
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 1bf4f32fd4ef..8bb18f7ccaa6 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -86,6 +86,7 @@ enum btrfs_send_cmd {
86 BTRFS_SEND_C_UTIMES, 86 BTRFS_SEND_C_UTIMES,
87 87
88 BTRFS_SEND_C_END, 88 BTRFS_SEND_C_END,
89 BTRFS_SEND_C_UPDATE_EXTENT,
89 __BTRFS_SEND_C_MAX, 90 __BTRFS_SEND_C_MAX,
90}; 91};
91#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) 92#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
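
Together, the flag check in btrfs_ioctl_send() and send_update_extent() turn the send stream into a metadata-only description: changed extents are announced with BTRFS_SEND_C_UPDATE_EXTENT instead of having their data written out. A hedged userspace sketch of requesting such a stream; it assumes the uapi <linux/btrfs.h> from this series exports BTRFS_IOC_SEND, struct btrfs_ioctl_send_args and BTRFS_SEND_FLAG_NO_FILE_DATA, and the paths and error handling are illustrative only.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>	/* BTRFS_IOC_SEND, struct btrfs_ioctl_send_args */

int main(int argc, char **argv)
{
	struct btrfs_ioctl_send_args args;
	int subvol, out;

	if (argc < 3) {
		fprintf(stderr, "usage: %s <ro-snapshot> <outfile>\n", argv[0]);
		return 1;
	}

	subvol = open(argv[1], O_RDONLY);	/* read-only snapshot to send */
	out = open(argv[2], O_CREAT | O_WRONLY | O_TRUNC, 0644);
	if (subvol < 0 || out < 0) {
		perror("open");
		return 1;
	}

	memset(&args, 0, sizeof(args));
	args.send_fd = out;
	args.flags = BTRFS_SEND_FLAG_NO_FILE_DATA;	/* metadata-only stream */

	if (ioctl(subvol, BTRFS_IOC_SEND, &args) < 0) {
		perror("BTRFS_IOC_SEND");
		return 1;
	}
	return 0;
}
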
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d8982e9601d3..68a29a1ea068 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,13 +41,13 @@
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/ratelimit.h> 43#include <linux/ratelimit.h>
44#include <linux/btrfs.h>
44#include "compat.h" 45#include "compat.h"
45#include "delayed-inode.h" 46#include "delayed-inode.h"
46#include "ctree.h" 47#include "ctree.h"
47#include "disk-io.h" 48#include "disk-io.h"
48#include "transaction.h" 49#include "transaction.h"
49#include "btrfs_inode.h" 50#include "btrfs_inode.h"
50#include "ioctl.h"
51#include "print-tree.h" 51#include "print-tree.h"
52#include "xattr.h" 52#include "xattr.h"
53#include "volumes.h" 53#include "volumes.h"
@@ -63,8 +63,7 @@
63static const struct super_operations btrfs_super_ops; 63static const struct super_operations btrfs_super_ops;
64static struct file_system_type btrfs_fs_type; 64static struct file_system_type btrfs_fs_type;
65 65
66static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 66static const char *btrfs_decode_error(int errno, char nbuf[16])
67 char nbuf[16])
68{ 67{
69 char *errstr = NULL; 68 char *errstr = NULL;
70 69
@@ -98,7 +97,7 @@ static void __save_error_info(struct btrfs_fs_info *fs_info)
98 * today we only save the error info into ram. Long term we'll 97 * today we only save the error info into ram. Long term we'll
99 * also send it down to the disk 98 * also send it down to the disk
100 */ 99 */
101 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; 100 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
102} 101}
103 102
104static void save_error_info(struct btrfs_fs_info *fs_info) 103static void save_error_info(struct btrfs_fs_info *fs_info)
@@ -114,7 +113,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
114 if (sb->s_flags & MS_RDONLY) 113 if (sb->s_flags & MS_RDONLY)
115 return; 114 return;
116 115
117 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 116 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
118 sb->s_flags |= MS_RDONLY; 117 sb->s_flags |= MS_RDONLY;
119 printk(KERN_INFO "btrfs is forced readonly\n"); 118 printk(KERN_INFO "btrfs is forced readonly\n");
120 /* 119 /*
@@ -142,8 +141,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
142 struct super_block *sb = fs_info->sb; 141 struct super_block *sb = fs_info->sb;
143 char nbuf[16]; 142 char nbuf[16];
144 const char *errstr; 143 const char *errstr;
145 va_list args;
146 va_start(args, fmt);
147 144
148 /* 145 /*
149 * Special case: if the error is EROFS, and we're already 146 * Special case: if the error is EROFS, and we're already
@@ -152,15 +149,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
152 if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) 149 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
153 return; 150 return;
154 151
155 errstr = btrfs_decode_error(fs_info, errno, nbuf); 152 errstr = btrfs_decode_error(errno, nbuf);
156 if (fmt) { 153 if (fmt) {
157 struct va_format vaf = { 154 struct va_format vaf;
158 .fmt = fmt, 155 va_list args;
159 .va = &args, 156
160 }; 157 va_start(args, fmt);
158 vaf.fmt = fmt;
159 vaf.va = &args;
161 160
162 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", 161 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n",
163 sb->s_id, function, line, errstr, &vaf); 162 sb->s_id, function, line, errstr, &vaf);
163 va_end(args);
164 } else { 164 } else {
165 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", 165 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
166 sb->s_id, function, line, errstr); 166 sb->s_id, function, line, errstr);
@@ -171,7 +171,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
171 save_error_info(fs_info); 171 save_error_info(fs_info);
172 btrfs_handle_error(fs_info); 172 btrfs_handle_error(fs_info);
173 } 173 }
174 va_end(args);
175} 174}
176 175
177static const char * const logtypes[] = { 176static const char * const logtypes[] = {
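
The __btrfs_std_error() rework above scopes the va_list to the branch that actually formats a message, so the early EROFS return no longer leaves a va_start() without a matching va_end(). The same discipline in a plain userspace helper; the function name and message format are hypothetical.

#include <stdio.h>
#include <stdarg.h>

/* log an error; fmt may be NULL when there is nothing extra to add */
static void report_error(const char *what, const char *fmt, ...)
{
	if (fmt) {
		va_list args;

		va_start(args, fmt);	/* start/end bracket the only use */
		fprintf(stderr, "error: %s: ", what);
		vfprintf(stderr, fmt, args);
		fputc('\n', stderr);
		va_end(args);
	} else {
		fprintf(stderr, "error: %s\n", what);
	}
}

int main(void)
{
	report_error("device sda", "bad checksum at %llu", 4096ULL);
	report_error("device sdb", NULL);
	return 0;
}
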
@@ -261,7 +260,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
261 char nbuf[16]; 260 char nbuf[16];
262 const char *errstr; 261 const char *errstr;
263 262
264 errstr = btrfs_decode_error(root->fs_info, errno, nbuf); 263 errstr = btrfs_decode_error(errno, nbuf);
265 btrfs_printk(root->fs_info, 264 btrfs_printk(root->fs_info,
266 "%s:%d: Aborting unused transaction(%s).\n", 265 "%s:%d: Aborting unused transaction(%s).\n",
267 function, line, errstr); 266 function, line, errstr);
@@ -289,8 +288,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
289 va_start(args, fmt); 288 va_start(args, fmt);
290 vaf.va = &args; 289 vaf.va = &args;
291 290
292 errstr = btrfs_decode_error(fs_info, errno, nbuf); 291 errstr = btrfs_decode_error(errno, nbuf);
293 if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR) 292 if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
294 panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", 293 panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
295 s_id, function, line, &vaf, errstr); 294 s_id, function, line, &vaf, errstr);
296 295
@@ -438,6 +437,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
438 case Opt_compress_force: 437 case Opt_compress_force:
439 case Opt_compress_force_type: 438 case Opt_compress_force_type:
440 compress_force = true; 439 compress_force = true;
440 /* Fallthrough */
441 case Opt_compress: 441 case Opt_compress:
442 case Opt_compress_type: 442 case Opt_compress_type:
443 if (token == Opt_compress || 443 if (token == Opt_compress ||
@@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
519 case Opt_alloc_start: 519 case Opt_alloc_start:
520 num = match_strdup(&args[0]); 520 num = match_strdup(&args[0]);
521 if (num) { 521 if (num) {
522 mutex_lock(&info->chunk_mutex);
522 info->alloc_start = memparse(num, NULL); 523 info->alloc_start = memparse(num, NULL);
524 mutex_unlock(&info->chunk_mutex);
523 kfree(num); 525 kfree(num);
524 printk(KERN_INFO 526 printk(KERN_INFO
525 "btrfs: allocations start at %llu\n", 527 "btrfs: allocations start at %llu\n",
@@ -876,7 +878,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
876 878
877 btrfs_wait_ordered_extents(root, 0); 879 btrfs_wait_ordered_extents(root, 0);
878 880
879 trans = btrfs_attach_transaction(root); 881 trans = btrfs_attach_transaction_barrier(root);
880 if (IS_ERR(trans)) { 882 if (IS_ERR(trans)) {
881 /* no transaction, don't bother */ 883 /* no transaction, don't bother */
882 if (PTR_ERR(trans) == -ENOENT) 884 if (PTR_ERR(trans) == -ENOENT)
@@ -1200,6 +1202,38 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1200 new_pool_size); 1202 new_pool_size);
1201} 1203}
1202 1204
1205static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info,
1206 unsigned long old_opts, int flags)
1207{
1208 set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
1209
1210 if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
1211 (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
1212 (flags & MS_RDONLY))) {
1213 /* wait for any defraggers to finish */
1214 wait_event(fs_info->transaction_wait,
1215 (atomic_read(&fs_info->defrag_running) == 0));
1216 if (flags & MS_RDONLY)
1217 sync_filesystem(fs_info->sb);
1218 }
1219}
1220
1221static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
1222 unsigned long old_opts)
1223{
1224 /*
1225	 * We need to clean up all defraggable inodes if autodefrag is
1226	 * turned off or the fs is R/O.
1227 */
1228 if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
1229 (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
1230 (fs_info->sb->s_flags & MS_RDONLY))) {
1231 btrfs_cleanup_defrag_inodes(fs_info);
1232 }
1233
1234 clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
1235}
1236
1203static int btrfs_remount(struct super_block *sb, int *flags, char *data) 1237static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1204{ 1238{
1205 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 1239 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -1213,6 +1247,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1213 unsigned int old_metadata_ratio = fs_info->metadata_ratio; 1247 unsigned int old_metadata_ratio = fs_info->metadata_ratio;
1214 int ret; 1248 int ret;
1215 1249
1250 btrfs_remount_prepare(fs_info, old_opts, *flags);
1251
1216 ret = btrfs_parse_options(root, data); 1252 ret = btrfs_parse_options(root, data);
1217 if (ret) { 1253 if (ret) {
1218 ret = -EINVAL; 1254 ret = -EINVAL;
@@ -1223,7 +1259,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1223 fs_info->thread_pool_size, old_thread_pool_size); 1259 fs_info->thread_pool_size, old_thread_pool_size);
1224 1260
1225 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 1261 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
1226 return 0; 1262 goto out;
1227 1263
1228 if (*flags & MS_RDONLY) { 1264 if (*flags & MS_RDONLY) {
1229 /* 1265 /*
@@ -1278,7 +1314,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1278 } 1314 }
1279 sb->s_flags &= ~MS_RDONLY; 1315 sb->s_flags &= ~MS_RDONLY;
1280 } 1316 }
1281 1317out:
1318 btrfs_remount_cleanup(fs_info, old_opts);
1282 return 0; 1319 return 0;
1283 1320
1284restore: 1321restore:
@@ -1289,10 +1326,13 @@ restore:
1289 fs_info->mount_opt = old_opts; 1326 fs_info->mount_opt = old_opts;
1290 fs_info->compress_type = old_compress_type; 1327 fs_info->compress_type = old_compress_type;
1291 fs_info->max_inline = old_max_inline; 1328 fs_info->max_inline = old_max_inline;
1329 mutex_lock(&fs_info->chunk_mutex);
1292 fs_info->alloc_start = old_alloc_start; 1330 fs_info->alloc_start = old_alloc_start;
1331 mutex_unlock(&fs_info->chunk_mutex);
1293 btrfs_resize_thread_pool(fs_info, 1332 btrfs_resize_thread_pool(fs_info,
1294 old_thread_pool_size, fs_info->thread_pool_size); 1333 old_thread_pool_size, fs_info->thread_pool_size);
1295 fs_info->metadata_ratio = old_metadata_ratio; 1334 fs_info->metadata_ratio = old_metadata_ratio;
1335 btrfs_remount_cleanup(fs_info, old_opts);
1296 return ret; 1336 return ret;
1297} 1337}
1298 1338
@@ -1559,7 +1599,7 @@ static int btrfs_freeze(struct super_block *sb)
1559 struct btrfs_trans_handle *trans; 1599 struct btrfs_trans_handle *trans;
1560 struct btrfs_root *root = btrfs_sb(sb)->tree_root; 1600 struct btrfs_root *root = btrfs_sb(sb)->tree_root;
1561 1601
1562 trans = btrfs_attach_transaction(root); 1602 trans = btrfs_attach_transaction_barrier(root);
1563 if (IS_ERR(trans)) { 1603 if (IS_ERR(trans)) {
1564 /* no transaction, don't bother */ 1604 /* no transaction, don't bother */
1565 if (PTR_ERR(trans) == -ENOENT) 1605 if (PTR_ERR(trans) == -ENOENT)
@@ -1684,10 +1724,14 @@ static int __init init_btrfs_fs(void)
1684 if (err) 1724 if (err)
1685 goto free_delayed_inode; 1725 goto free_delayed_inode;
1686 1726
1687 err = btrfs_interface_init(); 1727 err = btrfs_delayed_ref_init();
1688 if (err) 1728 if (err)
1689 goto free_auto_defrag; 1729 goto free_auto_defrag;
1690 1730
1731 err = btrfs_interface_init();
1732 if (err)
1733 goto free_delayed_ref;
1734
1691 err = register_filesystem(&btrfs_fs_type); 1735 err = register_filesystem(&btrfs_fs_type);
1692 if (err) 1736 if (err)
1693 goto unregister_ioctl; 1737 goto unregister_ioctl;
@@ -1699,6 +1743,8 @@ static int __init init_btrfs_fs(void)
1699 1743
1700unregister_ioctl: 1744unregister_ioctl:
1701 btrfs_interface_exit(); 1745 btrfs_interface_exit();
1746free_delayed_ref:
1747 btrfs_delayed_ref_exit();
1702free_auto_defrag: 1748free_auto_defrag:
1703 btrfs_auto_defrag_exit(); 1749 btrfs_auto_defrag_exit();
1704free_delayed_inode: 1750free_delayed_inode:
@@ -1720,6 +1766,7 @@ free_compress:
1720static void __exit exit_btrfs_fs(void) 1766static void __exit exit_btrfs_fs(void)
1721{ 1767{
1722 btrfs_destroy_cachep(); 1768 btrfs_destroy_cachep();
1769 btrfs_delayed_ref_exit();
1723 btrfs_auto_defrag_exit(); 1770 btrfs_auto_defrag_exit();
1724 btrfs_delayed_inode_exit(); 1771 btrfs_delayed_inode_exit();
1725 ordered_data_exit(); 1772 ordered_data_exit();
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index daac9ae6d731..5b326cd60a4a 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -21,7 +21,6 @@
21#include <linux/spinlock.h> 21#include <linux/spinlock.h>
22#include <linux/completion.h> 22#include <linux/completion.h>
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/module.h>
25#include <linux/kobject.h> 24#include <linux/kobject.h>
26 25
27#include "ctree.h" 26#include "ctree.h"
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4c0067c4f76d..e52da6fb1165 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -40,7 +40,6 @@ void put_transaction(struct btrfs_transaction *transaction)
40 if (atomic_dec_and_test(&transaction->use_count)) { 40 if (atomic_dec_and_test(&transaction->use_count)) {
41 BUG_ON(!list_empty(&transaction->list)); 41 BUG_ON(!list_empty(&transaction->list));
42 WARN_ON(transaction->delayed_refs.root.rb_node); 42 WARN_ON(transaction->delayed_refs.root.rb_node);
43 memset(transaction, 0, sizeof(*transaction));
44 kmem_cache_free(btrfs_transaction_cachep, transaction); 43 kmem_cache_free(btrfs_transaction_cachep, transaction);
45 } 44 }
46} 45}
@@ -51,6 +50,14 @@ static noinline void switch_commit_root(struct btrfs_root *root)
51 root->commit_root = btrfs_root_node(root); 50 root->commit_root = btrfs_root_node(root);
52} 51}
53 52
53static inline int can_join_transaction(struct btrfs_transaction *trans,
54 int type)
55{
56 return !(trans->in_commit &&
57 type != TRANS_JOIN &&
58 type != TRANS_JOIN_NOLOCK);
59}
60
54/* 61/*
55 * either allocate a new transaction or hop into the existing one 62 * either allocate a new transaction or hop into the existing one
56 */ 63 */
@@ -62,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type)
62 spin_lock(&fs_info->trans_lock); 69 spin_lock(&fs_info->trans_lock);
63loop: 70loop:
64 /* The file system has been taken offline. No new transactions. */ 71 /* The file system has been taken offline. No new transactions. */
65 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 72 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
66 spin_unlock(&fs_info->trans_lock); 73 spin_unlock(&fs_info->trans_lock);
67 return -EROFS; 74 return -EROFS;
68 } 75 }
@@ -86,6 +93,10 @@ loop:
86 spin_unlock(&fs_info->trans_lock); 93 spin_unlock(&fs_info->trans_lock);
87 return cur_trans->aborted; 94 return cur_trans->aborted;
88 } 95 }
96 if (!can_join_transaction(cur_trans, type)) {
97 spin_unlock(&fs_info->trans_lock);
98 return -EBUSY;
99 }
89 atomic_inc(&cur_trans->use_count); 100 atomic_inc(&cur_trans->use_count);
90 atomic_inc(&cur_trans->num_writers); 101 atomic_inc(&cur_trans->num_writers);
91 cur_trans->num_joined++; 102 cur_trans->num_joined++;
@@ -113,7 +124,7 @@ loop:
113 */ 124 */
114 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 125 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
115 goto loop; 126 goto loop;
116 } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 127 } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
117 spin_unlock(&fs_info->trans_lock); 128 spin_unlock(&fs_info->trans_lock);
118 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 129 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
119 return -EROFS; 130 return -EROFS;
@@ -155,8 +166,12 @@ loop:
155 166
156 spin_lock_init(&cur_trans->commit_lock); 167 spin_lock_init(&cur_trans->commit_lock);
157 spin_lock_init(&cur_trans->delayed_refs.lock); 168 spin_lock_init(&cur_trans->delayed_refs.lock);
169 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
170 atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
171 init_waitqueue_head(&cur_trans->delayed_refs.wait);
158 172
159 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 173 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
174 INIT_LIST_HEAD(&cur_trans->ordered_operations);
160 list_add_tail(&cur_trans->list, &fs_info->trans_list); 175 list_add_tail(&cur_trans->list, &fs_info->trans_list);
161 extent_io_tree_init(&cur_trans->dirty_pages, 176 extent_io_tree_init(&cur_trans->dirty_pages,
162 fs_info->btree_inode->i_mapping); 177 fs_info->btree_inode->i_mapping);
@@ -301,7 +316,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
301 int ret; 316 int ret;
302 u64 qgroup_reserved = 0; 317 u64 qgroup_reserved = 0;
303 318
304 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 319 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
305 return ERR_PTR(-EROFS); 320 return ERR_PTR(-EROFS);
306 321
307 if (current->journal_info) { 322 if (current->journal_info) {
@@ -359,8 +374,11 @@ again:
359 374
360 do { 375 do {
361 ret = join_transaction(root, type); 376 ret = join_transaction(root, type);
362 if (ret == -EBUSY) 377 if (ret == -EBUSY) {
363 wait_current_trans(root); 378 wait_current_trans(root);
379 if (unlikely(type == TRANS_ATTACH))
380 ret = -ENOENT;
381 }
364 } while (ret == -EBUSY); 382 } while (ret == -EBUSY);
365 383
366 if (ret < 0) { 384 if (ret < 0) {
@@ -382,9 +400,10 @@ again:
382 h->block_rsv = NULL; 400 h->block_rsv = NULL;
383 h->orig_rsv = NULL; 401 h->orig_rsv = NULL;
384 h->aborted = 0; 402 h->aborted = 0;
385 h->qgroup_reserved = qgroup_reserved; 403 h->qgroup_reserved = 0;
386 h->delayed_ref_elem.seq = 0; 404 h->delayed_ref_elem.seq = 0;
387 h->type = type; 405 h->type = type;
406 h->allocating_chunk = false;
388 INIT_LIST_HEAD(&h->qgroup_ref_list); 407 INIT_LIST_HEAD(&h->qgroup_ref_list);
389 INIT_LIST_HEAD(&h->new_bgs); 408 INIT_LIST_HEAD(&h->new_bgs);
390 409
@@ -400,6 +419,7 @@ again:
400 h->block_rsv = &root->fs_info->trans_block_rsv; 419 h->block_rsv = &root->fs_info->trans_block_rsv;
401 h->bytes_reserved = num_bytes; 420 h->bytes_reserved = num_bytes;
402 } 421 }
422 h->qgroup_reserved = qgroup_reserved;
403 423
404got_it: 424got_it:
405 btrfs_record_root_in_trans(h, root); 425 btrfs_record_root_in_trans(h, root);
@@ -451,11 +471,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
451 return start_transaction(root, 0, TRANS_USERSPACE, 0); 471 return start_transaction(root, 0, TRANS_USERSPACE, 0);
452} 472}
453 473
474/*
475 * btrfs_attach_transaction() - catch the running transaction
476 *
477 * It is used when we want to commit the current transaction, but
478 * don't want to start a new one.
479 *
480 * Note: If this function returns -ENOENT, it just means there is no
481 * running transaction. But it is possible that the inactive transaction
482 * is still in memory, not fully on disk. If you hope there is no
483 * inactive transaction in the fs when -ENOENT is returned, you should
484 * invoke
485 * btrfs_attach_transaction_barrier()
486 */
454struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) 487struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
455{ 488{
456 return start_transaction(root, 0, TRANS_ATTACH, 0); 489 return start_transaction(root, 0, TRANS_ATTACH, 0);
457} 490}
458 491
492/*
493 * btrfs_attach_transaction_barrier() - catch the running transaction
494 *
495 * It is similar to the above function, the difference is that this one
496 * will wait for all the inactive transactions until they fully
497 * complete.
498 */
499struct btrfs_trans_handle *
500btrfs_attach_transaction_barrier(struct btrfs_root *root)
501{
502 struct btrfs_trans_handle *trans;
503
504 trans = start_transaction(root, 0, TRANS_ATTACH, 0);
505 if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
506 btrfs_wait_for_commit(root, 0);
507
508 return trans;
509}
510
459/* wait for a transaction commit to be fully complete */ 511/* wait for a transaction commit to be fully complete */
460static noinline void wait_for_commit(struct btrfs_root *root, 512static noinline void wait_for_commit(struct btrfs_root *root,
461 struct btrfs_transaction *commit) 513 struct btrfs_transaction *commit)
@@ -587,7 +639,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
587 if (!list_empty(&trans->new_bgs)) 639 if (!list_empty(&trans->new_bgs))
588 btrfs_create_pending_block_groups(trans, root); 640 btrfs_create_pending_block_groups(trans, root);
589 641
590 while (count < 2) { 642 while (count < 1) {
591 unsigned long cur = trans->delayed_ref_updates; 643 unsigned long cur = trans->delayed_ref_updates;
592 trans->delayed_ref_updates = 0; 644 trans->delayed_ref_updates = 0;
593 if (cur && 645 if (cur &&
@@ -599,6 +651,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
599 } 651 }
600 count++; 652 count++;
601 } 653 }
654
602 btrfs_trans_release_metadata(trans, root); 655 btrfs_trans_release_metadata(trans, root);
603 trans->block_rsv = NULL; 656 trans->block_rsv = NULL;
604 657
@@ -644,12 +697,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
644 btrfs_run_delayed_iputs(root); 697 btrfs_run_delayed_iputs(root);
645 698
646 if (trans->aborted || 699 if (trans->aborted ||
647 root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 700 test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
648 err = -EIO; 701 err = -EIO;
649 }
650 assert_qgroups_uptodate(trans); 702 assert_qgroups_uptodate(trans);
651 703
652 memset(trans, 0, sizeof(*trans));
653 kmem_cache_free(btrfs_trans_handle_cachep, trans); 704 kmem_cache_free(btrfs_trans_handle_cachep, trans);
654 return err; 705 return err;
655} 706}
@@ -696,7 +747,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
696 struct extent_state *cached_state = NULL; 747 struct extent_state *cached_state = NULL;
697 u64 start = 0; 748 u64 start = 0;
698 u64 end; 749 u64 end;
750 struct blk_plug plug;
699 751
752 blk_start_plug(&plug);
700 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 753 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
701 mark, &cached_state)) { 754 mark, &cached_state)) {
702 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 755 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -710,6 +763,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
710 } 763 }
711 if (err) 764 if (err)
712 werr = err; 765 werr = err;
766 blk_finish_plug(&plug);
713 return werr; 767 return werr;
714} 768}
715 769
@@ -960,10 +1014,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
960} 1014}
961 1015
962/* 1016/*
963 * defrag a given btree. If cacheonly == 1, this won't read from the disk, 1017 * defrag a given btree.
964 * otherwise every leaf in the btree is read and defragged. 1018 * Every leaf in the btree is read and defragged.
965 */ 1019 */
966int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 1020int btrfs_defrag_root(struct btrfs_root *root)
967{ 1021{
968 struct btrfs_fs_info *info = root->fs_info; 1022 struct btrfs_fs_info *info = root->fs_info;
969 struct btrfs_trans_handle *trans; 1023 struct btrfs_trans_handle *trans;
@@ -977,7 +1031,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
977 if (IS_ERR(trans)) 1031 if (IS_ERR(trans))
978 return PTR_ERR(trans); 1032 return PTR_ERR(trans);
979 1033
980 ret = btrfs_defrag_leaves(trans, root, cacheonly); 1034 ret = btrfs_defrag_leaves(trans, root);
981 1035
982 btrfs_end_transaction(trans, root); 1036 btrfs_end_transaction(trans, root);
983 btrfs_btree_balance_dirty(info->tree_root); 1037 btrfs_btree_balance_dirty(info->tree_root);
@@ -985,6 +1039,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
985 1039
986 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) 1040 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
987 break; 1041 break;
1042
1043 if (btrfs_defrag_cancelled(root->fs_info)) {
1044 printk(KERN_DEBUG "btrfs: defrag_root cancelled\n");
1045 ret = -EAGAIN;
1046 break;
1047 }
988 } 1048 }
989 root->defrag_running = 0; 1049 root->defrag_running = 0;
990 return ret; 1050 return ret;
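
btrfs_defrag_root() now polls btrfs_defrag_cancelled() between transactions, so a long-running defrag can stop cooperatively instead of holding on until every leaf has been visited. A generic sketch of that check-between-batches loop; the flag, the batch function and the return value are stand-ins, not btrfs API.

#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

static atomic_bool cancel_requested;

/* pretend to defrag one batch of leaves; returns true while work remains */
static bool defrag_one_batch(int batch)
{
	printf("defragged batch %d\n", batch);
	if (batch == 3)			/* simulate a concurrent cancel request */
		atomic_store(&cancel_requested, true);
	return batch < 10;
}

static int defrag_loop(void)
{
	int batch = 0;

	while (defrag_one_batch(batch++)) {
		/* polled between batches, like btrfs_defrag_cancelled() */
		if (atomic_load(&cancel_requested)) {
			printf("defrag cancelled\n");
			return -1;	/* stands in for -EAGAIN */
		}
	}
	return 0;
}

int main(void)
{
	return defrag_loop() ? 1 : 0;
}
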
@@ -1007,7 +1067,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1007 struct inode *parent_inode; 1067 struct inode *parent_inode;
1008 struct btrfs_path *path; 1068 struct btrfs_path *path;
1009 struct btrfs_dir_item *dir_item; 1069 struct btrfs_dir_item *dir_item;
1010 struct dentry *parent;
1011 struct dentry *dentry; 1070 struct dentry *dentry;
1012 struct extent_buffer *tmp; 1071 struct extent_buffer *tmp;
1013 struct extent_buffer *old; 1072 struct extent_buffer *old;
@@ -1022,7 +1081,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1022 path = btrfs_alloc_path(); 1081 path = btrfs_alloc_path();
1023 if (!path) { 1082 if (!path) {
1024 ret = pending->error = -ENOMEM; 1083 ret = pending->error = -ENOMEM;
1025 goto path_alloc_fail; 1084 return ret;
1026 } 1085 }
1027 1086
1028 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 1087 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
@@ -1062,10 +1121,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1062 1121
1063 rsv = trans->block_rsv; 1122 rsv = trans->block_rsv;
1064 trans->block_rsv = &pending->block_rsv; 1123 trans->block_rsv = &pending->block_rsv;
1124 trans->bytes_reserved = trans->block_rsv->reserved;
1065 1125
1066 dentry = pending->dentry; 1126 dentry = pending->dentry;
1067 parent = dget_parent(dentry); 1127 parent_inode = pending->dir;
1068 parent_inode = parent->d_inode;
1069 parent_root = BTRFS_I(parent_inode)->root; 1128 parent_root = BTRFS_I(parent_inode)->root;
1070 record_root_in_trans(trans, parent_root); 1129 record_root_in_trans(trans, parent_root);
1071 1130
@@ -1213,14 +1272,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1213 if (ret) 1272 if (ret)
1214 btrfs_abort_transaction(trans, root, ret); 1273 btrfs_abort_transaction(trans, root, ret);
1215fail: 1274fail:
1216 dput(parent);
1217 trans->block_rsv = rsv; 1275 trans->block_rsv = rsv;
1276 trans->bytes_reserved = 0;
1218no_free_objectid: 1277no_free_objectid:
1219 kfree(new_root_item); 1278 kfree(new_root_item);
1220root_item_alloc_fail: 1279root_item_alloc_fail:
1221 btrfs_free_path(path); 1280 btrfs_free_path(path);
1222path_alloc_fail:
1223 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
1224 return ret; 1281 return ret;
1225} 1282}
1226 1283
@@ -1306,13 +1363,13 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1306struct btrfs_async_commit { 1363struct btrfs_async_commit {
1307 struct btrfs_trans_handle *newtrans; 1364 struct btrfs_trans_handle *newtrans;
1308 struct btrfs_root *root; 1365 struct btrfs_root *root;
1309 struct delayed_work work; 1366 struct work_struct work;
1310}; 1367};
1311 1368
1312static void do_async_commit(struct work_struct *work) 1369static void do_async_commit(struct work_struct *work)
1313{ 1370{
1314 struct btrfs_async_commit *ac = 1371 struct btrfs_async_commit *ac =
1315 container_of(work, struct btrfs_async_commit, work.work); 1372 container_of(work, struct btrfs_async_commit, work);
1316 1373
1317 /* 1374 /*
1318 * We've got freeze protection passed with the transaction. 1375 * We've got freeze protection passed with the transaction.
@@ -1340,7 +1397,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1340 if (!ac) 1397 if (!ac)
1341 return -ENOMEM; 1398 return -ENOMEM;
1342 1399
1343 INIT_DELAYED_WORK(&ac->work, do_async_commit); 1400 INIT_WORK(&ac->work, do_async_commit);
1344 ac->root = root; 1401 ac->root = root;
1345 ac->newtrans = btrfs_join_transaction(root); 1402 ac->newtrans = btrfs_join_transaction(root);
1346 if (IS_ERR(ac->newtrans)) { 1403 if (IS_ERR(ac->newtrans)) {
@@ -1364,7 +1421,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1364 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1421 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1365 1, _THIS_IP_); 1422 1, _THIS_IP_);
1366 1423
1367 schedule_delayed_work(&ac->work, 0); 1424 schedule_work(&ac->work);
1368 1425
1369 /* wait for transaction to start and unblock */ 1426 /* wait for transaction to start and unblock */
1370 if (wait_for_unblock) 1427 if (wait_for_unblock)
@@ -1384,6 +1441,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1384 struct btrfs_root *root, int err) 1441 struct btrfs_root *root, int err)
1385{ 1442{
1386 struct btrfs_transaction *cur_trans = trans->transaction; 1443 struct btrfs_transaction *cur_trans = trans->transaction;
1444 DEFINE_WAIT(wait);
1387 1445
1388 WARN_ON(trans->use_count > 1); 1446 WARN_ON(trans->use_count > 1);
1389 1447
@@ -1392,8 +1450,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1392 spin_lock(&root->fs_info->trans_lock); 1450 spin_lock(&root->fs_info->trans_lock);
1393 list_del_init(&cur_trans->list); 1451 list_del_init(&cur_trans->list);
1394 if (cur_trans == root->fs_info->running_transaction) { 1452 if (cur_trans == root->fs_info->running_transaction) {
1453 root->fs_info->trans_no_join = 1;
1454 spin_unlock(&root->fs_info->trans_lock);
1455 wait_event(cur_trans->writer_wait,
1456 atomic_read(&cur_trans->num_writers) == 1);
1457
1458 spin_lock(&root->fs_info->trans_lock);
1395 root->fs_info->running_transaction = NULL; 1459 root->fs_info->running_transaction = NULL;
1396 root->fs_info->trans_no_join = 0;
1397 } 1460 }
1398 spin_unlock(&root->fs_info->trans_lock); 1461 spin_unlock(&root->fs_info->trans_lock);
1399 1462
@@ -1427,7 +1490,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1427 } 1490 }
1428 1491
1429 if (flush_on_commit || snap_pending) { 1492 if (flush_on_commit || snap_pending) {
1430 btrfs_start_delalloc_inodes(root, 1); 1493 ret = btrfs_start_delalloc_inodes(root, 1);
1494 if (ret)
1495 return ret;
1431 btrfs_wait_ordered_extents(root, 1); 1496 btrfs_wait_ordered_extents(root, 1);
1432 } 1497 }
1433 1498
@@ -1449,9 +1514,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1449	 * it here and know for sure that nothing new will be added	1514	 * it here and know for sure that nothing new will be added
1450 * to the list 1515 * to the list
1451 */ 1516 */
1452 btrfs_run_ordered_operations(root, 1); 1517 ret = btrfs_run_ordered_operations(trans, root, 1);
1453 1518
1454 return 0; 1519 return ret;
1455} 1520}
1456 1521
1457/* 1522/*
@@ -1472,27 +1537,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1472 int should_grow = 0; 1537 int should_grow = 0;
1473 unsigned long now = get_seconds(); 1538 unsigned long now = get_seconds();
1474 1539
1475 ret = btrfs_run_ordered_operations(root, 0); 1540 ret = btrfs_run_ordered_operations(trans, root, 0);
1476 if (ret) { 1541 if (ret) {
1477 btrfs_abort_transaction(trans, root, ret); 1542 btrfs_abort_transaction(trans, root, ret);
1478 goto cleanup_transaction; 1543 btrfs_end_transaction(trans, root);
1544 return ret;
1479 } 1545 }
1480 1546
1481 /* Stop the commit early if ->aborted is set */ 1547 /* Stop the commit early if ->aborted is set */
1482 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { 1548 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1483 ret = cur_trans->aborted; 1549 ret = cur_trans->aborted;
1484 goto cleanup_transaction; 1550 btrfs_end_transaction(trans, root);
1551 return ret;
1485 } 1552 }
1486 1553
1487 /* make a pass through all the delayed refs we have so far 1554 /* make a pass through all the delayed refs we have so far
1488 * any runnings procs may add more while we are here 1555 * any runnings procs may add more while we are here
1489 */ 1556 */
1490 ret = btrfs_run_delayed_refs(trans, root, 0); 1557 ret = btrfs_run_delayed_refs(trans, root, 0);
1491 if (ret) 1558 if (ret) {
1492 goto cleanup_transaction; 1559 btrfs_end_transaction(trans, root);
1560 return ret;
1561 }
1493 1562
1494 btrfs_trans_release_metadata(trans, root); 1563 btrfs_trans_release_metadata(trans, root);
1495 trans->block_rsv = NULL; 1564 trans->block_rsv = NULL;
1565 if (trans->qgroup_reserved) {
1566 btrfs_qgroup_free(root, trans->qgroup_reserved);
1567 trans->qgroup_reserved = 0;
1568 }
1496 1569
1497 cur_trans = trans->transaction; 1570 cur_trans = trans->transaction;
1498 1571
@@ -1506,8 +1579,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1506 btrfs_create_pending_block_groups(trans, root); 1579 btrfs_create_pending_block_groups(trans, root);
1507 1580
1508 ret = btrfs_run_delayed_refs(trans, root, 0); 1581 ret = btrfs_run_delayed_refs(trans, root, 0);
1509 if (ret) 1582 if (ret) {
1510 goto cleanup_transaction; 1583 btrfs_end_transaction(trans, root);
1584 return ret;
1585 }
1511 1586
1512 spin_lock(&cur_trans->commit_lock); 1587 spin_lock(&cur_trans->commit_lock);
1513 if (cur_trans->in_commit) { 1588 if (cur_trans->in_commit) {
@@ -1771,6 +1846,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1771cleanup_transaction: 1846cleanup_transaction:
1772 btrfs_trans_release_metadata(trans, root); 1847 btrfs_trans_release_metadata(trans, root);
1773 trans->block_rsv = NULL; 1848 trans->block_rsv = NULL;
1849 if (trans->qgroup_reserved) {
1850 btrfs_qgroup_free(root, trans->qgroup_reserved);
1851 trans->qgroup_reserved = 0;
1852 }
1774 btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); 1853 btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
1775// WARN_ON(1); 1854// WARN_ON(1);
1776 if (current->journal_info == trans) 1855 if (current->journal_info == trans)
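[Editor's illustration, not part of the patch: the two qgroup hunks above add the same "free it, then zero the counter" block to both the normal commit path and cleanup_transaction. The standalone toy below only models why zeroing trans->qgroup_reserved after the first free matters; every name in it is a stand-in, the real calls are btrfs_qgroup_reserve()/btrfs_qgroup_free().]

#include <stdio.h>
#include <stdint.h>

static uint64_t qgroup_outstanding;     /* pretend per-qgroup reservation counter */

static void qgroup_free(uint64_t bytes)
{
    qgroup_outstanding -= bytes;
}

struct handle {
    uint64_t qgroup_reserved;
};

/* Both the commit path and the cleanup path run this; zeroing the counter
 * after the first free turns any later call into a no-op instead of a
 * double free of the reservation. */
static void release_qgroup_once(struct handle *trans)
{
    if (trans->qgroup_reserved) {
        qgroup_free(trans->qgroup_reserved);
        trans->qgroup_reserved = 0;
    }
}

int main(void)
{
    struct handle trans = { .qgroup_reserved = 4096 };

    qgroup_outstanding = 4096;
    release_qgroup_once(&trans);        /* frees 4096 */
    release_qgroup_once(&trans);        /* no-op, already released */
    printf("outstanding=%llu\n", (unsigned long long)qgroup_outstanding);
    return 0;
}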
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0e8aa1e6c287..3c8e0d25c8e4 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -43,6 +43,7 @@ struct btrfs_transaction {
43 wait_queue_head_t writer_wait; 43 wait_queue_head_t writer_wait;
44 wait_queue_head_t commit_wait; 44 wait_queue_head_t commit_wait;
45 struct list_head pending_snapshots; 45 struct list_head pending_snapshots;
46 struct list_head ordered_operations;
46 struct btrfs_delayed_ref_root delayed_refs; 47 struct btrfs_delayed_ref_root delayed_refs;
47 int aborted; 48 int aborted;
48}; 49};
@@ -68,6 +69,7 @@ struct btrfs_trans_handle {
68 struct btrfs_block_rsv *orig_rsv; 69 struct btrfs_block_rsv *orig_rsv;
69 short aborted; 70 short aborted;
70 short adding_csums; 71 short adding_csums;
72 bool allocating_chunk;
71 enum btrfs_trans_type type; 73 enum btrfs_trans_type type;
72 /* 74 /*
73 * this root is only needed to validate that the root passed to 75 * this root is only needed to validate that the root passed to
@@ -82,11 +84,13 @@ struct btrfs_trans_handle {
82 84
83struct btrfs_pending_snapshot { 85struct btrfs_pending_snapshot {
84 struct dentry *dentry; 86 struct dentry *dentry;
87 struct inode *dir;
85 struct btrfs_root *root; 88 struct btrfs_root *root;
86 struct btrfs_root *snap; 89 struct btrfs_root *snap;
87 struct btrfs_qgroup_inherit *inherit; 90 struct btrfs_qgroup_inherit *inherit;
88 /* block reservation for the operation */ 91 /* block reservation for the operation */
89 struct btrfs_block_rsv block_rsv; 92 struct btrfs_block_rsv block_rsv;
93 u64 qgroup_reserved;
90 /* extra metadata reservation for relocation */ 94 /* extra metadata reservation for relocation */
91 int error; 95 int error;
92 bool readonly; 96 bool readonly;
@@ -110,13 +114,15 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush(
110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 114struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 115struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
112struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); 116struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
117struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
118 struct btrfs_root *root);
113struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); 119struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
114int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); 120int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
115int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 121int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
116 struct btrfs_root *root); 122 struct btrfs_root *root);
117 123
118int btrfs_add_dead_root(struct btrfs_root *root); 124int btrfs_add_dead_root(struct btrfs_root *root);
119int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); 125int btrfs_defrag_root(struct btrfs_root *root);
120int btrfs_clean_old_snapshots(struct btrfs_root *root); 126int btrfs_clean_old_snapshots(struct btrfs_root *root);
121int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 127int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
122 struct btrfs_root *root); 128 struct btrfs_root *root);
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3b580ee8ab1d..94e05c1f118a 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -23,13 +23,14 @@
23#include "transaction.h" 23#include "transaction.h"
24#include "locking.h" 24#include "locking.h"
25 25
26/* defrag all the leaves in a given btree. If cache_only == 1, don't read 26/*
27 * things from disk, otherwise read all the leaves and try to get key order to 27 * Defrag all the leaves in a given btree.
28 * Read all the leaves and try to get key order to
28 * better reflect disk order 29 * better reflect disk order
29 */ 30 */
30 31
31int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, 32int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, int cache_only) 33 struct btrfs_root *root)
33{ 34{
34 struct btrfs_path *path = NULL; 35 struct btrfs_path *path = NULL;
35 struct btrfs_key key; 36 struct btrfs_key key;
@@ -41,9 +42,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
41 u64 last_ret = 0; 42 u64 last_ret = 0;
42 u64 min_trans = 0; 43 u64 min_trans = 0;
43 44
44 if (cache_only)
45 goto out;
46
47 if (root->fs_info->extent_root == root) { 45 if (root->fs_info->extent_root == root) {
48 /* 46 /*
49 * there's recursion here right now in the tree locking, 47 * there's recursion here right now in the tree locking,
@@ -86,11 +84,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
86 } 84 }
87 85
88 path->keep_locks = 1; 86 path->keep_locks = 1;
89 if (cache_only)
90 min_trans = root->defrag_trans_start;
91 87
92 ret = btrfs_search_forward(root, &key, NULL, path, 88 ret = btrfs_search_forward(root, &key, NULL, path, min_trans);
93 cache_only, min_trans);
94 if (ret < 0) 89 if (ret < 0)
95 goto out; 90 goto out;
96 if (ret > 0) { 91 if (ret > 0) {
@@ -109,11 +104,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
109 goto out; 104 goto out;
110 } 105 }
111 path->slots[1] = btrfs_header_nritems(path->nodes[1]); 106 path->slots[1] = btrfs_header_nritems(path->nodes[1]);
112 next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, 107 next_key_ret = btrfs_find_next_key(root, path, &key, 1,
113 min_trans); 108 min_trans);
114 ret = btrfs_realloc_node(trans, root, 109 ret = btrfs_realloc_node(trans, root,
115 path->nodes[1], 0, 110 path->nodes[1], 0,
116 cache_only, &last_ret, 111 &last_ret,
117 &root->defrag_progress); 112 &root->defrag_progress);
118 if (ret) { 113 if (ret) {
119 WARN_ON(ret == -EAGAIN); 114 WARN_ON(ret == -EAGAIN);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9027bb1e7466..c7ef569eb22a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -278,8 +278,7 @@ static int process_one_buffer(struct btrfs_root *log,
278 struct walk_control *wc, u64 gen) 278 struct walk_control *wc, u64 gen)
279{ 279{
280 if (wc->pin) 280 if (wc->pin)
281 btrfs_pin_extent_for_log_replay(wc->trans, 281 btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
282 log->fs_info->extent_root,
283 eb->start, eb->len); 282 eb->start, eb->len);
284 283
285 if (btrfs_buffer_uptodate(eb, gen, 0)) { 284 if (btrfs_buffer_uptodate(eb, gen, 0)) {
@@ -485,7 +484,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
485 struct btrfs_key *key) 484 struct btrfs_key *key)
486{ 485{
487 int found_type; 486 int found_type;
488 u64 mask = root->sectorsize - 1;
489 u64 extent_end; 487 u64 extent_end;
490 u64 start = key->offset; 488 u64 start = key->offset;
491 u64 saved_nbytes; 489 u64 saved_nbytes;
@@ -502,7 +500,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
502 extent_end = start + btrfs_file_extent_num_bytes(eb, item); 500 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
503 else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 501 else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
504 size = btrfs_file_extent_inline_len(eb, item); 502 size = btrfs_file_extent_inline_len(eb, item);
505 extent_end = (start + size + mask) & ~mask; 503 extent_end = ALIGN(start + size, root->sectorsize);
506 } else { 504 } else {
507 ret = 0; 505 ret = 0;
508 goto out; 506 goto out;
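[Editor's worked check, not part of the patch: the new ALIGN() call is the same power-of-two round-up the old mask arithmetic spelled out by hand. A minimal standalone sketch, assuming a 4096-byte sectorsize; the ALIGN macro below is a local re-definition that matches the kernel's behaviour for power-of-two alignments.]

#include <stdio.h>
#include <stdint.h>

/* round x up to the next multiple of a (a must be a power of two) */
#define ALIGN(x, a)     (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
    uint64_t sectorsize = 4096;                 /* assumed root->sectorsize */
    uint64_t mask = sectorsize - 1;
    uint64_t start = 8192, size = 100;          /* made-up inline extent */

    uint64_t old_way = (start + size + mask) & ~mask;
    uint64_t new_way = ALIGN(start + size, sectorsize);

    /* both print 12288: 8292 rounded up to the next 4K boundary */
    printf("%llu %llu\n", (unsigned long long)old_way,
           (unsigned long long)new_way);
    return 0;
}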
@@ -2281,6 +2279,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2281 unsigned long log_transid = 0; 2279 unsigned long log_transid = 0;
2282 2280
2283 mutex_lock(&root->log_mutex); 2281 mutex_lock(&root->log_mutex);
2282 log_transid = root->log_transid;
2284 index1 = root->log_transid % 2; 2283 index1 = root->log_transid % 2;
2285 if (atomic_read(&root->log_commit[index1])) { 2284 if (atomic_read(&root->log_commit[index1])) {
2286 wait_log_commit(trans, root, root->log_transid); 2285 wait_log_commit(trans, root, root->log_transid);
@@ -2308,11 +2307,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2308 /* bail out if we need to do a full commit */ 2307 /* bail out if we need to do a full commit */
2309 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2308 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2310 ret = -EAGAIN; 2309 ret = -EAGAIN;
2310 btrfs_free_logged_extents(log, log_transid);
2311 mutex_unlock(&root->log_mutex); 2311 mutex_unlock(&root->log_mutex);
2312 goto out; 2312 goto out;
2313 } 2313 }
2314 2314
2315 log_transid = root->log_transid;
2316 if (log_transid % 2 == 0) 2315 if (log_transid % 2 == 0)
2317 mark = EXTENT_DIRTY; 2316 mark = EXTENT_DIRTY;
2318 else 2317 else
@@ -2324,6 +2323,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2324 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2323 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2325 if (ret) { 2324 if (ret) {
2326 btrfs_abort_transaction(trans, root, ret); 2325 btrfs_abort_transaction(trans, root, ret);
2326 btrfs_free_logged_extents(log, log_transid);
2327 mutex_unlock(&root->log_mutex); 2327 mutex_unlock(&root->log_mutex);
2328 goto out; 2328 goto out;
2329 } 2329 }
@@ -2363,6 +2363,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2363 } 2363 }
2364 root->fs_info->last_trans_log_full_commit = trans->transid; 2364 root->fs_info->last_trans_log_full_commit = trans->transid;
2365 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2365 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2366 btrfs_free_logged_extents(log, log_transid);
2366 mutex_unlock(&log_root_tree->log_mutex); 2367 mutex_unlock(&log_root_tree->log_mutex);
2367 ret = -EAGAIN; 2368 ret = -EAGAIN;
2368 goto out; 2369 goto out;
@@ -2373,6 +2374,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2373 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2374 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2374 wait_log_commit(trans, log_root_tree, 2375 wait_log_commit(trans, log_root_tree,
2375 log_root_tree->log_transid); 2376 log_root_tree->log_transid);
2377 btrfs_free_logged_extents(log, log_transid);
2376 mutex_unlock(&log_root_tree->log_mutex); 2378 mutex_unlock(&log_root_tree->log_mutex);
2377 ret = 0; 2379 ret = 0;
2378 goto out; 2380 goto out;
@@ -2392,6 +2394,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2392 */ 2394 */
2393 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2395 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2394 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2396 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2397 btrfs_free_logged_extents(log, log_transid);
2395 mutex_unlock(&log_root_tree->log_mutex); 2398 mutex_unlock(&log_root_tree->log_mutex);
2396 ret = -EAGAIN; 2399 ret = -EAGAIN;
2397 goto out_wake_log_root; 2400 goto out_wake_log_root;
@@ -2402,10 +2405,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2402 EXTENT_DIRTY | EXTENT_NEW); 2405 EXTENT_DIRTY | EXTENT_NEW);
2403 if (ret) { 2406 if (ret) {
2404 btrfs_abort_transaction(trans, root, ret); 2407 btrfs_abort_transaction(trans, root, ret);
2408 btrfs_free_logged_extents(log, log_transid);
2405 mutex_unlock(&log_root_tree->log_mutex); 2409 mutex_unlock(&log_root_tree->log_mutex);
2406 goto out_wake_log_root; 2410 goto out_wake_log_root;
2407 } 2411 }
2408 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2412 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2413 btrfs_wait_logged_extents(log, log_transid);
2409 2414
2410 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2415 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2411 log_root_tree->node->start); 2416 log_root_tree->node->start);
@@ -2461,8 +2466,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
2461 .process_func = process_one_buffer 2466 .process_func = process_one_buffer
2462 }; 2467 };
2463 2468
2464 ret = walk_log_tree(trans, log, &wc); 2469 if (trans) {
2465 BUG_ON(ret); 2470 ret = walk_log_tree(trans, log, &wc);
2471 BUG_ON(ret);
2472 }
2466 2473
2467 while (1) { 2474 while (1) {
2468 ret = find_first_extent_bit(&log->dirty_log_pages, 2475 ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -2475,6 +2482,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
2475 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2482 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2476 } 2483 }
2477 2484
2485 /*
2486 * We may have short-circuited the log tree with the full commit logic
2487 * and left ordered extents on our list, so clear these out to keep us
2488 * from leaking inodes and memory.
2489 */
2490 btrfs_free_logged_extents(log, 0);
2491 btrfs_free_logged_extents(log, 1);
2492
2478 free_extent_buffer(log->node); 2493 free_extent_buffer(log->node);
2479 kfree(log); 2494 kfree(log);
2480} 2495}
@@ -2724,7 +2739,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2724 path->keep_locks = 1; 2739 path->keep_locks = 1;
2725 2740
2726 ret = btrfs_search_forward(root, &min_key, &max_key, 2741 ret = btrfs_search_forward(root, &min_key, &max_key,
2727 path, 0, trans->transid); 2742 path, trans->transid);
2728 2743
2729 /* 2744 /*
2730 * we didn't find anything from this transaction, see if there 2745 * we didn't find anything from this transaction, see if there
@@ -3271,16 +3286,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3271 struct btrfs_root *log = root->log_root; 3286 struct btrfs_root *log = root->log_root;
3272 struct btrfs_file_extent_item *fi; 3287 struct btrfs_file_extent_item *fi;
3273 struct extent_buffer *leaf; 3288 struct extent_buffer *leaf;
3289 struct btrfs_ordered_extent *ordered;
3274 struct list_head ordered_sums; 3290 struct list_head ordered_sums;
3275 struct btrfs_map_token token; 3291 struct btrfs_map_token token;
3276 struct btrfs_key key; 3292 struct btrfs_key key;
3277 u64 csum_offset = em->mod_start - em->start; 3293 u64 mod_start = em->mod_start;
3278 u64 csum_len = em->mod_len; 3294 u64 mod_len = em->mod_len;
3295 u64 csum_offset;
3296 u64 csum_len;
3279 u64 extent_offset = em->start - em->orig_start; 3297 u64 extent_offset = em->start - em->orig_start;
3280 u64 block_len; 3298 u64 block_len;
3281 int ret; 3299 int ret;
3300 int index = log->log_transid % 2;
3282 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3301 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3283 3302
3303insert:
3284 INIT_LIST_HEAD(&ordered_sums); 3304 INIT_LIST_HEAD(&ordered_sums);
3285 btrfs_init_map_token(&token); 3305 btrfs_init_map_token(&token);
3286 key.objectid = btrfs_ino(inode); 3306 key.objectid = btrfs_ino(inode);
@@ -3296,6 +3316,23 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3296 leaf = path->nodes[0]; 3316 leaf = path->nodes[0];
3297 fi = btrfs_item_ptr(leaf, path->slots[0], 3317 fi = btrfs_item_ptr(leaf, path->slots[0],
3298 struct btrfs_file_extent_item); 3318 struct btrfs_file_extent_item);
3319
3320 /*
3321 * If we are overwriting an inline extent with a real one then we need
3322 * to just delete the inline extent as it may not be large enough to
3323 * have the entire file_extent_item.
3324 */
3325 if (ret && btrfs_token_file_extent_type(leaf, fi, &token) ==
3326 BTRFS_FILE_EXTENT_INLINE) {
3327 ret = btrfs_del_item(trans, log, path);
3328 btrfs_release_path(path);
3329 if (ret) {
3330 path->really_keep_locks = 0;
3331 return ret;
3332 }
3333 goto insert;
3334 }
3335
3299 btrfs_set_token_file_extent_generation(leaf, fi, em->generation, 3336 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3300 &token); 3337 &token);
3301 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3338 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
@@ -3362,6 +3399,92 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3362 csum_len = block_len; 3399 csum_len = block_len;
3363 } 3400 }
3364 3401
3402 /*
3403 * First check and see if our csums are on our outstanding ordered
3404 * extents.
3405 */
3406again:
3407 spin_lock_irq(&log->log_extents_lock[index]);
3408 list_for_each_entry(ordered, &log->logged_list[index], log_list) {
3409 struct btrfs_ordered_sum *sum;
3410
3411 if (!mod_len)
3412 break;
3413
3414 if (ordered->inode != inode)
3415 continue;
3416
3417 if (ordered->file_offset + ordered->len <= mod_start ||
3418 mod_start + mod_len <= ordered->file_offset)
3419 continue;
3420
3421 /*
3422 * We are going to copy all the csums on this ordered extent, so
3423 * go ahead and adjust mod_start and mod_len in case this
3424 * ordered extent has already been logged.
3425 */
3426 if (ordered->file_offset > mod_start) {
3427 if (ordered->file_offset + ordered->len >=
3428 mod_start + mod_len)
3429 mod_len = ordered->file_offset - mod_start;
3430 /*
3431 * If we have this case
3432 *
3433 * |--------- logged extent ---------|
3434 * |----- ordered extent ----|
3435 *
3436 * Just don't mess with mod_start and mod_len, we'll
3437 * just end up logging more csums than we need and it
3438 * will be ok.
3439 */
3440 } else {
3441 if (ordered->file_offset + ordered->len <
3442 mod_start + mod_len) {
3443 mod_len = (mod_start + mod_len) -
3444 (ordered->file_offset + ordered->len);
3445 mod_start = ordered->file_offset +
3446 ordered->len;
3447 } else {
3448 mod_len = 0;
3449 }
3450 }
3451
3452 /*
3453 * To keep us from looping for the above case of an ordered
3454 * extent that falls inside of the logged extent.
3455 */
3456 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
3457 &ordered->flags))
3458 continue;
3459 atomic_inc(&ordered->refs);
3460 spin_unlock_irq(&log->log_extents_lock[index]);
3461 /*
3462 * we've dropped the lock, we must either break or
3463 * start over after this.
3464 */
3465
3466 wait_event(ordered->wait, ordered->csum_bytes_left == 0);
3467
3468 list_for_each_entry(sum, &ordered->list, list) {
3469 ret = btrfs_csum_file_blocks(trans, log, sum);
3470 if (ret) {
3471 btrfs_put_ordered_extent(ordered);
3472 goto unlocked;
3473 }
3474 }
3475 btrfs_put_ordered_extent(ordered);
3476 goto again;
3477
3478 }
3479 spin_unlock_irq(&log->log_extents_lock[index]);
3480unlocked:
3481
3482 if (!mod_len || ret)
3483 return ret;
3484
3485 csum_offset = mod_start - em->start;
3486 csum_len = mod_len;
3487
3365 /* block start is already adjusted for the file extent offset. */ 3488 /* block start is already adjusted for the file extent offset. */
3366 ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 3489 ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3367 em->block_start + csum_offset, 3490 em->block_start + csum_offset,
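[Editor's sketch, not part of the patch: the mod_start/mod_len adjustments above are plain interval trimming, shrinking the logged range by whatever part of it an overlapping ordered extent already covers. A standalone restatement of just that arithmetic, with made-up offsets:]

#include <stdio.h>
#include <stdint.h>

/* Trim [*mod_start, *mod_start + *mod_len) against the ordered extent
 * [ofs, ofs + len), mirroring the head/tail/containment cases above. */
static void trim_logged_range(uint64_t ofs, uint64_t len,
                              uint64_t *mod_start, uint64_t *mod_len)
{
    if (ofs + len <= *mod_start || *mod_start + *mod_len <= ofs)
        return;                         /* no overlap at all */

    if (ofs > *mod_start) {
        /* ordered extent covers the tail of the logged range */
        if (ofs + len >= *mod_start + *mod_len)
            *mod_len = ofs - *mod_start;
        /* else it sits fully inside: leave the range alone, we just
         * log a few more csums than strictly needed */
    } else {
        /* ordered extent covers the head of the logged range */
        if (ofs + len < *mod_start + *mod_len) {
            *mod_len = (*mod_start + *mod_len) - (ofs + len);
            *mod_start = ofs + len;
        } else {
            *mod_len = 0;               /* fully covered, nothing left */
        }
    }
}

int main(void)
{
    uint64_t mod_start = 0, mod_len = 1 << 20;  /* 1MiB logged extent */

    trim_logged_range(0, 256 << 10, &mod_start, &mod_len);
    /* head covered: the remaining range is [256K, 1M) */
    printf("start=%llu len=%llu\n", (unsigned long long)mod_start,
           (unsigned long long)mod_len);
    return 0;
}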
@@ -3393,6 +3516,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3393 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3516 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3394 u64 test_gen; 3517 u64 test_gen;
3395 int ret = 0; 3518 int ret = 0;
3519 int num = 0;
3396 3520
3397 INIT_LIST_HEAD(&extents); 3521 INIT_LIST_HEAD(&extents);
3398 3522
@@ -3401,16 +3525,31 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3401 3525
3402 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 3526 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
3403 list_del_init(&em->list); 3527 list_del_init(&em->list);
3528
3529 /*
3530 * Just an arbitrary number, this can be really CPU intensive
3531 * once we start getting a lot of extents, and really once we
3532 * have a bunch of extents we just want to commit since it will
3533 * be faster.
3534 */
3535 if (++num > 32768) {
3536 list_del_init(&tree->modified_extents);
3537 ret = -EFBIG;
3538 goto process;
3539 }
3540
3404 if (em->generation <= test_gen) 3541 if (em->generation <= test_gen)
3405 continue; 3542 continue;
3406 /* Need a ref to keep it from getting evicted from cache */ 3543 /* Need a ref to keep it from getting evicted from cache */
3407 atomic_inc(&em->refs); 3544 atomic_inc(&em->refs);
3408 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 3545 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
3409 list_add_tail(&em->list, &extents); 3546 list_add_tail(&em->list, &extents);
3547 num++;
3410 } 3548 }
3411 3549
3412 list_sort(NULL, &extents, extent_cmp); 3550 list_sort(NULL, &extents, extent_cmp);
3413 3551
3552process:
3414 while (!list_empty(&extents)) { 3553 while (!list_empty(&extents)) {
3415 em = list_entry(extents.next, struct extent_map, list); 3554 em = list_entry(extents.next, struct extent_map, list);
3416 3555
@@ -3513,6 +3652,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3513 3652
3514 mutex_lock(&BTRFS_I(inode)->log_mutex); 3653 mutex_lock(&BTRFS_I(inode)->log_mutex);
3515 3654
3655 btrfs_get_logged_extents(log, inode);
3656
3516 /* 3657 /*
3517 * a brute force approach to making sure we get the most uptodate 3658 * a brute force approach to making sure we get the most uptodate
3518 * copies of everything. 3659 * copies of everything.
@@ -3558,7 +3699,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3558 while (1) { 3699 while (1) {
3559 ins_nr = 0; 3700 ins_nr = 0;
3560 ret = btrfs_search_forward(root, &min_key, &max_key, 3701 ret = btrfs_search_forward(root, &min_key, &max_key,
3561 path, 0, trans->transid); 3702 path, trans->transid);
3562 if (ret != 0) 3703 if (ret != 0)
3563 break; 3704 break;
3564again: 3705again:
@@ -3656,6 +3797,8 @@ log_extents:
3656 BTRFS_I(inode)->logged_trans = trans->transid; 3797 BTRFS_I(inode)->logged_trans = trans->transid;
3657 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 3798 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
3658out_unlock: 3799out_unlock:
3800 if (err)
3801 btrfs_free_logged_extents(log, log->log_transid);
3659 mutex_unlock(&BTRFS_I(inode)->log_mutex); 3802 mutex_unlock(&BTRFS_I(inode)->log_mutex);
3660 3803
3661 btrfs_free_path(path); 3804 btrfs_free_path(path);
@@ -3822,7 +3965,6 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3822end_trans: 3965end_trans:
3823 dput(old_parent); 3966 dput(old_parent);
3824 if (ret < 0) { 3967 if (ret < 0) {
3825 WARN_ON(ret != -ENOSPC);
3826 root->fs_info->last_trans_log_full_commit = trans->transid; 3968 root->fs_info->last_trans_log_full_commit = trans->transid;
3827 ret = 1; 3969 ret = 1;
3828 } 3970 }
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 99be4c138db6..ddc61cad0080 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -5,7 +5,7 @@
5 */ 5 */
6 6
7#include <linux/slab.h> 7#include <linux/slab.h>
8#include <linux/module.h> 8#include <linux/export.h>
9#include "ulist.h" 9#include "ulist.h"
10 10
11/* 11/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5cbb7f4b1672..35bb2d4ed29f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,6 +25,8 @@
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/raid/pq.h>
29#include <asm/div64.h>
28#include "compat.h" 30#include "compat.h"
29#include "ctree.h" 31#include "ctree.h"
30#include "extent_map.h" 32#include "extent_map.h"
@@ -32,6 +34,7 @@
32#include "transaction.h" 34#include "transaction.h"
33#include "print-tree.h" 35#include "print-tree.h"
34#include "volumes.h" 36#include "volumes.h"
37#include "raid56.h"
35#include "async-thread.h" 38#include "async-thread.h"
36#include "check-integrity.h" 39#include "check-integrity.h"
37#include "rcu-string.h" 40#include "rcu-string.h"
@@ -647,6 +650,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
647 new_device->writeable = 0; 650 new_device->writeable = 0;
648 new_device->in_fs_metadata = 0; 651 new_device->in_fs_metadata = 0;
649 new_device->can_discard = 0; 652 new_device->can_discard = 0;
653 spin_lock_init(&new_device->io_lock);
650 list_replace_rcu(&device->dev_list, &new_device->dev_list); 654 list_replace_rcu(&device->dev_list, &new_device->dev_list);
651 655
652 call_rcu(&device->rcu, free_device); 656 call_rcu(&device->rcu, free_device);
@@ -792,26 +796,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
792 return ret; 796 return ret;
793} 797}
794 798
799/*
800 * Look for a btrfs signature on a device. This may be called out of the mount path
801 * and we are not allowed to call set_blocksize during the scan. The superblock
802 * is read via pagecache
803 */
795int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 804int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
796 struct btrfs_fs_devices **fs_devices_ret) 805 struct btrfs_fs_devices **fs_devices_ret)
797{ 806{
798 struct btrfs_super_block *disk_super; 807 struct btrfs_super_block *disk_super;
799 struct block_device *bdev; 808 struct block_device *bdev;
800 struct buffer_head *bh; 809 struct page *page;
801 int ret; 810 void *p;
811 int ret = -EINVAL;
802 u64 devid; 812 u64 devid;
803 u64 transid; 813 u64 transid;
804 u64 total_devices; 814 u64 total_devices;
815 u64 bytenr;
816 pgoff_t index;
805 817
818 /*
819 * we would like to check all the supers, but that would make
820 * a btrfs mount succeed after a mkfs from a different FS.
821 * So, we need to add a special mount option to scan for
822 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
823 */
824 bytenr = btrfs_sb_offset(0);
806 flags |= FMODE_EXCL; 825 flags |= FMODE_EXCL;
807 mutex_lock(&uuid_mutex); 826 mutex_lock(&uuid_mutex);
808 ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); 827
809 if (ret) 828 bdev = blkdev_get_by_path(path, flags, holder);
829
830 if (IS_ERR(bdev)) {
831 ret = PTR_ERR(bdev);
810 goto error; 832 goto error;
811 disk_super = (struct btrfs_super_block *)bh->b_data; 833 }
834
835 /* make sure our super fits in the device */
836 if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
837 goto error_bdev_put;
838
839 /* make sure our super fits in the page */
840 if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
841 goto error_bdev_put;
842
843 /* make sure our super doesn't straddle pages on disk */
844 index = bytenr >> PAGE_CACHE_SHIFT;
845 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
846 goto error_bdev_put;
847
848 /* pull in the page with our super */
849 page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
850 index, GFP_NOFS);
851
852 if (IS_ERR_OR_NULL(page))
853 goto error_bdev_put;
854
855 p = kmap(page);
856
857 /* align our pointer to the offset of the super block */
858 disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
859
860 if (btrfs_super_bytenr(disk_super) != bytenr ||
861 disk_super->magic != cpu_to_le64(BTRFS_MAGIC))
862 goto error_unmap;
863
812 devid = btrfs_stack_device_id(&disk_super->dev_item); 864 devid = btrfs_stack_device_id(&disk_super->dev_item);
813 transid = btrfs_super_generation(disk_super); 865 transid = btrfs_super_generation(disk_super);
814 total_devices = btrfs_super_num_devices(disk_super); 866 total_devices = btrfs_super_num_devices(disk_super);
867
815 if (disk_super->label[0]) { 868 if (disk_super->label[0]) {
816 if (disk_super->label[BTRFS_LABEL_SIZE - 1]) 869 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
817 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; 870 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
@@ -819,12 +872,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
819 } else { 872 } else {
820 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 873 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
821 } 874 }
875
822 printk(KERN_CONT "devid %llu transid %llu %s\n", 876 printk(KERN_CONT "devid %llu transid %llu %s\n",
823 (unsigned long long)devid, (unsigned long long)transid, path); 877 (unsigned long long)devid, (unsigned long long)transid, path);
878
824 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 879 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
825 if (!ret && fs_devices_ret) 880 if (!ret && fs_devices_ret)
826 (*fs_devices_ret)->total_devices = total_devices; 881 (*fs_devices_ret)->total_devices = total_devices;
827 brelse(bh); 882
883error_unmap:
884 kunmap(page);
885 page_cache_release(page);
886
887error_bdev_put:
828 blkdev_put(bdev, flags); 888 blkdev_put(bdev, flags);
829error: 889error:
830 mutex_unlock(&uuid_mutex); 890 mutex_unlock(&uuid_mutex);
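[Editor's sketch, not part of the patch: the pagecache read above works because the primary superblock at btrfs_sb_offset(0) = 64K is page aligned and must fit inside a single page, which is what the three checks before read_cache_page_gfp() guard. Standalone arithmetic assuming 4K pages; super_size is an assumed upper bound, not the real structure size.]

#include <stdio.h>
#include <stdint.h>

#define PAGE_CACHE_SIZE     4096ULL                     /* assumed page size */
#define PAGE_CACHE_SHIFT    12
#define PAGE_CACHE_MASK     (~(PAGE_CACHE_SIZE - 1))

int main(void)
{
    uint64_t bytenr = 64 * 1024;    /* btrfs_sb_offset(0): first super at 64K */
    uint64_t super_size = 4096;     /* upper bound allowed by the sizeof check */

    uint64_t index = bytenr >> PAGE_CACHE_SHIFT;            /* page 16 */
    uint64_t off_in_page = bytenr & ~PAGE_CACHE_MASK;       /* 0, page aligned */
    int straddles = (((bytenr + super_size - 1) >> PAGE_CACHE_SHIFT) != index);

    /* disk_super would be kmap(page) + off_in_page */
    printf("index=%llu offset=%llu straddles=%d\n",
           (unsigned long long)index,
           (unsigned long long)off_in_page, straddles);
    return 0;
}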
@@ -1372,14 +1432,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1372 u64 devid; 1432 u64 devid;
1373 u64 num_devices; 1433 u64 num_devices;
1374 u8 *dev_uuid; 1434 u8 *dev_uuid;
1435 unsigned seq;
1375 int ret = 0; 1436 int ret = 0;
1376 bool clear_super = false; 1437 bool clear_super = false;
1377 1438
1378 mutex_lock(&uuid_mutex); 1439 mutex_lock(&uuid_mutex);
1379 1440
1380 all_avail = root->fs_info->avail_data_alloc_bits | 1441 do {
1381 root->fs_info->avail_system_alloc_bits | 1442 seq = read_seqbegin(&root->fs_info->profiles_lock);
1382 root->fs_info->avail_metadata_alloc_bits; 1443
1444 all_avail = root->fs_info->avail_data_alloc_bits |
1445 root->fs_info->avail_system_alloc_bits |
1446 root->fs_info->avail_metadata_alloc_bits;
1447 } while (read_seqretry(&root->fs_info->profiles_lock, seq));
1383 1448
1384 num_devices = root->fs_info->fs_devices->num_devices; 1449 num_devices = root->fs_info->fs_devices->num_devices;
1385 btrfs_dev_replace_lock(&root->fs_info->dev_replace); 1450 btrfs_dev_replace_lock(&root->fs_info->dev_replace);
@@ -1403,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1403 goto out; 1468 goto out;
1404 } 1469 }
1405 1470
1471 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1472 root->fs_info->fs_devices->rw_devices <= 2) {
1473 printk(KERN_ERR "btrfs: unable to go below two "
1474 "devices on raid5\n");
1475 ret = -EINVAL;
1476 goto out;
1477 }
1478 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1479 root->fs_info->fs_devices->rw_devices <= 3) {
1480 printk(KERN_ERR "btrfs: unable to go below three "
1481 "devices on raid6\n");
1482 ret = -EINVAL;
1483 goto out;
1484 }
1485
1406 if (strcmp(device_path, "missing") == 0) { 1486 if (strcmp(device_path, "missing") == 0) {
1407 struct list_head *devices; 1487 struct list_head *devices;
1408 struct btrfs_device *tmp; 1488 struct btrfs_device *tmp;
@@ -2616,7 +2696,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2616 chunk_used = btrfs_block_group_used(&cache->item); 2696 chunk_used = btrfs_block_group_used(&cache->item);
2617 2697
2618 if (bargs->usage == 0) 2698 if (bargs->usage == 0)
2619 user_thresh = 0; 2699 user_thresh = 1;
2620 else if (bargs->usage > 100) 2700 else if (bargs->usage > 100)
2621 user_thresh = cache->key.offset; 2701 user_thresh = cache->key.offset;
2622 else 2702 else
@@ -2664,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
2664 return 0; 2744 return 0;
2665 2745
2666 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 2746 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2667 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) 2747 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
2668 factor = 2; 2748 factor = num_stripes / 2;
2669 else 2749 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
2670 factor = 1; 2750 factor = num_stripes - 1;
2671 factor = num_stripes / factor; 2751 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
2752 factor = num_stripes - 2;
2753 } else {
2754 factor = num_stripes;
2755 }
2672 2756
2673 for (i = 0; i < num_stripes; i++) { 2757 for (i = 0; i < num_stripes; i++) {
2674 stripe = btrfs_stripe_nr(chunk, i); 2758 stripe = btrfs_stripe_nr(chunk, i);
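[Editor's restatement, not part of the patch: the drange filter now scales the chunk length by the number of data-bearing stripes for each profile instead of the old fixed factor of 1 or 2. Standalone sketch; the BG_* flag values below are stand-ins, not the real BTRFS_BLOCK_GROUP_* bits.]

#include <stdio.h>
#include <stdint.h>

#define BG_DUP      (1ULL << 0)
#define BG_RAID1    (1ULL << 1)
#define BG_RAID10   (1ULL << 2)
#define BG_RAID5    (1ULL << 3)
#define BG_RAID6    (1ULL << 4)

/* how many of the chunk's stripes actually hold data */
static int data_stripe_factor(uint64_t type, int num_stripes)
{
    if (type & (BG_DUP | BG_RAID1 | BG_RAID10))
        return num_stripes / 2;         /* mirrored: half the stripes */
    if (type & BG_RAID5)
        return num_stripes - 1;         /* one parity stripe */
    if (type & BG_RAID6)
        return num_stripes - 2;         /* P and Q stripes */
    return num_stripes;                 /* single / raid0 */
}

int main(void)
{
    /* a 6-stripe raid6 chunk keeps data on 4 of the 6 devices */
    printf("%d\n", data_stripe_factor(BG_RAID6, 6));
    return 0;
}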
@@ -2985,6 +3069,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2985 int mixed = 0; 3069 int mixed = 0;
2986 int ret; 3070 int ret;
2987 u64 num_devices; 3071 u64 num_devices;
3072 unsigned seq;
2988 3073
2989 if (btrfs_fs_closing(fs_info) || 3074 if (btrfs_fs_closing(fs_info) ||
2990 atomic_read(&fs_info->balance_pause_req) || 3075 atomic_read(&fs_info->balance_pause_req) ||
@@ -3027,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3027 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3112 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3028 else 3113 else
3029 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3114 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
3030 BTRFS_BLOCK_GROUP_RAID10); 3115 BTRFS_BLOCK_GROUP_RAID10 |
3116 BTRFS_BLOCK_GROUP_RAID5 |
3117 BTRFS_BLOCK_GROUP_RAID6);
3031 3118
3032 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3119 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3033 (!alloc_profile_is_valid(bctl->data.target, 1) || 3120 (!alloc_profile_is_valid(bctl->data.target, 1) ||
@@ -3067,23 +3154,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3067 3154
3068 /* allow to reduce meta or sys integrity only if force set */ 3155 /* allow to reduce meta or sys integrity only if force set */
3069 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3156 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3070 BTRFS_BLOCK_GROUP_RAID10; 3157 BTRFS_BLOCK_GROUP_RAID10 |
3071 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3158 BTRFS_BLOCK_GROUP_RAID5 |
3072 (fs_info->avail_system_alloc_bits & allowed) && 3159 BTRFS_BLOCK_GROUP_RAID6;
3073 !(bctl->sys.target & allowed)) || 3160 do {
3074 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3161 seq = read_seqbegin(&fs_info->profiles_lock);
3075 (fs_info->avail_metadata_alloc_bits & allowed) && 3162
3076 !(bctl->meta.target & allowed))) { 3163 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3077 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3164 (fs_info->avail_system_alloc_bits & allowed) &&
3078 printk(KERN_INFO "btrfs: force reducing metadata " 3165 !(bctl->sys.target & allowed)) ||
3079 "integrity\n"); 3166 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3080 } else { 3167 (fs_info->avail_metadata_alloc_bits & allowed) &&
3081 printk(KERN_ERR "btrfs: balance will reduce metadata " 3168 !(bctl->meta.target & allowed))) {
3082 "integrity, use force if you want this\n"); 3169 if (bctl->flags & BTRFS_BALANCE_FORCE) {
3083 ret = -EINVAL; 3170 printk(KERN_INFO "btrfs: force reducing metadata "
3084 goto out; 3171 "integrity\n");
3172 } else {
3173 printk(KERN_ERR "btrfs: balance will reduce metadata "
3174 "integrity, use force if you want this\n");
3175 ret = -EINVAL;
3176 goto out;
3177 }
3085 } 3178 }
3086 } 3179 } while (read_seqretry(&fs_info->profiles_lock, seq));
3087 3180
3088 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3181 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3089 int num_tolerated_disk_barrier_failures; 3182 int num_tolerated_disk_barrier_failures;
@@ -3127,21 +3220,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3127 mutex_lock(&fs_info->balance_mutex); 3220 mutex_lock(&fs_info->balance_mutex);
3128 atomic_dec(&fs_info->balance_running); 3221 atomic_dec(&fs_info->balance_running);
3129 3222
3130 if (bargs) {
3131 memset(bargs, 0, sizeof(*bargs));
3132 update_ioctl_balance_args(fs_info, 0, bargs);
3133 }
3134
3135 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3136 balance_need_close(fs_info)) {
3137 __cancel_balance(fs_info);
3138 }
3139
3140 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3223 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3141 fs_info->num_tolerated_disk_barrier_failures = 3224 fs_info->num_tolerated_disk_barrier_failures =
3142 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3225 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3143 } 3226 }
3144 3227
3228 if (bargs) {
3229 memset(bargs, 0, sizeof(*bargs));
3230 update_ioctl_balance_args(fs_info, 0, bargs);
3231 }
3232
3145 wake_up(&fs_info->balance_wait_q); 3233 wake_up(&fs_info->balance_wait_q);
3146 3234
3147 return ret; 3235 return ret;
@@ -3504,13 +3592,86 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
3504} 3592}
3505 3593
3506struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 3594struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3507 { 2, 1, 0, 4, 2, 2 /* raid10 */ }, 3595 [BTRFS_RAID_RAID10] = {
3508 { 1, 1, 2, 2, 2, 2 /* raid1 */ }, 3596 .sub_stripes = 2,
3509 { 1, 2, 1, 1, 1, 2 /* dup */ }, 3597 .dev_stripes = 1,
3510 { 1, 1, 0, 2, 1, 1 /* raid0 */ }, 3598 .devs_max = 0, /* 0 == as many as possible */
3511 { 1, 1, 1, 1, 1, 1 /* single */ }, 3599 .devs_min = 4,
3600 .devs_increment = 2,
3601 .ncopies = 2,
3602 },
3603 [BTRFS_RAID_RAID1] = {
3604 .sub_stripes = 1,
3605 .dev_stripes = 1,
3606 .devs_max = 2,
3607 .devs_min = 2,
3608 .devs_increment = 2,
3609 .ncopies = 2,
3610 },
3611 [BTRFS_RAID_DUP] = {
3612 .sub_stripes = 1,
3613 .dev_stripes = 2,
3614 .devs_max = 1,
3615 .devs_min = 1,
3616 .devs_increment = 1,
3617 .ncopies = 2,
3618 },
3619 [BTRFS_RAID_RAID0] = {
3620 .sub_stripes = 1,
3621 .dev_stripes = 1,
3622 .devs_max = 0,
3623 .devs_min = 2,
3624 .devs_increment = 1,
3625 .ncopies = 1,
3626 },
3627 [BTRFS_RAID_SINGLE] = {
3628 .sub_stripes = 1,
3629 .dev_stripes = 1,
3630 .devs_max = 1,
3631 .devs_min = 1,
3632 .devs_increment = 1,
3633 .ncopies = 1,
3634 },
3635 [BTRFS_RAID_RAID5] = {
3636 .sub_stripes = 1,
3637 .dev_stripes = 1,
3638 .devs_max = 0,
3639 .devs_min = 2,
3640 .devs_increment = 1,
3641 .ncopies = 2,
3642 },
3643 [BTRFS_RAID_RAID6] = {
3644 .sub_stripes = 1,
3645 .dev_stripes = 1,
3646 .devs_max = 0,
3647 .devs_min = 3,
3648 .devs_increment = 1,
3649 .ncopies = 3,
3650 },
3512}; 3651};
3513 3652
3653static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
3654{
3655 /* TODO allow them to set a preferred stripe size */
3656 return 64 * 1024;
3657}
3658
3659static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
3660{
3661 u64 features;
3662
3663 if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
3664 return;
3665
3666 features = btrfs_super_incompat_flags(info->super_copy);
3667 if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
3668 return;
3669
3670 features |= BTRFS_FEATURE_INCOMPAT_RAID56;
3671 btrfs_set_super_incompat_flags(info->super_copy, features);
3672 printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
3673}
3674
3514static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3675static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3515 struct btrfs_root *extent_root, 3676 struct btrfs_root *extent_root,
3516 struct map_lookup **map_ret, 3677 struct map_lookup **map_ret,
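[Editor's illustration, not part of the patch: the table is now indexed by the BTRFS_RAID_* enum with designated initializers, so profile limits can be looked up by name. Below is a standalone re-declaration carrying the same values; struct and enum names are local stand-ins for btrfs_raid_attr / btrfs_raid_array.]

#include <stdio.h>

struct raid_attr {
    int sub_stripes;
    int dev_stripes;
    int devs_max;       /* 0 == as many as possible */
    int devs_min;
    int devs_increment;
    int ncopies;
};

enum { R_RAID10, R_RAID1, R_DUP, R_RAID0, R_SINGLE, R_RAID5, R_RAID6, R_NR };

static const struct raid_attr raid_array[R_NR] = {
    [R_RAID10] = { 2, 1, 0, 4, 2, 2 },
    [R_RAID1]  = { 1, 1, 2, 2, 2, 2 },
    [R_DUP]    = { 1, 2, 1, 1, 1, 2 },
    [R_RAID0]  = { 1, 1, 0, 2, 1, 1 },
    [R_SINGLE] = { 1, 1, 1, 1, 1, 1 },
    [R_RAID5]  = { 1, 1, 0, 2, 1, 2 },
    [R_RAID6]  = { 1, 1, 0, 3, 1, 3 },
};

int main(void)
{
    /* per the table: raid6 needs at least three devices and counts 3 "copies" */
    printf("raid6: devs_min=%d ncopies=%d\n",
           raid_array[R_RAID6].devs_min, raid_array[R_RAID6].ncopies);
    return 0;
}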
@@ -3526,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3526 struct btrfs_device_info *devices_info = NULL; 3687 struct btrfs_device_info *devices_info = NULL;
3527 u64 total_avail; 3688 u64 total_avail;
3528 int num_stripes; /* total number of stripes to allocate */ 3689 int num_stripes; /* total number of stripes to allocate */
3690 int data_stripes; /* number of stripes that count for
3691 block group size */
3529 int sub_stripes; /* sub_stripes info for map */ 3692 int sub_stripes; /* sub_stripes info for map */
3530 int dev_stripes; /* stripes per dev */ 3693 int dev_stripes; /* stripes per dev */
3531 int devs_max; /* max devs to use */ 3694 int devs_max; /* max devs to use */
@@ -3537,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3537 u64 max_chunk_size; 3700 u64 max_chunk_size;
3538 u64 stripe_size; 3701 u64 stripe_size;
3539 u64 num_bytes; 3702 u64 num_bytes;
3703 u64 raid_stripe_len = BTRFS_STRIPE_LEN;
3540 int ndevs; 3704 int ndevs;
3541 int i; 3705 int i;
3542 int j; 3706 int j;
@@ -3631,12 +3795,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3631 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 3795 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
3632 continue; 3796 continue;
3633 3797
3798 if (ndevs == fs_devices->rw_devices) {
3799 WARN(1, "%s: found more than %llu devices\n",
3800 __func__, fs_devices->rw_devices);
3801 break;
3802 }
3634 devices_info[ndevs].dev_offset = dev_offset; 3803 devices_info[ndevs].dev_offset = dev_offset;
3635 devices_info[ndevs].max_avail = max_avail; 3804 devices_info[ndevs].max_avail = max_avail;
3636 devices_info[ndevs].total_avail = total_avail; 3805 devices_info[ndevs].total_avail = total_avail;
3637 devices_info[ndevs].dev = device; 3806 devices_info[ndevs].dev = device;
3638 ++ndevs; 3807 ++ndevs;
3639 WARN_ON(ndevs > fs_devices->rw_devices);
3640 } 3808 }
3641 3809
3642 /* 3810 /*
@@ -3662,16 +3830,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3662 stripe_size = devices_info[ndevs-1].max_avail; 3830 stripe_size = devices_info[ndevs-1].max_avail;
3663 num_stripes = ndevs * dev_stripes; 3831 num_stripes = ndevs * dev_stripes;
3664 3832
3665 if (stripe_size * ndevs > max_chunk_size * ncopies) { 3833 /*
3666 stripe_size = max_chunk_size * ncopies; 3834 * this will have to be fixed for RAID1 and RAID10 over
3667 do_div(stripe_size, ndevs); 3835 * more drives
3836 */
3837 data_stripes = num_stripes / ncopies;
3838
3839 if (type & BTRFS_BLOCK_GROUP_RAID5) {
3840 raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
3841 btrfs_super_stripesize(info->super_copy));
3842 data_stripes = num_stripes - 1;
3843 }
3844 if (type & BTRFS_BLOCK_GROUP_RAID6) {
3845 raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
3846 btrfs_super_stripesize(info->super_copy));
3847 data_stripes = num_stripes - 2;
3848 }
3849
3850 /*
3851 * Use the number of data stripes to figure out how big this chunk
3852 * is really going to be in terms of logical address space,
3853 * and compare that answer with the max chunk size
3854 */
3855 if (stripe_size * data_stripes > max_chunk_size) {
3856 u64 mask = (1ULL << 24) - 1;
3857 stripe_size = max_chunk_size;
3858 do_div(stripe_size, data_stripes);
3859
3860 /* bump the answer up to a 16MB boundary */
3861 stripe_size = (stripe_size + mask) & ~mask;
3862
3863 /* but don't go higher than the limits we found
3864 * while searching for free extents
3865 */
3866 if (stripe_size > devices_info[ndevs-1].max_avail)
3867 stripe_size = devices_info[ndevs-1].max_avail;
3668 } 3868 }
3669 3869
3670 do_div(stripe_size, dev_stripes); 3870 do_div(stripe_size, dev_stripes);
3671 3871
3672 /* align to BTRFS_STRIPE_LEN */ 3872 /* align to BTRFS_STRIPE_LEN */
3673 do_div(stripe_size, BTRFS_STRIPE_LEN); 3873 do_div(stripe_size, raid_stripe_len);
3674 stripe_size *= BTRFS_STRIPE_LEN; 3874 stripe_size *= raid_stripe_len;
3675 3875
3676 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3876 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3677 if (!map) { 3877 if (!map) {
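[Editor's worked example, not part of the patch: for RAID5/6 the chunk-size cap is now applied to the data stripes only; the per-device stripe is then bumped up to a 16MiB boundary and clamped to the smallest free extent found. The numbers below are assumed, chosen only to walk the arithmetic.]

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t max_chunk_size = 10ULL << 30;  /* assumed 10GiB cap for this type */
    uint64_t max_avail      = 4ULL << 30;   /* assumed smallest per-device free space */
    uint64_t stripe_size    = max_avail;
    int num_stripes  = 4;                   /* raid5 across four devices */
    int data_stripes = num_stripes - 1;     /* one stripe's worth of parity */

    if (stripe_size * data_stripes > max_chunk_size) {
        uint64_t mask = (1ULL << 24) - 1;   /* 16MiB - 1 */

        stripe_size = max_chunk_size / data_stripes;
        stripe_size = (stripe_size + mask) & ~mask;     /* bump to 16MiB boundary */
        if (stripe_size > max_avail)
            stripe_size = max_avail;
    }

    /* 3424MiB per device; the block group spans roughly 10GiB of logical space */
    printf("stripe_size=%lluMiB num_bytes=%lluMiB\n",
           (unsigned long long)(stripe_size >> 20),
           (unsigned long long)((stripe_size * data_stripes) >> 20));
    return 0;
}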
@@ -3689,14 +3889,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3689 } 3889 }
3690 } 3890 }
3691 map->sector_size = extent_root->sectorsize; 3891 map->sector_size = extent_root->sectorsize;
3692 map->stripe_len = BTRFS_STRIPE_LEN; 3892 map->stripe_len = raid_stripe_len;
3693 map->io_align = BTRFS_STRIPE_LEN; 3893 map->io_align = raid_stripe_len;
3694 map->io_width = BTRFS_STRIPE_LEN; 3894 map->io_width = raid_stripe_len;
3695 map->type = type; 3895 map->type = type;
3696 map->sub_stripes = sub_stripes; 3896 map->sub_stripes = sub_stripes;
3697 3897
3698 *map_ret = map; 3898 *map_ret = map;
3699 num_bytes = stripe_size * (num_stripes / ncopies); 3899 num_bytes = stripe_size * data_stripes;
3700 3900
3701 *stripe_size_out = stripe_size; 3901 *stripe_size_out = stripe_size;
3702 *num_bytes_out = num_bytes; 3902 *num_bytes_out = num_bytes;
@@ -3718,15 +3918,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3718 write_lock(&em_tree->lock); 3918 write_lock(&em_tree->lock);
3719 ret = add_extent_mapping(em_tree, em); 3919 ret = add_extent_mapping(em_tree, em);
3720 write_unlock(&em_tree->lock); 3920 write_unlock(&em_tree->lock);
3721 free_extent_map(em); 3921 if (ret) {
3722 if (ret) 3922 free_extent_map(em);
3723 goto error;
3724
3725 ret = btrfs_make_block_group(trans, extent_root, 0, type,
3726 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3727 start, num_bytes);
3728 if (ret)
3729 goto error; 3923 goto error;
3924 }
3730 3925
3731 for (i = 0; i < map->num_stripes; ++i) { 3926 for (i = 0; i < map->num_stripes; ++i) {
3732 struct btrfs_device *device; 3927 struct btrfs_device *device;
@@ -3739,15 +3934,44 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3739 info->chunk_root->root_key.objectid, 3934 info->chunk_root->root_key.objectid,
3740 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3935 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3741 start, dev_offset, stripe_size); 3936 start, dev_offset, stripe_size);
3742 if (ret) { 3937 if (ret)
3743 btrfs_abort_transaction(trans, extent_root, ret); 3938 goto error_dev_extent;
3744 goto error; 3939 }
3745 } 3940
3941 ret = btrfs_make_block_group(trans, extent_root, 0, type,
3942 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3943 start, num_bytes);
3944 if (ret) {
3945 i = map->num_stripes - 1;
3946 goto error_dev_extent;
3746 } 3947 }
3747 3948
3949 free_extent_map(em);
3950 check_raid56_incompat_flag(extent_root->fs_info, type);
3951
3748 kfree(devices_info); 3952 kfree(devices_info);
3749 return 0; 3953 return 0;
3750 3954
3955error_dev_extent:
3956 for (; i >= 0; i--) {
3957 struct btrfs_device *device;
3958 int err;
3959
3960 device = map->stripes[i].dev;
3961 err = btrfs_free_dev_extent(trans, device, start);
3962 if (err) {
3963 btrfs_abort_transaction(trans, extent_root, err);
3964 break;
3965 }
3966 }
3967 write_lock(&em_tree->lock);
3968 remove_extent_mapping(em_tree, em);
3969 write_unlock(&em_tree->lock);
3970
3971 /* One for our allocation */
3972 free_extent_map(em);
3973 /* One for the tree reference */
3974 free_extent_map(em);
3751error: 3975error:
3752 kfree(map); 3976 kfree(map);
3753 kfree(devices_info); 3977 kfree(devices_info);
@@ -3887,10 +4111,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3887 if (ret) 4111 if (ret)
3888 return ret; 4112 return ret;
3889 4113
3890 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 4114 alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
3891 fs_info->avail_metadata_alloc_bits;
3892 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3893
3894 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 4115 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3895 &stripe_size, chunk_offset, alloc_profile); 4116 &stripe_size, chunk_offset, alloc_profile);
3896 if (ret) 4117 if (ret)
@@ -3898,10 +4119,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3898 4119
3899 sys_chunk_offset = chunk_offset + chunk_size; 4120 sys_chunk_offset = chunk_offset + chunk_size;
3900 4121
3901 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 4122 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
3902 fs_info->avail_system_alloc_bits;
3903 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3904
3905 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 4123 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3906 &sys_chunk_size, &sys_stripe_size, 4124 &sys_chunk_size, &sys_stripe_size,
3907 sys_chunk_offset, alloc_profile); 4125 sys_chunk_offset, alloc_profile);
@@ -4014,6 +4232,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4014 ret = map->num_stripes; 4232 ret = map->num_stripes;
4015 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4233 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4016 ret = map->sub_stripes; 4234 ret = map->sub_stripes;
4235 else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
4236 ret = 2;
4237 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4238 ret = 3;
4017 else 4239 else
4018 ret = 1; 4240 ret = 1;
4019 free_extent_map(em); 4241 free_extent_map(em);
@@ -4026,6 +4248,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4026 return ret; 4248 return ret;
4027} 4249}
4028 4250
4251unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
4252 struct btrfs_mapping_tree *map_tree,
4253 u64 logical)
4254{
4255 struct extent_map *em;
4256 struct map_lookup *map;
4257 struct extent_map_tree *em_tree = &map_tree->map_tree;
4258 unsigned long len = root->sectorsize;
4259
4260 read_lock(&em_tree->lock);
4261 em = lookup_extent_mapping(em_tree, logical, len);
4262 read_unlock(&em_tree->lock);
4263 BUG_ON(!em);
4264
4265 BUG_ON(em->start > logical || em->start + em->len < logical);
4266 map = (struct map_lookup *)em->bdev;
4267 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4268 BTRFS_BLOCK_GROUP_RAID6)) {
4269 len = map->stripe_len * nr_data_stripes(map);
4270 }
4271 free_extent_map(em);
4272 return len;
4273}
4274
4275int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
4276 u64 logical, u64 len, int mirror_num)
4277{
4278 struct extent_map *em;
4279 struct map_lookup *map;
4280 struct extent_map_tree *em_tree = &map_tree->map_tree;
4281 int ret = 0;
4282
4283 read_lock(&em_tree->lock);
4284 em = lookup_extent_mapping(em_tree, logical, len);
4285 read_unlock(&em_tree->lock);
4286 BUG_ON(!em);
4287
4288 BUG_ON(em->start > logical || em->start + em->len < logical);
4289 map = (struct map_lookup *)em->bdev;
4290 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4291 BTRFS_BLOCK_GROUP_RAID6))
4292 ret = 1;
4293 free_extent_map(em);
4294 return ret;
4295}
4296
4029static int find_live_mirror(struct btrfs_fs_info *fs_info, 4297static int find_live_mirror(struct btrfs_fs_info *fs_info,
4030 struct map_lookup *map, int first, int num, 4298 struct map_lookup *map, int first, int num,
4031 int optimal, int dev_replace_is_ongoing) 4299 int optimal, int dev_replace_is_ongoing)
@@ -4063,10 +4331,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
4063 return optimal; 4331 return optimal;
4064} 4332}
4065 4333
4334static inline int parity_smaller(u64 a, u64 b)
4335{
4336 return a > b;
4337}
4338
4339/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
4340static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4341{
4342 struct btrfs_bio_stripe s;
4343 int i;
4344 u64 l;
4345 int again = 1;
4346
4347 while (again) {
4348 again = 0;
4349 for (i = 0; i < bbio->num_stripes - 1; i++) {
4350 if (parity_smaller(raid_map[i], raid_map[i+1])) {
4351 s = bbio->stripes[i];
4352 l = raid_map[i];
4353 bbio->stripes[i] = bbio->stripes[i+1];
4354 raid_map[i] = raid_map[i+1];
4355 bbio->stripes[i+1] = s;
4356 raid_map[i+1] = l;
4357 again = 1;
4358 }
4359 }
4360 }
4361}
4362
4066static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4363static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4067 u64 logical, u64 *length, 4364 u64 logical, u64 *length,
4068 struct btrfs_bio **bbio_ret, 4365 struct btrfs_bio **bbio_ret,
4069 int mirror_num) 4366 int mirror_num, u64 **raid_map_ret)
4070{ 4367{
4071 struct extent_map *em; 4368 struct extent_map *em;
4072 struct map_lookup *map; 4369 struct map_lookup *map;
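[Editor's sketch, not part of the patch: sort_parity_stripes() is a plain bubble sort keyed on raid_map, and since the P/Q markers use the two largest u64 values the parity and syndrome stripes always sink to the end. A standalone model with a fabricated five-stripe raid6 set (three data stripes plus P and Q):]

#include <stdio.h>
#include <stdint.h>

#define P_STRIPE ((uint64_t)-2)     /* stand-in for RAID5_P_STRIPE */
#define Q_STRIPE ((uint64_t)-1)     /* stand-in for RAID6_Q_STRIPE */

/* order the stripes by their raid_map value, keeping both arrays in step */
static void sort_by_raid_map(uint64_t *raid_map, int *stripe_dev, int n)
{
    int again = 1;

    while (again) {
        again = 0;
        for (int i = 0; i < n - 1; i++) {
            if (raid_map[i] > raid_map[i + 1]) {
                uint64_t l = raid_map[i];
                int d = stripe_dev[i];

                raid_map[i] = raid_map[i + 1];
                stripe_dev[i] = stripe_dev[i + 1];
                raid_map[i + 1] = l;
                stripe_dev[i + 1] = d;
                again = 1;
            }
        }
    }
}

int main(void)
{
    /* a rotated stripe-set: the parity happens to sit on devices 0 and 1 */
    uint64_t raid_map[5] = { P_STRIPE, Q_STRIPE, 131072, 196608, 262144 };
    int stripe_dev[5]    = { 0, 1, 2, 3, 4 };

    sort_by_raid_map(raid_map, stripe_dev, 5);
    for (int i = 0; i < 5; i++)     /* data first (devs 2,3,4), then P, then Q */
        printf("dev %d -> %llu\n", stripe_dev[i],
               (unsigned long long)raid_map[i]);
    return 0;
}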
@@ -4078,6 +4375,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4078 u64 stripe_nr; 4375 u64 stripe_nr;
4079 u64 stripe_nr_orig; 4376 u64 stripe_nr_orig;
4080 u64 stripe_nr_end; 4377 u64 stripe_nr_end;
4378 u64 stripe_len;
4379 u64 *raid_map = NULL;
4081 int stripe_index; 4380 int stripe_index;
4082 int i; 4381 int i;
4083 int ret = 0; 4382 int ret = 0;
@@ -4089,6 +4388,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4089 int num_alloc_stripes; 4388 int num_alloc_stripes;
4090 int patch_the_first_stripe_for_dev_replace = 0; 4389 int patch_the_first_stripe_for_dev_replace = 0;
4091 u64 physical_to_patch_in_first_stripe = 0; 4390 u64 physical_to_patch_in_first_stripe = 0;
4391 u64 raid56_full_stripe_start = (u64)-1;
4092 4392
4093 read_lock(&em_tree->lock); 4393 read_lock(&em_tree->lock);
4094 em = lookup_extent_mapping(em_tree, logical, *length); 4394 em = lookup_extent_mapping(em_tree, logical, *length);
@@ -4105,29 +4405,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4105 map = (struct map_lookup *)em->bdev; 4405 map = (struct map_lookup *)em->bdev;
4106 offset = logical - em->start; 4406 offset = logical - em->start;
4107 4407
4408 if (mirror_num > map->num_stripes)
4409 mirror_num = 0;
4410
4411 stripe_len = map->stripe_len;
4108 stripe_nr = offset; 4412 stripe_nr = offset;
4109 /* 4413 /*
4110 * stripe_nr counts the total number of stripes we have to stride 4414 * stripe_nr counts the total number of stripes we have to stride
4111 * to get to this block 4415 * to get to this block
4112 */ 4416 */
4113 do_div(stripe_nr, map->stripe_len); 4417 do_div(stripe_nr, stripe_len);
4114 4418
4115 stripe_offset = stripe_nr * map->stripe_len; 4419 stripe_offset = stripe_nr * stripe_len;
4116 BUG_ON(offset < stripe_offset); 4420 BUG_ON(offset < stripe_offset);
4117 4421
4118 /* stripe_offset is the offset of this block in its stripe*/ 4422 /* stripe_offset is the offset of this block in its stripe*/
4119 stripe_offset = offset - stripe_offset; 4423 stripe_offset = offset - stripe_offset;
4120 4424
4121 if (rw & REQ_DISCARD) 4425 /* if we're here for raid56, we need to know the stripe aligned start */
4426 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4427 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
4428 raid56_full_stripe_start = offset;
4429
4430 /* allow a write of a full stripe, but make sure we don't
4431 * allow straddling of stripes
4432 */
4433 do_div(raid56_full_stripe_start, full_stripe_len);
4434 raid56_full_stripe_start *= full_stripe_len;
4435 }
4436
4437 if (rw & REQ_DISCARD) {
4438 /* we don't discard raid56 yet */
4439 if (map->type &
4440 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4441 ret = -EOPNOTSUPP;
4442 goto out;
4443 }
4122 *length = min_t(u64, em->len - offset, *length); 4444 *length = min_t(u64, em->len - offset, *length);
4123 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 4445 } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
4124 /* we limit the length of each bio to what fits in a stripe */ 4446 u64 max_len;
4125 *length = min_t(u64, em->len - offset, 4447 /* For writes to RAID[56], allow a full stripeset across all disks.
4126 map->stripe_len - stripe_offset); 4448 For other RAID types and for RAID[56] reads, just allow a single
4449 stripe (on a single disk). */
4450 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
4451 (rw & REQ_WRITE)) {
4452 max_len = stripe_len * nr_data_stripes(map) -
4453 (offset - raid56_full_stripe_start);
4454 } else {
4455 /* we limit the length of each bio to what fits in a stripe */
4456 max_len = stripe_len - stripe_offset;
4457 }
4458 *length = min_t(u64, em->len - offset, max_len);
4127 } else { 4459 } else {
4128 *length = em->len - offset; 4460 *length = em->len - offset;
4129 } 4461 }
4130 4462
4463 /* This is for when we're called from btrfs_merge_bio_hook() and all
4464 it cares about is the length */
4131 if (!bbio_ret) 4465 if (!bbio_ret)
4132 goto out; 4466 goto out;
4133 4467
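[Editor's worked example, not part of the patch: raid56_full_stripe_start rounds the chunk-relative offset down to a multiple of stripe_len * nr_data_stripes, and a raid56 write is then allowed to run to the end of that full stripe. Assumed numbers below.]

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t stripe_len = 64 * 1024;        /* the raid5/6 default stripe length above */
    uint64_t nr_data_stripes = 4;           /* e.g. six-device raid6 */
    uint64_t full_stripe_len = stripe_len * nr_data_stripes;    /* 256K */

    uint64_t offset = 300 * 1024;           /* assumed I/O offset inside the chunk */

    /* round down to the start of the full stripe this offset lives in */
    uint64_t full_stripe_start = (offset / full_stripe_len) * full_stripe_len;

    /* for a raid56 write, the bio may cover the rest of the full stripe */
    uint64_t max_len = full_stripe_len - (offset - full_stripe_start);

    printf("full_stripe_start=%llu max_len=%llu\n",      /* 262144 and 217088 */
           (unsigned long long)full_stripe_start,
           (unsigned long long)max_len);
    return 0;
}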
@@ -4160,7 +4494,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4160 u64 physical_of_found = 0; 4494 u64 physical_of_found = 0;
4161 4495
4162 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 4496 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4163 logical, &tmp_length, &tmp_bbio, 0); 4497 logical, &tmp_length, &tmp_bbio, 0, NULL);
4164 if (ret) { 4498 if (ret) {
4165 WARN_ON(tmp_bbio != NULL); 4499 WARN_ON(tmp_bbio != NULL);
4166 goto out; 4500 goto out;
@@ -4221,11 +4555,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4221 num_stripes = 1; 4555 num_stripes = 1;
4222 stripe_index = 0; 4556 stripe_index = 0;
4223 stripe_nr_orig = stripe_nr; 4557 stripe_nr_orig = stripe_nr;
4224 stripe_nr_end = (offset + *length + map->stripe_len - 1) & 4558 stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
4225 (~(map->stripe_len - 1));
4226 do_div(stripe_nr_end, map->stripe_len); 4559 do_div(stripe_nr_end, map->stripe_len);
4227 stripe_end_offset = stripe_nr_end * map->stripe_len - 4560 stripe_end_offset = stripe_nr_end * map->stripe_len -
4228 (offset + *length); 4561 (offset + *length);
4562
4229 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4563 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4230 if (rw & REQ_DISCARD) 4564 if (rw & REQ_DISCARD)
4231 num_stripes = min_t(u64, map->num_stripes, 4565 num_stripes = min_t(u64, map->num_stripes,
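
The ALIGN() rewrite in the discard path above is behaviour-preserving: for the power-of-two stripe_len btrfs uses, ALIGN(x, a) performs the same round-up as the open-coded mask it replaces, and stripe_nr_end ends up as the index of the first stripe past the discard range. A quick stand-alone check, with an invented helper macro and example values:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* round x up to a multiple of the power-of-two a */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
        uint64_t stripe_len = 64 * 1024;
        uint64_t offset = 10 * 1024, length = 200 * 1024;

        uint64_t end = ALIGN_UP(offset + length, stripe_len);
        uint64_t open_coded = (offset + length + stripe_len - 1) &
                              ~(stripe_len - 1);
        assert(end == open_coded);

        uint64_t stripe_nr_end = end / stripe_len;  /* first stripe past the range */
        uint64_t stripe_end_offset = stripe_nr_end * stripe_len -
                                     (offset + length);

        printf("stripe_nr_end=%llu, trailing slack=%llu bytes\n",
               (unsigned long long)stripe_nr_end,
               (unsigned long long)stripe_end_offset);
        return 0;
}
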
@@ -4276,6 +4610,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4276 dev_replace_is_ongoing); 4610 dev_replace_is_ongoing);
4277 mirror_num = stripe_index - old_stripe_index + 1; 4611 mirror_num = stripe_index - old_stripe_index + 1;
4278 } 4612 }
4613
4614 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4615 BTRFS_BLOCK_GROUP_RAID6)) {
4616 u64 tmp;
4617
4618 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
4619 && raid_map_ret) {
4620 int i, rot;
4621
4622 /* push stripe_nr back to the start of the full stripe */
4623 stripe_nr = raid56_full_stripe_start;
4624 do_div(stripe_nr, stripe_len);
4625
4626 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4627
4628 /* RAID[56] write or recovery. Return all stripes */
4629 num_stripes = map->num_stripes;
4630 max_errors = nr_parity_stripes(map);
4631
4632 raid_map = kmalloc(sizeof(u64) * num_stripes,
4633 GFP_NOFS);
4634 if (!raid_map) {
4635 ret = -ENOMEM;
4636 goto out;
4637 }
4638
4639 /* Work out the disk rotation on this stripe-set */
4640 tmp = stripe_nr;
4641 rot = do_div(tmp, num_stripes);
4642
4643 /* Fill in the logical address of each stripe */
4644 tmp = stripe_nr * nr_data_stripes(map);
4645 for (i = 0; i < nr_data_stripes(map); i++)
4646 raid_map[(i+rot) % num_stripes] =
4647 em->start + (tmp + i) * map->stripe_len;
4648
4649 raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
4650 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4651 raid_map[(i+rot+1) % num_stripes] =
4652 RAID6_Q_STRIPE;
4653
4654 *length = map->stripe_len;
4655 stripe_index = 0;
4656 stripe_offset = 0;
4657 } else {
4658 /*
4659 * Mirror #0 or #1 means the original data block.
4660 * Mirror #2 is RAID5 parity block.
4661 * Mirror #3 is RAID6 Q block.
4662 */
4663 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4664 if (mirror_num > 1)
4665 stripe_index = nr_data_stripes(map) +
4666 mirror_num - 2;
4667
4668 /* We distribute the parity blocks across stripes */
4669 tmp = stripe_nr + stripe_index;
4670 stripe_index = do_div(tmp, map->num_stripes);
4671 }
4279 } else { 4672 } else {
4280 /* 4673 /*
4281 * after this do_div call, stripe_nr is the number of stripes 4674 * after this do_div call, stripe_nr is the number of stripes
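
For RAID5/6 writes and recovery, the raid_map built above records, per on-disk stripe slot, either the logical address that slot serves or a P/Q marker, with the parity position rotating by one slot per stripe set. A self-contained sketch of that placement; the marker values and the 4-disk RAID6 layout are stand-ins, and only the modular arithmetic mirrors the hunk above:

#include <stdint.h>
#include <stdio.h>

#define P_STRIPE ((uint64_t)-2)  /* stand-ins for RAID5_P_STRIPE / RAID6_Q_STRIPE */
#define Q_STRIPE ((uint64_t)-1)

int main(void)
{
        const int num_stripes = 4;            /* 4 disks: 2 data + P + Q */
        const int nr_data = 2;
        const uint64_t stripe_len = 64 * 1024;
        const uint64_t chunk_start = 1 << 20; /* plays the role of em->start */
        uint64_t stripe_nr;

        for (stripe_nr = 0; stripe_nr < 4; stripe_nr++) {
                uint64_t raid_map[4];
                int rot = stripe_nr % num_stripes;   /* disk rotation of this set */
                uint64_t base = stripe_nr * nr_data; /* first data stripe number */
                int i;

                for (i = 0; i < nr_data; i++)
                        raid_map[(i + rot) % num_stripes] =
                                chunk_start + (base + i) * stripe_len;
                raid_map[(i + rot) % num_stripes] = P_STRIPE;
                raid_map[(i + rot + 1) % num_stripes] = Q_STRIPE;

                printf("stripe set %llu:", (unsigned long long)stripe_nr);
                for (i = 0; i < num_stripes; i++) {
                        if (raid_map[i] == P_STRIPE)
                                printf("  P");
                        else if (raid_map[i] == Q_STRIPE)
                                printf("  Q");
                        else
                                printf("  %llu", (unsigned long long)raid_map[i]);
                }
                printf("\n");
        }
        return 0;
}

Running it prints four consecutive stripe sets with the P and Q slots stepping one disk to the right each time, which is the same rotation the read-side branch (mirror_num 2 and 3) uses to locate the parity blocks.
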
@@ -4384,8 +4777,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4384 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 4777 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
4385 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4778 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4386 BTRFS_BLOCK_GROUP_RAID10 | 4779 BTRFS_BLOCK_GROUP_RAID10 |
4780 BTRFS_BLOCK_GROUP_RAID5 |
4387 BTRFS_BLOCK_GROUP_DUP)) { 4781 BTRFS_BLOCK_GROUP_DUP)) {
4388 max_errors = 1; 4782 max_errors = 1;
4783 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4784 max_errors = 2;
4389 } 4785 }
4390 } 4786 }
4391 4787
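
The new tolerance values above feed the check in btrfs_end_bio() further down: a write is only failed once more stripes error out than the profile can absorb. A tiny sketch of that policy, with invented flag constants standing in for the block-group type bits:

#include <stdio.h>

/* invented flag values; only the tolerance policy mirrors the hunk above */
#define RAID1  (1 << 0)
#define RAID10 (1 << 1)
#define RAID5  (1 << 2)
#define RAID6  (1 << 3)
#define DUP    (1 << 4)

static int max_errors_for(unsigned type)
{
        if (type & (RAID1 | RAID10 | RAID5 | DUP))
                return 1;       /* one lost copy, or one parity stripe */
        if (type & RAID6)
                return 2;       /* P and Q allow two failures */
        return 0;               /* single / RAID0: no redundancy */
}

int main(void)
{
        int errors = 2;         /* stripes that failed for this write */
        unsigned type = RAID6;

        /* same decision btrfs_end_bio() makes against bbio->max_errors */
        printf("%s\n", errors > max_errors_for(type) ? "-EIO" : "ok");
        return 0;
}
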
@@ -4486,6 +4882,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4486 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 4882 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4487 bbio->mirror_num = map->num_stripes + 1; 4883 bbio->mirror_num = map->num_stripes + 1;
4488 } 4884 }
4885 if (raid_map) {
4886 sort_parity_stripes(bbio, raid_map);
4887 *raid_map_ret = raid_map;
4888 }
4489out: 4889out:
4490 if (dev_replace_is_ongoing) 4890 if (dev_replace_is_ongoing)
4491 btrfs_dev_replace_unlock(dev_replace); 4891 btrfs_dev_replace_unlock(dev_replace);
@@ -4498,7 +4898,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4498 struct btrfs_bio **bbio_ret, int mirror_num) 4898 struct btrfs_bio **bbio_ret, int mirror_num)
4499{ 4899{
4500 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 4900 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
4501 mirror_num); 4901 mirror_num, NULL);
4502} 4902}
4503 4903
4504int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 4904int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -4512,6 +4912,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4512 u64 bytenr; 4912 u64 bytenr;
4513 u64 length; 4913 u64 length;
4514 u64 stripe_nr; 4914 u64 stripe_nr;
4915 u64 rmap_len;
4515 int i, j, nr = 0; 4916 int i, j, nr = 0;
4516 4917
4517 read_lock(&em_tree->lock); 4918 read_lock(&em_tree->lock);
@@ -4522,10 +4923,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4522 map = (struct map_lookup *)em->bdev; 4923 map = (struct map_lookup *)em->bdev;
4523 4924
4524 length = em->len; 4925 length = em->len;
4926 rmap_len = map->stripe_len;
4927
4525 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4928 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4526 do_div(length, map->num_stripes / map->sub_stripes); 4929 do_div(length, map->num_stripes / map->sub_stripes);
4527 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 4930 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
4528 do_div(length, map->num_stripes); 4931 do_div(length, map->num_stripes);
4932 else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4933 BTRFS_BLOCK_GROUP_RAID6)) {
4934 do_div(length, nr_data_stripes(map));
4935 rmap_len = map->stripe_len * nr_data_stripes(map);
4936 }
4529 4937
4530 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 4938 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
4531 BUG_ON(!buf); /* -ENOMEM */ 4939 BUG_ON(!buf); /* -ENOMEM */
@@ -4545,8 +4953,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4545 do_div(stripe_nr, map->sub_stripes); 4953 do_div(stripe_nr, map->sub_stripes);
4546 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4954 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4547 stripe_nr = stripe_nr * map->num_stripes + i; 4955 stripe_nr = stripe_nr * map->num_stripes + i;
4548 } 4956 } /* else if RAID[56], multiply by nr_data_stripes().
4549 bytenr = chunk_start + stripe_nr * map->stripe_len; 4957 * Alternatively, just use rmap_len below instead of
4958 * map->stripe_len */
4959
4960 bytenr = chunk_start + stripe_nr * rmap_len;
4550 WARN_ON(nr >= map->num_stripes); 4961 WARN_ON(nr >= map->num_stripes);
4551 for (j = 0; j < nr; j++) { 4962 for (j = 0; j < nr; j++) {
4552 if (buf[j] == bytenr) 4963 if (buf[j] == bytenr)
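
For RAID5/6, btrfs_rmap_block() above now advances the reverse map in units of a full data stripe set: the chunk's per-device extent is the logical length divided by the number of data stripes, and rmap_len widens from one stripe to stripe_len * nr_data_stripes. A toy calculation with invented numbers, following only the divisions and the bytenr formula from the hunks above:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t stripe_len = 64 * 1024;
        uint64_t chunk_start = 1 << 20;  /* logical start of the chunk */
        uint64_t chunk_len = 6 << 20;    /* em->len */
        int nr_data = 3;                 /* 4-disk RAID5: 3 data + 1 parity */

        /* per-device slice of the chunk and the reverse-map granularity */
        uint64_t per_dev_len = chunk_len / nr_data;
        uint64_t rmap_len = stripe_len * nr_data;

        /* a physical offset inside one device's slice of the chunk ... */
        uint64_t physical_off = 5 * stripe_len;
        uint64_t stripe_nr = physical_off / stripe_len;

        /* ... maps back to the logical start of its full data stripe set */
        uint64_t bytenr = chunk_start + stripe_nr * rmap_len;

        printf("per-device length %llu, stripe %llu -> logical %llu (step %llu)\n",
               (unsigned long long)per_dev_len,
               (unsigned long long)stripe_nr,
               (unsigned long long)bytenr,
               (unsigned long long)rmap_len);
        return 0;
}
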
@@ -4560,7 +4971,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4560 4971
4561 *logical = buf; 4972 *logical = buf;
4562 *naddrs = nr; 4973 *naddrs = nr;
4563 *stripe_len = map->stripe_len; 4974 *stripe_len = rmap_len;
4564 4975
4565 free_extent_map(em); 4976 free_extent_map(em);
4566 return 0; 4977 return 0;
@@ -4634,7 +5045,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
4634 bio->bi_bdev = (struct block_device *) 5045 bio->bi_bdev = (struct block_device *)
4635 (unsigned long)bbio->mirror_num; 5046 (unsigned long)bbio->mirror_num;
4636 /* only send an error to the higher layers if it is 5047 /* only send an error to the higher layers if it is
4637 * beyond the tolerance of the multi-bio 5048 * beyond the tolerance of the btrfs bio
4638 */ 5049 */
4639 if (atomic_read(&bbio->error) > bbio->max_errors) { 5050 if (atomic_read(&bbio->error) > bbio->max_errors) {
4640 err = -EIO; 5051 err = -EIO;
@@ -4668,13 +5079,18 @@ struct async_sched {
4668 * This will add one bio to the pending list for a device and make sure 5079 * This will add one bio to the pending list for a device and make sure
4669 * the work struct is scheduled. 5080 * the work struct is scheduled.
4670 */ 5081 */
4671static noinline void schedule_bio(struct btrfs_root *root, 5082noinline void btrfs_schedule_bio(struct btrfs_root *root,
4672 struct btrfs_device *device, 5083 struct btrfs_device *device,
4673 int rw, struct bio *bio) 5084 int rw, struct bio *bio)
4674{ 5085{
4675 int should_queue = 1; 5086 int should_queue = 1;
4676 struct btrfs_pending_bios *pending_bios; 5087 struct btrfs_pending_bios *pending_bios;
4677 5088
5089 if (device->missing || !device->bdev) {
5090 bio_endio(bio, -EIO);
5091 return;
5092 }
5093
4678 /* don't bother with additional async steps for reads, right now */ 5094 /* don't bother with additional async steps for reads, right now */
4679 if (!(rw & REQ_WRITE)) { 5095 if (!(rw & REQ_WRITE)) {
4680 bio_get(bio); 5096 bio_get(bio);
@@ -4772,7 +5188,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4772#endif 5188#endif
4773 bio->bi_bdev = dev->bdev; 5189 bio->bi_bdev = dev->bdev;
4774 if (async) 5190 if (async)
4775 schedule_bio(root, dev, rw, bio); 5191 btrfs_schedule_bio(root, dev, rw, bio);
4776 else 5192 else
4777 btrfsic_submit_bio(rw, bio); 5193 btrfsic_submit_bio(rw, bio);
4778} 5194}
@@ -4831,6 +5247,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4831 u64 logical = (u64)bio->bi_sector << 9; 5247 u64 logical = (u64)bio->bi_sector << 9;
4832 u64 length = 0; 5248 u64 length = 0;
4833 u64 map_length; 5249 u64 map_length;
5250 u64 *raid_map = NULL;
4834 int ret; 5251 int ret;
4835 int dev_nr = 0; 5252 int dev_nr = 0;
4836 int total_devs = 1; 5253 int total_devs = 1;
@@ -4839,12 +5256,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4839 length = bio->bi_size; 5256 length = bio->bi_size;
4840 map_length = length; 5257 map_length = length;
4841 5258
4842 ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5259 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
4843 mirror_num); 5260 mirror_num, &raid_map);
4844 if (ret) 5261 if (ret) /* -ENOMEM */
4845 return ret; 5262 return ret;
4846 5263
4847 total_devs = bbio->num_stripes; 5264 total_devs = bbio->num_stripes;
5265 bbio->orig_bio = first_bio;
5266 bbio->private = first_bio->bi_private;
5267 bbio->end_io = first_bio->bi_end_io;
5268 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5269
5270 if (raid_map) {
5271 /* In this case, map_length has been set to the length of
5272 a single stripe; not the whole write */
5273 if (rw & WRITE) {
5274 return raid56_parity_write(root, bio, bbio,
5275 raid_map, map_length);
5276 } else {
5277 return raid56_parity_recover(root, bio, bbio,
5278 raid_map, map_length,
5279 mirror_num);
5280 }
5281 }
5282
4848 if (map_length < length) { 5283 if (map_length < length) {
4849 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " 5284 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
4850 "len %llu\n", (unsigned long long)logical, 5285 "len %llu\n", (unsigned long long)logical,
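
The early return added above matters because, for RAID5/6, __btrfs_map_block() reports map_length as a single stripe rather than the whole request, so the "mapping failed" length check below must never fire for raid_map bios; the entire bio is handed to raid56_parity_write()/raid56_parity_recover() instead. A minimal sketch of that routing decision; the struct and numbers are invented, and only the control flow follows btrfs_map_bio():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* invented, simplified stand-in for the mapping result */
struct mapping {
        uint64_t map_length;    /* for RAID5/6: one stripe, not the whole bio */
        bool have_raid_map;
        bool is_write;
};

static const char *route(const struct mapping *m, uint64_t bio_len)
{
        if (m->have_raid_map)
                return m->is_write ? "raid56_parity_write()"
                                   : "raid56_parity_recover()";
        if (m->map_length < bio_len)
                return "bug: bio straddles stripes and was not split earlier";
        return "submit one copy of the bio per stripe";
}

int main(void)
{
        struct mapping m = {
                .map_length = 64 * 1024,
                .have_raid_map = true,
                .is_write = true,
        };

        /* a 128K full-stripe write: map_length covers only one stripe, but
         * the raid56 path consumes the whole bio, so the check is skipped */
        printf("%s\n", route(&m, 128 * 1024));
        return 0;
}
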
@@ -4853,11 +5288,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4853 BUG(); 5288 BUG();
4854 } 5289 }
4855 5290
4856 bbio->orig_bio = first_bio;
4857 bbio->private = first_bio->bi_private;
4858 bbio->end_io = first_bio->bi_end_io;
4859 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4860
4861 while (dev_nr < total_devs) { 5291 while (dev_nr < total_devs) {
4862 dev = bbio->stripes[dev_nr].dev; 5292 dev = bbio->stripes[dev_nr].dev;
4863 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 5293 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d3c3939ac751..062d8604d35b 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -21,8 +21,8 @@
21 21
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h> 23#include <linux/sort.h>
24#include <linux/btrfs.h>
24#include "async-thread.h" 25#include "async-thread.h"
25#include "ioctl.h"
26 26
27#define BTRFS_STRIPE_LEN (64 * 1024) 27#define BTRFS_STRIPE_LEN (64 * 1024)
28 28
@@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
322 struct btrfs_device *tgtdev); 322 struct btrfs_device *tgtdev);
323int btrfs_scratch_superblock(struct btrfs_device *device); 323int btrfs_scratch_superblock(struct btrfs_device *device);
324 324void btrfs_schedule_bio(struct btrfs_root *root,
325 struct btrfs_device *device,
326 int rw, struct bio *bio);
327int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
328 u64 logical, u64 len, int mirror_num);
329unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
330 struct btrfs_mapping_tree *map_tree,
331 u64 logical);
325static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 332static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
326 int index) 333 int index)
327{ 334{