Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Kconfig            |    6
-rw-r--r--  fs/btrfs/Makefile           |    2
-rw-r--r--  fs/btrfs/backref.c          |    5
-rw-r--r--  fs/btrfs/backref.h          |    2
-rw-r--r--  fs/btrfs/btrfs_inode.h      |   20
-rw-r--r--  fs/btrfs/check-integrity.c  |    3
-rw-r--r--  fs/btrfs/compression.c      |    4
-rw-r--r--  fs/btrfs/ctree.c            |   68
-rw-r--r--  fs/btrfs/ctree.h            |  150
-rw-r--r--  fs/btrfs/delayed-inode.c    |  298
-rw-r--r--  fs/btrfs/delayed-inode.h    |    3
-rw-r--r--  fs/btrfs/delayed-ref.c      |   82
-rw-r--r--  fs/btrfs/delayed-ref.h      |   52
-rw-r--r--  fs/btrfs/dev-replace.c      |    6
-rw-r--r--  fs/btrfs/disk-io.c          |  243
-rw-r--r--  fs/btrfs/disk-io.h          |    7
-rw-r--r--  fs/btrfs/export.c           |    4
-rw-r--r--  fs/btrfs/extent-tree.c      |  594
-rw-r--r--  fs/btrfs/extent_io.c        |  138
-rw-r--r--  fs/btrfs/extent_io.h        |    8
-rw-r--r--  fs/btrfs/extent_map.c       |   15
-rw-r--r--  fs/btrfs/extent_map.h       |    1
-rw-r--r--  fs/btrfs/file-item.c        |   71
-rw-r--r--  fs/btrfs/file.c             |  100
-rw-r--r--  fs/btrfs/free-space-cache.c |   82
-rw-r--r--  fs/btrfs/inode.c            | 1189
-rw-r--r--  fs/btrfs/ioctl.c            |  403
-rw-r--r--  fs/btrfs/ioctl.h            |  502
-rw-r--r--  fs/btrfs/locking.c          |    5
-rw-r--r--  fs/btrfs/ordered-data.c     |  111
-rw-r--r--  fs/btrfs/ordered-data.h     |   14
-rw-r--r--  fs/btrfs/print-tree.c       |    1
-rw-r--r--  fs/btrfs/qgroup.c           |   75
-rw-r--r--  fs/btrfs/raid56.c           | 2100
-rw-r--r--  fs/btrfs/raid56.h           |   51
-rw-r--r--  fs/btrfs/relocation.c       |   80
-rw-r--r--  fs/btrfs/scrub.c            |   35
-rw-r--r--  fs/btrfs/send.c             |   59
-rw-r--r--  fs/btrfs/send.h             |    1
-rw-r--r--  fs/btrfs/super.c            |   92
-rw-r--r--  fs/btrfs/sysfs.c            |    1
-rw-r--r--  fs/btrfs/transaction.c      |  261
-rw-r--r--  fs/btrfs/transaction.h      |    8
-rw-r--r--  fs/btrfs/tree-defrag.c      |   19
-rw-r--r--  fs/btrfs/tree-log.c         |  181
-rw-r--r--  fs/btrfs/ulist.c            |    2
-rw-r--r--  fs/btrfs/volumes.c          |  662
-rw-r--r--  fs/btrfs/volumes.h          |   11
48 files changed, 6030 insertions, 1797 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index d33f01c08b60..9a8622a5b867 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -1,11 +1,13 @@
 config BTRFS_FS
-	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
-	depends on EXPERIMENTAL
+	tristate "Btrfs filesystem Unstable disk format"
 	select LIBCRC32C
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
+	select RAID6_PQ
+	select XOR_BLOCKS
+
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7df3e0f0ee51..3932224f99e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 04edf69be875..bd605c87adfd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -352,11 +352,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		err = __resolve_indirect_ref(fs_info, search_commit_root,
 					     time_seq, ref, parents,
 					     extent_item_pos);
-		if (err) {
-			if (ret == 0)
-				ret = err;
+		if (err)
 			continue;
-		}
 
 		/* we put the first parent into the ref at hand */
 		ULIST_ITER_INIT(&uiter);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index d61feca79455..310a7f6d09b1 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -19,7 +19,7 @@
 #ifndef __BTRFS_BACKREF__
 #define __BTRFS_BACKREF__
 
-#include "ioctl.h"
+#include <linux/btrfs.h>
 #include "ulist.h"
 #include "extent_io.h"
 
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2a8c242bc4f5..d9b97d4960e6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -40,6 +40,8 @@
 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
 #define BTRFS_INODE_NEEDS_FULL_SYNC		7
 #define BTRFS_INODE_COPY_EVERYTHING		8
+#define BTRFS_INODE_IN_DELALLOC_LIST		9
+#define BTRFS_INODE_READDIO_NEED_LOCK		10
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -216,4 +218,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	return 0;
 }
 
+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
+{
+	set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
+	smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
+{
+	smp_mb__before_clear_bit();
+	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+		  &BTRFS_I(inode)->runtime_flags);
+}
+
 #endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 11d47bfb62b4..18af6f48781a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -813,8 +813,7 @@ static int btrfsic_process_superblock_dev_mirror(
 			(bh->b_data + (dev_bytenr & 4095));
 
 	if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
-	    strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
-		    sizeof(super_tmp->magic)) ||
+	    super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) ||
 	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
 	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||
 	    btrfs_super_leafsize(super_tmp) != state->metablock_size ||
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 94ab2f80e7e3..15b94089abc4 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -372,7 +372,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		page = compressed_pages[pg_index];
 		page->mapping = inode->i_mapping;
 		if (bio->bi_size)
-			ret = io_tree->ops->merge_bio_hook(page, 0,
+			ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
 							   PAGE_CACHE_SIZE,
 							   bio, 0);
 		else
@@ -655,7 +655,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		page->index = em_start >> PAGE_CACHE_SHIFT;
 
 		if (comp_bio->bi_size)
-			ret = tree->ops->merge_bio_hook(page, 0,
+			ret = tree->ops->merge_bio_hook(READ, page, 0,
 							PAGE_CACHE_SIZE,
 							comp_bio, 0);
 		else
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index eea5da7a2b9a..ecd25a1b4e51 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1138,6 +1138,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 	switch (tm->op) {
 	case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
 		BUG_ON(tm->slot < n);
+		/* Fallthrough */
 	case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
 	case MOD_LOG_KEY_REMOVE:
 		btrfs_set_node_key(eb, &tm->key, tm->slot);
@@ -1222,7 +1223,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 
 	__tree_mod_log_rewind(eb_rewin, time_seq, tm);
 	WARN_ON(btrfs_header_nritems(eb_rewin) >
-	       BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root));
+	       BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
 
 	return eb_rewin;
 }
@@ -1441,7 +1442,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
  */
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, int cache_only, u64 *last_ret,
+		       int start_slot, u64 *last_ret,
 		       struct btrfs_key *progress)
 {
 	struct extent_buffer *cur;
@@ -1461,8 +1462,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	struct btrfs_disk_key disk_key;
 
 	parent_level = btrfs_header_level(parent);
-	if (cache_only && parent_level != 1)
-		return 0;
 
 	WARN_ON(trans->transaction != root->fs_info->running_transaction);
 	WARN_ON(trans->transid != root->fs_info->generation);
@@ -1508,10 +1507,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		else
 			uptodate = 0;
 		if (!cur || !uptodate) {
-			if (cache_only) {
-				free_extent_buffer(cur);
-				continue;
-			}
 			if (!cur) {
 				cur = read_tree_block(root, blocknr,
 						      blocksize, gen);
@@ -4825,8 +4820,8 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 
 /*
  * A helper function to walk down the tree starting at min_key, and looking
- * for nodes or leaves that are either in cache or have a minimum
- * transaction id. This is used by the btree defrag code, and tree logging
+ * for nodes or leaves that are have a minimum transaction id.
+ * This is used by the btree defrag code, and tree logging
  *
  * This does not cow, but it does stuff the starting key it finds back
  * into min_key, so you can call btrfs_search_slot with cow=1 on the
@@ -4847,7 +4842,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  */
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 			 struct btrfs_key *max_key,
-			 struct btrfs_path *path, int cache_only,
+			 struct btrfs_path *path,
 			 u64 min_trans)
 {
 	struct extent_buffer *cur;
@@ -4887,15 +4882,12 @@ again:
 		if (sret && slot > 0)
 			slot--;
 		/*
-		 * check this node pointer against the cache_only and
-		 * min_trans parameters. If it isn't in cache or is too
-		 * old, skip to the next one.
+		 * check this node pointer against the min_trans parameters.
+		 * If it is too old, old, skip to the next one.
 		 */
 		while (slot < nritems) {
 			u64 blockptr;
 			u64 gen;
-			struct extent_buffer *tmp;
-			struct btrfs_disk_key disk_key;
 
 			blockptr = btrfs_node_blockptr(cur, slot);
 			gen = btrfs_node_ptr_generation(cur, slot);
@@ -4903,27 +4895,7 @@ again:
 				slot++;
 				continue;
 			}
-			if (!cache_only)
-				break;
-
-			if (max_key) {
-				btrfs_node_key(cur, &disk_key, slot);
-				if (comp_keys(&disk_key, max_key) >= 0) {
-					ret = 1;
-					goto out;
-				}
-			}
-
-			tmp = btrfs_find_tree_block(root, blockptr,
-					btrfs_level_size(root, level - 1));
-
-			if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
-				free_extent_buffer(tmp);
-				break;
-			}
-			if (tmp)
-				free_extent_buffer(tmp);
-			slot++;
+			break;
 		}
 find_next_key:
 		/*
@@ -4934,7 +4906,7 @@ find_next_key:
 		path->slots[level] = slot;
 		btrfs_set_path_blocking(path);
 		sret = btrfs_find_next_key(root, path, min_key, level,
-					  cache_only, min_trans);
+					  min_trans);
 		if (sret == 0) {
 			btrfs_release_path(path);
 			goto again;
@@ -5399,8 +5371,7 @@ out:
 /*
  * this is similar to btrfs_next_leaf, but does not try to preserve
  * and fixup the path. It looks for and returns the next key in the
- * tree based on the current path and the cache_only and min_trans
- * parameters.
+ * tree based on the current path and the min_trans parameters.
  *
  * 0 is returned if another key is found, < 0 if there are any errors
  * and 1 is returned if there are no higher keys in the tree
@@ -5409,8 +5380,7 @@ out:
  * calling this function.
  */
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-			struct btrfs_key *key, int level,
-			int cache_only, u64 min_trans)
+			struct btrfs_key *key, int level, u64 min_trans)
 {
 	int slot;
 	struct extent_buffer *c;
@@ -5461,22 +5431,8 @@ next:
 		if (level == 0)
 			btrfs_item_key_to_cpu(c, key, slot);
 		else {
-			u64 blockptr = btrfs_node_blockptr(c, slot);
 			u64 gen = btrfs_node_ptr_generation(c, slot);
 
-			if (cache_only) {
-				struct extent_buffer *cur;
-				cur = btrfs_find_tree_block(root, blockptr,
-					    btrfs_level_size(root, level - 1));
-				if (!cur ||
-				    btrfs_buffer_uptodate(cur, gen, 1) <= 0) {
-					slot++;
-					if (cur)
-						free_extent_buffer(cur);
-					goto next;
-				}
-				free_extent_buffer(cur);
-			}
 			if (gen < min_trans) {
 				slot++;
 				goto next;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 547b7b05727f..0d82922179db 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -31,10 +31,10 @@
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
 #include <linux/pagemap.h>
+#include <linux/btrfs.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
-#include "ioctl.h"
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
@@ -46,7 +46,7 @@ extern struct kmem_cache *btrfs_path_cachep;
 extern struct kmem_cache *btrfs_free_space_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_BHRfS_M"
+#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
 
 #define BTRFS_MAX_MIRRORS 3
 
@@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 /* ioprio of readahead is set to idle */
 #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
 
+#define BTRFS_DIRTY_METADATA_THRESH	(32 * 1024 * 1024)
+
 /*
  * The key defines the order in the tree, and so it also defines (optimal)
  * block layout.
@@ -336,7 +338,10 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 /*
  * File system states
  */
+#define BTRFS_FS_STATE_ERROR		0
+#define BTRFS_FS_STATE_REMOUNTING	1
 
+/* Super block flags */
 /* Errors detected */
 #define BTRFS_SUPER_FLAG_ERROR		(1ULL << 2)
 
@@ -502,6 +507,7 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)
 
 #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
@@ -511,6 +517,7 @@ struct btrfs_super_block {
 	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
 	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
 	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
+	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
 	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
 
 /*
@@ -952,8 +959,20 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
 #define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
 #define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5		(1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6		(1 << 8)
 #define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE
-#define BTRFS_NR_RAID_TYPES		5
+
+enum btrfs_raid_types {
+	BTRFS_RAID_RAID10,
+	BTRFS_RAID_RAID1,
+	BTRFS_RAID_DUP,
+	BTRFS_RAID_RAID0,
+	BTRFS_RAID_SINGLE,
+	BTRFS_RAID_RAID5,
+	BTRFS_RAID_RAID6,
+	BTRFS_NR_RAID_TYPES
+};
 
 #define BTRFS_BLOCK_GROUP_TYPE_MASK	(BTRFS_BLOCK_GROUP_DATA |    \
 					 BTRFS_BLOCK_GROUP_SYSTEM |  \
@@ -961,6 +980,8 @@ struct btrfs_dev_replace_item {
 
 #define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \
 					 BTRFS_BLOCK_GROUP_RAID1 |   \
+					 BTRFS_BLOCK_GROUP_RAID5 |   \
+					 BTRFS_BLOCK_GROUP_RAID6 |   \
 					 BTRFS_BLOCK_GROUP_DUP |     \
 					 BTRFS_BLOCK_GROUP_RAID10)
 /*
@@ -1185,6 +1206,10 @@ struct btrfs_block_group_cache {
 	u64 flags;
 	u64 sectorsize;
 	u64 cache_generation;
+
+	/* for raid56, this is a full stripe, without parity */
+	unsigned long full_stripe_len;
+
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
@@ -1225,6 +1250,28 @@ struct seq_list {
 	u64 seq;
 };
 
+enum btrfs_orphan_cleanup_state {
+	ORPHAN_CLEANUP_STARTED	= 1,
+	ORPHAN_CLEANUP_DONE	= 2,
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+	struct list_head hash_list;
+	wait_queue_head_t wait;
+	spinlock_t lock;
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+	struct list_head stripe_cache;
+	spinlock_t cache_lock;
+	int cache_size;
+	struct btrfs_stripe_hash table[];
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
 /* fs_info */
 struct reloc_control;
 struct btrfs_device;
@@ -1250,6 +1297,7 @@ struct btrfs_fs_info {
 
 	/* block group cache stuff */
 	spinlock_t block_group_cache_lock;
+	u64 first_logical_byte;
 	struct rb_root block_group_cache_tree;
 
 	/* keep track of unallocated space */
@@ -1288,7 +1336,23 @@ struct btrfs_fs_info {
 	u64 last_trans_log_full_commit;
 	unsigned long mount_opt;
 	unsigned long compress_type:4;
+	/*
+	 * It is a suggestive number, the read side is safe even it gets a
+	 * wrong number because we will write out the data into a regular
+	 * extent. The write side(mount/remount) is under ->s_umount lock,
+	 * so it is also safe.
+	 */
 	u64 max_inline;
+	/*
+	 * Protected by ->chunk_mutex and sb->s_umount.
+	 *
+	 * The reason that we use two lock to protect it is because only
+	 * remount and mount operations can change it and these two operations
+	 * are under sb->s_umount, but the read side (chunk allocation) can not
+	 * acquire sb->s_umount or the deadlock would happen. So we use two
+	 * locks to protect it. On the write side, we must acquire two locks,
+	 * and on the read side, we just need acquire one of them.
+	 */
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
@@ -1307,6 +1371,13 @@ struct btrfs_fs_info {
 	struct mutex cleaner_mutex;
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
+
+	/* this is used during read/modify/write to make sure
+	 * no two ios are trying to mod the same stripe at the same
+	 * time
+	 */
+	struct btrfs_stripe_hash_table *stripe_hash_table;
+
 	/*
 	 * this protects the ordered operations list only while we are
 	 * processing all of the entries on it. This way we make
@@ -1365,6 +1436,7 @@ struct btrfs_fs_info {
 	 */
 	struct list_head ordered_extents;
 
+	spinlock_t delalloc_lock;
 	/*
 	 * all of the inodes that have delalloc bytes. It is possible for
 	 * this list to be empty even when there is still dirty data=ordered
@@ -1373,13 +1445,6 @@ struct btrfs_fs_info {
 	struct list_head delalloc_inodes;
 
 	/*
-	 * special rename and truncate targets that must be on disk before
-	 * we're allowed to commit. This is basically the ext3 style
-	 * data=ordered list.
-	 */
-	struct list_head ordered_operations;
-
-	/*
 	 * there is a pool of worker threads for checksumming during writes
 	 * and a pool for checksumming after reads. This is because readers
 	 * can run with FS locks held, and the writers may be waiting for
@@ -1395,6 +1460,8 @@ struct btrfs_fs_info {
 	struct btrfs_workers flush_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_meta_workers;
+	struct btrfs_workers endio_raid56_workers;
+	struct btrfs_workers rmw_workers;
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers endio_freespace_worker;
@@ -1423,10 +1490,12 @@ struct btrfs_fs_info {
 
 	u64 total_pinned;
 
-	/* protected by the delalloc lock, used to keep from writing
-	 * metadata until there is a nice batch
-	 */
-	u64 dirty_metadata_bytes;
+	/* used to keep from writing metadata until there is a nice batch */
+	struct percpu_counter dirty_metadata_bytes;
+	struct percpu_counter delalloc_bytes;
+	s32 dirty_metadata_batch;
+	s32 delalloc_batch;
+
 	struct list_head dirty_cowonly_roots;
 
 	struct btrfs_fs_devices *fs_devices;
@@ -1442,9 +1511,6 @@ struct btrfs_fs_info {
 
 	struct reloc_control *reloc_ctl;
 
-	spinlock_t delalloc_lock;
-	u64 delalloc_bytes;
-
 	/* data_alloc_cluster is only used in ssd mode */
 	struct btrfs_free_cluster data_alloc_cluster;
 
@@ -1456,6 +1522,8 @@ struct btrfs_fs_info {
 	struct rb_root defrag_inodes;
 	atomic_t defrag_running;
 
+	/* Used to protect avail_{data, metadata, system}_alloc_bits */
+	seqlock_t profiles_lock;
 	/*
 	 * these three are in extended format (availability of single
 	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1520,7 +1588,7 @@ struct btrfs_fs_info {
 	u64 qgroup_seq;
 
 	/* filesystem state */
-	u64 fs_state;
+	unsigned long fs_state;
 
 	struct btrfs_delayed_root *delayed_root;
 
@@ -1623,6 +1691,9 @@ struct btrfs_root {
 
 	struct list_head root_list;
 
+	spinlock_t log_extents_lock[2];
+	struct list_head logged_list[2];
+
 	spinlock_t orphan_lock;
 	atomic_t orphan_inodes;
 	struct btrfs_block_rsv *orphan_block_rsv;
@@ -1832,6 +1903,7 @@ struct btrfs_ioctl_defrag_range_args {
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt)	((o) & BTRFS_MOUNT_##opt)
 #define btrfs_test_opt(root, opt)	((root)->fs_info->mount_opt & \
 					 BTRFS_MOUNT_##opt)
 /*
@@ -2936,8 +3008,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
 				    u64 bytenr, u64 num_bytes);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
@@ -3035,8 +3106,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 				  struct inode *inode);
 void btrfs_orphan_release_metadata(struct inode *inode);
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending);
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+				     struct btrfs_block_rsv *rsv,
+				     int nitems,
+				     u64 *qgroup_reserved);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+				      struct btrfs_block_rsv *rsv,
+				      u64 qgroup_reserved);
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
@@ -3092,10 +3168,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 			struct btrfs_key *key, int lowest_level,
-			int cache_only, u64 min_trans);
+			u64 min_trans);
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 			 struct btrfs_key *max_key,
-			 struct btrfs_path *path, int cache_only,
+			 struct btrfs_path *path,
 			 u64 min_trans);
 enum btrfs_compare_tree_result {
 	BTRFS_COMPARE_TREE_NEW,
@@ -3148,7 +3224,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
 			       int find_higher, int return_any);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, int cache_only, u64 *last_ret,
+		       int start_slot, u64 *last_ret,
 		       struct btrfs_key *progress);
 void btrfs_release_path(struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
@@ -3459,9 +3535,9 @@ int btrfs_writepages(struct address_space *mapping,
 		      struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *new_root, u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio, unsigned long bio_flags);
-
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+			 size_t size, struct bio *bio,
+			 unsigned long bio_flags);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
@@ -3543,7 +3619,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root, int cache_only);
+			struct btrfs_root *root);
 
 /* sysfs.c */
 int btrfs_init_sysfs(void);
@@ -3620,11 +3696,14 @@ __printf(5, 6)
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
 		   unsigned int line, int errno, const char *fmt, ...);
 
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic(). Otherwise we BUG() here.
+ */
 #define btrfs_panic(fs_info, errno, fmt, args...)			\
 do {									\
-	struct btrfs_fs_info *_i = (fs_info);				\
-	__btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args);	\
-	BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR));	\
+	__btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args);	\
+	BUG();								\
 } while (0)
 
 /* acl.c */
@@ -3745,4 +3824,11 @@ static inline int is_fstree(u64 rootid)
 		return 1;
 	return 0;
 }
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+	return signal_pending(current);
+}
+
+
 #endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 34836036f01b..14fce27b4780 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -22,8 +22,9 @@
 #include "disk-io.h"
 #include "transaction.h"
 
-#define BTRFS_DELAYED_WRITEBACK		400
-#define BTRFS_DELAYED_BACKGROUND	100
+#define BTRFS_DELAYED_WRITEBACK		512
+#define BTRFS_DELAYED_BACKGROUND	128
+#define BTRFS_DELAYED_BATCH		16
 
 static struct kmem_cache *delayed_node_cache;
 
@@ -494,6 +495,15 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
 					BTRFS_DELAYED_DELETION_ITEM);
 }
 
+static void finish_one_item(struct btrfs_delayed_root *delayed_root)
+{
+	int seq = atomic_inc_return(&delayed_root->items_seq);
+	if ((atomic_dec_return(&delayed_root->items) <
+	    BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
+	    waitqueue_active(&delayed_root->wait))
+		wake_up(&delayed_root->wait);
+}
+
 static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
 {
 	struct rb_root *root;
@@ -512,10 +522,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
 
 	rb_erase(&delayed_item->rb_node, root);
 	delayed_item->delayed_node->count--;
-	if (atomic_dec_return(&delayed_root->items) <
-	    BTRFS_DELAYED_BACKGROUND &&
-	    waitqueue_active(&delayed_root->wait))
-		wake_up(&delayed_root->wait);
+
+	finish_one_item(delayed_root);
 }
 
 static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
@@ -875,7 +883,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 				     struct btrfs_delayed_item *delayed_item)
 {
 	struct extent_buffer *leaf;
-	struct btrfs_item *item;
 	char *ptr;
 	int ret;
 
@@ -886,7 +893,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 
 	leaf = path->nodes[0];
 
-	item = btrfs_item_nr(leaf, path->slots[0]);
 	ptr = btrfs_item_ptr(leaf, path->slots[0], char);
 
 	write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
@@ -1058,39 +1064,29 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
 		delayed_node->count--;
 
 		delayed_root = delayed_node->root->fs_info->delayed_root;
-		if (atomic_dec_return(&delayed_root->items) <
-		    BTRFS_DELAYED_BACKGROUND &&
-		    waitqueue_active(&delayed_root->wait))
-			wake_up(&delayed_root->wait);
+		finish_one_item(delayed_root);
 	}
 }
 
-static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 				      struct btrfs_root *root,
 				      struct btrfs_path *path,
 				      struct btrfs_delayed_node *node)
 {
 	struct btrfs_key key;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
 	int ret;
 
-	mutex_lock(&node->mutex);
-	if (!node->inode_dirty) {
-		mutex_unlock(&node->mutex);
-		return 0;
-	}
-
 	key.objectid = node->inode_id;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
+
 	ret = btrfs_lookup_inode(trans, root, path, &key, 1);
 	if (ret > 0) {
 		btrfs_release_path(path);
-		mutex_unlock(&node->mutex);
 		return -ENOENT;
 	} else if (ret < 0) {
-		mutex_unlock(&node->mutex);
 		return ret;
 	}
 
@@ -1105,11 +1101,47 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 
 	btrfs_delayed_inode_release_metadata(root, node);
 	btrfs_release_delayed_inode(node);
-	mutex_unlock(&node->mutex);
 
 	return 0;
 }
 
+static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     struct btrfs_path *path,
+					     struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	mutex_lock(&node->mutex);
+	if (!node->inode_dirty) {
+		mutex_unlock(&node->mutex);
+		return 0;
+	}
+
+	ret = __btrfs_update_delayed_inode(trans, root, path, node);
+	mutex_unlock(&node->mutex);
+	return ret;
+}
+
+static inline int
+__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				   struct btrfs_path *path,
+				   struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_delete_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_update_delayed_inode(trans, node->root, path, node);
+	return ret;
+}
+
 /*
  * Called when committing the transaction.
  * Returns 0 on success.
@@ -1119,7 +1151,6 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root, int nr)
 {
-	struct btrfs_root *curr_root = root;
 	struct btrfs_delayed_root *delayed_root;
 	struct btrfs_delayed_node *curr_node, *prev_node;
 	struct btrfs_path *path;
@@ -1142,15 +1173,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 
 	curr_node = btrfs_first_delayed_node(delayed_root);
 	while (curr_node && (!count || (count && nr--))) {
-		curr_root = curr_node->root;
-		ret = btrfs_insert_delayed_items(trans, path, curr_root,
-						 curr_node);
-		if (!ret)
-			ret = btrfs_delete_delayed_items(trans, path,
-							 curr_root, curr_node);
-		if (!ret)
-			ret = btrfs_update_delayed_inode(trans, curr_root,
-							 path, curr_node);
+		ret = __btrfs_commit_inode_delayed_items(trans, path,
+							 curr_node);
 		if (ret) {
 			btrfs_release_delayed_node(curr_node);
 			curr_node = NULL;
@@ -1183,51 +1207,93 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
 	return __btrfs_run_delayed_items(trans, root, nr);
 }
 
-static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-					      struct btrfs_delayed_node *node)
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				     struct inode *inode)
 {
+	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
 	struct btrfs_path *path;
 	struct btrfs_block_rsv *block_rsv;
 	int ret;
 
+	if (!delayed_node)
+		return 0;
+
+	mutex_lock(&delayed_node->mutex);
+	if (!delayed_node->count) {
+		mutex_unlock(&delayed_node->mutex);
+		btrfs_release_delayed_node(delayed_node);
+		return 0;
+	}
+	mutex_unlock(&delayed_node->mutex);
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 	path->leave_spinning = 1;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
 
-	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
-	if (!ret)
-		ret = btrfs_delete_delayed_items(trans, path, node->root, node);
-	if (!ret)
-		ret = btrfs_update_delayed_inode(trans, node->root, path, node);
-	btrfs_free_path(path);
+	ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
 
+	btrfs_release_delayed_node(delayed_node);
+	btrfs_free_path(path);
 	trans->block_rsv = block_rsv;
+
 	return ret;
 }
 
-int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-				     struct inode *inode)
+int btrfs_commit_inode_delayed_inode(struct inode *inode)
 {
+	struct btrfs_trans_handle *trans;
 	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+	struct btrfs_path *path;
+	struct btrfs_block_rsv *block_rsv;
 	int ret;
 
 	if (!delayed_node)
 		return 0;
 
 	mutex_lock(&delayed_node->mutex);
-	if (!delayed_node->count) {
+	if (!delayed_node->inode_dirty) {
 		mutex_unlock(&delayed_node->mutex);
 		btrfs_release_delayed_node(delayed_node);
 		return 0;
 	}
 	mutex_unlock(&delayed_node->mutex);
 
-	ret = __btrfs_commit_inode_delayed_items(trans, delayed_node);
+	trans = btrfs_join_transaction(delayed_node->root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto trans_out;
+	}
+	path->leave_spinning = 1;
+
+	block_rsv = trans->block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+	mutex_lock(&delayed_node->mutex);
+	if (delayed_node->inode_dirty)
+		ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
+						   path, delayed_node);
+	else
+		ret = 0;
+	mutex_unlock(&delayed_node->mutex);
+
+	btrfs_free_path(path);
+	trans->block_rsv = block_rsv;
+trans_out:
+	btrfs_end_transaction(trans, delayed_node->root);
+	btrfs_btree_balance_dirty(delayed_node->root);
+out:
 	btrfs_release_delayed_node(delayed_node);
+
 	return ret;
 }
 
@@ -1243,48 +1309,49 @@ void btrfs_remove_delayed_node(struct inode *inode)
 	btrfs_release_delayed_node(delayed_node);
 }
 
-struct btrfs_async_delayed_node {
-	struct btrfs_root *root;
-	struct btrfs_delayed_node *delayed_node;
+struct btrfs_async_delayed_work {
+	struct btrfs_delayed_root *delayed_root;
+	int nr;
 	struct btrfs_work work;
 };
 
-static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
+static void btrfs_async_run_delayed_root(struct btrfs_work *work)
 {
-	struct btrfs_async_delayed_node *async_node;
+	struct btrfs_async_delayed_work *async_work;
+	struct btrfs_delayed_root *delayed_root;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_path *path;
 	struct btrfs_delayed_node *delayed_node = NULL;
 	struct btrfs_root *root;
 	struct btrfs_block_rsv *block_rsv;
-	int need_requeue = 0;
-	int ret;
+	int total_done = 0;
 
-	async_node = container_of(work, struct btrfs_async_delayed_node, work);
+	async_work = container_of(work, struct btrfs_async_delayed_work, work);
+	delayed_root = async_work->delayed_root;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		goto out;
-	path->leave_spinning = 1;
 
-	delayed_node = async_node->delayed_node;
+again:
+	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2)
+		goto free_path;
+
+	delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
+	if (!delayed_node)
+		goto free_path;
+
+	path->leave_spinning = 1;
 	root = delayed_node->root;
 
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
-		goto free_path;
+		goto release_path;
 
 	block_rsv = trans->block_rsv;
 	trans->block_rsv = &root->fs_info->delayed_block_rsv;
 
-	ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
-	if (!ret)
-		ret = btrfs_delete_delayed_items(trans, path, root,
-						 delayed_node);
-
-	if (!ret)
-		btrfs_update_delayed_inode(trans, root, path, delayed_node);
-
+	__btrfs_commit_inode_delayed_items(trans, path, delayed_node);
 	/*
 	 * Maybe new delayed items have been inserted, so we need requeue
 	 * the work. Besides that, we must dequeue the empty delayed nodes
@@ -1310,57 +1377,47 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 	 * Task1 will sleep until the transaction is commited.
 	 */
 	mutex_lock(&delayed_node->mutex);
-	if (delayed_node->count)
-		need_requeue = 1;
-	else
-		btrfs_dequeue_delayed_node(root->fs_info->delayed_root,
-					   delayed_node);
+	btrfs_dequeue_delayed_node(root->fs_info->delayed_root, delayed_node);
 	mutex_unlock(&delayed_node->mutex);
 
 	trans->block_rsv = block_rsv;
 	btrfs_end_transaction_dmeta(trans, root);
 	btrfs_btree_balance_dirty_nodelay(root);
+
+release_path:
+	btrfs_release_path(path);
+	total_done++;
+
+	btrfs_release_prepared_delayed_node(delayed_node);
+	if (async_work->nr == 0 || total_done < async_work->nr)
+		goto again;
+
 free_path:
 	btrfs_free_path(path);
 out:
-	if (need_requeue)
-		btrfs_requeue_work(&async_node->work);
-	else {
-		btrfs_release_prepared_delayed_node(delayed_node);
-		kfree(async_node);
-	}
+	wake_up(&delayed_root->wait);
+	kfree(async_work);
 }
 
+
 static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
-				     struct btrfs_root *root, int all)
+				     struct btrfs_root *root, int nr)
 {
-	struct btrfs_async_delayed_node *async_node;
-	struct btrfs_delayed_node *curr;
-	int count = 0;
+	struct btrfs_async_delayed_work *async_work;
 
-again:
-	curr = btrfs_first_prepared_delayed_node(delayed_root);
-	if (!curr)
+	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
 		return 0;
 
-	async_node = kmalloc(sizeof(*async_node), GFP_NOFS);
-	if (!async_node) {
-		btrfs_release_prepared_delayed_node(curr);
+	async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
+	if (!async_work)
 		return -ENOMEM;
-	}
-
-	async_node->root = root;
-	async_node->delayed_node = curr;
 
-	async_node->work.func = btrfs_async_run_delayed_node_done;
-	async_node->work.flags = 0;
-
-	btrfs_queue_worker(&root->fs_info->delayed_workers, &async_node->work);
-	count++;
-
-	if (all || count < 4)
-		goto again;
+	async_work->delayed_root = delayed_root;
+	async_work->work.func = btrfs_async_run_delayed_root;
+	async_work->work.flags = 0;
+	async_work->nr = nr;
 
+	btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work);
 	return 0;
 }
 
@@ -1371,30 +1428,55 @@ void btrfs_assert_delayed_root_empty(struct btrfs_root *root)
 	WARN_ON(btrfs_first_delayed_node(delayed_root));
 }
 
+static int refs_newer(struct btrfs_delayed_root *delayed_root,
+		      int seq, int count)
+{
+	int val = atomic_read(&delayed_root->items_seq);
+
+	if (val < seq || val >= seq + count)
+		return 1;
+	return 0;
+}
+
 void btrfs_balance_delayed_items(struct btrfs_root *root)
 {
 	struct btrfs_delayed_root *delayed_root;
+	int seq;
 
 	delayed_root = btrfs_get_delayed_root(root);
 
 	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
 		return;
 
+	seq = atomic_read(&delayed_root->items_seq);
+
 	if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
 		int ret;
-		ret = btrfs_wq_run_delayed_node(delayed_root, root, 1);
+		DEFINE_WAIT(__wait);
+
+		ret = btrfs_wq_run_delayed_node(delayed_root, root, 0);
 		if (ret)
 			return;
 
-		wait_event_interruptible_timeout(
-				delayed_root->wait,
-				(atomic_read(&delayed_root->items) <
-				 BTRFS_DELAYED_BACKGROUND),
-				HZ);
-		return;
+		while (1) {
+			prepare_to_wait(&delayed_root->wait, &__wait,
+					TASK_INTERRUPTIBLE);
+
+			if (refs_newer(delayed_root, seq,
+				       BTRFS_DELAYED_BATCH) ||
+			    atomic_read(&delayed_root->items) <
+			    BTRFS_DELAYED_BACKGROUND) {
+				break;
+			}
+			if (!signal_pending(current))
+				schedule();
+			else
+				break;
+		}
+		finish_wait(&delayed_root->wait, &__wait);
 	}
 
-	btrfs_wq_run_delayed_node(delayed_root, root, 0);
+	btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH);
 }
 
 /* Will return 0 or -ENOMEM */
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 4f808e1baeed..1d5c5f7abe3e 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -43,6 +43,7 @@ struct btrfs_delayed_root {
 	 */
 	struct list_head prepare_list;
 	atomic_t items;		/* for delayed items */
+	atomic_t items_seq;	/* for delayed items */
 	int nodes;		/* for delayed nodes */
 	wait_queue_head_t wait;
 };
@@ -86,6 +87,7 @@ static inline void btrfs_init_delayed_root(
 		struct btrfs_delayed_root *delayed_root)
 {
 	atomic_set(&delayed_root->items, 0);
+	atomic_set(&delayed_root->items_seq, 0);
 	delayed_root->nodes = 0;
 	spin_lock_init(&delayed_root->lock);
 	init_waitqueue_head(&delayed_root->wait);
@@ -117,6 +119,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
117/* Used for evicting the inode. */ 119/* Used for evicting the inode. */
118void btrfs_remove_delayed_node(struct inode *inode); 120void btrfs_remove_delayed_node(struct inode *inode);
119void btrfs_kill_delayed_inode_items(struct inode *inode); 121void btrfs_kill_delayed_inode_items(struct inode *inode);
122int btrfs_commit_inode_delayed_inode(struct inode *inode);
120 123
121 124
122int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, 125int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ae9411773397..b7a0641ead77 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -23,6 +23,10 @@
23#include "delayed-ref.h" 23#include "delayed-ref.h"
24#include "transaction.h" 24#include "transaction.h"
25 25
26struct kmem_cache *btrfs_delayed_ref_head_cachep;
27struct kmem_cache *btrfs_delayed_tree_ref_cachep;
28struct kmem_cache *btrfs_delayed_data_ref_cachep;
29struct kmem_cache *btrfs_delayed_extent_op_cachep;
26/* 30/*
27 * delayed back reference update tracking. For subvolume trees 31 * delayed back reference update tracking. For subvolume trees
28 * we queue up extent allocations and backref maintenance for 32 * we queue up extent allocations and backref maintenance for
@@ -422,6 +426,14 @@ again:
422 return 1; 426 return 1;
423} 427}
424 428
429void btrfs_release_ref_cluster(struct list_head *cluster)
430{
431 struct list_head *pos, *q;
432
433 list_for_each_safe(pos, q, cluster)
434 list_del_init(pos);
435}
436
425/* 437/*
426 * helper function to update an extent delayed ref in the 438 * helper function to update an extent delayed ref in the
427 * rbtree. existing and update must both have the same 439 * rbtree. existing and update must both have the same
@@ -511,7 +523,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
511 ref->extent_op->flags_to_set; 523 ref->extent_op->flags_to_set;
512 existing_ref->extent_op->update_flags = 1; 524 existing_ref->extent_op->update_flags = 1;
513 } 525 }
514 kfree(ref->extent_op); 526 btrfs_free_delayed_extent_op(ref->extent_op);
515 } 527 }
516 } 528 }
517 /* 529 /*
@@ -592,7 +604,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
592 * we've updated the existing ref, free the newly 604 * we've updated the existing ref, free the newly
593 * allocated ref 605 * allocated ref
594 */ 606 */
595 kfree(head_ref); 607 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
596 } else { 608 } else {
597 delayed_refs->num_heads++; 609 delayed_refs->num_heads++;
598 delayed_refs->num_heads_ready++; 610 delayed_refs->num_heads_ready++;
@@ -653,7 +665,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
653 * we've updated the existing ref, free the newly 665 * we've updated the existing ref, free the newly
654 * allocated ref 666 * allocated ref
655 */ 667 */
656 kfree(full_ref); 668 kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
657 } else { 669 } else {
658 delayed_refs->num_entries++; 670 delayed_refs->num_entries++;
659 trans->delayed_ref_updates++; 671 trans->delayed_ref_updates++;
@@ -714,7 +726,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
714 * we've updated the existing ref, free the newly 726 * we've updated the existing ref, free the newly
715 * allocated ref 727 * allocated ref
716 */ 728 */
717 kfree(full_ref); 729 kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
718 } else { 730 } else {
719 delayed_refs->num_entries++; 731 delayed_refs->num_entries++;
720 trans->delayed_ref_updates++; 732 trans->delayed_ref_updates++;
@@ -738,13 +750,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
738 struct btrfs_delayed_ref_root *delayed_refs; 750 struct btrfs_delayed_ref_root *delayed_refs;
739 751
740 BUG_ON(extent_op && extent_op->is_data); 752 BUG_ON(extent_op && extent_op->is_data);
741 ref = kmalloc(sizeof(*ref), GFP_NOFS); 753 ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
742 if (!ref) 754 if (!ref)
743 return -ENOMEM; 755 return -ENOMEM;
744 756
745 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); 757 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
746 if (!head_ref) { 758 if (!head_ref) {
747 kfree(ref); 759 kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
748 return -ENOMEM; 760 return -ENOMEM;
749 } 761 }
750 762
@@ -786,13 +798,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
786 struct btrfs_delayed_ref_root *delayed_refs; 798 struct btrfs_delayed_ref_root *delayed_refs;
787 799
788 BUG_ON(extent_op && !extent_op->is_data); 800 BUG_ON(extent_op && !extent_op->is_data);
789 ref = kmalloc(sizeof(*ref), GFP_NOFS); 801 ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
790 if (!ref) 802 if (!ref)
791 return -ENOMEM; 803 return -ENOMEM;
792 804
793 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); 805 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
794 if (!head_ref) { 806 if (!head_ref) {
795 kfree(ref); 807 kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
796 return -ENOMEM; 808 return -ENOMEM;
797 } 809 }
798 810
@@ -826,7 +838,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
826 struct btrfs_delayed_ref_head *head_ref; 838 struct btrfs_delayed_ref_head *head_ref;
827 struct btrfs_delayed_ref_root *delayed_refs; 839 struct btrfs_delayed_ref_root *delayed_refs;
828 840
829 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); 841 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
830 if (!head_ref) 842 if (!head_ref)
831 return -ENOMEM; 843 return -ENOMEM;
832 844
@@ -860,3 +872,51 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
860 return btrfs_delayed_node_to_head(ref); 872 return btrfs_delayed_node_to_head(ref);
861 return NULL; 873 return NULL;
862} 874}
875
876void btrfs_delayed_ref_exit(void)
877{
878 if (btrfs_delayed_ref_head_cachep)
879 kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
880 if (btrfs_delayed_tree_ref_cachep)
881 kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
882 if (btrfs_delayed_data_ref_cachep)
883 kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
884 if (btrfs_delayed_extent_op_cachep)
885 kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
886}
887
888int btrfs_delayed_ref_init(void)
889{
890 btrfs_delayed_ref_head_cachep = kmem_cache_create(
891 "btrfs_delayed_ref_head",
892 sizeof(struct btrfs_delayed_ref_head), 0,
893 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
894 if (!btrfs_delayed_ref_head_cachep)
895 goto fail;
896
897 btrfs_delayed_tree_ref_cachep = kmem_cache_create(
898 "btrfs_delayed_tree_ref",
899 sizeof(struct btrfs_delayed_tree_ref), 0,
900 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
901 if (!btrfs_delayed_tree_ref_cachep)
902 goto fail;
903
904 btrfs_delayed_data_ref_cachep = kmem_cache_create(
905 "btrfs_delayed_data_ref",
906 sizeof(struct btrfs_delayed_data_ref), 0,
907 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
908 if (!btrfs_delayed_data_ref_cachep)
909 goto fail;
910
911 btrfs_delayed_extent_op_cachep = kmem_cache_create(
912 "btrfs_delayed_extent_op",
913 sizeof(struct btrfs_delayed_extent_op), 0,
914 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
915 if (!btrfs_delayed_extent_op_cachep)
916 goto fail;
917
918 return 0;
919fail:
920 btrfs_delayed_ref_exit();
921 return -ENOMEM;
922}
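btrfs_delayed_ref_init() and btrfs_delayed_ref_exit() follow the usual "create every cache, tear down whatever exists on failure" pattern and are meant to be called once per module load/unload. The hunk does not show the call site, so the wiring below is only an assumption of how it would typically look from btrfs' module init in super.c; example_init/example_exit are invented names.

static int __init example_init(void)
{
	int err;

	err = btrfs_delayed_ref_init();		/* creates the four caches */
	if (err)
		return err;

	err = register_filesystem(&btrfs_fs_type);
	if (err) {
		/* safe after partial setup: exit() destroys only the
		 * caches that were actually created */
		btrfs_delayed_ref_exit();
		return err;
	}
	return 0;
}

static void __exit example_exit(void)
{
	unregister_filesystem(&btrfs_fs_type);
	btrfs_delayed_ref_exit();
}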
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index c9d703693df0..f75fcaf79aeb 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -132,6 +132,15 @@ struct btrfs_delayed_ref_root {
132 unsigned long num_heads_ready; 132 unsigned long num_heads_ready;
133 133
134 /* 134 /*
135 * bumped when someone is making progress on the delayed
136	 * refs, so that other procs know they are just adding to
137	 * contention instead of helping
138 */
139 atomic_t procs_running_refs;
140 atomic_t ref_seq;
141 wait_queue_head_t wait;
142
143 /*
135 * set when the tree is flushing before a transaction commit, 144 * set when the tree is flushing before a transaction commit,
136 * used by the throttling code to decide if new updates need 145 * used by the throttling code to decide if new updates need
137 * to be run right away 146 * to be run right away
@@ -141,12 +150,47 @@ struct btrfs_delayed_ref_root {
141 u64 run_delayed_start; 150 u64 run_delayed_start;
142}; 151};
143 152
153extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
154extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
155extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
156extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
157
158int btrfs_delayed_ref_init(void);
159void btrfs_delayed_ref_exit(void);
160
161static inline struct btrfs_delayed_extent_op *
162btrfs_alloc_delayed_extent_op(void)
163{
164 return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS);
165}
166
167static inline void
168btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
169{
170 if (op)
171 kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
172}
173
144static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) 174static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
145{ 175{
146 WARN_ON(atomic_read(&ref->refs) == 0); 176 WARN_ON(atomic_read(&ref->refs) == 0);
147 if (atomic_dec_and_test(&ref->refs)) { 177 if (atomic_dec_and_test(&ref->refs)) {
148 WARN_ON(ref->in_tree); 178 WARN_ON(ref->in_tree);
149 kfree(ref); 179 switch (ref->type) {
180 case BTRFS_TREE_BLOCK_REF_KEY:
181 case BTRFS_SHARED_BLOCK_REF_KEY:
182 kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
183 break;
184 case BTRFS_EXTENT_DATA_REF_KEY:
185 case BTRFS_SHARED_DATA_REF_KEY:
186 kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
187 break;
188 case 0:
189 kmem_cache_free(btrfs_delayed_ref_head_cachep, ref);
190 break;
191 default:
192 BUG();
193 }
150 } 194 }
151} 195}
152 196
@@ -176,8 +220,14 @@ struct btrfs_delayed_ref_head *
176btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); 220btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
177int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, 221int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
178 struct btrfs_delayed_ref_head *head); 222 struct btrfs_delayed_ref_head *head);
223static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
224{
225 mutex_unlock(&head->mutex);
226}
227
179int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 228int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
180 struct list_head *cluster, u64 search_start); 229 struct list_head *cluster, u64 search_start);
230void btrfs_release_ref_cluster(struct list_head *cluster);
181 231
182int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, 232int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
183 struct btrfs_delayed_ref_root *delayed_refs, 233 struct btrfs_delayed_ref_root *delayed_refs,
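Because the generic btrfs_delayed_ref_node is embedded in one of three containers, btrfs_put_delayed_ref() above has to pick the matching slab when the last reference drops; a type of 0 marks a ref head. The hypothetical helper below spells out the same mapping (illustration only, not part of the patch):

static struct kmem_cache *
example_cache_for_node(struct btrfs_delayed_ref_node *ref)
{
	switch (ref->type) {
	case BTRFS_TREE_BLOCK_REF_KEY:
	case BTRFS_SHARED_BLOCK_REF_KEY:
		/* embedded in struct btrfs_delayed_tree_ref */
		return btrfs_delayed_tree_ref_cachep;
	case BTRFS_EXTENT_DATA_REF_KEY:
	case BTRFS_SHARED_DATA_REF_KEY:
		/* embedded in struct btrfs_delayed_data_ref */
		return btrfs_delayed_data_ref_cachep;
	case 0:
		/* embedded in struct btrfs_delayed_ref_head */
		return btrfs_delayed_ref_head_cachep;
	default:
		BUG();
		return NULL;	/* unreachable */
	}
}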
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 66dbc8dbddf7..7ba7b3900cb8 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -465,7 +465,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
465 * flush all outstanding I/O and inode extent mappings before the 465 * flush all outstanding I/O and inode extent mappings before the
466 * copy operation is declared as being finished 466 * copy operation is declared as being finished
467 */ 467 */
468 btrfs_start_delalloc_inodes(root, 0); 468 ret = btrfs_start_delalloc_inodes(root, 0);
469 if (ret) {
470 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
471 return ret;
472 }
469 btrfs_wait_ordered_extents(root, 0); 473 btrfs_wait_ordered_extents(root, 0);
470 474
471 trans = btrfs_start_transaction(root, 0); 475 trans = btrfs_start_transaction(root, 0);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a8f652dc940b..7d84651e850b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,7 @@
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h" 48#include "dev-replace.h"
49#include "raid56.h"
49 50
50#ifdef CONFIG_X86 51#ifdef CONFIG_X86
51#include <asm/cpufeature.h> 52#include <asm/cpufeature.h>
@@ -56,11 +57,12 @@ static void end_workqueue_fn(struct btrfs_work *work);
56static void free_fs_root(struct btrfs_root *root); 57static void free_fs_root(struct btrfs_root *root);
57static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, 58static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
58 int read_only); 59 int read_only);
59static void btrfs_destroy_ordered_operations(struct btrfs_root *root); 60static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
61 struct btrfs_root *root);
60static void btrfs_destroy_ordered_extents(struct btrfs_root *root); 62static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
61static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, 63static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
62 struct btrfs_root *root); 64 struct btrfs_root *root);
63static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); 65static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t);
64static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); 66static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
65static int btrfs_destroy_marked_extents(struct btrfs_root *root, 67static int btrfs_destroy_marked_extents(struct btrfs_root *root,
66 struct extent_io_tree *dirty_pages, 68 struct extent_io_tree *dirty_pages,
@@ -420,7 +422,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
420static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) 422static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
421{ 423{
422 struct extent_io_tree *tree; 424 struct extent_io_tree *tree;
423 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 425 u64 start = page_offset(page);
424 u64 found_start; 426 u64 found_start;
425 struct extent_buffer *eb; 427 struct extent_buffer *eb;
426 428
@@ -639,8 +641,15 @@ err:
639 btree_readahead_hook(root, eb, eb->start, ret); 641 btree_readahead_hook(root, eb, eb->start, ret);
640 } 642 }
641 643
642 if (ret) 644 if (ret) {
645 /*
646 * our io error hook is going to dec the io pages
647 * again, we have to make sure it has something
648 * to decrement
649 */
650 atomic_inc(&eb->io_pages);
643 clear_extent_buffer_uptodate(eb); 651 clear_extent_buffer_uptodate(eb);
652 }
644 free_extent_buffer(eb); 653 free_extent_buffer(eb);
645out: 654out:
646 return ret; 655 return ret;
@@ -654,6 +663,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
654 eb = (struct extent_buffer *)page->private; 663 eb = (struct extent_buffer *)page->private;
655 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 664 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
656 eb->read_mirror = failed_mirror; 665 eb->read_mirror = failed_mirror;
666 atomic_dec(&eb->io_pages);
657 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 667 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
658 btree_readahead_hook(root, eb, eb->start, -EIO); 668 btree_readahead_hook(root, eb, eb->start, -EIO);
659 return -EIO; /* we fixed nothing */ 669 return -EIO; /* we fixed nothing */
@@ -670,17 +680,23 @@ static void end_workqueue_bio(struct bio *bio, int err)
670 end_io_wq->work.flags = 0; 680 end_io_wq->work.flags = 0;
671 681
672 if (bio->bi_rw & REQ_WRITE) { 682 if (bio->bi_rw & REQ_WRITE) {
673 if (end_io_wq->metadata == 1) 683 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
674 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 684 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
675 &end_io_wq->work); 685 &end_io_wq->work);
676 else if (end_io_wq->metadata == 2) 686 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
677 btrfs_queue_worker(&fs_info->endio_freespace_worker, 687 btrfs_queue_worker(&fs_info->endio_freespace_worker,
678 &end_io_wq->work); 688 &end_io_wq->work);
689 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
690 btrfs_queue_worker(&fs_info->endio_raid56_workers,
691 &end_io_wq->work);
679 else 692 else
680 btrfs_queue_worker(&fs_info->endio_write_workers, 693 btrfs_queue_worker(&fs_info->endio_write_workers,
681 &end_io_wq->work); 694 &end_io_wq->work);
682 } else { 695 } else {
683 if (end_io_wq->metadata) 696 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
697 btrfs_queue_worker(&fs_info->endio_raid56_workers,
698 &end_io_wq->work);
699 else if (end_io_wq->metadata)
684 btrfs_queue_worker(&fs_info->endio_meta_workers, 700 btrfs_queue_worker(&fs_info->endio_meta_workers,
685 &end_io_wq->work); 701 &end_io_wq->work);
686 else 702 else
@@ -695,6 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
695 * 0 - if data 711 * 0 - if data
696 * 1 - if normal metadata 712 * 1 - if normal metadata
697 * 2 - if writing to the free space cache area 713 * 2 - if writing to the free space cache area
714 * 3 - raid parity work
698 */ 715 */
699int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 716int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
700 int metadata) 717 int metadata)
@@ -946,18 +963,20 @@ static int btree_writepages(struct address_space *mapping,
946 struct writeback_control *wbc) 963 struct writeback_control *wbc)
947{ 964{
948 struct extent_io_tree *tree; 965 struct extent_io_tree *tree;
966 struct btrfs_fs_info *fs_info;
967 int ret;
968
949 tree = &BTRFS_I(mapping->host)->io_tree; 969 tree = &BTRFS_I(mapping->host)->io_tree;
950 if (wbc->sync_mode == WB_SYNC_NONE) { 970 if (wbc->sync_mode == WB_SYNC_NONE) {
951 struct btrfs_root *root = BTRFS_I(mapping->host)->root;
952 u64 num_dirty;
953 unsigned long thresh = 32 * 1024 * 1024;
954 971
955 if (wbc->for_kupdate) 972 if (wbc->for_kupdate)
956 return 0; 973 return 0;
957 974
975 fs_info = BTRFS_I(mapping->host)->root->fs_info;
958 /* this is a bit racy, but that's ok */ 976 /* this is a bit racy, but that's ok */
959 num_dirty = root->fs_info->dirty_metadata_bytes; 977 ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
960 if (num_dirty < thresh) 978 BTRFS_DIRTY_METADATA_THRESH);
979 if (ret < 0)
961 return 0; 980 return 0;
962 } 981 }
963 return btree_write_cache_pages(mapping, wbc); 982 return btree_write_cache_pages(mapping, wbc);
@@ -1125,24 +1144,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
1125void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1144void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1126 struct extent_buffer *buf) 1145 struct extent_buffer *buf)
1127{ 1146{
1147 struct btrfs_fs_info *fs_info = root->fs_info;
1148
1128 if (btrfs_header_generation(buf) == 1149 if (btrfs_header_generation(buf) ==
1129 root->fs_info->running_transaction->transid) { 1150 fs_info->running_transaction->transid) {
1130 btrfs_assert_tree_locked(buf); 1151 btrfs_assert_tree_locked(buf);
1131 1152
1132 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { 1153 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
1133 spin_lock(&root->fs_info->delalloc_lock); 1154 __percpu_counter_add(&fs_info->dirty_metadata_bytes,
1134 if (root->fs_info->dirty_metadata_bytes >= buf->len) 1155 -buf->len,
1135 root->fs_info->dirty_metadata_bytes -= buf->len; 1156 fs_info->dirty_metadata_batch);
1136 else {
1137 spin_unlock(&root->fs_info->delalloc_lock);
1138 btrfs_panic(root->fs_info, -EOVERFLOW,
1139 "Can't clear %lu bytes from "
1140 " dirty_mdatadata_bytes (%llu)",
1141 buf->len,
1142 root->fs_info->dirty_metadata_bytes);
1143 }
1144 spin_unlock(&root->fs_info->delalloc_lock);
1145
1146 /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1157 /* ugh, clear_extent_buffer_dirty needs to lock the page */
1147 btrfs_set_lock_blocking(buf); 1158 btrfs_set_lock_blocking(buf);
1148 clear_extent_buffer_dirty(buf); 1159 clear_extent_buffer_dirty(buf);
@@ -1178,9 +1189,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1178 1189
1179 INIT_LIST_HEAD(&root->dirty_list); 1190 INIT_LIST_HEAD(&root->dirty_list);
1180 INIT_LIST_HEAD(&root->root_list); 1191 INIT_LIST_HEAD(&root->root_list);
1192 INIT_LIST_HEAD(&root->logged_list[0]);
1193 INIT_LIST_HEAD(&root->logged_list[1]);
1181 spin_lock_init(&root->orphan_lock); 1194 spin_lock_init(&root->orphan_lock);
1182 spin_lock_init(&root->inode_lock); 1195 spin_lock_init(&root->inode_lock);
1183 spin_lock_init(&root->accounting_lock); 1196 spin_lock_init(&root->accounting_lock);
1197 spin_lock_init(&root->log_extents_lock[0]);
1198 spin_lock_init(&root->log_extents_lock[1]);
1184 mutex_init(&root->objectid_mutex); 1199 mutex_init(&root->objectid_mutex);
1185 mutex_init(&root->log_mutex); 1200 mutex_init(&root->log_mutex);
1186 init_waitqueue_head(&root->log_writer_wait); 1201 init_waitqueue_head(&root->log_writer_wait);
@@ -2004,10 +2019,24 @@ int open_ctree(struct super_block *sb,
2004 goto fail_srcu; 2019 goto fail_srcu;
2005 } 2020 }
2006 2021
2022 ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
2023 if (ret) {
2024 err = ret;
2025 goto fail_bdi;
2026 }
2027 fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
2028 (1 + ilog2(nr_cpu_ids));
2029
2030 ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
2031 if (ret) {
2032 err = ret;
2033 goto fail_dirty_metadata_bytes;
2034 }
2035
2007 fs_info->btree_inode = new_inode(sb); 2036 fs_info->btree_inode = new_inode(sb);
2008 if (!fs_info->btree_inode) { 2037 if (!fs_info->btree_inode) {
2009 err = -ENOMEM; 2038 err = -ENOMEM;
2010 goto fail_bdi; 2039 goto fail_delalloc_bytes;
2011 } 2040 }
2012 2041
2013 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 2042 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2017,7 +2046,6 @@ int open_ctree(struct super_block *sb,
2017 INIT_LIST_HEAD(&fs_info->dead_roots); 2046 INIT_LIST_HEAD(&fs_info->dead_roots);
2018 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2047 INIT_LIST_HEAD(&fs_info->delayed_iputs);
2019 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 2048 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
2020 INIT_LIST_HEAD(&fs_info->ordered_operations);
2021 INIT_LIST_HEAD(&fs_info->caching_block_groups); 2049 INIT_LIST_HEAD(&fs_info->caching_block_groups);
2022 spin_lock_init(&fs_info->delalloc_lock); 2050 spin_lock_init(&fs_info->delalloc_lock);
2023 spin_lock_init(&fs_info->trans_lock); 2051 spin_lock_init(&fs_info->trans_lock);
@@ -2028,6 +2056,7 @@ int open_ctree(struct super_block *sb,
2028 spin_lock_init(&fs_info->tree_mod_seq_lock); 2056 spin_lock_init(&fs_info->tree_mod_seq_lock);
2029 rwlock_init(&fs_info->tree_mod_log_lock); 2057 rwlock_init(&fs_info->tree_mod_log_lock);
2030 mutex_init(&fs_info->reloc_mutex); 2058 mutex_init(&fs_info->reloc_mutex);
2059 seqlock_init(&fs_info->profiles_lock);
2031 2060
2032 init_completion(&fs_info->kobj_unregister); 2061 init_completion(&fs_info->kobj_unregister);
2033 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 2062 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2126,6 +2155,7 @@ int open_ctree(struct super_block *sb,
2126 2155
2127 spin_lock_init(&fs_info->block_group_cache_lock); 2156 spin_lock_init(&fs_info->block_group_cache_lock);
2128 fs_info->block_group_cache_tree = RB_ROOT; 2157 fs_info->block_group_cache_tree = RB_ROOT;
2158 fs_info->first_logical_byte = (u64)-1;
2129 2159
2130 extent_io_tree_init(&fs_info->freed_extents[0], 2160 extent_io_tree_init(&fs_info->freed_extents[0],
2131 fs_info->btree_inode->i_mapping); 2161 fs_info->btree_inode->i_mapping);
@@ -2165,6 +2195,12 @@ int open_ctree(struct super_block *sb,
2165 init_waitqueue_head(&fs_info->transaction_blocked_wait); 2195 init_waitqueue_head(&fs_info->transaction_blocked_wait);
2166 init_waitqueue_head(&fs_info->async_submit_wait); 2196 init_waitqueue_head(&fs_info->async_submit_wait);
2167 2197
2198 ret = btrfs_alloc_stripe_hash_table(fs_info);
2199 if (ret) {
2200 err = ret;
2201 goto fail_alloc;
2202 }
2203
2168 __setup_root(4096, 4096, 4096, 4096, tree_root, 2204 __setup_root(4096, 4096, 4096, 4096, tree_root,
2169 fs_info, BTRFS_ROOT_TREE_OBJECTID); 2205 fs_info, BTRFS_ROOT_TREE_OBJECTID);
2170 2206
@@ -2187,7 +2223,8 @@ int open_ctree(struct super_block *sb,
2187 goto fail_alloc; 2223 goto fail_alloc;
2188 2224
2189 /* check FS state, whether FS is broken. */ 2225 /* check FS state, whether FS is broken. */
2190 fs_info->fs_state |= btrfs_super_flags(disk_super); 2226 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
2227 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
2191 2228
2192 ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); 2229 ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
2193 if (ret) { 2230 if (ret) {
@@ -2261,6 +2298,8 @@ int open_ctree(struct super_block *sb,
2261 leafsize = btrfs_super_leafsize(disk_super); 2298 leafsize = btrfs_super_leafsize(disk_super);
2262 sectorsize = btrfs_super_sectorsize(disk_super); 2299 sectorsize = btrfs_super_sectorsize(disk_super);
2263 stripesize = btrfs_super_stripesize(disk_super); 2300 stripesize = btrfs_super_stripesize(disk_super);
2301 fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
2302 fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
2264 2303
2265 /* 2304 /*
2266 * mixed block groups end up with duplicate but slightly offset 2305 * mixed block groups end up with duplicate but slightly offset
@@ -2332,6 +2371,12 @@ int open_ctree(struct super_block *sb,
2332 btrfs_init_workers(&fs_info->endio_meta_write_workers, 2371 btrfs_init_workers(&fs_info->endio_meta_write_workers,
2333 "endio-meta-write", fs_info->thread_pool_size, 2372 "endio-meta-write", fs_info->thread_pool_size,
2334 &fs_info->generic_worker); 2373 &fs_info->generic_worker);
2374 btrfs_init_workers(&fs_info->endio_raid56_workers,
2375 "endio-raid56", fs_info->thread_pool_size,
2376 &fs_info->generic_worker);
2377 btrfs_init_workers(&fs_info->rmw_workers,
2378 "rmw", fs_info->thread_pool_size,
2379 &fs_info->generic_worker);
2335 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 2380 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
2336 fs_info->thread_pool_size, 2381 fs_info->thread_pool_size,
2337 &fs_info->generic_worker); 2382 &fs_info->generic_worker);
@@ -2350,6 +2395,8 @@ int open_ctree(struct super_block *sb,
2350 */ 2395 */
2351 fs_info->endio_workers.idle_thresh = 4; 2396 fs_info->endio_workers.idle_thresh = 4;
2352 fs_info->endio_meta_workers.idle_thresh = 4; 2397 fs_info->endio_meta_workers.idle_thresh = 4;
2398 fs_info->endio_raid56_workers.idle_thresh = 4;
2399 fs_info->rmw_workers.idle_thresh = 2;
2353 2400
2354 fs_info->endio_write_workers.idle_thresh = 2; 2401 fs_info->endio_write_workers.idle_thresh = 2;
2355 fs_info->endio_meta_write_workers.idle_thresh = 2; 2402 fs_info->endio_meta_write_workers.idle_thresh = 2;
@@ -2366,6 +2413,8 @@ int open_ctree(struct super_block *sb,
2366 ret |= btrfs_start_workers(&fs_info->fixup_workers); 2413 ret |= btrfs_start_workers(&fs_info->fixup_workers);
2367 ret |= btrfs_start_workers(&fs_info->endio_workers); 2414 ret |= btrfs_start_workers(&fs_info->endio_workers);
2368 ret |= btrfs_start_workers(&fs_info->endio_meta_workers); 2415 ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
2416 ret |= btrfs_start_workers(&fs_info->rmw_workers);
2417 ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
2369 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); 2418 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
2370 ret |= btrfs_start_workers(&fs_info->endio_write_workers); 2419 ret |= btrfs_start_workers(&fs_info->endio_write_workers);
2371 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); 2420 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
@@ -2390,8 +2439,7 @@ int open_ctree(struct super_block *sb,
2390 sb->s_blocksize = sectorsize; 2439 sb->s_blocksize = sectorsize;
2391 sb->s_blocksize_bits = blksize_bits(sectorsize); 2440 sb->s_blocksize_bits = blksize_bits(sectorsize);
2392 2441
2393 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, 2442 if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) {
2394 sizeof(disk_super->magic))) {
2395 printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); 2443 printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
2396 goto fail_sb_buffer; 2444 goto fail_sb_buffer;
2397 } 2445 }
@@ -2694,13 +2742,13 @@ fail_cleaner:
2694 * kthreads 2742 * kthreads
2695 */ 2743 */
2696 filemap_write_and_wait(fs_info->btree_inode->i_mapping); 2744 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
2697 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2698 2745
2699fail_block_groups: 2746fail_block_groups:
2700 btrfs_free_block_groups(fs_info); 2747 btrfs_free_block_groups(fs_info);
2701 2748
2702fail_tree_roots: 2749fail_tree_roots:
2703 free_root_pointers(fs_info, 1); 2750 free_root_pointers(fs_info, 1);
2751 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2704 2752
2705fail_sb_buffer: 2753fail_sb_buffer:
2706 btrfs_stop_workers(&fs_info->generic_worker); 2754 btrfs_stop_workers(&fs_info->generic_worker);
@@ -2710,6 +2758,8 @@ fail_sb_buffer:
2710 btrfs_stop_workers(&fs_info->workers); 2758 btrfs_stop_workers(&fs_info->workers);
2711 btrfs_stop_workers(&fs_info->endio_workers); 2759 btrfs_stop_workers(&fs_info->endio_workers);
2712 btrfs_stop_workers(&fs_info->endio_meta_workers); 2760 btrfs_stop_workers(&fs_info->endio_meta_workers);
2761 btrfs_stop_workers(&fs_info->endio_raid56_workers);
2762 btrfs_stop_workers(&fs_info->rmw_workers);
2713 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2763 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2714 btrfs_stop_workers(&fs_info->endio_write_workers); 2764 btrfs_stop_workers(&fs_info->endio_write_workers);
2715 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2765 btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -2721,13 +2771,17 @@ fail_alloc:
2721fail_iput: 2771fail_iput:
2722 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2772 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2723 2773
2724 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2725 iput(fs_info->btree_inode); 2774 iput(fs_info->btree_inode);
2775fail_delalloc_bytes:
2776 percpu_counter_destroy(&fs_info->delalloc_bytes);
2777fail_dirty_metadata_bytes:
2778 percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
2726fail_bdi: 2779fail_bdi:
2727 bdi_destroy(&fs_info->bdi); 2780 bdi_destroy(&fs_info->bdi);
2728fail_srcu: 2781fail_srcu:
2729 cleanup_srcu_struct(&fs_info->subvol_srcu); 2782 cleanup_srcu_struct(&fs_info->subvol_srcu);
2730fail: 2783fail:
2784 btrfs_free_stripe_hash_table(fs_info);
2731 btrfs_close_devices(fs_info->fs_devices); 2785 btrfs_close_devices(fs_info->fs_devices);
2732 return err; 2786 return err;
2733 2787
@@ -2795,8 +2849,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
2795 2849
2796 super = (struct btrfs_super_block *)bh->b_data; 2850 super = (struct btrfs_super_block *)bh->b_data;
2797 if (btrfs_super_bytenr(super) != bytenr || 2851 if (btrfs_super_bytenr(super) != bytenr ||
2798 strncmp((char *)(&super->magic), BTRFS_MAGIC, 2852 super->magic != cpu_to_le64(BTRFS_MAGIC)) {
2799 sizeof(super->magic))) {
2800 brelse(bh); 2853 brelse(bh);
2801 continue; 2854 continue;
2802 } 2855 }
@@ -3076,11 +3129,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
3076 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) 3129 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
3077 == 0))) 3130 == 0)))
3078 num_tolerated_disk_barrier_failures = 0; 3131 num_tolerated_disk_barrier_failures = 0;
3079 else if (num_tolerated_disk_barrier_failures > 1 3132 else if (num_tolerated_disk_barrier_failures > 1) {
3080 && 3133 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3081 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3134 BTRFS_BLOCK_GROUP_RAID5 |
3082 BTRFS_BLOCK_GROUP_RAID10))) 3135 BTRFS_BLOCK_GROUP_RAID10)) {
3083 num_tolerated_disk_barrier_failures = 1; 3136 num_tolerated_disk_barrier_failures = 1;
3137 } else if (flags &
3138 BTRFS_BLOCK_GROUP_RAID6) {
3139 num_tolerated_disk_barrier_failures = 2;
3140 }
3141 }
3084 } 3142 }
3085 } 3143 }
3086 up_read(&sinfo->groups_sem); 3144 up_read(&sinfo->groups_sem);
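The widened check above encodes how many device losses each profile can absorb before a failed write barrier becomes fatal: RAID1/RAID10 and RAID5 keep one extra copy or one parity stripe, RAID6 keeps two parities. Expressed as a stand-alone, hypothetical helper:

static int example_profile_tolerance(u64 flags)
{
	if (flags & BTRFS_BLOCK_GROUP_RAID6)
		return 2;	/* two parity stripes */
	if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
		     BTRFS_BLOCK_GROUP_RAID5 |
		     BTRFS_BLOCK_GROUP_RAID10))
		return 1;	/* one redundant copy or parity */
	return 0;		/* single, RAID0, or DUP on one device */
}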
@@ -3195,6 +3253,11 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
3195 if (btrfs_root_refs(&root->root_item) == 0) 3253 if (btrfs_root_refs(&root->root_item) == 0)
3196 synchronize_srcu(&fs_info->subvol_srcu); 3254 synchronize_srcu(&fs_info->subvol_srcu);
3197 3255
3256 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
3257 btrfs_free_log(NULL, root);
3258 btrfs_free_log_root_tree(NULL, fs_info);
3259 }
3260
3198 __btrfs_remove_free_space_cache(root->free_ino_pinned); 3261 __btrfs_remove_free_space_cache(root->free_ino_pinned);
3199 __btrfs_remove_free_space_cache(root->free_ino_ctl); 3262 __btrfs_remove_free_space_cache(root->free_ino_ctl);
3200 free_fs_root(root); 3263 free_fs_root(root);
@@ -3339,7 +3402,7 @@ int close_ctree(struct btrfs_root *root)
3339 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 3402 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
3340 } 3403 }
3341 3404
3342 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 3405 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3343 btrfs_error_commit_super(root); 3406 btrfs_error_commit_super(root);
3344 3407
3345 btrfs_put_block_group_cache(fs_info); 3408 btrfs_put_block_group_cache(fs_info);
@@ -3352,9 +3415,9 @@ int close_ctree(struct btrfs_root *root)
3352 3415
3353 btrfs_free_qgroup_config(root->fs_info); 3416 btrfs_free_qgroup_config(root->fs_info);
3354 3417
3355 if (fs_info->delalloc_bytes) { 3418 if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
3356 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 3419 printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n",
3357 (unsigned long long)fs_info->delalloc_bytes); 3420 percpu_counter_sum(&fs_info->delalloc_bytes));
3358 } 3421 }
3359 3422
3360 free_extent_buffer(fs_info->extent_root->node); 3423 free_extent_buffer(fs_info->extent_root->node);
@@ -3384,6 +3447,8 @@ int close_ctree(struct btrfs_root *root)
3384 btrfs_stop_workers(&fs_info->workers); 3447 btrfs_stop_workers(&fs_info->workers);
3385 btrfs_stop_workers(&fs_info->endio_workers); 3448 btrfs_stop_workers(&fs_info->endio_workers);
3386 btrfs_stop_workers(&fs_info->endio_meta_workers); 3449 btrfs_stop_workers(&fs_info->endio_meta_workers);
3450 btrfs_stop_workers(&fs_info->endio_raid56_workers);
3451 btrfs_stop_workers(&fs_info->rmw_workers);
3387 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 3452 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
3388 btrfs_stop_workers(&fs_info->endio_write_workers); 3453 btrfs_stop_workers(&fs_info->endio_write_workers);
3389 btrfs_stop_workers(&fs_info->endio_freespace_worker); 3454 btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -3401,9 +3466,13 @@ int close_ctree(struct btrfs_root *root)
3401 btrfs_close_devices(fs_info->fs_devices); 3466 btrfs_close_devices(fs_info->fs_devices);
3402 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3467 btrfs_mapping_tree_free(&fs_info->mapping_tree);
3403 3468
3469 percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
3470 percpu_counter_destroy(&fs_info->delalloc_bytes);
3404 bdi_destroy(&fs_info->bdi); 3471 bdi_destroy(&fs_info->bdi);
3405 cleanup_srcu_struct(&fs_info->subvol_srcu); 3472 cleanup_srcu_struct(&fs_info->subvol_srcu);
3406 3473
3474 btrfs_free_stripe_hash_table(fs_info);
3475
3407 return 0; 3476 return 0;
3408} 3477}
3409 3478
@@ -3443,11 +3512,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3443 (unsigned long long)transid, 3512 (unsigned long long)transid,
3444 (unsigned long long)root->fs_info->generation); 3513 (unsigned long long)root->fs_info->generation);
3445 was_dirty = set_extent_buffer_dirty(buf); 3514 was_dirty = set_extent_buffer_dirty(buf);
3446 if (!was_dirty) { 3515 if (!was_dirty)
3447 spin_lock(&root->fs_info->delalloc_lock); 3516 __percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
3448 root->fs_info->dirty_metadata_bytes += buf->len; 3517 buf->len,
3449 spin_unlock(&root->fs_info->delalloc_lock); 3518 root->fs_info->dirty_metadata_batch);
3450 }
3451} 3519}
3452 3520
3453static void __btrfs_btree_balance_dirty(struct btrfs_root *root, 3521static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
@@ -3457,8 +3525,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3457 * looks as though older kernels can get into trouble with 3525 * looks as though older kernels can get into trouble with
3458 * this code, they end up stuck in balance_dirty_pages forever 3526 * this code, they end up stuck in balance_dirty_pages forever
3459 */ 3527 */
3460 u64 num_dirty; 3528 int ret;
3461 unsigned long thresh = 32 * 1024 * 1024;
3462 3529
3463 if (current->flags & PF_MEMALLOC) 3530 if (current->flags & PF_MEMALLOC)
3464 return; 3531 return;
@@ -3466,9 +3533,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3466 if (flush_delayed) 3533 if (flush_delayed)
3467 btrfs_balance_delayed_items(root); 3534 btrfs_balance_delayed_items(root);
3468 3535
3469 num_dirty = root->fs_info->dirty_metadata_bytes; 3536 ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
3470 3537 BTRFS_DIRTY_METADATA_THRESH);
3471 if (num_dirty > thresh) { 3538 if (ret > 0) {
3472 balance_dirty_pages_ratelimited( 3539 balance_dirty_pages_ratelimited(
3473 root->fs_info->btree_inode->i_mapping); 3540 root->fs_info->btree_inode->i_mapping);
3474 } 3541 }
@@ -3518,7 +3585,8 @@ void btrfs_error_commit_super(struct btrfs_root *root)
3518 btrfs_cleanup_transaction(root); 3585 btrfs_cleanup_transaction(root);
3519} 3586}
3520 3587
3521static void btrfs_destroy_ordered_operations(struct btrfs_root *root) 3588static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3589 struct btrfs_root *root)
3522{ 3590{
3523 struct btrfs_inode *btrfs_inode; 3591 struct btrfs_inode *btrfs_inode;
3524 struct list_head splice; 3592 struct list_head splice;
@@ -3528,7 +3596,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
3528 mutex_lock(&root->fs_info->ordered_operations_mutex); 3596 mutex_lock(&root->fs_info->ordered_operations_mutex);
3529 spin_lock(&root->fs_info->ordered_extent_lock); 3597 spin_lock(&root->fs_info->ordered_extent_lock);
3530 3598
3531 list_splice_init(&root->fs_info->ordered_operations, &splice); 3599 list_splice_init(&t->ordered_operations, &splice);
3532 while (!list_empty(&splice)) { 3600 while (!list_empty(&splice)) {
3533 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 3601 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
3534 ordered_operations); 3602 ordered_operations);
@@ -3544,35 +3612,16 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
3544 3612
3545static void btrfs_destroy_ordered_extents(struct btrfs_root *root) 3613static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
3546{ 3614{
3547 struct list_head splice;
3548 struct btrfs_ordered_extent *ordered; 3615 struct btrfs_ordered_extent *ordered;
3549 struct inode *inode;
3550
3551 INIT_LIST_HEAD(&splice);
3552 3616
3553 spin_lock(&root->fs_info->ordered_extent_lock); 3617 spin_lock(&root->fs_info->ordered_extent_lock);
3554 3618 /*
3555 list_splice_init(&root->fs_info->ordered_extents, &splice); 3619 * This will just short circuit the ordered completion stuff which will
3556 while (!list_empty(&splice)) { 3620 * make sure the ordered extent gets properly cleaned up.
3557 ordered = list_entry(splice.next, struct btrfs_ordered_extent, 3621 */
3558 root_extent_list); 3622 list_for_each_entry(ordered, &root->fs_info->ordered_extents,
3559 3623 root_extent_list)
3560 list_del_init(&ordered->root_extent_list); 3624 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
3561 atomic_inc(&ordered->refs);
3562
3563 /* the inode may be getting freed (in sys_unlink path). */
3564 inode = igrab(ordered->inode);
3565
3566 spin_unlock(&root->fs_info->ordered_extent_lock);
3567 if (inode)
3568 iput(inode);
3569
3570 atomic_set(&ordered->refs, 1);
3571 btrfs_put_ordered_extent(ordered);
3572
3573 spin_lock(&root->fs_info->ordered_extent_lock);
3574 }
3575
3576 spin_unlock(&root->fs_info->ordered_extent_lock); 3625 spin_unlock(&root->fs_info->ordered_extent_lock);
3577} 3626}
3578 3627
@@ -3594,11 +3643,11 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3594 } 3643 }
3595 3644
3596 while ((node = rb_first(&delayed_refs->root)) != NULL) { 3645 while ((node = rb_first(&delayed_refs->root)) != NULL) {
3597 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 3646 struct btrfs_delayed_ref_head *head = NULL;
3598 3647
3648 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3599 atomic_set(&ref->refs, 1); 3649 atomic_set(&ref->refs, 1);
3600 if (btrfs_delayed_ref_is_head(ref)) { 3650 if (btrfs_delayed_ref_is_head(ref)) {
3601 struct btrfs_delayed_ref_head *head;
3602 3651
3603 head = btrfs_delayed_node_to_head(ref); 3652 head = btrfs_delayed_node_to_head(ref);
3604 if (!mutex_trylock(&head->mutex)) { 3653 if (!mutex_trylock(&head->mutex)) {
@@ -3614,16 +3663,18 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3614 continue; 3663 continue;
3615 } 3664 }
3616 3665
3617 kfree(head->extent_op); 3666 btrfs_free_delayed_extent_op(head->extent_op);
3618 delayed_refs->num_heads--; 3667 delayed_refs->num_heads--;
3619 if (list_empty(&head->cluster)) 3668 if (list_empty(&head->cluster))
3620 delayed_refs->num_heads_ready--; 3669 delayed_refs->num_heads_ready--;
3621 list_del_init(&head->cluster); 3670 list_del_init(&head->cluster);
3622 } 3671 }
3672
3623 ref->in_tree = 0; 3673 ref->in_tree = 0;
3624 rb_erase(&ref->rb_node, &delayed_refs->root); 3674 rb_erase(&ref->rb_node, &delayed_refs->root);
3625 delayed_refs->num_entries--; 3675 delayed_refs->num_entries--;
3626 3676 if (head)
3677 mutex_unlock(&head->mutex);
3627 spin_unlock(&delayed_refs->lock); 3678 spin_unlock(&delayed_refs->lock);
3628 btrfs_put_delayed_ref(ref); 3679 btrfs_put_delayed_ref(ref);
3629 3680
@@ -3636,7 +3687,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3636 return ret; 3687 return ret;
3637} 3688}
3638 3689
3639static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) 3690static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t)
3640{ 3691{
3641 struct btrfs_pending_snapshot *snapshot; 3692 struct btrfs_pending_snapshot *snapshot;
3642 struct list_head splice; 3693 struct list_head splice;
@@ -3649,10 +3700,8 @@ static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
3649 snapshot = list_entry(splice.next, 3700 snapshot = list_entry(splice.next,
3650 struct btrfs_pending_snapshot, 3701 struct btrfs_pending_snapshot,
3651 list); 3702 list);
3652 3703 snapshot->error = -ECANCELED;
3653 list_del_init(&snapshot->list); 3704 list_del_init(&snapshot->list);
3654
3655 kfree(snapshot);
3656 } 3705 }
3657} 3706}
3658 3707
@@ -3671,6 +3720,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
3671 delalloc_inodes); 3720 delalloc_inodes);
3672 3721
3673 list_del_init(&btrfs_inode->delalloc_inodes); 3722 list_del_init(&btrfs_inode->delalloc_inodes);
3723 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
3724 &btrfs_inode->runtime_flags);
3674 3725
3675 btrfs_invalidate_inodes(btrfs_inode->root); 3726 btrfs_invalidate_inodes(btrfs_inode->root);
3676 } 3727 }
@@ -3787,6 +3838,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
3787 cur_trans->blocked = 1; 3838 cur_trans->blocked = 1;
3788 wake_up(&root->fs_info->transaction_blocked_wait); 3839 wake_up(&root->fs_info->transaction_blocked_wait);
3789 3840
3841 btrfs_evict_pending_snapshots(cur_trans);
3842
3790 cur_trans->blocked = 0; 3843 cur_trans->blocked = 0;
3791 wake_up(&root->fs_info->transaction_wait); 3844 wake_up(&root->fs_info->transaction_wait);
3792 3845
@@ -3796,8 +3849,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
3796 btrfs_destroy_delayed_inodes(root); 3849 btrfs_destroy_delayed_inodes(root);
3797 btrfs_assert_delayed_root_empty(root); 3850 btrfs_assert_delayed_root_empty(root);
3798 3851
3799 btrfs_destroy_pending_snapshots(cur_trans);
3800
3801 btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, 3852 btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
3802 EXTENT_DIRTY); 3853 EXTENT_DIRTY);
3803 btrfs_destroy_pinned_extent(root, 3854 btrfs_destroy_pinned_extent(root,
@@ -3823,10 +3874,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
3823 3874
3824 while (!list_empty(&list)) { 3875 while (!list_empty(&list)) {
3825 t = list_entry(list.next, struct btrfs_transaction, list); 3876 t = list_entry(list.next, struct btrfs_transaction, list);
3826 if (!t)
3827 break;
3828 3877
3829 btrfs_destroy_ordered_operations(root); 3878 btrfs_destroy_ordered_operations(t, root);
3830 3879
3831 btrfs_destroy_ordered_extents(root); 3880 btrfs_destroy_ordered_extents(root);
3832 3881
@@ -3843,6 +3892,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
3843 if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) 3892 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
3844 wake_up(&root->fs_info->transaction_blocked_wait); 3893 wake_up(&root->fs_info->transaction_blocked_wait);
3845 3894
3895 btrfs_evict_pending_snapshots(t);
3896
3846 t->blocked = 0; 3897 t->blocked = 0;
3847 smp_mb(); 3898 smp_mb();
3848 if (waitqueue_active(&root->fs_info->transaction_wait)) 3899 if (waitqueue_active(&root->fs_info->transaction_wait))
@@ -3856,8 +3907,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
3856 btrfs_destroy_delayed_inodes(root); 3907 btrfs_destroy_delayed_inodes(root);
3857 btrfs_assert_delayed_root_empty(root); 3908 btrfs_assert_delayed_root_empty(root);
3858 3909
3859 btrfs_destroy_pending_snapshots(t);
3860
3861 btrfs_destroy_delalloc_inodes(root); 3910 btrfs_destroy_delalloc_inodes(root);
3862 3911
3863 spin_lock(&root->fs_info->trans_lock); 3912 spin_lock(&root->fs_info->trans_lock);
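A recurring change in the disk-io.c hunks above is that delalloc_bytes and dirty_metadata_bytes stop being spinlock-protected u64s and become per-cpu counters with a large batch, so the hot accounting paths no longer serialize on one lock. In isolation the pattern looks roughly like the sketch below; my_dirty_bytes, MY_DIRTY_THRESH and the helper names are invented, and the threshold merely echoes the old 32 MiB constant.

#include <linux/percpu_counter.h>
#include <linux/pagemap.h>

#define MY_DIRTY_THRESH		(32 * 1024 * 1024)

static struct percpu_counter my_dirty_bytes;
static s32 my_dirty_batch;

static int my_setup(void)
{
	/* per-cpu deltas are folded into the global count in large steps */
	my_dirty_batch = PAGE_CACHE_SIZE * (1 + ilog2(nr_cpu_ids));
	return percpu_counter_init(&my_dirty_bytes, 0);
}

static void my_account(long bytes)		/* bytes may be negative */
{
	__percpu_counter_add(&my_dirty_bytes, bytes, my_dirty_batch);
}

static bool my_over_thresh(void)
{
	/* approximate compare: only sums every cpu when the estimate is
	 * within one batch of the threshold */
	return percpu_counter_compare(&my_dirty_bytes, MY_DIRTY_THRESH) > 0;
}

static void my_teardown(void)
{
	percpu_counter_destroy(&my_dirty_bytes);
}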
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 305c33efb0e3..034d7dc552b2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,6 +25,13 @@
25#define BTRFS_SUPER_MIRROR_MAX 3 25#define BTRFS_SUPER_MIRROR_MAX 3
26#define BTRFS_SUPER_MIRROR_SHIFT 12 26#define BTRFS_SUPER_MIRROR_SHIFT 12
27 27
28enum {
29 BTRFS_WQ_ENDIO_DATA = 0,
30 BTRFS_WQ_ENDIO_METADATA = 1,
31 BTRFS_WQ_ENDIO_FREE_SPACE = 2,
32 BTRFS_WQ_ENDIO_RAID56 = 3,
33};
34
28static inline u64 btrfs_sb_offset(int mirror) 35static inline u64 btrfs_sb_offset(int mirror)
29{ 36{
30 u64 start = 16 * 1024; 37 u64 start = 16 * 1024;
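With this enum, callers of btrfs_bio_wq_end_io() name the end-io queue they want instead of passing a bare 0/1/2, and the new BTRFS_WQ_ENDIO_RAID56 value routes completions to the endio_raid56_workers pool added in disk-io.c. A hedged usage sketch (the function name is invented and the actual device submission is omitted):

static int example_wq_endio(struct btrfs_fs_info *fs_info, struct bio *bio)
{
	/* completion of this bio will be punted to a thread in
	 * fs_info->endio_raid56_workers instead of running in irq context */
	return btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
}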
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 614f34a899c2..81ee29eeb7ca 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -22,10 +22,10 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
22 22
23 if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) { 23 if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
24 *max_len = BTRFS_FID_SIZE_CONNECTABLE; 24 *max_len = BTRFS_FID_SIZE_CONNECTABLE;
25 return 255; 25 return FILEID_INVALID;
26 } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { 26 } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
27 *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; 27 *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
28 return 255; 28 return FILEID_INVALID;
29 } 29 }
30 30
31 len = BTRFS_FID_SIZE_NON_CONNECTABLE; 31 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 521e9d4424f6..3e074dab2d57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
31#include "print-tree.h" 31#include "print-tree.h"
32#include "transaction.h" 32#include "transaction.h"
33#include "volumes.h" 33#include "volumes.h"
34#include "raid56.h"
34#include "locking.h" 35#include "locking.h"
35#include "free-space-cache.h" 36#include "free-space-cache.h"
36#include "math.h" 37#include "math.h"
@@ -72,8 +73,7 @@ enum {
72 RESERVE_ALLOC_NO_ACCOUNT = 2, 73 RESERVE_ALLOC_NO_ACCOUNT = 2,
73}; 74};
74 75
75static int update_block_group(struct btrfs_trans_handle *trans, 76static int update_block_group(struct btrfs_root *root,
76 struct btrfs_root *root,
77 u64 bytenr, u64 num_bytes, int alloc); 77 u64 bytenr, u64 num_bytes, int alloc);
78static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 78static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
79 struct btrfs_root *root, 79 struct btrfs_root *root,
@@ -103,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
103 int dump_block_groups); 103 int dump_block_groups);
104static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 104static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
105 u64 num_bytes, int reserve); 105 u64 num_bytes, int reserve);
106static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
107 u64 num_bytes);
106 108
107static noinline int 109static noinline int
108block_group_cache_done(struct btrfs_block_group_cache *cache) 110block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -162,6 +164,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
162 rb_link_node(&block_group->cache_node, parent, p); 164 rb_link_node(&block_group->cache_node, parent, p);
163 rb_insert_color(&block_group->cache_node, 165 rb_insert_color(&block_group->cache_node,
164 &info->block_group_cache_tree); 166 &info->block_group_cache_tree);
167
168 if (info->first_logical_byte > block_group->key.objectid)
169 info->first_logical_byte = block_group->key.objectid;
170
165 spin_unlock(&info->block_group_cache_lock); 171 spin_unlock(&info->block_group_cache_lock);
166 172
167 return 0; 173 return 0;
@@ -203,8 +209,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
203 break; 209 break;
204 } 210 }
205 } 211 }
206 if (ret) 212 if (ret) {
207 btrfs_get_block_group(ret); 213 btrfs_get_block_group(ret);
214 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
215 info->first_logical_byte = ret->key.objectid;
216 }
208 spin_unlock(&info->block_group_cache_lock); 217 spin_unlock(&info->block_group_cache_lock);
209 218
210 return ret; 219 return ret;
@@ -468,8 +477,6 @@ out:
468} 477}
469 478
470static int cache_block_group(struct btrfs_block_group_cache *cache, 479static int cache_block_group(struct btrfs_block_group_cache *cache,
471 struct btrfs_trans_handle *trans,
472 struct btrfs_root *root,
473 int load_cache_only) 480 int load_cache_only)
474{ 481{
475 DEFINE_WAIT(wait); 482 DEFINE_WAIT(wait);
@@ -527,12 +534,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
527 cache->cached = BTRFS_CACHE_FAST; 534 cache->cached = BTRFS_CACHE_FAST;
528 spin_unlock(&cache->lock); 535 spin_unlock(&cache->lock);
529 536
530 /*
531 * We can't do the read from on-disk cache during a commit since we need
532 * to have the normal tree locking. Also if we are currently trying to
533 * allocate blocks for the tree root we can't do the fast caching since
534 * we likely hold important locks.
535 */
536 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 537 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
537 ret = load_free_space_cache(fs_info, cache); 538 ret = load_free_space_cache(fs_info, cache);
538 539
@@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1852 *actual_bytes = discarded_bytes; 1853 *actual_bytes = discarded_bytes;
1853 1854
1854 1855
1856 if (ret == -EOPNOTSUPP)
1857 ret = 0;
1855 return ret; 1858 return ret;
1856} 1859}
1857 1860
@@ -2143,7 +2146,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2143 node->num_bytes); 2146 node->num_bytes);
2144 } 2147 }
2145 } 2148 }
2146 mutex_unlock(&head->mutex);
2147 return ret; 2149 return ret;
2148 } 2150 }
2149 2151
@@ -2258,7 +2260,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2258 * process of being added. Don't run this ref yet. 2260 * process of being added. Don't run this ref yet.
2259 */ 2261 */
2260 list_del_init(&locked_ref->cluster); 2262 list_del_init(&locked_ref->cluster);
2261 mutex_unlock(&locked_ref->mutex); 2263 btrfs_delayed_ref_unlock(locked_ref);
2262 locked_ref = NULL; 2264 locked_ref = NULL;
2263 delayed_refs->num_heads_ready++; 2265 delayed_refs->num_heads_ready++;
2264 spin_unlock(&delayed_refs->lock); 2266 spin_unlock(&delayed_refs->lock);
@@ -2285,7 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2285 ref = &locked_ref->node; 2287 ref = &locked_ref->node;
2286 2288
2287 if (extent_op && must_insert_reserved) { 2289 if (extent_op && must_insert_reserved) {
2288 kfree(extent_op); 2290 btrfs_free_delayed_extent_op(extent_op);
2289 extent_op = NULL; 2291 extent_op = NULL;
2290 } 2292 }
2291 2293
@@ -2294,28 +2296,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2294 2296
2295 ret = run_delayed_extent_op(trans, root, 2297 ret = run_delayed_extent_op(trans, root,
2296 ref, extent_op); 2298 ref, extent_op);
2297 kfree(extent_op); 2299 btrfs_free_delayed_extent_op(extent_op);
2298 2300
2299 if (ret) { 2301 if (ret) {
2300 list_del_init(&locked_ref->cluster); 2302 printk(KERN_DEBUG
2301 mutex_unlock(&locked_ref->mutex); 2303 "btrfs: run_delayed_extent_op "
2302 2304 "returned %d\n", ret);
2303 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2304 spin_lock(&delayed_refs->lock); 2305 spin_lock(&delayed_refs->lock);
2306 btrfs_delayed_ref_unlock(locked_ref);
2305 return ret; 2307 return ret;
2306 } 2308 }
2307 2309
2308 goto next; 2310 goto next;
2309 } 2311 }
2310
2311 list_del_init(&locked_ref->cluster);
2312 locked_ref = NULL;
2313 } 2312 }
2314 2313
2315 ref->in_tree = 0; 2314 ref->in_tree = 0;
2316 rb_erase(&ref->rb_node, &delayed_refs->root); 2315 rb_erase(&ref->rb_node, &delayed_refs->root);
2317 delayed_refs->num_entries--; 2316 delayed_refs->num_entries--;
2318 if (locked_ref) { 2317 if (!btrfs_delayed_ref_is_head(ref)) {
2319 /* 2318 /*
2320 * when we play the delayed ref, also correct the 2319 * when we play the delayed ref, also correct the
2321 * ref_mod on head 2320 * ref_mod on head
@@ -2337,20 +2336,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2337 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2336 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2338 must_insert_reserved); 2337 must_insert_reserved);
2339 2338
2340 btrfs_put_delayed_ref(ref); 2339 btrfs_free_delayed_extent_op(extent_op);
2341 kfree(extent_op);
2342 count++;
2343
2344 if (ret) { 2340 if (ret) {
2345 if (locked_ref) { 2341 btrfs_delayed_ref_unlock(locked_ref);
2346 list_del_init(&locked_ref->cluster); 2342 btrfs_put_delayed_ref(ref);
2347 mutex_unlock(&locked_ref->mutex); 2343 printk(KERN_DEBUG
2348 } 2344 "btrfs: run_one_delayed_ref returned %d\n", ret);
2349 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2350 spin_lock(&delayed_refs->lock); 2345 spin_lock(&delayed_refs->lock);
2351 return ret; 2346 return ret;
2352 } 2347 }
2353 2348
2349 /*
2350 * If this node is a head, that means all the refs in this head
2351 * have been dealt with, and we will pick the next head to deal
2352 * with, so we must unlock the head and drop it from the cluster
2353 * list before we release it.
2354 */
2355 if (btrfs_delayed_ref_is_head(ref)) {
2356 list_del_init(&locked_ref->cluster);
2357 btrfs_delayed_ref_unlock(locked_ref);
2358 locked_ref = NULL;
2359 }
2360 btrfs_put_delayed_ref(ref);
2361 count++;
2354next: 2362next:
2355 cond_resched(); 2363 cond_resched();
2356 spin_lock(&delayed_refs->lock); 2364 spin_lock(&delayed_refs->lock);
@@ -2435,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2435 return ret; 2443 return ret;
2436} 2444}
2437 2445
2446static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2447 int count)
2448{
2449 int val = atomic_read(&delayed_refs->ref_seq);
2450
2451 if (val < seq || val >= seq + count)
2452 return 1;
2453 return 0;
2454}
2455
2438/* 2456/*
2439 * this starts processing the delayed reference count updates and 2457 * this starts processing the delayed reference count updates and
2440 * extent insertions we have queued up so far. count can be 2458 * extent insertions we have queued up so far. count can be
@@ -2469,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2469 2487
2470 delayed_refs = &trans->transaction->delayed_refs; 2488 delayed_refs = &trans->transaction->delayed_refs;
2471 INIT_LIST_HEAD(&cluster); 2489 INIT_LIST_HEAD(&cluster);
2490 if (count == 0) {
2491 count = delayed_refs->num_entries * 2;
2492 run_most = 1;
2493 }
2494
2495 if (!run_all && !run_most) {
2496 int old;
2497 int seq = atomic_read(&delayed_refs->ref_seq);
2498
2499progress:
2500 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2501 if (old) {
2502 DEFINE_WAIT(__wait);
2503 if (delayed_refs->num_entries < 16348)
2504 return 0;
2505
2506 prepare_to_wait(&delayed_refs->wait, &__wait,
2507 TASK_UNINTERRUPTIBLE);
2508
2509 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2510 if (old) {
2511 schedule();
2512 finish_wait(&delayed_refs->wait, &__wait);
2513
2514 if (!refs_newer(delayed_refs, seq, 256))
2515 goto progress;
2516 else
2517 return 0;
2518 } else {
2519 finish_wait(&delayed_refs->wait, &__wait);
2520 goto again;
2521 }
2522 }
2523
2524 } else {
2525 atomic_inc(&delayed_refs->procs_running_refs);
2526 }
2527
2472again: 2528again:
2473 loops = 0; 2529 loops = 0;
2474 spin_lock(&delayed_refs->lock); 2530 spin_lock(&delayed_refs->lock);
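The block added above gates btrfs_run_delayed_refs() so that only one task at a time churns through the backlog: the first caller flips procs_running_refs from 0 to 1 and runs the refs, while later callers either return early when the backlog is small or sleep on delayed_refs->wait until refs_newer() reports enough progress. A stripped-down sketch of that pattern (structure and function names invented, the early-return threshold and error handling omitted):

#include <linux/atomic.h>
#include <linux/wait.h>
#include <linux/sched.h>

struct my_refs {                        /* stand-in for btrfs_delayed_ref_root */
        atomic_t procs_running_refs;    /* 0 = idle, 1 = someone is running refs */
        atomic_t ref_seq;               /* bumped as refs are processed */
        wait_queue_head_t wait;         /* init_waitqueue_head() at setup time */
};

static void run_refs_gated(struct my_refs *refs)
{
        int seq = atomic_read(&refs->ref_seq);

        while (atomic_cmpxchg(&refs->procs_running_refs, 0, 1) != 0) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&refs->wait, &wait, TASK_UNINTERRUPTIBLE);
                /* recheck after queueing ourselves so a wakeup cannot be lost */
                if (atomic_cmpxchg(&refs->procs_running_refs, 0, 1) == 0) {
                        finish_wait(&refs->wait, &wait);
                        goto run;
                }
                schedule();
                finish_wait(&refs->wait, &wait);
                /* if the running task moved ref_seq along, our work is done */
                if (atomic_read(&refs->ref_seq) != seq)
                        return;
        }
run:
        /* ... actually run the delayed refs here ... */

        atomic_dec(&refs->procs_running_refs);
        smp_mb();                       /* order the release above before the waitqueue check */
        if (waitqueue_active(&refs->wait))
                wake_up(&refs->wait);
}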
@@ -2477,10 +2533,6 @@ again:
2477 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2533 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2478#endif 2534#endif
2479 2535
2480 if (count == 0) {
2481 count = delayed_refs->num_entries * 2;
2482 run_most = 1;
2483 }
2484 while (1) { 2536 while (1) {
2485 if (!(run_all || run_most) && 2537 if (!(run_all || run_most) &&
2486 delayed_refs->num_heads_ready < 64) 2538 delayed_refs->num_heads_ready < 64)
@@ -2500,11 +2552,15 @@ again:
2500 2552
2501 ret = run_clustered_refs(trans, root, &cluster); 2553 ret = run_clustered_refs(trans, root, &cluster);
2502 if (ret < 0) { 2554 if (ret < 0) {
2555 btrfs_release_ref_cluster(&cluster);
2503 spin_unlock(&delayed_refs->lock); 2556 spin_unlock(&delayed_refs->lock);
2504 btrfs_abort_transaction(trans, root, ret); 2557 btrfs_abort_transaction(trans, root, ret);
2558 atomic_dec(&delayed_refs->procs_running_refs);
2505 return ret; 2559 return ret;
2506 } 2560 }
2507 2561
2562 atomic_add(ret, &delayed_refs->ref_seq);
2563
2508 count -= min_t(unsigned long, ret, count); 2564 count -= min_t(unsigned long, ret, count);
2509 2565
2510 if (count == 0) 2566 if (count == 0)
@@ -2573,6 +2629,11 @@ again:
2573 goto again; 2629 goto again;
2574 } 2630 }
2575out: 2631out:
2632 atomic_dec(&delayed_refs->procs_running_refs);
2633 smp_mb();
2634 if (waitqueue_active(&delayed_refs->wait))
2635 wake_up(&delayed_refs->wait);
2636
2576 spin_unlock(&delayed_refs->lock); 2637 spin_unlock(&delayed_refs->lock);
2577 assert_qgroups_uptodate(trans); 2638 assert_qgroups_uptodate(trans);
2578 return 0; 2639 return 0;
@@ -2586,7 +2647,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2586 struct btrfs_delayed_extent_op *extent_op; 2647 struct btrfs_delayed_extent_op *extent_op;
2587 int ret; 2648 int ret;
2588 2649
2589 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 2650 extent_op = btrfs_alloc_delayed_extent_op();
2590 if (!extent_op) 2651 if (!extent_op)
2591 return -ENOMEM; 2652 return -ENOMEM;
2592 2653
@@ -2598,7 +2659,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2598 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 2659 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2599 num_bytes, extent_op); 2660 num_bytes, extent_op);
2600 if (ret) 2661 if (ret)
2601 kfree(extent_op); 2662 btrfs_free_delayed_extent_op(extent_op);
2602 return ret; 2663 return ret;
2603} 2664}
2604 2665
@@ -3223,12 +3284,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3223 u64 extra_flags = chunk_to_extended(flags) & 3284 u64 extra_flags = chunk_to_extended(flags) &
3224 BTRFS_EXTENDED_PROFILE_MASK; 3285 BTRFS_EXTENDED_PROFILE_MASK;
3225 3286
3287 write_seqlock(&fs_info->profiles_lock);
3226 if (flags & BTRFS_BLOCK_GROUP_DATA) 3288 if (flags & BTRFS_BLOCK_GROUP_DATA)
3227 fs_info->avail_data_alloc_bits |= extra_flags; 3289 fs_info->avail_data_alloc_bits |= extra_flags;
3228 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3290 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3229 fs_info->avail_metadata_alloc_bits |= extra_flags; 3291 fs_info->avail_metadata_alloc_bits |= extra_flags;
3230 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3292 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3231 fs_info->avail_system_alloc_bits |= extra_flags; 3293 fs_info->avail_system_alloc_bits |= extra_flags;
3294 write_sequnlock(&fs_info->profiles_lock);
3232} 3295}
3233 3296
3234/* 3297/*
@@ -3276,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3276 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3339 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3277 root->fs_info->fs_devices->missing_devices; 3340 root->fs_info->fs_devices->missing_devices;
3278 u64 target; 3341 u64 target;
3342 u64 tmp;
3279 3343
3280 /* 3344 /*
3281 * see if restripe for this chunk_type is in progress, if so 3345 * see if restripe for this chunk_type is in progress, if so
@@ -3292,40 +3356,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3292 } 3356 }
3293 spin_unlock(&root->fs_info->balance_lock); 3357 spin_unlock(&root->fs_info->balance_lock);
3294 3358
3359 /* First, mask out the RAID levels which aren't possible */
3295 if (num_devices == 1) 3360 if (num_devices == 1)
3296 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3361 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3362 BTRFS_BLOCK_GROUP_RAID5);
3363 if (num_devices < 3)
3364 flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3297 if (num_devices < 4) 3365 if (num_devices < 4)
3298 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3366 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3299 3367
3300 if ((flags & BTRFS_BLOCK_GROUP_DUP) && 3368 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3301 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3369 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3302 BTRFS_BLOCK_GROUP_RAID10))) { 3370 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3303 flags &= ~BTRFS_BLOCK_GROUP_DUP; 3371 flags &= ~tmp;
3304 }
3305
3306 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3307 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3308 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3309 }
3310 3372
3311 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3373 if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3312 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3374 tmp = BTRFS_BLOCK_GROUP_RAID6;
3313 (flags & BTRFS_BLOCK_GROUP_RAID10) | 3375 else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3314 (flags & BTRFS_BLOCK_GROUP_DUP))) { 3376 tmp = BTRFS_BLOCK_GROUP_RAID5;
3315 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3377 else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3316 } 3378 tmp = BTRFS_BLOCK_GROUP_RAID10;
3379 else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3380 tmp = BTRFS_BLOCK_GROUP_RAID1;
3381 else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3382 tmp = BTRFS_BLOCK_GROUP_RAID0;
3317 3383
3318 return extended_to_chunk(flags); 3384 return extended_to_chunk(flags | tmp);
3319} 3385}
3320 3386
3321static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3387static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3322{ 3388{
3323 if (flags & BTRFS_BLOCK_GROUP_DATA) 3389 unsigned seq;
3324 flags |= root->fs_info->avail_data_alloc_bits; 3390
3325 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3391 do {
3326 flags |= root->fs_info->avail_system_alloc_bits; 3392 seq = read_seqbegin(&root->fs_info->profiles_lock);
3327 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3393
3328 flags |= root->fs_info->avail_metadata_alloc_bits; 3394 if (flags & BTRFS_BLOCK_GROUP_DATA)
3395 flags |= root->fs_info->avail_data_alloc_bits;
3396 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3397 flags |= root->fs_info->avail_system_alloc_bits;
3398 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3399 flags |= root->fs_info->avail_metadata_alloc_bits;
3400 } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3329 3401
3330 return btrfs_reduce_alloc_profile(root, flags); 3402 return btrfs_reduce_alloc_profile(root, flags);
3331} 3403}
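The profiles_lock introduced here is a seqlock: the writers in set_avail_alloc_bits()/clear_avail_alloc_bits() update the avail_*_alloc_bits words under write_seqlock(), and get_alloc_profile() re-reads them inside a read_seqbegin()/read_seqretry() loop, so readers always see a consistent snapshot without ever blocking writers. A self-contained sketch of that pairing, with stand-in variable names:

#include <linux/seqlock.h>
#include <linux/types.h>

static DEFINE_SEQLOCK(profiles_lock);          /* stand-in for fs_info->profiles_lock */
static u64 avail_data_alloc_bits;

static void publish_bits(u64 extra_flags)      /* writer side */
{
        write_seqlock(&profiles_lock);
        avail_data_alloc_bits |= extra_flags;
        write_sequnlock(&profiles_lock);
}

static u64 read_bits(void)                     /* reader side: retry if a writer slipped in */
{
        unsigned int seq;
        u64 bits;

        do {
                seq = read_seqbegin(&profiles_lock);
                bits = avail_data_alloc_bits;
        } while (read_seqretry(&profiles_lock, seq));

        return bits;
}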
@@ -3333,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3333u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3405u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3334{ 3406{
3335 u64 flags; 3407 u64 flags;
3408 u64 ret;
3336 3409
3337 if (data) 3410 if (data)
3338 flags = BTRFS_BLOCK_GROUP_DATA; 3411 flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3341,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3341 else 3414 else
3342 flags = BTRFS_BLOCK_GROUP_METADATA; 3415 flags = BTRFS_BLOCK_GROUP_METADATA;
3343 3416
3344 return get_alloc_profile(root, flags); 3417 ret = get_alloc_profile(root, flags);
3418 return ret;
3345} 3419}
3346 3420
3347/* 3421/*
@@ -3357,7 +3431,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3357 int ret = 0, committed = 0, alloc_chunk = 1; 3431 int ret = 0, committed = 0, alloc_chunk = 1;
3358 3432
3359 /* make sure bytes are sectorsize aligned */ 3433 /* make sure bytes are sectorsize aligned */
3360 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3434 bytes = ALIGN(bytes, root->sectorsize);
3361 3435
3362 if (root == root->fs_info->tree_root || 3436 if (root == root->fs_info->tree_root ||
3363 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { 3437 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
@@ -3452,7 +3526,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3452 struct btrfs_space_info *data_sinfo; 3526 struct btrfs_space_info *data_sinfo;
3453 3527
3454 /* make sure bytes are sectorsize aligned */ 3528 /* make sure bytes are sectorsize aligned */
3455 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3529 bytes = ALIGN(bytes, root->sectorsize);
3456 3530
3457 data_sinfo = root->fs_info->data_sinfo; 3531 data_sinfo = root->fs_info->data_sinfo;
3458 spin_lock(&data_sinfo->lock); 3532 spin_lock(&data_sinfo->lock);
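The open-coded round-ups such as (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1) are replaced by ALIGN() (and round_down() later in this diff); for a power-of-two alignment the macros expand to the same mask arithmetic. A tiny userspace check of that equivalence (macros renamed so they don't shadow the kernel ones):

#include <assert.h>
#include <stdint.h>

#define MY_ALIGN(x, a)      (((x) + (a) - 1) & ~((uint64_t)(a) - 1))
#define MY_ROUND_DOWN(x, a) ((x) & ~((uint64_t)(a) - 1))

int main(void)
{
        uint64_t sectorsize = 4096;

        assert(MY_ALIGN(1, sectorsize) == 4096);          /* rounds up */
        assert(MY_ALIGN(4096, sectorsize) == 4096);       /* already aligned */
        assert(MY_ROUND_DOWN(8191, sectorsize) == 4096);  /* rounds down */
        return 0;
}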
@@ -3516,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3516{ 3590{
3517 u64 num_dev; 3591 u64 num_dev;
3518 3592
3519 if (type & BTRFS_BLOCK_GROUP_RAID10 || 3593 if (type & (BTRFS_BLOCK_GROUP_RAID10 |
3520 type & BTRFS_BLOCK_GROUP_RAID0) 3594 BTRFS_BLOCK_GROUP_RAID0 |
3595 BTRFS_BLOCK_GROUP_RAID5 |
3596 BTRFS_BLOCK_GROUP_RAID6))
3521 num_dev = root->fs_info->fs_devices->rw_devices; 3597 num_dev = root->fs_info->fs_devices->rw_devices;
3522 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3598 else if (type & BTRFS_BLOCK_GROUP_RAID1)
3523 num_dev = 2; 3599 num_dev = 2;
@@ -3564,6 +3640,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3564 int wait_for_alloc = 0; 3640 int wait_for_alloc = 0;
3565 int ret = 0; 3641 int ret = 0;
3566 3642
3643 /* Don't re-enter if we're already allocating a chunk */
3644 if (trans->allocating_chunk)
3645 return -ENOSPC;
3646
3567 space_info = __find_space_info(extent_root->fs_info, flags); 3647 space_info = __find_space_info(extent_root->fs_info, flags);
3568 if (!space_info) { 3648 if (!space_info) {
3569 ret = update_space_info(extent_root->fs_info, flags, 3649 ret = update_space_info(extent_root->fs_info, flags,
@@ -3606,6 +3686,8 @@ again:
3606 goto again; 3686 goto again;
3607 } 3687 }
3608 3688
3689 trans->allocating_chunk = true;
3690
3609 /* 3691 /*
3610 * If we have mixed data/metadata chunks we want to make sure we keep 3692 * If we have mixed data/metadata chunks we want to make sure we keep
3611 * allocating mixed chunks instead of individual chunks. 3693 * allocating mixed chunks instead of individual chunks.
@@ -3632,19 +3714,20 @@ again:
3632 check_system_chunk(trans, extent_root, flags); 3714 check_system_chunk(trans, extent_root, flags);
3633 3715
3634 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3716 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3635 if (ret < 0 && ret != -ENOSPC) 3717 trans->allocating_chunk = false;
3636 goto out;
3637 3718
3638 spin_lock(&space_info->lock); 3719 spin_lock(&space_info->lock);
3720 if (ret < 0 && ret != -ENOSPC)
3721 goto out;
3639 if (ret) 3722 if (ret)
3640 space_info->full = 1; 3723 space_info->full = 1;
3641 else 3724 else
3642 ret = 1; 3725 ret = 1;
3643 3726
3644 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3727 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3728out:
3645 space_info->chunk_alloc = 0; 3729 space_info->chunk_alloc = 0;
3646 spin_unlock(&space_info->lock); 3730 spin_unlock(&space_info->lock);
3647out:
3648 mutex_unlock(&fs_info->chunk_mutex); 3731 mutex_unlock(&fs_info->chunk_mutex);
3649 return ret; 3732 return ret;
3650} 3733}
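do_chunk_alloc() now refuses to re-enter itself: the transaction handle carries an allocating_chunk flag, so if chunk allocation recursively asks for another chunk the inner call simply returns -ENOSPC instead of recursing. The guard boils down to this pattern (stand-in type name, the real work elided):

#include <linux/types.h>
#include <linux/errno.h>

struct my_trans {                       /* stand-in for btrfs_trans_handle */
        bool allocating_chunk;
};

static int alloc_chunk_guarded(struct my_trans *trans)
{
        int ret;

        if (trans->allocating_chunk)    /* nested call: refuse instead of recursing */
                return -ENOSPC;

        trans->allocating_chunk = true;
        ret = 0;                        /* ... the real btrfs_alloc_chunk() call goes here ... */
        trans->allocating_chunk = false;

        return ret;
}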
@@ -3653,13 +3736,31 @@ static int can_overcommit(struct btrfs_root *root,
3653 struct btrfs_space_info *space_info, u64 bytes, 3736 struct btrfs_space_info *space_info, u64 bytes,
3654 enum btrfs_reserve_flush_enum flush) 3737 enum btrfs_reserve_flush_enum flush)
3655{ 3738{
3739 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3656 u64 profile = btrfs_get_alloc_profile(root, 0); 3740 u64 profile = btrfs_get_alloc_profile(root, 0);
3741 u64 rsv_size = 0;
3657 u64 avail; 3742 u64 avail;
3658 u64 used; 3743 u64 used;
3744 u64 to_add;
3659 3745
3660 used = space_info->bytes_used + space_info->bytes_reserved + 3746 used = space_info->bytes_used + space_info->bytes_reserved +
3661 space_info->bytes_pinned + space_info->bytes_readonly + 3747 space_info->bytes_pinned + space_info->bytes_readonly;
3662 space_info->bytes_may_use; 3748
3749 spin_lock(&global_rsv->lock);
3750 rsv_size = global_rsv->size;
3751 spin_unlock(&global_rsv->lock);
3752
3753 /*
3754 * We only want to allow overcommitting if we have lots of actual free
3755 * space, but if we don't have enough space to handle the global reserve
3756 * then we could end up with a real ENOSPC problem when trying
3757 * to allocate a chunk or some other important allocation.
3758 */
3759 rsv_size <<= 1;
3760 if (used + rsv_size >= space_info->total_bytes)
3761 return 0;
3762
3763 used += space_info->bytes_may_use;
3663 3764
3664 spin_lock(&root->fs_info->free_chunk_lock); 3765 spin_lock(&root->fs_info->free_chunk_lock);
3665 avail = root->fs_info->free_chunk_space; 3766 avail = root->fs_info->free_chunk_space;
@@ -3667,40 +3768,58 @@ static int can_overcommit(struct btrfs_root *root,
3667 3768
3668 /* 3769 /*
3669 * If we have dup, raid1 or raid10 then only half of the free 3770 * If we have dup, raid1 or raid10 then only half of the free
3670 * space is actually useable. 3771 * space is actually useable. For raid56, the space info used
3772 * doesn't include the parity drive, so we don't have to
3773 * change the math
3671 */ 3774 */
3672 if (profile & (BTRFS_BLOCK_GROUP_DUP | 3775 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3673 BTRFS_BLOCK_GROUP_RAID1 | 3776 BTRFS_BLOCK_GROUP_RAID1 |
3674 BTRFS_BLOCK_GROUP_RAID10)) 3777 BTRFS_BLOCK_GROUP_RAID10))
3675 avail >>= 1; 3778 avail >>= 1;
3676 3779
3780 to_add = space_info->total_bytes;
3781
3677 /* 3782 /*
3678 * If we aren't flushing all things, let us overcommit up to 3783 * If we aren't flushing all things, let us overcommit up to
3679 * 1/2th of the space. If we can flush, don't let us overcommit 3784 * 1/2th of the space. If we can flush, don't let us overcommit
3680 * too much, let it overcommit up to 1/8 of the space. 3785 * too much, let it overcommit up to 1/8 of the space.
3681 */ 3786 */
3682 if (flush == BTRFS_RESERVE_FLUSH_ALL) 3787 if (flush == BTRFS_RESERVE_FLUSH_ALL)
3683 avail >>= 3; 3788 to_add >>= 3;
3684 else 3789 else
3685 avail >>= 1; 3790 to_add >>= 1;
3791
3792 /*
3793 * Limit the overcommit to the amount of free space we could possibly
3794 * allocate for chunks.
3795 */
3796 to_add = min(avail, to_add);
3686 3797
3687 if (used + bytes < space_info->total_bytes + avail) 3798 if (used + bytes < space_info->total_bytes + to_add)
3688 return 1; 3799 return 1;
3689 return 0; 3800 return 0;
3690} 3801}
3691 3802
3692static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, 3803void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3693 unsigned long nr_pages, 3804 unsigned long nr_pages)
3694 enum wb_reason reason)
3695{ 3805{
3696 if (!writeback_in_progress(sb->s_bdi) && 3806 struct super_block *sb = root->fs_info->sb;
3697 down_read_trylock(&sb->s_umount)) { 3807 int started;
3698 writeback_inodes_sb_nr(sb, nr_pages, reason);
3699 up_read(&sb->s_umount);
3700 return 1;
3701 }
3702 3808
3703 return 0; 3809 /* If we cannot start writeback, just sync all the delalloc files. */
3810 started = try_to_writeback_inodes_sb_nr(sb, nr_pages,
3811 WB_REASON_FS_FREE_SPACE);
3812 if (!started) {
3813 /*
3814 * We needn't worry about the filesystem going from r/w to r/o even
3815 * though we don't acquire the ->s_umount mutex, because the filesystem
3816 * should guarantee that the delalloc inode list is empty after
3817 * the filesystem becomes read-only (all dirty pages are written to
3818 * the disk).
3819 */
3820 btrfs_start_delalloc_inodes(root, 0);
3821 btrfs_wait_ordered_extents(root, 0);
3822 }
3704} 3823}
3705 3824
3706/* 3825/*
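The reworked can_overcommit() above first refuses any overcommit once used space plus twice the global reserve reaches the space_info total, and then caps the allowance at the smaller of the remaining chunk-allocatable space and 1/8 (flushable) or 1/2 (non-flushable) of the total. Roughly, as a standalone model (no locking, the profile-based halving of the free chunk space omitted, argument names invented):

#include <linux/types.h>

static int can_overcommit_model(u64 total_bytes, u64 used, u64 may_use,
                                u64 global_rsv_size, u64 free_chunk_space,
                                u64 bytes, bool flush_all)
{
        u64 to_add;

        /* keep room for twice the global reserve before overcommitting at all */
        if (used + 2 * global_rsv_size >= total_bytes)
                return 0;

        used += may_use;

        /* overcommit by at most 1/8 of the space if we can flush, else 1/2 */
        to_add = flush_all ? total_bytes / 8 : total_bytes / 2;

        /* ... and never by more than we could still allocate as new chunks */
        if (to_add > free_chunk_space)
                to_add = free_chunk_space;

        return used + bytes < total_bytes + to_add;
}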
@@ -3724,7 +3843,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3724 space_info = block_rsv->space_info; 3843 space_info = block_rsv->space_info;
3725 3844
3726 smp_mb(); 3845 smp_mb();
3727 delalloc_bytes = root->fs_info->delalloc_bytes; 3846 delalloc_bytes = percpu_counter_sum_positive(
3847 &root->fs_info->delalloc_bytes);
3728 if (delalloc_bytes == 0) { 3848 if (delalloc_bytes == 0) {
3729 if (trans) 3849 if (trans)
3730 return; 3850 return;
@@ -3735,10 +3855,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3735 while (delalloc_bytes && loops < 3) { 3855 while (delalloc_bytes && loops < 3) {
3736 max_reclaim = min(delalloc_bytes, to_reclaim); 3856 max_reclaim = min(delalloc_bytes, to_reclaim);
3737 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3857 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3738 writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb, 3858 btrfs_writeback_inodes_sb_nr(root, nr_pages);
3739 nr_pages,
3740 WB_REASON_FS_FREE_SPACE);
3741
3742 /* 3859 /*
3743 * We need to wait for the async pages to actually start before 3860 * We need to wait for the async pages to actually start before
3744 * we do anything. 3861 * we do anything.
@@ -3766,7 +3883,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3766 break; 3883 break;
3767 } 3884 }
3768 smp_mb(); 3885 smp_mb();
3769 delalloc_bytes = root->fs_info->delalloc_bytes; 3886 delalloc_bytes = percpu_counter_sum_positive(
3887 &root->fs_info->delalloc_bytes);
3770 } 3888 }
3771} 3889}
3772 3890
@@ -3997,7 +4115,7 @@ again:
3997 * We make the other tasks wait for the flush only when we can flush 4115 * We make the other tasks wait for the flush only when we can flush
3998 * all things. 4116 * all things.
3999 */ 4117 */
4000 if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) { 4118 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4001 flushing = true; 4119 flushing = true;
4002 space_info->flush = 1; 4120 space_info->flush = 1;
4003 } 4121 }
@@ -4030,6 +4148,15 @@ again:
4030 goto again; 4148 goto again;
4031 4149
4032out: 4150out:
4151 if (ret == -ENOSPC &&
4152 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
4153 struct btrfs_block_rsv *global_rsv =
4154 &root->fs_info->global_block_rsv;
4155
4156 if (block_rsv != global_rsv &&
4157 !block_rsv_use_bytes(global_rsv, orig_bytes))
4158 ret = 0;
4159 }
4033 if (flushing) { 4160 if (flushing) {
4034 spin_lock(&space_info->lock); 4161 spin_lock(&space_info->lock);
4035 space_info->flush = 0; 4162 space_info->flush = 0;
@@ -4416,19 +4543,60 @@ void btrfs_orphan_release_metadata(struct inode *inode)
4416 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4543 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4417} 4544}
4418 4545
4419int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, 4546/*
4420 struct btrfs_pending_snapshot *pending) 4547 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
4548 * root: the root of the parent directory
4549 * rsv: block reservation
4550 * items: the number of items that we need to reserve space for
4551 * qgroup_reserved: used to return the reserved size in qgroup
4552 *
4553 * This function is used to reserve the space for snapshot/subvolume
4554 * creation and deletion. Those operations differ from the
4555 * common file/directory operations: they change two fs/file trees
4556 * and the root tree, and the number of items that the qgroup reserves
4557 * differs from the free space reservation, so we cannot use
4558 * the space reservation mechanism in start_transaction().
4559 */
4560int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
4561 struct btrfs_block_rsv *rsv,
4562 int items,
4563 u64 *qgroup_reserved)
4421{ 4564{
4422 struct btrfs_root *root = pending->root; 4565 u64 num_bytes;
4423 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4566 int ret;
4424 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; 4567
4425 /* 4568 if (root->fs_info->quota_enabled) {
4426 * two for root back/forward refs, two for directory entries, 4569 /* One for parent inode, two for dir entries */
4427 * one for root of the snapshot and one for parent inode. 4570 num_bytes = 3 * root->leafsize;
4428 */ 4571 ret = btrfs_qgroup_reserve(root, num_bytes);
4429 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); 4572 if (ret)
4430 dst_rsv->space_info = src_rsv->space_info; 4573 return ret;
4431 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4574 } else {
4575 num_bytes = 0;
4576 }
4577
4578 *qgroup_reserved = num_bytes;
4579
4580 num_bytes = btrfs_calc_trans_metadata_size(root, items);
4581 rsv->space_info = __find_space_info(root->fs_info,
4582 BTRFS_BLOCK_GROUP_METADATA);
4583 ret = btrfs_block_rsv_add(root, rsv, num_bytes,
4584 BTRFS_RESERVE_FLUSH_ALL);
4585 if (ret) {
4586 if (*qgroup_reserved)
4587 btrfs_qgroup_free(root, *qgroup_reserved);
4588 }
4589
4590 return ret;
4591}
4592
4593void btrfs_subvolume_release_metadata(struct btrfs_root *root,
4594 struct btrfs_block_rsv *rsv,
4595 u64 qgroup_reserved)
4596{
4597 btrfs_block_rsv_release(root, rsv, (u64)-1);
4598 if (qgroup_reserved)
4599 btrfs_qgroup_free(root, qgroup_reserved);
4432} 4600}
4433 4601
4434/** 4602/**
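The per-snapshot helper gives way to a generic reserve/release pair that also covers the qgroup side. A hedged sketch of how a caller is expected to use it (the item count of 8 and all local names are made up, and the btrfs internal headers are assumed for the types; the real callers are the snapshot/subvolume ioctl paths):

static int reserve_do_release(struct btrfs_root *parent_root,
                              struct btrfs_block_rsv *rsv)
{
        u64 qgroup_reserved = 0;
        int ret;

        ret = btrfs_subvolume_reserve_metadata(parent_root, rsv,
                                                8, &qgroup_reserved);
        if (ret)
                return ret;

        /* ... start a transaction and create/delete the subvolume here ... */

        btrfs_subvolume_release_metadata(parent_root, rsv, qgroup_reserved);
        return 0;
}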
@@ -4534,8 +4702,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4534 unsigned nr_extents = 0; 4702 unsigned nr_extents = 0;
4535 int extra_reserve = 0; 4703 int extra_reserve = 0;
4536 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 4704 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4537 int ret; 4705 int ret = 0;
4538 bool delalloc_lock = true; 4706 bool delalloc_lock = true;
4707 u64 to_free = 0;
4708 unsigned dropped;
4539 4709
4540 /* If we are a free space inode we need to not flush since we will be in 4710 /* If we are a free space inode we need to not flush since we will be in
4541 * the middle of a transaction commit. We also don't need the delalloc 4711 * the middle of a transaction commit. We also don't need the delalloc
@@ -4582,53 +4752,16 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4582 if (root->fs_info->quota_enabled) { 4752 if (root->fs_info->quota_enabled) {
4583 ret = btrfs_qgroup_reserve(root, num_bytes + 4753 ret = btrfs_qgroup_reserve(root, num_bytes +
4584 nr_extents * root->leafsize); 4754 nr_extents * root->leafsize);
4585 if (ret) { 4755 if (ret)
4586 spin_lock(&BTRFS_I(inode)->lock); 4756 goto out_fail;
4587 calc_csum_metadata_size(inode, num_bytes, 0);
4588 spin_unlock(&BTRFS_I(inode)->lock);
4589 if (delalloc_lock)
4590 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4591 return ret;
4592 }
4593 } 4757 }
4594 4758
4595 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 4759 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4596 if (ret) { 4760 if (unlikely(ret)) {
4597 u64 to_free = 0; 4761 if (root->fs_info->quota_enabled)
4598 unsigned dropped;
4599
4600 spin_lock(&BTRFS_I(inode)->lock);
4601 dropped = drop_outstanding_extent(inode);
4602 /*
4603 * If the inodes csum_bytes is the same as the original
4604 * csum_bytes then we know we haven't raced with any free()ers
4605 * so we can just reduce our inodes csum bytes and carry on.
4606 * Otherwise we have to do the normal free thing to account for
4607 * the case that the free side didn't free up its reserve
4608 * because of this outstanding reservation.
4609 */
4610 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4611 calc_csum_metadata_size(inode, num_bytes, 0);
4612 else
4613 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4614 spin_unlock(&BTRFS_I(inode)->lock);
4615 if (dropped)
4616 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4617
4618 if (to_free) {
4619 btrfs_block_rsv_release(root, block_rsv, to_free);
4620 trace_btrfs_space_reservation(root->fs_info,
4621 "delalloc",
4622 btrfs_ino(inode),
4623 to_free, 0);
4624 }
4625 if (root->fs_info->quota_enabled) {
4626 btrfs_qgroup_free(root, num_bytes + 4762 btrfs_qgroup_free(root, num_bytes +
4627 nr_extents * root->leafsize); 4763 nr_extents * root->leafsize);
4628 } 4764 goto out_fail;
4629 if (delalloc_lock)
4630 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4631 return ret;
4632 } 4765 }
4633 4766
4634 spin_lock(&BTRFS_I(inode)->lock); 4767 spin_lock(&BTRFS_I(inode)->lock);
@@ -4649,6 +4782,34 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4649 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4782 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4650 4783
4651 return 0; 4784 return 0;
4785
4786out_fail:
4787 spin_lock(&BTRFS_I(inode)->lock);
4788 dropped = drop_outstanding_extent(inode);
4789 /*
4790 * If the inodes csum_bytes is the same as the original
4791 * csum_bytes then we know we haven't raced with any free()ers
4792 * so we can just reduce our inodes csum bytes and carry on.
4793 * Otherwise we have to do the normal free thing to account for
4794 * the case that the free side didn't free up its reserve
4795 * because of this outstanding reservation.
4796 */
4797 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4798 calc_csum_metadata_size(inode, num_bytes, 0);
4799 else
4800 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4801 spin_unlock(&BTRFS_I(inode)->lock);
4802 if (dropped)
4803 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4804
4805 if (to_free) {
4806 btrfs_block_rsv_release(root, block_rsv, to_free);
4807 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4808 btrfs_ino(inode), to_free, 0);
4809 }
4810 if (delalloc_lock)
4811 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4812 return ret;
4652} 4813}
4653 4814
4654/** 4815/**
@@ -4670,7 +4831,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4670 spin_lock(&BTRFS_I(inode)->lock); 4831 spin_lock(&BTRFS_I(inode)->lock);
4671 dropped = drop_outstanding_extent(inode); 4832 dropped = drop_outstanding_extent(inode);
4672 4833
4673 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 4834 if (num_bytes)
4835 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4674 spin_unlock(&BTRFS_I(inode)->lock); 4836 spin_unlock(&BTRFS_I(inode)->lock);
4675 if (dropped > 0) 4837 if (dropped > 0)
4676 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4838 to_free += btrfs_calc_trans_metadata_size(root, dropped);
@@ -4737,8 +4899,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4737 btrfs_free_reserved_data_space(inode, num_bytes); 4899 btrfs_free_reserved_data_space(inode, num_bytes);
4738} 4900}
4739 4901
4740static int update_block_group(struct btrfs_trans_handle *trans, 4902static int update_block_group(struct btrfs_root *root,
4741 struct btrfs_root *root,
4742 u64 bytenr, u64 num_bytes, int alloc) 4903 u64 bytenr, u64 num_bytes, int alloc)
4743{ 4904{
4744 struct btrfs_block_group_cache *cache = NULL; 4905 struct btrfs_block_group_cache *cache = NULL;
@@ -4775,7 +4936,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4775 * space back to the block group, otherwise we will leak space. 4936 * space back to the block group, otherwise we will leak space.
4776 */ 4937 */
4777 if (!alloc && cache->cached == BTRFS_CACHE_NO) 4938 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4778 cache_block_group(cache, trans, NULL, 1); 4939 cache_block_group(cache, 1);
4779 4940
4780 byte_in_group = bytenr - cache->key.objectid; 4941 byte_in_group = bytenr - cache->key.objectid;
4781 WARN_ON(byte_in_group > cache->key.offset); 4942 WARN_ON(byte_in_group > cache->key.offset);
@@ -4825,6 +4986,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4825 struct btrfs_block_group_cache *cache; 4986 struct btrfs_block_group_cache *cache;
4826 u64 bytenr; 4987 u64 bytenr;
4827 4988
4989 spin_lock(&root->fs_info->block_group_cache_lock);
4990 bytenr = root->fs_info->first_logical_byte;
4991 spin_unlock(&root->fs_info->block_group_cache_lock);
4992
4993 if (bytenr < (u64)-1)
4994 return bytenr;
4995
4828 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 4996 cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4829 if (!cache) 4997 if (!cache)
4830 return 0; 4998 return 0;
@@ -4875,8 +5043,7 @@ int btrfs_pin_extent(struct btrfs_root *root,
4875/* 5043/*
4876 * this function must be called within transaction 5044 * this function must be called within transaction
4877 */ 5045 */
4878int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, 5046int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
4879 struct btrfs_root *root,
4880 u64 bytenr, u64 num_bytes) 5047 u64 bytenr, u64 num_bytes)
4881{ 5048{
4882 struct btrfs_block_group_cache *cache; 5049 struct btrfs_block_group_cache *cache;
@@ -4890,7 +5057,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4890 * to one because the slow code to read in the free extents does check 5057 * to one because the slow code to read in the free extents does check
4891 * the pinned extents. 5058 * the pinned extents.
4892 */ 5059 */
4893 cache_block_group(cache, trans, root, 1); 5060 cache_block_group(cache, 1);
4894 5061
4895 pin_down_extent(root, cache, bytenr, num_bytes, 0); 5062 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4896 5063
@@ -5287,7 +5454,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5287 } 5454 }
5288 } 5455 }
5289 5456
5290 ret = update_block_group(trans, root, bytenr, num_bytes, 0); 5457 ret = update_block_group(root, bytenr, num_bytes, 0);
5291 if (ret) { 5458 if (ret) {
5292 btrfs_abort_transaction(trans, extent_root, ret); 5459 btrfs_abort_transaction(trans, extent_root, ret);
5293 goto out; 5460 goto out;
@@ -5332,7 +5499,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5332 if (head->extent_op) { 5499 if (head->extent_op) {
5333 if (!head->must_insert_reserved) 5500 if (!head->must_insert_reserved)
5334 goto out; 5501 goto out;
5335 kfree(head->extent_op); 5502 btrfs_free_delayed_extent_op(head->extent_op);
5336 head->extent_op = NULL; 5503 head->extent_op = NULL;
5337 } 5504 }
5338 5505
@@ -5455,10 +5622,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5455 return ret; 5622 return ret;
5456} 5623}
5457 5624
5458static u64 stripe_align(struct btrfs_root *root, u64 val) 5625static u64 stripe_align(struct btrfs_root *root,
5626 struct btrfs_block_group_cache *cache,
5627 u64 val, u64 num_bytes)
5459{ 5628{
5460 u64 mask = ((u64)root->stripesize - 1); 5629 u64 ret = ALIGN(val, root->stripesize);
5461 u64 ret = (val + mask) & ~mask;
5462 return ret; 5630 return ret;
5463} 5631}
5464 5632
@@ -5478,7 +5646,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5478 u64 num_bytes) 5646 u64 num_bytes)
5479{ 5647{
5480 struct btrfs_caching_control *caching_ctl; 5648 struct btrfs_caching_control *caching_ctl;
5481 DEFINE_WAIT(wait);
5482 5649
5483 caching_ctl = get_caching_control(cache); 5650 caching_ctl = get_caching_control(cache);
5484 if (!caching_ctl) 5651 if (!caching_ctl)
@@ -5495,7 +5662,6 @@ static noinline int
5495wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 5662wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5496{ 5663{
5497 struct btrfs_caching_control *caching_ctl; 5664 struct btrfs_caching_control *caching_ctl;
5498 DEFINE_WAIT(wait);
5499 5665
5500 caching_ctl = get_caching_control(cache); 5666 caching_ctl = get_caching_control(cache);
5501 if (!caching_ctl) 5667 if (!caching_ctl)
@@ -5509,20 +5675,20 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5509 5675
5510int __get_raid_index(u64 flags) 5676int __get_raid_index(u64 flags)
5511{ 5677{
5512 int index;
5513
5514 if (flags & BTRFS_BLOCK_GROUP_RAID10) 5678 if (flags & BTRFS_BLOCK_GROUP_RAID10)
5515 index = 0; 5679 return BTRFS_RAID_RAID10;
5516 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 5680 else if (flags & BTRFS_BLOCK_GROUP_RAID1)
5517 index = 1; 5681 return BTRFS_RAID_RAID1;
5518 else if (flags & BTRFS_BLOCK_GROUP_DUP) 5682 else if (flags & BTRFS_BLOCK_GROUP_DUP)
5519 index = 2; 5683 return BTRFS_RAID_DUP;
5520 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 5684 else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5521 index = 3; 5685 return BTRFS_RAID_RAID0;
5522 else 5686 else if (flags & BTRFS_BLOCK_GROUP_RAID5)
5523 index = 4; 5687 return BTRFS_RAID_RAID5;
5688 else if (flags & BTRFS_BLOCK_GROUP_RAID6)
5689 return BTRFS_RAID_RAID6;
5524 5690
5525 return index; 5691 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
5526} 5692}
5527 5693
5528static int get_block_group_index(struct btrfs_block_group_cache *cache) 5694static int get_block_group_index(struct btrfs_block_group_cache *cache)
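__get_raid_index() now returns named BTRFS_RAID_* values instead of bare 0-4 and grows RAID5/RAID6 cases. Judging from the constants being replaced, the enum presumably keeps the old numeric order and appends the new profiles, roughly like the sketch below (renamed here; the exact placement of RAID5/RAID6 is an assumption, see ctree.h in this series):

enum my_raid_types_sketch {
        MY_RAID_RAID10,         /* was index 0 */
        MY_RAID_RAID1,          /* was index 1 */
        MY_RAID_DUP,            /* was index 2 */
        MY_RAID_RAID0,          /* was index 3 */
        MY_RAID_SINGLE,         /* was the catch-all index 4 */
        MY_RAID_RAID5,          /* new in this series */
        MY_RAID_RAID6,          /* new in this series */
        MY_NR_RAID_TYPES
};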
@@ -5560,7 +5726,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5560 int empty_cluster = 2 * 1024 * 1024; 5726 int empty_cluster = 2 * 1024 * 1024;
5561 struct btrfs_space_info *space_info; 5727 struct btrfs_space_info *space_info;
5562 int loop = 0; 5728 int loop = 0;
5563 int index = 0; 5729 int index = __get_raid_index(data);
5564 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? 5730 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5565 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; 5731 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5566 bool found_uncached_bg = false; 5732 bool found_uncached_bg = false;
@@ -5665,6 +5831,8 @@ search:
5665 if (!block_group_bits(block_group, data)) { 5831 if (!block_group_bits(block_group, data)) {
5666 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5832 u64 extra = BTRFS_BLOCK_GROUP_DUP |
5667 BTRFS_BLOCK_GROUP_RAID1 | 5833 BTRFS_BLOCK_GROUP_RAID1 |
5834 BTRFS_BLOCK_GROUP_RAID5 |
5835 BTRFS_BLOCK_GROUP_RAID6 |
5668 BTRFS_BLOCK_GROUP_RAID10; 5836 BTRFS_BLOCK_GROUP_RAID10;
5669 5837
5670 /* 5838 /*
@@ -5680,8 +5848,7 @@ have_block_group:
5680 cached = block_group_cache_done(block_group); 5848 cached = block_group_cache_done(block_group);
5681 if (unlikely(!cached)) { 5849 if (unlikely(!cached)) {
5682 found_uncached_bg = true; 5850 found_uncached_bg = true;
5683 ret = cache_block_group(block_group, trans, 5851 ret = cache_block_group(block_group, 0);
5684 orig_root, 0);
5685 BUG_ON(ret < 0); 5852 BUG_ON(ret < 0);
5686 ret = 0; 5853 ret = 0;
5687 } 5854 }
@@ -5694,6 +5861,7 @@ have_block_group:
5694 * lets look there 5861 * lets look there
5695 */ 5862 */
5696 if (last_ptr) { 5863 if (last_ptr) {
5864 unsigned long aligned_cluster;
5697 /* 5865 /*
5698 * the refill lock keeps out other 5866 * the refill lock keeps out other
5699 * people trying to start a new cluster 5867 * people trying to start a new cluster
@@ -5760,11 +5928,15 @@ refill_cluster:
5760 goto unclustered_alloc; 5928 goto unclustered_alloc;
5761 } 5929 }
5762 5930
5931 aligned_cluster = max_t(unsigned long,
5932 empty_cluster + empty_size,
5933 block_group->full_stripe_len);
5934
5763 /* allocate a cluster in this block group */ 5935 /* allocate a cluster in this block group */
5764 ret = btrfs_find_space_cluster(trans, root, 5936 ret = btrfs_find_space_cluster(trans, root,
5765 block_group, last_ptr, 5937 block_group, last_ptr,
5766 search_start, num_bytes, 5938 search_start, num_bytes,
5767 empty_cluster + empty_size); 5939 aligned_cluster);
5768 if (ret == 0) { 5940 if (ret == 0) {
5769 /* 5941 /*
5770 * now pull our allocation out of this 5942 * now pull our allocation out of this
@@ -5835,7 +6007,8 @@ unclustered_alloc:
5835 goto loop; 6007 goto loop;
5836 } 6008 }
5837checks: 6009checks:
5838 search_start = stripe_align(root, offset); 6010 search_start = stripe_align(root, used_block_group,
6011 offset, num_bytes);
5839 6012
5840 /* move on to the next group */ 6013 /* move on to the next group */
5841 if (search_start + num_bytes > 6014 if (search_start + num_bytes >
@@ -5986,7 +6159,7 @@ again:
5986 if (ret == -ENOSPC) { 6159 if (ret == -ENOSPC) {
5987 if (!final_tried) { 6160 if (!final_tried) {
5988 num_bytes = num_bytes >> 1; 6161 num_bytes = num_bytes >> 1;
5989 num_bytes = num_bytes & ~(root->sectorsize - 1); 6162 num_bytes = round_down(num_bytes, root->sectorsize);
5990 num_bytes = max(num_bytes, min_alloc_size); 6163 num_bytes = max(num_bytes, min_alloc_size);
5991 if (num_bytes == min_alloc_size) 6164 if (num_bytes == min_alloc_size)
5992 final_tried = true; 6165 final_tried = true;
@@ -6110,7 +6283,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6110 btrfs_mark_buffer_dirty(path->nodes[0]); 6283 btrfs_mark_buffer_dirty(path->nodes[0]);
6111 btrfs_free_path(path); 6284 btrfs_free_path(path);
6112 6285
6113 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 6286 ret = update_block_group(root, ins->objectid, ins->offset, 1);
6114 if (ret) { /* -ENOENT, logic error */ 6287 if (ret) { /* -ENOENT, logic error */
6115 printk(KERN_ERR "btrfs update block group failed for %llu " 6288 printk(KERN_ERR "btrfs update block group failed for %llu "
6116 "%llu\n", (unsigned long long)ins->objectid, 6289 "%llu\n", (unsigned long long)ins->objectid,
@@ -6174,7 +6347,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6174 btrfs_mark_buffer_dirty(leaf); 6347 btrfs_mark_buffer_dirty(leaf);
6175 btrfs_free_path(path); 6348 btrfs_free_path(path);
6176 6349
6177 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 6350 ret = update_block_group(root, ins->objectid, ins->offset, 1);
6178 if (ret) { /* -ENOENT, logic error */ 6351 if (ret) { /* -ENOENT, logic error */
6179 printk(KERN_ERR "btrfs update block group failed for %llu " 6352 printk(KERN_ERR "btrfs update block group failed for %llu "
6180 "%llu\n", (unsigned long long)ins->objectid, 6353 "%llu\n", (unsigned long long)ins->objectid,
@@ -6217,7 +6390,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6217 u64 num_bytes = ins->offset; 6390 u64 num_bytes = ins->offset;
6218 6391
6219 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 6392 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6220 cache_block_group(block_group, trans, NULL, 0); 6393 cache_block_group(block_group, 0);
6221 caching_ctl = get_caching_control(block_group); 6394 caching_ctl = get_caching_control(block_group);
6222 6395
6223 if (!caching_ctl) { 6396 if (!caching_ctl) {
@@ -6331,12 +6504,14 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6331 if (!ret) 6504 if (!ret)
6332 return block_rsv; 6505 return block_rsv;
6333 if (ret && !block_rsv->failfast) { 6506 if (ret && !block_rsv->failfast) {
6334 static DEFINE_RATELIMIT_STATE(_rs, 6507 if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6335 DEFAULT_RATELIMIT_INTERVAL, 6508 static DEFINE_RATELIMIT_STATE(_rs,
6336 /*DEFAULT_RATELIMIT_BURST*/ 2); 6509 DEFAULT_RATELIMIT_INTERVAL * 10,
6337 if (__ratelimit(&_rs)) 6510 /*DEFAULT_RATELIMIT_BURST*/ 1);
6338 WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", 6511 if (__ratelimit(&_rs))
6339 ret); 6512 WARN(1, KERN_DEBUG
6513 "btrfs: block rsv returned %d\n", ret);
6514 }
6340 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6515 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6341 BTRFS_RESERVE_NO_FLUSH); 6516 BTRFS_RESERVE_NO_FLUSH);
6342 if (!ret) { 6517 if (!ret) {
@@ -6402,7 +6577,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6402 6577
6403 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 6578 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6404 struct btrfs_delayed_extent_op *extent_op; 6579 struct btrfs_delayed_extent_op *extent_op;
6405 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 6580 extent_op = btrfs_alloc_delayed_extent_op();
6406 BUG_ON(!extent_op); /* -ENOMEM */ 6581 BUG_ON(!extent_op); /* -ENOMEM */
6407 if (key) 6582 if (key)
6408 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 6583 memcpy(&extent_op->key, key, sizeof(extent_op->key));
@@ -6524,7 +6699,7 @@ reada:
6524} 6699}
6525 6700
6526/* 6701/*
6527 * hepler to process tree block while walking down the tree. 6702 * helper to process tree block while walking down the tree.
6528 * 6703 *
6529 * when wc->stage == UPDATE_BACKREF, this function updates 6704 * when wc->stage == UPDATE_BACKREF, this function updates
6530 * back refs for pointers in the block. 6705 * back refs for pointers in the block.
@@ -6599,7 +6774,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6599} 6774}
6600 6775
6601/* 6776/*
6602 * hepler to process tree block pointer. 6777 * helper to process tree block pointer.
6603 * 6778 *
6604 * when wc->stage == DROP_REFERENCE, this function checks 6779 * when wc->stage == DROP_REFERENCE, this function checks
6605 * reference count of the block pointed to. if the block 6780 * reference count of the block pointed to. if the block
@@ -6737,7 +6912,7 @@ skip:
6737} 6912}
6738 6913
6739/* 6914/*
6740 * hepler to process tree block while walking up the tree. 6915 * helper to process tree block while walking up the tree.
6741 * 6916 *
6742 * when wc->stage == DROP_REFERENCE, this function drops 6917 * when wc->stage == DROP_REFERENCE, this function drops
6743 * reference count on the block. 6918 * reference count on the block.
@@ -6788,11 +6963,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6788 &wc->flags[level]); 6963 &wc->flags[level]);
6789 if (ret < 0) { 6964 if (ret < 0) {
6790 btrfs_tree_unlock_rw(eb, path->locks[level]); 6965 btrfs_tree_unlock_rw(eb, path->locks[level]);
6966 path->locks[level] = 0;
6791 return ret; 6967 return ret;
6792 } 6968 }
6793 BUG_ON(wc->refs[level] == 0); 6969 BUG_ON(wc->refs[level] == 0);
6794 if (wc->refs[level] == 1) { 6970 if (wc->refs[level] == 1) {
6795 btrfs_tree_unlock_rw(eb, path->locks[level]); 6971 btrfs_tree_unlock_rw(eb, path->locks[level]);
6972 path->locks[level] = 0;
6796 return 1; 6973 return 1;
6797 } 6974 }
6798 } 6975 }
@@ -7203,6 +7380,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7203 root->fs_info->fs_devices->missing_devices; 7380 root->fs_info->fs_devices->missing_devices;
7204 7381
7205 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7382 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7383 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
7206 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7384 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7207 7385
7208 if (num_devices == 1) { 7386 if (num_devices == 1) {
@@ -7481,16 +7659,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7481 index = get_block_group_index(block_group); 7659 index = get_block_group_index(block_group);
7482 } 7660 }
7483 7661
7484 if (index == 0) { 7662 if (index == BTRFS_RAID_RAID10) {
7485 dev_min = 4; 7663 dev_min = 4;
7486 /* Divide by 2 */ 7664 /* Divide by 2 */
7487 min_free >>= 1; 7665 min_free >>= 1;
7488 } else if (index == 1) { 7666 } else if (index == BTRFS_RAID_RAID1) {
7489 dev_min = 2; 7667 dev_min = 2;
7490 } else if (index == 2) { 7668 } else if (index == BTRFS_RAID_DUP) {
7491 /* Multiply by 2 */ 7669 /* Multiply by 2 */
7492 min_free <<= 1; 7670 min_free <<= 1;
7493 } else if (index == 3) { 7671 } else if (index == BTRFS_RAID_RAID0) {
7494 dev_min = fs_devices->rw_devices; 7672 dev_min = fs_devices->rw_devices;
7495 do_div(min_free, dev_min); 7673 do_div(min_free, dev_min);
7496 } 7674 }
@@ -7651,11 +7829,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7651 space_info = list_entry(info->space_info.next, 7829 space_info = list_entry(info->space_info.next,
7652 struct btrfs_space_info, 7830 struct btrfs_space_info,
7653 list); 7831 list);
7654 if (space_info->bytes_pinned > 0 || 7832 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
7655 space_info->bytes_reserved > 0 || 7833 if (space_info->bytes_pinned > 0 ||
7656 space_info->bytes_may_use > 0) { 7834 space_info->bytes_reserved > 0 ||
7657 WARN_ON(1); 7835 space_info->bytes_may_use > 0) {
7658 dump_space_info(space_info, 0, 0); 7836 WARN_ON(1);
7837 dump_space_info(space_info, 0, 0);
7838 }
7659 } 7839 }
7660 list_del(&space_info->list); 7840 list_del(&space_info->list);
7661 kfree(space_info); 7841 kfree(space_info);
@@ -7754,7 +7934,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7754 btrfs_release_path(path); 7934 btrfs_release_path(path);
7755 cache->flags = btrfs_block_group_flags(&cache->item); 7935 cache->flags = btrfs_block_group_flags(&cache->item);
7756 cache->sectorsize = root->sectorsize; 7936 cache->sectorsize = root->sectorsize;
7757 7937 cache->full_stripe_len = btrfs_full_stripe_len(root,
7938 &root->fs_info->mapping_tree,
7939 found_key.objectid);
7758 btrfs_init_free_space_ctl(cache); 7940 btrfs_init_free_space_ctl(cache);
7759 7941
7760 /* 7942 /*
@@ -7808,6 +7990,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7808 if (!(get_alloc_profile(root, space_info->flags) & 7990 if (!(get_alloc_profile(root, space_info->flags) &
7809 (BTRFS_BLOCK_GROUP_RAID10 | 7991 (BTRFS_BLOCK_GROUP_RAID10 |
7810 BTRFS_BLOCK_GROUP_RAID1 | 7992 BTRFS_BLOCK_GROUP_RAID1 |
7993 BTRFS_BLOCK_GROUP_RAID5 |
7994 BTRFS_BLOCK_GROUP_RAID6 |
7811 BTRFS_BLOCK_GROUP_DUP))) 7995 BTRFS_BLOCK_GROUP_DUP)))
7812 continue; 7996 continue;
7813 /* 7997 /*
@@ -7883,6 +8067,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7883 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8067 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7884 cache->sectorsize = root->sectorsize; 8068 cache->sectorsize = root->sectorsize;
7885 cache->fs_info = root->fs_info; 8069 cache->fs_info = root->fs_info;
8070 cache->full_stripe_len = btrfs_full_stripe_len(root,
8071 &root->fs_info->mapping_tree,
8072 chunk_offset);
7886 8073
7887 atomic_set(&cache->count, 1); 8074 atomic_set(&cache->count, 1);
7888 spin_lock_init(&cache->lock); 8075 spin_lock_init(&cache->lock);
@@ -7932,12 +8119,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7932 u64 extra_flags = chunk_to_extended(flags) & 8119 u64 extra_flags = chunk_to_extended(flags) &
7933 BTRFS_EXTENDED_PROFILE_MASK; 8120 BTRFS_EXTENDED_PROFILE_MASK;
7934 8121
8122 write_seqlock(&fs_info->profiles_lock);
7935 if (flags & BTRFS_BLOCK_GROUP_DATA) 8123 if (flags & BTRFS_BLOCK_GROUP_DATA)
7936 fs_info->avail_data_alloc_bits &= ~extra_flags; 8124 fs_info->avail_data_alloc_bits &= ~extra_flags;
7937 if (flags & BTRFS_BLOCK_GROUP_METADATA) 8125 if (flags & BTRFS_BLOCK_GROUP_METADATA)
7938 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 8126 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7939 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 8127 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7940 fs_info->avail_system_alloc_bits &= ~extra_flags; 8128 fs_info->avail_system_alloc_bits &= ~extra_flags;
8129 write_sequnlock(&fs_info->profiles_lock);
7941} 8130}
7942 8131
7943int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 8132int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
@@ -8036,6 +8225,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8036 spin_lock(&root->fs_info->block_group_cache_lock); 8225 spin_lock(&root->fs_info->block_group_cache_lock);
8037 rb_erase(&block_group->cache_node, 8226 rb_erase(&block_group->cache_node,
8038 &root->fs_info->block_group_cache_tree); 8227 &root->fs_info->block_group_cache_tree);
8228
8229 if (root->fs_info->first_logical_byte == block_group->key.objectid)
8230 root->fs_info->first_logical_byte = (u64)-1;
8039 spin_unlock(&root->fs_info->block_group_cache_lock); 8231 spin_unlock(&root->fs_info->block_group_cache_lock);
8040 8232
8041 down_write(&block_group->space_info->groups_sem); 8233 down_write(&block_group->space_info->groups_sem);
@@ -8158,7 +8350,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8158 8350
8159 if (end - start >= range->minlen) { 8351 if (end - start >= range->minlen) {
8160 if (!block_group_cache_done(cache)) { 8352 if (!block_group_cache_done(cache)) {
8161 ret = cache_block_group(cache, NULL, root, 0); 8353 ret = cache_block_group(cache, 0);
8162 if (!ret) 8354 if (!ret)
8163 wait_block_group_cache_done(cache); 8355 wait_block_group_cache_done(cache);
8164 } 8356 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1b319df29eee..f173c5af6461 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4,7 +4,6 @@
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/pagemap.h> 5#include <linux/pagemap.h>
6#include <linux/page-flags.h> 6#include <linux/page-flags.h>
7#include <linux/module.h>
8#include <linux/spinlock.h> 7#include <linux/spinlock.h>
9#include <linux/blkdev.h> 8#include <linux/blkdev.h>
10#include <linux/swap.h> 9#include <linux/swap.h>
@@ -1834,7 +1833,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1834 */ 1833 */
1835static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 1834static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1836{ 1835{
1837 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1836 u64 start = page_offset(page);
1838 u64 end = start + PAGE_CACHE_SIZE - 1; 1837 u64 end = start + PAGE_CACHE_SIZE - 1;
1839 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1838 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1840 SetPageUptodate(page); 1839 SetPageUptodate(page);
@@ -1846,7 +1845,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1846 */ 1845 */
1847static void check_page_locked(struct extent_io_tree *tree, struct page *page) 1846static void check_page_locked(struct extent_io_tree *tree, struct page *page)
1848{ 1847{
1849 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1848 u64 start = page_offset(page);
1850 u64 end = start + PAGE_CACHE_SIZE - 1; 1849 u64 end = start + PAGE_CACHE_SIZE - 1;
1851 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) 1850 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1852 unlock_page(page); 1851 unlock_page(page);
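Throughout this file the manual (u64)page->index << PAGE_CACHE_SHIFT computations give way to page_offset(), which in kernels of this era expands to essentially the same shift but returns loff_t. For illustration (not btrfs code):

#include <linux/pagemap.h>

/* equivalent of the open-coded shifts being removed, modulo the loff_t cast */
static u64 page_start(struct page *page)
{
        return page_offset(page);       /* ~ (loff_t)page->index << PAGE_CACHE_SHIFT */
}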
@@ -1895,13 +1894,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1895 if (ret) 1894 if (ret)
1896 err = ret; 1895 err = ret;
1897 1896
1898 if (did_repair) { 1897 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1899 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 1898 rec->start + rec->len - 1,
1900 rec->start + rec->len - 1, 1899 EXTENT_DAMAGED, GFP_NOFS);
1901 EXTENT_DAMAGED, GFP_NOFS); 1900 if (ret && !err)
1902 if (ret && !err) 1901 err = ret;
1903 err = ret;
1904 }
1905 1902
1906 kfree(rec); 1903 kfree(rec);
1907 return err; 1904 return err;
@@ -1932,10 +1929,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1932 u64 map_length = 0; 1929 u64 map_length = 0;
1933 u64 sector; 1930 u64 sector;
1934 struct btrfs_bio *bbio = NULL; 1931 struct btrfs_bio *bbio = NULL;
1932 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
1935 int ret; 1933 int ret;
1936 1934
1937 BUG_ON(!mirror_num); 1935 BUG_ON(!mirror_num);
1938 1936
1937 /* we can't repair anything in raid56 yet */
1938 if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
1939 return 0;
1940
1939 bio = bio_alloc(GFP_NOFS, 1); 1941 bio = bio_alloc(GFP_NOFS, 1);
1940 if (!bio) 1942 if (!bio)
1941 return -EIO; 1943 return -EIO;
@@ -1960,7 +1962,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1960 return -EIO; 1962 return -EIO;
1961 } 1963 }
1962 bio->bi_bdev = dev->bdev; 1964 bio->bi_bdev = dev->bdev;
1963 bio_add_page(bio, page, length, start-page_offset(page)); 1965 bio_add_page(bio, page, length, start - page_offset(page));
1964 btrfsic_submit_bio(WRITE_SYNC, bio); 1966 btrfsic_submit_bio(WRITE_SYNC, bio);
1965 wait_for_completion(&compl); 1967 wait_for_completion(&compl);
1966 1968
@@ -2052,6 +2054,7 @@ static int clean_io_failure(u64 start, struct page *page)
2052 failrec->failed_mirror); 2054 failrec->failed_mirror);
2053 did_repair = !ret; 2055 did_repair = !ret;
2054 } 2056 }
2057 ret = 0;
2055 } 2058 }
2056 2059
2057out: 2060out:
@@ -2293,8 +2296,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
2293 struct page *page = bvec->bv_page; 2296 struct page *page = bvec->bv_page;
2294 tree = &BTRFS_I(page->mapping->host)->io_tree; 2297 tree = &BTRFS_I(page->mapping->host)->io_tree;
2295 2298
2296 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2299 start = page_offset(page) + bvec->bv_offset;
2297 bvec->bv_offset;
2298 end = start + bvec->bv_len - 1; 2300 end = start + bvec->bv_len - 1;
2299 2301
2300 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 2302 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -2353,8 +2355,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2353 (long int)bio->bi_bdev); 2355 (long int)bio->bi_bdev);
2354 tree = &BTRFS_I(page->mapping->host)->io_tree; 2356 tree = &BTRFS_I(page->mapping->host)->io_tree;
2355 2357
2356 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2358 start = page_offset(page) + bvec->bv_offset;
2357 bvec->bv_offset;
2358 end = start + bvec->bv_len - 1; 2359 end = start + bvec->bv_len - 1;
2359 2360
2360 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 2361 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -2471,7 +2472,7 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
2471 struct extent_io_tree *tree = bio->bi_private; 2472 struct extent_io_tree *tree = bio->bi_private;
2472 u64 start; 2473 u64 start;
2473 2474
2474 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 2475 start = page_offset(page) + bvec->bv_offset;
2475 2476
2476 bio->bi_private = NULL; 2477 bio->bi_private = NULL;
2477 2478
@@ -2489,13 +2490,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
2489 return ret; 2490 return ret;
2490} 2491}
2491 2492
2492static int merge_bio(struct extent_io_tree *tree, struct page *page, 2493static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
2493 unsigned long offset, size_t size, struct bio *bio, 2494 unsigned long offset, size_t size, struct bio *bio,
2494 unsigned long bio_flags) 2495 unsigned long bio_flags)
2495{ 2496{
2496 int ret = 0; 2497 int ret = 0;
2497 if (tree->ops && tree->ops->merge_bio_hook) 2498 if (tree->ops && tree->ops->merge_bio_hook)
2498 ret = tree->ops->merge_bio_hook(page, offset, size, bio, 2499 ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
2499 bio_flags); 2500 bio_flags);
2500 BUG_ON(ret < 0); 2501 BUG_ON(ret < 0);
2501 return ret; 2502 return ret;
@@ -2530,7 +2531,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
2530 sector; 2531 sector;
2531 2532
2532 if (prev_bio_flags != bio_flags || !contig || 2533 if (prev_bio_flags != bio_flags || !contig ||
2533 merge_bio(tree, page, offset, page_size, bio, bio_flags) || 2534 merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
2534 bio_add_page(bio, page, page_size, offset) < page_size) { 2535 bio_add_page(bio, page, page_size, offset) < page_size) {
2535 ret = submit_one_bio(rw, bio, mirror_num, 2536 ret = submit_one_bio(rw, bio, mirror_num,
2536 prev_bio_flags); 2537 prev_bio_flags);
@@ -2595,7 +2596,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2595 unsigned long *bio_flags) 2596 unsigned long *bio_flags)
2596{ 2597{
2597 struct inode *inode = page->mapping->host; 2598 struct inode *inode = page->mapping->host;
2598 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2599 u64 start = page_offset(page);
2599 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2600 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2600 u64 end; 2601 u64 end;
2601 u64 cur = start; 2602 u64 cur = start;
@@ -2648,6 +2649,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2648 } 2649 }
2649 } 2650 }
2650 while (cur <= end) { 2651 while (cur <= end) {
2652 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2653
2651 if (cur >= last_byte) { 2654 if (cur >= last_byte) {
2652 char *userpage; 2655 char *userpage;
2653 struct extent_state *cached = NULL; 2656 struct extent_state *cached = NULL;
@@ -2682,7 +2685,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2682 2685
2683 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2686 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2684 cur_end = min(extent_map_end(em) - 1, end); 2687 cur_end = min(extent_map_end(em) - 1, end);
2685 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2688 iosize = ALIGN(iosize, blocksize);
2686 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2689 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2687 disk_io_size = em->block_len; 2690 disk_io_size = em->block_len;
2688 sector = em->block_start >> 9; 2691 sector = em->block_start >> 9;
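
Note: several hunks in this series swap the hand-rolled round-up "(x + blocksize - 1) & ~(blocksize - 1)" for ALIGN(x, blocksize). A quick self-contained check that the two forms agree for power-of-two block sizes; the macro below mirrors the rounding, not the kernel's exact definition:

#include <assert.h>
#include <stdint.h>

/* equivalent rounding to the kernel's ALIGN() for power-of-two 'a' */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
        uint64_t blocksize = 4096;

        for (uint64_t iosize = 1; iosize <= 3 * blocksize; iosize += 511) {
                uint64_t open_coded = (iosize + blocksize - 1) & ~(blocksize - 1);
                assert(ALIGN(iosize, blocksize) == open_coded);
        }
        return 0;
}
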
@@ -2735,26 +2738,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2735 continue; 2738 continue;
2736 } 2739 }
2737 2740
2738 ret = 0; 2741 pnr -= page->index;
2739 if (tree->ops && tree->ops->readpage_io_hook) { 2742 ret = submit_extent_page(READ, tree, page,
2740 ret = tree->ops->readpage_io_hook(page, cur,
2741 cur + iosize - 1);
2742 }
2743 if (!ret) {
2744 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2745 pnr -= page->index;
2746 ret = submit_extent_page(READ, tree, page,
2747 sector, disk_io_size, pg_offset, 2743 sector, disk_io_size, pg_offset,
2748 bdev, bio, pnr, 2744 bdev, bio, pnr,
2749 end_bio_extent_readpage, mirror_num, 2745 end_bio_extent_readpage, mirror_num,
2750 *bio_flags, 2746 *bio_flags,
2751 this_bio_flag); 2747 this_bio_flag);
2752 if (!ret) { 2748 if (!ret) {
2753 nr++; 2749 nr++;
2754 *bio_flags = this_bio_flag; 2750 *bio_flags = this_bio_flag;
2755 } 2751 } else {
2756 }
2757 if (ret) {
2758 SetPageError(page); 2752 SetPageError(page);
2759 unlock_extent(tree, cur, cur + iosize - 1); 2753 unlock_extent(tree, cur, cur + iosize - 1);
2760 } 2754 }
@@ -2806,7 +2800,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2806 struct inode *inode = page->mapping->host; 2800 struct inode *inode = page->mapping->host;
2807 struct extent_page_data *epd = data; 2801 struct extent_page_data *epd = data;
2808 struct extent_io_tree *tree = epd->tree; 2802 struct extent_io_tree *tree = epd->tree;
2809 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2803 u64 start = page_offset(page);
2810 u64 delalloc_start; 2804 u64 delalloc_start;
2811 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2805 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2812 u64 end; 2806 u64 end;
@@ -2982,7 +2976,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2982 BUG_ON(extent_map_end(em) <= cur); 2976 BUG_ON(extent_map_end(em) <= cur);
2983 BUG_ON(end < cur); 2977 BUG_ON(end < cur);
2984 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2978 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2985 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2979 iosize = ALIGN(iosize, blocksize);
2986 sector = (em->block_start + extent_offset) >> 9; 2980 sector = (em->block_start + extent_offset) >> 9;
2987 bdev = em->bdev; 2981 bdev = em->bdev;
2988 block_start = em->block_start; 2982 block_start = em->block_start;
@@ -3124,12 +3118,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3124 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3118 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3125 spin_unlock(&eb->refs_lock); 3119 spin_unlock(&eb->refs_lock);
3126 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3120 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3127 spin_lock(&fs_info->delalloc_lock); 3121 __percpu_counter_add(&fs_info->dirty_metadata_bytes,
3128 if (fs_info->dirty_metadata_bytes >= eb->len) 3122 -eb->len,
3129 fs_info->dirty_metadata_bytes -= eb->len; 3123 fs_info->dirty_metadata_batch);
3130 else
3131 WARN_ON(1);
3132 spin_unlock(&fs_info->delalloc_lock);
3133 ret = 1; 3124 ret = 1;
3134 } else { 3125 } else {
3135 spin_unlock(&eb->refs_lock); 3126 spin_unlock(&eb->refs_lock);
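
Note: dirty_metadata_bytes moves from a u64 guarded by delalloc_lock to a percpu counter updated with a batch, so most updates touch only per-cpu state. A toy userspace analogue of batched counting, purely illustrative and unrelated to the real percpu_counter implementation:

#include <stdint.h>
#include <stdio.h>

#define NCPUS 4

struct pc {
        int64_t count;           /* global approximate total */
        int64_t local[NCPUS];    /* per-"cpu" pending deltas */
        int64_t batch;
};

static void pc_add(struct pc *c, int cpu, int64_t amount)
{
        c->local[cpu] += amount;
        if (c->local[cpu] >= c->batch || c->local[cpu] <= -c->batch) {
                c->count += c->local[cpu];   /* slow path: fold into the global count */
                c->local[cpu] = 0;
        }
}

static int64_t pc_sum(const struct pc *c)
{
        int64_t sum = c->count;
        for (int i = 0; i < NCPUS; i++)
                sum += c->local[i];
        return sum;
}

int main(void)
{
        struct pc dirty = { .batch = 16 * 4096 };

        for (int i = 0; i < 1000; i++)
                pc_add(&dirty, i % NCPUS, 4096);     /* eb->len-sized updates */
        for (int i = 0; i < 1000; i++)
                pc_add(&dirty, i % NCPUS, -4096);

        printf("approx=%lld exact=%lld\n",
               (long long)dirty.count, (long long)pc_sum(&dirty));
        return 0;
}
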
@@ -3446,15 +3437,9 @@ retry:
3446 * swizzled back from swapper_space to tmpfs file 3437 * swizzled back from swapper_space to tmpfs file
3447 * mapping 3438 * mapping
3448 */ 3439 */
3449 if (tree->ops && 3440 if (!trylock_page(page)) {
3450 tree->ops->write_cache_pages_lock_hook) { 3441 flush_fn(data);
3451 tree->ops->write_cache_pages_lock_hook(page, 3442 lock_page(page);
3452 data, flush_fn);
3453 } else {
3454 if (!trylock_page(page)) {
3455 flush_fn(data);
3456 lock_page(page);
3457 }
3458 } 3443 }
3459 3444
3460 if (unlikely(page->mapping != mapping)) { 3445 if (unlikely(page->mapping != mapping)) {
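
Note: with write_cache_pages_lock_hook removed, the generic path always uses the try-lock, else flush-then-lock pattern the hook used to wrap. A small pthread sketch of the same shape; flush_fn and the mutex are stand-ins for the btrfs flush callback and the page lock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

static void flush_fn(void *data)
{
        /* stand-in for flushing queued bios before we block on the lock */
        printf("flushing pending work for %s\n", (const char *)data);
}

static void lock_page_or_flush(void *data)
{
        if (pthread_mutex_trylock(&page_lock) != 0) {
                flush_fn(data);                 /* do not sit on queued work while sleeping */
                pthread_mutex_lock(&page_lock); /* now block until the page is free */
        }
}

int main(void)
{
        lock_page_or_flush("epd");
        pthread_mutex_unlock(&page_lock);
        return 0;
}
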
@@ -3674,11 +3659,11 @@ int extent_invalidatepage(struct extent_io_tree *tree,
3674 struct page *page, unsigned long offset) 3659 struct page *page, unsigned long offset)
3675{ 3660{
3676 struct extent_state *cached_state = NULL; 3661 struct extent_state *cached_state = NULL;
3677 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 3662 u64 start = page_offset(page);
3678 u64 end = start + PAGE_CACHE_SIZE - 1; 3663 u64 end = start + PAGE_CACHE_SIZE - 1;
3679 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 3664 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
3680 3665
3681 start += (offset + blocksize - 1) & ~(blocksize - 1); 3666 start += ALIGN(offset, blocksize);
3682 if (start > end) 3667 if (start > end)
3683 return 0; 3668 return 0;
3684 3669
@@ -3700,7 +3685,7 @@ int try_release_extent_state(struct extent_map_tree *map,
3700 struct extent_io_tree *tree, struct page *page, 3685 struct extent_io_tree *tree, struct page *page,
3701 gfp_t mask) 3686 gfp_t mask)
3702{ 3687{
3703 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 3688 u64 start = page_offset(page);
3704 u64 end = start + PAGE_CACHE_SIZE - 1; 3689 u64 end = start + PAGE_CACHE_SIZE - 1;
3705 int ret = 1; 3690 int ret = 1;
3706 3691
@@ -3739,7 +3724,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
3739 gfp_t mask) 3724 gfp_t mask)
3740{ 3725{
3741 struct extent_map *em; 3726 struct extent_map *em;
3742 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 3727 u64 start = page_offset(page);
3743 u64 end = start + PAGE_CACHE_SIZE - 1; 3728 u64 end = start + PAGE_CACHE_SIZE - 1;
3744 3729
3745 if ((mask & __GFP_WAIT) && 3730 if ((mask & __GFP_WAIT) &&
@@ -3797,7 +3782,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
3797 len = last - offset; 3782 len = last - offset;
3798 if (len == 0) 3783 if (len == 0)
3799 break; 3784 break;
3800 len = (len + sectorsize - 1) & ~(sectorsize - 1); 3785 len = ALIGN(len, sectorsize);
3801 em = get_extent(inode, NULL, 0, offset, len, 0); 3786 em = get_extent(inode, NULL, 0, offset, len, 0);
3802 if (IS_ERR_OR_NULL(em)) 3787 if (IS_ERR_OR_NULL(em))
3803 return em; 3788 return em;
@@ -3995,8 +3980,6 @@ static void __free_extent_buffer(struct extent_buffer *eb)
3995 list_del(&eb->leak_list); 3980 list_del(&eb->leak_list);
3996 spin_unlock_irqrestore(&leak_lock, flags); 3981 spin_unlock_irqrestore(&leak_lock, flags);
3997#endif 3982#endif
3998 if (eb->pages && eb->pages != eb->inline_pages)
3999 kfree(eb->pages);
4000 kmem_cache_free(extent_buffer_cache, eb); 3983 kmem_cache_free(extent_buffer_cache, eb);
4001} 3984}
4002 3985
@@ -4037,19 +4020,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
4037 atomic_set(&eb->refs, 1); 4020 atomic_set(&eb->refs, 1);
4038 atomic_set(&eb->io_pages, 0); 4021 atomic_set(&eb->io_pages, 0);
4039 4022
4040 if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { 4023 /*
4041 struct page **pages; 4024 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
4042 int num_pages = (len + PAGE_CACHE_SIZE - 1) >> 4025 */
4043 PAGE_CACHE_SHIFT; 4026 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
4044 pages = kzalloc(num_pages, mask); 4027 > MAX_INLINE_EXTENT_BUFFER_SIZE);
4045 if (!pages) { 4028 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
4046 __free_extent_buffer(eb);
4047 return NULL;
4048 }
4049 eb->pages = pages;
4050 } else {
4051 eb->pages = eb->inline_pages;
4052 }
4053 4029
4054 return eb; 4030 return eb;
4055} 4031}
@@ -4180,6 +4156,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4180 4156
4181static void check_buffer_tree_ref(struct extent_buffer *eb) 4157static void check_buffer_tree_ref(struct extent_buffer *eb)
4182{ 4158{
4159 int refs;
4183 /* the ref bit is tricky. We have to make sure it is set 4160 /* the ref bit is tricky. We have to make sure it is set
4184 * if we have the buffer dirty. Otherwise the 4161 * if we have the buffer dirty. Otherwise the
4185 * code to free a buffer can end up dropping a dirty 4162 * code to free a buffer can end up dropping a dirty
@@ -4200,6 +4177,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
4200 * So bump the ref count first, then set the bit. If someone 4177 * So bump the ref count first, then set the bit. If someone
4201 * beat us to it, drop the ref we added. 4178 * beat us to it, drop the ref we added.
4202 */ 4179 */
4180 refs = atomic_read(&eb->refs);
4181 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4182 return;
4183
4203 spin_lock(&eb->refs_lock); 4184 spin_lock(&eb->refs_lock);
4204 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4185 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4205 atomic_inc(&eb->refs); 4186 atomic_inc(&eb->refs);
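
Note: the added check lets check_buffer_tree_ref return without taking refs_lock once the buffer is already pinned and the TREE_REF bit is set. A stdatomic sketch of that lock-avoiding fast path; the struct below is illustrative, not the kernel's extent_buffer:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct eb {
        atomic_int refs;
        atomic_bool tree_ref;                 /* stands in for EXTENT_BUFFER_TREE_REF */
        pthread_mutex_t refs_lock;
};

static void check_buffer_tree_ref(struct eb *eb)
{
        /* fast path added by the hunk: another ref already pins the buffer
         * and the tree-ref bit is set, so there is nothing to do */
        if (atomic_load(&eb->refs) >= 2 && atomic_load(&eb->tree_ref))
                return;

        pthread_mutex_lock(&eb->refs_lock);
        if (!atomic_exchange(&eb->tree_ref, true))
                atomic_fetch_add(&eb->refs, 1);   /* only the first setter takes the ref */
        pthread_mutex_unlock(&eb->refs_lock);
}

int main(void)
{
        struct eb eb = { .refs = 1, .refs_lock = PTHREAD_MUTEX_INITIALIZER };

        check_buffer_tree_ref(&eb);   /* slow path: sets the bit, refs -> 2 */
        check_buffer_tree_ref(&eb);   /* fast path: no lock taken */
        printf("refs=%d\n", atomic_load(&eb.refs));
        return 0;
}
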
@@ -4401,9 +4382,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4401 4382
4402void free_extent_buffer(struct extent_buffer *eb) 4383void free_extent_buffer(struct extent_buffer *eb)
4403{ 4384{
4385 int refs;
4386 int old;
4404 if (!eb) 4387 if (!eb)
4405 return; 4388 return;
4406 4389
4390 while (1) {
4391 refs = atomic_read(&eb->refs);
4392 if (refs <= 3)
4393 break;
4394 old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
4395 if (old == refs)
4396 return;
4397 }
4398
4407 spin_lock(&eb->refs_lock); 4399 spin_lock(&eb->refs_lock);
4408 if (atomic_read(&eb->refs) == 2 && 4400 if (atomic_read(&eb->refs) == 2 &&
4409 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) 4401 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
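
Note: free_extent_buffer now drops plain references with a cmpxchg loop and only falls back to the locked path once the count is low enough (<= 3) that the buffer might need releasing. A compact stdatomic sketch of that loop, with the same threshold but otherwise illustrative names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int refs;

/* returns true when the uncontended cmpxchg path handled the put */
static bool put_ref_fast(void)
{
        int old = atomic_load(&refs);

        while (old > 3) {
                /* try to move old -> old - 1; on failure 'old' is reloaded */
                if (atomic_compare_exchange_weak(&refs, &old, old - 1))
                        return true;
        }
        return false;   /* few refs left: caller must take the lock and recheck */
}

int main(void)
{
        atomic_store(&refs, 5);
        printf("fast put: %d (refs=%d)\n", put_ref_fast(), atomic_load(&refs));
        printf("fast put: %d (refs=%d)\n", put_ref_fast(), atomic_load(&refs));
        printf("fast put: %d (refs=%d)\n", put_ref_fast(), atomic_load(&refs));
        return 0;
}
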
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2eacfabd3263..6068a1985560 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -72,10 +72,9 @@ struct extent_io_ops {
72 int (*writepage_start_hook)(struct page *page, u64 start, u64 end); 72 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
73 int (*writepage_io_hook)(struct page *page, u64 start, u64 end); 73 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
74 extent_submit_bio_hook_t *submit_bio_hook; 74 extent_submit_bio_hook_t *submit_bio_hook;
75 int (*merge_bio_hook)(struct page *page, unsigned long offset, 75 int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
76 size_t size, struct bio *bio, 76 size_t size, struct bio *bio,
77 unsigned long bio_flags); 77 unsigned long bio_flags);
78 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
79 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); 78 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
80 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, 79 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
81 struct extent_state *state, int mirror); 80 struct extent_state *state, int mirror);
@@ -90,8 +89,6 @@ struct extent_io_ops {
90 struct extent_state *other); 89 struct extent_state *other);
91 void (*split_extent_hook)(struct inode *inode, 90 void (*split_extent_hook)(struct inode *inode,
92 struct extent_state *orig, u64 split); 91 struct extent_state *orig, u64 split);
93 int (*write_cache_pages_lock_hook)(struct page *page, void *data,
94 void (*flush_fn)(void *));
95}; 92};
96 93
97struct extent_io_tree { 94struct extent_io_tree {
@@ -161,8 +158,7 @@ struct extent_buffer {
161 */ 158 */
162 wait_queue_head_t read_lock_wq; 159 wait_queue_head_t read_lock_wq;
163 wait_queue_head_t lock_wq; 160 wait_queue_head_t lock_wq;
164 struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; 161 struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
165 struct page **pages;
166}; 162};
167 163
168static inline void extent_set_compress_type(unsigned long *bio_flags, 164static inline void extent_set_compress_type(unsigned long *bio_flags,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f169d6b11d7f..2834ca5768ea 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,6 +1,5 @@
1#include <linux/err.h> 1#include <linux/err.h>
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/module.h>
4#include <linux/spinlock.h> 3#include <linux/spinlock.h>
5#include <linux/hardirq.h> 4#include <linux/hardirq.h>
6#include "ctree.h" 5#include "ctree.h"
@@ -171,6 +170,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
171 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) 170 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
172 return 0; 171 return 0;
173 172
173 if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
174 test_bit(EXTENT_FLAG_LOGGING, &next->flags))
175 return 0;
176
174 if (extent_map_end(prev) == next->start && 177 if (extent_map_end(prev) == next->start &&
175 prev->flags == next->flags && 178 prev->flags == next->flags &&
176 prev->bdev == next->bdev && 179 prev->bdev == next->bdev &&
@@ -255,7 +258,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
255 if (!em) 258 if (!em)
256 goto out; 259 goto out;
257 260
258 list_move(&em->list, &tree->modified_extents); 261 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
262 list_move(&em->list, &tree->modified_extents);
259 em->generation = gen; 263 em->generation = gen;
260 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 264 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
261 em->mod_start = em->start; 265 em->mod_start = em->start;
@@ -280,6 +284,13 @@ out:
280 284
281} 285}
282 286
287void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
288{
289 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
290 if (em->in_tree)
291 try_merge_map(tree, em);
292}
293
283/** 294/**
284 * add_extent_mapping - add new extent map to the extent tree 295 * add_extent_mapping - add new extent map to the extent tree
285 * @tree: tree to insert new map in 296 * @tree: tree to insert new map in
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 922943ce29e8..c6598c89cff8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -69,6 +69,7 @@ void free_extent_map(struct extent_map *em);
69int __init extent_map_init(void); 69int __init extent_map_init(void);
70void extent_map_exit(void); 70void extent_map_exit(void);
71int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); 71int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
72void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
72struct extent_map *search_extent_mapping(struct extent_map_tree *tree, 73struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
73 u64 start, u64 len); 74 u64 start, u64 len);
74#endif 75#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index bd38cef42358..ec160202be3e 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -460,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
460 if (!contig) 460 if (!contig)
461 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 461 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
462 462
463 if (!contig && (offset >= ordered->file_offset + ordered->len || 463 if (offset >= ordered->file_offset + ordered->len ||
464 offset < ordered->file_offset)) { 464 offset < ordered->file_offset) {
465 unsigned long bytes_left; 465 unsigned long bytes_left;
466 sums->len = this_sum_bytes; 466 sums->len = this_sum_bytes;
467 this_sum_bytes = 0; 467 this_sum_bytes = 0;
@@ -684,6 +684,24 @@ out:
684 return ret; 684 return ret;
685} 685}
686 686
687static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
688 struct btrfs_sector_sum *sector_sum,
689 u64 total_bytes, u64 sectorsize)
690{
691 u64 tmp = sectorsize;
692 u64 next_sector = sector_sum->bytenr;
693 struct btrfs_sector_sum *next = sector_sum + 1;
694
695 while ((tmp + total_bytes) < sums->len) {
696 if (next_sector + sectorsize != next->bytenr)
697 break;
698 tmp += sectorsize;
699 next_sector = next->bytenr;
700 next++;
701 }
702 return tmp;
703}
704
687int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 705int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
688 struct btrfs_root *root, 706 struct btrfs_root *root,
689 struct btrfs_ordered_sum *sums) 707 struct btrfs_ordered_sum *sums)
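
Note: btrfs_sector_sum_left() above counts how many bytes of checksummed data past the current position land on physically contiguous sectors, so callers can size an item extension or insertion. A userspace rendering of the same walk over an array of byte numbers; the values are made up:

#include <stdint.h>
#include <stdio.h>

/* bytes of checksummed data, starting after 'done' bytes of the sum, whose
 * on-disk sectors are contiguous */
static uint64_t sector_sum_left(const uint64_t *bytenr, uint64_t sum_len,
                                uint64_t done, uint64_t sectorsize)
{
        uint64_t tmp = sectorsize;
        const uint64_t *cur = bytenr;

        while (tmp + done < sum_len) {
                if (cur[0] + sectorsize != cur[1])
                        break;
                tmp += sectorsize;
                cur++;
        }
        return tmp;
}

int main(void)
{
        /* three contiguous sectors, then a gap */
        uint64_t bytenr[] = { 4096, 8192, 12288, 65536 };
        uint64_t left = sector_sum_left(bytenr, 4 * 4096, 0, 4096);

        printf("contiguous bytes left: %llu\n", (unsigned long long)left);
        return 0;
}
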
@@ -789,20 +807,32 @@ again:
789 goto insert; 807 goto insert;
790 } 808 }
791 809
792 if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / 810 if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
793 csum_size) { 811 csum_size) {
794 u32 diff = (csum_offset + 1) * csum_size; 812 int extend_nr;
813 u64 tmp;
814 u32 diff;
815 u32 free_space;
795 816
796 /* 817 if (btrfs_leaf_free_space(root, leaf) <
797 * is the item big enough already? we dropped our lock 818 sizeof(struct btrfs_item) + csum_size * 2)
798 * before and need to recheck 819 goto insert;
799 */ 820
800 if (diff < btrfs_item_size_nr(leaf, path->slots[0])) 821 free_space = btrfs_leaf_free_space(root, leaf) -
801 goto csum; 822 sizeof(struct btrfs_item) - csum_size;
823 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
824 root->sectorsize);
825 tmp >>= root->fs_info->sb->s_blocksize_bits;
826 WARN_ON(tmp < 1);
827
828 extend_nr = max_t(int, 1, (int)tmp);
829 diff = (csum_offset + extend_nr) * csum_size;
830 diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size);
802 831
803 diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); 832 diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
804 if (diff != csum_size) 833 diff = min(free_space, diff);
805 goto insert; 834 diff /= csum_size;
835 diff *= csum_size;
806 836
807 btrfs_extend_item(trans, root, path, diff); 837 btrfs_extend_item(trans, root, path, diff);
808 goto csum; 838 goto csum;
@@ -812,19 +842,14 @@ insert:
812 btrfs_release_path(path); 842 btrfs_release_path(path);
813 csum_offset = 0; 843 csum_offset = 0;
814 if (found_next) { 844 if (found_next) {
815 u64 tmp = total_bytes + root->sectorsize; 845 u64 tmp;
816 u64 next_sector = sector_sum->bytenr;
817 struct btrfs_sector_sum *next = sector_sum + 1;
818 846
819 while (tmp < sums->len) { 847 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
820 if (next_sector + root->sectorsize != next->bytenr) 848 root->sectorsize);
821 break;
822 tmp += root->sectorsize;
823 next_sector = next->bytenr;
824 next++;
825 }
826 tmp = min(tmp, next_offset - file_key.offset);
827 tmp >>= root->fs_info->sb->s_blocksize_bits; 849 tmp >>= root->fs_info->sb->s_blocksize_bits;
850 tmp = min(tmp, (next_offset - file_key.offset) >>
851 root->fs_info->sb->s_blocksize_bits);
852
828 tmp = max((u64)1, tmp); 853 tmp = max((u64)1, tmp);
829 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); 854 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
830 ins_size = csum_size * tmp; 855 ins_size = csum_size * tmp;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 77061bf43edb..af1d0605a5c1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -30,11 +30,11 @@
30#include <linux/statfs.h> 30#include <linux/statfs.h>
31#include <linux/compat.h> 31#include <linux/compat.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/btrfs.h>
33#include "ctree.h" 34#include "ctree.h"
34#include "disk-io.h" 35#include "disk-io.h"
35#include "transaction.h" 36#include "transaction.h"
36#include "btrfs_inode.h" 37#include "btrfs_inode.h"
37#include "ioctl.h"
38#include "print-tree.h" 38#include "print-tree.h"
39#include "tree-log.h" 39#include "tree-log.h"
40#include "locking.h" 40#include "locking.h"
@@ -293,15 +293,24 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
293 struct btrfs_key key; 293 struct btrfs_key key;
294 struct btrfs_ioctl_defrag_range_args range; 294 struct btrfs_ioctl_defrag_range_args range;
295 int num_defrag; 295 int num_defrag;
296 int index;
297 int ret;
296 298
297 /* get the inode */ 299 /* get the inode */
298 key.objectid = defrag->root; 300 key.objectid = defrag->root;
299 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 301 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
300 key.offset = (u64)-1; 302 key.offset = (u64)-1;
303
304 index = srcu_read_lock(&fs_info->subvol_srcu);
305
301 inode_root = btrfs_read_fs_root_no_name(fs_info, &key); 306 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
302 if (IS_ERR(inode_root)) { 307 if (IS_ERR(inode_root)) {
303 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 308 ret = PTR_ERR(inode_root);
304 return PTR_ERR(inode_root); 309 goto cleanup;
310 }
311 if (btrfs_root_refs(&inode_root->root_item) == 0) {
312 ret = -ENOENT;
313 goto cleanup;
305 } 314 }
306 315
307 key.objectid = defrag->ino; 316 key.objectid = defrag->ino;
@@ -309,9 +318,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
309 key.offset = 0; 318 key.offset = 0;
310 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); 319 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
311 if (IS_ERR(inode)) { 320 if (IS_ERR(inode)) {
312 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 321 ret = PTR_ERR(inode);
313 return PTR_ERR(inode); 322 goto cleanup;
314 } 323 }
324 srcu_read_unlock(&fs_info->subvol_srcu, index);
315 325
316 /* do a chunk of defrag */ 326 /* do a chunk of defrag */
317 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 327 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
@@ -346,6 +356,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
346 356
347 iput(inode); 357 iput(inode);
348 return 0; 358 return 0;
359cleanup:
360 srcu_read_unlock(&fs_info->subvol_srcu, index);
361 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
362 return ret;
349} 363}
350 364
351/* 365/*
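
Note: the defrag worker now holds the subvol_srcu read lock across the root and inode lookups and funnels every failure through one cleanup label that drops the lock and frees the defrag record. SRCU itself is kernel-only; the sketch below only shows the acquire/goto-cleanup shape, with a mutex and a heap record standing in:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t subvol_lock = PTHREAD_MUTEX_INITIALIZER;

/* every failure after the lock is taken goes through 'cleanup', so the lock
 * is dropped and the record freed on exactly one path */
static int run_defrag_record(int root_lookup_fails, int inode_lookup_fails)
{
        void *record = malloc(64);
        int ret;

        pthread_mutex_lock(&subvol_lock);        /* stands in for srcu_read_lock() */

        if (root_lookup_fails) {
                ret = -ENOENT;
                goto cleanup;
        }
        if (inode_lookup_fails) {
                ret = -EIO;
                goto cleanup;
        }

        pthread_mutex_unlock(&subvol_lock);      /* success path drops the lock early */
        /* ... the actual defrag work would happen here ... */
        free(record);
        return 0;

cleanup:
        pthread_mutex_unlock(&subvol_lock);
        free(record);
        return ret;
}

int main(void)
{
        printf("ok=%d root-fail=%d inode-fail=%d\n",
               run_defrag_record(0, 0), run_defrag_record(1, 0),
               run_defrag_record(0, 1));
        return 0;
}
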
@@ -360,6 +374,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
360 374
361 atomic_inc(&fs_info->defrag_running); 375 atomic_inc(&fs_info->defrag_running);
362 while(1) { 376 while(1) {
377 /* Pause the auto defragger. */
378 if (test_bit(BTRFS_FS_STATE_REMOUNTING,
379 &fs_info->fs_state))
380 break;
381
363 if (!__need_auto_defrag(fs_info->tree_root)) 382 if (!__need_auto_defrag(fs_info->tree_root))
364 break; 383 break;
365 384
@@ -491,8 +510,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
491 loff_t isize = i_size_read(inode); 510 loff_t isize = i_size_read(inode);
492 511
493 start_pos = pos & ~((u64)root->sectorsize - 1); 512 start_pos = pos & ~((u64)root->sectorsize - 1);
494 num_bytes = (write_bytes + pos - start_pos + 513 num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
495 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
496 514
497 end_of_last_block = start_pos + num_bytes - 1; 515 end_of_last_block = start_pos + num_bytes - 1;
498 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 516 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
@@ -1211,7 +1229,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1211 struct extent_state *cached_state = NULL; 1229 struct extent_state *cached_state = NULL;
1212 int i; 1230 int i;
1213 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1231 unsigned long index = pos >> PAGE_CACHE_SHIFT;
1214 struct inode *inode = fdentry(file)->d_inode; 1232 struct inode *inode = file_inode(file);
1215 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 1233 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1216 int err = 0; 1234 int err = 0;
1217 int faili = 0; 1235 int faili = 0;
@@ -1298,7 +1316,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1298 struct iov_iter *i, 1316 struct iov_iter *i,
1299 loff_t pos) 1317 loff_t pos)
1300{ 1318{
1301 struct inode *inode = fdentry(file)->d_inode; 1319 struct inode *inode = file_inode(file);
1302 struct btrfs_root *root = BTRFS_I(inode)->root; 1320 struct btrfs_root *root = BTRFS_I(inode)->root;
1303 struct page **pages = NULL; 1321 struct page **pages = NULL;
1304 unsigned long first_index; 1322 unsigned long first_index;
@@ -1486,7 +1504,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1486 unsigned long nr_segs, loff_t pos) 1504 unsigned long nr_segs, loff_t pos)
1487{ 1505{
1488 struct file *file = iocb->ki_filp; 1506 struct file *file = iocb->ki_filp;
1489 struct inode *inode = fdentry(file)->d_inode; 1507 struct inode *inode = file_inode(file);
1490 struct btrfs_root *root = BTRFS_I(inode)->root; 1508 struct btrfs_root *root = BTRFS_I(inode)->root;
1491 loff_t *ppos = &iocb->ki_pos; 1509 loff_t *ppos = &iocb->ki_pos;
1492 u64 start_pos; 1510 u64 start_pos;
@@ -1530,7 +1548,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1530 * although we have opened a file as writable, we have 1548 * although we have opened a file as writable, we have
1531 * to stop this write operation to ensure FS consistency. 1549 * to stop this write operation to ensure FS consistency.
1532 */ 1550 */
1533 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 1551 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
1534 mutex_unlock(&inode->i_mutex); 1552 mutex_unlock(&inode->i_mutex);
1535 err = -EROFS; 1553 err = -EROFS;
1536 goto out; 1554 goto out;
@@ -1594,9 +1612,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1594 if (err < 0 && num_written > 0) 1612 if (err < 0 && num_written > 0)
1595 num_written = err; 1613 num_written = err;
1596 } 1614 }
1597out: 1615
1598 if (sync) 1616 if (sync)
1599 atomic_dec(&BTRFS_I(inode)->sync_writers); 1617 atomic_dec(&BTRFS_I(inode)->sync_writers);
1618out:
1600 sb_end_write(inode->i_sb); 1619 sb_end_write(inode->i_sb);
1601 current->backing_dev_info = NULL; 1620 current->backing_dev_info = NULL;
1602 return num_written ? num_written : err; 1621 return num_written ? num_written : err;
@@ -1612,7 +1631,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1612 */ 1631 */
1613 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 1632 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1614 &BTRFS_I(inode)->runtime_flags)) { 1633 &BTRFS_I(inode)->runtime_flags)) {
1615 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); 1634 struct btrfs_trans_handle *trans;
1635 struct btrfs_root *root = BTRFS_I(inode)->root;
1636
1637 /*
1638 * We need to block on a committing transaction to keep us from
 1639 * throwing an ordered operation onto the list and causing
1640 * something like sync to deadlock trying to flush out this
1641 * inode.
1642 */
1643 trans = btrfs_start_transaction(root, 0);
1644 if (IS_ERR(trans))
1645 return PTR_ERR(trans);
1646 btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
1647 btrfs_end_transaction(trans, root);
1616 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 1648 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1617 filemap_flush(inode->i_mapping); 1649 filemap_flush(inode->i_mapping);
1618 } 1650 }
@@ -1639,16 +1671,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1639 struct btrfs_root *root = BTRFS_I(inode)->root; 1671 struct btrfs_root *root = BTRFS_I(inode)->root;
1640 int ret = 0; 1672 int ret = 0;
1641 struct btrfs_trans_handle *trans; 1673 struct btrfs_trans_handle *trans;
1674 bool full_sync = 0;
1642 1675
1643 trace_btrfs_sync_file(file, datasync); 1676 trace_btrfs_sync_file(file, datasync);
1644 1677
1645 /* 1678 /*
1646 * We write the dirty pages in the range and wait until they complete 1679 * We write the dirty pages in the range and wait until they complete
1647 * out of the ->i_mutex. If so, we can flush the dirty pages by 1680 * out of the ->i_mutex. If so, we can flush the dirty pages by
1648 * multi-task, and make the performance up. 1681 * multi-task, and make the performance up. See
1682 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1649 */ 1683 */
1650 atomic_inc(&BTRFS_I(inode)->sync_writers); 1684 atomic_inc(&BTRFS_I(inode)->sync_writers);
1651 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1685 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1686 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1687 &BTRFS_I(inode)->runtime_flags))
1688 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1652 atomic_dec(&BTRFS_I(inode)->sync_writers); 1689 atomic_dec(&BTRFS_I(inode)->sync_writers);
1653 if (ret) 1690 if (ret)
1654 return ret; 1691 return ret;
@@ -1660,7 +1697,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1660 * range being left. 1697 * range being left.
1661 */ 1698 */
1662 atomic_inc(&root->log_batch); 1699 atomic_inc(&root->log_batch);
1663 btrfs_wait_ordered_range(inode, start, end - start + 1); 1700 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1701 &BTRFS_I(inode)->runtime_flags);
1702 if (full_sync)
1703 btrfs_wait_ordered_range(inode, start, end - start + 1);
1664 atomic_inc(&root->log_batch); 1704 atomic_inc(&root->log_batch);
1665 1705
1666 /* 1706 /*
@@ -1727,13 +1767,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1727 1767
1728 if (ret != BTRFS_NO_LOG_SYNC) { 1768 if (ret != BTRFS_NO_LOG_SYNC) {
1729 if (ret > 0) { 1769 if (ret > 0) {
1770 /*
1771 * If we didn't already wait for ordered extents we need
1772 * to do that now.
1773 */
1774 if (!full_sync)
1775 btrfs_wait_ordered_range(inode, start,
1776 end - start + 1);
1730 ret = btrfs_commit_transaction(trans, root); 1777 ret = btrfs_commit_transaction(trans, root);
1731 } else { 1778 } else {
1732 ret = btrfs_sync_log(trans, root); 1779 ret = btrfs_sync_log(trans, root);
1733 if (ret == 0) 1780 if (ret == 0) {
1734 ret = btrfs_end_transaction(trans, root); 1781 ret = btrfs_end_transaction(trans, root);
1735 else 1782 } else {
1783 if (!full_sync)
1784 btrfs_wait_ordered_range(inode, start,
1785 end -
1786 start + 1);
1736 ret = btrfs_commit_transaction(trans, root); 1787 ret = btrfs_commit_transaction(trans, root);
1788 }
1737 } 1789 }
1738 } else { 1790 } else {
1739 ret = btrfs_end_transaction(trans, root); 1791 ret = btrfs_end_transaction(trans, root);
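
Note: the full_sync flag decides whether fsync waits for ordered extents up front or defers that wait until it learns a transaction commit (or a failed log sync) is unavoidable. A condensed sketch of that decision flow; the helpers are stand-ins that only print what the real code would do:

#include <stdbool.h>
#include <stdio.h>

static void wait_ordered(const char *why) { printf("wait ordered (%s)\n", why); }
static void commit_transaction(void)      { printf("commit transaction\n"); }
static void sync_log(void)                { printf("sync tree log\n"); }

static void fsync_sketch(bool full_sync, bool must_commit, bool log_failed)
{
        if (full_sync)
                wait_ordered("full sync");        /* wait before logging */

        if (must_commit) {
                if (!full_sync)
                        wait_ordered("before commit");
                commit_transaction();
        } else {
                sync_log();
                if (log_failed) {
                        if (!full_sync)
                                wait_ordered("log failed");
                        commit_transaction();
                }
        }
}

int main(void)
{
        fsync_sketch(false, false, false);   /* fast path: log only, no waiting */
        fsync_sketch(false, true, false);    /* commit path waits first */
        return 0;
}
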
@@ -2087,7 +2139,7 @@ out:
2087static long btrfs_fallocate(struct file *file, int mode, 2139static long btrfs_fallocate(struct file *file, int mode,
2088 loff_t offset, loff_t len) 2140 loff_t offset, loff_t len)
2089{ 2141{
2090 struct inode *inode = file->f_path.dentry->d_inode; 2142 struct inode *inode = file_inode(file);
2091 struct extent_state *cached_state = NULL; 2143 struct extent_state *cached_state = NULL;
2092 u64 cur_offset; 2144 u64 cur_offset;
2093 u64 last_byte; 2145 u64 last_byte;
@@ -2241,6 +2293,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2241 if (lockend <= lockstart) 2293 if (lockend <= lockstart)
2242 lockend = lockstart + root->sectorsize; 2294 lockend = lockstart + root->sectorsize;
2243 2295
2296 lockend--;
2244 len = lockend - lockstart + 1; 2297 len = lockend - lockstart + 1;
2245 2298
2246 len = max_t(u64, len, root->sectorsize); 2299 len = max_t(u64, len, root->sectorsize);
@@ -2307,9 +2360,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2307 } 2360 }
2308 } 2361 }
2309 2362
2310 *offset = start; 2363 if (!test_bit(EXTENT_FLAG_PREALLOC,
2311 free_extent_map(em); 2364 &em->flags)) {
2312 break; 2365 *offset = start;
2366 free_extent_map(em);
2367 break;
2368 }
2313 } 2369 }
2314 } 2370 }
2315 2371
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 59ea2e4349c9..1f84fc09c1a8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1356,6 +1356,8 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1356 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 1356 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1357 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1357 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1358 1358
1359 max_bitmaps = max(max_bitmaps, 1);
1360
1359 BUG_ON(ctl->total_bitmaps > max_bitmaps); 1361 BUG_ON(ctl->total_bitmaps > max_bitmaps);
1360 1362
1361 /* 1363 /*
@@ -1463,10 +1465,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1463} 1465}
1464 1466
1465static struct btrfs_free_space * 1467static struct btrfs_free_space *
1466find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) 1468find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1469 unsigned long align)
1467{ 1470{
1468 struct btrfs_free_space *entry; 1471 struct btrfs_free_space *entry;
1469 struct rb_node *node; 1472 struct rb_node *node;
1473 u64 ctl_off;
1474 u64 tmp;
1475 u64 align_off;
1470 int ret; 1476 int ret;
1471 1477
1472 if (!ctl->free_space_offset.rb_node) 1478 if (!ctl->free_space_offset.rb_node)
@@ -1481,15 +1487,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
1481 if (entry->bytes < *bytes) 1487 if (entry->bytes < *bytes)
1482 continue; 1488 continue;
1483 1489
1490 /* make sure the space returned is big enough
1491 * to match our requested alignment
1492 */
1493 if (*bytes >= align) {
1494 ctl_off = entry->offset - ctl->start;
 1495 tmp = ctl_off + align - 1;
1496 do_div(tmp, align);
1497 tmp = tmp * align + ctl->start;
1498 align_off = tmp - entry->offset;
1499 } else {
1500 align_off = 0;
1501 tmp = entry->offset;
1502 }
1503
1504 if (entry->bytes < *bytes + align_off)
1505 continue;
1506
1484 if (entry->bitmap) { 1507 if (entry->bitmap) {
1485 ret = search_bitmap(ctl, entry, offset, bytes); 1508 ret = search_bitmap(ctl, entry, &tmp, bytes);
1486 if (!ret) 1509 if (!ret) {
1510 *offset = tmp;
1487 return entry; 1511 return entry;
1512 }
1488 continue; 1513 continue;
1489 } 1514 }
1490 1515
1491 *offset = entry->offset; 1516 *offset = tmp;
1492 *bytes = entry->bytes; 1517 *bytes = entry->bytes - align_off;
1493 return entry; 1518 return entry;
1494 } 1519 }
1495 1520
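
Note: the alignment math above picks the first offset inside a free-space entry that is full-stripe aligned relative to ctl->start and records how many bytes must be skipped to reach it. The same arithmetic in plain C, with do_div replaced by ordinary division and made-up values:

#include <stdint.h>
#include <stdio.h>

/* first offset >= entry_offset that is 'align'-aligned relative to ctl_start,
 * plus the gap wasted to get there */
static uint64_t aligned_offset(uint64_t entry_offset, uint64_t ctl_start,
                               uint64_t align, uint64_t *align_off)
{
        uint64_t ctl_off = entry_offset - ctl_start;
        uint64_t tmp = (ctl_off + align - 1) / align;   /* do_div() in the kernel */

        tmp = tmp * align + ctl_start;
        *align_off = tmp - entry_offset;
        return tmp;
}

int main(void)
{
        uint64_t gap;
        uint64_t off = aligned_offset(1069056, 1048576, 65536, &gap);

        /* a full-stripe-aligned offset and the bytes skipped to reach it */
        printf("offset=%llu gap=%llu\n",
               (unsigned long long)off, (unsigned long long)gap);
        return 0;
}
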
@@ -1636,10 +1661,14 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1636 } 1661 }
1637 1662
1638 /* 1663 /*
1639 * some block groups are so tiny they can't be enveloped by a bitmap, so 1664 * The original block groups from mkfs can be really small, like 8
1640 * don't even bother to create a bitmap for this 1665 * megabytes, so don't bother with a bitmap for those entries. However
1666 * some block groups can be smaller than what a bitmap would cover but
1667 * are still large enough that they could overflow the 32k memory limit,
 1668 * so still allow those block groups to have a bitmap
1669 * entry.
1641 */ 1670 */
1642 if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) 1671 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
1643 return false; 1672 return false;
1644 1673
1645 return true; 1674 return true;
@@ -1862,11 +1891,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
1862{ 1891{
1863 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1892 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1864 struct btrfs_free_space *info; 1893 struct btrfs_free_space *info;
1865 int ret = 0; 1894 int ret;
1895 bool re_search = false;
1866 1896
1867 spin_lock(&ctl->tree_lock); 1897 spin_lock(&ctl->tree_lock);
1868 1898
1869again: 1899again:
1900 ret = 0;
1870 if (!bytes) 1901 if (!bytes)
1871 goto out_lock; 1902 goto out_lock;
1872 1903
@@ -1879,17 +1910,17 @@ again:
1879 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1910 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1880 1, 0); 1911 1, 0);
1881 if (!info) { 1912 if (!info) {
1882 /* the tree logging code might be calling us before we 1913 /*
1883 * have fully loaded the free space rbtree for this 1914 * If we found a partial bit of our free space in a
1884 * block group. So it is possible the entry won't 1915 * bitmap but then couldn't find the other part this may
1885 * be in the rbtree yet at all. The caching code 1916 * be a problem, so WARN about it.
1886 * will make sure not to put it in the rbtree if
1887 * the logging code has pinned it.
1888 */ 1917 */
1918 WARN_ON(re_search);
1889 goto out_lock; 1919 goto out_lock;
1890 } 1920 }
1891 } 1921 }
1892 1922
1923 re_search = false;
1893 if (!info->bitmap) { 1924 if (!info->bitmap) {
1894 unlink_free_space(ctl, info); 1925 unlink_free_space(ctl, info);
1895 if (offset == info->offset) { 1926 if (offset == info->offset) {
@@ -1935,8 +1966,10 @@ again:
1935 } 1966 }
1936 1967
1937 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 1968 ret = remove_from_bitmap(ctl, info, &offset, &bytes);
1938 if (ret == -EAGAIN) 1969 if (ret == -EAGAIN) {
1970 re_search = true;
1939 goto again; 1971 goto again;
1972 }
1940 BUG_ON(ret); /* logic error */ 1973 BUG_ON(ret); /* logic error */
1941out_lock: 1974out_lock:
1942 spin_unlock(&ctl->tree_lock); 1975 spin_unlock(&ctl->tree_lock);
@@ -2091,9 +2124,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2091 struct btrfs_free_space *entry = NULL; 2124 struct btrfs_free_space *entry = NULL;
2092 u64 bytes_search = bytes + empty_size; 2125 u64 bytes_search = bytes + empty_size;
2093 u64 ret = 0; 2126 u64 ret = 0;
2127 u64 align_gap = 0;
2128 u64 align_gap_len = 0;
2094 2129
2095 spin_lock(&ctl->tree_lock); 2130 spin_lock(&ctl->tree_lock);
2096 entry = find_free_space(ctl, &offset, &bytes_search); 2131 entry = find_free_space(ctl, &offset, &bytes_search,
2132 block_group->full_stripe_len);
2097 if (!entry) 2133 if (!entry)
2098 goto out; 2134 goto out;
2099 2135
@@ -2103,9 +2139,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2103 if (!entry->bytes) 2139 if (!entry->bytes)
2104 free_bitmap(ctl, entry); 2140 free_bitmap(ctl, entry);
2105 } else { 2141 } else {
2142
2106 unlink_free_space(ctl, entry); 2143 unlink_free_space(ctl, entry);
2107 entry->offset += bytes; 2144 align_gap_len = offset - entry->offset;
2108 entry->bytes -= bytes; 2145 align_gap = entry->offset;
2146
2147 entry->offset = offset + bytes;
2148 WARN_ON(entry->bytes < bytes + align_gap_len);
2149
2150 entry->bytes -= bytes + align_gap_len;
2109 if (!entry->bytes) 2151 if (!entry->bytes)
2110 kmem_cache_free(btrfs_free_space_cachep, entry); 2152 kmem_cache_free(btrfs_free_space_cachep, entry);
2111 else 2153 else
@@ -2115,6 +2157,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2115out: 2157out:
2116 spin_unlock(&ctl->tree_lock); 2158 spin_unlock(&ctl->tree_lock);
2117 2159
2160 if (align_gap_len)
2161 __btrfs_add_free_space(ctl, align_gap, align_gap_len);
2118 return ret; 2162 return ret;
2119} 2163}
2120 2164
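
Note: when alignment skips the head of an entry, btrfs_find_space_for_alloc re-adds that gap as free space instead of leaking it, and only the tail past the allocation stays in the entry. Continuing the made-up numbers from the previous sketch, the resulting split looks like this:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* made-up entry, continuing the alignment example above */
        uint64_t entry_offset = 1069056, entry_bytes = 262144;
        uint64_t offset = 1114112, bytes = 131072;         /* aligned allocation */

        uint64_t align_gap = entry_offset;                 /* start of the skipped run */
        uint64_t align_gap_len = offset - entry_offset;    /* bytes skipped */
        uint64_t remaining = entry_bytes - bytes - align_gap_len;

        /* the gap is handed back as free space; only the tail stays in the entry */
        printf("gap [%llu,+%llu) alloc [%llu,+%llu) tail %llu bytes\n",
               (unsigned long long)align_gap, (unsigned long long)align_gap_len,
               (unsigned long long)offset, (unsigned long long)bytes,
               (unsigned long long)remaining);
        return 0;
}
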
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 16d9e8e191e6..d1470adca8f8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -39,12 +39,13 @@
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/ratelimit.h> 40#include <linux/ratelimit.h>
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/btrfs.h>
43#include <linux/blkdev.h>
42#include "compat.h" 44#include "compat.h"
43#include "ctree.h" 45#include "ctree.h"
44#include "disk-io.h" 46#include "disk-io.h"
45#include "transaction.h" 47#include "transaction.h"
46#include "btrfs_inode.h" 48#include "btrfs_inode.h"
47#include "ioctl.h"
48#include "print-tree.h" 49#include "print-tree.h"
49#include "ordered-data.h" 50#include "ordered-data.h"
50#include "xattr.h" 51#include "xattr.h"
@@ -54,6 +55,7 @@
54#include "locking.h" 55#include "locking.h"
55#include "free-space-cache.h" 56#include "free-space-cache.h"
56#include "inode-map.h" 57#include "inode-map.h"
58#include "backref.h"
57 59
58struct btrfs_iget_args { 60struct btrfs_iget_args {
59 u64 ino; 61 u64 ino;
@@ -88,7 +90,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
88 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, 90 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
89}; 91};
90 92
91static int btrfs_setsize(struct inode *inode, loff_t newsize); 93static int btrfs_setsize(struct inode *inode, struct iattr *attr);
92static int btrfs_truncate(struct inode *inode); 94static int btrfs_truncate(struct inode *inode);
93static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); 95static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
94static noinline int cow_file_range(struct inode *inode, 96static noinline int cow_file_range(struct inode *inode,
@@ -231,8 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
231 u64 isize = i_size_read(inode); 233 u64 isize = i_size_read(inode);
232 u64 actual_end = min(end + 1, isize); 234 u64 actual_end = min(end + 1, isize);
233 u64 inline_len = actual_end - start; 235 u64 inline_len = actual_end - start;
234 u64 aligned_end = (end + root->sectorsize - 1) & 236 u64 aligned_end = ALIGN(end, root->sectorsize);
235 ~((u64)root->sectorsize - 1);
236 u64 data_len = inline_len; 237 u64 data_len = inline_len;
237 int ret; 238 int ret;
238 239
@@ -265,6 +266,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
265 return 1; 266 return 1;
266 } 267 }
267 268
269 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
268 btrfs_delalloc_release_metadata(inode, end + 1 - start); 270 btrfs_delalloc_release_metadata(inode, end + 1 - start);
269 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 271 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
270 return 0; 272 return 0;
@@ -389,7 +391,7 @@ again:
389 * a compressed extent to 128k. 391 * a compressed extent to 128k.
390 */ 392 */
391 total_compressed = min(total_compressed, max_uncompressed); 393 total_compressed = min(total_compressed, max_uncompressed);
392 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 394 num_bytes = ALIGN(end - start + 1, blocksize);
393 num_bytes = max(blocksize, num_bytes); 395 num_bytes = max(blocksize, num_bytes);
394 total_in = 0; 396 total_in = 0;
395 ret = 0; 397 ret = 0;
@@ -488,15 +490,13 @@ cont:
488 * up to a block size boundary so the allocator does sane 490 * up to a block size boundary so the allocator does sane
489 * things 491 * things
490 */ 492 */
491 total_compressed = (total_compressed + blocksize - 1) & 493 total_compressed = ALIGN(total_compressed, blocksize);
492 ~(blocksize - 1);
493 494
494 /* 495 /*
495 * one last check to make sure the compression is really a 496 * one last check to make sure the compression is really a
496 * win, compare the page count read with the blocks on disk 497 * win, compare the page count read with the blocks on disk
497 */ 498 */
498 total_in = (total_in + PAGE_CACHE_SIZE - 1) & 499 total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
499 ~(PAGE_CACHE_SIZE - 1);
500 if (total_compressed >= total_in) { 500 if (total_compressed >= total_in) {
501 will_compress = 0; 501 will_compress = 0;
502 } else { 502 } else {
@@ -608,7 +608,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
608 if (list_empty(&async_cow->extents)) 608 if (list_empty(&async_cow->extents))
609 return 0; 609 return 0;
610 610
611 611again:
612 while (!list_empty(&async_cow->extents)) { 612 while (!list_empty(&async_cow->extents)) {
613 async_extent = list_entry(async_cow->extents.next, 613 async_extent = list_entry(async_cow->extents.next,
614 struct async_extent, list); 614 struct async_extent, list);
@@ -648,6 +648,8 @@ retry:
648 async_extent->ram_size - 1, 648 async_extent->ram_size - 1,
649 btrfs_get_extent, 649 btrfs_get_extent,
650 WB_SYNC_ALL); 650 WB_SYNC_ALL);
651 else if (ret)
652 unlock_page(async_cow->locked_page);
651 kfree(async_extent); 653 kfree(async_extent);
652 cond_resched(); 654 cond_resched();
653 continue; 655 continue;
@@ -672,6 +674,7 @@ retry:
672 674
673 if (ret) { 675 if (ret) {
674 int i; 676 int i;
677
675 for (i = 0; i < async_extent->nr_pages; i++) { 678 for (i = 0; i < async_extent->nr_pages; i++) {
676 WARN_ON(async_extent->pages[i]->mapping); 679 WARN_ON(async_extent->pages[i]->mapping);
677 page_cache_release(async_extent->pages[i]); 680 page_cache_release(async_extent->pages[i]);
@@ -679,12 +682,10 @@ retry:
679 kfree(async_extent->pages); 682 kfree(async_extent->pages);
680 async_extent->nr_pages = 0; 683 async_extent->nr_pages = 0;
681 async_extent->pages = NULL; 684 async_extent->pages = NULL;
682 unlock_extent(io_tree, async_extent->start, 685
683 async_extent->start +
684 async_extent->ram_size - 1);
685 if (ret == -ENOSPC) 686 if (ret == -ENOSPC)
686 goto retry; 687 goto retry;
687 goto out_free; /* JDM: Requeue? */ 688 goto out_free;
688 } 689 }
689 690
690 /* 691 /*
@@ -696,10 +697,13 @@ retry:
696 async_extent->ram_size - 1, 0); 697 async_extent->ram_size - 1, 0);
697 698
698 em = alloc_extent_map(); 699 em = alloc_extent_map();
699 BUG_ON(!em); /* -ENOMEM */ 700 if (!em)
701 goto out_free_reserve;
700 em->start = async_extent->start; 702 em->start = async_extent->start;
701 em->len = async_extent->ram_size; 703 em->len = async_extent->ram_size;
702 em->orig_start = em->start; 704 em->orig_start = em->start;
705 em->mod_start = em->start;
706 em->mod_len = em->len;
703 707
704 em->block_start = ins.objectid; 708 em->block_start = ins.objectid;
705 em->block_len = ins.offset; 709 em->block_len = ins.offset;
@@ -726,6 +730,9 @@ retry:
726 async_extent->ram_size - 1, 0); 730 async_extent->ram_size - 1, 0);
727 } 731 }
728 732
733 if (ret)
734 goto out_free_reserve;
735
729 ret = btrfs_add_ordered_extent_compress(inode, 736 ret = btrfs_add_ordered_extent_compress(inode,
730 async_extent->start, 737 async_extent->start,
731 ins.objectid, 738 ins.objectid,
@@ -733,7 +740,8 @@ retry:
733 ins.offset, 740 ins.offset,
734 BTRFS_ORDERED_COMPRESSED, 741 BTRFS_ORDERED_COMPRESSED,
735 async_extent->compress_type); 742 async_extent->compress_type);
736 BUG_ON(ret); /* -ENOMEM */ 743 if (ret)
744 goto out_free_reserve;
737 745
738 /* 746 /*
739 * clear dirty, set writeback and unlock the pages. 747 * clear dirty, set writeback and unlock the pages.
@@ -754,18 +762,30 @@ retry:
754 ins.objectid, 762 ins.objectid,
755 ins.offset, async_extent->pages, 763 ins.offset, async_extent->pages,
756 async_extent->nr_pages); 764 async_extent->nr_pages);
757
758 BUG_ON(ret); /* -ENOMEM */
759 alloc_hint = ins.objectid + ins.offset; 765 alloc_hint = ins.objectid + ins.offset;
760 kfree(async_extent); 766 kfree(async_extent);
767 if (ret)
768 goto out;
761 cond_resched(); 769 cond_resched();
762 } 770 }
763 ret = 0; 771 ret = 0;
764out: 772out:
765 return ret; 773 return ret;
774out_free_reserve:
775 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
766out_free: 776out_free:
777 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
778 async_extent->start,
779 async_extent->start +
780 async_extent->ram_size - 1,
781 NULL, EXTENT_CLEAR_UNLOCK_PAGE |
782 EXTENT_CLEAR_UNLOCK |
783 EXTENT_CLEAR_DELALLOC |
784 EXTENT_CLEAR_DIRTY |
785 EXTENT_SET_WRITEBACK |
786 EXTENT_END_WRITEBACK);
767 kfree(async_extent); 787 kfree(async_extent);
768 goto out; 788 goto again;
769} 789}
770 790
771static u64 get_extent_allocation_hint(struct inode *inode, u64 start, 791static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
@@ -834,7 +854,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
834 854
835 BUG_ON(btrfs_is_free_space_inode(inode)); 855 BUG_ON(btrfs_is_free_space_inode(inode));
836 856
837 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 857 num_bytes = ALIGN(end - start + 1, blocksize);
838 num_bytes = max(blocksize, num_bytes); 858 num_bytes = max(blocksize, num_bytes);
839 disk_num_bytes = num_bytes; 859 disk_num_bytes = num_bytes;
840 860
@@ -892,6 +912,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
892 em->orig_start = em->start; 912 em->orig_start = em->start;
893 ram_size = ins.offset; 913 ram_size = ins.offset;
894 em->len = ins.offset; 914 em->len = ins.offset;
915 em->mod_start = em->start;
916 em->mod_len = em->len;
895 917
896 em->block_start = ins.objectid; 918 em->block_start = ins.objectid;
897 em->block_len = ins.offset; 919 em->block_len = ins.offset;
@@ -1338,6 +1360,8 @@ out_check:
1338 em->block_start = disk_bytenr; 1360 em->block_start = disk_bytenr;
1339 em->orig_block_len = disk_num_bytes; 1361 em->orig_block_len = disk_num_bytes;
1340 em->bdev = root->fs_info->fs_devices->latest_bdev; 1362 em->bdev = root->fs_info->fs_devices->latest_bdev;
1363 em->mod_start = em->start;
1364 em->mod_len = em->len;
1341 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1365 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1342 set_bit(EXTENT_FLAG_FILLING, &em->flags); 1366 set_bit(EXTENT_FLAG_FILLING, &em->flags);
1343 em->generation = -1; 1367 em->generation = -1;
@@ -1508,14 +1532,22 @@ static void btrfs_set_bit_hook(struct inode *inode,
1508 spin_unlock(&BTRFS_I(inode)->lock); 1532 spin_unlock(&BTRFS_I(inode)->lock);
1509 } 1533 }
1510 1534
1511 spin_lock(&root->fs_info->delalloc_lock); 1535 __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1536 root->fs_info->delalloc_batch);
1537 spin_lock(&BTRFS_I(inode)->lock);
1512 BTRFS_I(inode)->delalloc_bytes += len; 1538 BTRFS_I(inode)->delalloc_bytes += len;
1513 root->fs_info->delalloc_bytes += len; 1539 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1514 if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1540 &BTRFS_I(inode)->runtime_flags)) {
1515 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1541 spin_lock(&root->fs_info->delalloc_lock);
1516 &root->fs_info->delalloc_inodes); 1542 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1543 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1544 &root->fs_info->delalloc_inodes);
1545 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1546 &BTRFS_I(inode)->runtime_flags);
1547 }
1548 spin_unlock(&root->fs_info->delalloc_lock);
1517 } 1549 }
1518 spin_unlock(&root->fs_info->delalloc_lock); 1550 spin_unlock(&BTRFS_I(inode)->lock);
1519 } 1551 }
1520} 1552}
1521 1553
@@ -1550,15 +1582,22 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1550 && do_list) 1582 && do_list)
1551 btrfs_free_reserved_data_space(inode, len); 1583 btrfs_free_reserved_data_space(inode, len);
1552 1584
1553 spin_lock(&root->fs_info->delalloc_lock); 1585 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1554 root->fs_info->delalloc_bytes -= len; 1586 root->fs_info->delalloc_batch);
1587 spin_lock(&BTRFS_I(inode)->lock);
1555 BTRFS_I(inode)->delalloc_bytes -= len; 1588 BTRFS_I(inode)->delalloc_bytes -= len;
1556
1557 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && 1589 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1558 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1590 test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1559 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1591 &BTRFS_I(inode)->runtime_flags)) {
1592 spin_lock(&root->fs_info->delalloc_lock);
1593 if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1594 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1595 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1596 &BTRFS_I(inode)->runtime_flags);
1597 }
1598 spin_unlock(&root->fs_info->delalloc_lock);
1560 } 1599 }
1561 spin_unlock(&root->fs_info->delalloc_lock); 1600 spin_unlock(&BTRFS_I(inode)->lock);
1562 } 1601 }
1563} 1602}
1564 1603
@@ -1566,7 +1605,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1566 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure 1605 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1567 * we don't create bios that span stripes or chunks 1606 * we don't create bios that span stripes or chunks
1568 */ 1607 */
1569int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 1608int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
1570 size_t size, struct bio *bio, 1609 size_t size, struct bio *bio,
1571 unsigned long bio_flags) 1610 unsigned long bio_flags)
1572{ 1611{
@@ -1581,7 +1620,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1581 1620
1582 length = bio->bi_size; 1621 length = bio->bi_size;
1583 map_length = length; 1622 map_length = length;
1584 ret = btrfs_map_block(root->fs_info, READ, logical, 1623 ret = btrfs_map_block(root->fs_info, rw, logical,
1585 &map_length, NULL, 0); 1624 &map_length, NULL, 0);
1586 /* Will always return 0 with map_multi == NULL */ 1625 /* Will always return 0 with map_multi == NULL */
1587 BUG_ON(ret < 0); 1626 BUG_ON(ret < 0);
@@ -1892,6 +1931,640 @@ out:
1892 return ret; 1931 return ret;
1893} 1932}
1894 1933
1934/* snapshot-aware defrag */
1935struct sa_defrag_extent_backref {
1936 struct rb_node node;
1937 struct old_sa_defrag_extent *old;
1938 u64 root_id;
1939 u64 inum;
1940 u64 file_pos;
1941 u64 extent_offset;
1942 u64 num_bytes;
1943 u64 generation;
1944};
1945
1946struct old_sa_defrag_extent {
1947 struct list_head list;
1948 struct new_sa_defrag_extent *new;
1949
1950 u64 extent_offset;
1951 u64 bytenr;
1952 u64 offset;
1953 u64 len;
1954 int count;
1955};
1956
1957struct new_sa_defrag_extent {
1958 struct rb_root root;
1959 struct list_head head;
1960 struct btrfs_path *path;
1961 struct inode *inode;
1962 u64 file_pos;
1963 u64 len;
1964 u64 bytenr;
1965 u64 disk_len;
1966 u8 compress_type;
1967};
1968
1969static int backref_comp(struct sa_defrag_extent_backref *b1,
1970 struct sa_defrag_extent_backref *b2)
1971{
1972 if (b1->root_id < b2->root_id)
1973 return -1;
1974 else if (b1->root_id > b2->root_id)
1975 return 1;
1976
1977 if (b1->inum < b2->inum)
1978 return -1;
1979 else if (b1->inum > b2->inum)
1980 return 1;
1981
1982 if (b1->file_pos < b2->file_pos)
1983 return -1;
1984 else if (b1->file_pos > b2->file_pos)
1985 return 1;
1986
1987 /*
1988 * [------------------------------] ===> (a range of space)
1989 * |<--->| |<---->| =============> (fs/file tree A)
1990 * |<---------------------------->| ===> (fs/file tree B)
1991 *
1992 * A range of space can refer to two file extents in one tree while
1993 * refer to only one file extent in another tree.
1994 *
 1995 * So we may process a disk offset more than one time (two extents in A)
1996 * and locate at the same extent(one extent in B), then insert two same
1997 * backrefs(both refer to the extent in B).
1998 */
1999 return 0;
2000}
2001
2002static void backref_insert(struct rb_root *root,
2003 struct sa_defrag_extent_backref *backref)
2004{
2005 struct rb_node **p = &root->rb_node;
2006 struct rb_node *parent = NULL;
2007 struct sa_defrag_extent_backref *entry;
2008 int ret;
2009
2010 while (*p) {
2011 parent = *p;
2012 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2013
2014 ret = backref_comp(backref, entry);
2015 if (ret < 0)
2016 p = &(*p)->rb_left;
2017 else
2018 p = &(*p)->rb_right;
2019 }
2020
2021 rb_link_node(&backref->node, parent, p);
2022 rb_insert_color(&backref->node, root);
2023}
2024
2025/*
 2026 * Note the backref might have changed, and in this case we just return 0.
2027 */
2028static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2029 void *ctx)
2030{
2031 struct btrfs_file_extent_item *extent;
2032 struct btrfs_fs_info *fs_info;
2033 struct old_sa_defrag_extent *old = ctx;
2034 struct new_sa_defrag_extent *new = old->new;
2035 struct btrfs_path *path = new->path;
2036 struct btrfs_key key;
2037 struct btrfs_root *root;
2038 struct sa_defrag_extent_backref *backref;
2039 struct extent_buffer *leaf;
2040 struct inode *inode = new->inode;
2041 int slot;
2042 int ret;
2043 u64 extent_offset;
2044 u64 num_bytes;
2045
2046 if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2047 inum == btrfs_ino(inode))
2048 return 0;
2049
2050 key.objectid = root_id;
2051 key.type = BTRFS_ROOT_ITEM_KEY;
2052 key.offset = (u64)-1;
2053
2054 fs_info = BTRFS_I(inode)->root->fs_info;
2055 root = btrfs_read_fs_root_no_name(fs_info, &key);
2056 if (IS_ERR(root)) {
2057 if (PTR_ERR(root) == -ENOENT)
2058 return 0;
2059 WARN_ON(1);
2060 pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
2061 inum, offset, root_id);
2062 return PTR_ERR(root);
2063 }
2064
2065 key.objectid = inum;
2066 key.type = BTRFS_EXTENT_DATA_KEY;
2067 if (offset > (u64)-1 << 32)
2068 key.offset = 0;
2069 else
2070 key.offset = offset;
2071
2072 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2073 if (ret < 0) {
2074 WARN_ON(1);
2075 return ret;
2076 }
2077
2078 while (1) {
2079 cond_resched();
2080
2081 leaf = path->nodes[0];
2082 slot = path->slots[0];
2083
2084 if (slot >= btrfs_header_nritems(leaf)) {
2085 ret = btrfs_next_leaf(root, path);
2086 if (ret < 0) {
2087 goto out;
2088 } else if (ret > 0) {
2089 ret = 0;
2090 goto out;
2091 }
2092 continue;
2093 }
2094
2095 path->slots[0]++;
2096
2097 btrfs_item_key_to_cpu(leaf, &key, slot);
2098
2099 if (key.objectid > inum)
2100 goto out;
2101
2102 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2103 continue;
2104
2105 extent = btrfs_item_ptr(leaf, slot,
2106 struct btrfs_file_extent_item);
2107
2108 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2109 continue;
2110
2111 extent_offset = btrfs_file_extent_offset(leaf, extent);
2112 if (key.offset - extent_offset != offset)
2113 continue;
2114
2115 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2116 if (extent_offset >= old->extent_offset + old->offset +
2117 old->len || extent_offset + num_bytes <=
2118 old->extent_offset + old->offset)
2119 continue;
2120
2121 break;
2122 }
2123
2124 backref = kmalloc(sizeof(*backref), GFP_NOFS);
2125 if (!backref) {
2126 ret = -ENOENT;
2127 goto out;
2128 }
2129
2130 backref->root_id = root_id;
2131 backref->inum = inum;
2132 backref->file_pos = offset + extent_offset;
2133 backref->num_bytes = num_bytes;
2134 backref->extent_offset = extent_offset;
2135 backref->generation = btrfs_file_extent_generation(leaf, extent);
2136 backref->old = old;
2137 backref_insert(&new->root, backref);
2138 old->count++;
2139out:
2140 btrfs_release_path(path);
2141 WARN_ON(ret);
2142 return ret;
2143}
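
The skip test in the loop above is a half-open interval overlap check: the candidate file extent's slice of the shared extent, [extent_offset, extent_offset + num_bytes), must intersect the slice being relinked, [old->extent_offset + old->offset, old->extent_offset + old->offset + old->len). A stand-alone sketch with made-up numbers:

/*
 * Stand-alone sketch (illustrative values only): half-open overlap test
 * used when matching a candidate file extent against the old extent's
 * sub-range.
 */
#include <stdio.h>
#include <stdint.h>

static int overlaps(uint64_t a_start, uint64_t a_len,
		    uint64_t b_start, uint64_t b_len)
{
	/* equivalent to: !(a_start >= b_end || a_end <= b_start) */
	return a_start < b_start + b_len && a_start + a_len > b_start;
}

int main(void)
{
	/* candidate covers [8192, 16384) of the shared extent, the old
	 * range of interest is [0, 4096): no overlap, so it is skipped */
	printf("%d\n", overlaps(8192, 8192, 0, 4096));   /* 0 */
	/* candidate [0, 8192) vs old [4096, 8192): overlaps, so recorded */
	printf("%d\n", overlaps(0, 8192, 4096, 4096));   /* 1 */
	return 0;
}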
2144
2145static noinline bool record_extent_backrefs(struct btrfs_path *path,
2146 struct new_sa_defrag_extent *new)
2147{
2148 struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2149 struct old_sa_defrag_extent *old, *tmp;
2150 int ret;
2151
2152 new->path = path;
2153
2154 list_for_each_entry_safe(old, tmp, &new->head, list) {
2155 ret = iterate_inodes_from_logical(old->bytenr, fs_info,
2156 path, record_one_backref,
2157 old);
2158 BUG_ON(ret < 0 && ret != -ENOENT);
2159
2160 /* no backref to be processed for this extent */
2161 if (!old->count) {
2162 list_del(&old->list);
2163 kfree(old);
2164 }
2165 }
2166
2167 if (list_empty(&new->head))
2168 return false;
2169
2170 return true;
2171}
2172
2173static int relink_is_mergable(struct extent_buffer *leaf,
2174 struct btrfs_file_extent_item *fi,
2175 u64 disk_bytenr)
2176{
2177 if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr)
2178 return 0;
2179
2180 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2181 return 0;
2182
2183 if (btrfs_file_extent_compression(leaf, fi) ||
2184 btrfs_file_extent_encryption(leaf, fi) ||
2185 btrfs_file_extent_other_encoding(leaf, fi))
2186 return 0;
2187
2188 return 1;
2189}
2190
2191/*
2192 * Note the backref might have changed, and in this case we just return 0.
2193 */
2194static noinline int relink_extent_backref(struct btrfs_path *path,
2195 struct sa_defrag_extent_backref *prev,
2196 struct sa_defrag_extent_backref *backref)
2197{
2198 struct btrfs_file_extent_item *extent;
2199 struct btrfs_file_extent_item *item;
2200 struct btrfs_ordered_extent *ordered;
2201 struct btrfs_trans_handle *trans;
2202 struct btrfs_fs_info *fs_info;
2203 struct btrfs_root *root;
2204 struct btrfs_key key;
2205 struct extent_buffer *leaf;
2206 struct old_sa_defrag_extent *old = backref->old;
2207 struct new_sa_defrag_extent *new = old->new;
2208 struct inode *src_inode = new->inode;
2209 struct inode *inode;
2210 struct extent_state *cached = NULL;
2211 int ret = 0;
2212 u64 start;
2213 u64 len;
2214 u64 lock_start;
2215 u64 lock_end;
2216 bool merge = false;
2217 int index;
2218
2219 if (prev && prev->root_id == backref->root_id &&
2220 prev->inum == backref->inum &&
2221 prev->file_pos + prev->num_bytes == backref->file_pos)
2222 merge = true;
2223
2224 /* step 1: get root */
2225 key.objectid = backref->root_id;
2226 key.type = BTRFS_ROOT_ITEM_KEY;
2227 key.offset = (u64)-1;
2228
2229 fs_info = BTRFS_I(src_inode)->root->fs_info;
2230 index = srcu_read_lock(&fs_info->subvol_srcu);
2231
2232 root = btrfs_read_fs_root_no_name(fs_info, &key);
2233 if (IS_ERR(root)) {
2234 srcu_read_unlock(&fs_info->subvol_srcu, index);
2235 if (PTR_ERR(root) == -ENOENT)
2236 return 0;
2237 return PTR_ERR(root);
2238 }
2239 if (btrfs_root_refs(&root->root_item) == 0) {
2240 srcu_read_unlock(&fs_info->subvol_srcu, index);
2241 /* treat ENOENT as 0 */
2242 return 0;
2243 }
2244
2245 /* step 2: get inode */
2246 key.objectid = backref->inum;
2247 key.type = BTRFS_INODE_ITEM_KEY;
2248 key.offset = 0;
2249
2250 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2251 if (IS_ERR(inode)) {
2252 srcu_read_unlock(&fs_info->subvol_srcu, index);
2253 return 0;
2254 }
2255
2256 srcu_read_unlock(&fs_info->subvol_srcu, index);
2257
2258 /* step 3: relink backref */
2259 lock_start = backref->file_pos;
2260 lock_end = backref->file_pos + backref->num_bytes - 1;
2261 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2262 0, &cached);
2263
2264 ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2265 if (ordered) {
2266 btrfs_put_ordered_extent(ordered);
2267 goto out_unlock;
2268 }
2269
2270 trans = btrfs_join_transaction(root);
2271 if (IS_ERR(trans)) {
2272 ret = PTR_ERR(trans);
2273 goto out_unlock;
2274 }
2275
2276 key.objectid = backref->inum;
2277 key.type = BTRFS_EXTENT_DATA_KEY;
2278 key.offset = backref->file_pos;
2279
2280 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2281 if (ret < 0) {
2282 goto out_free_path;
2283 } else if (ret > 0) {
2284 ret = 0;
2285 goto out_free_path;
2286 }
2287
2288 extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2289 struct btrfs_file_extent_item);
2290
2291 if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2292 backref->generation)
2293 goto out_free_path;
2294
2295 btrfs_release_path(path);
2296
2297 start = backref->file_pos;
2298 if (backref->extent_offset < old->extent_offset + old->offset)
2299 start += old->extent_offset + old->offset -
2300 backref->extent_offset;
2301
2302 len = min(backref->extent_offset + backref->num_bytes,
2303 old->extent_offset + old->offset + old->len);
2304 len -= max(backref->extent_offset, old->extent_offset + old->offset);
2305
2306 ret = btrfs_drop_extents(trans, root, inode, start,
2307 start + len, 1);
2308 if (ret)
2309 goto out_free_path;
2310again:
2311 key.objectid = btrfs_ino(inode);
2312 key.type = BTRFS_EXTENT_DATA_KEY;
2313 key.offset = start;
2314
2315 if (merge) {
2316 struct btrfs_file_extent_item *fi;
2317 u64 extent_len;
2318 struct btrfs_key found_key;
2319
2320 ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
2321 if (ret < 0)
2322 goto out_free_path;
2323
2324 path->slots[0]--;
2325 leaf = path->nodes[0];
2326 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2327
2328 fi = btrfs_item_ptr(leaf, path->slots[0],
2329 struct btrfs_file_extent_item);
2330 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2331
2332 if (relink_is_mergable(leaf, fi, new->bytenr) &&
2333 extent_len + found_key.offset == start) {
2334 btrfs_set_file_extent_num_bytes(leaf, fi,
2335 extent_len + len);
2336 btrfs_mark_buffer_dirty(leaf);
2337 inode_add_bytes(inode, len);
2338
2339 ret = 1;
2340 goto out_free_path;
2341 } else {
2342 merge = false;
2343 btrfs_release_path(path);
2344 goto again;
2345 }
2346 }
2347
2348 ret = btrfs_insert_empty_item(trans, root, path, &key,
2349 sizeof(*extent));
2350 if (ret) {
2351 btrfs_abort_transaction(trans, root, ret);
2352 goto out_free_path;
2353 }
2354
2355 leaf = path->nodes[0];
2356 item = btrfs_item_ptr(leaf, path->slots[0],
2357 struct btrfs_file_extent_item);
2358 btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2359 btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2360 btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2361 btrfs_set_file_extent_num_bytes(leaf, item, len);
2362 btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2363 btrfs_set_file_extent_generation(leaf, item, trans->transid);
2364 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2365 btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2366 btrfs_set_file_extent_encryption(leaf, item, 0);
2367 btrfs_set_file_extent_other_encoding(leaf, item, 0);
2368
2369 btrfs_mark_buffer_dirty(leaf);
2370 inode_add_bytes(inode, len);
2371
2372 ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2373 new->disk_len, 0,
2374 backref->root_id, backref->inum,
2375 new->file_pos, 0); /* start - extent_offset */
2376 if (ret) {
2377 btrfs_abort_transaction(trans, root, ret);
2378 goto out_free_path;
2379 }
2380
2381 ret = 1;
2382out_free_path:
2383 btrfs_release_path(path);
2384 btrfs_end_transaction(trans, root);
2385out_unlock:
2386 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2387 &cached, GFP_NOFS);
2388 iput(inode);
2389 return ret;
2390}
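
A backref can only be merged with the previously relinked one when it belongs to the same root and inode and starts exactly where the previous one ends (prev->file_pos + prev->num_bytes == backref->file_pos); relink_is_mergable() then additionally requires the same disk bytenr, a regular extent, and no compression or other encoding. A stand-alone sketch of the adjacency part, with invented values:

/*
 * Stand-alone sketch (illustrative values only): the adjacency test that
 * lets a relinked backref be merged into the previously relinked file
 * extent item instead of inserting a new one.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct ref { uint64_t root_id, inum, file_pos, num_bytes; };

static bool can_merge(const struct ref *prev, const struct ref *cur)
{
	return prev &&
	       prev->root_id == cur->root_id &&
	       prev->inum == cur->inum &&
	       prev->file_pos + prev->num_bytes == cur->file_pos;
}

int main(void)
{
	struct ref a = { .root_id = 5, .inum = 257, .file_pos = 0,     .num_bytes = 4096 };
	struct ref b = { .root_id = 5, .inum = 257, .file_pos = 4096,  .num_bytes = 4096 };
	struct ref c = { .root_id = 5, .inum = 257, .file_pos = 16384, .num_bytes = 4096 };

	printf("a+b mergeable: %d\n", can_merge(&a, &b)); /* 1: contiguous */
	printf("b+c mergeable: %d\n", can_merge(&b, &c)); /* 0: gap at 8192 */
	return 0;
}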
2391
2392static void relink_file_extents(struct new_sa_defrag_extent *new)
2393{
2394 struct btrfs_path *path;
2395 struct old_sa_defrag_extent *old, *tmp;
2396 struct sa_defrag_extent_backref *backref;
2397 struct sa_defrag_extent_backref *prev = NULL;
2398 struct inode *inode;
2399 struct btrfs_root *root;
2400 struct rb_node *node;
2401 int ret;
2402
2403 inode = new->inode;
2404 root = BTRFS_I(inode)->root;
2405
2406 path = btrfs_alloc_path();
2407 if (!path)
2408 return;
2409
2410 if (!record_extent_backrefs(path, new)) {
2411 btrfs_free_path(path);
2412 goto out;
2413 }
2414 btrfs_release_path(path);
2415
2416 while (1) {
2417 node = rb_first(&new->root);
2418 if (!node)
2419 break;
2420 rb_erase(node, &new->root);
2421
2422 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2423
2424 ret = relink_extent_backref(path, prev, backref);
2425 WARN_ON(ret < 0);
2426
2427 kfree(prev);
2428
2429 if (ret == 1)
2430 prev = backref;
2431 else
2432 prev = NULL;
2433 cond_resched();
2434 }
2435 kfree(prev);
2436
2437 btrfs_free_path(path);
2438
2439 list_for_each_entry_safe(old, tmp, &new->head, list) {
2440 list_del(&old->list);
2441 kfree(old);
2442 }
2443out:
2444 atomic_dec(&root->fs_info->defrag_running);
2445 wake_up(&root->fs_info->transaction_wait);
2446
2447 kfree(new);
2448}
2449
2450static struct new_sa_defrag_extent *
2451record_old_file_extents(struct inode *inode,
2452 struct btrfs_ordered_extent *ordered)
2453{
2454 struct btrfs_root *root = BTRFS_I(inode)->root;
2455 struct btrfs_path *path;
2456 struct btrfs_key key;
2457 struct old_sa_defrag_extent *old, *tmp;
2458 struct new_sa_defrag_extent *new;
2459 int ret;
2460
2461 new = kmalloc(sizeof(*new), GFP_NOFS);
2462 if (!new)
2463 return NULL;
2464
2465 new->inode = inode;
2466 new->file_pos = ordered->file_offset;
2467 new->len = ordered->len;
2468 new->bytenr = ordered->start;
2469 new->disk_len = ordered->disk_len;
2470 new->compress_type = ordered->compress_type;
2471 new->root = RB_ROOT;
2472 INIT_LIST_HEAD(&new->head);
2473
2474 path = btrfs_alloc_path();
2475 if (!path)
2476 goto out_kfree;
2477
2478 key.objectid = btrfs_ino(inode);
2479 key.type = BTRFS_EXTENT_DATA_KEY;
2480 key.offset = new->file_pos;
2481
2482 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2483 if (ret < 0)
2484 goto out_free_path;
2485 if (ret > 0 && path->slots[0] > 0)
2486 path->slots[0]--;
2487
2488 /* find out all the old extents for the file range */
2489 while (1) {
2490 struct btrfs_file_extent_item *extent;
2491 struct extent_buffer *l;
2492 int slot;
2493 u64 num_bytes;
2494 u64 offset;
2495 u64 end;
2496 u64 disk_bytenr;
2497 u64 extent_offset;
2498
2499 l = path->nodes[0];
2500 slot = path->slots[0];
2501
2502 if (slot >= btrfs_header_nritems(l)) {
2503 ret = btrfs_next_leaf(root, path);
2504 if (ret < 0)
2505 goto out_free_list;
2506 else if (ret > 0)
2507 break;
2508 continue;
2509 }
2510
2511 btrfs_item_key_to_cpu(l, &key, slot);
2512
2513 if (key.objectid != btrfs_ino(inode))
2514 break;
2515 if (key.type != BTRFS_EXTENT_DATA_KEY)
2516 break;
2517 if (key.offset >= new->file_pos + new->len)
2518 break;
2519
2520 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2521
2522 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2523 if (key.offset + num_bytes < new->file_pos)
2524 goto next;
2525
2526 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2527 if (!disk_bytenr)
2528 goto next;
2529
2530 extent_offset = btrfs_file_extent_offset(l, extent);
2531
2532 old = kmalloc(sizeof(*old), GFP_NOFS);
2533 if (!old)
2534 goto out_free_list;
2535
2536 offset = max(new->file_pos, key.offset);
2537 end = min(new->file_pos + new->len, key.offset + num_bytes);
2538
2539 old->bytenr = disk_bytenr;
2540 old->extent_offset = extent_offset;
2541 old->offset = offset - key.offset;
2542 old->len = end - offset;
2543 old->new = new;
2544 old->count = 0;
2545 list_add_tail(&old->list, &new->head);
2546next:
2547 path->slots[0]++;
2548 cond_resched();
2549 }
2550
2551 btrfs_free_path(path);
2552 atomic_inc(&root->fs_info->defrag_running);
2553
2554 return new;
2555
2556out_free_list:
2557 list_for_each_entry_safe(old, tmp, &new->head, list) {
2558 list_del(&old->list);
2559 kfree(old);
2560 }
2561out_free_path:
2562 btrfs_free_path(path);
2563out_kfree:
2564 kfree(new);
2565 return NULL;
2566}
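
record_old_file_extents() keeps, for each old extent, only the part that overlaps the freshly written range: offset and end are clamped with max()/min() and stored relative to the extent's key offset. A stand-alone worked example with made-up numbers:

/*
 * Stand-alone sketch (illustrative values only): clamping an old file
 * extent to the part that overlaps the newly written (defragged) range.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t new_file_pos = 8192, new_len = 16384;	/* defragged range */
	uint64_t key_offset = 4096, num_bytes = 8192;	/* old extent in the file */

	uint64_t offset = new_file_pos > key_offset ? new_file_pos : key_offset;
	uint64_t end = new_file_pos + new_len < key_offset + num_bytes ?
		       new_file_pos + new_len : key_offset + num_bytes;

	/* the old record keeps only the overlapping part */
	printf("old->offset = %llu\n", (unsigned long long)(offset - key_offset)); /* 4096 */
	printf("old->len    = %llu\n", (unsigned long long)(end - offset));        /* 4096 */
	return 0;
}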
2567
1895/* 2568/*
1896 * helper function for btrfs_finish_ordered_io, this 2569 * helper function for btrfs_finish_ordered_io, this
1897 * just reads in some of the csum leaves to prime them into ram 2570 * just reads in some of the csum leaves to prime them into ram
@@ -1909,6 +2582,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1909 struct btrfs_trans_handle *trans = NULL; 2582 struct btrfs_trans_handle *trans = NULL;
1910 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2583 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1911 struct extent_state *cached_state = NULL; 2584 struct extent_state *cached_state = NULL;
2585 struct new_sa_defrag_extent *new = NULL;
1912 int compress_type = 0; 2586 int compress_type = 0;
1913 int ret; 2587 int ret;
1914 bool nolock; 2588 bool nolock;
@@ -1943,6 +2617,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1943 ordered_extent->file_offset + ordered_extent->len - 1, 2617 ordered_extent->file_offset + ordered_extent->len - 1,
1944 0, &cached_state); 2618 0, &cached_state);
1945 2619
2620 ret = test_range_bit(io_tree, ordered_extent->file_offset,
2621 ordered_extent->file_offset + ordered_extent->len - 1,
2622 EXTENT_DEFRAG, 1, cached_state);
2623 if (ret) {
2624 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2625 if (last_snapshot >= BTRFS_I(inode)->generation)
2626 /* the inode is shared */
2627 new = record_old_file_extents(inode, ordered_extent);
2628
2629 clear_extent_bit(io_tree, ordered_extent->file_offset,
2630 ordered_extent->file_offset + ordered_extent->len - 1,
2631 EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2632 }
2633
1946 if (nolock) 2634 if (nolock)
1947 trans = btrfs_join_transaction_nolock(root); 2635 trans = btrfs_join_transaction_nolock(root);
1948 else 2636 else
@@ -2001,17 +2689,33 @@ out:
2001 if (trans) 2689 if (trans)
2002 btrfs_end_transaction(trans, root); 2690 btrfs_end_transaction(trans, root);
2003 2691
2004 if (ret) 2692 if (ret) {
2005 clear_extent_uptodate(io_tree, ordered_extent->file_offset, 2693 clear_extent_uptodate(io_tree, ordered_extent->file_offset,
2006 ordered_extent->file_offset + 2694 ordered_extent->file_offset +
2007 ordered_extent->len - 1, NULL, GFP_NOFS); 2695 ordered_extent->len - 1, NULL, GFP_NOFS);
2008 2696
2697 /*
2698 * If the ordered extent had an IOERR or something else went
2699 * wrong we need to return the space for this ordered extent
2700 * back to the allocator.
2701 */
2702 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2703 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2704 btrfs_free_reserved_extent(root, ordered_extent->start,
2705 ordered_extent->disk_len);
2706 }
2707
2708
2009 /* 2709 /*
2010 * This needs to be done to make sure anybody waiting knows we are done 2710 * This needs to be done to make sure anybody waiting knows we are done
2011 * updating everything for this ordered extent. 2711 * updating everything for this ordered extent.
2012 */ 2712 */
2013 btrfs_remove_ordered_extent(inode, ordered_extent); 2713 btrfs_remove_ordered_extent(inode, ordered_extent);
2014 2714
2715 /* for snapshot-aware defrag */
2716 if (new)
2717 relink_file_extents(new);
2718
2015 /* once for us */ 2719 /* once for us */
2016 btrfs_put_ordered_extent(ordered_extent); 2720 btrfs_put_ordered_extent(ordered_extent);
2017 /* once for the tree */ 2721 /* once for the tree */
@@ -2062,7 +2766,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2062static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 2766static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
2063 struct extent_state *state, int mirror) 2767 struct extent_state *state, int mirror)
2064{ 2768{
2065 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); 2769 size_t offset = start - page_offset(page);
2066 struct inode *inode = page->mapping->host; 2770 struct inode *inode = page->mapping->host;
2067 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2771 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2068 char *kaddr; 2772 char *kaddr;
@@ -2167,11 +2871,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2167 } 2871 }
2168} 2872}
2169 2873
2170enum btrfs_orphan_cleanup_state {
2171 ORPHAN_CLEANUP_STARTED = 1,
2172 ORPHAN_CLEANUP_DONE = 2,
2173};
2174
2175/* 2874/*
2176 * This is called in transaction commit time. If there are no orphan 2875 * This is called in transaction commit time. If there are no orphan
2177 * files in the subvolume, it removes orphan item and frees block_rsv 2876 * files in the subvolume, it removes orphan item and frees block_rsv
@@ -2469,6 +3168,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2469 */ 3168 */
2470 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3169 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2471 &BTRFS_I(inode)->runtime_flags); 3170 &BTRFS_I(inode)->runtime_flags);
3171 atomic_inc(&root->orphan_inodes);
2472 3172
2473 /* if we have links, this was a truncate, lets do that */ 3173 /* if we have links, this was a truncate, lets do that */
2474 if (inode->i_nlink) { 3174 if (inode->i_nlink) {
@@ -2478,7 +3178,21 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2478 continue; 3178 continue;
2479 } 3179 }
2480 nr_truncate++; 3180 nr_truncate++;
3181
3182 /* 1 for the orphan item deletion. */
3183 trans = btrfs_start_transaction(root, 1);
3184 if (IS_ERR(trans)) {
3185 ret = PTR_ERR(trans);
3186 goto out;
3187 }
3188 ret = btrfs_orphan_add(trans, inode);
3189 btrfs_end_transaction(trans, root);
3190 if (ret)
3191 goto out;
3192
2481 ret = btrfs_truncate(inode); 3193 ret = btrfs_truncate(inode);
3194 if (ret)
3195 btrfs_orphan_del(NULL, inode);
2482 } else { 3196 } else {
2483 nr_unlink++; 3197 nr_unlink++;
2484 } 3198 }
@@ -2697,34 +3411,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2697 struct btrfs_inode_item *item, 3411 struct btrfs_inode_item *item,
2698 struct inode *inode) 3412 struct inode *inode)
2699{ 3413{
2700 btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); 3414 struct btrfs_map_token token;
2701 btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
2702 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
2703 btrfs_set_inode_mode(leaf, item, inode->i_mode);
2704 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2705 3415
2706 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), 3416 btrfs_init_map_token(&token);
2707 inode->i_atime.tv_sec);
2708 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2709 inode->i_atime.tv_nsec);
2710 3417
2711 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), 3418 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
2712 inode->i_mtime.tv_sec); 3419 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
2713 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), 3420 btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
2714 inode->i_mtime.tv_nsec); 3421 &token);
3422 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3423 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
2715 3424
2716 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), 3425 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
2717 inode->i_ctime.tv_sec); 3426 inode->i_atime.tv_sec, &token);
2718 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), 3427 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
2719 inode->i_ctime.tv_nsec); 3428 inode->i_atime.tv_nsec, &token);
2720 3429
2721 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 3430 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
2722 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 3431 inode->i_mtime.tv_sec, &token);
2723 btrfs_set_inode_sequence(leaf, item, inode->i_version); 3432 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
2724 btrfs_set_inode_transid(leaf, item, trans->transid); 3433 inode->i_mtime.tv_nsec, &token);
2725 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 3434
2726 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 3435 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
2727 btrfs_set_inode_block_group(leaf, item, 0); 3436 inode->i_ctime.tv_sec, &token);
3437 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3438 inode->i_ctime.tv_nsec, &token);
3439
3440 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3441 &token);
3442 btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3443 &token);
3444 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3445 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3446 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3447 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3448 btrfs_set_token_inode_block_group(leaf, item, 0, &token);
2728} 3449}
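
fill_inode_item() now routes every field write through a btrfs_map_token so that consecutive setters on the same leaf can reuse one mapping instead of re-deriving it per call. The stand-alone sketch below illustrates the caching idea only; the struct and helper names are invented for illustration and are not the btrfs API:

/*
 * Stand-alone sketch: cache the translation done for the previous field
 * write so a burst of writes to nearby offsets skips the lookup.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define PAGE_SIZE 4096

struct buf { unsigned char data[4 * PAGE_SIZE]; };

struct map_token {
	struct buf *buf;
	size_t page;		/* which page is currently "mapped" */
	int valid;
	int lookups;		/* counts how often we had to re-map */
};

static void write_u64(struct map_token *tok, struct buf *b,
		      size_t off, uint64_t val)
{
	size_t page = off / PAGE_SIZE;

	if (!tok->valid || tok->buf != b || tok->page != page) {
		tok->buf = b;
		tok->page = page;
		tok->valid = 1;
		tok->lookups++;	/* this is the work the token avoids */
	}
	memcpy(&b->data[off], &val, sizeof(val));
}

int main(void)
{
	struct buf leaf = {{ 0 }};
	struct map_token tok = { 0 };

	/* ten nearby fields: one "map" operation instead of ten */
	for (int i = 0; i < 10; i++)
		write_u64(&tok, &leaf, 512 + 8 * i, (uint64_t)i);

	printf("lookups: %d\n", tok.lookups);	/* prints 1 */
	return 0;
}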
2729 3450
2730/* 3451/*
@@ -3292,7 +4013,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3292 u64 extent_num_bytes = 0; 4013 u64 extent_num_bytes = 0;
3293 u64 extent_offset = 0; 4014 u64 extent_offset = 0;
3294 u64 item_end = 0; 4015 u64 item_end = 0;
3295 u64 mask = root->sectorsize - 1;
3296 u32 found_type = (u8)-1; 4016 u32 found_type = (u8)-1;
3297 int found_extent; 4017 int found_extent;
3298 int del_item; 4018 int del_item;
@@ -3316,7 +4036,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3316 * extent just the way it is. 4036 * extent just the way it is.
3317 */ 4037 */
3318 if (root->ref_cows || root == root->fs_info->tree_root) 4038 if (root->ref_cows || root == root->fs_info->tree_root)
3319 btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); 4039 btrfs_drop_extent_cache(inode, ALIGN(new_size,
4040 root->sectorsize), (u64)-1, 0);
3320 4041
3321 /* 4042 /*
3322 * This function is also used to drop the items in the log tree before 4043 * This function is also used to drop the items in the log tree before
@@ -3395,10 +4116,9 @@ search_again:
3395 if (!del_item) { 4116 if (!del_item) {
3396 u64 orig_num_bytes = 4117 u64 orig_num_bytes =
3397 btrfs_file_extent_num_bytes(leaf, fi); 4118 btrfs_file_extent_num_bytes(leaf, fi);
3398 extent_num_bytes = new_size - 4119 extent_num_bytes = ALIGN(new_size -
3399 found_key.offset + root->sectorsize - 1; 4120 found_key.offset,
3400 extent_num_bytes = extent_num_bytes & 4121 root->sectorsize);
3401 ~((u64)root->sectorsize - 1);
3402 btrfs_set_file_extent_num_bytes(leaf, fi, 4122 btrfs_set_file_extent_num_bytes(leaf, fi,
3403 extent_num_bytes); 4123 extent_num_bytes);
3404 num_dec = (orig_num_bytes - 4124 num_dec = (orig_num_bytes -
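
Several hunks in this patch replace the open-coded "(x + sectorsize - 1) & ~(sectorsize - 1)" rounding with ALIGN(x, sectorsize); for a power-of-two alignment the two are identical. A stand-alone check:

/*
 * Stand-alone sketch: ALIGN(x, a) for power-of-two 'a' is exactly the
 * open-coded mask arithmetic these hunks remove.
 */
#include <stdio.h>
#include <stdint.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t sectorsize = 4096;

	printf("%llu\n", (unsigned long long)ALIGN_UP(1, sectorsize));     /* 4096 */
	printf("%llu\n", (unsigned long long)ALIGN_UP(4096, sectorsize));  /* 4096 */
	printf("%llu\n", (unsigned long long)ALIGN_UP(4097, sectorsize));  /* 8192 */
	return 0;
}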
@@ -3634,9 +4354,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3634 struct extent_map *em = NULL; 4354 struct extent_map *em = NULL;
3635 struct extent_state *cached_state = NULL; 4355 struct extent_state *cached_state = NULL;
3636 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4356 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3637 u64 mask = root->sectorsize - 1; 4357 u64 hole_start = ALIGN(oldsize, root->sectorsize);
3638 u64 hole_start = (oldsize + mask) & ~mask; 4358 u64 block_end = ALIGN(size, root->sectorsize);
3639 u64 block_end = (size + mask) & ~mask;
3640 u64 last_byte; 4359 u64 last_byte;
3641 u64 cur_offset; 4360 u64 cur_offset;
3642 u64 hole_size; 4361 u64 hole_size;
@@ -3665,10 +4384,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3665 block_end - cur_offset, 0); 4384 block_end - cur_offset, 0);
3666 if (IS_ERR(em)) { 4385 if (IS_ERR(em)) {
3667 err = PTR_ERR(em); 4386 err = PTR_ERR(em);
4387 em = NULL;
3668 break; 4388 break;
3669 } 4389 }
3670 last_byte = min(extent_map_end(em), block_end); 4390 last_byte = min(extent_map_end(em), block_end);
3671 last_byte = (last_byte + mask) & ~mask; 4391 last_byte = ALIGN(last_byte, root->sectorsize);
3672 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 4392 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3673 struct extent_map *hole_em; 4393 struct extent_map *hole_em;
3674 hole_size = last_byte - cur_offset; 4394 hole_size = last_byte - cur_offset;
@@ -3748,16 +4468,27 @@ next:
3748 return err; 4468 return err;
3749} 4469}
3750 4470
3751static int btrfs_setsize(struct inode *inode, loff_t newsize) 4471static int btrfs_setsize(struct inode *inode, struct iattr *attr)
3752{ 4472{
3753 struct btrfs_root *root = BTRFS_I(inode)->root; 4473 struct btrfs_root *root = BTRFS_I(inode)->root;
3754 struct btrfs_trans_handle *trans; 4474 struct btrfs_trans_handle *trans;
3755 loff_t oldsize = i_size_read(inode); 4475 loff_t oldsize = i_size_read(inode);
4476 loff_t newsize = attr->ia_size;
4477 int mask = attr->ia_valid;
3756 int ret; 4478 int ret;
3757 4479
3758 if (newsize == oldsize) 4480 if (newsize == oldsize)
3759 return 0; 4481 return 0;
3760 4482
4483 /*
4484 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4485 * special case where we need to update the times despite not having
4486 * these flags set. For all other operations the VFS sets these flags
4487 * explicitly if it wants a timestamp update.
4488 */
4489 if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
4490 inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
4491
3761 if (newsize > oldsize) { 4492 if (newsize > oldsize) {
3762 truncate_pagecache(inode, oldsize, newsize); 4493 truncate_pagecache(inode, oldsize, newsize);
3763 ret = btrfs_cont_expand(inode, oldsize, newsize); 4494 ret = btrfs_cont_expand(inode, oldsize, newsize);
@@ -3783,9 +4514,40 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3783 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 4514 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
3784 &BTRFS_I(inode)->runtime_flags); 4515 &BTRFS_I(inode)->runtime_flags);
3785 4516
4517 /*
4518 * 1 for the orphan item we're going to add
4519 * 1 for the orphan item deletion.
4520 */
4521 trans = btrfs_start_transaction(root, 2);
4522 if (IS_ERR(trans))
4523 return PTR_ERR(trans);
4524
4525 /*
4526 * We need to do this in case we fail at _any_ point during the
4527 * actual truncate. Once we do the truncate_setsize we could
4528 * invalidate pages which forces any outstanding ordered io to
4529 * be instantly completed which will give us extents that need
4530 * to be truncated. If we fail to get an orphan inode down we
4531 * could have leftover extents that were never meant to live,
4532 * so we need to guarantee from this point on that everything
4533 * will be consistent.
4534 */
4535 ret = btrfs_orphan_add(trans, inode);
4536 btrfs_end_transaction(trans, root);
4537 if (ret)
4538 return ret;
4539
3786 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 4540 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3787 truncate_setsize(inode, newsize); 4541 truncate_setsize(inode, newsize);
4542
4543 /* Disable nonlocked read DIO to avoid endless truncation */
4544 btrfs_inode_block_unlocked_dio(inode);
4545 inode_dio_wait(inode);
4546 btrfs_inode_resume_unlocked_dio(inode);
4547
3788 ret = btrfs_truncate(inode); 4548 ret = btrfs_truncate(inode);
4549 if (ret && inode->i_nlink)
4550 btrfs_orphan_del(NULL, inode);
3789 } 4551 }
3790 4552
3791 return ret; 4553 return ret;
@@ -3805,7 +4567,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3805 return err; 4567 return err;
3806 4568
3807 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 4569 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3808 err = btrfs_setsize(inode, attr->ia_size); 4570 err = btrfs_setsize(inode, attr);
3809 if (err) 4571 if (err)
3810 return err; 4572 return err;
3811 } 4573 }
@@ -3855,6 +4617,12 @@ void btrfs_evict_inode(struct inode *inode)
3855 goto no_delete; 4617 goto no_delete;
3856 } 4618 }
3857 4619
4620 ret = btrfs_commit_inode_delayed_inode(inode);
4621 if (ret) {
4622 btrfs_orphan_del(NULL, inode);
4623 goto no_delete;
4624 }
4625
3858 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 4626 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3859 if (!rsv) { 4627 if (!rsv) {
3860 btrfs_orphan_del(NULL, inode); 4628 btrfs_orphan_del(NULL, inode);
@@ -3892,7 +4660,7 @@ void btrfs_evict_inode(struct inode *inode)
3892 goto no_delete; 4660 goto no_delete;
3893 } 4661 }
3894 4662
3895 trans = btrfs_start_transaction_lflush(root, 1); 4663 trans = btrfs_join_transaction(root);
3896 if (IS_ERR(trans)) { 4664 if (IS_ERR(trans)) {
3897 btrfs_orphan_del(NULL, inode); 4665 btrfs_orphan_del(NULL, inode);
3898 btrfs_free_block_rsv(root, rsv); 4666 btrfs_free_block_rsv(root, rsv);
@@ -3906,9 +4674,6 @@ void btrfs_evict_inode(struct inode *inode)
3906 break; 4674 break;
3907 4675
3908 trans->block_rsv = &root->fs_info->trans_block_rsv; 4676 trans->block_rsv = &root->fs_info->trans_block_rsv;
3909 ret = btrfs_update_inode(trans, root, inode);
3910 BUG_ON(ret);
3911
3912 btrfs_end_transaction(trans, root); 4677 btrfs_end_transaction(trans, root);
3913 trans = NULL; 4678 trans = NULL;
3914 btrfs_btree_balance_dirty(root); 4679 btrfs_btree_balance_dirty(root);
@@ -4342,7 +5107,7 @@ unsigned char btrfs_filetype_table[] = {
4342static int btrfs_real_readdir(struct file *filp, void *dirent, 5107static int btrfs_real_readdir(struct file *filp, void *dirent,
4343 filldir_t filldir) 5108 filldir_t filldir)
4344{ 5109{
4345 struct inode *inode = filp->f_dentry->d_inode; 5110 struct inode *inode = file_inode(filp);
4346 struct btrfs_root *root = BTRFS_I(inode)->root; 5111 struct btrfs_root *root = BTRFS_I(inode)->root;
4347 struct btrfs_item *item; 5112 struct btrfs_item *item;
4348 struct btrfs_dir_item *di; 5113 struct btrfs_dir_item *di;
@@ -4805,7 +5570,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4805 if (btrfs_test_opt(root, NODATASUM)) 5570 if (btrfs_test_opt(root, NODATASUM))
4806 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 5571 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4807 if (btrfs_test_opt(root, NODATACOW)) 5572 if (btrfs_test_opt(root, NODATACOW))
4808 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 5573 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
5574 BTRFS_INODE_NODATASUM;
4809 } 5575 }
4810 5576
4811 insert_inode_hash(inode); 5577 insert_inode_hash(inode);
@@ -4957,12 +5723,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4957 goto out_unlock; 5723 goto out_unlock;
4958 } 5724 }
4959 5725
4960 err = btrfs_update_inode(trans, root, inode);
4961 if (err) {
4962 drop_inode = 1;
4963 goto out_unlock;
4964 }
4965
4966 /* 5726 /*
4967 * If the active LSM wants to access the inode during 5727 * If the active LSM wants to access the inode during
4968 * d_instantiate it needs these. Smack checks to see 5728 * d_instantiate it needs these. Smack checks to see
@@ -5347,8 +6107,7 @@ again:
5347 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6107 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
5348 size_t size; 6108 size_t size;
5349 size = btrfs_file_extent_inline_len(leaf, item); 6109 size = btrfs_file_extent_inline_len(leaf, item);
5350 extent_end = (extent_start + size + root->sectorsize - 1) & 6110 extent_end = ALIGN(extent_start + size, root->sectorsize);
5351 ~((u64)root->sectorsize - 1);
5352 } 6111 }
5353 6112
5354 if (start >= extent_end) { 6113 if (start >= extent_end) {
@@ -5420,8 +6179,7 @@ again:
5420 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 6179 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
5421 size - extent_offset); 6180 size - extent_offset);
5422 em->start = extent_start + extent_offset; 6181 em->start = extent_start + extent_offset;
5423 em->len = (copy_size + root->sectorsize - 1) & 6182 em->len = ALIGN(copy_size, root->sectorsize);
5424 ~((u64)root->sectorsize - 1);
5425 em->orig_block_len = em->len; 6183 em->orig_block_len = em->len;
5426 em->orig_start = em->start; 6184 em->orig_start = em->start;
5427 if (compress_type) { 6185 if (compress_type) {
@@ -5572,10 +6330,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
5572 return em; 6330 return em;
5573 if (em) { 6331 if (em) {
5574 /* 6332 /*
5575 * if our em maps to a hole, there might 6333 * if our em maps to
5576 * actually be delalloc bytes behind it 6334 * - a hole or
6335 * - a pre-alloc extent,
6336 * there might actually be delalloc bytes behind it.
5577 */ 6337 */
5578 if (em->block_start != EXTENT_MAP_HOLE) 6338 if (em->block_start != EXTENT_MAP_HOLE &&
6339 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5579 return em; 6340 return em;
5580 else 6341 else
5581 hole_em = em; 6342 hole_em = em;
@@ -5657,6 +6418,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
5657 */ 6418 */
5658 em->block_start = hole_em->block_start; 6419 em->block_start = hole_em->block_start;
5659 em->block_len = hole_len; 6420 em->block_len = hole_len;
6421 if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
6422 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
5660 } else { 6423 } else {
5661 em->start = range_start; 6424 em->start = range_start;
5662 em->len = found; 6425 em->len = found;
@@ -5895,6 +6658,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5895 6658
5896 em->start = start; 6659 em->start = start;
5897 em->orig_start = orig_start; 6660 em->orig_start = orig_start;
6661 em->mod_start = start;
6662 em->mod_len = len;
5898 em->len = len; 6663 em->len = len;
5899 em->block_len = block_len; 6664 em->block_len = block_len;
5900 em->block_start = block_start; 6665 em->block_start = block_start;
@@ -5936,16 +6701,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5936 u64 len = bh_result->b_size; 6701 u64 len = bh_result->b_size;
5937 struct btrfs_trans_handle *trans; 6702 struct btrfs_trans_handle *trans;
5938 int unlock_bits = EXTENT_LOCKED; 6703 int unlock_bits = EXTENT_LOCKED;
5939 int ret; 6704 int ret = 0;
5940 6705
5941 if (create) { 6706 if (create)
5942 ret = btrfs_delalloc_reserve_space(inode, len);
5943 if (ret)
5944 return ret;
5945 unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; 6707 unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
5946 } else { 6708 else
5947 len = min_t(u64, len, root->sectorsize); 6709 len = min_t(u64, len, root->sectorsize);
5948 }
5949 6710
5950 lockstart = start; 6711 lockstart = start;
5951 lockend = start + len - 1; 6712 lockend = start + len - 1;
@@ -5957,14 +6718,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5957 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) 6718 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
5958 return -ENOTBLK; 6719 return -ENOTBLK;
5959 6720
5960 if (create) {
5961 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
5962 lockend, EXTENT_DELALLOC, NULL,
5963 &cached_state, GFP_NOFS);
5964 if (ret)
5965 goto unlock_err;
5966 }
5967
5968 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 6721 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
5969 if (IS_ERR(em)) { 6722 if (IS_ERR(em)) {
5970 ret = PTR_ERR(em); 6723 ret = PTR_ERR(em);
@@ -5996,7 +6749,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5996 if (!create && (em->block_start == EXTENT_MAP_HOLE || 6749 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
5997 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 6750 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5998 free_extent_map(em); 6751 free_extent_map(em);
5999 ret = 0;
6000 goto unlock_err; 6752 goto unlock_err;
6001 } 6753 }
6002 6754
@@ -6094,6 +6846,15 @@ unlock:
6094 */ 6846 */
6095 if (start + len > i_size_read(inode)) 6847 if (start + len > i_size_read(inode))
6096 i_size_write(inode, start + len); 6848 i_size_write(inode, start + len);
6849
6850 spin_lock(&BTRFS_I(inode)->lock);
6851 BTRFS_I(inode)->outstanding_extents++;
6852 spin_unlock(&BTRFS_I(inode)->lock);
6853
6854 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6855 lockstart + len - 1, EXTENT_DELALLOC, NULL,
6856 &cached_state, GFP_NOFS);
6857 BUG_ON(ret);
6097 } 6858 }
6098 6859
6099 /* 6860 /*
@@ -6102,24 +6863,9 @@ unlock:
6102 * aren't using if there is any left over space. 6863 * aren't using if there is any left over space.
6103 */ 6864 */
6104 if (lockstart < lockend) { 6865 if (lockstart < lockend) {
6105 if (create && len < lockend - lockstart) { 6866 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6106 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6867 lockend, unlock_bits, 1, 0,
6107 lockstart + len - 1, 6868 &cached_state, GFP_NOFS);
6108 unlock_bits | EXTENT_DEFRAG, 1, 0,
6109 &cached_state, GFP_NOFS);
6110 /*
6111 * Beside unlock, we also need to cleanup reserved space
6112 * for the left range by attaching EXTENT_DO_ACCOUNTING.
6113 */
6114 clear_extent_bit(&BTRFS_I(inode)->io_tree,
6115 lockstart + len, lockend,
6116 unlock_bits | EXTENT_DO_ACCOUNTING |
6117 EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
6118 } else {
6119 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6120 lockend, unlock_bits, 1, 0,
6121 &cached_state, GFP_NOFS);
6122 }
6123 } else { 6869 } else {
6124 free_extent_state(cached_state); 6870 free_extent_state(cached_state);
6125 } 6871 }
@@ -6129,9 +6875,6 @@ unlock:
6129 return 0; 6875 return 0;
6130 6876
6131unlock_err: 6877unlock_err:
6132 if (create)
6133 unlock_bits |= EXTENT_DO_ACCOUNTING;
6134
6135 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6878 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6136 unlock_bits, 1, 0, &cached_state, GFP_NOFS); 6879 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
6137 return ret; 6880 return ret;
@@ -6372,19 +7115,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6372 int async_submit = 0; 7115 int async_submit = 0;
6373 7116
6374 map_length = orig_bio->bi_size; 7117 map_length = orig_bio->bi_size;
6375 ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, 7118 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
6376 &map_length, NULL, 0); 7119 &map_length, NULL, 0);
6377 if (ret) { 7120 if (ret) {
6378 bio_put(orig_bio); 7121 bio_put(orig_bio);
6379 return -EIO; 7122 return -EIO;
6380 } 7123 }
6381
6382 if (map_length >= orig_bio->bi_size) { 7124 if (map_length >= orig_bio->bi_size) {
6383 bio = orig_bio; 7125 bio = orig_bio;
6384 goto submit; 7126 goto submit;
6385 } 7127 }
6386 7128
6387 async_submit = 1; 7129 /* async crcs make it difficult to collect full stripe writes. */
7130 if (btrfs_get_alloc_profile(root, 1) &
7131 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
7132 async_submit = 0;
7133 else
7134 async_submit = 1;
7135
6388 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 7136 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
6389 if (!bio) 7137 if (!bio)
6390 return -ENOMEM; 7138 return -ENOMEM;
@@ -6426,7 +7174,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6426 bio->bi_end_io = btrfs_end_dio_bio; 7174 bio->bi_end_io = btrfs_end_dio_bio;
6427 7175
6428 map_length = orig_bio->bi_size; 7176 map_length = orig_bio->bi_size;
6429 ret = btrfs_map_block(root->fs_info, READ, 7177 ret = btrfs_map_block(root->fs_info, rw,
6430 start_sector << 9, 7178 start_sector << 9,
6431 &map_length, NULL, 0); 7179 &map_length, NULL, 0);
6432 if (ret) { 7180 if (ret) {
@@ -6569,15 +7317,60 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
6569{ 7317{
6570 struct file *file = iocb->ki_filp; 7318 struct file *file = iocb->ki_filp;
6571 struct inode *inode = file->f_mapping->host; 7319 struct inode *inode = file->f_mapping->host;
7320 size_t count = 0;
7321 int flags = 0;
7322 bool wakeup = true;
7323 bool relock = false;
7324 ssize_t ret;
6572 7325
6573 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, 7326 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
6574 offset, nr_segs)) 7327 offset, nr_segs))
6575 return 0; 7328 return 0;
6576 7329
6577 return __blockdev_direct_IO(rw, iocb, inode, 7330 atomic_inc(&inode->i_dio_count);
6578 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, 7331 smp_mb__after_atomic_inc();
6579 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, 7332
6580 btrfs_submit_direct, 0); 7333 if (rw & WRITE) {
7334 count = iov_length(iov, nr_segs);
7335 /*
7336 * If the write DIO is beyond the EOF, we need to update
7337 * the isize, but it is protected by i_mutex, so we
7338 * cannot unlock the i_mutex in this case.
7339 */
7340 if (offset + count <= inode->i_size) {
7341 mutex_unlock(&inode->i_mutex);
7342 relock = true;
7343 }
7344 ret = btrfs_delalloc_reserve_space(inode, count);
7345 if (ret)
7346 goto out;
7347 } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
7348 &BTRFS_I(inode)->runtime_flags))) {
7349 inode_dio_done(inode);
7350 flags = DIO_LOCKING | DIO_SKIP_HOLES;
7351 wakeup = false;
7352 }
7353
7354 ret = __blockdev_direct_IO(rw, iocb, inode,
7355 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
7356 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
7357 btrfs_submit_direct, flags);
7358 if (rw & WRITE) {
7359 if (ret < 0 && ret != -EIOCBQUEUED)
7360 btrfs_delalloc_release_space(inode, count);
7361 else if (ret >= 0 && (size_t)ret < count)
7362 btrfs_delalloc_release_space(inode,
7363 count - (size_t)ret);
7364 else
7365 btrfs_delalloc_release_metadata(inode, 0);
7366 }
7367out:
7368 if (wakeup)
7369 inode_dio_done(inode);
7370 if (relock)
7371 mutex_lock(&inode->i_mutex);
7372
7373 return ret;
6581} 7374}
6582 7375
6583#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) 7376#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
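
For write DIO the new code reserves data space for the full requested count before calling __blockdev_direct_IO() and afterwards releases whatever was not consumed: everything on error (except -EIOCBQUEUED), the unwritten tail on a short write, and only the metadata reservation when the whole count was written. A stand-alone sketch of that arithmetic, with the errno value treated as an opaque stand-in:

/*
 * Stand-alone sketch (illustrative only): how much of the up-front data
 * reservation is given back after the direct IO returns.
 */
#include <stdio.h>
#include <stdint.h>

#define EIOCBQUEUED 529		/* assumed value; treated as opaque here */

static int64_t to_release(int64_t ret, uint64_t reserved)
{
	if (ret < 0 && ret != -EIOCBQUEUED)
		return reserved;		/* nothing written: release all */
	if (ret >= 0 && (uint64_t)ret < reserved)
		return reserved - ret;		/* short write: release the tail */
	return 0;				/* fully used: metadata-only release */
}

int main(void)
{
	printf("%lld\n", (long long)to_release(-5, 65536));     /* 65536 */
	printf("%lld\n", (long long)to_release(16384, 65536));  /* 49152 */
	printf("%lld\n", (long long)to_release(65536, 65536));  /* 0 */
	return 0;
}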
@@ -6681,8 +7474,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6681 return; 7474 return;
6682 } 7475 }
6683 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 7476 lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
6684 ordered = btrfs_lookup_ordered_extent(inode, 7477 ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
6685 page_offset(page));
6686 if (ordered) { 7478 if (ordered) {
6687 /* 7479 /*
6688 * IO on this page will never be started, so we need 7480 * IO on this page will never be started, so we need
@@ -6737,7 +7529,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6737int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 7529int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6738{ 7530{
6739 struct page *page = vmf->page; 7531 struct page *page = vmf->page;
6740 struct inode *inode = fdentry(vma->vm_file)->d_inode; 7532 struct inode *inode = file_inode(vma->vm_file);
6741 struct btrfs_root *root = BTRFS_I(inode)->root; 7533 struct btrfs_root *root = BTRFS_I(inode)->root;
6742 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 7534 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6743 struct btrfs_ordered_extent *ordered; 7535 struct btrfs_ordered_extent *ordered;
@@ -6915,11 +7707,9 @@ static int btrfs_truncate(struct inode *inode)
6915 7707
6916 /* 7708 /*
6917 * 1 for the truncate slack space 7709 * 1 for the truncate slack space
6918 * 1 for the orphan item we're going to add
6919 * 1 for the orphan item deletion
6920 * 1 for updating the inode. 7710 * 1 for updating the inode.
6921 */ 7711 */
6922 trans = btrfs_start_transaction(root, 4); 7712 trans = btrfs_start_transaction(root, 2);
6923 if (IS_ERR(trans)) { 7713 if (IS_ERR(trans)) {
6924 err = PTR_ERR(trans); 7714 err = PTR_ERR(trans);
6925 goto out; 7715 goto out;
@@ -6930,12 +7720,6 @@ static int btrfs_truncate(struct inode *inode)
6930 min_size); 7720 min_size);
6931 BUG_ON(ret); 7721 BUG_ON(ret);
6932 7722
6933 ret = btrfs_orphan_add(trans, inode);
6934 if (ret) {
6935 btrfs_end_transaction(trans, root);
6936 goto out;
6937 }
6938
6939 /* 7723 /*
6940 * setattr is responsible for setting the ordered_data_close flag, 7724 * setattr is responsible for setting the ordered_data_close flag,
6941 * but that is only tested during the last file release. That 7725 * but that is only tested during the last file release. That
@@ -7004,12 +7788,6 @@ static int btrfs_truncate(struct inode *inode)
7004 ret = btrfs_orphan_del(trans, inode); 7788 ret = btrfs_orphan_del(trans, inode);
7005 if (ret) 7789 if (ret)
7006 err = ret; 7790 err = ret;
7007 } else if (ret && inode->i_nlink > 0) {
7008 /*
7009 * Failed to do the truncate, remove us from the in memory
7010 * orphan list.
7011 */
7012 ret = btrfs_orphan_del(NULL, inode);
7013 } 7791 }
7014 7792
7015 if (trans) { 7793 if (trans) {
@@ -7176,8 +7954,9 @@ int btrfs_drop_inode(struct inode *inode)
7176{ 7954{
7177 struct btrfs_root *root = BTRFS_I(inode)->root; 7955 struct btrfs_root *root = BTRFS_I(inode)->root;
7178 7956
7957 /* the snap/subvol tree is being deleted */
7179 if (btrfs_root_refs(&root->root_item) == 0 && 7958 if (btrfs_root_refs(&root->root_item) == 0 &&
7180 !btrfs_is_free_space_inode(inode)) 7959 root != root->fs_info->tree_root)
7181 return 1; 7960 return 1;
7182 else 7961 else
7183 return generic_drop_inode(inode); 7962 return generic_drop_inode(inode);
@@ -7259,40 +8038,22 @@ fail:
7259static int btrfs_getattr(struct vfsmount *mnt, 8038static int btrfs_getattr(struct vfsmount *mnt,
7260 struct dentry *dentry, struct kstat *stat) 8039 struct dentry *dentry, struct kstat *stat)
7261{ 8040{
8041 u64 delalloc_bytes;
7262 struct inode *inode = dentry->d_inode; 8042 struct inode *inode = dentry->d_inode;
7263 u32 blocksize = inode->i_sb->s_blocksize; 8043 u32 blocksize = inode->i_sb->s_blocksize;
7264 8044
7265 generic_fillattr(inode, stat); 8045 generic_fillattr(inode, stat);
7266 stat->dev = BTRFS_I(inode)->root->anon_dev; 8046 stat->dev = BTRFS_I(inode)->root->anon_dev;
7267 stat->blksize = PAGE_CACHE_SIZE; 8047 stat->blksize = PAGE_CACHE_SIZE;
8048
8049 spin_lock(&BTRFS_I(inode)->lock);
8050 delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
8051 spin_unlock(&BTRFS_I(inode)->lock);
7268 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + 8052 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
7269 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; 8053 ALIGN(delalloc_bytes, blocksize)) >> 9;
7270 return 0; 8054 return 0;
7271} 8055}
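
btrfs_getattr() now samples delalloc_bytes under the inode spinlock and reports blocks as the sum of on-disk bytes and still-unallocated delalloc bytes, each rounded up to the filesystem block size and expressed in 512-byte units. A stand-alone worked example with made-up numbers:

/*
 * Stand-alone sketch (illustrative values only): the stat->blocks
 * computation above.
 */
#include <stdio.h>
#include <stdint.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t blocksize = 4096;
	uint64_t inode_bytes = 10000;	/* bytes already allocated on disk */
	uint64_t delalloc_bytes = 300;	/* dirty data not yet allocated */

	uint64_t blocks = (ALIGN_UP(inode_bytes, blocksize) +
			   ALIGN_UP(delalloc_bytes, blocksize)) >> 9;

	printf("st_blocks = %llu\n", (unsigned long long)blocks); /* 32 */
	return 0;
}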
7272 8056
7273/*
7274 * If a file is moved, it will inherit the cow and compression flags of the new
7275 * directory.
7276 */
7277static void fixup_inode_flags(struct inode *dir, struct inode *inode)
7278{
7279 struct btrfs_inode *b_dir = BTRFS_I(dir);
7280 struct btrfs_inode *b_inode = BTRFS_I(inode);
7281
7282 if (b_dir->flags & BTRFS_INODE_NODATACOW)
7283 b_inode->flags |= BTRFS_INODE_NODATACOW;
7284 else
7285 b_inode->flags &= ~BTRFS_INODE_NODATACOW;
7286
7287 if (b_dir->flags & BTRFS_INODE_COMPRESS) {
7288 b_inode->flags |= BTRFS_INODE_COMPRESS;
7289 b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
7290 } else {
7291 b_inode->flags &= ~(BTRFS_INODE_COMPRESS |
7292 BTRFS_INODE_NOCOMPRESS);
7293 }
7294}
7295
7296static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 8057static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7297 struct inode *new_dir, struct dentry *new_dentry) 8058 struct inode *new_dir, struct dentry *new_dentry)
7298{ 8059{
@@ -7458,8 +8219,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7458 } 8219 }
7459 } 8220 }
7460 8221
7461 fixup_inode_flags(new_dir, old_inode);
7462
7463 ret = btrfs_add_link(trans, new_dir, old_inode, 8222 ret = btrfs_add_link(trans, new_dir, old_inode,
7464 new_dentry->d_name.name, 8223 new_dentry->d_name.name,
7465 new_dentry->d_name.len, 0, index); 8224 new_dentry->d_name.len, 0, index);
@@ -7531,41 +8290,57 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
7531 */ 8290 */
7532int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 8291int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7533{ 8292{
7534 struct list_head *head = &root->fs_info->delalloc_inodes;
7535 struct btrfs_inode *binode; 8293 struct btrfs_inode *binode;
7536 struct inode *inode; 8294 struct inode *inode;
7537 struct btrfs_delalloc_work *work, *next; 8295 struct btrfs_delalloc_work *work, *next;
7538 struct list_head works; 8296 struct list_head works;
8297 struct list_head splice;
7539 int ret = 0; 8298 int ret = 0;
7540 8299
7541 if (root->fs_info->sb->s_flags & MS_RDONLY) 8300 if (root->fs_info->sb->s_flags & MS_RDONLY)
7542 return -EROFS; 8301 return -EROFS;
7543 8302
7544 INIT_LIST_HEAD(&works); 8303 INIT_LIST_HEAD(&works);
8304 INIT_LIST_HEAD(&splice);
7545 8305
7546 spin_lock(&root->fs_info->delalloc_lock); 8306 spin_lock(&root->fs_info->delalloc_lock);
7547 while (!list_empty(head)) { 8307 list_splice_init(&root->fs_info->delalloc_inodes, &splice);
7548 binode = list_entry(head->next, struct btrfs_inode, 8308 while (!list_empty(&splice)) {
8309 binode = list_entry(splice.next, struct btrfs_inode,
7549 delalloc_inodes); 8310 delalloc_inodes);
8311
8312 list_del_init(&binode->delalloc_inodes);
8313
7550 inode = igrab(&binode->vfs_inode); 8314 inode = igrab(&binode->vfs_inode);
7551 if (!inode) 8315 if (!inode) {
7552 list_del_init(&binode->delalloc_inodes); 8316 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
8317 &binode->runtime_flags);
8318 continue;
8319 }
8320
8321 list_add_tail(&binode->delalloc_inodes,
8322 &root->fs_info->delalloc_inodes);
7553 spin_unlock(&root->fs_info->delalloc_lock); 8323 spin_unlock(&root->fs_info->delalloc_lock);
7554 if (inode) { 8324
7555 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); 8325 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
7556 if (!work) { 8326 if (unlikely(!work)) {
7557 ret = -ENOMEM; 8327 ret = -ENOMEM;
7558 goto out; 8328 goto out;
7559 }
7560 list_add_tail(&work->list, &works);
7561 btrfs_queue_worker(&root->fs_info->flush_workers,
7562 &work->work);
7563 } 8329 }
8330 list_add_tail(&work->list, &works);
8331 btrfs_queue_worker(&root->fs_info->flush_workers,
8332 &work->work);
8333
7564 cond_resched(); 8334 cond_resched();
7565 spin_lock(&root->fs_info->delalloc_lock); 8335 spin_lock(&root->fs_info->delalloc_lock);
7566 } 8336 }
7567 spin_unlock(&root->fs_info->delalloc_lock); 8337 spin_unlock(&root->fs_info->delalloc_lock);
7568 8338
8339 list_for_each_entry_safe(work, next, &works, list) {
8340 list_del_init(&work->list);
8341 btrfs_wait_and_free_delalloc_work(work);
8342 }
8343
7569 /* the filemap_flush will queue IO into the worker threads, but 8344 /* the filemap_flush will queue IO into the worker threads, but
7570 * we have to make sure the IO is actually started and that 8345 * we have to make sure the IO is actually started and that
7571 * ordered extents get created before we return 8346 * ordered extents get created before we return
@@ -7578,11 +8353,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7578 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 8353 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
7579 } 8354 }
7580 atomic_dec(&root->fs_info->async_submit_draining); 8355 atomic_dec(&root->fs_info->async_submit_draining);
8356 return 0;
7581out: 8357out:
7582 list_for_each_entry_safe(work, next, &works, list) { 8358 list_for_each_entry_safe(work, next, &works, list) {
7583 list_del_init(&work->list); 8359 list_del_init(&work->list);
7584 btrfs_wait_and_free_delalloc_work(work); 8360 btrfs_wait_and_free_delalloc_work(work);
7585 } 8361 }
8362
8363 if (!list_empty_careful(&splice)) {
8364 spin_lock(&root->fs_info->delalloc_lock);
8365 list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
8366 spin_unlock(&root->fs_info->delalloc_lock);
8367 }
7586 return ret; 8368 return ret;
7587} 8369}
7588 8370
@@ -7720,6 +8502,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7720 struct btrfs_key ins; 8502 struct btrfs_key ins;
7721 u64 cur_offset = start; 8503 u64 cur_offset = start;
7722 u64 i_size; 8504 u64 i_size;
8505 u64 cur_bytes;
7723 int ret = 0; 8506 int ret = 0;
7724 bool own_trans = true; 8507 bool own_trans = true;
7725 8508
@@ -7734,8 +8517,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7734 } 8517 }
7735 } 8518 }
7736 8519
7737 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, 8520 cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
7738 0, *alloc_hint, &ins, 1); 8521 cur_bytes = max(cur_bytes, min_size);
8522 ret = btrfs_reserve_extent(trans, root, cur_bytes,
8523 min_size, 0, *alloc_hint, &ins, 1);
7739 if (ret) { 8524 if (ret) {
7740 if (own_trans) 8525 if (own_trans)
7741 btrfs_end_transaction(trans, root); 8526 btrfs_end_transaction(trans, root);
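
__btrfs_prealloc_file_range() now reserves at most 256MiB per loop iteration (but never less than min_size) instead of asking for the whole remaining range at once. A stand-alone sketch of the clamp with illustrative values:

/*
 * Stand-alone sketch (illustrative values only): the per-iteration
 * allocation clamp added above.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }
static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

int main(void)
{
	uint64_t min_size = 64 * 1024;
	uint64_t cap = 256ULL * 1024 * 1024;
	uint64_t requests[3] = { 4096, 16ULL * 1024 * 1024,
				 4ULL * 1024 * 1024 * 1024 };

	for (int i = 0; i < 3; i++) {
		uint64_t cur = max_u64(min_u64(requests[i], cap), min_size);
		printf("num_bytes=%llu -> cur_bytes=%llu\n",
		       (unsigned long long)requests[i], (unsigned long long)cur);
	}
	/* 4096 -> 65536, 16MiB -> 16MiB, 4GiB -> 256MiB */
	return 0;
}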
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4b4516770f05..2c02310ff2d9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -42,12 +42,12 @@
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/blkdev.h> 43#include <linux/blkdev.h>
44#include <linux/uuid.h> 44#include <linux/uuid.h>
45#include <linux/btrfs.h>
45#include "compat.h" 46#include "compat.h"
46#include "ctree.h" 47#include "ctree.h"
47#include "disk-io.h" 48#include "disk-io.h"
48#include "transaction.h" 49#include "transaction.h"
49#include "btrfs_inode.h" 50#include "btrfs_inode.h"
50#include "ioctl.h"
51#include "print-tree.h" 51#include "print-tree.h"
52#include "volumes.h" 52#include "volumes.h"
53#include "locking.h" 53#include "locking.h"
@@ -152,7 +152,7 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
152 152
153static int btrfs_ioctl_getflags(struct file *file, void __user *arg) 153static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
154{ 154{
155 struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); 155 struct btrfs_inode *ip = BTRFS_I(file_inode(file));
156 unsigned int flags = btrfs_flags_to_ioctl(ip->flags); 156 unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
157 157
158 if (copy_to_user(arg, &flags, sizeof(flags))) 158 if (copy_to_user(arg, &flags, sizeof(flags)))
@@ -177,7 +177,7 @@ static int check_flags(unsigned int flags)
177 177
178static int btrfs_ioctl_setflags(struct file *file, void __user *arg) 178static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
179{ 179{
180 struct inode *inode = file->f_path.dentry->d_inode; 180 struct inode *inode = file_inode(file);
181 struct btrfs_inode *ip = BTRFS_I(inode); 181 struct btrfs_inode *ip = BTRFS_I(inode);
182 struct btrfs_root *root = ip->root; 182 struct btrfs_root *root = ip->root;
183 struct btrfs_trans_handle *trans; 183 struct btrfs_trans_handle *trans;
@@ -310,7 +310,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
310 310
311static int btrfs_ioctl_getversion(struct file *file, int __user *arg) 311static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
312{ 312{
313 struct inode *inode = file->f_path.dentry->d_inode; 313 struct inode *inode = file_inode(file);
314 314
315 return put_user(inode->i_generation, arg); 315 return put_user(inode->i_generation, arg);
316} 316}
@@ -363,46 +363,52 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
363 return 0; 363 return 0;
364} 364}
365 365
366static noinline int create_subvol(struct btrfs_root *root, 366static noinline int create_subvol(struct inode *dir,
367 struct dentry *dentry, 367 struct dentry *dentry,
368 char *name, int namelen, 368 char *name, int namelen,
369 u64 *async_transid, 369 u64 *async_transid,
370 struct btrfs_qgroup_inherit **inherit) 370 struct btrfs_qgroup_inherit *inherit)
371{ 371{
372 struct btrfs_trans_handle *trans; 372 struct btrfs_trans_handle *trans;
373 struct btrfs_key key; 373 struct btrfs_key key;
374 struct btrfs_root_item root_item; 374 struct btrfs_root_item root_item;
375 struct btrfs_inode_item *inode_item; 375 struct btrfs_inode_item *inode_item;
376 struct extent_buffer *leaf; 376 struct extent_buffer *leaf;
377 struct btrfs_root *root = BTRFS_I(dir)->root;
377 struct btrfs_root *new_root; 378 struct btrfs_root *new_root;
378 struct dentry *parent = dentry->d_parent; 379 struct btrfs_block_rsv block_rsv;
379 struct inode *dir;
380 struct timespec cur_time = CURRENT_TIME; 380 struct timespec cur_time = CURRENT_TIME;
381 int ret; 381 int ret;
382 int err; 382 int err;
383 u64 objectid; 383 u64 objectid;
384 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 384 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
385 u64 index = 0; 385 u64 index = 0;
386 u64 qgroup_reserved;
386 uuid_le new_uuid; 387 uuid_le new_uuid;
387 388
388 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); 389 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
389 if (ret) 390 if (ret)
390 return ret; 391 return ret;
391 392
392 dir = parent->d_inode; 393 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
393
394 /* 394 /*
395 * 1 - inode item 395 * The same as the snapshot creation, please see the comment
396 * 2 - refs 396 * of create_snapshot().
397 * 1 - root item
398 * 2 - dir items
399 */ 397 */
400 trans = btrfs_start_transaction(root, 6); 398 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
401 if (IS_ERR(trans)) 399 7, &qgroup_reserved);
402 return PTR_ERR(trans); 400 if (ret)
401 return ret;
402
403 trans = btrfs_start_transaction(root, 0);
404 if (IS_ERR(trans)) {
405 ret = PTR_ERR(trans);
406 goto out;
407 }
408 trans->block_rsv = &block_rsv;
409 trans->bytes_reserved = block_rsv.size;
403 410
404 ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, 411 ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
405 inherit ? *inherit : NULL);
406 if (ret) 412 if (ret)
407 goto fail; 413 goto fail;
408 414
@@ -515,22 +521,31 @@ static noinline int create_subvol(struct btrfs_root *root,
515 521
516 BUG_ON(ret); 522 BUG_ON(ret);
517 523
518 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
519fail: 524fail:
525 trans->block_rsv = NULL;
526 trans->bytes_reserved = 0;
520 if (async_transid) { 527 if (async_transid) {
521 *async_transid = trans->transid; 528 *async_transid = trans->transid;
522 err = btrfs_commit_transaction_async(trans, root, 1); 529 err = btrfs_commit_transaction_async(trans, root, 1);
530 if (err)
531 err = btrfs_commit_transaction(trans, root);
523 } else { 532 } else {
524 err = btrfs_commit_transaction(trans, root); 533 err = btrfs_commit_transaction(trans, root);
525 } 534 }
526 if (err && !ret) 535 if (err && !ret)
527 ret = err; 536 ret = err;
537
538 if (!ret)
539 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
540out:
541 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
528 return ret; 542 return ret;
529} 543}
530 544
531static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 545static int create_snapshot(struct btrfs_root *root, struct inode *dir,
532 char *name, int namelen, u64 *async_transid, 546 struct dentry *dentry, char *name, int namelen,
533 bool readonly, struct btrfs_qgroup_inherit **inherit) 547 u64 *async_transid, bool readonly,
548 struct btrfs_qgroup_inherit *inherit)
534{ 549{
535 struct inode *inode; 550 struct inode *inode;
536 struct btrfs_pending_snapshot *pending_snapshot; 551 struct btrfs_pending_snapshot *pending_snapshot;
@@ -546,23 +561,31 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
546 561
547 btrfs_init_block_rsv(&pending_snapshot->block_rsv, 562 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
548 BTRFS_BLOCK_RSV_TEMP); 563 BTRFS_BLOCK_RSV_TEMP);
564 /*
565 * 1 - parent dir inode
566 * 2 - dir entries
567 * 1 - root item
568 * 2 - root ref/backref
569 * 1 - root of snapshot
570 */
571 ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
572 &pending_snapshot->block_rsv, 7,
573 &pending_snapshot->qgroup_reserved);
574 if (ret)
575 goto out;
576
549 pending_snapshot->dentry = dentry; 577 pending_snapshot->dentry = dentry;
550 pending_snapshot->root = root; 578 pending_snapshot->root = root;
551 pending_snapshot->readonly = readonly; 579 pending_snapshot->readonly = readonly;
552 if (inherit) { 580 pending_snapshot->dir = dir;
553 pending_snapshot->inherit = *inherit; 581 pending_snapshot->inherit = inherit;
554 *inherit = NULL; /* take responsibility to free it */
555 }
556 582
557 trans = btrfs_start_transaction(root->fs_info->extent_root, 6); 583 trans = btrfs_start_transaction(root, 0);
558 if (IS_ERR(trans)) { 584 if (IS_ERR(trans)) {
559 ret = PTR_ERR(trans); 585 ret = PTR_ERR(trans);
560 goto fail; 586 goto fail;
561 } 587 }
562 588
563 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
564 BUG_ON(ret);
565
566 spin_lock(&root->fs_info->trans_lock); 589 spin_lock(&root->fs_info->trans_lock);
567 list_add(&pending_snapshot->list, 590 list_add(&pending_snapshot->list,
568 &trans->transaction->pending_snapshots); 591 &trans->transaction->pending_snapshots);
@@ -571,16 +594,14 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
571 *async_transid = trans->transid; 594 *async_transid = trans->transid;
572 ret = btrfs_commit_transaction_async(trans, 595 ret = btrfs_commit_transaction_async(trans,
573 root->fs_info->extent_root, 1); 596 root->fs_info->extent_root, 1);
597 if (ret)
598 ret = btrfs_commit_transaction(trans, root);
574 } else { 599 } else {
575 ret = btrfs_commit_transaction(trans, 600 ret = btrfs_commit_transaction(trans,
576 root->fs_info->extent_root); 601 root->fs_info->extent_root);
577 } 602 }
578 if (ret) { 603 if (ret)
579 /* cleanup_transaction has freed this for us */
580 if (trans->aborted)
581 pending_snapshot = NULL;
582 goto fail; 604 goto fail;
583 }
584 605
585 ret = pending_snapshot->error; 606 ret = pending_snapshot->error;
586 if (ret) 607 if (ret)
@@ -599,6 +620,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
599 d_instantiate(dentry, inode); 620 d_instantiate(dentry, inode);
600 ret = 0; 621 ret = 0;
601fail: 622fail:
623 btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
624 &pending_snapshot->block_rsv,
625 pending_snapshot->qgroup_reserved);
626out:
602 kfree(pending_snapshot); 627 kfree(pending_snapshot);
603 return ret; 628 return ret;
604} 629}
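
Both creation paths now reserve their metadata up front: the seven units passed to btrfs_subvolume_reserve_metadata() in the hunks above break down as 1 (parent dir inode) + 2 (dir entries) + 1 (root item) + 2 (root ref/backref) + 1 (root of the new snapshot or subvolume) = 7, while the snap_destroy hunk further down reserves only 5 units (dir inode, two dir entries, two root ref/backref), since no new root is created there.
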
@@ -692,7 +717,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
692 char *name, int namelen, 717 char *name, int namelen,
693 struct btrfs_root *snap_src, 718 struct btrfs_root *snap_src,
694 u64 *async_transid, bool readonly, 719 u64 *async_transid, bool readonly,
695 struct btrfs_qgroup_inherit **inherit) 720 struct btrfs_qgroup_inherit *inherit)
696{ 721{
697 struct inode *dir = parent->dentry->d_inode; 722 struct inode *dir = parent->dentry->d_inode;
698 struct dentry *dentry; 723 struct dentry *dentry;
@@ -729,11 +754,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
729 goto out_up_read; 754 goto out_up_read;
730 755
731 if (snap_src) { 756 if (snap_src) {
732 error = create_snapshot(snap_src, dentry, name, namelen, 757 error = create_snapshot(snap_src, dir, dentry, name, namelen,
733 async_transid, readonly, inherit); 758 async_transid, readonly, inherit);
734 } else { 759 } else {
735 error = create_subvol(BTRFS_I(dir)->root, dentry, 760 error = create_subvol(dir, dentry, name, namelen,
736 name, namelen, async_transid, inherit); 761 async_transid, inherit);
737 } 762 }
738 if (!error) 763 if (!error)
739 fsnotify_mkdir(dir, dentry); 764 fsnotify_mkdir(dir, dentry);
@@ -815,7 +840,7 @@ static int find_new_extents(struct btrfs_root *root,
815 840
816 while(1) { 841 while(1) {
817 ret = btrfs_search_forward(root, &min_key, &max_key, 842 ret = btrfs_search_forward(root, &min_key, &max_key,
818 path, 0, newer_than); 843 path, newer_than);
819 if (ret != 0) 844 if (ret != 0)
820 goto none; 845 goto none;
821 if (min_key.objectid != ino) 846 if (min_key.objectid != ino)
@@ -1203,6 +1228,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1203 if (!(inode->i_sb->s_flags & MS_ACTIVE)) 1228 if (!(inode->i_sb->s_flags & MS_ACTIVE))
1204 break; 1229 break;
1205 1230
1231 if (btrfs_defrag_cancelled(root->fs_info)) {
1232 printk(KERN_DEBUG "btrfs: defrag_file cancelled\n");
1233 ret = -EAGAIN;
1234 break;
1235 }
1236
1206 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, 1237 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
1207 extent_thresh, &last_len, &skip, 1238 extent_thresh, &last_len, &skip,
1208 &defrag_end, range->flags & 1239 &defrag_end, range->flags &
@@ -1317,7 +1348,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1317 u64 new_size; 1348 u64 new_size;
1318 u64 old_size; 1349 u64 old_size;
1319 u64 devid = 1; 1350 u64 devid = 1;
1320 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 1351 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
1321 struct btrfs_ioctl_vol_args *vol_args; 1352 struct btrfs_ioctl_vol_args *vol_args;
1322 struct btrfs_trans_handle *trans; 1353 struct btrfs_trans_handle *trans;
1323 struct btrfs_device *device = NULL; 1354 struct btrfs_device *device = NULL;
@@ -1326,9 +1357,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1326 int ret = 0; 1357 int ret = 0;
1327 int mod = 0; 1358 int mod = 0;
1328 1359
1329 if (root->fs_info->sb->s_flags & MS_RDONLY)
1330 return -EROFS;
1331
1332 if (!capable(CAP_SYS_ADMIN)) 1360 if (!capable(CAP_SYS_ADMIN))
1333 return -EPERM; 1361 return -EPERM;
1334 1362
@@ -1339,7 +1367,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1339 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1367 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1340 1)) { 1368 1)) {
1341 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); 1369 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
1342 return -EINPROGRESS; 1370 mnt_drop_write_file(file);
1371 return -EINVAL;
1343 } 1372 }
1344 1373
1345 mutex_lock(&root->fs_info->volume_mutex); 1374 mutex_lock(&root->fs_info->volume_mutex);
@@ -1359,21 +1388,27 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1359 *devstr = '\0'; 1388 *devstr = '\0';
1360 devstr = vol_args->name; 1389 devstr = vol_args->name;
1361 devid = simple_strtoull(devstr, &end, 10); 1390 devid = simple_strtoull(devstr, &end, 10);
1391 if (!devid) {
1392 ret = -EINVAL;
1393 goto out_free;
1394 }
1362 printk(KERN_INFO "btrfs: resizing devid %llu\n", 1395 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1363 (unsigned long long)devid); 1396 (unsigned long long)devid);
1364 } 1397 }
1398
1365 device = btrfs_find_device(root->fs_info, devid, NULL, NULL); 1399 device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
1366 if (!device) { 1400 if (!device) {
1367 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1401 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1368 (unsigned long long)devid); 1402 (unsigned long long)devid);
1369 ret = -EINVAL; 1403 ret = -ENODEV;
1370 goto out_free; 1404 goto out_free;
1371 } 1405 }
1372 if (device->fs_devices && device->fs_devices->seeding) { 1406
1407 if (!device->writeable) {
1373 printk(KERN_INFO "btrfs: resizer unable to apply on " 1408 printk(KERN_INFO "btrfs: resizer unable to apply on "
1374 "seeding device %llu\n", 1409 "readonly device %llu\n",
1375 (unsigned long long)devid); 1410 (unsigned long long)devid);
1376 ret = -EINVAL; 1411 ret = -EPERM;
1377 goto out_free; 1412 goto out_free;
1378 } 1413 }
1379 1414
@@ -1395,7 +1430,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1395 } 1430 }
1396 1431
1397 if (device->is_tgtdev_for_dev_replace) { 1432 if (device->is_tgtdev_for_dev_replace) {
1398 ret = -EINVAL; 1433 ret = -EPERM;
1399 goto out_free; 1434 goto out_free;
1400 } 1435 }
1401 1436
@@ -1443,15 +1478,15 @@ out_free:
1443 kfree(vol_args); 1478 kfree(vol_args);
1444out: 1479out:
1445 mutex_unlock(&root->fs_info->volume_mutex); 1480 mutex_unlock(&root->fs_info->volume_mutex);
1446 mnt_drop_write_file(file);
1447 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 1481 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
1482 mnt_drop_write_file(file);
1448 return ret; 1483 return ret;
1449} 1484}
1450 1485
1451static noinline int btrfs_ioctl_snap_create_transid(struct file *file, 1486static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1452 char *name, unsigned long fd, int subvol, 1487 char *name, unsigned long fd, int subvol,
1453 u64 *transid, bool readonly, 1488 u64 *transid, bool readonly,
1454 struct btrfs_qgroup_inherit **inherit) 1489 struct btrfs_qgroup_inherit *inherit)
1455{ 1490{
1456 int namelen; 1491 int namelen;
1457 int ret = 0; 1492 int ret = 0;
@@ -1483,8 +1518,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1483 goto out_drop_write; 1518 goto out_drop_write;
1484 } 1519 }
1485 1520
1486 src_inode = src.file->f_path.dentry->d_inode; 1521 src_inode = file_inode(src.file);
1487 if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { 1522 if (src_inode->i_sb != file_inode(file)->i_sb) {
1488 printk(KERN_INFO "btrfs: Snapshot src from " 1523 printk(KERN_INFO "btrfs: Snapshot src from "
1489 "another FS\n"); 1524 "another FS\n");
1490 ret = -EINVAL; 1525 ret = -EINVAL;
@@ -1560,7 +1595,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1560 1595
1561 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1596 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1562 vol_args->fd, subvol, ptr, 1597 vol_args->fd, subvol, ptr,
1563 readonly, &inherit); 1598 readonly, inherit);
1564 1599
1565 if (ret == 0 && ptr && 1600 if (ret == 0 && ptr &&
1566 copy_to_user(arg + 1601 copy_to_user(arg +
@@ -1576,7 +1611,7 @@ out:
1576static noinline int btrfs_ioctl_subvol_getflags(struct file *file, 1611static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
1577 void __user *arg) 1612 void __user *arg)
1578{ 1613{
1579 struct inode *inode = fdentry(file)->d_inode; 1614 struct inode *inode = file_inode(file);
1580 struct btrfs_root *root = BTRFS_I(inode)->root; 1615 struct btrfs_root *root = BTRFS_I(inode)->root;
1581 int ret = 0; 1616 int ret = 0;
1582 u64 flags = 0; 1617 u64 flags = 0;
@@ -1598,7 +1633,7 @@ static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
1598static noinline int btrfs_ioctl_subvol_setflags(struct file *file, 1633static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1599 void __user *arg) 1634 void __user *arg)
1600{ 1635{
1601 struct inode *inode = fdentry(file)->d_inode; 1636 struct inode *inode = file_inode(file);
1602 struct btrfs_root *root = BTRFS_I(inode)->root; 1637 struct btrfs_root *root = BTRFS_I(inode)->root;
1603 struct btrfs_trans_handle *trans; 1638 struct btrfs_trans_handle *trans;
1604 u64 root_flags; 1639 u64 root_flags;
@@ -1857,7 +1892,7 @@ static noinline int search_ioctl(struct inode *inode,
1857 path->keep_locks = 1; 1892 path->keep_locks = 1;
1858 1893
1859 while(1) { 1894 while(1) {
1860 ret = btrfs_search_forward(root, &key, &max_key, path, 0, 1895 ret = btrfs_search_forward(root, &key, &max_key, path,
1861 sk->min_transid); 1896 sk->min_transid);
1862 if (ret != 0) { 1897 if (ret != 0) {
1863 if (ret > 0) 1898 if (ret > 0)
@@ -1892,7 +1927,7 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
1892 if (IS_ERR(args)) 1927 if (IS_ERR(args))
1893 return PTR_ERR(args); 1928 return PTR_ERR(args);
1894 1929
1895 inode = fdentry(file)->d_inode; 1930 inode = file_inode(file);
1896 ret = search_ioctl(inode, args); 1931 ret = search_ioctl(inode, args);
1897 if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) 1932 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1898 ret = -EFAULT; 1933 ret = -EFAULT;
@@ -2002,7 +2037,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
2002 if (IS_ERR(args)) 2037 if (IS_ERR(args))
2003 return PTR_ERR(args); 2038 return PTR_ERR(args);
2004 2039
2005 inode = fdentry(file)->d_inode; 2040 inode = file_inode(file);
2006 2041
2007 if (args->treeid == 0) 2042 if (args->treeid == 0)
2008 args->treeid = BTRFS_I(inode)->root->root_key.objectid; 2043 args->treeid = BTRFS_I(inode)->root->root_key.objectid;
@@ -2029,6 +2064,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2029 struct btrfs_root *dest = NULL; 2064 struct btrfs_root *dest = NULL;
2030 struct btrfs_ioctl_vol_args *vol_args; 2065 struct btrfs_ioctl_vol_args *vol_args;
2031 struct btrfs_trans_handle *trans; 2066 struct btrfs_trans_handle *trans;
2067 struct btrfs_block_rsv block_rsv;
2068 u64 qgroup_reserved;
2032 int namelen; 2069 int namelen;
2033 int ret; 2070 int ret;
2034 int err = 0; 2071 int err = 0;
@@ -2095,13 +2132,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2095 err = inode_permission(inode, MAY_WRITE | MAY_EXEC); 2132 err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
2096 if (err) 2133 if (err)
2097 goto out_dput; 2134 goto out_dput;
2098
2099 /* check if subvolume may be deleted by a non-root user */
2100 err = btrfs_may_delete(dir, dentry, 1);
2101 if (err)
2102 goto out_dput;
2103 } 2135 }
2104 2136
2137 /* check if subvolume may be deleted by a user */
2138 err = btrfs_may_delete(dir, dentry, 1);
2139 if (err)
2140 goto out_dput;
2141
2105 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { 2142 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
2106 err = -EINVAL; 2143 err = -EINVAL;
2107 goto out_dput; 2144 goto out_dput;
@@ -2118,12 +2155,23 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2118 if (err) 2155 if (err)
2119 goto out_up_write; 2156 goto out_up_write;
2120 2157
2158 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
2159 /*
2160 * One for dir inode, two for dir entries, two for root
2161 * ref/backref.
2162 */
2163 err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
2164 5, &qgroup_reserved);
2165 if (err)
2166 goto out_up_write;
2167
2121 trans = btrfs_start_transaction(root, 0); 2168 trans = btrfs_start_transaction(root, 0);
2122 if (IS_ERR(trans)) { 2169 if (IS_ERR(trans)) {
2123 err = PTR_ERR(trans); 2170 err = PTR_ERR(trans);
2124 goto out_up_write; 2171 goto out_release;
2125 } 2172 }
2126 trans->block_rsv = &root->fs_info->global_block_rsv; 2173 trans->block_rsv = &block_rsv;
2174 trans->bytes_reserved = block_rsv.size;
2127 2175
2128 ret = btrfs_unlink_subvol(trans, root, dir, 2176 ret = btrfs_unlink_subvol(trans, root, dir,
2129 dest->root_key.objectid, 2177 dest->root_key.objectid,
@@ -2153,10 +2201,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2153 } 2201 }
2154 } 2202 }
2155out_end_trans: 2203out_end_trans:
2204 trans->block_rsv = NULL;
2205 trans->bytes_reserved = 0;
2156 ret = btrfs_end_transaction(trans, root); 2206 ret = btrfs_end_transaction(trans, root);
2157 if (ret && !err) 2207 if (ret && !err)
2158 err = ret; 2208 err = ret;
2159 inode->i_flags |= S_DEAD; 2209 inode->i_flags |= S_DEAD;
2210out_release:
2211 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
2160out_up_write: 2212out_up_write:
2161 up_write(&root->fs_info->subvol_sem); 2213 up_write(&root->fs_info->subvol_sem);
2162out_unlock: 2214out_unlock:
@@ -2165,6 +2217,12 @@ out_unlock:
2165 shrink_dcache_sb(root->fs_info->sb); 2217 shrink_dcache_sb(root->fs_info->sb);
2166 btrfs_invalidate_inodes(dest); 2218 btrfs_invalidate_inodes(dest);
2167 d_delete(dentry); 2219 d_delete(dentry);
2220
2221 /* the last ref */
2222 if (dest->cache_inode) {
2223 iput(dest->cache_inode);
2224 dest->cache_inode = NULL;
2225 }
2168 } 2226 }
2169out_dput: 2227out_dput:
2170 dput(dentry); 2228 dput(dentry);
@@ -2178,24 +2236,18 @@ out:
2178 2236
2179static int btrfs_ioctl_defrag(struct file *file, void __user *argp) 2237static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2180{ 2238{
2181 struct inode *inode = fdentry(file)->d_inode; 2239 struct inode *inode = file_inode(file);
2182 struct btrfs_root *root = BTRFS_I(inode)->root; 2240 struct btrfs_root *root = BTRFS_I(inode)->root;
2183 struct btrfs_ioctl_defrag_range_args *range; 2241 struct btrfs_ioctl_defrag_range_args *range;
2184 int ret; 2242 int ret;
2185 2243
2186 if (btrfs_root_readonly(root))
2187 return -EROFS;
2188
2189 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2190 1)) {
2191 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2192 return -EINPROGRESS;
2193 }
2194 ret = mnt_want_write_file(file); 2244 ret = mnt_want_write_file(file);
2195 if (ret) { 2245 if (ret)
2196 atomic_set(&root->fs_info->mutually_exclusive_operation_running,
2197 0);
2198 return ret; 2246 return ret;
2247
2248 if (btrfs_root_readonly(root)) {
2249 ret = -EROFS;
2250 goto out;
2199 } 2251 }
2200 2252
2201 switch (inode->i_mode & S_IFMT) { 2253 switch (inode->i_mode & S_IFMT) {
@@ -2204,10 +2256,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2204 ret = -EPERM; 2256 ret = -EPERM;
2205 goto out; 2257 goto out;
2206 } 2258 }
2207 ret = btrfs_defrag_root(root, 0); 2259 ret = btrfs_defrag_root(root);
2208 if (ret) 2260 if (ret)
2209 goto out; 2261 goto out;
2210 ret = btrfs_defrag_root(root->fs_info->extent_root, 0); 2262 ret = btrfs_defrag_root(root->fs_info->extent_root);
2211 break; 2263 break;
2212 case S_IFREG: 2264 case S_IFREG:
2213 if (!(file->f_mode & FMODE_WRITE)) { 2265 if (!(file->f_mode & FMODE_WRITE)) {
@@ -2237,7 +2289,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2237 /* the rest are all set to zero by kzalloc */ 2289 /* the rest are all set to zero by kzalloc */
2238 range->len = (u64)-1; 2290 range->len = (u64)-1;
2239 } 2291 }
2240 ret = btrfs_defrag_file(fdentry(file)->d_inode, file, 2292 ret = btrfs_defrag_file(file_inode(file), file,
2241 range, 0, 0); 2293 range, 0, 0);
2242 if (ret > 0) 2294 if (ret > 0)
2243 ret = 0; 2295 ret = 0;
@@ -2248,7 +2300,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2248 } 2300 }
2249out: 2301out:
2250 mnt_drop_write_file(file); 2302 mnt_drop_write_file(file);
2251 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2252 return ret; 2303 return ret;
2253} 2304}
2254 2305
@@ -2263,7 +2314,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2263 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 2314 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2264 1)) { 2315 1)) {
2265 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); 2316 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2266 return -EINPROGRESS; 2317 return -EINVAL;
2267 } 2318 }
2268 2319
2269 mutex_lock(&root->fs_info->volume_mutex); 2320 mutex_lock(&root->fs_info->volume_mutex);
@@ -2285,7 +2336,7 @@ out:
2285 2336
2286static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) 2337static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2287{ 2338{
2288 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 2339 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
2289 struct btrfs_ioctl_vol_args *vol_args; 2340 struct btrfs_ioctl_vol_args *vol_args;
2290 int ret; 2341 int ret;
2291 2342
@@ -2300,7 +2351,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2300 1)) { 2351 1)) {
2301 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); 2352 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2302 mnt_drop_write_file(file); 2353 mnt_drop_write_file(file);
2303 return -EINPROGRESS; 2354 return -EINVAL;
2304 } 2355 }
2305 2356
2306 mutex_lock(&root->fs_info->volume_mutex); 2357 mutex_lock(&root->fs_info->volume_mutex);
@@ -2316,8 +2367,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2316 kfree(vol_args); 2367 kfree(vol_args);
2317out: 2368out:
2318 mutex_unlock(&root->fs_info->volume_mutex); 2369 mutex_unlock(&root->fs_info->volume_mutex);
2319 mnt_drop_write_file(file);
2320 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2370 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2371 mnt_drop_write_file(file);
2321 return ret; 2372 return ret;
2322} 2373}
2323 2374
@@ -2408,7 +2459,7 @@ out:
2408static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, 2459static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2409 u64 off, u64 olen, u64 destoff) 2460 u64 off, u64 olen, u64 destoff)
2410{ 2461{
2411 struct inode *inode = fdentry(file)->d_inode; 2462 struct inode *inode = file_inode(file);
2412 struct btrfs_root *root = BTRFS_I(inode)->root; 2463 struct btrfs_root *root = BTRFS_I(inode)->root;
2413 struct fd src_file; 2464 struct fd src_file;
2414 struct inode *src; 2465 struct inode *src;
@@ -2454,7 +2505,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2454 if (src_file.file->f_path.mnt != file->f_path.mnt) 2505 if (src_file.file->f_path.mnt != file->f_path.mnt)
2455 goto out_fput; 2506 goto out_fput;
2456 2507
2457 src = src_file.file->f_dentry->d_inode; 2508 src = file_inode(src_file.file);
2458 2509
2459 ret = -EINVAL; 2510 ret = -EINVAL;
2460 if (src == inode) 2511 if (src == inode)
@@ -2816,7 +2867,7 @@ static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
2816 */ 2867 */
2817static long btrfs_ioctl_trans_start(struct file *file) 2868static long btrfs_ioctl_trans_start(struct file *file)
2818{ 2869{
2819 struct inode *inode = fdentry(file)->d_inode; 2870 struct inode *inode = file_inode(file);
2820 struct btrfs_root *root = BTRFS_I(inode)->root; 2871 struct btrfs_root *root = BTRFS_I(inode)->root;
2821 struct btrfs_trans_handle *trans; 2872 struct btrfs_trans_handle *trans;
2822 int ret; 2873 int ret;
@@ -2856,7 +2907,7 @@ out:
2856 2907
2857static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) 2908static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2858{ 2909{
2859 struct inode *inode = fdentry(file)->d_inode; 2910 struct inode *inode = file_inode(file);
2860 struct btrfs_root *root = BTRFS_I(inode)->root; 2911 struct btrfs_root *root = BTRFS_I(inode)->root;
2861 struct btrfs_root *new_root; 2912 struct btrfs_root *new_root;
2862 struct btrfs_dir_item *di; 2913 struct btrfs_dir_item *di;
@@ -3080,7 +3131,7 @@ out:
3080 */ 3131 */
3081long btrfs_ioctl_trans_end(struct file *file) 3132long btrfs_ioctl_trans_end(struct file *file)
3082{ 3133{
3083 struct inode *inode = fdentry(file)->d_inode; 3134 struct inode *inode = file_inode(file);
3084 struct btrfs_root *root = BTRFS_I(inode)->root; 3135 struct btrfs_root *root = BTRFS_I(inode)->root;
3085 struct btrfs_trans_handle *trans; 3136 struct btrfs_trans_handle *trans;
3086 3137
@@ -3104,7 +3155,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
3104 u64 transid; 3155 u64 transid;
3105 int ret; 3156 int ret;
3106 3157
3107 trans = btrfs_attach_transaction(root); 3158 trans = btrfs_attach_transaction_barrier(root);
3108 if (IS_ERR(trans)) { 3159 if (IS_ERR(trans)) {
3109 if (PTR_ERR(trans) != -ENOENT) 3160 if (PTR_ERR(trans) != -ENOENT)
3110 return PTR_ERR(trans); 3161 return PTR_ERR(trans);
@@ -3142,7 +3193,7 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
3142 3193
3143static long btrfs_ioctl_scrub(struct file *file, void __user *arg) 3194static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
3144{ 3195{
3145 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3196 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
3146 struct btrfs_ioctl_scrub_args *sa; 3197 struct btrfs_ioctl_scrub_args *sa;
3147 int ret; 3198 int ret;
3148 3199
@@ -3282,7 +3333,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3282 struct inode_fs_paths *ipath = NULL; 3333 struct inode_fs_paths *ipath = NULL;
3283 struct btrfs_path *path; 3334 struct btrfs_path *path;
3284 3335
3285 if (!capable(CAP_SYS_ADMIN)) 3336 if (!capable(CAP_DAC_READ_SEARCH))
3286 return -EPERM; 3337 return -EPERM;
3287 3338
3288 path = btrfs_alloc_path(); 3339 path = btrfs_alloc_path();
@@ -3433,12 +3484,12 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
3433 3484
3434static long btrfs_ioctl_balance(struct file *file, void __user *arg) 3485static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3435{ 3486{
3436 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3487 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
3437 struct btrfs_fs_info *fs_info = root->fs_info; 3488 struct btrfs_fs_info *fs_info = root->fs_info;
3438 struct btrfs_ioctl_balance_args *bargs; 3489 struct btrfs_ioctl_balance_args *bargs;
3439 struct btrfs_balance_control *bctl; 3490 struct btrfs_balance_control *bctl;
3491 bool need_unlock; /* for mut. excl. ops lock */
3440 int ret; 3492 int ret;
3441 int need_to_clear_lock = 0;
3442 3493
3443 if (!capable(CAP_SYS_ADMIN)) 3494 if (!capable(CAP_SYS_ADMIN))
3444 return -EPERM; 3495 return -EPERM;
@@ -3447,14 +3498,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3447 if (ret) 3498 if (ret)
3448 return ret; 3499 return ret;
3449 3500
3450 mutex_lock(&fs_info->volume_mutex); 3501again:
3502 if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
3503 mutex_lock(&fs_info->volume_mutex);
3504 mutex_lock(&fs_info->balance_mutex);
3505 need_unlock = true;
3506 goto locked;
3507 }
3508
3509 /*
 3510 * mut. excl. ops lock is locked. Three possibilities:
3511 * (1) some other op is running
3512 * (2) balance is running
3513 * (3) balance is paused -- special case (think resume)
3514 */
3451 mutex_lock(&fs_info->balance_mutex); 3515 mutex_lock(&fs_info->balance_mutex);
3516 if (fs_info->balance_ctl) {
3517 /* this is either (2) or (3) */
3518 if (!atomic_read(&fs_info->balance_running)) {
3519 mutex_unlock(&fs_info->balance_mutex);
3520 if (!mutex_trylock(&fs_info->volume_mutex))
3521 goto again;
3522 mutex_lock(&fs_info->balance_mutex);
3523
3524 if (fs_info->balance_ctl &&
3525 !atomic_read(&fs_info->balance_running)) {
3526 /* this is (3) */
3527 need_unlock = false;
3528 goto locked;
3529 }
3530
3531 mutex_unlock(&fs_info->balance_mutex);
3532 mutex_unlock(&fs_info->volume_mutex);
3533 goto again;
3534 } else {
3535 /* this is (2) */
3536 mutex_unlock(&fs_info->balance_mutex);
3537 ret = -EINPROGRESS;
3538 goto out;
3539 }
3540 } else {
3541 /* this is (1) */
3542 mutex_unlock(&fs_info->balance_mutex);
3543 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3544 ret = -EINVAL;
3545 goto out;
3546 }
3547
3548locked:
3549 BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
3452 3550
3453 if (arg) { 3551 if (arg) {
3454 bargs = memdup_user(arg, sizeof(*bargs)); 3552 bargs = memdup_user(arg, sizeof(*bargs));
3455 if (IS_ERR(bargs)) { 3553 if (IS_ERR(bargs)) {
3456 ret = PTR_ERR(bargs); 3554 ret = PTR_ERR(bargs);
3457 goto out; 3555 goto out_unlock;
3458 } 3556 }
3459 3557
3460 if (bargs->flags & BTRFS_BALANCE_RESUME) { 3558 if (bargs->flags & BTRFS_BALANCE_RESUME) {
@@ -3474,13 +3572,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3474 bargs = NULL; 3572 bargs = NULL;
3475 } 3573 }
3476 3574
3477 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 3575 if (fs_info->balance_ctl) {
3478 1)) {
3479 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3480 ret = -EINPROGRESS; 3576 ret = -EINPROGRESS;
3481 goto out_bargs; 3577 goto out_bargs;
3482 } 3578 }
3483 need_to_clear_lock = 1;
3484 3579
3485 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 3580 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3486 if (!bctl) { 3581 if (!bctl) {
@@ -3501,11 +3596,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3501 } 3596 }
3502 3597
3503do_balance: 3598do_balance:
3504 ret = btrfs_balance(bctl, bargs);
3505 /* 3599 /*
3506 * bctl is freed in __cancel_balance or in free_fs_info if 3600 * Ownership of bctl and mutually_exclusive_operation_running
3507 * restriper was paused all the way until unmount 3601 * goes to to btrfs_balance. bctl is freed in __cancel_balance,
3602 * or, if restriper was paused all the way until unmount, in
3603 * free_fs_info. mutually_exclusive_operation_running is
3604 * cleared in __cancel_balance.
3508 */ 3605 */
3606 need_unlock = false;
3607
3608 ret = btrfs_balance(bctl, bargs);
3609
3509 if (arg) { 3610 if (arg) {
3510 if (copy_to_user(arg, bargs, sizeof(*bargs))) 3611 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3511 ret = -EFAULT; 3612 ret = -EFAULT;
@@ -3513,12 +3614,12 @@ do_balance:
3513 3614
3514out_bargs: 3615out_bargs:
3515 kfree(bargs); 3616 kfree(bargs);
3516out: 3617out_unlock:
3517 if (need_to_clear_lock)
3518 atomic_set(&root->fs_info->mutually_exclusive_operation_running,
3519 0);
3520 mutex_unlock(&fs_info->balance_mutex); 3618 mutex_unlock(&fs_info->balance_mutex);
3521 mutex_unlock(&fs_info->volume_mutex); 3619 mutex_unlock(&fs_info->volume_mutex);
3620 if (need_unlock)
3621 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3622out:
3522 mnt_drop_write_file(file); 3623 mnt_drop_write_file(file);
3523 return ret; 3624 return ret;
3524} 3625}
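
The retry logic in the balance hunks above ultimately rests on atomically claiming the mutually_exclusive_operation_running flag. A user-space analogue of that claim/release pair using C11 atomics (illustrative only; the kernel uses atomic_xchg/atomic_set on fs_info):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int exclusive_op_running;

/* atomic_exchange returns the previous value: 0 means we won the race
 * and now own the exclusive-operation slot; 1 means another
 * add/delete/balance/replace/resize operation (or a paused balance)
 * already owns it and the caller must inspect balance state or bail out. */
static bool try_claim_exclusive_op(void)
{
	return atomic_exchange(&exclusive_op_running, 1) == 0;
}

static void release_exclusive_op(void)
{
	atomic_store(&exclusive_op_running, 0);
}
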
@@ -3573,7 +3674,7 @@ out:
3573 3674
3574static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) 3675static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3575{ 3676{
3576 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3677 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
3577 struct btrfs_ioctl_quota_ctl_args *sa; 3678 struct btrfs_ioctl_quota_ctl_args *sa;
3578 struct btrfs_trans_handle *trans = NULL; 3679 struct btrfs_trans_handle *trans = NULL;
3579 int ret; 3680 int ret;
@@ -3632,7 +3733,7 @@ drop_write:
3632 3733
3633static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) 3734static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
3634{ 3735{
3635 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3736 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
3636 struct btrfs_ioctl_qgroup_assign_args *sa; 3737 struct btrfs_ioctl_qgroup_assign_args *sa;
3637 struct btrfs_trans_handle *trans; 3738 struct btrfs_trans_handle *trans;
3638 int ret; 3739 int ret;
@@ -3679,7 +3780,7 @@ drop_write:
3679 3780
3680static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) 3781static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
3681{ 3782{
3682 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3783 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
3683 struct btrfs_ioctl_qgroup_create_args *sa; 3784 struct btrfs_ioctl_qgroup_create_args *sa;
3684 struct btrfs_trans_handle *trans; 3785 struct btrfs_trans_handle *trans;
3685 int ret; 3786 int ret;
@@ -3698,6 +3799,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
3698 goto drop_write; 3799 goto drop_write;
3699 } 3800 }
3700 3801
3802 if (!sa->qgroupid) {
3803 ret = -EINVAL;
3804 goto out;
3805 }
3806
3701 trans = btrfs_join_transaction(root); 3807 trans = btrfs_join_transaction(root);
3702 if (IS_ERR(trans)) { 3808 if (IS_ERR(trans)) {
3703 ret = PTR_ERR(trans); 3809 ret = PTR_ERR(trans);
@@ -3725,7 +3831,7 @@ drop_write:
3725 3831
3726static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) 3832static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
3727{ 3833{
3728 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3834 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
3729 struct btrfs_ioctl_qgroup_limit_args *sa; 3835 struct btrfs_ioctl_qgroup_limit_args *sa;
3730 struct btrfs_trans_handle *trans; 3836 struct btrfs_trans_handle *trans;
3731 int ret; 3837 int ret;
@@ -3775,7 +3881,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
3775 void __user *arg) 3881 void __user *arg)
3776{ 3882{
3777 struct btrfs_ioctl_received_subvol_args *sa = NULL; 3883 struct btrfs_ioctl_received_subvol_args *sa = NULL;
3778 struct inode *inode = fdentry(file)->d_inode; 3884 struct inode *inode = file_inode(file);
3779 struct btrfs_root *root = BTRFS_I(inode)->root; 3885 struct btrfs_root *root = BTRFS_I(inode)->root;
3780 struct btrfs_root_item *root_item = &root->root_item; 3886 struct btrfs_root_item *root_item = &root->root_item;
3781 struct btrfs_trans_handle *trans; 3887 struct btrfs_trans_handle *trans;
@@ -3852,10 +3958,69 @@ out:
3852 return ret; 3958 return ret;
3853} 3959}
3854 3960
3961static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
3962{
3963 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3964 const char *label = root->fs_info->super_copy->label;
3965 size_t len = strnlen(label, BTRFS_LABEL_SIZE);
3966 int ret;
3967
3968 if (len == BTRFS_LABEL_SIZE) {
3969 pr_warn("btrfs: label is too long, return the first %zu bytes\n",
3970 --len);
3971 }
3972
3973 mutex_lock(&root->fs_info->volume_mutex);
3974 ret = copy_to_user(arg, label, len);
3975 mutex_unlock(&root->fs_info->volume_mutex);
3976
3977 return ret ? -EFAULT : 0;
3978}
3979
3980static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
3981{
3982 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3983 struct btrfs_super_block *super_block = root->fs_info->super_copy;
3984 struct btrfs_trans_handle *trans;
3985 char label[BTRFS_LABEL_SIZE];
3986 int ret;
3987
3988 if (!capable(CAP_SYS_ADMIN))
3989 return -EPERM;
3990
3991 if (copy_from_user(label, arg, sizeof(label)))
3992 return -EFAULT;
3993
3994 if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
3995 pr_err("btrfs: unable to set label with more than %d bytes\n",
3996 BTRFS_LABEL_SIZE - 1);
3997 return -EINVAL;
3998 }
3999
4000 ret = mnt_want_write_file(file);
4001 if (ret)
4002 return ret;
4003
4004 mutex_lock(&root->fs_info->volume_mutex);
4005 trans = btrfs_start_transaction(root, 0);
4006 if (IS_ERR(trans)) {
4007 ret = PTR_ERR(trans);
4008 goto out_unlock;
4009 }
4010
4011 strcpy(super_block->label, label);
4012 ret = btrfs_end_transaction(trans, root);
4013
4014out_unlock:
4015 mutex_unlock(&root->fs_info->volume_mutex);
4016 mnt_drop_write_file(file);
4017 return ret;
4018}
4019
3855long btrfs_ioctl(struct file *file, unsigned int 4020long btrfs_ioctl(struct file *file, unsigned int
3856 cmd, unsigned long arg) 4021 cmd, unsigned long arg)
3857{ 4022{
3858 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 4023 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
3859 void __user *argp = (void __user *)arg; 4024 void __user *argp = (void __user *)arg;
3860 4025
3861 switch (cmd) { 4026 switch (cmd) {
@@ -3952,6 +4117,10 @@ long btrfs_ioctl(struct file *file, unsigned int
3952 return btrfs_ioctl_qgroup_limit(file, argp); 4117 return btrfs_ioctl_qgroup_limit(file, argp);
3953 case BTRFS_IOC_DEV_REPLACE: 4118 case BTRFS_IOC_DEV_REPLACE:
3954 return btrfs_ioctl_dev_replace(root, argp); 4119 return btrfs_ioctl_dev_replace(root, argp);
4120 case BTRFS_IOC_GET_FSLABEL:
4121 return btrfs_ioctl_get_fslabel(file, argp);
4122 case BTRFS_IOC_SET_FSLABEL:
4123 return btrfs_ioctl_set_fslabel(file, argp);
3955 } 4124 }
3956 4125
3957 return -ENOTTY; 4126 return -ENOTTY;
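
A hedged user-space sketch of driving the two new label ioctls; the ioctl names come from the hunk above, but the uapi header location and the 256-byte BTRFS_LABEL_SIZE fallback are assumptions about the exported headers:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>	/* assumed to export BTRFS_IOC_{GET,SET}_FSLABEL */

#ifndef BTRFS_LABEL_SIZE
#define BTRFS_LABEL_SIZE 256	/* assumption: matches the kernel's define */
#endif

int main(int argc, char **argv)
{
	char label[BTRFS_LABEL_SIZE] = { 0 };
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);	/* any file or dir on the filesystem */
	if (fd < 0)
		return 1;

	if (ioctl(fd, BTRFS_IOC_GET_FSLABEL, label) == 0)
		printf("label: %s\n", label);

	if (argc > 2) {		/* optionally set a new label */
		memset(label, 0, sizeof(label));
		strncpy(label, argv[2], BTRFS_LABEL_SIZE - 1);
		if (ioctl(fd, BTRFS_IOC_SET_FSLABEL, label))
			perror("BTRFS_IOC_SET_FSLABEL");
	}
	close(fd);
	return 0;
}
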
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
deleted file mode 100644
index dabca9cc8c2e..000000000000
--- a/fs/btrfs/ioctl.h
+++ /dev/null
@@ -1,502 +0,0 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __IOCTL_
20#define __IOCTL_
21#include <linux/ioctl.h>
22
23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255
25
26/* this should be 4k */
27#define BTRFS_PATH_NAME_MAX 4087
28struct btrfs_ioctl_vol_args {
29 __s64 fd;
30 char name[BTRFS_PATH_NAME_MAX + 1];
31};
32
33#define BTRFS_DEVICE_PATH_NAME_MAX 1024
34
35#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
36#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
37#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
38#define BTRFS_FSID_SIZE 16
39#define BTRFS_UUID_SIZE 16
40
41#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0)
42
43struct btrfs_qgroup_limit {
44 __u64 flags;
45 __u64 max_rfer;
46 __u64 max_excl;
47 __u64 rsv_rfer;
48 __u64 rsv_excl;
49};
50
51struct btrfs_qgroup_inherit {
52 __u64 flags;
53 __u64 num_qgroups;
54 __u64 num_ref_copies;
55 __u64 num_excl_copies;
56 struct btrfs_qgroup_limit lim;
57 __u64 qgroups[0];
58};
59
60struct btrfs_ioctl_qgroup_limit_args {
61 __u64 qgroupid;
62 struct btrfs_qgroup_limit lim;
63};
64
65#define BTRFS_SUBVOL_NAME_MAX 4039
66struct btrfs_ioctl_vol_args_v2 {
67 __s64 fd;
68 __u64 transid;
69 __u64 flags;
70 union {
71 struct {
72 __u64 size;
73 struct btrfs_qgroup_inherit __user *qgroup_inherit;
74 };
75 __u64 unused[4];
76 };
77 char name[BTRFS_SUBVOL_NAME_MAX + 1];
78};
79
80/*
81 * structure to report errors and progress to userspace, either as a
82 * result of a finished scrub, a canceled scrub or a progress inquiry
83 */
84struct btrfs_scrub_progress {
85 __u64 data_extents_scrubbed; /* # of data extents scrubbed */
86 __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */
87 __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */
88 __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */
89 __u64 read_errors; /* # of read errors encountered (EIO) */
90 __u64 csum_errors; /* # of failed csum checks */
91 __u64 verify_errors; /* # of occurences, where the metadata
92 * of a tree block did not match the
93 * expected values, like generation or
94 * logical */
95 __u64 no_csum; /* # of 4k data block for which no csum
96 * is present, probably the result of
97 * data written with nodatasum */
98 __u64 csum_discards; /* # of csum for which no data was found
99 * in the extent tree. */
100 __u64 super_errors; /* # of bad super blocks encountered */
101 __u64 malloc_errors; /* # of internal kmalloc errors. These
102 * will likely cause an incomplete
103 * scrub */
104 __u64 uncorrectable_errors; /* # of errors where either no intact
105 * copy was found or the writeback
106 * failed */
107 __u64 corrected_errors; /* # of errors corrected */
108 __u64 last_physical; /* last physical address scrubbed. In
109 * case a scrub was aborted, this can
110 * be used to restart the scrub */
111 __u64 unverified_errors; /* # of occurences where a read for a
112 * full (64k) bio failed, but the re-
113 * check succeeded for each 4k piece.
114 * Intermittent error. */
115};
116
117#define BTRFS_SCRUB_READONLY 1
118struct btrfs_ioctl_scrub_args {
119 __u64 devid; /* in */
120 __u64 start; /* in */
121 __u64 end; /* in */
122 __u64 flags; /* in */
123 struct btrfs_scrub_progress progress; /* out */
124 /* pad to 1k */
125 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
126};
127
128#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
129#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
130struct btrfs_ioctl_dev_replace_start_params {
131 __u64 srcdevid; /* in, if 0, use srcdev_name instead */
132 __u64 cont_reading_from_srcdev_mode; /* in, see #define
133 * above */
134 __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
135 __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
136};
137
138#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0
139#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1
140#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2
141#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3
142#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4
143struct btrfs_ioctl_dev_replace_status_params {
144 __u64 replace_state; /* out, see #define above */
145 __u64 progress_1000; /* out, 0 <= x <= 1000 */
146 __u64 time_started; /* out, seconds since 1-Jan-1970 */
147 __u64 time_stopped; /* out, seconds since 1-Jan-1970 */
148 __u64 num_write_errors; /* out */
149 __u64 num_uncorrectable_read_errors; /* out */
150};
151
152#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0
153#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1
154#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2
155#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0
156#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1
157#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2
158struct btrfs_ioctl_dev_replace_args {
159 __u64 cmd; /* in */
160 __u64 result; /* out */
161
162 union {
163 struct btrfs_ioctl_dev_replace_start_params start;
164 struct btrfs_ioctl_dev_replace_status_params status;
165 }; /* in/out */
166
167 __u64 spare[64];
168};
169
170struct btrfs_ioctl_dev_info_args {
171 __u64 devid; /* in/out */
172 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
173 __u64 bytes_used; /* out */
174 __u64 total_bytes; /* out */
175 __u64 unused[379]; /* pad to 4k */
176 __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */
177};
178
179struct btrfs_ioctl_fs_info_args {
180 __u64 max_id; /* out */
181 __u64 num_devices; /* out */
182 __u8 fsid[BTRFS_FSID_SIZE]; /* out */
183 __u64 reserved[124]; /* pad to 1k */
184};
185
186/* balance control ioctl modes */
187#define BTRFS_BALANCE_CTL_PAUSE 1
188#define BTRFS_BALANCE_CTL_CANCEL 2
189
190/*
191 * this is packed, because it should be exactly the same as its disk
192 * byte order counterpart (struct btrfs_disk_balance_args)
193 */
194struct btrfs_balance_args {
195 __u64 profiles;
196 __u64 usage;
197 __u64 devid;
198 __u64 pstart;
199 __u64 pend;
200 __u64 vstart;
201 __u64 vend;
202
203 __u64 target;
204
205 __u64 flags;
206
207 __u64 unused[8];
208} __attribute__ ((__packed__));
209
210/* report balance progress to userspace */
211struct btrfs_balance_progress {
212 __u64 expected; /* estimated # of chunks that will be
213 * relocated to fulfill the request */
214 __u64 considered; /* # of chunks we have considered so far */
215 __u64 completed; /* # of chunks relocated so far */
216};
217
218#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0)
219#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1)
220#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
221
222struct btrfs_ioctl_balance_args {
223 __u64 flags; /* in/out */
224 __u64 state; /* out */
225
226 struct btrfs_balance_args data; /* in/out */
227 struct btrfs_balance_args meta; /* in/out */
228 struct btrfs_balance_args sys; /* in/out */
229
230 struct btrfs_balance_progress stat; /* out */
231
232 __u64 unused[72]; /* pad to 1k */
233};
234
235#define BTRFS_INO_LOOKUP_PATH_MAX 4080
236struct btrfs_ioctl_ino_lookup_args {
237 __u64 treeid;
238 __u64 objectid;
239 char name[BTRFS_INO_LOOKUP_PATH_MAX];
240};
241
242struct btrfs_ioctl_search_key {
243 /* which root are we searching. 0 is the tree of tree roots */
244 __u64 tree_id;
245
246 /* keys returned will be >= min and <= max */
247 __u64 min_objectid;
248 __u64 max_objectid;
249
250 /* keys returned will be >= min and <= max */
251 __u64 min_offset;
252 __u64 max_offset;
253
254 /* max and min transids to search for */
255 __u64 min_transid;
256 __u64 max_transid;
257
258 /* keys returned will be >= min and <= max */
259 __u32 min_type;
260 __u32 max_type;
261
262 /*
263 * how many items did userland ask for, and how many are we
264 * returning
265 */
266 __u32 nr_items;
267
268 /* align to 64 bits */
269 __u32 unused;
270
271 /* some extra for later */
272 __u64 unused1;
273 __u64 unused2;
274 __u64 unused3;
275 __u64 unused4;
276};
277
278struct btrfs_ioctl_search_header {
279 __u64 transid;
280 __u64 objectid;
281 __u64 offset;
282 __u32 type;
283 __u32 len;
284};
285
286#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
287/*
288 * the buf is an array of search headers where
289 * each header is followed by the actual item
290 * the type field is expanded to 32 bits for alignment
291 */
292struct btrfs_ioctl_search_args {
293 struct btrfs_ioctl_search_key key;
294 char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
295};
296
297struct btrfs_ioctl_clone_range_args {
298 __s64 src_fd;
299 __u64 src_offset, src_length;
300 __u64 dest_offset;
301};
302
303/* flags for the defrag range ioctl */
304#define BTRFS_DEFRAG_RANGE_COMPRESS 1
305#define BTRFS_DEFRAG_RANGE_START_IO 2
306
307struct btrfs_ioctl_space_info {
308 __u64 flags;
309 __u64 total_bytes;
310 __u64 used_bytes;
311};
312
313struct btrfs_ioctl_space_args {
314 __u64 space_slots;
315 __u64 total_spaces;
316 struct btrfs_ioctl_space_info spaces[0];
317};
318
319struct btrfs_data_container {
320 __u32 bytes_left; /* out -- bytes not needed to deliver output */
321 __u32 bytes_missing; /* out -- additional bytes needed for result */
322 __u32 elem_cnt; /* out */
323 __u32 elem_missed; /* out */
324 __u64 val[0]; /* out */
325};
326
327struct btrfs_ioctl_ino_path_args {
328 __u64 inum; /* in */
329 __u64 size; /* in */
330 __u64 reserved[4];
331 /* struct btrfs_data_container *fspath; out */
332 __u64 fspath; /* out */
333};
334
335struct btrfs_ioctl_logical_ino_args {
336 __u64 logical; /* in */
337 __u64 size; /* in */
338 __u64 reserved[4];
339 /* struct btrfs_data_container *inodes; out */
340 __u64 inodes;
341};
342
343enum btrfs_dev_stat_values {
344 /* disk I/O failure stats */
345 BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */
346 BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */
347 BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */
348
349 /* stats for indirect indications for I/O failures */
350 BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or
351 * contents is illegal: this is an
352 * indication that the block was damaged
353 * during read or write, or written to
354 * wrong location or read from wrong
355 * location */
356 BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not
357 * been written */
358
359 BTRFS_DEV_STAT_VALUES_MAX
360};
361
362/* Reset statistics after reading; needs SYS_ADMIN capability */
363#define BTRFS_DEV_STATS_RESET (1ULL << 0)
364
365struct btrfs_ioctl_get_dev_stats {
366 __u64 devid; /* in */
367 __u64 nr_items; /* in/out */
368 __u64 flags; /* in/out */
369
370 /* out values: */
371 __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
372
373 __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
374};
375
376#define BTRFS_QUOTA_CTL_ENABLE 1
377#define BTRFS_QUOTA_CTL_DISABLE 2
378#define BTRFS_QUOTA_CTL_RESCAN 3
379struct btrfs_ioctl_quota_ctl_args {
380 __u64 cmd;
381 __u64 status;
382};
383
384struct btrfs_ioctl_qgroup_assign_args {
385 __u64 assign;
386 __u64 src;
387 __u64 dst;
388};
389
390struct btrfs_ioctl_qgroup_create_args {
391 __u64 create;
392 __u64 qgroupid;
393};
394struct btrfs_ioctl_timespec {
395 __u64 sec;
396 __u32 nsec;
397};
398
399struct btrfs_ioctl_received_subvol_args {
400 char uuid[BTRFS_UUID_SIZE]; /* in */
401 __u64 stransid; /* in */
402 __u64 rtransid; /* out */
403 struct btrfs_ioctl_timespec stime; /* in */
404 struct btrfs_ioctl_timespec rtime; /* out */
405 __u64 flags; /* in */
406 __u64 reserved[16]; /* in */
407};
408
409struct btrfs_ioctl_send_args {
410 __s64 send_fd; /* in */
411 __u64 clone_sources_count; /* in */
412 __u64 __user *clone_sources; /* in */
413 __u64 parent_root; /* in */
414 __u64 flags; /* in */
415 __u64 reserved[4]; /* in */
416};
417
418#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
419 struct btrfs_ioctl_vol_args)
420#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
421 struct btrfs_ioctl_vol_args)
422#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
423 struct btrfs_ioctl_vol_args)
424#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
425 struct btrfs_ioctl_vol_args)
426/* trans start and trans end are dangerous, and only for
427 * use by applications that know how to avoid the
428 * resulting deadlocks
429 */
430#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
431#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
432#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
433
434#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
435#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
436 struct btrfs_ioctl_vol_args)
437#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
438 struct btrfs_ioctl_vol_args)
439#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
440 struct btrfs_ioctl_vol_args)
441
442#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
443 struct btrfs_ioctl_clone_range_args)
444
445#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
446 struct btrfs_ioctl_vol_args)
447#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
448 struct btrfs_ioctl_vol_args)
449#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
450 struct btrfs_ioctl_defrag_range_args)
451#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
452 struct btrfs_ioctl_search_args)
453#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
454 struct btrfs_ioctl_ino_lookup_args)
455#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
456#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
457 struct btrfs_ioctl_space_args)
458#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
459#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
460#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
461 struct btrfs_ioctl_vol_args_v2)
462#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
463 struct btrfs_ioctl_vol_args_v2)
464#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
465#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
466#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
467 struct btrfs_ioctl_scrub_args)
468#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28)
469#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \
470 struct btrfs_ioctl_scrub_args)
471#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \
472 struct btrfs_ioctl_dev_info_args)
473#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
474 struct btrfs_ioctl_fs_info_args)
475#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
476 struct btrfs_ioctl_balance_args)
477#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
478#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
479 struct btrfs_ioctl_balance_args)
480#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
481 struct btrfs_ioctl_ino_path_args)
482#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
483 struct btrfs_ioctl_ino_path_args)
484#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \
485 struct btrfs_ioctl_received_subvol_args)
486#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args)
487#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \
488 struct btrfs_ioctl_vol_args)
489#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \
490 struct btrfs_ioctl_quota_ctl_args)
491#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \
492 struct btrfs_ioctl_qgroup_assign_args)
493#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \
494 struct btrfs_ioctl_qgroup_create_args)
495#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \
496 struct btrfs_ioctl_qgroup_limit_args)
497#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
498 struct btrfs_ioctl_get_dev_stats)
499#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
500 struct btrfs_ioctl_dev_replace_args)
501
502#endif
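
The request codes above are built from the standard _IO/_IOW/_IOWR macros, so user space drives them with plain ioctl(2) on a descriptor that lives on the btrfs mount. A minimal userspace sketch, not part of the patch: it assumes BTRFS_IOCTL_MAGIC is the 0x94 defined earlier in this header, re-derives two of the codes locally, and reflinks a hypothetical file "src" into "dst" on the same filesystem before forcing a transaction commit.

/* Illustrative userspace sketch only; "src" and "dst" are placeholder paths. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_IOC_SYNC  _IO(BTRFS_IOCTL_MAGIC, 8)
#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)

int main(void)
{
	int src = open("src", O_RDONLY);
	int dst = open("dst", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (src < 0 || dst < 0)
		return 1;

	/* BTRFS_IOC_CLONE takes the source fd as its int argument (reflink) */
	if (ioctl(dst, BTRFS_IOC_CLONE, src) < 0)
		perror("BTRFS_IOC_CLONE");

	/* commit the current transaction; any fd on the filesystem will do */
	if (ioctl(dst, BTRFS_IOC_SYNC, NULL) < 0)
		perror("BTRFS_IOC_SYNC");

	close(src);
	close(dst);
	return 0;
}
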
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 2a1762c66041..e95df435d897 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -113,11 +113,10 @@ again:
113 read_unlock(&eb->lock); 113 read_unlock(&eb->lock);
114 return; 114 return;
115 } 115 }
116 read_unlock(&eb->lock);
117 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
118 read_lock(&eb->lock);
119 if (atomic_read(&eb->blocking_writers)) { 116 if (atomic_read(&eb->blocking_writers)) {
120 read_unlock(&eb->lock); 117 read_unlock(&eb->lock);
118 wait_event(eb->write_lock_wq,
119 atomic_read(&eb->blocking_writers) == 0);
121 goto again; 120 goto again;
122 } 121 }
123 atomic_inc(&eb->read_locks); 122 atomic_inc(&eb->read_locks);
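
The reordering above means the reader only drops eb->lock and sleeps on write_lock_wq when it actually saw blocking writers, instead of unconditionally unlocking, waiting and relocking on every pass. A rough userspace analogue of the same pattern with a pthread mutex and condition variable, all names hypothetical and purely illustrative:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  no_blocking_writers = PTHREAD_COND_INITIALIZER;
static int blocking_writers;
static int read_locks;

void tree_read_lock(void)
{
	pthread_mutex_lock(&lock);
	/*
	 * Only sleep when a blocking writer is really there.
	 * pthread_cond_wait() drops the mutex while sleeping and retakes it
	 * before returning, which is what the read_unlock()/wait_event()/
	 * "goto again" sequence in the kernel code does by hand.
	 */
	while (blocking_writers)
		pthread_cond_wait(&no_blocking_writers, &lock);
	read_locks++;
	pthread_mutex_unlock(&lock);
}

void tree_write_unlock_blocking(void)
{
	pthread_mutex_lock(&lock);
	if (--blocking_writers == 0)
		pthread_cond_broadcast(&no_blocking_writers);
	pthread_mutex_unlock(&lock);
}
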
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index f10731297040..dc08d77b717e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -196,6 +196,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
196 entry->file_offset = file_offset; 196 entry->file_offset = file_offset;
197 entry->start = start; 197 entry->start = start;
198 entry->len = len; 198 entry->len = len;
199 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
200 !(type == BTRFS_ORDERED_NOCOW))
201 entry->csum_bytes_left = disk_len;
199 entry->disk_len = disk_len; 202 entry->disk_len = disk_len;
200 entry->bytes_left = len; 203 entry->bytes_left = len;
201 entry->inode = igrab(inode); 204 entry->inode = igrab(inode);
@@ -213,6 +216,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
213 INIT_LIST_HEAD(&entry->root_extent_list); 216 INIT_LIST_HEAD(&entry->root_extent_list);
214 INIT_LIST_HEAD(&entry->work_list); 217 INIT_LIST_HEAD(&entry->work_list);
215 init_completion(&entry->completion); 218 init_completion(&entry->completion);
219 INIT_LIST_HEAD(&entry->log_list);
216 220
217 trace_btrfs_ordered_extent_add(inode, entry); 221 trace_btrfs_ordered_extent_add(inode, entry);
218 222
@@ -270,6 +274,10 @@ void btrfs_add_ordered_sum(struct inode *inode,
270 tree = &BTRFS_I(inode)->ordered_tree; 274 tree = &BTRFS_I(inode)->ordered_tree;
271 spin_lock_irq(&tree->lock); 275 spin_lock_irq(&tree->lock);
272 list_add_tail(&sum->list, &entry->list); 276 list_add_tail(&sum->list, &entry->list);
277 WARN_ON(entry->csum_bytes_left < sum->len);
278 entry->csum_bytes_left -= sum->len;
279 if (entry->csum_bytes_left == 0)
280 wake_up(&entry->wait);
273 spin_unlock_irq(&tree->lock); 281 spin_unlock_irq(&tree->lock);
274} 282}
275 283
@@ -405,6 +413,66 @@ out:
405 return ret == 0; 413 return ret == 0;
406} 414}
407 415
416/* Needs to either be called under a log transaction or the log_mutex */
417void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
418{
419 struct btrfs_ordered_inode_tree *tree;
420 struct btrfs_ordered_extent *ordered;
421 struct rb_node *n;
422 int index = log->log_transid % 2;
423
424 tree = &BTRFS_I(inode)->ordered_tree;
425 spin_lock_irq(&tree->lock);
426 for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
427 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
428 spin_lock(&log->log_extents_lock[index]);
429 if (list_empty(&ordered->log_list)) {
430 list_add_tail(&ordered->log_list, &log->logged_list[index]);
431 atomic_inc(&ordered->refs);
432 }
433 spin_unlock(&log->log_extents_lock[index]);
434 }
435 spin_unlock_irq(&tree->lock);
436}
437
438void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
439{
440 struct btrfs_ordered_extent *ordered;
441 int index = transid % 2;
442
443 spin_lock_irq(&log->log_extents_lock[index]);
444 while (!list_empty(&log->logged_list[index])) {
445 ordered = list_first_entry(&log->logged_list[index],
446 struct btrfs_ordered_extent,
447 log_list);
448 list_del_init(&ordered->log_list);
449 spin_unlock_irq(&log->log_extents_lock[index]);
450 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
451 &ordered->flags));
452 btrfs_put_ordered_extent(ordered);
453 spin_lock_irq(&log->log_extents_lock[index]);
454 }
455 spin_unlock_irq(&log->log_extents_lock[index]);
456}
457
458void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
459{
460 struct btrfs_ordered_extent *ordered;
461 int index = transid % 2;
462
463 spin_lock_irq(&log->log_extents_lock[index]);
464 while (!list_empty(&log->logged_list[index])) {
465 ordered = list_first_entry(&log->logged_list[index],
466 struct btrfs_ordered_extent,
467 log_list);
468 list_del_init(&ordered->log_list);
469 spin_unlock_irq(&log->log_extents_lock[index]);
470 btrfs_put_ordered_extent(ordered);
471 spin_lock_irq(&log->log_extents_lock[index]);
472 }
473 spin_unlock_irq(&log->log_extents_lock[index]);
474}
475
408/* 476/*
409 * used to drop a reference on an ordered extent. This will free 477 * used to drop a reference on an ordered extent. This will free
410 * the extent if the last reference is dropped 478 * the extent if the last reference is dropped
@@ -544,10 +612,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
544 * extra check to make sure the ordered operation list really is empty 612 * extra check to make sure the ordered operation list really is empty
545 * before we return 613 * before we return
546 */ 614 */
547int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) 615int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
616 struct btrfs_root *root, int wait)
548{ 617{
549 struct btrfs_inode *btrfs_inode; 618 struct btrfs_inode *btrfs_inode;
550 struct inode *inode; 619 struct inode *inode;
620 struct btrfs_transaction *cur_trans = trans->transaction;
551 struct list_head splice; 621 struct list_head splice;
552 struct list_head works; 622 struct list_head works;
553 struct btrfs_delalloc_work *work, *next; 623 struct btrfs_delalloc_work *work, *next;
@@ -558,14 +628,10 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
558 628
559 mutex_lock(&root->fs_info->ordered_operations_mutex); 629 mutex_lock(&root->fs_info->ordered_operations_mutex);
560 spin_lock(&root->fs_info->ordered_extent_lock); 630 spin_lock(&root->fs_info->ordered_extent_lock);
561again: 631 list_splice_init(&cur_trans->ordered_operations, &splice);
562 list_splice_init(&root->fs_info->ordered_operations, &splice);
563
564 while (!list_empty(&splice)) { 632 while (!list_empty(&splice)) {
565
566 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 633 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
567 ordered_operations); 634 ordered_operations);
568
569 inode = &btrfs_inode->vfs_inode; 635 inode = &btrfs_inode->vfs_inode;
570 636
571 list_del_init(&btrfs_inode->ordered_operations); 637 list_del_init(&btrfs_inode->ordered_operations);
@@ -574,24 +640,22 @@ again:
574 * the inode may be getting freed (in sys_unlink path). 640 * the inode may be getting freed (in sys_unlink path).
575 */ 641 */
576 inode = igrab(inode); 642 inode = igrab(inode);
577
578 if (!wait && inode) {
579 list_add_tail(&BTRFS_I(inode)->ordered_operations,
580 &root->fs_info->ordered_operations);
581 }
582
583 if (!inode) 643 if (!inode)
584 continue; 644 continue;
645
646 if (!wait)
647 list_add_tail(&BTRFS_I(inode)->ordered_operations,
648 &cur_trans->ordered_operations);
585 spin_unlock(&root->fs_info->ordered_extent_lock); 649 spin_unlock(&root->fs_info->ordered_extent_lock);
586 650
587 work = btrfs_alloc_delalloc_work(inode, wait, 1); 651 work = btrfs_alloc_delalloc_work(inode, wait, 1);
588 if (!work) { 652 if (!work) {
653 spin_lock(&root->fs_info->ordered_extent_lock);
589 if (list_empty(&BTRFS_I(inode)->ordered_operations)) 654 if (list_empty(&BTRFS_I(inode)->ordered_operations))
590 list_add_tail(&btrfs_inode->ordered_operations, 655 list_add_tail(&btrfs_inode->ordered_operations,
591 &splice); 656 &splice);
592 spin_lock(&root->fs_info->ordered_extent_lock);
593 list_splice_tail(&splice, 657 list_splice_tail(&splice,
594 &root->fs_info->ordered_operations); 658 &cur_trans->ordered_operations);
595 spin_unlock(&root->fs_info->ordered_extent_lock); 659 spin_unlock(&root->fs_info->ordered_extent_lock);
596 ret = -ENOMEM; 660 ret = -ENOMEM;
597 goto out; 661 goto out;
@@ -603,9 +667,6 @@ again:
603 cond_resched(); 667 cond_resched();
604 spin_lock(&root->fs_info->ordered_extent_lock); 668 spin_lock(&root->fs_info->ordered_extent_lock);
605 } 669 }
606 if (wait && !list_empty(&root->fs_info->ordered_operations))
607 goto again;
608
609 spin_unlock(&root->fs_info->ordered_extent_lock); 670 spin_unlock(&root->fs_info->ordered_extent_lock);
610out: 671out:
611 list_for_each_entry_safe(work, next, &works, list) { 672 list_for_each_entry_safe(work, next, &works, list) {
@@ -836,9 +897,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
836 * if the disk i_size is already at the inode->i_size, or 897 * if the disk i_size is already at the inode->i_size, or
837 * this ordered extent is inside the disk i_size, we're done 898 * this ordered extent is inside the disk i_size, we're done
838 */ 899 */
839 if (disk_i_size == i_size || offset <= disk_i_size) { 900 if (disk_i_size == i_size)
901 goto out;
902
903 /*
904 * We still need to update disk_i_size if outstanding_isize is greater
905 * than disk_i_size.
906 */
907 if (offset <= disk_i_size &&
908 (!ordered || ordered->outstanding_isize <= disk_i_size))
840 goto out; 909 goto out;
841 }
842 910
843 /* 911 /*
844 * walk backward from this ordered extent to disk_i_size. 912 * walk backward from this ordered extent to disk_i_size.
@@ -870,7 +938,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
870 break; 938 break;
871 if (test->file_offset >= i_size) 939 if (test->file_offset >= i_size)
872 break; 940 break;
873 if (test->file_offset >= disk_i_size) { 941 if (entry_end(test) > disk_i_size) {
874 /* 942 /*
875 * we don't update disk_i_size now, so record this 943 * we don't update disk_i_size now, so record this
876 * undealt i_size. Or we will not know the real 944 * undealt i_size. Or we will not know the real
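
The second comparison change matters for an ordered extent that straddles disk_i_size: with disk_i_size at 8K, an extent covering [4K, 12K) starts below disk_i_size, so the old file_offset test skipped it and its 12K end was never recorded as outstanding; testing entry_end(), which in ordered-data.c is essentially file_offset + len, catches it. A toy check of the two tests with made-up numbers:

#include <stdio.h>

struct toy_ordered { unsigned long long file_offset, len; };

static unsigned long long entry_end(const struct toy_ordered *t)
{
	return t->file_offset + t->len;		/* overflow guard omitted */
}

int main(void)
{
	unsigned long long disk_i_size = 8192;		/* 8K already on disk */
	struct toy_ordered test = { 4096, 8192 };	/* covers [4K, 12K)   */

	printf("old test, file_offset >= disk_i_size: %d\n",
	       test.file_offset >= disk_i_size);	/* 0: extent ignored    */
	printf("new test, entry_end   >  disk_i_size: %d\n",
	       entry_end(&test) > disk_i_size);		/* 1: 12K gets recorded */
	return 0;
}
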
@@ -967,6 +1035,7 @@ out:
967void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 1035void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
968 struct btrfs_root *root, struct inode *inode) 1036 struct btrfs_root *root, struct inode *inode)
969{ 1037{
1038 struct btrfs_transaction *cur_trans = trans->transaction;
970 u64 last_mod; 1039 u64 last_mod;
971 1040
972 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); 1041 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
@@ -981,7 +1050,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
981 spin_lock(&root->fs_info->ordered_extent_lock); 1050 spin_lock(&root->fs_info->ordered_extent_lock);
982 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 1051 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
983 list_add_tail(&BTRFS_I(inode)->ordered_operations, 1052 list_add_tail(&BTRFS_I(inode)->ordered_operations,
984 &root->fs_info->ordered_operations); 1053 &cur_trans->ordered_operations);
985 } 1054 }
986 spin_unlock(&root->fs_info->ordered_extent_lock); 1055 spin_unlock(&root->fs_info->ordered_extent_lock);
987} 1056}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f29d4bf5fbe7..8eadfe406cdd 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -79,6 +79,8 @@ struct btrfs_ordered_sum {
79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent 79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
80 * has done its due diligence in updating 80 * has done its due diligence in updating
81 * the isize. */ 81 * the isize. */
 82#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this
83 ordered extent */
82 84
83struct btrfs_ordered_extent { 85struct btrfs_ordered_extent {
84 /* logical offset in the file */ 86 /* logical offset in the file */
@@ -96,6 +98,9 @@ struct btrfs_ordered_extent {
96 /* number of bytes that still need writing */ 98 /* number of bytes that still need writing */
97 u64 bytes_left; 99 u64 bytes_left;
98 100
101 /* number of bytes that still need csumming */
102 u64 csum_bytes_left;
103
99 /* 104 /*
100 * the end of the ordered extent which is behind it but 105 * the end of the ordered extent which is behind it but
101 * didn't update disk_i_size. Please see the comment of 106 * didn't update disk_i_size. Please see the comment of
@@ -118,6 +123,9 @@ struct btrfs_ordered_extent {
118 /* list of checksums for insertion when the extent io is done */ 123 /* list of checksums for insertion when the extent io is done */
119 struct list_head list; 124 struct list_head list;
120 125
126 /* If we need to wait on this to be done */
127 struct list_head log_list;
128
121 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ 129 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
122 wait_queue_head_t wait; 130 wait_queue_head_t wait;
123 131
@@ -189,11 +197,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
189int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 197int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
190 struct btrfs_ordered_extent *ordered); 198 struct btrfs_ordered_extent *ordered);
191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 199int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
192int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 200int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
201 struct btrfs_root *root, int wait);
193void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 202void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
194 struct btrfs_root *root, 203 struct btrfs_root *root,
195 struct inode *inode); 204 struct inode *inode);
196void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); 205void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
206void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
207void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
208void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
197int __init ordered_data_init(void); 209int __init ordered_data_init(void);
198void ordered_data_exit(void); 210void ordered_data_exit(void);
199#endif 211#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 50d95fd190a5..920957ecb27e 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -294,6 +294,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
294 btrfs_dev_extent_chunk_offset(l, dev_extent), 294 btrfs_dev_extent_chunk_offset(l, dev_extent),
295 (unsigned long long) 295 (unsigned long long)
296 btrfs_dev_extent_length(l, dev_extent)); 296 btrfs_dev_extent_length(l, dev_extent));
297 break;
297 case BTRFS_DEV_STATS_KEY: 298 case BTRFS_DEV_STATS_KEY:
298 printk(KERN_INFO "\t\tdevice stats\n"); 299 printk(KERN_INFO "\t\tdevice stats\n");
299 break; 300 break;
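
The added line is a missing break: without it, printing a BTRFS_DEV_EXTENT_KEY item fell straight through into the device-stats case and printed a bogus "device stats" line as well. A minimal reproduction of that kind of fall-through:

#include <stdio.h>

static void print_item(int type)
{
	switch (type) {
	case 1:
		printf("\t\tdev extent\n");
		/* without a break here, control falls into the next case */
	case 2:
		printf("\t\tdevice stats\n");
		break;
	}
}

int main(void)
{
	print_item(1);		/* prints both lines until the break is added */
	return 0;
}
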
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fe9d02c45f8e..aee4b1cc3d98 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -23,13 +23,13 @@
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/workqueue.h> 25#include <linux/workqueue.h>
26#include <linux/btrfs.h>
26 27
27#include "ctree.h" 28#include "ctree.h"
28#include "transaction.h" 29#include "transaction.h"
29#include "disk-io.h" 30#include "disk-io.h"
30#include "locking.h" 31#include "locking.h"
31#include "ulist.h" 32#include "ulist.h"
32#include "ioctl.h"
33#include "backref.h" 33#include "backref.h"
34 34
35/* TODO XXX FIXME 35/* TODO XXX FIXME
@@ -379,6 +379,13 @@ next1:
379 379
380 ret = add_relation_rb(fs_info, found_key.objectid, 380 ret = add_relation_rb(fs_info, found_key.objectid,
381 found_key.offset); 381 found_key.offset);
382 if (ret == -ENOENT) {
383 printk(KERN_WARNING
384 "btrfs: orphan qgroup relation 0x%llx->0x%llx\n",
385 (unsigned long long)found_key.objectid,
386 (unsigned long long)found_key.offset);
387 ret = 0; /* ignore the error */
388 }
382 if (ret) 389 if (ret)
383 goto out; 390 goto out;
384next2: 391next2:
@@ -613,7 +620,9 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
613 key.offset = qgroupid; 620 key.offset = qgroupid;
614 621
615 path = btrfs_alloc_path(); 622 path = btrfs_alloc_path();
616 BUG_ON(!path); 623 if (!path)
624 return -ENOMEM;
625
617 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 626 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
618 if (ret > 0) 627 if (ret > 0)
619 ret = -ENOENT; 628 ret = -ENOENT;
@@ -654,7 +663,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
654 key.offset = qgroup->qgroupid; 663 key.offset = qgroup->qgroupid;
655 664
656 path = btrfs_alloc_path(); 665 path = btrfs_alloc_path();
657 BUG_ON(!path); 666 if (!path)
667 return -ENOMEM;
668
658 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 669 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
659 if (ret > 0) 670 if (ret > 0)
660 ret = -ENOENT; 671 ret = -ENOENT;
@@ -695,7 +706,9 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
695 key.offset = 0; 706 key.offset = 0;
696 707
697 path = btrfs_alloc_path(); 708 path = btrfs_alloc_path();
698 BUG_ON(!path); 709 if (!path)
710 return -ENOMEM;
711
699 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 712 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
700 if (ret > 0) 713 if (ret > 0)
701 ret = -ENOENT; 714 ret = -ENOENT;
@@ -725,33 +738,38 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
725{ 738{
726 struct btrfs_path *path; 739 struct btrfs_path *path;
727 struct btrfs_key key; 740 struct btrfs_key key;
741 struct extent_buffer *leaf = NULL;
728 int ret; 742 int ret;
729 743 int nr = 0;
730 if (!root)
731 return -EINVAL;
732 744
733 path = btrfs_alloc_path(); 745 path = btrfs_alloc_path();
734 if (!path) 746 if (!path)
735 return -ENOMEM; 747 return -ENOMEM;
736 748
737 while (1) { 749 path->leave_spinning = 1;
738 key.objectid = 0;
739 key.offset = 0;
740 key.type = 0;
741 750
742 path->leave_spinning = 1; 751 key.objectid = 0;
752 key.offset = 0;
753 key.type = 0;
754
755 while (1) {
743 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 756 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
744 if (ret > 0) { 757 if (ret < 0)
745 if (path->slots[0] == 0) 758 goto out;
746 break; 759 leaf = path->nodes[0];
747 path->slots[0]--; 760 nr = btrfs_header_nritems(leaf);
748 } else if (ret < 0) { 761 if (!nr)
749 break; 762 break;
750 } 763 /*
751 764 * delete the leaf one by one
752 ret = btrfs_del_item(trans, root, path); 765 * since the whole tree is going
766 * to be deleted.
767 */
768 path->slots[0] = 0;
769 ret = btrfs_del_items(trans, root, path, 0, nr);
753 if (ret) 770 if (ret)
754 goto out; 771 goto out;
772
755 btrfs_release_path(path); 773 btrfs_release_path(path);
756 } 774 }
757 ret = 0; 775 ret = 0;
@@ -840,6 +858,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
840 int ret = 0; 858 int ret = 0;
841 859
842 spin_lock(&fs_info->qgroup_lock); 860 spin_lock(&fs_info->qgroup_lock);
861 if (!fs_info->quota_root) {
862 spin_unlock(&fs_info->qgroup_lock);
863 return 0;
864 }
843 fs_info->quota_enabled = 0; 865 fs_info->quota_enabled = 0;
844 fs_info->pending_quota_state = 0; 866 fs_info->pending_quota_state = 0;
845 quota_root = fs_info->quota_root; 867 quota_root = fs_info->quota_root;
@@ -956,17 +978,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
956 struct btrfs_fs_info *fs_info, u64 qgroupid) 978 struct btrfs_fs_info *fs_info, u64 qgroupid)
957{ 979{
958 struct btrfs_root *quota_root; 980 struct btrfs_root *quota_root;
981 struct btrfs_qgroup *qgroup;
959 int ret = 0; 982 int ret = 0;
960 983
961 quota_root = fs_info->quota_root; 984 quota_root = fs_info->quota_root;
962 if (!quota_root) 985 if (!quota_root)
963 return -EINVAL; 986 return -EINVAL;
964 987
988 /* check if there are no relations to this qgroup */
989 spin_lock(&fs_info->qgroup_lock);
990 qgroup = find_qgroup_rb(fs_info, qgroupid);
991 if (qgroup) {
992 if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) {
993 spin_unlock(&fs_info->qgroup_lock);
994 return -EBUSY;
995 }
996 }
997 spin_unlock(&fs_info->qgroup_lock);
998
965 ret = del_qgroup_item(trans, quota_root, qgroupid); 999 ret = del_qgroup_item(trans, quota_root, qgroupid);
966 1000
967 spin_lock(&fs_info->qgroup_lock); 1001 spin_lock(&fs_info->qgroup_lock);
968 del_qgroup_rb(quota_root->fs_info, qgroupid); 1002 del_qgroup_rb(quota_root->fs_info, qgroupid);
969
970 spin_unlock(&fs_info->qgroup_lock); 1003 spin_unlock(&fs_info->qgroup_lock);
971 1004
972 return ret; 1005 return ret;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
new file mode 100644
index 000000000000..9a79fb790adb
--- /dev/null
+++ b/fs/btrfs/raid56.c
@@ -0,0 +1,2100 @@
1/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19#include <linux/sched.h>
20#include <linux/wait.h>
21#include <linux/bio.h>
22#include <linux/slab.h>
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/random.h>
26#include <linux/iocontext.h>
27#include <linux/capability.h>
28#include <linux/ratelimit.h>
29#include <linux/kthread.h>
30#include <linux/raid/pq.h>
31#include <linux/hash.h>
32#include <linux/list_sort.h>
33#include <linux/raid/xor.h>
34#include <linux/vmalloc.h>
35#include <asm/div64.h>
36#include "compat.h"
37#include "ctree.h"
38#include "extent_map.h"
39#include "disk-io.h"
40#include "transaction.h"
41#include "print-tree.h"
42#include "volumes.h"
43#include "raid56.h"
44#include "async-thread.h"
45#include "check-integrity.h"
46#include "rcu-string.h"
47
48/* set when additional merges to this rbio are not allowed */
49#define RBIO_RMW_LOCKED_BIT 1
50
51/*
52 * set when this rbio is sitting in the hash, but it is just a cache
53 * of past RMW
54 */
55#define RBIO_CACHE_BIT 2
56
57/*
58 * set when it is safe to trust the stripe_pages for caching
59 */
60#define RBIO_CACHE_READY_BIT 3
61
62
63#define RBIO_CACHE_SIZE 1024
64
65struct btrfs_raid_bio {
66 struct btrfs_fs_info *fs_info;
67 struct btrfs_bio *bbio;
68
69 /*
70 * logical block numbers for the start of each stripe
71 * The last one or two are p/q. These are sorted,
72 * so raid_map[0] is the start of our full stripe
73 */
74 u64 *raid_map;
75
76 /* while we're doing rmw on a stripe
77 * we put it into a hash table so we can
78 * lock the stripe and merge more rbios
79 * into it.
80 */
81 struct list_head hash_list;
82
83 /*
84 * LRU list for the stripe cache
85 */
86 struct list_head stripe_cache;
87
88 /*
89 * for scheduling work in the helper threads
90 */
91 struct btrfs_work work;
92
93 /*
94 * bio list and bio_list_lock are used
95 * to add more bios into the stripe
96 * in hopes of avoiding the full rmw
97 */
98 struct bio_list bio_list;
99 spinlock_t bio_list_lock;
100
101 /* also protected by the bio_list_lock, the
102 * plug list is used by the plugging code
103 * to collect partial bios while plugged. The
104 * stripe locking code also uses it to hand off
105 * the stripe lock to the next pending IO
106 */
107 struct list_head plug_list;
108
109 /*
110 * flags that tell us if it is safe to
111 * merge with this bio
112 */
113 unsigned long flags;
114
115 /* size of each individual stripe on disk */
116 int stripe_len;
117
118 /* number of data stripes (no p/q) */
119 int nr_data;
120
121 /*
122 * set if we're doing a parity rebuild
123 * for a read from higher up, which is handled
124 * differently from a parity rebuild as part of
125 * rmw
126 */
127 int read_rebuild;
128
129 /* first bad stripe */
130 int faila;
131
132 /* second bad stripe (for raid6 use) */
133 int failb;
134
135 /*
136 * number of pages needed to represent the full
137 * stripe
138 */
139 int nr_pages;
140
141 /*
142 * size of all the bios in the bio_list. This
143 * helps us decide if the rbio maps to a full
144 * stripe or not
145 */
146 int bio_list_bytes;
147
148 atomic_t refs;
149
150 /*
151 * these are two arrays of pointers. We allocate the
152 * rbio big enough to hold them both and setup their
153 * locations when the rbio is allocated
154 */
155
156 /* pointers to pages that we allocated for
157 * reading/writing stripes directly from the disk (including P/Q)
158 */
159 struct page **stripe_pages;
160
161 /*
162 * pointers to the pages in the bio_list. Stored
163 * here for faster lookup
164 */
165 struct page **bio_pages;
166};
167
168static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
169static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
170static void rmw_work(struct btrfs_work *work);
171static void read_rebuild_work(struct btrfs_work *work);
172static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
173static void async_read_rebuild(struct btrfs_raid_bio *rbio);
174static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
175static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
176static void __free_raid_bio(struct btrfs_raid_bio *rbio);
177static void index_rbio_pages(struct btrfs_raid_bio *rbio);
178static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
179
180/*
181 * the stripe hash table is used for locking, and to collect
182 * bios in hopes of making a full stripe
183 */
184int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
185{
186 struct btrfs_stripe_hash_table *table;
187 struct btrfs_stripe_hash_table *x;
188 struct btrfs_stripe_hash *cur;
189 struct btrfs_stripe_hash *h;
190 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
191 int i;
192 int table_size;
193
194 if (info->stripe_hash_table)
195 return 0;
196
197 /*
198 * The table is large, starting with order 4 and can go as high as
199 * order 7 in case lock debugging is turned on.
200 *
201 * Try harder to allocate and fallback to vmalloc to lower the chance
202 * of a failing mount.
203 */
204 table_size = sizeof(*table) + sizeof(*h) * num_entries;
205 table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
206 if (!table) {
207 table = vzalloc(table_size);
208 if (!table)
209 return -ENOMEM;
210 }
211
212 spin_lock_init(&table->cache_lock);
213 INIT_LIST_HEAD(&table->stripe_cache);
214
215 h = table->table;
216
217 for (i = 0; i < num_entries; i++) {
218 cur = h + i;
219 INIT_LIST_HEAD(&cur->hash_list);
220 spin_lock_init(&cur->lock);
221 init_waitqueue_head(&cur->wait);
222 }
223
224 x = cmpxchg(&info->stripe_hash_table, NULL, table);
225 if (x) {
226 if (is_vmalloc_addr(x))
227 vfree(x);
228 else
229 kfree(x);
230 }
231 return 0;
232}
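
The cmpxchg() at the end publishes the freshly built table only if nobody else installed one first; the loser of the race simply frees its copy, so the one-time setup needs no lock. A userspace sketch of the same publish-by-compare-and-swap idiom using C11 atomics, names hypothetical:

#include <stdatomic.h>
#include <stdlib.h>

static _Atomic(void *) table;

static int init_table(size_t size)
{
	void *mine = calloc(1, size);
	void *expected = NULL;

	if (!mine)
		return -1;
	/* install our table unless another thread already won the race */
	if (!atomic_compare_exchange_strong(&table, &expected, mine))
		free(mine);
	return 0;
}

int main(void)
{
	return init_table(4096) ? 1 : 0;
}
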
233
234/*
235 * caching an rbio means to copy anything from the
236 * bio_pages array into the stripe_pages array. We
237 * use the page uptodate bit in the stripe cache array
238 * to indicate if it has valid data
239 *
240 * once the caching is done, we set the cache ready
241 * bit.
242 */
243static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
244{
245 int i;
246 char *s;
247 char *d;
248 int ret;
249
250 ret = alloc_rbio_pages(rbio);
251 if (ret)
252 return;
253
254 for (i = 0; i < rbio->nr_pages; i++) {
255 if (!rbio->bio_pages[i])
256 continue;
257
258 s = kmap(rbio->bio_pages[i]);
259 d = kmap(rbio->stripe_pages[i]);
260
261 memcpy(d, s, PAGE_CACHE_SIZE);
262
263 kunmap(rbio->bio_pages[i]);
264 kunmap(rbio->stripe_pages[i]);
265 SetPageUptodate(rbio->stripe_pages[i]);
266 }
267 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
268}
269
270/*
271 * we hash on the first logical address of the stripe
272 */
273static int rbio_bucket(struct btrfs_raid_bio *rbio)
274{
275 u64 num = rbio->raid_map[0];
276
277 /*
278 * we shift down quite a bit. We're using byte
279 * addressing, and most of the lower bits are zeros.
280 * This tends to upset hash_64, and it consistently
281 * returns just one or two different values.
282 *
283 * shifting off the lower bits fixes things.
284 */
285 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
286}
287
288/*
289 * stealing an rbio means taking all the uptodate pages from the stripe
290 * array in the source rbio and putting them into the destination rbio
291 */
292static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
293{
294 int i;
295 struct page *s;
296 struct page *d;
297
298 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
299 return;
300
301 for (i = 0; i < dest->nr_pages; i++) {
302 s = src->stripe_pages[i];
303 if (!s || !PageUptodate(s)) {
304 continue;
305 }
306
307 d = dest->stripe_pages[i];
308 if (d)
309 __free_page(d);
310
311 dest->stripe_pages[i] = s;
312 src->stripe_pages[i] = NULL;
313 }
314}
315
316/*
317 * merging means we take the bio_list from the victim and
318 * splice it into the destination. The victim should
319 * be discarded afterwards.
320 *
321 * must be called with dest->rbio_list_lock held
322 */
323static void merge_rbio(struct btrfs_raid_bio *dest,
324 struct btrfs_raid_bio *victim)
325{
326 bio_list_merge(&dest->bio_list, &victim->bio_list);
327 dest->bio_list_bytes += victim->bio_list_bytes;
328 bio_list_init(&victim->bio_list);
329}
330
331/*
332 * used to prune items that are in the cache. The caller
333 * must hold the hash table lock.
334 */
335static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
336{
337 int bucket = rbio_bucket(rbio);
338 struct btrfs_stripe_hash_table *table;
339 struct btrfs_stripe_hash *h;
340 int freeit = 0;
341
342 /*
343 * check the bit again under the hash table lock.
344 */
345 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
346 return;
347
348 table = rbio->fs_info->stripe_hash_table;
349 h = table->table + bucket;
350
351 /* hold the lock for the bucket because we may be
352 * removing it from the hash table
353 */
354 spin_lock(&h->lock);
355
356 /*
357 * hold the lock for the bio list because we need
358 * to make sure the bio list is empty
359 */
360 spin_lock(&rbio->bio_list_lock);
361
362 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
363 list_del_init(&rbio->stripe_cache);
364 table->cache_size -= 1;
365 freeit = 1;
366
367 /* if the bio list isn't empty, this rbio is
368 * still involved in an IO. We take it out
369 * of the cache list, and drop the ref that
370 * was held for the list.
371 *
372 * If the bio_list was empty, we also remove
373 * the rbio from the hash_table, and drop
374 * the corresponding ref
375 */
376 if (bio_list_empty(&rbio->bio_list)) {
377 if (!list_empty(&rbio->hash_list)) {
378 list_del_init(&rbio->hash_list);
379 atomic_dec(&rbio->refs);
380 BUG_ON(!list_empty(&rbio->plug_list));
381 }
382 }
383 }
384
385 spin_unlock(&rbio->bio_list_lock);
386 spin_unlock(&h->lock);
387
388 if (freeit)
389 __free_raid_bio(rbio);
390}
391
392/*
393 * prune a given rbio from the cache
394 */
395static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
396{
397 struct btrfs_stripe_hash_table *table;
398 unsigned long flags;
399
400 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
401 return;
402
403 table = rbio->fs_info->stripe_hash_table;
404
405 spin_lock_irqsave(&table->cache_lock, flags);
406 __remove_rbio_from_cache(rbio);
407 spin_unlock_irqrestore(&table->cache_lock, flags);
408}
409
410/*
411 * remove everything in the cache
412 */
413void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
414{
415 struct btrfs_stripe_hash_table *table;
416 unsigned long flags;
417 struct btrfs_raid_bio *rbio;
418
419 table = info->stripe_hash_table;
420
421 spin_lock_irqsave(&table->cache_lock, flags);
422 while (!list_empty(&table->stripe_cache)) {
423 rbio = list_entry(table->stripe_cache.next,
424 struct btrfs_raid_bio,
425 stripe_cache);
426 __remove_rbio_from_cache(rbio);
427 }
428 spin_unlock_irqrestore(&table->cache_lock, flags);
429}
430
431/*
432 * remove all cached entries and free the hash table
433 * used by unmount
434 */
435void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
436{
437 if (!info->stripe_hash_table)
438 return;
439 btrfs_clear_rbio_cache(info);
440 if (is_vmalloc_addr(info->stripe_hash_table))
441 vfree(info->stripe_hash_table);
442 else
443 kfree(info->stripe_hash_table);
444 info->stripe_hash_table = NULL;
445}
446
447/*
448 * insert an rbio into the stripe cache. It
449 * must have already been prepared by calling
450 * cache_rbio_pages
451 *
452 * If this rbio was already cached, it gets
453 * moved to the front of the lru.
454 *
455 * If the size of the rbio cache is too big, we
456 * prune an item.
457 */
458static void cache_rbio(struct btrfs_raid_bio *rbio)
459{
460 struct btrfs_stripe_hash_table *table;
461 unsigned long flags;
462
463 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
464 return;
465
466 table = rbio->fs_info->stripe_hash_table;
467
468 spin_lock_irqsave(&table->cache_lock, flags);
469 spin_lock(&rbio->bio_list_lock);
470
471 /* bump our ref if we were not in the list before */
472 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
473 atomic_inc(&rbio->refs);
474
475 if (!list_empty(&rbio->stripe_cache)){
476 list_move(&rbio->stripe_cache, &table->stripe_cache);
477 } else {
478 list_add(&rbio->stripe_cache, &table->stripe_cache);
479 table->cache_size += 1;
480 }
481
482 spin_unlock(&rbio->bio_list_lock);
483
484 if (table->cache_size > RBIO_CACHE_SIZE) {
485 struct btrfs_raid_bio *found;
486
487 found = list_entry(table->stripe_cache.prev,
488 struct btrfs_raid_bio,
489 stripe_cache);
490
491 if (found != rbio)
492 __remove_rbio_from_cache(found);
493 }
494
495 spin_unlock_irqrestore(&table->cache_lock, flags);
496 return;
497}
498
499/*
500 * helper function to run the xor_blocks api. It is only
501 * able to do MAX_XOR_BLOCKS at a time, so we need to
502 * loop through.
503 */
504static void run_xor(void **pages, int src_cnt, ssize_t len)
505{
506 int src_off = 0;
507 int xor_src_cnt = 0;
508 void *dest = pages[src_cnt];
509
510 while(src_cnt > 0) {
511 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
512 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
513
514 src_cnt -= xor_src_cnt;
515 src_off += xor_src_cnt;
516 }
517}
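
run_xor() is the raid5 half of the parity math: P is the byte-wise XOR of all the data blocks, fed to xor_blocks() in MAX_XOR_BLOCKS-sized batches. The same property is what makes single-failure recovery possible, since XOR-ing P with the surviving blocks reproduces the missing one. A standalone demonstration on toy buffers, using plain loops rather than the kernel helper:

#include <assert.h>
#include <string.h>

#define NR_DATA	3
#define BLKSZ	16

static void xor_into(unsigned char *dest, const unsigned char *src, int len)
{
	for (int i = 0; i < len; i++)
		dest[i] ^= src[i];
}

int main(void)
{
	unsigned char data[NR_DATA][BLKSZ] = {
		"stripe block #0", "stripe block #1", "stripe block #2",
	};
	unsigned char parity[BLKSZ] = { 0 };
	unsigned char rebuilt[BLKSZ];

	/* the raid5 case of finish_rmw: parity = XOR of all data blocks */
	for (int d = 0; d < NR_DATA; d++)
		xor_into(parity, data[d], BLKSZ);

	/* pretend data[1] was lost: XOR parity with the surviving blocks */
	memcpy(rebuilt, parity, BLKSZ);
	xor_into(rebuilt, data[0], BLKSZ);
	xor_into(rebuilt, data[2], BLKSZ);

	assert(memcmp(rebuilt, data[1], BLKSZ) == 0);
	return 0;
}
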
518
519/*
520 * returns true if the bio list inside this rbio
521 * covers an entire stripe (no rmw required).
522 * Must be called with the bio list lock held, or
523 * at a time when you know it is impossible to add
524 * new bios into the list
525 */
526static int __rbio_is_full(struct btrfs_raid_bio *rbio)
527{
528 unsigned long size = rbio->bio_list_bytes;
529 int ret = 1;
530
531 if (size != rbio->nr_data * rbio->stripe_len)
532 ret = 0;
533
534 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
535 return ret;
536}
537
538static int rbio_is_full(struct btrfs_raid_bio *rbio)
539{
540 unsigned long flags;
541 int ret;
542
543 spin_lock_irqsave(&rbio->bio_list_lock, flags);
544 ret = __rbio_is_full(rbio);
545 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
546 return ret;
547}
548
549/*
550 * returns 1 if it is safe to merge two rbios together.
551 * The merging is safe if the two rbios correspond to
552 * the same stripe and if they are both going in the same
553 * direction (read vs write), and if neither one is
554 * locked for final IO
555 *
556 * The caller is responsible for locking such that
557 * rmw_locked is safe to test
558 */
559static int rbio_can_merge(struct btrfs_raid_bio *last,
560 struct btrfs_raid_bio *cur)
561{
562 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
563 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
564 return 0;
565
566 /*
567 * we can't merge with cached rbios, since the
568 * idea is that when we merge the destination
569 * rbio is going to run our IO for us. We can
570 * steal from cached rbio's though, other functions
571 * handle that.
572 */
573 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
574 test_bit(RBIO_CACHE_BIT, &cur->flags))
575 return 0;
576
577 if (last->raid_map[0] !=
578 cur->raid_map[0])
579 return 0;
580
581 /* reads can't merge with writes */
582 if (last->read_rebuild !=
583 cur->read_rebuild) {
584 return 0;
585 }
586
587 return 1;
588}
589
590/*
591 * helper to index into the pstripe
592 */
593static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
594{
595 index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
596 return rbio->stripe_pages[index];
597}
598
599/*
600 * helper to index into the qstripe, returns null
601 * if there is no qstripe
602 */
603static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
604{
605 if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
606 return NULL;
607
608 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
609 PAGE_CACHE_SHIFT;
610 return rbio->stripe_pages[index];
611}
612
613/*
614 * The first stripe in the table for a logical address
615 * has the lock. rbios are added in one of three ways:
616 *
617 * 1) Nobody has the stripe locked yet. The rbio is given
618 * the lock and 0 is returned. The caller must start the IO
619 * themselves.
620 *
621 * 2) Someone has the stripe locked, but we're able to merge
622 * with the lock owner. The rbio is freed and the IO will
623 * start automatically along with the existing rbio. 1 is returned.
624 *
625 * 3) Someone has the stripe locked, but we're not able to merge.
626 * The rbio is added to the lock owner's plug list, or merged into
627 * an rbio already on the plug list. When the lock owner unlocks,
628 * the next rbio on the list is run and the IO is started automatically.
629 * 1 is returned
630 *
631 * If we return 0, the caller still owns the rbio and must continue with
632 * IO submission. If we return 1, the caller must assume the rbio has
633 * already been freed.
634 */
635static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
636{
637 int bucket = rbio_bucket(rbio);
638 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
639 struct btrfs_raid_bio *cur;
640 struct btrfs_raid_bio *pending;
641 unsigned long flags;
642 DEFINE_WAIT(wait);
643 struct btrfs_raid_bio *freeit = NULL;
644 struct btrfs_raid_bio *cache_drop = NULL;
645 int ret = 0;
646 int walk = 0;
647
648 spin_lock_irqsave(&h->lock, flags);
649 list_for_each_entry(cur, &h->hash_list, hash_list) {
650 walk++;
651 if (cur->raid_map[0] == rbio->raid_map[0]) {
652 spin_lock(&cur->bio_list_lock);
653
654 /* can we steal this cached rbio's pages? */
655 if (bio_list_empty(&cur->bio_list) &&
656 list_empty(&cur->plug_list) &&
657 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
658 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
659 list_del_init(&cur->hash_list);
660 atomic_dec(&cur->refs);
661
662 steal_rbio(cur, rbio);
663 cache_drop = cur;
664 spin_unlock(&cur->bio_list_lock);
665
666 goto lockit;
667 }
668
669 /* can we merge into the lock owner? */
670 if (rbio_can_merge(cur, rbio)) {
671 merge_rbio(cur, rbio);
672 spin_unlock(&cur->bio_list_lock);
673 freeit = rbio;
674 ret = 1;
675 goto out;
676 }
677
678
679 /*
680 * we couldn't merge with the running
681 * rbio, see if we can merge with the
682 * pending ones. We don't have to
683 * check for rmw_locked because there
684 * is no way they are inside finish_rmw
685 * right now
686 */
687 list_for_each_entry(pending, &cur->plug_list,
688 plug_list) {
689 if (rbio_can_merge(pending, rbio)) {
690 merge_rbio(pending, rbio);
691 spin_unlock(&cur->bio_list_lock);
692 freeit = rbio;
693 ret = 1;
694 goto out;
695 }
696 }
697
698 /* no merging, put us on the tail of the plug list,
 699 * our rbio will be started when the currently
700 * running rbio unlocks
701 */
702 list_add_tail(&rbio->plug_list, &cur->plug_list);
703 spin_unlock(&cur->bio_list_lock);
704 ret = 1;
705 goto out;
706 }
707 }
708lockit:
709 atomic_inc(&rbio->refs);
710 list_add(&rbio->hash_list, &h->hash_list);
711out:
712 spin_unlock_irqrestore(&h->lock, flags);
713 if (cache_drop)
714 remove_rbio_from_cache(cache_drop);
715 if (freeit)
716 __free_raid_bio(freeit);
717 return ret;
718}
719
720/*
721 * called as rmw or parity rebuild is completed. If the plug list has more
722 * rbios waiting for this stripe, the next one on the list will be started
723 */
724static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
725{
726 int bucket;
727 struct btrfs_stripe_hash *h;
728 unsigned long flags;
729 int keep_cache = 0;
730
731 bucket = rbio_bucket(rbio);
732 h = rbio->fs_info->stripe_hash_table->table + bucket;
733
734 if (list_empty(&rbio->plug_list))
735 cache_rbio(rbio);
736
737 spin_lock_irqsave(&h->lock, flags);
738 spin_lock(&rbio->bio_list_lock);
739
740 if (!list_empty(&rbio->hash_list)) {
741 /*
742 * if we're still cached and there is no other IO
743 * to perform, just leave this rbio here for others
744 * to steal from later
745 */
746 if (list_empty(&rbio->plug_list) &&
747 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
748 keep_cache = 1;
749 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
750 BUG_ON(!bio_list_empty(&rbio->bio_list));
751 goto done;
752 }
753
754 list_del_init(&rbio->hash_list);
755 atomic_dec(&rbio->refs);
756
757 /*
758 * we use the plug list to hold all the rbios
759 * waiting for the chance to lock this stripe.
760 * hand the lock over to one of them.
761 */
762 if (!list_empty(&rbio->plug_list)) {
763 struct btrfs_raid_bio *next;
764 struct list_head *head = rbio->plug_list.next;
765
766 next = list_entry(head, struct btrfs_raid_bio,
767 plug_list);
768
769 list_del_init(&rbio->plug_list);
770
771 list_add(&next->hash_list, &h->hash_list);
772 atomic_inc(&next->refs);
773 spin_unlock(&rbio->bio_list_lock);
774 spin_unlock_irqrestore(&h->lock, flags);
775
776 if (next->read_rebuild)
777 async_read_rebuild(next);
778 else {
779 steal_rbio(rbio, next);
780 async_rmw_stripe(next);
781 }
782
783 goto done_nolock;
784 } else if (waitqueue_active(&h->wait)) {
785 spin_unlock(&rbio->bio_list_lock);
786 spin_unlock_irqrestore(&h->lock, flags);
787 wake_up(&h->wait);
788 goto done_nolock;
789 }
790 }
791done:
792 spin_unlock(&rbio->bio_list_lock);
793 spin_unlock_irqrestore(&h->lock, flags);
794
795done_nolock:
796 if (!keep_cache)
797 remove_rbio_from_cache(rbio);
798}
799
800static void __free_raid_bio(struct btrfs_raid_bio *rbio)
801{
802 int i;
803
804 WARN_ON(atomic_read(&rbio->refs) < 0);
805 if (!atomic_dec_and_test(&rbio->refs))
806 return;
807
808 WARN_ON(!list_empty(&rbio->stripe_cache));
809 WARN_ON(!list_empty(&rbio->hash_list));
810 WARN_ON(!bio_list_empty(&rbio->bio_list));
811
812 for (i = 0; i < rbio->nr_pages; i++) {
813 if (rbio->stripe_pages[i]) {
814 __free_page(rbio->stripe_pages[i]);
815 rbio->stripe_pages[i] = NULL;
816 }
817 }
818 kfree(rbio->raid_map);
819 kfree(rbio->bbio);
820 kfree(rbio);
821}
822
823static void free_raid_bio(struct btrfs_raid_bio *rbio)
824{
825 unlock_stripe(rbio);
826 __free_raid_bio(rbio);
827}
828
829/*
830 * this frees the rbio and runs through all the bios in the
831 * bio_list and calls end_io on them
832 */
833static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
834{
835 struct bio *cur = bio_list_get(&rbio->bio_list);
836 struct bio *next;
837 free_raid_bio(rbio);
838
839 while (cur) {
840 next = cur->bi_next;
841 cur->bi_next = NULL;
842 if (uptodate)
843 set_bit(BIO_UPTODATE, &cur->bi_flags);
844 bio_endio(cur, err);
845 cur = next;
846 }
847}
848
849/*
850 * end io function used by finish_rmw. When we finally
851 * get here, we've written a full stripe
852 */
853static void raid_write_end_io(struct bio *bio, int err)
854{
855 struct btrfs_raid_bio *rbio = bio->bi_private;
856
857 if (err)
858 fail_bio_stripe(rbio, bio);
859
860 bio_put(bio);
861
862 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
863 return;
864
865 err = 0;
866
 867 /* OK, we have written all the stripes we need to. */
868 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
869 err = -EIO;
870
871 rbio_orig_end_io(rbio, err, 0);
872 return;
873}
874
875/*
876 * the read/modify/write code wants to use the original bio for
877 * any pages it included, and then use the rbio for everything
878 * else. This function decides if a given index (stripe number)
879 * and page number in that stripe fall inside the original bio
880 * or the rbio.
881 *
882 * if you set bio_list_only, you'll get a NULL back for any ranges
883 * that are outside the bio_list
884 *
885 * This doesn't take any refs on anything, you get a bare page pointer
886 * and the caller must bump refs as required.
887 *
888 * You must call index_rbio_pages once before you can trust
889 * the answers from this function.
890 */
891static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
892 int index, int pagenr, int bio_list_only)
893{
894 int chunk_page;
895 struct page *p = NULL;
896
897 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
898
899 spin_lock_irq(&rbio->bio_list_lock);
900 p = rbio->bio_pages[chunk_page];
901 spin_unlock_irq(&rbio->bio_list_lock);
902
903 if (p || bio_list_only)
904 return p;
905
906 return rbio->stripe_pages[chunk_page];
907}
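
Both page arrays are flat: each stripe owns stripe_len >> PAGE_CACHE_SHIFT consecutive slots, so a page lives at stripe_index * pages_per_stripe + pagenr, and the p/q helpers above just offset past all the data stripes. A quick sanity check with assumed example values of a 64K stripe_len and 4K pages:

#include <assert.h>

int main(void)
{
	unsigned long stripe_len = 64 * 1024;
	unsigned long page_shift = 12;				/* 4K pages */
	unsigned long pages_per_stripe = stripe_len >> page_shift;	/* 16 */

	/* page 3 of stripe 2 lands in slot 2*16 + 3 = 35 */
	assert(2 * pages_per_stripe + 3 == 35);

	/* with 3 data stripes, the parity stripe starts at slot 3*16 = 48,
	 * which is the offset rbio_pstripe_page() adds to its index */
	assert(3 * pages_per_stripe == 48);
	return 0;
}
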
908
909/*
910 * number of pages we need for the entire stripe across all the
911 * drives
912 */
913static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
914{
915 unsigned long nr = stripe_len * nr_stripes;
916 return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
917}
918
919/*
 920 * allocation and initial setup for the btrfs_raid_bio. Note
921 * this does not allocate any pages for rbio->pages.
922 */
923static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
924 struct btrfs_bio *bbio, u64 *raid_map,
925 u64 stripe_len)
926{
927 struct btrfs_raid_bio *rbio;
928 int nr_data = 0;
929 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
930 void *p;
931
932 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
933 GFP_NOFS);
934 if (!rbio) {
935 kfree(raid_map);
936 kfree(bbio);
937 return ERR_PTR(-ENOMEM);
938 }
939
940 bio_list_init(&rbio->bio_list);
941 INIT_LIST_HEAD(&rbio->plug_list);
942 spin_lock_init(&rbio->bio_list_lock);
943 INIT_LIST_HEAD(&rbio->stripe_cache);
944 INIT_LIST_HEAD(&rbio->hash_list);
945 rbio->bbio = bbio;
946 rbio->raid_map = raid_map;
947 rbio->fs_info = root->fs_info;
948 rbio->stripe_len = stripe_len;
949 rbio->nr_pages = num_pages;
950 rbio->faila = -1;
951 rbio->failb = -1;
952 atomic_set(&rbio->refs, 1);
953
954 /*
955 * the stripe_pages and bio_pages array point to the extra
956 * memory we allocated past the end of the rbio
957 */
958 p = rbio + 1;
959 rbio->stripe_pages = p;
960 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
961
962 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
963 nr_data = bbio->num_stripes - 2;
964 else
965 nr_data = bbio->num_stripes - 1;
966
967 rbio->nr_data = nr_data;
968 return rbio;
969}
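
alloc_rbio() makes a single kzalloc() cover the struct plus both page-pointer arrays and then aims stripe_pages and bio_pages at the memory just past the struct, so a single kfree(rbio) releases everything. A userspace sketch of that single-allocation layout, with toy names and malloc in place of kzalloc:

#include <stdlib.h>

struct toy_rbio {
	int nr_pages;
	void **stripe_pages;	/* points into the tail of this allocation */
	void **bio_pages;	/* ditto, right after stripe_pages */
};

static struct toy_rbio *toy_alloc_rbio(int num_pages)
{
	struct toy_rbio *rbio;
	char *p;

	rbio = calloc(1, sizeof(*rbio) + 2 * num_pages * sizeof(void *));
	if (!rbio)
		return NULL;

	rbio->nr_pages = num_pages;
	p = (char *)(rbio + 1);			/* first byte past the struct */
	rbio->stripe_pages = (void **)p;
	rbio->bio_pages = (void **)(p + num_pages * sizeof(void *));
	return rbio;
}

int main(void)
{
	struct toy_rbio *rbio = toy_alloc_rbio(16);

	if (!rbio)
		return 1;
	rbio->stripe_pages[0] = rbio;	/* both arrays are usable storage */
	free(rbio);			/* one free releases the lot      */
	return 0;
}
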
970
971/* allocate pages for all the stripes in the bio, including parity */
972static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
973{
974 int i;
975 struct page *page;
976
977 for (i = 0; i < rbio->nr_pages; i++) {
978 if (rbio->stripe_pages[i])
979 continue;
980 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
981 if (!page)
982 return -ENOMEM;
983 rbio->stripe_pages[i] = page;
984 ClearPageUptodate(page);
985 }
986 return 0;
987}
988
989/* allocate pages for just the p/q stripes */
990static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
991{
992 int i;
993 struct page *page;
994
995 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
996
997 for (; i < rbio->nr_pages; i++) {
998 if (rbio->stripe_pages[i])
999 continue;
1000 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1001 if (!page)
1002 return -ENOMEM;
1003 rbio->stripe_pages[i] = page;
1004 }
1005 return 0;
1006}
1007
1008/*
1009 * add a single page from a specific stripe into our list of bios for IO
1010 * this will try to merge into existing bios if possible, and returns
1011 * zero if all went well.
1012 */
1013int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1014 struct bio_list *bio_list,
1015 struct page *page,
1016 int stripe_nr,
1017 unsigned long page_index,
1018 unsigned long bio_max_len)
1019{
1020 struct bio *last = bio_list->tail;
1021 u64 last_end = 0;
1022 int ret;
1023 struct bio *bio;
1024 struct btrfs_bio_stripe *stripe;
1025 u64 disk_start;
1026
1027 stripe = &rbio->bbio->stripes[stripe_nr];
1028 disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1029
1030 /* if the device is missing, just fail this stripe */
1031 if (!stripe->dev->bdev)
1032 return fail_rbio_index(rbio, stripe_nr);
1033
1034 /* see if we can add this page onto our existing bio */
1035 if (last) {
1036 last_end = (u64)last->bi_sector << 9;
1037 last_end += last->bi_size;
1038
1039 /*
1040 * we can't merge these if they are from different
1041 * devices or if they are not contiguous
1042 */
1043 if (last_end == disk_start && stripe->dev->bdev &&
1044 test_bit(BIO_UPTODATE, &last->bi_flags) &&
1045 last->bi_bdev == stripe->dev->bdev) {
1046 ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1047 if (ret == PAGE_CACHE_SIZE)
1048 return 0;
1049 }
1050 }
1051
1052 /* put a new bio on the list */
1053 bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
1054 if (!bio)
1055 return -ENOMEM;
1056
1057 bio->bi_size = 0;
1058 bio->bi_bdev = stripe->dev->bdev;
1059 bio->bi_sector = disk_start >> 9;
1060 set_bit(BIO_UPTODATE, &bio->bi_flags);
1061
1062 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1063 bio_list_add(bio_list, bio);
1064 return 0;
1065}
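
rbio_add_io_page() only allocates a fresh bio when the page cannot be tacked onto the tail of the list, i.e. when the target device differs or the disk addresses are not contiguous; otherwise bio_add_page() just extends the existing bio. The same coalescing rule on plain (device, start, length) triples, purely illustrative:

#include <stdio.h>

struct toy_io {
	int dev;
	unsigned long long start;
	unsigned long long len;
};

static int add_page(struct toy_io *ios, int nr, int dev,
		    unsigned long long disk_start, unsigned long long page_size)
{
	struct toy_io *last = nr ? &ios[nr - 1] : NULL;

	if (last && last->dev == dev && last->start + last->len == disk_start) {
		last->len += page_size;		/* merged, like bio_add_page() */
		return nr;
	}
	ios[nr] = (struct toy_io){ .dev = dev, .start = disk_start, .len = page_size };
	return nr + 1;				/* new I/O, like bio_alloc()   */
}

int main(void)
{
	struct toy_io ios[4];
	int nr = 0;

	nr = add_page(ios, nr, 0, 0x10000, 4096);	/* new I/O            */
	nr = add_page(ios, nr, 0, 0x11000, 4096);	/* contiguous: merged */
	nr = add_page(ios, nr, 1, 0x11000, 4096);	/* other device: new  */

	printf("%d I/Os\n", nr);			/* prints "2 I/Os"    */
	return 0;
}
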
1066
1067/*
1068 * while we're doing the read/modify/write cycle, we could
1069 * have errors in reading pages off the disk. This checks
1070 * for errors and if we're not able to read the page it'll
1071 * trigger parity reconstruction. The rmw will be finished
1072 * after we've reconstructed the failed stripes
1073 */
1074static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1075{
1076 if (rbio->faila >= 0 || rbio->failb >= 0) {
1077 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
1078 __raid56_parity_recover(rbio);
1079 } else {
1080 finish_rmw(rbio);
1081 }
1082}
1083
1084/*
1085 * these are just the pages from the rbio array, not from anything
1086 * the FS sent down to us
1087 */
1088static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1089{
1090 int index;
1091 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1092 index += page;
1093 return rbio->stripe_pages[index];
1094}
1095
1096/*
1097 * helper function to walk our bio list and populate the bio_pages array with
1098 * the result. This seems expensive, but it is faster than constantly
1099 * searching through the bio list as we setup the IO in finish_rmw or stripe
1100 * reconstruction.
1101 *
1102 * This must be called before you trust the answers from page_in_rbio
1103 */
1104static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1105{
1106 struct bio *bio;
1107 u64 start;
1108 unsigned long stripe_offset;
1109 unsigned long page_index;
1110 struct page *p;
1111 int i;
1112
1113 spin_lock_irq(&rbio->bio_list_lock);
1114 bio_list_for_each(bio, &rbio->bio_list) {
1115 start = (u64)bio->bi_sector << 9;
1116 stripe_offset = start - rbio->raid_map[0];
1117 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1118
1119 for (i = 0; i < bio->bi_vcnt; i++) {
1120 p = bio->bi_io_vec[i].bv_page;
1121 rbio->bio_pages[page_index + i] = p;
1122 }
1123 }
1124 spin_unlock_irq(&rbio->bio_list_lock);
1125}
1126
1127/*
1128 * this is called from one of two situations. We either
1129 * have a full stripe from the higher layers, or we've read all
1130 * the missing bits off disk.
1131 *
1132 * This will calculate the parity and then send down any
1133 * changed blocks.
1134 */
1135static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1136{
1137 struct btrfs_bio *bbio = rbio->bbio;
1138 void *pointers[bbio->num_stripes];
1139 int stripe_len = rbio->stripe_len;
1140 int nr_data = rbio->nr_data;
1141 int stripe;
1142 int pagenr;
1143 int p_stripe = -1;
1144 int q_stripe = -1;
1145 struct bio_list bio_list;
1146 struct bio *bio;
1147 int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1148 int ret;
1149
1150 bio_list_init(&bio_list);
1151
1152 if (bbio->num_stripes - rbio->nr_data == 1) {
1153 p_stripe = bbio->num_stripes - 1;
1154 } else if (bbio->num_stripes - rbio->nr_data == 2) {
1155 p_stripe = bbio->num_stripes - 2;
1156 q_stripe = bbio->num_stripes - 1;
1157 } else {
1158 BUG();
1159 }
1160
1161 /* at this point we either have a full stripe,
1162 * or we've read the full stripe from the drive.
1163 * recalculate the parity and write the new results.
1164 *
1165 * We're not allowed to add any new bios to the
1166 * bio list here, anyone else that wants to
1167 * change this stripe needs to do their own rmw.
1168 */
1169 spin_lock_irq(&rbio->bio_list_lock);
1170 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1171 spin_unlock_irq(&rbio->bio_list_lock);
1172
1173 atomic_set(&rbio->bbio->error, 0);
1174
1175 /*
1176 * now that we've set rmw_locked, run through the
1177 * bio list one last time and map the page pointers
1178 *
1179 * We don't cache full rbios because we're assuming
1180 * the higher layers are unlikely to use this area of
1181 * the disk again soon. If they do use it again,
1182 * hopefully they will send another full bio.
1183 */
1184 index_rbio_pages(rbio);
1185 if (!rbio_is_full(rbio))
1186 cache_rbio_pages(rbio);
1187 else
1188 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1189
1190 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1191 struct page *p;
1192 /* first collect one page from each data stripe */
1193 for (stripe = 0; stripe < nr_data; stripe++) {
1194 p = page_in_rbio(rbio, stripe, pagenr, 0);
1195 pointers[stripe] = kmap(p);
1196 }
1197
1198 /* then add the parity stripe */
1199 p = rbio_pstripe_page(rbio, pagenr);
1200 SetPageUptodate(p);
1201 pointers[stripe++] = kmap(p);
1202
1203 if (q_stripe != -1) {
1204
1205 /*
1206 * raid6, add the qstripe and call the
1207 * library function to fill in our p/q
1208 */
1209 p = rbio_qstripe_page(rbio, pagenr);
1210 SetPageUptodate(p);
1211 pointers[stripe++] = kmap(p);
1212
1213 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
1214 pointers);
1215 } else {
1216 /* raid5 */
1217 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1218 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1219 }
1220
1221
1222 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
1223 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1224 }
1225
1226 /*
1227 * time to start writing. Make bios for everything from the
1228 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1229 * everything else.
1230 */
1231 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1232 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1233 struct page *page;
1234 if (stripe < rbio->nr_data) {
1235 page = page_in_rbio(rbio, stripe, pagenr, 1);
1236 if (!page)
1237 continue;
1238 } else {
1239 page = rbio_stripe_page(rbio, stripe, pagenr);
1240 }
1241
1242 ret = rbio_add_io_page(rbio, &bio_list,
1243 page, stripe, pagenr, rbio->stripe_len);
1244 if (ret)
1245 goto cleanup;
1246 }
1247 }
1248
1249 atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
1250 BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
1251
1252 while (1) {
1253 bio = bio_list_pop(&bio_list);
1254 if (!bio)
1255 break;
1256
1257 bio->bi_private = rbio;
1258 bio->bi_end_io = raid_write_end_io;
1259 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1260 submit_bio(WRITE, bio);
1261 }
1262 return;
1263
1264cleanup:
1265 rbio_orig_end_io(rbio, -EIO, 0);
1266}
1267
1268/*
1269 * helper to find the stripe number for a given bio. Used to figure out which
1270 * stripe has failed. This expects the bio to correspond to a physical disk,
1271 * so it looks up based on physical sector numbers.
1272 */
1273static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1274 struct bio *bio)
1275{
1276 u64 physical = bio->bi_sector;
1277 u64 stripe_start;
1278 int i;
1279 struct btrfs_bio_stripe *stripe;
1280
1281 physical <<= 9;
1282
1283 for (i = 0; i < rbio->bbio->num_stripes; i++) {
1284 stripe = &rbio->bbio->stripes[i];
1285 stripe_start = stripe->physical;
1286 if (physical >= stripe_start &&
1287 physical < stripe_start + rbio->stripe_len) {
1288 return i;
1289 }
1290 }
1291 return -1;
1292}
1293
1294/*
1295 * helper to find the stripe number for a given
1296 * bio (before mapping). Used to figure out which stripe has
1297 * failed. This looks up based on logical block numbers.
1298 */
1299static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1300 struct bio *bio)
1301{
1302 u64 logical = bio->bi_sector;
1303 u64 stripe_start;
1304 int i;
1305
1306 logical <<= 9;
1307
1308 for (i = 0; i < rbio->nr_data; i++) {
1309 stripe_start = rbio->raid_map[i];
1310 if (logical >= stripe_start &&
1311 logical < stripe_start + rbio->stripe_len) {
1312 return i;
1313 }
1314 }
1315 return -1;
1316}
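
Both lookup helpers convert the bio's starting sector into a byte offset (the shift by 9 multiplies by the 512-byte sector size) and then scan for the stripe whose [start, start + stripe_len) range contains that offset; find_bio_stripe() compares against physical addresses, find_logical_bio_stripe() against the logical addresses in raid_map. The same range test in standalone form, with made-up sample values purely for illustration:

#include <stdio.h>

/* Return the index of the stripe covering 'addr', or -1 if none does. */
static int find_stripe(const unsigned long long *starts, int nr,
		       unsigned long long stripe_len, unsigned long long addr)
{
	int i;

	for (i = 0; i < nr; i++)
		if (addr >= starts[i] && addr < starts[i] + stripe_len)
			return i;
	return -1;
}

int main(void)
{
	unsigned long long starts[] = { 0, 65536, 131072 };   /* hypothetical stripe starts */
	unsigned long long sector = 130;                      /* as found in bio->bi_sector */
	unsigned long long addr = sector << 9;                /* sectors -> bytes */

	printf("stripe %d\n", find_stripe(starts, 3, 65536, addr));   /* prints "stripe 1" */
	return 0;
}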
1317
1318/*
1319 * returns -EIO if we had too many failures
1320 */
1321static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1322{
1323 unsigned long flags;
1324 int ret = 0;
1325
1326 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1327
1328 /* we already know this stripe is bad, move on */
1329 if (rbio->faila == failed || rbio->failb == failed)
1330 goto out;
1331
1332 if (rbio->faila == -1) {
1333 /* first failure on this rbio */
1334 rbio->faila = failed;
1335 atomic_inc(&rbio->bbio->error);
1336 } else if (rbio->failb == -1) {
1337 /* second failure on this rbio */
1338 rbio->failb = failed;
1339 atomic_inc(&rbio->bbio->error);
1340 } else {
1341 ret = -EIO;
1342 }
1343out:
1344 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1345
1346 return ret;
1347}
1348
1349/*
1350 * helper to fail a stripe based on a physical disk
1351 * bio.
1352 */
1353static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1354 struct bio *bio)
1355{
1356 int failed = find_bio_stripe(rbio, bio);
1357
1358 if (failed < 0)
1359 return -EIO;
1360
1361 return fail_rbio_index(rbio, failed);
1362}
1363
1364/*
1365 * this sets each page in the bio uptodate. It should only be used on private
1366 * rbio pages, nothing that comes in from the higher layers
1367 */
1368static void set_bio_pages_uptodate(struct bio *bio)
1369{
1370 int i;
1371 struct page *p;
1372
1373 for (i = 0; i < bio->bi_vcnt; i++) {
1374 p = bio->bi_io_vec[i].bv_page;
1375 SetPageUptodate(p);
1376 }
1377}
1378
1379/*
1380 * end io for the read phase of the rmw cycle. All the bios here are physical
1381 * stripe bios we've read from the disk so we can recalculate the parity of the
1382 * stripe.
1383 *
1384 * This will usually kick off finish_rmw once all the bios are read in, but it
1385 * may trigger parity reconstruction if we had any errors along the way
1386 */
1387static void raid_rmw_end_io(struct bio *bio, int err)
1388{
1389 struct btrfs_raid_bio *rbio = bio->bi_private;
1390
1391 if (err)
1392 fail_bio_stripe(rbio, bio);
1393 else
1394 set_bio_pages_uptodate(bio);
1395
1396 bio_put(bio);
1397
1398 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1399 return;
1400
1401 err = 0;
1402 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1403 goto cleanup;
1404
1405 /*
1406 * this will normally call finish_rmw to start our write
1407 * but if there are any failed stripes we'll reconstruct
1408 * from parity first
1409 */
1410 validate_rbio_for_rmw(rbio);
1411 return;
1412
1413cleanup:
1414
1415 rbio_orig_end_io(rbio, -EIO, 0);
1416}
1417
1418static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1419{
1420 rbio->work.flags = 0;
1421 rbio->work.func = rmw_work;
1422
1423 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1424 &rbio->work);
1425}
1426
1427static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1428{
1429 rbio->work.flags = 0;
1430 rbio->work.func = read_rebuild_work;
1431
1432 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1433 &rbio->work);
1434}
1435
1436/*
1437 * the stripe must be locked by the caller. It will
1438 * unlock after all the writes are done
1439 */
1440static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1441{
1442 int bios_to_read = 0;
1443 struct btrfs_bio *bbio = rbio->bbio;
1444 struct bio_list bio_list;
1445 int ret;
1446 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1447 int pagenr;
1448 int stripe;
1449 struct bio *bio;
1450
1451 bio_list_init(&bio_list);
1452
1453 ret = alloc_rbio_pages(rbio);
1454 if (ret)
1455 goto cleanup;
1456
1457 index_rbio_pages(rbio);
1458
1459 atomic_set(&rbio->bbio->error, 0);
1460 /*
1461 * build a list of bios to read all the missing parts of this
1462 * stripe
1463 */
1464 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1465 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1466 struct page *page;
1467 /*
1468 * we want to find all the pages missing from
1469 * the rbio and read them from the disk. If
1470 * page_in_rbio finds a page in the bio list
1471 * we don't need to read it off the stripe.
1472 */
1473 page = page_in_rbio(rbio, stripe, pagenr, 1);
1474 if (page)
1475 continue;
1476
1477 page = rbio_stripe_page(rbio, stripe, pagenr);
1478 /*
1479 * the bio cache may have handed us an uptodate
1480 * page. If so, be happy and use it
1481 */
1482 if (PageUptodate(page))
1483 continue;
1484
1485 ret = rbio_add_io_page(rbio, &bio_list, page,
1486 stripe, pagenr, rbio->stripe_len);
1487 if (ret)
1488 goto cleanup;
1489 }
1490 }
1491
1492 bios_to_read = bio_list_size(&bio_list);
1493 if (!bios_to_read) {
1494 /*
1495 * this can happen if others have merged with
1496	 * us; it means there is nothing left to read.
1497 * But if there are missing devices it may not be
1498 * safe to do the full stripe write yet.
1499 */
1500 goto finish;
1501 }
1502
1503 /*
1504 * the bbio may be freed once we submit the last bio. Make sure
1505 * not to touch it after that
1506 */
1507 atomic_set(&bbio->stripes_pending, bios_to_read);
1508 while (1) {
1509 bio = bio_list_pop(&bio_list);
1510 if (!bio)
1511 break;
1512
1513 bio->bi_private = rbio;
1514 bio->bi_end_io = raid_rmw_end_io;
1515
1516 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1517 BTRFS_WQ_ENDIO_RAID56);
1518
1519 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1520 submit_bio(READ, bio);
1521 }
1522 /* the actual write will happen once the reads are done */
1523 return 0;
1524
1525cleanup:
1526 rbio_orig_end_io(rbio, -EIO, 0);
1527 return -EIO;
1528
1529finish:
1530 validate_rbio_for_rmw(rbio);
1531 return 0;
1532}
1533
1534/*
1535 * if the upper layers pass in a full stripe, we thank them by only allocating
1536 * enough pages to hold the parity, and sending it all down quickly.
1537 */
1538static int full_stripe_write(struct btrfs_raid_bio *rbio)
1539{
1540 int ret;
1541
1542 ret = alloc_rbio_parity_pages(rbio);
1543 if (ret)
1544 return ret;
1545
1546 ret = lock_stripe_add(rbio);
1547 if (ret == 0)
1548 finish_rmw(rbio);
1549 return 0;
1550}
1551
1552/*
1553 * partial stripe writes get handed over to async helpers.
1554 * We're really hoping to merge a few more writes into this
1555 * rbio before calculating new parity
1556 */
1557static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1558{
1559 int ret;
1560
1561 ret = lock_stripe_add(rbio);
1562 if (ret == 0)
1563 async_rmw_stripe(rbio);
1564 return 0;
1565}
1566
1567/*
1568 * sometimes while we were reading from the drive to
1569	 * recalculate parity, enough new bios come in to create
1570 * a full stripe. So we do a check here to see if we can
1571 * go directly to finish_rmw
1572 */
1573static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1574{
1575 /* head off into rmw land if we don't have a full stripe */
1576 if (!rbio_is_full(rbio))
1577 return partial_stripe_write(rbio);
1578 return full_stripe_write(rbio);
1579}
1580
1581/*
1582 * We use plugging call backs to collect full stripes.
1583 * Any time we get a partial stripe write while plugged
1584 * we collect it into a list. When the unplug comes down,
1585 * we sort the list by logical block number and merge
1586 * everything we can into the same rbios
1587 */
1588struct btrfs_plug_cb {
1589 struct blk_plug_cb cb;
1590 struct btrfs_fs_info *info;
1591 struct list_head rbio_list;
1592 struct btrfs_work work;
1593};
1594
1595/*
1596 * rbios on the plug list are sorted for easier merging.
1597 */
1598static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1599{
1600 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1601 plug_list);
1602 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1603 plug_list);
1604 u64 a_sector = ra->bio_list.head->bi_sector;
1605 u64 b_sector = rb->bio_list.head->bi_sector;
1606
1607 if (a_sector < b_sector)
1608 return -1;
1609 if (a_sector > b_sector)
1610 return 1;
1611 return 0;
1612}
1613
1614static void run_plug(struct btrfs_plug_cb *plug)
1615{
1616 struct btrfs_raid_bio *cur;
1617 struct btrfs_raid_bio *last = NULL;
1618
1619 /*
1620 * sort our plug list then try to merge
1621 * everything we can in hopes of creating full
1622 * stripes.
1623 */
1624 list_sort(NULL, &plug->rbio_list, plug_cmp);
1625 while (!list_empty(&plug->rbio_list)) {
1626 cur = list_entry(plug->rbio_list.next,
1627 struct btrfs_raid_bio, plug_list);
1628 list_del_init(&cur->plug_list);
1629
1630 if (rbio_is_full(cur)) {
1631 /* we have a full stripe, send it down */
1632 full_stripe_write(cur);
1633 continue;
1634 }
1635 if (last) {
1636 if (rbio_can_merge(last, cur)) {
1637 merge_rbio(last, cur);
1638 __free_raid_bio(cur);
1639 continue;
1640
1641 }
1642 __raid56_parity_write(last);
1643 }
1644 last = cur;
1645 }
1646 if (last) {
1647 __raid56_parity_write(last);
1648 }
1649 kfree(plug);
1650}
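
run_plug() depends on plug_cmp() having ordered the pending rbios by the starting sector of their first bio, so that rbios touching the same full stripe end up adjacent and can be merged before parity is recalculated. The same ordering rule sketched in userspace with qsort() over bare sector numbers (illustrative only; the kernel uses list_sort() on the plug list):

#include <stdio.h>
#include <stdlib.h>

/* Ascending by starting sector, mirroring plug_cmp(). */
static int sector_cmp(const void *a, const void *b)
{
	unsigned long long sa = *(const unsigned long long *)a;
	unsigned long long sb = *(const unsigned long long *)b;

	if (sa < sb)
		return -1;
	if (sa > sb)
		return 1;
	return 0;
}

int main(void)
{
	unsigned long long sectors[] = { 4096, 128, 2048, 256 };   /* hypothetical */
	int i;

	qsort(sectors, 4, sizeof(sectors[0]), sector_cmp);
	for (i = 0; i < 4; i++)
		printf("%llu\n", sectors[i]);   /* neighbors are now candidates for merging */
	return 0;
}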
1651
1652/*
1653 * if the unplug comes from schedule, we have to push the
1654 * work off to a helper thread
1655 */
1656static void unplug_work(struct btrfs_work *work)
1657{
1658 struct btrfs_plug_cb *plug;
1659 plug = container_of(work, struct btrfs_plug_cb, work);
1660 run_plug(plug);
1661}
1662
1663static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1664{
1665 struct btrfs_plug_cb *plug;
1666 plug = container_of(cb, struct btrfs_plug_cb, cb);
1667
1668 if (from_schedule) {
1669 plug->work.flags = 0;
1670 plug->work.func = unplug_work;
1671 btrfs_queue_worker(&plug->info->rmw_workers,
1672 &plug->work);
1673 return;
1674 }
1675 run_plug(plug);
1676}
1677
1678/*
1679 * our main entry point for writes from the rest of the FS.
1680 */
1681int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1682 struct btrfs_bio *bbio, u64 *raid_map,
1683 u64 stripe_len)
1684{
1685 struct btrfs_raid_bio *rbio;
1686 struct btrfs_plug_cb *plug = NULL;
1687 struct blk_plug_cb *cb;
1688
1689 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1690 if (IS_ERR(rbio)) {
1691 kfree(raid_map);
1692 kfree(bbio);
1693 return PTR_ERR(rbio);
1694 }
1695 bio_list_add(&rbio->bio_list, bio);
1696 rbio->bio_list_bytes = bio->bi_size;
1697
1698 /*
1699 * don't plug on full rbios, just get them out the door
1700 * as quickly as we can
1701 */
1702 if (rbio_is_full(rbio))
1703 return full_stripe_write(rbio);
1704
1705 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1706 sizeof(*plug));
1707 if (cb) {
1708 plug = container_of(cb, struct btrfs_plug_cb, cb);
1709 if (!plug->info) {
1710 plug->info = root->fs_info;
1711 INIT_LIST_HEAD(&plug->rbio_list);
1712 }
1713 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1714 } else {
1715 return __raid56_parity_write(rbio);
1716 }
1717 return 0;
1718}
1719
1720/*
1721 * all parity reconstruction happens here. We've read in everything
1722 * we can find from the drives and this does the heavy lifting of
1723 * sorting the good from the bad.
1724 */
1725static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1726{
1727 int pagenr, stripe;
1728 void **pointers;
1729 int faila = -1, failb = -1;
1730 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1731 struct page *page;
1732 int err;
1733 int i;
1734
1735 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1736 GFP_NOFS);
1737 if (!pointers) {
1738 err = -ENOMEM;
1739 goto cleanup_io;
1740 }
1741
1742 faila = rbio->faila;
1743 failb = rbio->failb;
1744
1745 if (rbio->read_rebuild) {
1746 spin_lock_irq(&rbio->bio_list_lock);
1747 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1748 spin_unlock_irq(&rbio->bio_list_lock);
1749 }
1750
1751 index_rbio_pages(rbio);
1752
1753 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1754 /* setup our array of pointers with pages
1755 * from each stripe
1756 */
1757 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1758 /*
1759 * if we're rebuilding a read, we have to use
1760 * pages from the bio list
1761 */
1762 if (rbio->read_rebuild &&
1763 (stripe == faila || stripe == failb)) {
1764 page = page_in_rbio(rbio, stripe, pagenr, 0);
1765 } else {
1766 page = rbio_stripe_page(rbio, stripe, pagenr);
1767 }
1768 pointers[stripe] = kmap(page);
1769 }
1770
1771 /* all raid6 handling here */
1772 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1773 RAID6_Q_STRIPE) {
1774
1775 /*
1776 * single failure, rebuild from parity raid5
1777 * style
1778 */
1779 if (failb < 0) {
1780 if (faila == rbio->nr_data) {
1781 /*
1782 * Just the P stripe has failed, without
1783 * a bad data or Q stripe.
1784 * TODO, we should redo the xor here.
1785 */
1786 err = -EIO;
1787 goto cleanup;
1788 }
1789 /*
1790 * a single failure in raid6 is rebuilt
1791 * in the pstripe code below
1792 */
1793 goto pstripe;
1794 }
1795
1796 /* make sure our ps and qs are in order */
1797 if (faila > failb) {
1798 int tmp = failb;
1799 failb = faila;
1800 faila = tmp;
1801 }
1802
1803	/* if the q stripe has failed, do a pstripe reconstruction
1804	 * from the xors.
1805	 * If both the q stripe and the P stripe have failed, we're
1806 * here due to a crc mismatch and we can't give them the
1807 * data they want
1808 */
1809 if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1810 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1811 err = -EIO;
1812 goto cleanup;
1813 }
1814 /*
1815 * otherwise we have one bad data stripe and
1816 * a good P stripe. raid5!
1817 */
1818 goto pstripe;
1819 }
1820
1821 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1822 raid6_datap_recov(rbio->bbio->num_stripes,
1823 PAGE_SIZE, faila, pointers);
1824 } else {
1825 raid6_2data_recov(rbio->bbio->num_stripes,
1826 PAGE_SIZE, faila, failb,
1827 pointers);
1828 }
1829 } else {
1830 void *p;
1831
1832 /* rebuild from P stripe here (raid5 or raid6) */
1833 BUG_ON(failb != -1);
1834pstripe:
1835 /* Copy parity block into failed block to start with */
1836 memcpy(pointers[faila],
1837 pointers[rbio->nr_data],
1838 PAGE_CACHE_SIZE);
1839
1840 /* rearrange the pointer array */
1841 p = pointers[faila];
1842 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1843 pointers[stripe] = pointers[stripe + 1];
1844 pointers[rbio->nr_data - 1] = p;
1845
1846 /* xor in the rest */
1847 run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1848 }
1849 /* if we're doing this rebuild as part of an rmw, go through
1850 * and set all of our private rbio pages in the
1851 * failed stripes as uptodate. This way finish_rmw will
1852 * know they can be trusted. If this was a read reconstruction,
1853 * other endio functions will fiddle the uptodate bits
1854 */
1855 if (!rbio->read_rebuild) {
1856 for (i = 0; i < nr_pages; i++) {
1857 if (faila != -1) {
1858 page = rbio_stripe_page(rbio, faila, i);
1859 SetPageUptodate(page);
1860 }
1861 if (failb != -1) {
1862 page = rbio_stripe_page(rbio, failb, i);
1863 SetPageUptodate(page);
1864 }
1865 }
1866 }
1867 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1868 /*
1869 * if we're rebuilding a read, we have to use
1870 * pages from the bio list
1871 */
1872 if (rbio->read_rebuild &&
1873 (stripe == faila || stripe == failb)) {
1874 page = page_in_rbio(rbio, stripe, pagenr, 0);
1875 } else {
1876 page = rbio_stripe_page(rbio, stripe, pagenr);
1877 }
1878 kunmap(page);
1879 }
1880 }
1881
1882 err = 0;
1883cleanup:
1884 kfree(pointers);
1885
1886cleanup_io:
1887
1888 if (rbio->read_rebuild) {
1889 if (err == 0)
1890 cache_rbio_pages(rbio);
1891 else
1892 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1893
1894 rbio_orig_end_io(rbio, err, err == 0);
1895 } else if (err == 0) {
1896 rbio->faila = -1;
1897 rbio->failb = -1;
1898 finish_rmw(rbio);
1899 } else {
1900 rbio_orig_end_io(rbio, err, 0);
1901 }
1902}
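
The pstripe path above recovers a single failed data stripe by copying the parity block into the failed slot and XORing the surviving data stripes back in, which works because P = D0 ^ ... ^ D(n-1) implies Dfail = P ^ (all other data stripes). A userspace sketch of that identity, assuming plain buffers; the raid6 two-failure cases go through raid6_datap_recov() and raid6_2data_recov() instead:

#include <stddef.h>
#include <string.h>

/* Rebuild data[faila] from parity and the surviving data stripes. */
static void raid5_rebuild(void **data, int nr_data, const void *parity,
			  int faila, size_t len)
{
	unsigned char *out = data[faila];
	size_t i;
	int d;

	memcpy(out, parity, len);               /* start from the parity block */
	for (d = 0; d < nr_data; d++) {
		const unsigned char *src = data[d];

		if (d == faila)
			continue;
		for (i = 0; i < len; i++)
			out[i] ^= src[i];       /* xor the surviving stripes back in */
	}
}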
1903
1904/*
1905 * This is called only for stripes we've read from disk to
1906 * reconstruct the parity.
1907 */
1908static void raid_recover_end_io(struct bio *bio, int err)
1909{
1910 struct btrfs_raid_bio *rbio = bio->bi_private;
1911
1912 /*
1913 * we only read stripe pages off the disk, set them
1914 * up to date if there were no errors
1915 */
1916 if (err)
1917 fail_bio_stripe(rbio, bio);
1918 else
1919 set_bio_pages_uptodate(bio);
1920 bio_put(bio);
1921
1922 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1923 return;
1924
1925 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1926 rbio_orig_end_io(rbio, -EIO, 0);
1927 else
1928 __raid_recover_end_io(rbio);
1929}
1930
1931/*
1932 * reads everything we need off the disk to reconstruct
1933 * the parity. endio handlers trigger final reconstruction
1934 * when the IO is done.
1935 *
1936 * This is used both for reads from the higher layers and for
1937 * parity construction required to finish a rmw cycle.
1938 */
1939static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1940{
1941 int bios_to_read = 0;
1942 struct btrfs_bio *bbio = rbio->bbio;
1943 struct bio_list bio_list;
1944 int ret;
1945 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1946 int pagenr;
1947 int stripe;
1948 struct bio *bio;
1949
1950 bio_list_init(&bio_list);
1951
1952 ret = alloc_rbio_pages(rbio);
1953 if (ret)
1954 goto cleanup;
1955
1956 atomic_set(&rbio->bbio->error, 0);
1957
1958 /*
1959 * read everything that hasn't failed. Thanks to the
1960 * stripe cache, it is possible that some or all of these
1961 * pages are going to be uptodate.
1962 */
1963 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1964 if (rbio->faila == stripe ||
1965 rbio->failb == stripe)
1966 continue;
1967
1968 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1969 struct page *p;
1970
1971 /*
1972 * the rmw code may have already read this
1973 * page in
1974 */
1975 p = rbio_stripe_page(rbio, stripe, pagenr);
1976 if (PageUptodate(p))
1977 continue;
1978
1979 ret = rbio_add_io_page(rbio, &bio_list,
1980 rbio_stripe_page(rbio, stripe, pagenr),
1981 stripe, pagenr, rbio->stripe_len);
1982 if (ret < 0)
1983 goto cleanup;
1984 }
1985 }
1986
1987 bios_to_read = bio_list_size(&bio_list);
1988 if (!bios_to_read) {
1989 /*
1990 * we might have no bios to read just because the pages
1991 * were up to date, or we might have no bios to read because
1992 * the devices were gone.
1993 */
1994 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
1995 __raid_recover_end_io(rbio);
1996 goto out;
1997 } else {
1998 goto cleanup;
1999 }
2000 }
2001
2002 /*
2003 * the bbio may be freed once we submit the last bio. Make sure
2004 * not to touch it after that
2005 */
2006 atomic_set(&bbio->stripes_pending, bios_to_read);
2007 while (1) {
2008 bio = bio_list_pop(&bio_list);
2009 if (!bio)
2010 break;
2011
2012 bio->bi_private = rbio;
2013 bio->bi_end_io = raid_recover_end_io;
2014
2015 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2016 BTRFS_WQ_ENDIO_RAID56);
2017
2018 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2019 submit_bio(READ, bio);
2020 }
2021out:
2022 return 0;
2023
2024cleanup:
2025 if (rbio->read_rebuild)
2026 rbio_orig_end_io(rbio, -EIO, 0);
2027 return -EIO;
2028}
2029
2030/*
2031 * the main entry point for reads from the higher layers. This
2032 * is really only called when the normal read path had a failure,
2033 * so we assume the bio they send down corresponds to a failed part
2034 * of the drive.
2035 */
2036int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2037 struct btrfs_bio *bbio, u64 *raid_map,
2038 u64 stripe_len, int mirror_num)
2039{
2040 struct btrfs_raid_bio *rbio;
2041 int ret;
2042
2043 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2044 if (IS_ERR(rbio)) {
2045 return PTR_ERR(rbio);
2046 }
2047
2048 rbio->read_rebuild = 1;
2049 bio_list_add(&rbio->bio_list, bio);
2050 rbio->bio_list_bytes = bio->bi_size;
2051
2052 rbio->faila = find_logical_bio_stripe(rbio, bio);
2053 if (rbio->faila == -1) {
2054 BUG();
2055 kfree(rbio);
2056 return -EIO;
2057 }
2058
2059 /*
2060 * reconstruct from the q stripe if they are
2061 * asking for mirror 3
2062 */
2063 if (mirror_num == 3)
2064 rbio->failb = bbio->num_stripes - 2;
2065
2066 ret = lock_stripe_add(rbio);
2067
2068 /*
2069 * __raid56_parity_recover will end the bio with
2070 * any errors it hits. We don't want to return
2071 * its error value up the stack because our caller
2072 * will end up calling bio_endio with any nonzero
2073 * return
2074 */
2075 if (ret == 0)
2076 __raid56_parity_recover(rbio);
2077 /*
2078 * our rbio has been added to the list of
2079 * rbios that will be handled after the
2080	 * current lock owner is done
2081 */
2082 return 0;
2083
2084}
2085
2086static void rmw_work(struct btrfs_work *work)
2087{
2088 struct btrfs_raid_bio *rbio;
2089
2090 rbio = container_of(work, struct btrfs_raid_bio, work);
2091 raid56_rmw_stripe(rbio);
2092}
2093
2094static void read_rebuild_work(struct btrfs_work *work)
2095{
2096 struct btrfs_raid_bio *rbio;
2097
2098 rbio = container_of(work, struct btrfs_raid_bio, work);
2099 __raid56_parity_recover(rbio);
2100}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
new file mode 100644
index 000000000000..ea5d73bfdfbe
--- /dev/null
+++ b/fs/btrfs/raid56.h
@@ -0,0 +1,51 @@
1/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19
20#ifndef __BTRFS_RAID56__
21#define __BTRFS_RAID56__
22static inline int nr_parity_stripes(struct map_lookup *map)
23{
24 if (map->type & BTRFS_BLOCK_GROUP_RAID5)
25 return 1;
26 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
27 return 2;
28 else
29 return 0;
30}
31
32static inline int nr_data_stripes(struct map_lookup *map)
33{
34 return map->num_stripes - nr_parity_stripes(map);
35}
36#define RAID5_P_STRIPE ((u64)-2)
37#define RAID6_Q_STRIPE ((u64)-1)
38
39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
40 ((x) == RAID6_Q_STRIPE))
41
42int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
43 struct btrfs_bio *bbio, u64 *raid_map,
44 u64 stripe_len, int mirror_num);
45int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
46 struct btrfs_bio *bbio, u64 *raid_map,
47 u64 stripe_len);
48
49int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
50void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
51#endif
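
The header keys everything off two sentinel values: the raid_map entry for the P stripe is (u64)-2 and for the Q stripe (u64)-1, so is_parity_stripe() can tell parity slots apart from real logical addresses, and nr_data_stripes() is simply the stripe count minus the parity count. A small sketch of how a caller such as scrub uses the helper, with a stand-in struct instead of the real map_lookup from volumes.h and block-group flag values assumed for illustration:

#include <stdio.h>

#define BTRFS_BLOCK_GROUP_RAID5  (1ULL << 7)   /* values assumed for illustration */
#define BTRFS_BLOCK_GROUP_RAID6  (1ULL << 8)

struct map_lookup_stub {                       /* stand-in for struct map_lookup */
	unsigned long long type;
	int num_stripes;
};

static int nr_parity_stripes(const struct map_lookup_stub *map)
{
	if (map->type & BTRFS_BLOCK_GROUP_RAID5)
		return 1;
	if (map->type & BTRFS_BLOCK_GROUP_RAID6)
		return 2;
	return 0;
}

int main(void)
{
	struct map_lookup_stub map = { BTRFS_BLOCK_GROUP_RAID6, 6 };

	/* a 6-device raid6 chunk has 4 data stripes and 2 parity stripes */
	printf("data stripes: %d\n", map.num_stripes - nr_parity_stripes(&map));
	return 0;
}
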
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 300e09ac3659..b67171e6d688 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1269,6 +1269,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
1269 } 1269 }
1270 spin_unlock(&rc->reloc_root_tree.lock); 1270 spin_unlock(&rc->reloc_root_tree.lock);
1271 1271
1272 if (!node)
1273 return 0;
1272 BUG_ON((struct btrfs_root *)node->data != root); 1274 BUG_ON((struct btrfs_root *)node->data != root);
1273 1275
1274 if (!del) { 1276 if (!del) {
@@ -2238,13 +2240,28 @@ again:
2238} 2240}
2239 2241
2240static noinline_for_stack 2242static noinline_for_stack
2243void free_reloc_roots(struct list_head *list)
2244{
2245 struct btrfs_root *reloc_root;
2246
2247 while (!list_empty(list)) {
2248 reloc_root = list_entry(list->next, struct btrfs_root,
2249 root_list);
2250 __update_reloc_root(reloc_root, 1);
2251 free_extent_buffer(reloc_root->node);
2252 free_extent_buffer(reloc_root->commit_root);
2253 kfree(reloc_root);
2254 }
2255}
2256
2257static noinline_for_stack
2241int merge_reloc_roots(struct reloc_control *rc) 2258int merge_reloc_roots(struct reloc_control *rc)
2242{ 2259{
2243 struct btrfs_root *root; 2260 struct btrfs_root *root;
2244 struct btrfs_root *reloc_root; 2261 struct btrfs_root *reloc_root;
2245 LIST_HEAD(reloc_roots); 2262 LIST_HEAD(reloc_roots);
2246 int found = 0; 2263 int found = 0;
2247 int ret; 2264 int ret = 0;
2248again: 2265again:
2249 root = rc->extent_root; 2266 root = rc->extent_root;
2250 2267
@@ -2270,20 +2287,33 @@ again:
2270 BUG_ON(root->reloc_root != reloc_root); 2287 BUG_ON(root->reloc_root != reloc_root);
2271 2288
2272 ret = merge_reloc_root(rc, root); 2289 ret = merge_reloc_root(rc, root);
2273 BUG_ON(ret); 2290 if (ret)
2291 goto out;
2274 } else { 2292 } else {
2275 list_del_init(&reloc_root->root_list); 2293 list_del_init(&reloc_root->root_list);
2276 } 2294 }
2277 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); 2295 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2278 BUG_ON(ret < 0); 2296 if (ret < 0) {
2297 if (list_empty(&reloc_root->root_list))
2298 list_add_tail(&reloc_root->root_list,
2299 &reloc_roots);
2300 goto out;
2301 }
2279 } 2302 }
2280 2303
2281 if (found) { 2304 if (found) {
2282 found = 0; 2305 found = 0;
2283 goto again; 2306 goto again;
2284 } 2307 }
2308out:
2309 if (ret) {
2310 btrfs_std_error(root->fs_info, ret);
2311 if (!list_empty(&reloc_roots))
2312 free_reloc_roots(&reloc_roots);
2313 }
2314
2285 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2315 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2286 return 0; 2316 return ret;
2287} 2317}
2288 2318
2289static void free_block_list(struct rb_root *blocks) 2319static void free_block_list(struct rb_root *blocks)
@@ -2818,8 +2848,10 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2818 int err = 0; 2848 int err = 0;
2819 2849
2820 path = btrfs_alloc_path(); 2850 path = btrfs_alloc_path();
2821 if (!path) 2851 if (!path) {
2822 return -ENOMEM; 2852 err = -ENOMEM;
2853 goto out_path;
2854 }
2823 2855
2824 rb_node = rb_first(blocks); 2856 rb_node = rb_first(blocks);
2825 while (rb_node) { 2857 while (rb_node) {
@@ -2858,10 +2890,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2858 rb_node = rb_next(rb_node); 2890 rb_node = rb_next(rb_node);
2859 } 2891 }
2860out: 2892out:
2861 free_block_list(blocks);
2862 err = finish_pending_nodes(trans, rc, path, err); 2893 err = finish_pending_nodes(trans, rc, path, err);
2863 2894
2864 btrfs_free_path(path); 2895 btrfs_free_path(path);
2896out_path:
2897 free_block_list(blocks);
2865 return err; 2898 return err;
2866} 2899}
2867 2900
@@ -3017,7 +3050,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
3017 } 3050 }
3018 } 3051 }
3019 3052
3020 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 3053 page_start = page_offset(page);
3021 page_end = page_start + PAGE_CACHE_SIZE - 1; 3054 page_end = page_start + PAGE_CACHE_SIZE - 1;
3022 3055
3023 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); 3056 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
@@ -3472,7 +3505,7 @@ out:
3472} 3505}
3473 3506
3474/* 3507/*
3475 * hepler to find all tree blocks that reference a given data extent 3508 * helper to find all tree blocks that reference a given data extent
3476 */ 3509 */
3477static noinline_for_stack 3510static noinline_for_stack
3478int add_data_references(struct reloc_control *rc, 3511int add_data_references(struct reloc_control *rc,
@@ -3566,7 +3599,7 @@ int add_data_references(struct reloc_control *rc,
3566} 3599}
3567 3600
3568/* 3601/*
3569 * hepler to find next unprocessed extent 3602 * helper to find next unprocessed extent
3570 */ 3603 */
3571static noinline_for_stack 3604static noinline_for_stack
3572int find_next_extent(struct btrfs_trans_handle *trans, 3605int find_next_extent(struct btrfs_trans_handle *trans,
@@ -3698,7 +3731,15 @@ int prepare_to_relocate(struct reloc_control *rc)
3698 set_reloc_control(rc); 3731 set_reloc_control(rc);
3699 3732
3700 trans = btrfs_join_transaction(rc->extent_root); 3733 trans = btrfs_join_transaction(rc->extent_root);
3701 BUG_ON(IS_ERR(trans)); 3734 if (IS_ERR(trans)) {
3735 unset_reloc_control(rc);
3736 /*
3737 * extent tree is not a ref_cow tree and has no reloc_root to
3738 * cleanup. And callers are responsible to free the above
3739 * block rsv.
3740 */
3741 return PTR_ERR(trans);
3742 }
3702 btrfs_commit_transaction(trans, rc->extent_root); 3743 btrfs_commit_transaction(trans, rc->extent_root);
3703 return 0; 3744 return 0;
3704} 3745}
@@ -3730,7 +3771,11 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3730 while (1) { 3771 while (1) {
3731 progress++; 3772 progress++;
3732 trans = btrfs_start_transaction(rc->extent_root, 0); 3773 trans = btrfs_start_transaction(rc->extent_root, 0);
3733 BUG_ON(IS_ERR(trans)); 3774 if (IS_ERR(trans)) {
3775 err = PTR_ERR(trans);
3776 trans = NULL;
3777 break;
3778 }
3734restart: 3779restart:
3735 if (update_backref_cache(trans, &rc->backref_cache)) { 3780 if (update_backref_cache(trans, &rc->backref_cache)) {
3736 btrfs_end_transaction(trans, rc->extent_root); 3781 btrfs_end_transaction(trans, rc->extent_root);
@@ -4264,14 +4309,9 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4264out_free: 4309out_free:
4265 kfree(rc); 4310 kfree(rc);
4266out: 4311out:
4267 while (!list_empty(&reloc_roots)) { 4312 if (!list_empty(&reloc_roots))
4268 reloc_root = list_entry(reloc_roots.next, 4313 free_reloc_roots(&reloc_roots);
4269 struct btrfs_root, root_list); 4314
4270 list_del(&reloc_root->root_list);
4271 free_extent_buffer(reloc_root->node);
4272 free_extent_buffer(reloc_root->commit_root);
4273 kfree(reloc_root);
4274 }
4275 btrfs_free_path(path); 4315 btrfs_free_path(path);
4276 4316
4277 if (err == 0) { 4317 if (err == 0) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index bdbb94f245c9..53c3501fa4ca 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -28,6 +28,7 @@
28#include "dev-replace.h" 28#include "dev-replace.h"
29#include "check-integrity.h" 29#include "check-integrity.h"
30#include "rcu-string.h" 30#include "rcu-string.h"
31#include "raid56.h"
31 32
32/* 33/*
33 * This is only the first step towards a full-features scrub. It reads all 34 * This is only the first step towards a full-features scrub. It reads all
@@ -580,20 +581,29 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
580 int corrected = 0; 581 int corrected = 0;
581 struct btrfs_key key; 582 struct btrfs_key key;
582 struct inode *inode = NULL; 583 struct inode *inode = NULL;
584 struct btrfs_fs_info *fs_info;
583 u64 end = offset + PAGE_SIZE - 1; 585 u64 end = offset + PAGE_SIZE - 1;
584 struct btrfs_root *local_root; 586 struct btrfs_root *local_root;
587 int srcu_index;
585 588
586 key.objectid = root; 589 key.objectid = root;
587 key.type = BTRFS_ROOT_ITEM_KEY; 590 key.type = BTRFS_ROOT_ITEM_KEY;
588 key.offset = (u64)-1; 591 key.offset = (u64)-1;
589 local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); 592
590 if (IS_ERR(local_root)) 593 fs_info = fixup->root->fs_info;
594 srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
595
596 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
597 if (IS_ERR(local_root)) {
598 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
591 return PTR_ERR(local_root); 599 return PTR_ERR(local_root);
600 }
592 601
593 key.type = BTRFS_INODE_ITEM_KEY; 602 key.type = BTRFS_INODE_ITEM_KEY;
594 key.objectid = inum; 603 key.objectid = inum;
595 key.offset = 0; 604 key.offset = 0;
596 inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); 605 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
606 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
597 if (IS_ERR(inode)) 607 if (IS_ERR(inode))
598 return PTR_ERR(inode); 608 return PTR_ERR(inode);
599 609
@@ -606,7 +616,6 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
606 } 616 }
607 617
608 if (PageUptodate(page)) { 618 if (PageUptodate(page)) {
609 struct btrfs_fs_info *fs_info;
610 if (PageDirty(page)) { 619 if (PageDirty(page)) {
611 /* 620 /*
612 * we need to write the data to the defect sector. the 621 * we need to write the data to the defect sector. the
@@ -2246,6 +2255,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2246 struct btrfs_device *extent_dev; 2255 struct btrfs_device *extent_dev;
2247 int extent_mirror_num; 2256 int extent_mirror_num;
2248 2257
2258 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2259 BTRFS_BLOCK_GROUP_RAID6)) {
2260 if (num >= nr_data_stripes(map)) {
2261 return 0;
2262 }
2263 }
2264
2249 nstripes = length; 2265 nstripes = length;
2250 offset = 0; 2266 offset = 0;
2251 do_div(nstripes, map->stripe_len); 2267 do_div(nstripes, map->stripe_len);
@@ -2700,7 +2716,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2700 int ret; 2716 int ret;
2701 struct btrfs_root *root = sctx->dev_root; 2717 struct btrfs_root *root = sctx->dev_root;
2702 2718
2703 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2719 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
2704 return -EIO; 2720 return -EIO;
2705 2721
2706 gen = root->fs_info->last_trans_committed; 2722 gen = root->fs_info->last_trans_committed;
@@ -3180,18 +3196,25 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3180 u64 physical_for_dev_replace; 3196 u64 physical_for_dev_replace;
3181 u64 len; 3197 u64 len;
3182 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; 3198 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3199 int srcu_index;
3183 3200
3184 key.objectid = root; 3201 key.objectid = root;
3185 key.type = BTRFS_ROOT_ITEM_KEY; 3202 key.type = BTRFS_ROOT_ITEM_KEY;
3186 key.offset = (u64)-1; 3203 key.offset = (u64)-1;
3204
3205 srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
3206
3187 local_root = btrfs_read_fs_root_no_name(fs_info, &key); 3207 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3188 if (IS_ERR(local_root)) 3208 if (IS_ERR(local_root)) {
3209 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3189 return PTR_ERR(local_root); 3210 return PTR_ERR(local_root);
3211 }
3190 3212
3191 key.type = BTRFS_INODE_ITEM_KEY; 3213 key.type = BTRFS_INODE_ITEM_KEY;
3192 key.objectid = inum; 3214 key.objectid = inum;
3193 key.offset = 0; 3215 key.offset = 0;
3194 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); 3216 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3217 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3195 if (IS_ERR(inode)) 3218 if (IS_ERR(inode))
3196 return PTR_ERR(inode); 3219 return PTR_ERR(inode);
3197 3220
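
Both scrub hunks apply the same fix: the subvolume root lookup and the following btrfs_iget() now run inside an SRCU read-side critical section on fs_info->subvol_srcu, so the root cannot disappear under the reader, and the section is exited on the error path as well as on success. Condensed into one hypothetical helper (a sketch of the pattern, not the literal kernel code):

static struct inode *scrub_iget_under_srcu(struct btrfs_fs_info *fs_info,
					   struct btrfs_key *key, u64 inum)
{
	struct btrfs_root *local_root;
	struct inode *inode;
	int srcu_index;

	/* pin the subvolume trees for the duration of the lookup */
	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);

	local_root = btrfs_read_fs_root_no_name(fs_info, key);
	if (IS_ERR(local_root)) {
		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
		return ERR_CAST(local_root);
	}

	key->type = BTRFS_INODE_ITEM_KEY;
	key->objectid = inum;
	key->offset = 0;
	inode = btrfs_iget(fs_info->sb, key, local_root, NULL);

	/* drop the read lock on every exit path */
	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
	return inode;
}
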
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 54454542ad40..f7a8b861058b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -85,6 +85,7 @@ struct send_ctx {
85 u32 send_max_size; 85 u32 send_max_size;
86 u64 total_send_size; 86 u64 total_send_size;
87 u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; 87 u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
88 u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
88 89
89 struct vfsmount *mnt; 90 struct vfsmount *mnt;
90 91
@@ -1814,8 +1815,10 @@ static int name_cache_insert(struct send_ctx *sctx,
1814 (unsigned long)nce->ino); 1815 (unsigned long)nce->ino);
1815 if (!nce_head) { 1816 if (!nce_head) {
1816 nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); 1817 nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
1817 if (!nce_head) 1818 if (!nce_head) {
1819 kfree(nce);
1818 return -ENOMEM; 1820 return -ENOMEM;
1821 }
1819 INIT_LIST_HEAD(nce_head); 1822 INIT_LIST_HEAD(nce_head);
1820 1823
1821 ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); 1824 ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
@@ -3707,6 +3710,39 @@ out:
3707 return ret; 3710 return ret;
3708} 3711}
3709 3712
3713/*
3714 * Send an update extent command to user space.
3715 */
3716static int send_update_extent(struct send_ctx *sctx,
3717 u64 offset, u32 len)
3718{
3719 int ret = 0;
3720 struct fs_path *p;
3721
3722 p = fs_path_alloc(sctx);
3723 if (!p)
3724 return -ENOMEM;
3725
3726 ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
3727 if (ret < 0)
3728 goto out;
3729
3730 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
3731 if (ret < 0)
3732 goto out;
3733
3734 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
3735 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
3736 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
3737
3738 ret = send_cmd(sctx);
3739
3740tlv_put_failure:
3741out:
3742 fs_path_free(sctx, p);
3743 return ret;
3744}
3745
3710static int send_write_or_clone(struct send_ctx *sctx, 3746static int send_write_or_clone(struct send_ctx *sctx,
3711 struct btrfs_path *path, 3747 struct btrfs_path *path,
3712 struct btrfs_key *key, 3748 struct btrfs_key *key,
@@ -3742,7 +3778,11 @@ static int send_write_or_clone(struct send_ctx *sctx,
3742 goto out; 3778 goto out;
3743 } 3779 }
3744 3780
3745 if (!clone_root) { 3781 if (clone_root) {
3782 ret = send_clone(sctx, offset, len, clone_root);
3783 } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
3784 ret = send_update_extent(sctx, offset, len);
3785 } else {
3746 while (pos < len) { 3786 while (pos < len) {
3747 l = len - pos; 3787 l = len - pos;
3748 if (l > BTRFS_SEND_READ_SIZE) 3788 if (l > BTRFS_SEND_READ_SIZE)
@@ -3755,10 +3795,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
3755 pos += ret; 3795 pos += ret;
3756 } 3796 }
3757 ret = 0; 3797 ret = 0;
3758 } else {
3759 ret = send_clone(sctx, offset, len, clone_root);
3760 } 3798 }
3761
3762out: 3799out:
3763 return ret; 3800 return ret;
3764} 3801}
@@ -4534,7 +4571,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4534 struct btrfs_fs_info *fs_info; 4571 struct btrfs_fs_info *fs_info;
4535 struct btrfs_ioctl_send_args *arg = NULL; 4572 struct btrfs_ioctl_send_args *arg = NULL;
4536 struct btrfs_key key; 4573 struct btrfs_key key;
4537 struct file *filp = NULL;
4538 struct send_ctx *sctx = NULL; 4574 struct send_ctx *sctx = NULL;
4539 u32 i; 4575 u32 i;
4540 u64 *clone_sources_tmp = NULL; 4576 u64 *clone_sources_tmp = NULL;
@@ -4542,7 +4578,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4542 if (!capable(CAP_SYS_ADMIN)) 4578 if (!capable(CAP_SYS_ADMIN))
4543 return -EPERM; 4579 return -EPERM;
4544 4580
4545 send_root = BTRFS_I(fdentry(mnt_file)->d_inode)->root; 4581 send_root = BTRFS_I(file_inode(mnt_file))->root;
4546 fs_info = send_root->fs_info; 4582 fs_info = send_root->fs_info;
4547 4583
4548 arg = memdup_user(arg_, sizeof(*arg)); 4584 arg = memdup_user(arg_, sizeof(*arg));
@@ -4559,6 +4595,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4559 goto out; 4595 goto out;
4560 } 4596 }
4561 4597
4598 if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) {
4599 ret = -EINVAL;
4600 goto out;
4601 }
4602
4562 sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); 4603 sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
4563 if (!sctx) { 4604 if (!sctx) {
4564 ret = -ENOMEM; 4605 ret = -ENOMEM;
@@ -4570,6 +4611,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4570 INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); 4611 INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
4571 INIT_LIST_HEAD(&sctx->name_cache_list); 4612 INIT_LIST_HEAD(&sctx->name_cache_list);
4572 4613
4614 sctx->flags = arg->flags;
4615
4573 sctx->send_filp = fget(arg->send_fd); 4616 sctx->send_filp = fget(arg->send_fd);
4574 if (IS_ERR(sctx->send_filp)) { 4617 if (IS_ERR(sctx->send_filp)) {
4575 ret = PTR_ERR(sctx->send_filp); 4618 ret = PTR_ERR(sctx->send_filp);
@@ -4671,8 +4714,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4671 goto out; 4714 goto out;
4672 4715
4673out: 4716out:
4674 if (filp)
4675 fput(filp);
4676 kfree(arg); 4717 kfree(arg);
4677 vfree(clone_sources_tmp); 4718 vfree(clone_sources_tmp);
4678 4719
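
The new BTRFS_SEND_FLAG_NO_FILE_DATA flag is passed in through the u64 flags member of struct btrfs_ioctl_send_args; when set, send_write_or_clone() emits BTRFS_SEND_C_UPDATE_EXTENT commands instead of writing out file data. A rough userspace sketch of issuing such a metadata-only send (the paths are hypothetical and the definitions are assumed to be available from the uapi btrfs header introduced by this series):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>    /* struct btrfs_ioctl_send_args, BTRFS_IOC_SEND */

int main(void)
{
	struct btrfs_ioctl_send_args args;
	int subvol = open("/mnt/snap", O_RDONLY);                 /* hypothetical snapshot */
	int out = open("/tmp/stream", O_WRONLY | O_CREAT, 0600);  /* stream destination */

	if (subvol < 0 || out < 0)
		return 1;

	memset(&args, 0, sizeof(args));
	args.send_fd = out;
	args.flags = BTRFS_SEND_FLAG_NO_FILE_DATA;   /* metadata-only stream */

	if (ioctl(subvol, BTRFS_IOC_SEND, &args) < 0) {
		perror("BTRFS_IOC_SEND");
		return 1;
	}
	return 0;
}
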
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 1bf4f32fd4ef..8bb18f7ccaa6 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -86,6 +86,7 @@ enum btrfs_send_cmd {
86 BTRFS_SEND_C_UTIMES, 86 BTRFS_SEND_C_UTIMES,
87 87
88 BTRFS_SEND_C_END, 88 BTRFS_SEND_C_END,
89 BTRFS_SEND_C_UPDATE_EXTENT,
89 __BTRFS_SEND_C_MAX, 90 __BTRFS_SEND_C_MAX,
90}; 91};
91#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) 92#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 99545df1b86c..f6b88595f858 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,13 +41,13 @@
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/ratelimit.h> 43#include <linux/ratelimit.h>
44#include <linux/btrfs.h>
44#include "compat.h" 45#include "compat.h"
45#include "delayed-inode.h" 46#include "delayed-inode.h"
46#include "ctree.h" 47#include "ctree.h"
47#include "disk-io.h" 48#include "disk-io.h"
48#include "transaction.h" 49#include "transaction.h"
49#include "btrfs_inode.h" 50#include "btrfs_inode.h"
50#include "ioctl.h"
51#include "print-tree.h" 51#include "print-tree.h"
52#include "xattr.h" 52#include "xattr.h"
53#include "volumes.h" 53#include "volumes.h"
@@ -63,8 +63,7 @@
63static const struct super_operations btrfs_super_ops; 63static const struct super_operations btrfs_super_ops;
64static struct file_system_type btrfs_fs_type; 64static struct file_system_type btrfs_fs_type;
65 65
66static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 66static const char *btrfs_decode_error(int errno, char nbuf[16])
67 char nbuf[16])
68{ 67{
69 char *errstr = NULL; 68 char *errstr = NULL;
70 69
@@ -98,7 +97,7 @@ static void __save_error_info(struct btrfs_fs_info *fs_info)
98 * today we only save the error info into ram. Long term we'll 97 * today we only save the error info into ram. Long term we'll
99 * also send it down to the disk 98 * also send it down to the disk
100 */ 99 */
101 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; 100 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
102} 101}
103 102
104static void save_error_info(struct btrfs_fs_info *fs_info) 103static void save_error_info(struct btrfs_fs_info *fs_info)
@@ -114,7 +113,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
114 if (sb->s_flags & MS_RDONLY) 113 if (sb->s_flags & MS_RDONLY)
115 return; 114 return;
116 115
117 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 116 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
118 sb->s_flags |= MS_RDONLY; 117 sb->s_flags |= MS_RDONLY;
119 printk(KERN_INFO "btrfs is forced readonly\n"); 118 printk(KERN_INFO "btrfs is forced readonly\n");
120 /* 119 /*
@@ -142,8 +141,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
142 struct super_block *sb = fs_info->sb; 141 struct super_block *sb = fs_info->sb;
143 char nbuf[16]; 142 char nbuf[16];
144 const char *errstr; 143 const char *errstr;
145 va_list args;
146 va_start(args, fmt);
147 144
148 /* 145 /*
149 * Special case: if the error is EROFS, and we're already 146 * Special case: if the error is EROFS, and we're already
@@ -152,15 +149,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
152 if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) 149 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
153 return; 150 return;
154 151
155 errstr = btrfs_decode_error(fs_info, errno, nbuf); 152 errstr = btrfs_decode_error(errno, nbuf);
156 if (fmt) { 153 if (fmt) {
157 struct va_format vaf = { 154 struct va_format vaf;
158 .fmt = fmt, 155 va_list args;
159 .va = &args, 156
160 }; 157 va_start(args, fmt);
158 vaf.fmt = fmt;
159 vaf.va = &args;
161 160
162 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", 161 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n",
163 sb->s_id, function, line, errstr, &vaf); 162 sb->s_id, function, line, errstr, &vaf);
163 va_end(args);
164 } else { 164 } else {
165 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", 165 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
166 sb->s_id, function, line, errstr); 166 sb->s_id, function, line, errstr);
@@ -171,7 +171,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
171 save_error_info(fs_info); 171 save_error_info(fs_info);
172 btrfs_handle_error(fs_info); 172 btrfs_handle_error(fs_info);
173 } 173 }
174 va_end(args);
175} 174}
176 175
177static const char * const logtypes[] = { 176static const char * const logtypes[] = {
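
The two hunks above fix the va_list handling in __btrfs_std_error(): the old code called va_start() unconditionally but could return early (the EROFS case) without ever reaching va_end(). The safe shape is to start and end the list in the same scope that actually consumes it, sketched here in userspace with vprintf() standing in for the %pV printk:

#include <stdarg.h>
#include <stdio.h>

static void report(const char *prefix, const char *fmt, ...)
{
	if (fmt) {
		va_list args;

		va_start(args, fmt);    /* start only when we will consume it */
		printf("%s: ", prefix);
		vprintf(fmt, args);
		putchar('\n');
		va_end(args);           /* always paired with the va_start above */
	} else {
		printf("%s\n", prefix);
	}
}

int main(void)
{
	report("btrfs error", "bad block %llu", 12345ULL);
	report("btrfs error", NULL);
	return 0;
}
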
@@ -261,13 +260,13 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
261 char nbuf[16]; 260 char nbuf[16];
262 const char *errstr; 261 const char *errstr;
263 262
264 errstr = btrfs_decode_error(root->fs_info, errno, nbuf); 263 errstr = btrfs_decode_error(errno, nbuf);
265 btrfs_printk(root->fs_info, 264 btrfs_printk(root->fs_info,
266 "%s:%d: Aborting unused transaction(%s).\n", 265 "%s:%d: Aborting unused transaction(%s).\n",
267 function, line, errstr); 266 function, line, errstr);
268 return; 267 return;
269 } 268 }
270 trans->transaction->aborted = errno; 269 ACCESS_ONCE(trans->transaction->aborted) = errno;
271 __btrfs_std_error(root->fs_info, function, line, errno, NULL); 270 __btrfs_std_error(root->fs_info, function, line, errno, NULL);
272} 271}
273/* 272/*
@@ -289,8 +288,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
289 va_start(args, fmt); 288 va_start(args, fmt);
290 vaf.va = &args; 289 vaf.va = &args;
291 290
292 errstr = btrfs_decode_error(fs_info, errno, nbuf); 291 errstr = btrfs_decode_error(errno, nbuf);
293 if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR) 292 if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
294 panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", 293 panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
295 s_id, function, line, &vaf, errstr); 294 s_id, function, line, &vaf, errstr);
296 295
@@ -438,6 +437,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
438 case Opt_compress_force: 437 case Opt_compress_force:
439 case Opt_compress_force_type: 438 case Opt_compress_force_type:
440 compress_force = true; 439 compress_force = true;
440 /* Fallthrough */
441 case Opt_compress: 441 case Opt_compress:
442 case Opt_compress_type: 442 case Opt_compress_type:
443 if (token == Opt_compress || 443 if (token == Opt_compress ||
@@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
519 case Opt_alloc_start: 519 case Opt_alloc_start:
520 num = match_strdup(&args[0]); 520 num = match_strdup(&args[0]);
521 if (num) { 521 if (num) {
522 mutex_lock(&info->chunk_mutex);
522 info->alloc_start = memparse(num, NULL); 523 info->alloc_start = memparse(num, NULL);
524 mutex_unlock(&info->chunk_mutex);
523 kfree(num); 525 kfree(num);
524 printk(KERN_INFO 526 printk(KERN_INFO
525 "btrfs: allocations start at %llu\n", 527 "btrfs: allocations start at %llu\n",
@@ -876,7 +878,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
876 878
877 btrfs_wait_ordered_extents(root, 0); 879 btrfs_wait_ordered_extents(root, 0);
878 880
879 trans = btrfs_attach_transaction(root); 881 trans = btrfs_attach_transaction_barrier(root);
880 if (IS_ERR(trans)) { 882 if (IS_ERR(trans)) {
881 /* no transaction, don't bother */ 883 /* no transaction, don't bother */
882 if (PTR_ERR(trans) == -ENOENT) 884 if (PTR_ERR(trans) == -ENOENT)
@@ -1200,6 +1202,38 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1200 new_pool_size); 1202 new_pool_size);
1201} 1203}
1202 1204
1205static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info,
1206 unsigned long old_opts, int flags)
1207{
1208 set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
1209
1210 if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
1211 (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
1212 (flags & MS_RDONLY))) {
1213 /* wait for any defraggers to finish */
1214 wait_event(fs_info->transaction_wait,
1215 (atomic_read(&fs_info->defrag_running) == 0));
1216 if (flags & MS_RDONLY)
1217 sync_filesystem(fs_info->sb);
1218 }
1219}
1220
1221static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
1222 unsigned long old_opts)
1223{
1224 /*
1225	 * We need to clean up all defragable inodes if autodefrag is
1226	 * disabled or the fs is R/O.
1227 */
1228 if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
1229 (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
1230 (fs_info->sb->s_flags & MS_RDONLY))) {
1231 btrfs_cleanup_defrag_inodes(fs_info);
1232 }
1233
1234 clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
1235}
1236
1203static int btrfs_remount(struct super_block *sb, int *flags, char *data) 1237static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1204{ 1238{
1205 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 1239 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -1213,6 +1247,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1213 unsigned int old_metadata_ratio = fs_info->metadata_ratio; 1247 unsigned int old_metadata_ratio = fs_info->metadata_ratio;
1214 int ret; 1248 int ret;
1215 1249
1250 btrfs_remount_prepare(fs_info, old_opts, *flags);
1251
1216 ret = btrfs_parse_options(root, data); 1252 ret = btrfs_parse_options(root, data);
1217 if (ret) { 1253 if (ret) {
1218 ret = -EINVAL; 1254 ret = -EINVAL;
@@ -1223,7 +1259,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1223 fs_info->thread_pool_size, old_thread_pool_size); 1259 fs_info->thread_pool_size, old_thread_pool_size);
1224 1260
1225 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 1261 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
1226 return 0; 1262 goto out;
1227 1263
1228 if (*flags & MS_RDONLY) { 1264 if (*flags & MS_RDONLY) {
1229 /* 1265 /*
@@ -1278,7 +1314,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1278 } 1314 }
1279 sb->s_flags &= ~MS_RDONLY; 1315 sb->s_flags &= ~MS_RDONLY;
1280 } 1316 }
1281 1317out:
1318 btrfs_remount_cleanup(fs_info, old_opts);
1282 return 0; 1319 return 0;
1283 1320
1284restore: 1321restore:
@@ -1289,10 +1326,13 @@ restore:
1289 fs_info->mount_opt = old_opts; 1326 fs_info->mount_opt = old_opts;
1290 fs_info->compress_type = old_compress_type; 1327 fs_info->compress_type = old_compress_type;
1291 fs_info->max_inline = old_max_inline; 1328 fs_info->max_inline = old_max_inline;
1329 mutex_lock(&fs_info->chunk_mutex);
1292 fs_info->alloc_start = old_alloc_start; 1330 fs_info->alloc_start = old_alloc_start;
1331 mutex_unlock(&fs_info->chunk_mutex);
1293 btrfs_resize_thread_pool(fs_info, 1332 btrfs_resize_thread_pool(fs_info,
1294 old_thread_pool_size, fs_info->thread_pool_size); 1333 old_thread_pool_size, fs_info->thread_pool_size);
1295 fs_info->metadata_ratio = old_metadata_ratio; 1334 fs_info->metadata_ratio = old_metadata_ratio;
1335 btrfs_remount_cleanup(fs_info, old_opts);
1296 return ret; 1336 return ret;
1297} 1337}
1298 1338
@@ -1518,6 +1558,7 @@ static struct file_system_type btrfs_fs_type = {
1518 .kill_sb = btrfs_kill_super, 1558 .kill_sb = btrfs_kill_super,
1519 .fs_flags = FS_REQUIRES_DEV, 1559 .fs_flags = FS_REQUIRES_DEV,
1520}; 1560};
1561MODULE_ALIAS_FS("btrfs");
1521 1562
1522/* 1563/*
1523 * used by btrfsctl to scan devices when no FS is mounted 1564 * used by btrfsctl to scan devices when no FS is mounted
@@ -1559,7 +1600,7 @@ static int btrfs_freeze(struct super_block *sb)
1559 struct btrfs_trans_handle *trans; 1600 struct btrfs_trans_handle *trans;
1560 struct btrfs_root *root = btrfs_sb(sb)->tree_root; 1601 struct btrfs_root *root = btrfs_sb(sb)->tree_root;
1561 1602
1562 trans = btrfs_attach_transaction(root); 1603 trans = btrfs_attach_transaction_barrier(root);
1563 if (IS_ERR(trans)) { 1604 if (IS_ERR(trans)) {
1564 /* no transaction, don't bother */ 1605 /* no transaction, don't bother */
1565 if (PTR_ERR(trans) == -ENOENT) 1606 if (PTR_ERR(trans) == -ENOENT)
@@ -1684,10 +1725,14 @@ static int __init init_btrfs_fs(void)
1684 if (err) 1725 if (err)
1685 goto free_delayed_inode; 1726 goto free_delayed_inode;
1686 1727
1687 err = btrfs_interface_init(); 1728 err = btrfs_delayed_ref_init();
1688 if (err) 1729 if (err)
1689 goto free_auto_defrag; 1730 goto free_auto_defrag;
1690 1731
1732 err = btrfs_interface_init();
1733 if (err)
1734 goto free_delayed_ref;
1735
1691 err = register_filesystem(&btrfs_fs_type); 1736 err = register_filesystem(&btrfs_fs_type);
1692 if (err) 1737 if (err)
1693 goto unregister_ioctl; 1738 goto unregister_ioctl;
@@ -1699,6 +1744,8 @@ static int __init init_btrfs_fs(void)
1699 1744
1700unregister_ioctl: 1745unregister_ioctl:
1701 btrfs_interface_exit(); 1746 btrfs_interface_exit();
1747free_delayed_ref:
1748 btrfs_delayed_ref_exit();
1702free_auto_defrag: 1749free_auto_defrag:
1703 btrfs_auto_defrag_exit(); 1750 btrfs_auto_defrag_exit();
1704free_delayed_inode: 1751free_delayed_inode:
@@ -1720,6 +1767,7 @@ free_compress:
1720static void __exit exit_btrfs_fs(void) 1767static void __exit exit_btrfs_fs(void)
1721{ 1768{
1722 btrfs_destroy_cachep(); 1769 btrfs_destroy_cachep();
1770 btrfs_delayed_ref_exit();
1723 btrfs_auto_defrag_exit(); 1771 btrfs_auto_defrag_exit();
1724 btrfs_delayed_inode_exit(); 1772 btrfs_delayed_inode_exit();
1725 ordered_data_exit(); 1773 ordered_data_exit();
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index daac9ae6d731..5b326cd60a4a 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -21,7 +21,6 @@
21#include <linux/spinlock.h> 21#include <linux/spinlock.h>
22#include <linux/completion.h> 22#include <linux/completion.h>
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/module.h>
25#include <linux/kobject.h> 24#include <linux/kobject.h>
26 25
27#include "ctree.h" 26#include "ctree.h"
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 87fac9a21ea5..9250b9c4f01e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -40,7 +40,6 @@ void put_transaction(struct btrfs_transaction *transaction)
40 if (atomic_dec_and_test(&transaction->use_count)) { 40 if (atomic_dec_and_test(&transaction->use_count)) {
41 BUG_ON(!list_empty(&transaction->list)); 41 BUG_ON(!list_empty(&transaction->list));
42 WARN_ON(transaction->delayed_refs.root.rb_node); 42 WARN_ON(transaction->delayed_refs.root.rb_node);
43 memset(transaction, 0, sizeof(*transaction));
44 kmem_cache_free(btrfs_transaction_cachep, transaction); 43 kmem_cache_free(btrfs_transaction_cachep, transaction);
45 } 44 }
46} 45}
@@ -51,6 +50,14 @@ static noinline void switch_commit_root(struct btrfs_root *root)
51 root->commit_root = btrfs_root_node(root); 50 root->commit_root = btrfs_root_node(root);
52} 51}
53 52
53static inline int can_join_transaction(struct btrfs_transaction *trans,
54 int type)
55{
56 return !(trans->in_commit &&
57 type != TRANS_JOIN &&
58 type != TRANS_JOIN_NOLOCK);
59}
60
54/* 61/*
55 * either allocate a new transaction or hop into the existing one 62 * either allocate a new transaction or hop into the existing one
56 */ 63 */
@@ -62,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type)
62 spin_lock(&fs_info->trans_lock); 69 spin_lock(&fs_info->trans_lock);
63loop: 70loop:
64 /* The file system has been taken offline. No new transactions. */ 71 /* The file system has been taken offline. No new transactions. */
65 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 72 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
66 spin_unlock(&fs_info->trans_lock); 73 spin_unlock(&fs_info->trans_lock);
67 return -EROFS; 74 return -EROFS;
68 } 75 }
@@ -86,6 +93,10 @@ loop:
86 spin_unlock(&fs_info->trans_lock); 93 spin_unlock(&fs_info->trans_lock);
87 return cur_trans->aborted; 94 return cur_trans->aborted;
88 } 95 }
96 if (!can_join_transaction(cur_trans, type)) {
97 spin_unlock(&fs_info->trans_lock);
98 return -EBUSY;
99 }
89 atomic_inc(&cur_trans->use_count); 100 atomic_inc(&cur_trans->use_count);
90 atomic_inc(&cur_trans->num_writers); 101 atomic_inc(&cur_trans->num_writers);
91 cur_trans->num_joined++; 102 cur_trans->num_joined++;
@@ -112,9 +123,8 @@ loop:
112 * to redo the trans_no_join checks above 123 * to redo the trans_no_join checks above
113 */ 124 */
114 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 125 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
115 cur_trans = fs_info->running_transaction;
116 goto loop; 126 goto loop;
117 } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 127 } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
118 spin_unlock(&fs_info->trans_lock); 128 spin_unlock(&fs_info->trans_lock);
119 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 129 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
120 return -EROFS; 130 return -EROFS;
@@ -156,8 +166,12 @@ loop:
156 166
157 spin_lock_init(&cur_trans->commit_lock); 167 spin_lock_init(&cur_trans->commit_lock);
158 spin_lock_init(&cur_trans->delayed_refs.lock); 168 spin_lock_init(&cur_trans->delayed_refs.lock);
169 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
170 atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
171 init_waitqueue_head(&cur_trans->delayed_refs.wait);
159 172
160 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 173 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
174 INIT_LIST_HEAD(&cur_trans->ordered_operations);
161 list_add_tail(&cur_trans->list, &fs_info->trans_list); 175 list_add_tail(&cur_trans->list, &fs_info->trans_list);
162 extent_io_tree_init(&cur_trans->dirty_pages, 176 extent_io_tree_init(&cur_trans->dirty_pages,
163 fs_info->btree_inode->i_mapping); 177 fs_info->btree_inode->i_mapping);
@@ -302,7 +316,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
302 int ret; 316 int ret;
303 u64 qgroup_reserved = 0; 317 u64 qgroup_reserved = 0;
304 318
305 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 319 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
306 return ERR_PTR(-EROFS); 320 return ERR_PTR(-EROFS);
307 321
308 if (current->journal_info) { 322 if (current->journal_info) {
@@ -333,12 +347,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
333 &root->fs_info->trans_block_rsv, 347 &root->fs_info->trans_block_rsv,
334 num_bytes, flush); 348 num_bytes, flush);
335 if (ret) 349 if (ret)
336 return ERR_PTR(ret); 350 goto reserve_fail;
337 } 351 }
338again: 352again:
339 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 353 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
340 if (!h) 354 if (!h) {
341 return ERR_PTR(-ENOMEM); 355 ret = -ENOMEM;
356 goto alloc_fail;
357 }
342 358
343 /* 359 /*
344 * If we are JOIN_NOLOCK we're already committing a transaction and 360 * If we are JOIN_NOLOCK we're already committing a transaction and
@@ -358,18 +374,17 @@ again:
358 374
359 do { 375 do {
360 ret = join_transaction(root, type); 376 ret = join_transaction(root, type);
361 if (ret == -EBUSY) 377 if (ret == -EBUSY) {
362 wait_current_trans(root); 378 wait_current_trans(root);
379 if (unlikely(type == TRANS_ATTACH))
380 ret = -ENOENT;
381 }
363 } while (ret == -EBUSY); 382 } while (ret == -EBUSY);
364 383
365 if (ret < 0) { 384 if (ret < 0) {
366 /* We must get the transaction if we are JOIN_NOLOCK. */ 385 /* We must get the transaction if we are JOIN_NOLOCK. */
367 BUG_ON(type == TRANS_JOIN_NOLOCK); 386 BUG_ON(type == TRANS_JOIN_NOLOCK);
368 387 goto join_fail;
369 if (type < TRANS_JOIN_NOLOCK)
370 sb_end_intwrite(root->fs_info->sb);
371 kmem_cache_free(btrfs_trans_handle_cachep, h);
372 return ERR_PTR(ret);
373 } 388 }
374 389
375 cur_trans = root->fs_info->running_transaction; 390 cur_trans = root->fs_info->running_transaction;
@@ -385,9 +400,10 @@ again:
385 h->block_rsv = NULL; 400 h->block_rsv = NULL;
386 h->orig_rsv = NULL; 401 h->orig_rsv = NULL;
387 h->aborted = 0; 402 h->aborted = 0;
388 h->qgroup_reserved = qgroup_reserved; 403 h->qgroup_reserved = 0;
389 h->delayed_ref_elem.seq = 0; 404 h->delayed_ref_elem.seq = 0;
390 h->type = type; 405 h->type = type;
406 h->allocating_chunk = false;
391 INIT_LIST_HEAD(&h->qgroup_ref_list); 407 INIT_LIST_HEAD(&h->qgroup_ref_list);
392 INIT_LIST_HEAD(&h->new_bgs); 408 INIT_LIST_HEAD(&h->new_bgs);
393 409
@@ -403,6 +419,7 @@ again:
403 h->block_rsv = &root->fs_info->trans_block_rsv; 419 h->block_rsv = &root->fs_info->trans_block_rsv;
404 h->bytes_reserved = num_bytes; 420 h->bytes_reserved = num_bytes;
405 } 421 }
422 h->qgroup_reserved = qgroup_reserved;
406 423
407got_it: 424got_it:
408 btrfs_record_root_in_trans(h, root); 425 btrfs_record_root_in_trans(h, root);
@@ -410,6 +427,19 @@ got_it:
410 if (!current->journal_info && type != TRANS_USERSPACE) 427 if (!current->journal_info && type != TRANS_USERSPACE)
411 current->journal_info = h; 428 current->journal_info = h;
412 return h; 429 return h;
430
431join_fail:
432 if (type < TRANS_JOIN_NOLOCK)
433 sb_end_intwrite(root->fs_info->sb);
434 kmem_cache_free(btrfs_trans_handle_cachep, h);
435alloc_fail:
436 if (num_bytes)
437 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
438 num_bytes);
439reserve_fail:
440 if (qgroup_reserved)
441 btrfs_qgroup_free(root, qgroup_reserved);
442 return ERR_PTR(ret);
413} 443}
414 444
415struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 445struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
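A self-contained restatement of the error-unwind pattern introduced above (reserve_fail/alloc_fail/join_fail): resources are released in the reverse order they were taken. Every name below is invented purely for illustration and is not part of this patch.

#include <stdio.h>

static int reserve_space(void)   { return 0; }	/* stand-ins that may fail */
static int alloc_handle(void)    { return 0; }
static int join_running(void)    { return -1; }	/* force the error path */
static void release_space(void)  { puts("released reservation"); }
static void free_handle(void)    { puts("freed handle"); }

static int example_start(void)
{
	int ret;

	ret = reserve_space();
	if (ret)
		goto reserve_fail;
	ret = alloc_handle();
	if (ret)
		goto alloc_fail;
	ret = join_running();
	if (ret)
		goto join_fail;
	return 0;

join_fail:
	free_handle();		/* undo alloc_handle() */
alloc_fail:
	release_space();	/* undo reserve_space() */
reserve_fail:
	return ret;
}

int main(void)
{
	return example_start() ? 1 : 0;
}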
@@ -441,11 +471,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
441 return start_transaction(root, 0, TRANS_USERSPACE, 0); 471 return start_transaction(root, 0, TRANS_USERSPACE, 0);
442} 472}
443 473
474/*
475 * btrfs_attach_transaction() - catch the running transaction
476 *
 477 * It is used when we want to commit the current transaction, but
 478 * don't want to start a new one.
479 *
 480 * Note: If this function returns -ENOENT, it just means there is no
 481 * running transaction. But it is possible that the inactive transaction
 482 * is still in memory, not fully on disk. If you want to be sure there is
 483 * no inactive transaction in the fs when -ENOENT is returned, you should
 484 * invoke
 485 * btrfs_attach_transaction_barrier()
486 */
444struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) 487struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
445{ 488{
446 return start_transaction(root, 0, TRANS_ATTACH, 0); 489 return start_transaction(root, 0, TRANS_ATTACH, 0);
447} 490}
448 491
492/*
 493 * btrfs_attach_transaction_barrier() - catch the running transaction
 494 *
 495 * It is similar to the above function, the difference is that this one
 496 * will wait for all the inactive transactions until they fully
 497 * complete.
498 */
499struct btrfs_trans_handle *
500btrfs_attach_transaction_barrier(struct btrfs_root *root)
501{
502 struct btrfs_trans_handle *trans;
503
504 trans = start_transaction(root, 0, TRANS_ATTACH, 0);
505 if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
506 btrfs_wait_for_commit(root, 0);
507
508 return trans;
509}
510
449/* wait for a transaction commit to be fully complete */ 511/* wait for a transaction commit to be fully complete */
450static noinline void wait_for_commit(struct btrfs_root *root, 512static noinline void wait_for_commit(struct btrfs_root *root,
451 struct btrfs_transaction *commit) 513 struct btrfs_transaction *commit)
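A minimal caller-side sketch of the two attach helpers added above, assuming the usual btrfs-internal declarations (ctree.h, transaction.h); the wrapper name example_sync_fs() is invented for illustration and is not part of this patch.

static int example_sync_fs(struct btrfs_root *root, int wait)
{
	struct btrfs_trans_handle *trans;

	/* the barrier variant additionally waits out a transaction that is
	 * already committing before it reports -ENOENT */
	trans = wait ? btrfs_attach_transaction_barrier(root)
		     : btrfs_attach_transaction(root);
	if (IS_ERR(trans)) {
		/* -ENOENT only means there was no running transaction */
		if (PTR_ERR(trans) == -ENOENT)
			return 0;
		return PTR_ERR(trans);
	}
	return btrfs_commit_transaction(trans, root);
}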
@@ -577,7 +639,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
577 if (!list_empty(&trans->new_bgs)) 639 if (!list_empty(&trans->new_bgs))
578 btrfs_create_pending_block_groups(trans, root); 640 btrfs_create_pending_block_groups(trans, root);
579 641
580 while (count < 2) { 642 while (count < 1) {
581 unsigned long cur = trans->delayed_ref_updates; 643 unsigned long cur = trans->delayed_ref_updates;
582 trans->delayed_ref_updates = 0; 644 trans->delayed_ref_updates = 0;
583 if (cur && 645 if (cur &&
@@ -589,6 +651,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
589 } 651 }
590 count++; 652 count++;
591 } 653 }
654
592 btrfs_trans_release_metadata(trans, root); 655 btrfs_trans_release_metadata(trans, root);
593 trans->block_rsv = NULL; 656 trans->block_rsv = NULL;
594 657
@@ -634,12 +697,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
634 btrfs_run_delayed_iputs(root); 697 btrfs_run_delayed_iputs(root);
635 698
636 if (trans->aborted || 699 if (trans->aborted ||
637 root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 700 test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
638 err = -EIO; 701 err = -EIO;
639 }
640 assert_qgroups_uptodate(trans); 702 assert_qgroups_uptodate(trans);
641 703
642 memset(trans, 0, sizeof(*trans));
643 kmem_cache_free(btrfs_trans_handle_cachep, trans); 704 kmem_cache_free(btrfs_trans_handle_cachep, trans);
644 return err; 705 return err;
645} 706}
@@ -686,7 +747,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
686 struct extent_state *cached_state = NULL; 747 struct extent_state *cached_state = NULL;
687 u64 start = 0; 748 u64 start = 0;
688 u64 end; 749 u64 end;
750 struct blk_plug plug;
689 751
752 blk_start_plug(&plug);
690 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 753 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
691 mark, &cached_state)) { 754 mark, &cached_state)) {
692 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 755 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -700,6 +763,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
700 } 763 }
701 if (err) 764 if (err)
702 werr = err; 765 werr = err;
766 blk_finish_plug(&plug);
703 return werr; 767 return werr;
704} 768}
705 769
@@ -950,10 +1014,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
950} 1014}
951 1015
952/* 1016/*
953 * defrag a given btree. If cacheonly == 1, this won't read from the disk, 1017 * defrag a given btree.
954 * otherwise every leaf in the btree is read and defragged. 1018 * Every leaf in the btree is read and defragged.
955 */ 1019 */
956int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 1020int btrfs_defrag_root(struct btrfs_root *root)
957{ 1021{
958 struct btrfs_fs_info *info = root->fs_info; 1022 struct btrfs_fs_info *info = root->fs_info;
959 struct btrfs_trans_handle *trans; 1023 struct btrfs_trans_handle *trans;
@@ -967,7 +1031,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
967 if (IS_ERR(trans)) 1031 if (IS_ERR(trans))
968 return PTR_ERR(trans); 1032 return PTR_ERR(trans);
969 1033
970 ret = btrfs_defrag_leaves(trans, root, cacheonly); 1034 ret = btrfs_defrag_leaves(trans, root);
971 1035
972 btrfs_end_transaction(trans, root); 1036 btrfs_end_transaction(trans, root);
973 btrfs_btree_balance_dirty(info->tree_root); 1037 btrfs_btree_balance_dirty(info->tree_root);
@@ -975,6 +1039,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
975 1039
976 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) 1040 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
977 break; 1041 break;
1042
1043 if (btrfs_defrag_cancelled(root->fs_info)) {
1044 printk(KERN_DEBUG "btrfs: defrag_root cancelled\n");
1045 ret = -EAGAIN;
1046 break;
1047 }
978 } 1048 }
979 root->defrag_running = 0; 1049 root->defrag_running = 0;
980 return ret; 1050 return ret;
@@ -982,7 +1052,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
982 1052
983/* 1053/*
984 * new snapshots need to be created at a very specific time in the 1054 * new snapshots need to be created at a very specific time in the
985 * transaction commit. This does the actual creation 1055 * transaction commit. This does the actual creation.
1056 *
1057 * Note:
 1058 * If an error that may affect the commit of the current transaction
 1059 * happens, we should return the error number. If the error only affects
 1060 * the creation of the pending snapshots, just return 0.
986 */ 1061 */
987static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, 1062static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
988 struct btrfs_fs_info *fs_info, 1063 struct btrfs_fs_info *fs_info,
@@ -997,12 +1072,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
997 struct inode *parent_inode; 1072 struct inode *parent_inode;
998 struct btrfs_path *path; 1073 struct btrfs_path *path;
999 struct btrfs_dir_item *dir_item; 1074 struct btrfs_dir_item *dir_item;
1000 struct dentry *parent;
1001 struct dentry *dentry; 1075 struct dentry *dentry;
1002 struct extent_buffer *tmp; 1076 struct extent_buffer *tmp;
1003 struct extent_buffer *old; 1077 struct extent_buffer *old;
1004 struct timespec cur_time = CURRENT_TIME; 1078 struct timespec cur_time = CURRENT_TIME;
1005 int ret; 1079 int ret = 0;
1006 u64 to_reserve = 0; 1080 u64 to_reserve = 0;
1007 u64 index = 0; 1081 u64 index = 0;
1008 u64 objectid; 1082 u64 objectid;
@@ -1011,40 +1085,36 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1011 1085
1012 path = btrfs_alloc_path(); 1086 path = btrfs_alloc_path();
1013 if (!path) { 1087 if (!path) {
1014 ret = pending->error = -ENOMEM; 1088 pending->error = -ENOMEM;
1015 goto path_alloc_fail; 1089 return 0;
1016 } 1090 }
1017 1091
1018 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 1092 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
1019 if (!new_root_item) { 1093 if (!new_root_item) {
1020 ret = pending->error = -ENOMEM; 1094 pending->error = -ENOMEM;
1021 goto root_item_alloc_fail; 1095 goto root_item_alloc_fail;
1022 } 1096 }
1023 1097
1024 ret = btrfs_find_free_objectid(tree_root, &objectid); 1098 pending->error = btrfs_find_free_objectid(tree_root, &objectid);
1025 if (ret) { 1099 if (pending->error)
1026 pending->error = ret;
1027 goto no_free_objectid; 1100 goto no_free_objectid;
1028 }
1029 1101
1030 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1102 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
1031 1103
1032 if (to_reserve > 0) { 1104 if (to_reserve > 0) {
1033 ret = btrfs_block_rsv_add(root, &pending->block_rsv, 1105 pending->error = btrfs_block_rsv_add(root,
1034 to_reserve, 1106 &pending->block_rsv,
1035 BTRFS_RESERVE_NO_FLUSH); 1107 to_reserve,
1036 if (ret) { 1108 BTRFS_RESERVE_NO_FLUSH);
1037 pending->error = ret; 1109 if (pending->error)
1038 goto no_free_objectid; 1110 goto no_free_objectid;
1039 }
1040 } 1111 }
1041 1112
1042 ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid, 1113 pending->error = btrfs_qgroup_inherit(trans, fs_info,
1043 objectid, pending->inherit); 1114 root->root_key.objectid,
1044 if (ret) { 1115 objectid, pending->inherit);
1045 pending->error = ret; 1116 if (pending->error)
1046 goto no_free_objectid; 1117 goto no_free_objectid;
1047 }
1048 1118
1049 key.objectid = objectid; 1119 key.objectid = objectid;
1050 key.offset = (u64)-1; 1120 key.offset = (u64)-1;
@@ -1052,10 +1122,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1052 1122
1053 rsv = trans->block_rsv; 1123 rsv = trans->block_rsv;
1054 trans->block_rsv = &pending->block_rsv; 1124 trans->block_rsv = &pending->block_rsv;
1125 trans->bytes_reserved = trans->block_rsv->reserved;
1055 1126
1056 dentry = pending->dentry; 1127 dentry = pending->dentry;
1057 parent = dget_parent(dentry); 1128 parent_inode = pending->dir;
1058 parent_inode = parent->d_inode;
1059 parent_root = BTRFS_I(parent_inode)->root; 1129 parent_root = BTRFS_I(parent_inode)->root;
1060 record_root_in_trans(trans, parent_root); 1130 record_root_in_trans(trans, parent_root);
1061 1131
@@ -1072,7 +1142,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1072 dentry->d_name.len, 0); 1142 dentry->d_name.len, 0);
1073 if (dir_item != NULL && !IS_ERR(dir_item)) { 1143 if (dir_item != NULL && !IS_ERR(dir_item)) {
1074 pending->error = -EEXIST; 1144 pending->error = -EEXIST;
1075 goto fail; 1145 goto dir_item_existed;
1076 } else if (IS_ERR(dir_item)) { 1146 } else if (IS_ERR(dir_item)) {
1077 ret = PTR_ERR(dir_item); 1147 ret = PTR_ERR(dir_item);
1078 btrfs_abort_transaction(trans, root, ret); 1148 btrfs_abort_transaction(trans, root, ret);
@@ -1203,14 +1273,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1203 if (ret) 1273 if (ret)
1204 btrfs_abort_transaction(trans, root, ret); 1274 btrfs_abort_transaction(trans, root, ret);
1205fail: 1275fail:
1206 dput(parent); 1276 pending->error = ret;
1277dir_item_existed:
1207 trans->block_rsv = rsv; 1278 trans->block_rsv = rsv;
1279 trans->bytes_reserved = 0;
1208no_free_objectid: 1280no_free_objectid:
1209 kfree(new_root_item); 1281 kfree(new_root_item);
1210root_item_alloc_fail: 1282root_item_alloc_fail:
1211 btrfs_free_path(path); 1283 btrfs_free_path(path);
1212path_alloc_fail:
1213 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
1214 return ret; 1284 return ret;
1215} 1285}
1216 1286
@@ -1220,12 +1290,17 @@ path_alloc_fail:
1220static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, 1290static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
1221 struct btrfs_fs_info *fs_info) 1291 struct btrfs_fs_info *fs_info)
1222{ 1292{
1223 struct btrfs_pending_snapshot *pending; 1293 struct btrfs_pending_snapshot *pending, *next;
1224 struct list_head *head = &trans->transaction->pending_snapshots; 1294 struct list_head *head = &trans->transaction->pending_snapshots;
1295 int ret = 0;
1225 1296
1226 list_for_each_entry(pending, head, list) 1297 list_for_each_entry_safe(pending, next, head, list) {
1227 create_pending_snapshot(trans, fs_info, pending); 1298 list_del(&pending->list);
1228 return 0; 1299 ret = create_pending_snapshot(trans, fs_info, pending);
1300 if (ret)
1301 break;
1302 }
1303 return ret;
1229} 1304}
1230 1305
1231static void update_super_roots(struct btrfs_root *root) 1306static void update_super_roots(struct btrfs_root *root)
@@ -1296,13 +1371,13 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1296struct btrfs_async_commit { 1371struct btrfs_async_commit {
1297 struct btrfs_trans_handle *newtrans; 1372 struct btrfs_trans_handle *newtrans;
1298 struct btrfs_root *root; 1373 struct btrfs_root *root;
1299 struct delayed_work work; 1374 struct work_struct work;
1300}; 1375};
1301 1376
1302static void do_async_commit(struct work_struct *work) 1377static void do_async_commit(struct work_struct *work)
1303{ 1378{
1304 struct btrfs_async_commit *ac = 1379 struct btrfs_async_commit *ac =
1305 container_of(work, struct btrfs_async_commit, work.work); 1380 container_of(work, struct btrfs_async_commit, work);
1306 1381
1307 /* 1382 /*
1308 * We've got freeze protection passed with the transaction. 1383 * We've got freeze protection passed with the transaction.
@@ -1330,7 +1405,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1330 if (!ac) 1405 if (!ac)
1331 return -ENOMEM; 1406 return -ENOMEM;
1332 1407
1333 INIT_DELAYED_WORK(&ac->work, do_async_commit); 1408 INIT_WORK(&ac->work, do_async_commit);
1334 ac->root = root; 1409 ac->root = root;
1335 ac->newtrans = btrfs_join_transaction(root); 1410 ac->newtrans = btrfs_join_transaction(root);
1336 if (IS_ERR(ac->newtrans)) { 1411 if (IS_ERR(ac->newtrans)) {
@@ -1354,7 +1429,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1354 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1429 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1355 1, _THIS_IP_); 1430 1, _THIS_IP_);
1356 1431
1357 schedule_delayed_work(&ac->work, 0); 1432 schedule_work(&ac->work);
1358 1433
1359 /* wait for transaction to start and unblock */ 1434 /* wait for transaction to start and unblock */
1360 if (wait_for_unblock) 1435 if (wait_for_unblock)
@@ -1374,16 +1449,29 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1374 struct btrfs_root *root, int err) 1449 struct btrfs_root *root, int err)
1375{ 1450{
1376 struct btrfs_transaction *cur_trans = trans->transaction; 1451 struct btrfs_transaction *cur_trans = trans->transaction;
1452 DEFINE_WAIT(wait);
1377 1453
1378 WARN_ON(trans->use_count > 1); 1454 WARN_ON(trans->use_count > 1);
1379 1455
1380 btrfs_abort_transaction(trans, root, err); 1456 btrfs_abort_transaction(trans, root, err);
1381 1457
1382 spin_lock(&root->fs_info->trans_lock); 1458 spin_lock(&root->fs_info->trans_lock);
1459
1460 if (list_empty(&cur_trans->list)) {
1461 spin_unlock(&root->fs_info->trans_lock);
1462 btrfs_end_transaction(trans, root);
1463 return;
1464 }
1465
1383 list_del_init(&cur_trans->list); 1466 list_del_init(&cur_trans->list);
1384 if (cur_trans == root->fs_info->running_transaction) { 1467 if (cur_trans == root->fs_info->running_transaction) {
1468 root->fs_info->trans_no_join = 1;
1469 spin_unlock(&root->fs_info->trans_lock);
1470 wait_event(cur_trans->writer_wait,
1471 atomic_read(&cur_trans->num_writers) == 1);
1472
1473 spin_lock(&root->fs_info->trans_lock);
1385 root->fs_info->running_transaction = NULL; 1474 root->fs_info->running_transaction = NULL;
1386 root->fs_info->trans_no_join = 0;
1387 } 1475 }
1388 spin_unlock(&root->fs_info->trans_lock); 1476 spin_unlock(&root->fs_info->trans_lock);
1389 1477
@@ -1417,7 +1505,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1417 } 1505 }
1418 1506
1419 if (flush_on_commit || snap_pending) { 1507 if (flush_on_commit || snap_pending) {
1420 btrfs_start_delalloc_inodes(root, 1); 1508 ret = btrfs_start_delalloc_inodes(root, 1);
1509 if (ret)
1510 return ret;
1421 btrfs_wait_ordered_extents(root, 1); 1511 btrfs_wait_ordered_extents(root, 1);
1422 } 1512 }
1423 1513
@@ -1439,9 +1529,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
 1439 * it here and know for sure that nothing new will be added 1529 * it here and know for sure that nothing new will be added
1440 * to the list 1530 * to the list
1441 */ 1531 */
1442 btrfs_run_ordered_operations(root, 1); 1532 ret = btrfs_run_ordered_operations(trans, root, 1);
1443 1533
1444 return 0; 1534 return ret;
1445} 1535}
1446 1536
1447/* 1537/*
@@ -1462,26 +1552,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1462 int should_grow = 0; 1552 int should_grow = 0;
1463 unsigned long now = get_seconds(); 1553 unsigned long now = get_seconds();
1464 1554
1465 ret = btrfs_run_ordered_operations(root, 0); 1555 ret = btrfs_run_ordered_operations(trans, root, 0);
1466 if (ret) { 1556 if (ret) {
1467 btrfs_abort_transaction(trans, root, ret); 1557 btrfs_abort_transaction(trans, root, ret);
1468 goto cleanup_transaction; 1558 btrfs_end_transaction(trans, root);
1559 return ret;
1469 } 1560 }
1470 1561
1471 if (cur_trans->aborted) { 1562 /* Stop the commit early if ->aborted is set */
1563 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1472 ret = cur_trans->aborted; 1564 ret = cur_trans->aborted;
1473 goto cleanup_transaction; 1565 btrfs_end_transaction(trans, root);
1566 return ret;
1474 } 1567 }
1475 1568
1476 /* make a pass through all the delayed refs we have so far 1569 /* make a pass through all the delayed refs we have so far
 1477 * any running procs may add more while we are here 1570 * any running procs may add more while we are here
1478 */ 1571 */
1479 ret = btrfs_run_delayed_refs(trans, root, 0); 1572 ret = btrfs_run_delayed_refs(trans, root, 0);
1480 if (ret) 1573 if (ret) {
1481 goto cleanup_transaction; 1574 btrfs_end_transaction(trans, root);
1575 return ret;
1576 }
1482 1577
1483 btrfs_trans_release_metadata(trans, root); 1578 btrfs_trans_release_metadata(trans, root);
1484 trans->block_rsv = NULL; 1579 trans->block_rsv = NULL;
1580 if (trans->qgroup_reserved) {
1581 btrfs_qgroup_free(root, trans->qgroup_reserved);
1582 trans->qgroup_reserved = 0;
1583 }
1485 1584
1486 cur_trans = trans->transaction; 1585 cur_trans = trans->transaction;
1487 1586
@@ -1495,8 +1594,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1495 btrfs_create_pending_block_groups(trans, root); 1594 btrfs_create_pending_block_groups(trans, root);
1496 1595
1497 ret = btrfs_run_delayed_refs(trans, root, 0); 1596 ret = btrfs_run_delayed_refs(trans, root, 0);
1498 if (ret) 1597 if (ret) {
1499 goto cleanup_transaction; 1598 btrfs_end_transaction(trans, root);
1599 return ret;
1600 }
1500 1601
1501 spin_lock(&cur_trans->commit_lock); 1602 spin_lock(&cur_trans->commit_lock);
1502 if (cur_trans->in_commit) { 1603 if (cur_trans->in_commit) {
@@ -1574,6 +1675,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1574 wait_event(cur_trans->writer_wait, 1675 wait_event(cur_trans->writer_wait,
1575 atomic_read(&cur_trans->num_writers) == 1); 1676 atomic_read(&cur_trans->num_writers) == 1);
1576 1677
1678 /* ->aborted might be set after the previous check, so check it */
1679 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1680 ret = cur_trans->aborted;
1681 goto cleanup_transaction;
1682 }
1577 /* 1683 /*
1578 * the reloc mutex makes sure that we stop 1684 * the reloc mutex makes sure that we stop
1579 * the balancing code from coming in and moving 1685 * the balancing code from coming in and moving
@@ -1657,6 +1763,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1657 goto cleanup_transaction; 1763 goto cleanup_transaction;
1658 } 1764 }
1659 1765
1766 /*
1767 * The tasks which save the space cache and inode cache may also
1768 * update ->aborted, check it.
1769 */
1770 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1771 ret = cur_trans->aborted;
1772 mutex_unlock(&root->fs_info->tree_log_mutex);
1773 mutex_unlock(&root->fs_info->reloc_mutex);
1774 goto cleanup_transaction;
1775 }
1776
1660 btrfs_prepare_extent_commit(trans, root); 1777 btrfs_prepare_extent_commit(trans, root);
1661 1778
1662 cur_trans = root->fs_info->running_transaction; 1779 cur_trans = root->fs_info->running_transaction;
@@ -1744,6 +1861,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1744cleanup_transaction: 1861cleanup_transaction:
1745 btrfs_trans_release_metadata(trans, root); 1862 btrfs_trans_release_metadata(trans, root);
1746 trans->block_rsv = NULL; 1863 trans->block_rsv = NULL;
1864 if (trans->qgroup_reserved) {
1865 btrfs_qgroup_free(root, trans->qgroup_reserved);
1866 trans->qgroup_reserved = 0;
1867 }
1747 btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); 1868 btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
1748// WARN_ON(1); 1869// WARN_ON(1);
1749 if (current->journal_info == trans) 1870 if (current->journal_info == trans)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0e8aa1e6c287..3c8e0d25c8e4 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -43,6 +43,7 @@ struct btrfs_transaction {
43 wait_queue_head_t writer_wait; 43 wait_queue_head_t writer_wait;
44 wait_queue_head_t commit_wait; 44 wait_queue_head_t commit_wait;
45 struct list_head pending_snapshots; 45 struct list_head pending_snapshots;
46 struct list_head ordered_operations;
46 struct btrfs_delayed_ref_root delayed_refs; 47 struct btrfs_delayed_ref_root delayed_refs;
47 int aborted; 48 int aborted;
48}; 49};
@@ -68,6 +69,7 @@ struct btrfs_trans_handle {
68 struct btrfs_block_rsv *orig_rsv; 69 struct btrfs_block_rsv *orig_rsv;
69 short aborted; 70 short aborted;
70 short adding_csums; 71 short adding_csums;
72 bool allocating_chunk;
71 enum btrfs_trans_type type; 73 enum btrfs_trans_type type;
72 /* 74 /*
73 * this root is only needed to validate that the root passed to 75 * this root is only needed to validate that the root passed to
@@ -82,11 +84,13 @@ struct btrfs_trans_handle {
82 84
83struct btrfs_pending_snapshot { 85struct btrfs_pending_snapshot {
84 struct dentry *dentry; 86 struct dentry *dentry;
87 struct inode *dir;
85 struct btrfs_root *root; 88 struct btrfs_root *root;
86 struct btrfs_root *snap; 89 struct btrfs_root *snap;
87 struct btrfs_qgroup_inherit *inherit; 90 struct btrfs_qgroup_inherit *inherit;
88 /* block reservation for the operation */ 91 /* block reservation for the operation */
89 struct btrfs_block_rsv block_rsv; 92 struct btrfs_block_rsv block_rsv;
93 u64 qgroup_reserved;
 90 /* extra metadata reservation for relocation */ 94 /* extra metadata reservation for relocation */
91 int error; 95 int error;
92 bool readonly; 96 bool readonly;
@@ -110,13 +114,15 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush(
110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 114struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 115struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
112struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); 116struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
117struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
118 struct btrfs_root *root);
113struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); 119struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
114int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); 120int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
115int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 121int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
116 struct btrfs_root *root); 122 struct btrfs_root *root);
117 123
118int btrfs_add_dead_root(struct btrfs_root *root); 124int btrfs_add_dead_root(struct btrfs_root *root);
119int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); 125int btrfs_defrag_root(struct btrfs_root *root);
120int btrfs_clean_old_snapshots(struct btrfs_root *root); 126int btrfs_clean_old_snapshots(struct btrfs_root *root);
121int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 127int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
122 struct btrfs_root *root); 128 struct btrfs_root *root);
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3b580ee8ab1d..94e05c1f118a 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -23,13 +23,14 @@
23#include "transaction.h" 23#include "transaction.h"
24#include "locking.h" 24#include "locking.h"
25 25
26/* defrag all the leaves in a given btree. If cache_only == 1, don't read 26/*
27 * things from disk, otherwise read all the leaves and try to get key order to 27 * Defrag all the leaves in a given btree.
28 * Read all the leaves and try to get key order to
28 * better reflect disk order 29 * better reflect disk order
29 */ 30 */
30 31
31int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, 32int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, int cache_only) 33 struct btrfs_root *root)
33{ 34{
34 struct btrfs_path *path = NULL; 35 struct btrfs_path *path = NULL;
35 struct btrfs_key key; 36 struct btrfs_key key;
@@ -41,9 +42,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
41 u64 last_ret = 0; 42 u64 last_ret = 0;
42 u64 min_trans = 0; 43 u64 min_trans = 0;
43 44
44 if (cache_only)
45 goto out;
46
47 if (root->fs_info->extent_root == root) { 45 if (root->fs_info->extent_root == root) {
48 /* 46 /*
49 * there's recursion here right now in the tree locking, 47 * there's recursion here right now in the tree locking,
@@ -86,11 +84,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
86 } 84 }
87 85
88 path->keep_locks = 1; 86 path->keep_locks = 1;
89 if (cache_only)
90 min_trans = root->defrag_trans_start;
91 87
92 ret = btrfs_search_forward(root, &key, NULL, path, 88 ret = btrfs_search_forward(root, &key, NULL, path, min_trans);
93 cache_only, min_trans);
94 if (ret < 0) 89 if (ret < 0)
95 goto out; 90 goto out;
96 if (ret > 0) { 91 if (ret > 0) {
@@ -109,11 +104,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
109 goto out; 104 goto out;
110 } 105 }
111 path->slots[1] = btrfs_header_nritems(path->nodes[1]); 106 path->slots[1] = btrfs_header_nritems(path->nodes[1]);
112 next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, 107 next_key_ret = btrfs_find_next_key(root, path, &key, 1,
113 min_trans); 108 min_trans);
114 ret = btrfs_realloc_node(trans, root, 109 ret = btrfs_realloc_node(trans, root,
115 path->nodes[1], 0, 110 path->nodes[1], 0,
116 cache_only, &last_ret, 111 &last_ret,
117 &root->defrag_progress); 112 &root->defrag_progress);
118 if (ret) { 113 if (ret) {
119 WARN_ON(ret == -EAGAIN); 114 WARN_ON(ret == -EAGAIN);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 83186c7e45d4..451fad96ecd1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -278,8 +278,7 @@ static int process_one_buffer(struct btrfs_root *log,
278 struct walk_control *wc, u64 gen) 278 struct walk_control *wc, u64 gen)
279{ 279{
280 if (wc->pin) 280 if (wc->pin)
281 btrfs_pin_extent_for_log_replay(wc->trans, 281 btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
282 log->fs_info->extent_root,
283 eb->start, eb->len); 282 eb->start, eb->len);
284 283
285 if (btrfs_buffer_uptodate(eb, gen, 0)) { 284 if (btrfs_buffer_uptodate(eb, gen, 0)) {
@@ -485,7 +484,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
485 struct btrfs_key *key) 484 struct btrfs_key *key)
486{ 485{
487 int found_type; 486 int found_type;
488 u64 mask = root->sectorsize - 1;
489 u64 extent_end; 487 u64 extent_end;
490 u64 start = key->offset; 488 u64 start = key->offset;
491 u64 saved_nbytes; 489 u64 saved_nbytes;
@@ -502,7 +500,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
502 extent_end = start + btrfs_file_extent_num_bytes(eb, item); 500 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
503 else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 501 else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
504 size = btrfs_file_extent_inline_len(eb, item); 502 size = btrfs_file_extent_inline_len(eb, item);
505 extent_end = (start + size + mask) & ~mask; 503 extent_end = ALIGN(start + size, root->sectorsize);
506 } else { 504 } else {
507 ret = 0; 505 ret = 0;
508 goto out; 506 goto out;
@@ -1384,7 +1382,10 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1384 1382
1385 btrfs_release_path(path); 1383 btrfs_release_path(path);
1386 if (ret == 0) { 1384 if (ret == 0) {
1387 btrfs_inc_nlink(inode); 1385 if (!inode->i_nlink)
1386 set_nlink(inode, 1);
1387 else
1388 btrfs_inc_nlink(inode);
1388 ret = btrfs_update_inode(trans, root, inode); 1389 ret = btrfs_update_inode(trans, root, inode);
1389 } else if (ret == -EEXIST) { 1390 } else if (ret == -EEXIST) {
1390 ret = 0; 1391 ret = 0;
@@ -2281,6 +2282,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2281 unsigned long log_transid = 0; 2282 unsigned long log_transid = 0;
2282 2283
2283 mutex_lock(&root->log_mutex); 2284 mutex_lock(&root->log_mutex);
2285 log_transid = root->log_transid;
2284 index1 = root->log_transid % 2; 2286 index1 = root->log_transid % 2;
2285 if (atomic_read(&root->log_commit[index1])) { 2287 if (atomic_read(&root->log_commit[index1])) {
2286 wait_log_commit(trans, root, root->log_transid); 2288 wait_log_commit(trans, root, root->log_transid);
@@ -2308,11 +2310,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2308 /* bail out if we need to do a full commit */ 2310 /* bail out if we need to do a full commit */
2309 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2311 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2310 ret = -EAGAIN; 2312 ret = -EAGAIN;
2313 btrfs_free_logged_extents(log, log_transid);
2311 mutex_unlock(&root->log_mutex); 2314 mutex_unlock(&root->log_mutex);
2312 goto out; 2315 goto out;
2313 } 2316 }
2314 2317
2315 log_transid = root->log_transid;
2316 if (log_transid % 2 == 0) 2318 if (log_transid % 2 == 0)
2317 mark = EXTENT_DIRTY; 2319 mark = EXTENT_DIRTY;
2318 else 2320 else
@@ -2324,6 +2326,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2324 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2326 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2325 if (ret) { 2327 if (ret) {
2326 btrfs_abort_transaction(trans, root, ret); 2328 btrfs_abort_transaction(trans, root, ret);
2329 btrfs_free_logged_extents(log, log_transid);
2327 mutex_unlock(&root->log_mutex); 2330 mutex_unlock(&root->log_mutex);
2328 goto out; 2331 goto out;
2329 } 2332 }
@@ -2363,6 +2366,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2363 } 2366 }
2364 root->fs_info->last_trans_log_full_commit = trans->transid; 2367 root->fs_info->last_trans_log_full_commit = trans->transid;
2365 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2368 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2369 btrfs_free_logged_extents(log, log_transid);
2366 mutex_unlock(&log_root_tree->log_mutex); 2370 mutex_unlock(&log_root_tree->log_mutex);
2367 ret = -EAGAIN; 2371 ret = -EAGAIN;
2368 goto out; 2372 goto out;
@@ -2373,6 +2377,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2373 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2377 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2374 wait_log_commit(trans, log_root_tree, 2378 wait_log_commit(trans, log_root_tree,
2375 log_root_tree->log_transid); 2379 log_root_tree->log_transid);
2380 btrfs_free_logged_extents(log, log_transid);
2376 mutex_unlock(&log_root_tree->log_mutex); 2381 mutex_unlock(&log_root_tree->log_mutex);
2377 ret = 0; 2382 ret = 0;
2378 goto out; 2383 goto out;
@@ -2392,6 +2397,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2392 */ 2397 */
2393 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2398 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2394 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2399 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2400 btrfs_free_logged_extents(log, log_transid);
2395 mutex_unlock(&log_root_tree->log_mutex); 2401 mutex_unlock(&log_root_tree->log_mutex);
2396 ret = -EAGAIN; 2402 ret = -EAGAIN;
2397 goto out_wake_log_root; 2403 goto out_wake_log_root;
@@ -2402,10 +2408,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2402 EXTENT_DIRTY | EXTENT_NEW); 2408 EXTENT_DIRTY | EXTENT_NEW);
2403 if (ret) { 2409 if (ret) {
2404 btrfs_abort_transaction(trans, root, ret); 2410 btrfs_abort_transaction(trans, root, ret);
2411 btrfs_free_logged_extents(log, log_transid);
2405 mutex_unlock(&log_root_tree->log_mutex); 2412 mutex_unlock(&log_root_tree->log_mutex);
2406 goto out_wake_log_root; 2413 goto out_wake_log_root;
2407 } 2414 }
2408 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2415 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2416 btrfs_wait_logged_extents(log, log_transid);
2409 2417
2410 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2418 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2411 log_root_tree->node->start); 2419 log_root_tree->node->start);
@@ -2461,8 +2469,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
2461 .process_func = process_one_buffer 2469 .process_func = process_one_buffer
2462 }; 2470 };
2463 2471
2464 ret = walk_log_tree(trans, log, &wc); 2472 if (trans) {
2465 BUG_ON(ret); 2473 ret = walk_log_tree(trans, log, &wc);
2474 BUG_ON(ret);
2475 }
2466 2476
2467 while (1) { 2477 while (1) {
2468 ret = find_first_extent_bit(&log->dirty_log_pages, 2478 ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -2475,6 +2485,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
2475 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2485 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2476 } 2486 }
2477 2487
2488 /*
2489 * We may have short-circuited the log tree with the full commit logic
2490 * and left ordered extents on our list, so clear these out to keep us
2491 * from leaking inodes and memory.
2492 */
2493 btrfs_free_logged_extents(log, 0);
2494 btrfs_free_logged_extents(log, 1);
2495
2478 free_extent_buffer(log->node); 2496 free_extent_buffer(log->node);
2479 kfree(log); 2497 kfree(log);
2480} 2498}
@@ -2724,7 +2742,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2724 path->keep_locks = 1; 2742 path->keep_locks = 1;
2725 2743
2726 ret = btrfs_search_forward(root, &min_key, &max_key, 2744 ret = btrfs_search_forward(root, &min_key, &max_key,
2727 path, 0, trans->transid); 2745 path, trans->transid);
2728 2746
2729 /* 2747 /*
2730 * we didn't find anything from this transaction, see if there 2748 * we didn't find anything from this transaction, see if there
@@ -3271,16 +3289,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3271 struct btrfs_root *log = root->log_root; 3289 struct btrfs_root *log = root->log_root;
3272 struct btrfs_file_extent_item *fi; 3290 struct btrfs_file_extent_item *fi;
3273 struct extent_buffer *leaf; 3291 struct extent_buffer *leaf;
3292 struct btrfs_ordered_extent *ordered;
3274 struct list_head ordered_sums; 3293 struct list_head ordered_sums;
3275 struct btrfs_map_token token; 3294 struct btrfs_map_token token;
3276 struct btrfs_key key; 3295 struct btrfs_key key;
3277 u64 csum_offset = em->mod_start - em->start; 3296 u64 mod_start = em->mod_start;
3278 u64 csum_len = em->mod_len; 3297 u64 mod_len = em->mod_len;
3298 u64 csum_offset;
3299 u64 csum_len;
3279 u64 extent_offset = em->start - em->orig_start; 3300 u64 extent_offset = em->start - em->orig_start;
3280 u64 block_len; 3301 u64 block_len;
3281 int ret; 3302 int ret;
3303 int index = log->log_transid % 2;
3282 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3304 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3283 3305
3306insert:
3284 INIT_LIST_HEAD(&ordered_sums); 3307 INIT_LIST_HEAD(&ordered_sums);
3285 btrfs_init_map_token(&token); 3308 btrfs_init_map_token(&token);
3286 key.objectid = btrfs_ino(inode); 3309 key.objectid = btrfs_ino(inode);
@@ -3296,6 +3319,23 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3296 leaf = path->nodes[0]; 3319 leaf = path->nodes[0];
3297 fi = btrfs_item_ptr(leaf, path->slots[0], 3320 fi = btrfs_item_ptr(leaf, path->slots[0],
3298 struct btrfs_file_extent_item); 3321 struct btrfs_file_extent_item);
3322
3323 /*
3324 * If we are overwriting an inline extent with a real one then we need
3325 * to just delete the inline extent as it may not be large enough to
3326 * have the entire file_extent_item.
3327 */
3328 if (ret && btrfs_token_file_extent_type(leaf, fi, &token) ==
3329 BTRFS_FILE_EXTENT_INLINE) {
3330 ret = btrfs_del_item(trans, log, path);
3331 btrfs_release_path(path);
3332 if (ret) {
3333 path->really_keep_locks = 0;
3334 return ret;
3335 }
3336 goto insert;
3337 }
3338
3299 btrfs_set_token_file_extent_generation(leaf, fi, em->generation, 3339 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3300 &token); 3340 &token);
3301 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3341 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
@@ -3357,6 +3397,97 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3357 if (skip_csum) 3397 if (skip_csum)
3358 return 0; 3398 return 0;
3359 3399
3400 if (em->compress_type) {
3401 csum_offset = 0;
3402 csum_len = block_len;
3403 }
3404
3405 /*
3406 * First check and see if our csums are on our outstanding ordered
3407 * extents.
3408 */
3409again:
3410 spin_lock_irq(&log->log_extents_lock[index]);
3411 list_for_each_entry(ordered, &log->logged_list[index], log_list) {
3412 struct btrfs_ordered_sum *sum;
3413
3414 if (!mod_len)
3415 break;
3416
3417 if (ordered->inode != inode)
3418 continue;
3419
3420 if (ordered->file_offset + ordered->len <= mod_start ||
3421 mod_start + mod_len <= ordered->file_offset)
3422 continue;
3423
3424 /*
3425 * We are going to copy all the csums on this ordered extent, so
3426 * go ahead and adjust mod_start and mod_len in case this
3427 * ordered extent has already been logged.
3428 */
3429 if (ordered->file_offset > mod_start) {
3430 if (ordered->file_offset + ordered->len >=
3431 mod_start + mod_len)
3432 mod_len = ordered->file_offset - mod_start;
3433 /*
3434 * If we have this case
3435 *
3436 * |--------- logged extent ---------|
3437 * |----- ordered extent ----|
3438 *
3439 * Just don't mess with mod_start and mod_len, we'll
3440 * just end up logging more csums than we need and it
3441 * will be ok.
3442 */
3443 } else {
3444 if (ordered->file_offset + ordered->len <
3445 mod_start + mod_len) {
3446 mod_len = (mod_start + mod_len) -
3447 (ordered->file_offset + ordered->len);
3448 mod_start = ordered->file_offset +
3449 ordered->len;
3450 } else {
3451 mod_len = 0;
3452 }
3453 }
3454
3455 /*
3456 * To keep us from looping for the above case of an ordered
3457 * extent that falls inside of the logged extent.
3458 */
3459 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
3460 &ordered->flags))
3461 continue;
3462 atomic_inc(&ordered->refs);
3463 spin_unlock_irq(&log->log_extents_lock[index]);
3464 /*
3465 * we've dropped the lock, we must either break or
3466 * start over after this.
3467 */
3468
3469 wait_event(ordered->wait, ordered->csum_bytes_left == 0);
3470
3471 list_for_each_entry(sum, &ordered->list, list) {
3472 ret = btrfs_csum_file_blocks(trans, log, sum);
3473 if (ret) {
3474 btrfs_put_ordered_extent(ordered);
3475 goto unlocked;
3476 }
3477 }
3478 btrfs_put_ordered_extent(ordered);
3479 goto again;
3480
3481 }
3482 spin_unlock_irq(&log->log_extents_lock[index]);
3483unlocked:
3484
3485 if (!mod_len || ret)
3486 return ret;
3487
3488 csum_offset = mod_start - em->start;
3489 csum_len = mod_len;
3490
3360 /* block start is already adjusted for the file extent offset. */ 3491 /* block start is already adjusted for the file extent offset. */
3361 ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 3492 ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3362 em->block_start + csum_offset, 3493 em->block_start + csum_offset,
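The mod_start/mod_len trimming above covers four overlap cases between the logged range and an ordered extent. A userspace restatement (illustrative only, not part of the patch) may make those cases easier to follow.

#include <stdint.h>
#include <stdio.h>

/* Trim [*start, *start + *len) against an ordered extent [o_start, o_start + o_len). */
static void clip_logged_range(uint64_t *start, uint64_t *len,
			      uint64_t o_start, uint64_t o_len)
{
	if (o_start + o_len <= *start || *start + *len <= o_start)
		return;			/* no overlap, nothing to do */

	if (o_start > *start) {
		/* ordered extent covers the tail of the logged range */
		if (o_start + o_len >= *start + *len)
			*len = o_start - *start;
		/* ordered extent strictly inside: leave the range alone,
		 * logging a few extra csums is harmless */
	} else {
		/* ordered extent covers the head (or all) of the range */
		if (o_start + o_len < *start + *len) {
			*len = (*start + *len) - (o_start + o_len);
			*start = o_start + o_len;
		} else {
			*len = 0;	/* fully covered */
		}
	}
}

int main(void)
{
	uint64_t start = 4096, len = 8192;

	clip_logged_range(&start, &len, 0, 8192);	/* head overlap */
	printf("start=%llu len=%llu\n",
	       (unsigned long long)start, (unsigned long long)len);
	return 0;
}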
@@ -3388,6 +3519,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3388 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3519 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3389 u64 test_gen; 3520 u64 test_gen;
3390 int ret = 0; 3521 int ret = 0;
3522 int num = 0;
3391 3523
3392 INIT_LIST_HEAD(&extents); 3524 INIT_LIST_HEAD(&extents);
3393 3525
@@ -3396,27 +3528,42 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3396 3528
3397 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 3529 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
3398 list_del_init(&em->list); 3530 list_del_init(&em->list);
3531
3532 /*
3533 * Just an arbitrary number, this can be really CPU intensive
3534 * once we start getting a lot of extents, and really once we
3535 * have a bunch of extents we just want to commit since it will
3536 * be faster.
3537 */
3538 if (++num > 32768) {
3539 list_del_init(&tree->modified_extents);
3540 ret = -EFBIG;
3541 goto process;
3542 }
3543
3399 if (em->generation <= test_gen) 3544 if (em->generation <= test_gen)
3400 continue; 3545 continue;
3401 /* Need a ref to keep it from getting evicted from cache */ 3546 /* Need a ref to keep it from getting evicted from cache */
3402 atomic_inc(&em->refs); 3547 atomic_inc(&em->refs);
3403 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 3548 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
3404 list_add_tail(&em->list, &extents); 3549 list_add_tail(&em->list, &extents);
3550 num++;
3405 } 3551 }
3406 3552
3407 list_sort(NULL, &extents, extent_cmp); 3553 list_sort(NULL, &extents, extent_cmp);
3408 3554
3555process:
3409 while (!list_empty(&extents)) { 3556 while (!list_empty(&extents)) {
3410 em = list_entry(extents.next, struct extent_map, list); 3557 em = list_entry(extents.next, struct extent_map, list);
3411 3558
3412 list_del_init(&em->list); 3559 list_del_init(&em->list);
3413 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
3414 3560
3415 /* 3561 /*
3416 * If we had an error we just need to delete everybody from our 3562 * If we had an error we just need to delete everybody from our
3417 * private list. 3563 * private list.
3418 */ 3564 */
3419 if (ret) { 3565 if (ret) {
3566 clear_em_logging(tree, em);
3420 free_extent_map(em); 3567 free_extent_map(em);
3421 continue; 3568 continue;
3422 } 3569 }
@@ -3424,8 +3571,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3424 write_unlock(&tree->lock); 3571 write_unlock(&tree->lock);
3425 3572
3426 ret = log_one_extent(trans, inode, root, em, path); 3573 ret = log_one_extent(trans, inode, root, em, path);
3427 free_extent_map(em);
3428 write_lock(&tree->lock); 3574 write_lock(&tree->lock);
3575 clear_em_logging(tree, em);
3576 free_extent_map(em);
3429 } 3577 }
3430 WARN_ON(!list_empty(&extents)); 3578 WARN_ON(!list_empty(&extents));
3431 write_unlock(&tree->lock); 3579 write_unlock(&tree->lock);
@@ -3507,6 +3655,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3507 3655
3508 mutex_lock(&BTRFS_I(inode)->log_mutex); 3656 mutex_lock(&BTRFS_I(inode)->log_mutex);
3509 3657
3658 btrfs_get_logged_extents(log, inode);
3659
3510 /* 3660 /*
3511 * a brute force approach to making sure we get the most uptodate 3661 * a brute force approach to making sure we get the most uptodate
3512 * copies of everything. 3662 * copies of everything.
@@ -3552,7 +3702,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3552 while (1) { 3702 while (1) {
3553 ins_nr = 0; 3703 ins_nr = 0;
3554 ret = btrfs_search_forward(root, &min_key, &max_key, 3704 ret = btrfs_search_forward(root, &min_key, &max_key,
3555 path, 0, trans->transid); 3705 path, trans->transid);
3556 if (ret != 0) 3706 if (ret != 0)
3557 break; 3707 break;
3558again: 3708again:
@@ -3650,6 +3800,8 @@ log_extents:
3650 BTRFS_I(inode)->logged_trans = trans->transid; 3800 BTRFS_I(inode)->logged_trans = trans->transid;
3651 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 3801 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
3652out_unlock: 3802out_unlock:
3803 if (err)
3804 btrfs_free_logged_extents(log, log->log_transid);
3653 mutex_unlock(&BTRFS_I(inode)->log_mutex); 3805 mutex_unlock(&BTRFS_I(inode)->log_mutex);
3654 3806
3655 btrfs_free_path(path); 3807 btrfs_free_path(path);
@@ -3816,7 +3968,6 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3816end_trans: 3968end_trans:
3817 dput(old_parent); 3969 dput(old_parent);
3818 if (ret < 0) { 3970 if (ret < 0) {
3819 WARN_ON(ret != -ENOSPC);
3820 root->fs_info->last_trans_log_full_commit = trans->transid; 3971 root->fs_info->last_trans_log_full_commit = trans->transid;
3821 ret = 1; 3972 ret = 1;
3822 } 3973 }
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 99be4c138db6..ddc61cad0080 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -5,7 +5,7 @@
5 */ 5 */
6 6
7#include <linux/slab.h> 7#include <linux/slab.h>
8#include <linux/module.h> 8#include <linux/export.h>
9#include "ulist.h" 9#include "ulist.h"
10 10
11/* 11/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5cce6aa74012..6b9cff42265d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,6 +25,8 @@
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/raid/pq.h>
29#include <asm/div64.h>
28#include "compat.h" 30#include "compat.h"
29#include "ctree.h" 31#include "ctree.h"
30#include "extent_map.h" 32#include "extent_map.h"
@@ -32,6 +34,7 @@
32#include "transaction.h" 34#include "transaction.h"
33#include "print-tree.h" 35#include "print-tree.h"
34#include "volumes.h" 36#include "volumes.h"
37#include "raid56.h"
35#include "async-thread.h" 38#include "async-thread.h"
36#include "check-integrity.h" 39#include "check-integrity.h"
37#include "rcu-string.h" 40#include "rcu-string.h"
@@ -647,6 +650,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
647 new_device->writeable = 0; 650 new_device->writeable = 0;
648 new_device->in_fs_metadata = 0; 651 new_device->in_fs_metadata = 0;
649 new_device->can_discard = 0; 652 new_device->can_discard = 0;
653 spin_lock_init(&new_device->io_lock);
650 list_replace_rcu(&device->dev_list, &new_device->dev_list); 654 list_replace_rcu(&device->dev_list, &new_device->dev_list);
651 655
652 call_rcu(&device->rcu, free_device); 656 call_rcu(&device->rcu, free_device);
@@ -792,26 +796,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
792 return ret; 796 return ret;
793} 797}
794 798
799/*
800 * Look for a btrfs signature on a device. This may be called out of the mount path
801 * and we are not allowed to call set_blocksize during the scan. The superblock
802 * is read via pagecache
803 */
795int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 804int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
796 struct btrfs_fs_devices **fs_devices_ret) 805 struct btrfs_fs_devices **fs_devices_ret)
797{ 806{
798 struct btrfs_super_block *disk_super; 807 struct btrfs_super_block *disk_super;
799 struct block_device *bdev; 808 struct block_device *bdev;
800 struct buffer_head *bh; 809 struct page *page;
801 int ret; 810 void *p;
811 int ret = -EINVAL;
802 u64 devid; 812 u64 devid;
803 u64 transid; 813 u64 transid;
804 u64 total_devices; 814 u64 total_devices;
815 u64 bytenr;
816 pgoff_t index;
805 817
818 /*
819 * we would like to check all the supers, but that would make
820 * a btrfs mount succeed after a mkfs from a different FS.
821 * So, we need to add a special mount option to scan for
822 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
823 */
824 bytenr = btrfs_sb_offset(0);
806 flags |= FMODE_EXCL; 825 flags |= FMODE_EXCL;
807 mutex_lock(&uuid_mutex); 826 mutex_lock(&uuid_mutex);
808 ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); 827
809 if (ret) 828 bdev = blkdev_get_by_path(path, flags, holder);
829
830 if (IS_ERR(bdev)) {
831 ret = PTR_ERR(bdev);
810 goto error; 832 goto error;
811 disk_super = (struct btrfs_super_block *)bh->b_data; 833 }
834
835 /* make sure our super fits in the device */
836 if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
837 goto error_bdev_put;
838
839 /* make sure our super fits in the page */
840 if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
841 goto error_bdev_put;
842
843 /* make sure our super doesn't straddle pages on disk */
844 index = bytenr >> PAGE_CACHE_SHIFT;
845 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
846 goto error_bdev_put;
847
848 /* pull in the page with our super */
849 page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
850 index, GFP_NOFS);
851
852 if (IS_ERR_OR_NULL(page))
853 goto error_bdev_put;
854
855 p = kmap(page);
856
857 /* align our pointer to the offset of the super block */
858 disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
859
860 if (btrfs_super_bytenr(disk_super) != bytenr ||
861 disk_super->magic != cpu_to_le64(BTRFS_MAGIC))
862 goto error_unmap;
863
812 devid = btrfs_stack_device_id(&disk_super->dev_item); 864 devid = btrfs_stack_device_id(&disk_super->dev_item);
813 transid = btrfs_super_generation(disk_super); 865 transid = btrfs_super_generation(disk_super);
814 total_devices = btrfs_super_num_devices(disk_super); 866 total_devices = btrfs_super_num_devices(disk_super);
867
815 if (disk_super->label[0]) { 868 if (disk_super->label[0]) {
816 if (disk_super->label[BTRFS_LABEL_SIZE - 1]) 869 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
817 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; 870 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
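The placement checks added above can be restated in userspace for illustration (not part of the patch). PAGE_SIZE is assumed to be 4096 here; the kernel code uses PAGE_CACHE_SIZE/PAGE_CACHE_SHIFT instead.

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT 12
#define EXAMPLE_PAGE_SIZE  (1UL << EXAMPLE_PAGE_SHIFT)

static bool super_readable_from_pagecache(uint64_t bytenr, size_t super_size,
					  uint64_t dev_size)
{
	uint64_t index = bytenr >> EXAMPLE_PAGE_SHIFT;

	if (bytenr + EXAMPLE_PAGE_SIZE >= dev_size)
		return false;		/* super must fit in the device */
	if (super_size > EXAMPLE_PAGE_SIZE)
		return false;		/* super must fit in one page */
	/* super must not straddle a page boundary on disk */
	return ((bytenr + super_size - 1) >> EXAMPLE_PAGE_SHIFT) == index;
}

int main(void)
{
	/* btrfs keeps the primary super at 64KiB, which is page aligned */
	printf("%d\n", super_readable_from_pagecache(64 * 1024, 4096,
						     1ULL << 30));
	return 0;
}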
@@ -819,12 +872,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
819 } else { 872 } else {
820 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 873 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
821 } 874 }
875
822 printk(KERN_CONT "devid %llu transid %llu %s\n", 876 printk(KERN_CONT "devid %llu transid %llu %s\n",
823 (unsigned long long)devid, (unsigned long long)transid, path); 877 (unsigned long long)devid, (unsigned long long)transid, path);
878
824 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 879 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
825 if (!ret && fs_devices_ret) 880 if (!ret && fs_devices_ret)
826 (*fs_devices_ret)->total_devices = total_devices; 881 (*fs_devices_ret)->total_devices = total_devices;
827 brelse(bh); 882
883error_unmap:
884 kunmap(page);
885 page_cache_release(page);
886
887error_bdev_put:
828 blkdev_put(bdev, flags); 888 blkdev_put(bdev, flags);
829error: 889error:
830 mutex_unlock(&uuid_mutex); 890 mutex_unlock(&uuid_mutex);
@@ -1372,14 +1432,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1372 u64 devid; 1432 u64 devid;
1373 u64 num_devices; 1433 u64 num_devices;
1374 u8 *dev_uuid; 1434 u8 *dev_uuid;
1435 unsigned seq;
1375 int ret = 0; 1436 int ret = 0;
1376 bool clear_super = false; 1437 bool clear_super = false;
1377 1438
1378 mutex_lock(&uuid_mutex); 1439 mutex_lock(&uuid_mutex);
1379 1440
1380 all_avail = root->fs_info->avail_data_alloc_bits | 1441 do {
1381 root->fs_info->avail_system_alloc_bits | 1442 seq = read_seqbegin(&root->fs_info->profiles_lock);
1382 root->fs_info->avail_metadata_alloc_bits; 1443
1444 all_avail = root->fs_info->avail_data_alloc_bits |
1445 root->fs_info->avail_system_alloc_bits |
1446 root->fs_info->avail_metadata_alloc_bits;
1447 } while (read_seqretry(&root->fs_info->profiles_lock, seq));
1383 1448
1384 num_devices = root->fs_info->fs_devices->num_devices; 1449 num_devices = root->fs_info->fs_devices->num_devices;
1385 btrfs_dev_replace_lock(&root->fs_info->dev_replace); 1450 btrfs_dev_replace_lock(&root->fs_info->dev_replace);
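
[The do/while added above is the standard seqlock read-side pattern: sample the sequence counter, read the three avail_*_alloc_bits words, and retry if a writer raced in. A single-threaded userspace toy of the same idea is sketched below; it uses a plain even/odd counter and omits the memory barriers a real seqlock_t provides, so it is only meant to show the retry shape, not to be used for actual synchronization.]

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* toy seqcount-protected triple, mimicking the profiles_lock usage above */
static atomic_uint seq;             /* even = stable, odd = write in progress */
static uint64_t avail_data, avail_meta, avail_sys;

static void writer_update(uint64_t d, uint64_t m, uint64_t s)
{
	atomic_fetch_add(&seq, 1);      /* begin write: counter becomes odd */
	avail_data = d;
	avail_meta = m;
	avail_sys  = s;
	atomic_fetch_add(&seq, 1);      /* end write: counter is even again */
}

static uint64_t read_profile_bits(void)
{
	uint64_t all;
	unsigned start;

	do {
		while ((start = atomic_load(&seq)) & 1)
			;                       /* writer active, wait */
		all = avail_data | avail_sys | avail_meta;
	} while (atomic_load(&seq) != start);   /* retry if a writer raced us */

	return all;
}

int main(void)
{
	writer_update(0x1, 0x2, 0x4);
	printf("all_avail = 0x%llx\n", (unsigned long long)read_profile_bits());
	return 0;
}
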
@@ -1403,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1403 goto out; 1468 goto out;
1404 } 1469 }
1405 1470
1471 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1472 root->fs_info->fs_devices->rw_devices <= 2) {
1473 printk(KERN_ERR "btrfs: unable to go below two "
1474 "devices on raid5\n");
1475 ret = -EINVAL;
1476 goto out;
1477 }
1478 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1479 root->fs_info->fs_devices->rw_devices <= 3) {
1480 printk(KERN_ERR "btrfs: unable to go below three "
1481 "devices on raid6\n");
1482 ret = -EINVAL;
1483 goto out;
1484 }
1485
1406 if (strcmp(device_path, "missing") == 0) { 1486 if (strcmp(device_path, "missing") == 0) {
1407 struct list_head *devices; 1487 struct list_head *devices;
1408 struct btrfs_device *tmp; 1488 struct btrfs_device *tmp;
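
[The two new guards above extend the existing RAID1/RAID10 device-count checks: removing a device is refused when a RAID5 array would drop below two rw devices or a RAID6 array below three, since parity could no longer be written. A small illustrative helper (not kernel code; it ignores the distinction between num_devices and rw_devices used by the real checks) for the device count each profile must keep after a removal:]

#include <stdio.h>

enum profile { SINGLE, DUP, RAID0, RAID1, RAID10, RAID5, RAID6 };

/* minimum number of devices the array must keep *after* removing one,
 * mirroring the guards in btrfs_rm_device() */
static int min_devs_after_removal(enum profile p)
{
	switch (p) {
	case RAID10: return 4;
	case RAID1:  return 2;
	case RAID5:  return 2;   /* new check: can't go below two devices */
	case RAID6:  return 3;   /* new check: can't go below three devices */
	default:     return 1;
	}
}

static int removal_allowed(enum profile p, unsigned devices)
{
	return devices > min_devs_after_removal(p);
}

int main(void)
{
	printf("raid5, 3 devs: %d\n", removal_allowed(RAID5, 3)); /* 1: allowed */
	printf("raid6, 3 devs: %d\n", removal_allowed(RAID6, 3)); /* 0: refused */
	return 0;
}
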
@@ -1431,7 +1511,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1431 } 1511 }
1432 } else { 1512 } else {
1433 ret = btrfs_get_bdev_and_sb(device_path, 1513 ret = btrfs_get_bdev_and_sb(device_path,
1434 FMODE_READ | FMODE_EXCL, 1514 FMODE_WRITE | FMODE_EXCL,
1435 root->fs_info->bdev_holder, 0, 1515 root->fs_info->bdev_holder, 0,
1436 &bdev, &bh); 1516 &bdev, &bh);
1437 if (ret) 1517 if (ret)
@@ -1556,7 +1636,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1556 ret = 0; 1636 ret = 0;
1557 1637
1558 /* Notify udev that device has changed */ 1638 /* Notify udev that device has changed */
1559 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 1639 if (bdev)
1640 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1560 1641
1561error_brelse: 1642error_brelse:
1562 brelse(bh); 1643 brelse(bh);
@@ -2298,7 +2379,11 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
2298 return ret; 2379 return ret;
2299 2380
2300 trans = btrfs_start_transaction(root, 0); 2381 trans = btrfs_start_transaction(root, 0);
2301 BUG_ON(IS_ERR(trans)); 2382 if (IS_ERR(trans)) {
2383 ret = PTR_ERR(trans);
2384 btrfs_std_error(root->fs_info, ret);
2385 return ret;
2386 }
2302 2387
2303 lock_chunks(root); 2388 lock_chunks(root);
2304 2389
@@ -2614,7 +2699,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2614 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2699 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2615 chunk_used = btrfs_block_group_used(&cache->item); 2700 chunk_used = btrfs_block_group_used(&cache->item);
2616 2701
2617 user_thresh = div_factor_fine(cache->key.offset, bargs->usage); 2702 if (bargs->usage == 0)
2703 user_thresh = 1;
2704 else if (bargs->usage > 100)
2705 user_thresh = cache->key.offset;
2706 else
2707 user_thresh = div_factor_fine(cache->key.offset,
2708 bargs->usage);
2709
2618 if (chunk_used < user_thresh) 2710 if (chunk_used < user_thresh)
2619 ret = 0; 2711 ret = 0;
2620 2712
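
[The usage-filter change above clamps the user-supplied percentage: usage == 0 matches only completely empty chunks (threshold of 1 byte), anything above 100 matches every chunk, and in between the threshold is usage percent of the chunk size. A standalone sketch of that computation; div_factor_fine() is assumed to mean "value * factor / 100", as it does elsewhere in btrfs.]

#include <stdint.h>
#include <stdio.h>

/* assumed semantics of btrfs' div_factor_fine(): num * factor / 100 */
static uint64_t div_factor_fine(uint64_t num, int factor)
{
	return num * factor / 100;
}

/* nonzero if the chunk is under the usage threshold and will be relocated */
static int usage_filter_matches(uint64_t chunk_size, uint64_t chunk_used,
				uint32_t usage)
{
	uint64_t user_thresh;

	if (usage == 0)
		user_thresh = 1;                 /* only completely empty chunks */
	else if (usage > 100)
		user_thresh = chunk_size;        /* every chunk matches */
	else
		user_thresh = div_factor_fine(chunk_size, usage);

	return chunk_used < user_thresh;
}

int main(void)
{
	uint64_t gib = 1ULL << 30;

	/* 1 GiB chunk, 200 MiB used, usage=30 -> threshold ~307 MiB -> matches */
	printf("%d\n", usage_filter_matches(gib, 200ULL << 20, 30));
	/* same chunk with usage=0 only matches if it is empty */
	printf("%d\n", usage_filter_matches(gib, 200ULL << 20, 0));
	return 0;
}
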
@@ -2656,11 +2748,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
2656 return 0; 2748 return 0;
2657 2749
2658 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 2750 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2659 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) 2751 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
2660 factor = 2; 2752 factor = num_stripes / 2;
2661 else 2753 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
2662 factor = 1; 2754 factor = num_stripes - 1;
2663 factor = num_stripes / factor; 2755 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
2756 factor = num_stripes - 2;
2757 } else {
2758 factor = num_stripes;
2759 }
2664 2760
2665 for (i = 0; i < num_stripes; i++) { 2761 for (i = 0; i < num_stripes; i++) {
2666 stripe = btrfs_stripe_nr(chunk, i); 2762 stripe = btrfs_stripe_nr(chunk, i);
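
[The factor computed above converts a chunk's logical length into the length of the slice stored on each device: DUP/RAID1/RAID10 store every byte twice so half the stripes are data-equivalent, RAID5 and RAID6 lose one and two stripes to parity, and plain striping spreads data over all stripes. An illustrative per-device length computation under those assumptions:]

#include <stdint.h>
#include <stdio.h>

enum profile { SINGLE, DUP, RAID0, RAID1, RAID10, RAID5, RAID6 };

/* how many data-equivalent stripes the chunk's logical length is spread over;
 * chunk_length / factor is then the length of the slice on each device */
static unsigned drange_factor(enum profile p, unsigned num_stripes)
{
	switch (p) {
	case DUP:
	case RAID1:
	case RAID10: return num_stripes / 2;   /* every byte stored twice */
	case RAID5:  return num_stripes - 1;   /* one stripe is parity */
	case RAID6:  return num_stripes - 2;   /* two stripes are parity */
	default:     return num_stripes;       /* plain striping / single */
	}
}

int main(void)
{
	uint64_t chunk_len = 6ULL << 30;   /* 6 GiB logical chunk */

	/* RAID6 over 8 devices: 6 data stripes, so 1 GiB of it per device */
	printf("%llu MiB per device\n",
	       (unsigned long long)(chunk_len / drange_factor(RAID6, 8) >> 20));
	return 0;
}
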
@@ -2958,7 +3054,10 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
2958 3054
2959 unset_balance_control(fs_info); 3055 unset_balance_control(fs_info);
2960 ret = del_balance_item(fs_info->tree_root); 3056 ret = del_balance_item(fs_info->tree_root);
2961 BUG_ON(ret); 3057 if (ret)
3058 btrfs_std_error(fs_info, ret);
3059
3060 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
2962} 3061}
2963 3062
2964void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, 3063void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
@@ -2975,6 +3074,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2975 int mixed = 0; 3074 int mixed = 0;
2976 int ret; 3075 int ret;
2977 u64 num_devices; 3076 u64 num_devices;
3077 unsigned seq;
2978 3078
2979 if (btrfs_fs_closing(fs_info) || 3079 if (btrfs_fs_closing(fs_info) ||
2980 atomic_read(&fs_info->balance_pause_req) || 3080 atomic_read(&fs_info->balance_pause_req) ||
@@ -3017,7 +3117,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3017 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3117 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3018 else 3118 else
3019 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3119 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
3020 BTRFS_BLOCK_GROUP_RAID10); 3120 BTRFS_BLOCK_GROUP_RAID10 |
3121 BTRFS_BLOCK_GROUP_RAID5 |
3122 BTRFS_BLOCK_GROUP_RAID6);
3021 3123
3022 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3124 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3023 (!alloc_profile_is_valid(bctl->data.target, 1) || 3125 (!alloc_profile_is_valid(bctl->data.target, 1) ||
@@ -3057,23 +3159,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3057 3159
3058 /* allow to reduce meta or sys integrity only if force set */ 3160 /* allow to reduce meta or sys integrity only if force set */
3059 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3161 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3060 BTRFS_BLOCK_GROUP_RAID10; 3162 BTRFS_BLOCK_GROUP_RAID10 |
3061 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3163 BTRFS_BLOCK_GROUP_RAID5 |
3062 (fs_info->avail_system_alloc_bits & allowed) && 3164 BTRFS_BLOCK_GROUP_RAID6;
3063 !(bctl->sys.target & allowed)) || 3165 do {
3064 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3166 seq = read_seqbegin(&fs_info->profiles_lock);
3065 (fs_info->avail_metadata_alloc_bits & allowed) && 3167
3066 !(bctl->meta.target & allowed))) { 3168 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3067 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3169 (fs_info->avail_system_alloc_bits & allowed) &&
3068 printk(KERN_INFO "btrfs: force reducing metadata " 3170 !(bctl->sys.target & allowed)) ||
3069 "integrity\n"); 3171 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3070 } else { 3172 (fs_info->avail_metadata_alloc_bits & allowed) &&
3071 printk(KERN_ERR "btrfs: balance will reduce metadata " 3173 !(bctl->meta.target & allowed))) {
3072 "integrity, use force if you want this\n"); 3174 if (bctl->flags & BTRFS_BALANCE_FORCE) {
3073 ret = -EINVAL; 3175 printk(KERN_INFO "btrfs: force reducing metadata "
3074 goto out; 3176 "integrity\n");
3177 } else {
3178 printk(KERN_ERR "btrfs: balance will reduce metadata "
3179 "integrity, use force if you want this\n");
3180 ret = -EINVAL;
3181 goto out;
3182 }
3075 } 3183 }
3076 } 3184 } while (read_seqretry(&fs_info->profiles_lock, seq));
3077 3185
3078 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3186 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3079 int num_tolerated_disk_barrier_failures; 3187 int num_tolerated_disk_barrier_failures;
@@ -3117,6 +3225,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3117 mutex_lock(&fs_info->balance_mutex); 3225 mutex_lock(&fs_info->balance_mutex);
3118 atomic_dec(&fs_info->balance_running); 3226 atomic_dec(&fs_info->balance_running);
3119 3227
3228 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3229 fs_info->num_tolerated_disk_barrier_failures =
3230 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3231 }
3232
3120 if (bargs) { 3233 if (bargs) {
3121 memset(bargs, 0, sizeof(*bargs)); 3234 memset(bargs, 0, sizeof(*bargs));
3122 update_ioctl_balance_args(fs_info, 0, bargs); 3235 update_ioctl_balance_args(fs_info, 0, bargs);
@@ -3127,19 +3240,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3127 __cancel_balance(fs_info); 3240 __cancel_balance(fs_info);
3128 } 3241 }
3129 3242
3130 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3131 fs_info->num_tolerated_disk_barrier_failures =
3132 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3133 }
3134
3135 wake_up(&fs_info->balance_wait_q); 3243 wake_up(&fs_info->balance_wait_q);
3136 3244
3137 return ret; 3245 return ret;
3138out: 3246out:
3139 if (bctl->flags & BTRFS_BALANCE_RESUME) 3247 if (bctl->flags & BTRFS_BALANCE_RESUME)
3140 __cancel_balance(fs_info); 3248 __cancel_balance(fs_info);
3141 else 3249 else {
3142 kfree(bctl); 3250 kfree(bctl);
3251 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3252 }
3143 return ret; 3253 return ret;
3144} 3254}
3145 3255
@@ -3156,7 +3266,6 @@ static int balance_kthread(void *data)
3156 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3266 ret = btrfs_balance(fs_info->balance_ctl, NULL);
3157 } 3267 }
3158 3268
3159 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3160 mutex_unlock(&fs_info->balance_mutex); 3269 mutex_unlock(&fs_info->balance_mutex);
3161 mutex_unlock(&fs_info->volume_mutex); 3270 mutex_unlock(&fs_info->volume_mutex);
3162 3271
@@ -3179,7 +3288,6 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3179 return 0; 3288 return 0;
3180 } 3289 }
3181 3290
3182 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
3183 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3291 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3184 if (IS_ERR(tsk)) 3292 if (IS_ERR(tsk))
3185 return PTR_ERR(tsk); 3293 return PTR_ERR(tsk);
@@ -3233,6 +3341,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
3233 btrfs_balance_sys(leaf, item, &disk_bargs); 3341 btrfs_balance_sys(leaf, item, &disk_bargs);
3234 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 3342 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
3235 3343
3344 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
3345
3236 mutex_lock(&fs_info->volume_mutex); 3346 mutex_lock(&fs_info->volume_mutex);
3237 mutex_lock(&fs_info->balance_mutex); 3347 mutex_lock(&fs_info->balance_mutex);
3238 3348
@@ -3492,13 +3602,86 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
3492} 3602}
3493 3603
3494struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 3604struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3495 { 2, 1, 0, 4, 2, 2 /* raid10 */ }, 3605 [BTRFS_RAID_RAID10] = {
3496 { 1, 1, 2, 2, 2, 2 /* raid1 */ }, 3606 .sub_stripes = 2,
3497 { 1, 2, 1, 1, 1, 2 /* dup */ }, 3607 .dev_stripes = 1,
3498 { 1, 1, 0, 2, 1, 1 /* raid0 */ }, 3608 .devs_max = 0, /* 0 == as many as possible */
3499 { 1, 1, 0, 1, 1, 1 /* single */ }, 3609 .devs_min = 4,
3610 .devs_increment = 2,
3611 .ncopies = 2,
3612 },
3613 [BTRFS_RAID_RAID1] = {
3614 .sub_stripes = 1,
3615 .dev_stripes = 1,
3616 .devs_max = 2,
3617 .devs_min = 2,
3618 .devs_increment = 2,
3619 .ncopies = 2,
3620 },
3621 [BTRFS_RAID_DUP] = {
3622 .sub_stripes = 1,
3623 .dev_stripes = 2,
3624 .devs_max = 1,
3625 .devs_min = 1,
3626 .devs_increment = 1,
3627 .ncopies = 2,
3628 },
3629 [BTRFS_RAID_RAID0] = {
3630 .sub_stripes = 1,
3631 .dev_stripes = 1,
3632 .devs_max = 0,
3633 .devs_min = 2,
3634 .devs_increment = 1,
3635 .ncopies = 1,
3636 },
3637 [BTRFS_RAID_SINGLE] = {
3638 .sub_stripes = 1,
3639 .dev_stripes = 1,
3640 .devs_max = 1,
3641 .devs_min = 1,
3642 .devs_increment = 1,
3643 .ncopies = 1,
3644 },
3645 [BTRFS_RAID_RAID5] = {
3646 .sub_stripes = 1,
3647 .dev_stripes = 1,
3648 .devs_max = 0,
3649 .devs_min = 2,
3650 .devs_increment = 1,
3651 .ncopies = 2,
3652 },
3653 [BTRFS_RAID_RAID6] = {
3654 .sub_stripes = 1,
3655 .dev_stripes = 1,
3656 .devs_max = 0,
3657 .devs_min = 3,
3658 .devs_increment = 1,
3659 .ncopies = 3,
3660 },
3500}; 3661};
3501 3662
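
[Rewriting btrfs_raid_array with designated initializers makes each attribute self-describing. To show how the device-count fields combine, here is a tiny userspace mirror of two entries plus roughly what the chunk allocator does with them: trim the candidate device count to a multiple of devs_increment and reject it if it falls under devs_min. The helper usable_devices() is illustrative, not a kernel function.]

#include <stdio.h>

/* userspace copy of the attribute fields relevant to device counting */
struct raid_attr {
	int devs_min;        /* fewest devices the profile can use        */
	int devs_increment;  /* device count must be a multiple of this   */
	int devs_max;        /* 0 == as many as available                 */
};

static const struct raid_attr raid10 = { .devs_min = 4, .devs_increment = 2, .devs_max = 0 };
static const struct raid_attr raid6  = { .devs_min = 3, .devs_increment = 1, .devs_max = 0 };

static int usable_devices(const struct raid_attr *a, int available)
{
	int n = available;

	if (a->devs_max && n > a->devs_max)
		n = a->devs_max;
	n -= n % a->devs_increment;     /* e.g. raid10 uses devices in pairs */
	return n >= a->devs_min ? n : 0;
}

int main(void)
{
	printf("raid10 with 5 devices -> uses %d\n", usable_devices(&raid10, 5)); /* 4 */
	printf("raid6 with 2 devices  -> uses %d\n", usable_devices(&raid6, 2));  /* 0 */
	return 0;
}
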
3663static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
3664{
3665 /* TODO allow them to set a preferred stripe size */
3666 return 64 * 1024;
3667}
3668
3669static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
3670{
3671 u64 features;
3672
3673 if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
3674 return;
3675
3676 features = btrfs_super_incompat_flags(info->super_copy);
3677 if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
3678 return;
3679
3680 features |= BTRFS_FEATURE_INCOMPAT_RAID56;
3681 btrfs_set_super_incompat_flags(info->super_copy, features);
3682 printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
3683}
3684
3502static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3685static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3503 struct btrfs_root *extent_root, 3686 struct btrfs_root *extent_root,
3504 struct map_lookup **map_ret, 3687 struct map_lookup **map_ret,
@@ -3514,6 +3697,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3514 struct btrfs_device_info *devices_info = NULL; 3697 struct btrfs_device_info *devices_info = NULL;
3515 u64 total_avail; 3698 u64 total_avail;
3516 int num_stripes; /* total number of stripes to allocate */ 3699 int num_stripes; /* total number of stripes to allocate */
3700 int data_stripes; /* number of stripes that count for
3701 block group size */
3517 int sub_stripes; /* sub_stripes info for map */ 3702 int sub_stripes; /* sub_stripes info for map */
3518 int dev_stripes; /* stripes per dev */ 3703 int dev_stripes; /* stripes per dev */
3519 int devs_max; /* max devs to use */ 3704 int devs_max; /* max devs to use */
@@ -3525,6 +3710,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3525 u64 max_chunk_size; 3710 u64 max_chunk_size;
3526 u64 stripe_size; 3711 u64 stripe_size;
3527 u64 num_bytes; 3712 u64 num_bytes;
3713 u64 raid_stripe_len = BTRFS_STRIPE_LEN;
3528 int ndevs; 3714 int ndevs;
3529 int i; 3715 int i;
3530 int j; 3716 int j;
@@ -3619,12 +3805,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3619 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 3805 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
3620 continue; 3806 continue;
3621 3807
3808 if (ndevs == fs_devices->rw_devices) {
3809 WARN(1, "%s: found more than %llu devices\n",
3810 __func__, fs_devices->rw_devices);
3811 break;
3812 }
3622 devices_info[ndevs].dev_offset = dev_offset; 3813 devices_info[ndevs].dev_offset = dev_offset;
3623 devices_info[ndevs].max_avail = max_avail; 3814 devices_info[ndevs].max_avail = max_avail;
3624 devices_info[ndevs].total_avail = total_avail; 3815 devices_info[ndevs].total_avail = total_avail;
3625 devices_info[ndevs].dev = device; 3816 devices_info[ndevs].dev = device;
3626 ++ndevs; 3817 ++ndevs;
3627 WARN_ON(ndevs > fs_devices->rw_devices);
3628 } 3818 }
3629 3819
3630 /* 3820 /*
@@ -3650,16 +3840,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3650 stripe_size = devices_info[ndevs-1].max_avail; 3840 stripe_size = devices_info[ndevs-1].max_avail;
3651 num_stripes = ndevs * dev_stripes; 3841 num_stripes = ndevs * dev_stripes;
3652 3842
3653 if (stripe_size * ndevs > max_chunk_size * ncopies) { 3843 /*
3654 stripe_size = max_chunk_size * ncopies; 3844 * this will have to be fixed for RAID1 and RAID10 over
3655 do_div(stripe_size, ndevs); 3845 * more drives
3846 */
3847 data_stripes = num_stripes / ncopies;
3848
3849 if (type & BTRFS_BLOCK_GROUP_RAID5) {
3850 raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
3851 btrfs_super_stripesize(info->super_copy));
3852 data_stripes = num_stripes - 1;
3853 }
3854 if (type & BTRFS_BLOCK_GROUP_RAID6) {
3855 raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
3856 btrfs_super_stripesize(info->super_copy));
3857 data_stripes = num_stripes - 2;
3858 }
3859
3860 /*
3861 * Use the number of data stripes to figure out how big this chunk
3862 * is really going to be in terms of logical address space,
3863 * and compare that answer with the max chunk size
3864 */
3865 if (stripe_size * data_stripes > max_chunk_size) {
3866 u64 mask = (1ULL << 24) - 1;
3867 stripe_size = max_chunk_size;
3868 do_div(stripe_size, data_stripes);
3869
3870 /* bump the answer up to a 16MB boundary */
3871 stripe_size = (stripe_size + mask) & ~mask;
3872
3873 /* but don't go higher than the limits we found
3874 * while searching for free extents
3875 */
3876 if (stripe_size > devices_info[ndevs-1].max_avail)
3877 stripe_size = devices_info[ndevs-1].max_avail;
3656 } 3878 }
3657 3879
3658 do_div(stripe_size, dev_stripes); 3880 do_div(stripe_size, dev_stripes);
3659 3881
3660 /* align to BTRFS_STRIPE_LEN */ 3882 /* align to BTRFS_STRIPE_LEN */
3661 do_div(stripe_size, BTRFS_STRIPE_LEN); 3883 do_div(stripe_size, raid_stripe_len);
3662 stripe_size *= BTRFS_STRIPE_LEN; 3884 stripe_size *= raid_stripe_len;
3663 3885
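
[A worked example of the sizing logic above: data_stripes is num_stripes minus the parity stripes, the per-device stripe_size is capped so that stripe_size * data_stripes stays under max_chunk_size, the result is bumped to a 16 MiB boundary (but never above the free space found), and finally aligned to the RAID5/6 stripe length. A self-contained sketch of that arithmetic; the 10 GiB max chunk size in main() is just an example value.]

#include <stdint.h>
#include <stdio.h>

#define RAID_STRIPE_LEN  (64 * 1024ULL)   /* find_raid56_stripe_len() result */

static uint64_t raid6_stripe_size(int ndevs, uint64_t max_avail,
				  uint64_t max_chunk_size)
{
	uint64_t stripe_size = max_avail;           /* start from the smallest device */
	int data_stripes = ndevs - 2;               /* raid6: two parity stripes */
	uint64_t mask = (1ULL << 24) - 1;           /* 16 MiB alignment mask */

	if (stripe_size * data_stripes > max_chunk_size) {
		stripe_size = max_chunk_size / data_stripes;
		stripe_size = (stripe_size + mask) & ~mask; /* bump to 16 MiB boundary */
		if (stripe_size > max_avail)
			stripe_size = max_avail;    /* never exceed free space found */
	}

	/* align to the raid56 stripe length */
	stripe_size -= stripe_size % RAID_STRIPE_LEN;
	return stripe_size;
}

int main(void)
{
	/* 6 devices with 4 GiB free each, 10 GiB max chunk size (example value) */
	uint64_t s = raid6_stripe_size(6, 4ULL << 30, 10ULL << 30);

	printf("stripe_size per device: %llu MiB\n", (unsigned long long)(s >> 20));
	printf("chunk logical size:     %llu MiB\n", (unsigned long long)((s * 4) >> 20));
	return 0;
}
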
3664 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3886 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3665 if (!map) { 3887 if (!map) {
@@ -3677,14 +3899,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3677 } 3899 }
3678 } 3900 }
3679 map->sector_size = extent_root->sectorsize; 3901 map->sector_size = extent_root->sectorsize;
3680 map->stripe_len = BTRFS_STRIPE_LEN; 3902 map->stripe_len = raid_stripe_len;
3681 map->io_align = BTRFS_STRIPE_LEN; 3903 map->io_align = raid_stripe_len;
3682 map->io_width = BTRFS_STRIPE_LEN; 3904 map->io_width = raid_stripe_len;
3683 map->type = type; 3905 map->type = type;
3684 map->sub_stripes = sub_stripes; 3906 map->sub_stripes = sub_stripes;
3685 3907
3686 *map_ret = map; 3908 *map_ret = map;
3687 num_bytes = stripe_size * (num_stripes / ncopies); 3909 num_bytes = stripe_size * data_stripes;
3688 3910
3689 *stripe_size_out = stripe_size; 3911 *stripe_size_out = stripe_size;
3690 *num_bytes_out = num_bytes; 3912 *num_bytes_out = num_bytes;
@@ -3706,15 +3928,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3706 write_lock(&em_tree->lock); 3928 write_lock(&em_tree->lock);
3707 ret = add_extent_mapping(em_tree, em); 3929 ret = add_extent_mapping(em_tree, em);
3708 write_unlock(&em_tree->lock); 3930 write_unlock(&em_tree->lock);
3709 free_extent_map(em); 3931 if (ret) {
3710 if (ret) 3932 free_extent_map(em);
3711 goto error;
3712
3713 ret = btrfs_make_block_group(trans, extent_root, 0, type,
3714 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3715 start, num_bytes);
3716 if (ret)
3717 goto error; 3933 goto error;
3934 }
3718 3935
3719 for (i = 0; i < map->num_stripes; ++i) { 3936 for (i = 0; i < map->num_stripes; ++i) {
3720 struct btrfs_device *device; 3937 struct btrfs_device *device;
@@ -3727,15 +3944,44 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3727 info->chunk_root->root_key.objectid, 3944 info->chunk_root->root_key.objectid,
3728 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3945 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3729 start, dev_offset, stripe_size); 3946 start, dev_offset, stripe_size);
3730 if (ret) { 3947 if (ret)
3731 btrfs_abort_transaction(trans, extent_root, ret); 3948 goto error_dev_extent;
3732 goto error;
3733 }
3734 } 3949 }
3735 3950
3951 ret = btrfs_make_block_group(trans, extent_root, 0, type,
3952 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3953 start, num_bytes);
3954 if (ret) {
3955 i = map->num_stripes - 1;
3956 goto error_dev_extent;
3957 }
3958
3959 free_extent_map(em);
3960 check_raid56_incompat_flag(extent_root->fs_info, type);
3961
3736 kfree(devices_info); 3962 kfree(devices_info);
3737 return 0; 3963 return 0;
3738 3964
3965error_dev_extent:
3966 for (; i >= 0; i--) {
3967 struct btrfs_device *device;
3968 int err;
3969
3970 device = map->stripes[i].dev;
3971 err = btrfs_free_dev_extent(trans, device, start);
3972 if (err) {
3973 btrfs_abort_transaction(trans, extent_root, err);
3974 break;
3975 }
3976 }
3977 write_lock(&em_tree->lock);
3978 remove_extent_mapping(em_tree, em);
3979 write_unlock(&em_tree->lock);
3980
3981 /* One for our allocation */
3982 free_extent_map(em);
3983 /* One for the tree reference */
3984 free_extent_map(em);
3739error: 3985error:
3740 kfree(map); 3986 kfree(map);
3741 kfree(devices_info); 3987 kfree(devices_info);
@@ -3875,10 +4121,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3875 if (ret) 4121 if (ret)
3876 return ret; 4122 return ret;
3877 4123
3878 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 4124 alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
3879 fs_info->avail_metadata_alloc_bits;
3880 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3881
3882 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 4125 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3883 &stripe_size, chunk_offset, alloc_profile); 4126 &stripe_size, chunk_offset, alloc_profile);
3884 if (ret) 4127 if (ret)
@@ -3886,10 +4129,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3886 4129
3887 sys_chunk_offset = chunk_offset + chunk_size; 4130 sys_chunk_offset = chunk_offset + chunk_size;
3888 4131
3889 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 4132 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
3890 fs_info->avail_system_alloc_bits;
3891 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3892
3893 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 4133 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3894 &sys_chunk_size, &sys_stripe_size, 4134 &sys_chunk_size, &sys_stripe_size,
3895 sys_chunk_offset, alloc_profile); 4135 sys_chunk_offset, alloc_profile);
@@ -4002,6 +4242,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4002 ret = map->num_stripes; 4242 ret = map->num_stripes;
4003 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4243 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4004 ret = map->sub_stripes; 4244 ret = map->sub_stripes;
4245 else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
4246 ret = 2;
4247 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4248 ret = 3;
4005 else 4249 else
4006 ret = 1; 4250 ret = 1;
4007 free_extent_map(em); 4251 free_extent_map(em);
@@ -4014,6 +4258,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4014 return ret; 4258 return ret;
4015} 4259}
4016 4260
4261unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
4262 struct btrfs_mapping_tree *map_tree,
4263 u64 logical)
4264{
4265 struct extent_map *em;
4266 struct map_lookup *map;
4267 struct extent_map_tree *em_tree = &map_tree->map_tree;
4268 unsigned long len = root->sectorsize;
4269
4270 read_lock(&em_tree->lock);
4271 em = lookup_extent_mapping(em_tree, logical, len);
4272 read_unlock(&em_tree->lock);
4273 BUG_ON(!em);
4274
4275 BUG_ON(em->start > logical || em->start + em->len < logical);
4276 map = (struct map_lookup *)em->bdev;
4277 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4278 BTRFS_BLOCK_GROUP_RAID6)) {
4279 len = map->stripe_len * nr_data_stripes(map);
4280 }
4281 free_extent_map(em);
4282 return len;
4283}
4284
4285int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
4286 u64 logical, u64 len, int mirror_num)
4287{
4288 struct extent_map *em;
4289 struct map_lookup *map;
4290 struct extent_map_tree *em_tree = &map_tree->map_tree;
4291 int ret = 0;
4292
4293 read_lock(&em_tree->lock);
4294 em = lookup_extent_mapping(em_tree, logical, len);
4295 read_unlock(&em_tree->lock);
4296 BUG_ON(!em);
4297
4298 BUG_ON(em->start > logical || em->start + em->len < logical);
4299 map = (struct map_lookup *)em->bdev;
4300 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4301 BTRFS_BLOCK_GROUP_RAID6))
4302 ret = 1;
4303 free_extent_map(em);
4304 return ret;
4305}
4306
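
[btrfs_full_stripe_len() above reports how many bytes a full RAID5/6 stripe covers in logical address space: stripe_len times the number of data stripes, where nr_data_stripes() is num_stripes minus one for RAID5 and minus two for RAID6. A short illustrative computation:]

#include <stdio.h>

/* nr_data_stripes(): total stripes minus parity stripes */
static int nr_data_stripes(int num_stripes, int nparity)
{
	return num_stripes - nparity;
}

int main(void)
{
	unsigned long stripe_len = 64 * 1024;	/* BTRFS_STRIPE_LEN */

	/* raid5 over 4 devices: a full stripe covers 3 * 64 KiB of logical space */
	printf("%lu KiB\n", stripe_len * nr_data_stripes(4, 1) / 1024);
	/* raid6 over 6 devices: 4 * 64 KiB */
	printf("%lu KiB\n", stripe_len * nr_data_stripes(6, 2) / 1024);
	return 0;
}
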
4017static int find_live_mirror(struct btrfs_fs_info *fs_info, 4307static int find_live_mirror(struct btrfs_fs_info *fs_info,
4018 struct map_lookup *map, int first, int num, 4308 struct map_lookup *map, int first, int num,
4019 int optimal, int dev_replace_is_ongoing) 4309 int optimal, int dev_replace_is_ongoing)
@@ -4051,10 +4341,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
4051 return optimal; 4341 return optimal;
4052} 4342}
4053 4343
4344static inline int parity_smaller(u64 a, u64 b)
4345{
4346 return a > b;
4347}
4348
4349/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
4350static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4351{
4352 struct btrfs_bio_stripe s;
4353 int i;
4354 u64 l;
4355 int again = 1;
4356
4357 while (again) {
4358 again = 0;
4359 for (i = 0; i < bbio->num_stripes - 1; i++) {
4360 if (parity_smaller(raid_map[i], raid_map[i+1])) {
4361 s = bbio->stripes[i];
4362 l = raid_map[i];
4363 bbio->stripes[i] = bbio->stripes[i+1];
4364 raid_map[i] = raid_map[i+1];
4365 bbio->stripes[i+1] = s;
4366 raid_map[i+1] = l;
4367 again = 1;
4368 }
4369 }
4370 }
4371}
4372
4054static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4373static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4055 u64 logical, u64 *length, 4374 u64 logical, u64 *length,
4056 struct btrfs_bio **bbio_ret, 4375 struct btrfs_bio **bbio_ret,
4057 int mirror_num) 4376 int mirror_num, u64 **raid_map_ret)
4058{ 4377{
4059 struct extent_map *em; 4378 struct extent_map *em;
4060 struct map_lookup *map; 4379 struct map_lookup *map;
@@ -4066,6 +4385,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4066 u64 stripe_nr; 4385 u64 stripe_nr;
4067 u64 stripe_nr_orig; 4386 u64 stripe_nr_orig;
4068 u64 stripe_nr_end; 4387 u64 stripe_nr_end;
4388 u64 stripe_len;
4389 u64 *raid_map = NULL;
4069 int stripe_index; 4390 int stripe_index;
4070 int i; 4391 int i;
4071 int ret = 0; 4392 int ret = 0;
@@ -4077,6 +4398,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4077 int num_alloc_stripes; 4398 int num_alloc_stripes;
4078 int patch_the_first_stripe_for_dev_replace = 0; 4399 int patch_the_first_stripe_for_dev_replace = 0;
4079 u64 physical_to_patch_in_first_stripe = 0; 4400 u64 physical_to_patch_in_first_stripe = 0;
4401 u64 raid56_full_stripe_start = (u64)-1;
4080 4402
4081 read_lock(&em_tree->lock); 4403 read_lock(&em_tree->lock);
4082 em = lookup_extent_mapping(em_tree, logical, *length); 4404 em = lookup_extent_mapping(em_tree, logical, *length);
@@ -4093,29 +4415,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4093 map = (struct map_lookup *)em->bdev; 4415 map = (struct map_lookup *)em->bdev;
4094 offset = logical - em->start; 4416 offset = logical - em->start;
4095 4417
4418 if (mirror_num > map->num_stripes)
4419 mirror_num = 0;
4420
4421 stripe_len = map->stripe_len;
4096 stripe_nr = offset; 4422 stripe_nr = offset;
4097 /* 4423 /*
4098 * stripe_nr counts the total number of stripes we have to stride 4424 * stripe_nr counts the total number of stripes we have to stride
4099 * to get to this block 4425 * to get to this block
4100 */ 4426 */
4101 do_div(stripe_nr, map->stripe_len); 4427 do_div(stripe_nr, stripe_len);
4102 4428
4103 stripe_offset = stripe_nr * map->stripe_len; 4429 stripe_offset = stripe_nr * stripe_len;
4104 BUG_ON(offset < stripe_offset); 4430 BUG_ON(offset < stripe_offset);
4105 4431
4106 /* stripe_offset is the offset of this block in its stripe*/ 4432 /* stripe_offset is the offset of this block in its stripe*/
4107 stripe_offset = offset - stripe_offset; 4433 stripe_offset = offset - stripe_offset;
4108 4434
4109 if (rw & REQ_DISCARD) 4435 /* if we're here for raid56, we need to know the stripe aligned start */
4436 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4437 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
4438 raid56_full_stripe_start = offset;
4439
4440 /* allow a write of a full stripe, but make sure we don't
4441 * allow straddling of stripes
4442 */
4443 do_div(raid56_full_stripe_start, full_stripe_len);
4444 raid56_full_stripe_start *= full_stripe_len;
4445 }
4446
4447 if (rw & REQ_DISCARD) {
4448 /* we don't discard raid56 yet */
4449 if (map->type &
4450 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4451 ret = -EOPNOTSUPP;
4452 goto out;
4453 }
4110 *length = min_t(u64, em->len - offset, *length); 4454 *length = min_t(u64, em->len - offset, *length);
4111 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 4455 } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
4112 /* we limit the length of each bio to what fits in a stripe */ 4456 u64 max_len;
4113 *length = min_t(u64, em->len - offset, 4457 /* For writes to RAID[56], allow a full stripeset across all disks.
4114 map->stripe_len - stripe_offset); 4458 For other RAID types and for RAID[56] reads, just allow a single
4459 stripe (on a single disk). */
4460 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
4461 (rw & REQ_WRITE)) {
4462 max_len = stripe_len * nr_data_stripes(map) -
4463 (offset - raid56_full_stripe_start);
4464 } else {
4465 /* we limit the length of each bio to what fits in a stripe */
4466 max_len = stripe_len - stripe_offset;
4467 }
4468 *length = min_t(u64, em->len - offset, max_len);
4115 } else { 4469 } else {
4116 *length = em->len - offset; 4470 *length = em->len - offset;
4117 } 4471 }
4118 4472
4473 /* This is for when we're called from btrfs_merge_bio_hook() and all
4474 it cares about is the length */
4119 if (!bbio_ret) 4475 if (!bbio_ret)
4120 goto out; 4476 goto out;
4121 4477
@@ -4148,7 +4504,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4148 u64 physical_of_found = 0; 4504 u64 physical_of_found = 0;
4149 4505
4150 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 4506 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4151 logical, &tmp_length, &tmp_bbio, 0); 4507 logical, &tmp_length, &tmp_bbio, 0, NULL);
4152 if (ret) { 4508 if (ret) {
4153 WARN_ON(tmp_bbio != NULL); 4509 WARN_ON(tmp_bbio != NULL);
4154 goto out; 4510 goto out;
@@ -4209,11 +4565,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4209 num_stripes = 1; 4565 num_stripes = 1;
4210 stripe_index = 0; 4566 stripe_index = 0;
4211 stripe_nr_orig = stripe_nr; 4567 stripe_nr_orig = stripe_nr;
4212 stripe_nr_end = (offset + *length + map->stripe_len - 1) & 4568 stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
4213 (~(map->stripe_len - 1));
4214 do_div(stripe_nr_end, map->stripe_len); 4569 do_div(stripe_nr_end, map->stripe_len);
4215 stripe_end_offset = stripe_nr_end * map->stripe_len - 4570 stripe_end_offset = stripe_nr_end * map->stripe_len -
4216 (offset + *length); 4571 (offset + *length);
4572
4217 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4573 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4218 if (rw & REQ_DISCARD) 4574 if (rw & REQ_DISCARD)
4219 num_stripes = min_t(u64, map->num_stripes, 4575 num_stripes = min_t(u64, map->num_stripes,
@@ -4264,6 +4620,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4264 dev_replace_is_ongoing); 4620 dev_replace_is_ongoing);
4265 mirror_num = stripe_index - old_stripe_index + 1; 4621 mirror_num = stripe_index - old_stripe_index + 1;
4266 } 4622 }
4623
4624 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4625 BTRFS_BLOCK_GROUP_RAID6)) {
4626 u64 tmp;
4627
4628 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
4629 && raid_map_ret) {
4630 int i, rot;
4631
4632 /* push stripe_nr back to the start of the full stripe */
4633 stripe_nr = raid56_full_stripe_start;
4634 do_div(stripe_nr, stripe_len);
4635
4636 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4637
4638 /* RAID[56] write or recovery. Return all stripes */
4639 num_stripes = map->num_stripes;
4640 max_errors = nr_parity_stripes(map);
4641
4642 raid_map = kmalloc(sizeof(u64) * num_stripes,
4643 GFP_NOFS);
4644 if (!raid_map) {
4645 ret = -ENOMEM;
4646 goto out;
4647 }
4648
4649 /* Work out the disk rotation on this stripe-set */
4650 tmp = stripe_nr;
4651 rot = do_div(tmp, num_stripes);
4652
4653 /* Fill in the logical address of each stripe */
4654 tmp = stripe_nr * nr_data_stripes(map);
4655 for (i = 0; i < nr_data_stripes(map); i++)
4656 raid_map[(i+rot) % num_stripes] =
4657 em->start + (tmp + i) * map->stripe_len;
4658
4659 raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
4660 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4661 raid_map[(i+rot+1) % num_stripes] =
4662 RAID6_Q_STRIPE;
4663
4664 *length = map->stripe_len;
4665 stripe_index = 0;
4666 stripe_offset = 0;
4667 } else {
4668 /*
4669 * Mirror #0 or #1 means the original data block.
4670 * Mirror #2 is RAID5 parity block.
4671 * Mirror #3 is RAID6 Q block.
4672 */
4673 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4674 if (mirror_num > 1)
4675 stripe_index = nr_data_stripes(map) +
4676 mirror_num - 2;
4677
4678 /* We distribute the parity blocks across stripes */
4679 tmp = stripe_nr + stripe_index;
4680 stripe_index = do_div(tmp, map->num_stripes);
4681 }
4267 } else { 4682 } else {
4268 /* 4683 /*
4269 * after this do_div call, stripe_nr is the number of stripes 4684 * after this do_div call, stripe_nr is the number of stripes
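
[The raid_map construction in the RAID5/6 branch above is easiest to see with numbers. Take RAID5 over 3 devices (2 data stripes + P), a chunk starting at logical 0, and a write that lands in the second full stripe: stripe_nr ends up as 1, rot = 1, so the data blocks at logical 128 KiB and 192 KiB fall into slots 1 and 2 and the parity marker falls into slot 0 — the rotation walks parity across the devices from one full stripe to the next. A standalone version of the fill loop under those assumptions:]

#include <stdint.h>
#include <stdio.h>

#define STRIPE_LEN      (64 * 1024ULL)
#define RAID5_P_STRIPE  ((uint64_t)-2)

int main(void)
{
	int num_stripes = 3;                      /* raid5 over 3 devices */
	int nr_data = 2;                          /* one stripe is parity */
	uint64_t em_start = 0;                    /* chunk starts at logical 0 */
	uint64_t offset = 130 * 1024;             /* write inside the 2nd full stripe */
	uint64_t raid_map[3];

	/* push back to the start of the full stripe, then count full stripes */
	uint64_t full_stripe_len = STRIPE_LEN * nr_data;
	uint64_t stripe_nr = (offset - offset % full_stripe_len) / STRIPE_LEN;
	stripe_nr /= nr_data;                     /* full-stripe number: 1 */

	int rot = stripe_nr % num_stripes;        /* parity rotation: 1 */
	uint64_t tmp = stripe_nr * nr_data;

	int i;
	for (i = 0; i < nr_data; i++)             /* logical address of each data stripe */
		raid_map[(i + rot) % num_stripes] =
			em_start + (tmp + i) * STRIPE_LEN;
	raid_map[(i + rot) % num_stripes] = RAID5_P_STRIPE;

	for (i = 0; i < num_stripes; i++) {
		if (raid_map[i] == RAID5_P_STRIPE)
			printf("slot %d: parity\n", i);
		else
			printf("slot %d: logical %llu KiB\n", i,
			       (unsigned long long)(raid_map[i] >> 10));
	}
	return 0;
}
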
@@ -4372,8 +4787,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4372 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 4787 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
4373 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4788 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4374 BTRFS_BLOCK_GROUP_RAID10 | 4789 BTRFS_BLOCK_GROUP_RAID10 |
4790 BTRFS_BLOCK_GROUP_RAID5 |
4375 BTRFS_BLOCK_GROUP_DUP)) { 4791 BTRFS_BLOCK_GROUP_DUP)) {
4376 max_errors = 1; 4792 max_errors = 1;
4793 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4794 max_errors = 2;
4377 } 4795 }
4378 } 4796 }
4379 4797
@@ -4474,6 +4892,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4474 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 4892 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4475 bbio->mirror_num = map->num_stripes + 1; 4893 bbio->mirror_num = map->num_stripes + 1;
4476 } 4894 }
4895 if (raid_map) {
4896 sort_parity_stripes(bbio, raid_map);
4897 *raid_map_ret = raid_map;
4898 }
4477out: 4899out:
4478 if (dev_replace_is_ongoing) 4900 if (dev_replace_is_ongoing)
4479 btrfs_dev_replace_unlock(dev_replace); 4901 btrfs_dev_replace_unlock(dev_replace);
@@ -4486,7 +4908,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4486 struct btrfs_bio **bbio_ret, int mirror_num) 4908 struct btrfs_bio **bbio_ret, int mirror_num)
4487{ 4909{
4488 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 4910 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
4489 mirror_num); 4911 mirror_num, NULL);
4490} 4912}
4491 4913
4492int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 4914int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -4500,6 +4922,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4500 u64 bytenr; 4922 u64 bytenr;
4501 u64 length; 4923 u64 length;
4502 u64 stripe_nr; 4924 u64 stripe_nr;
4925 u64 rmap_len;
4503 int i, j, nr = 0; 4926 int i, j, nr = 0;
4504 4927
4505 read_lock(&em_tree->lock); 4928 read_lock(&em_tree->lock);
@@ -4510,10 +4933,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4510 map = (struct map_lookup *)em->bdev; 4933 map = (struct map_lookup *)em->bdev;
4511 4934
4512 length = em->len; 4935 length = em->len;
4936 rmap_len = map->stripe_len;
4937
4513 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4938 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4514 do_div(length, map->num_stripes / map->sub_stripes); 4939 do_div(length, map->num_stripes / map->sub_stripes);
4515 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 4940 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
4516 do_div(length, map->num_stripes); 4941 do_div(length, map->num_stripes);
4942 else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4943 BTRFS_BLOCK_GROUP_RAID6)) {
4944 do_div(length, nr_data_stripes(map));
4945 rmap_len = map->stripe_len * nr_data_stripes(map);
4946 }
4517 4947
4518 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 4948 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
4519 BUG_ON(!buf); /* -ENOMEM */ 4949 BUG_ON(!buf); /* -ENOMEM */
@@ -4533,8 +4963,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4533 do_div(stripe_nr, map->sub_stripes); 4963 do_div(stripe_nr, map->sub_stripes);
4534 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4964 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4535 stripe_nr = stripe_nr * map->num_stripes + i; 4965 stripe_nr = stripe_nr * map->num_stripes + i;
4536 } 4966 } /* else if RAID[56], multiply by nr_data_stripes().
4537 bytenr = chunk_start + stripe_nr * map->stripe_len; 4967 * Alternatively, just use rmap_len below instead of
4968 * map->stripe_len */
4969
4970 bytenr = chunk_start + stripe_nr * rmap_len;
4538 WARN_ON(nr >= map->num_stripes); 4971 WARN_ON(nr >= map->num_stripes);
4539 for (j = 0; j < nr; j++) { 4972 for (j = 0; j < nr; j++) {
4540 if (buf[j] == bytenr) 4973 if (buf[j] == bytenr)
@@ -4548,7 +4981,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4548 4981
4549 *logical = buf; 4982 *logical = buf;
4550 *naddrs = nr; 4983 *naddrs = nr;
4551 *stripe_len = map->stripe_len; 4984 *stripe_len = rmap_len;
4552 4985
4553 free_extent_map(em); 4986 free_extent_map(em);
4554 return 0; 4987 return 0;
@@ -4622,7 +5055,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
4622 bio->bi_bdev = (struct block_device *) 5055 bio->bi_bdev = (struct block_device *)
4623 (unsigned long)bbio->mirror_num; 5056 (unsigned long)bbio->mirror_num;
4624 /* only send an error to the higher layers if it is 5057 /* only send an error to the higher layers if it is
4625 * beyond the tolerance of the multi-bio 5058 * beyond the tolerance of the btrfs bio
4626 */ 5059 */
4627 if (atomic_read(&bbio->error) > bbio->max_errors) { 5060 if (atomic_read(&bbio->error) > bbio->max_errors) {
4628 err = -EIO; 5061 err = -EIO;
@@ -4656,13 +5089,18 @@ struct async_sched {
4656 * This will add one bio to the pending list for a device and make sure 5089 * This will add one bio to the pending list for a device and make sure
4657 * the work struct is scheduled. 5090 * the work struct is scheduled.
4658 */ 5091 */
4659static noinline void schedule_bio(struct btrfs_root *root, 5092noinline void btrfs_schedule_bio(struct btrfs_root *root,
4660 struct btrfs_device *device, 5093 struct btrfs_device *device,
4661 int rw, struct bio *bio) 5094 int rw, struct bio *bio)
4662{ 5095{
4663 int should_queue = 1; 5096 int should_queue = 1;
4664 struct btrfs_pending_bios *pending_bios; 5097 struct btrfs_pending_bios *pending_bios;
4665 5098
5099 if (device->missing || !device->bdev) {
5100 bio_endio(bio, -EIO);
5101 return;
5102 }
5103
4666 /* don't bother with additional async steps for reads, right now */ 5104 /* don't bother with additional async steps for reads, right now */
4667 if (!(rw & REQ_WRITE)) { 5105 if (!(rw & REQ_WRITE)) {
4668 bio_get(bio); 5106 bio_get(bio);
@@ -4760,7 +5198,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4760#endif 5198#endif
4761 bio->bi_bdev = dev->bdev; 5199 bio->bi_bdev = dev->bdev;
4762 if (async) 5200 if (async)
4763 schedule_bio(root, dev, rw, bio); 5201 btrfs_schedule_bio(root, dev, rw, bio);
4764 else 5202 else
4765 btrfsic_submit_bio(rw, bio); 5203 btrfsic_submit_bio(rw, bio);
4766} 5204}
@@ -4819,6 +5257,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4819 u64 logical = (u64)bio->bi_sector << 9; 5257 u64 logical = (u64)bio->bi_sector << 9;
4820 u64 length = 0; 5258 u64 length = 0;
4821 u64 map_length; 5259 u64 map_length;
5260 u64 *raid_map = NULL;
4822 int ret; 5261 int ret;
4823 int dev_nr = 0; 5262 int dev_nr = 0;
4824 int total_devs = 1; 5263 int total_devs = 1;
@@ -4827,12 +5266,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4827 length = bio->bi_size; 5266 length = bio->bi_size;
4828 map_length = length; 5267 map_length = length;
4829 5268
4830 ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5269 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
4831 mirror_num); 5270 mirror_num, &raid_map);
4832 if (ret) 5271 if (ret) /* -ENOMEM */
4833 return ret; 5272 return ret;
4834 5273
4835 total_devs = bbio->num_stripes; 5274 total_devs = bbio->num_stripes;
5275 bbio->orig_bio = first_bio;
5276 bbio->private = first_bio->bi_private;
5277 bbio->end_io = first_bio->bi_end_io;
5278 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5279
5280 if (raid_map) {
5281 /* In this case, map_length has been set to the length of
5282 a single stripe; not the whole write */
5283 if (rw & WRITE) {
5284 return raid56_parity_write(root, bio, bbio,
5285 raid_map, map_length);
5286 } else {
5287 return raid56_parity_recover(root, bio, bbio,
5288 raid_map, map_length,
5289 mirror_num);
5290 }
5291 }
5292
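
[The block above is where the new RAID5/6 path plugs into bio submission: when __btrfs_map_block() hands back a raid_map, the bio is not split per device here; instead the whole request (with map_length set to a single stripe) goes to raid56_parity_write() or raid56_parity_recover() in raid56.c, which own parity computation and rebuild. A trivially simplified outline of that dispatch; only the two raid56.c entry-point names come from this series, everything else is a stub for illustration.]

#include <stdio.h>

/* stand-ins for the two entry points added by this series in raid56.c */
static int raid56_parity_write(void)   { puts("full-stripe write via raid56.c"); return 0; }
static int raid56_parity_recover(void) { puts("rebuild read via raid56.c");      return 0; }
static int submit_per_device(void)     { puts("normal per-device submission");   return 0; }

/* simplified dispatch mirroring btrfs_map_bio(): only raid5/6 I/O that got a
 * raid_map takes the parity path; everything else is split per device */
static int map_bio(int have_raid_map, int is_write)
{
	if (have_raid_map)
		return is_write ? raid56_parity_write()
				: raid56_parity_recover();
	return submit_per_device();
}

int main(void)
{
	map_bio(1, 1);   /* raid5/6 write    */
	map_bio(1, 0);   /* raid5/6 recover  */
	map_bio(0, 1);   /* e.g. raid1 write */
	return 0;
}
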
4836 if (map_length < length) { 5293 if (map_length < length) {
4837 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " 5294 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
4838 "len %llu\n", (unsigned long long)logical, 5295 "len %llu\n", (unsigned long long)logical,
@@ -4841,11 +5298,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4841 BUG(); 5298 BUG();
4842 } 5299 }
4843 5300
4844 bbio->orig_bio = first_bio;
4845 bbio->private = first_bio->bi_private;
4846 bbio->end_io = first_bio->bi_end_io;
4847 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4848
4849 while (dev_nr < total_devs) { 5301 while (dev_nr < total_devs) {
4850 dev = bbio->stripes[dev_nr].dev; 5302 dev = bbio->stripes[dev_nr].dev;
4851 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 5303 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d3c3939ac751..062d8604d35b 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -21,8 +21,8 @@
21 21
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h> 23#include <linux/sort.h>
24#include <linux/btrfs.h>
24#include "async-thread.h" 25#include "async-thread.h"
25#include "ioctl.h"
26 26
27#define BTRFS_STRIPE_LEN (64 * 1024) 27#define BTRFS_STRIPE_LEN (64 * 1024)
28 28
@@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
322 struct btrfs_device *tgtdev); 322 struct btrfs_device *tgtdev);
323int btrfs_scratch_superblock(struct btrfs_device *device); 323int btrfs_scratch_superblock(struct btrfs_device *device);
324 324void btrfs_schedule_bio(struct btrfs_root *root,
325 struct btrfs_device *device,
326 int rw, struct bio *bio);
327int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
328 u64 logical, u64 len, int mirror_num);
329unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
330 struct btrfs_mapping_tree *map_tree,
331 u64 logical);
325static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 332static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
326 int index) 333 int index)
327{ 334{