Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/Kconfig             |    3
-rw-r--r--  fs/btrfs/Makefile            |    2
-rw-r--r--  fs/btrfs/backref.c           |    5
-rw-r--r--  fs/btrfs/backref.h           |    2
-rw-r--r--  fs/btrfs/btrfs_inode.h       |   20
-rw-r--r--  fs/btrfs/check-integrity.c   |    3
-rw-r--r--  fs/btrfs/compression.c       |    4
-rw-r--r--  fs/btrfs/ctree.c             |   68
-rw-r--r--  fs/btrfs/ctree.h             |  150
-rw-r--r--  fs/btrfs/delayed-inode.c     |  147
-rw-r--r--  fs/btrfs/delayed-inode.h     |    1
-rw-r--r--  fs/btrfs/delayed-ref.c       |   82
-rw-r--r--  fs/btrfs/delayed-ref.h       |   52
-rw-r--r--  fs/btrfs/dev-replace.c       |    6
-rw-r--r--  fs/btrfs/disk-io.c           |  227
-rw-r--r--  fs/btrfs/disk-io.h           |    7
-rw-r--r--  fs/btrfs/extent-tree.c       |  578
-rw-r--r--  fs/btrfs/extent_io.c         |  138
-rw-r--r--  fs/btrfs/extent_io.h         |    8
-rw-r--r--  fs/btrfs/extent_map.c        |    1
-rw-r--r--  fs/btrfs/file-item.c         |   67
-rw-r--r--  fs/btrfs/file.c              |   57
-rw-r--r--  fs/btrfs/free-space-cache.c  |   62
-rw-r--r--  fs/btrfs/inode.c             | 1064
-rw-r--r--  fs/btrfs/ioctl.c             |  211
-rw-r--r--  fs/btrfs/ioctl.h             |  502
-rw-r--r--  fs/btrfs/locking.c           |    5
-rw-r--r--  fs/btrfs/ordered-data.c      |   98
-rw-r--r--  fs/btrfs/ordered-data.h      |   14
-rw-r--r--  fs/btrfs/print-tree.c        |    1
-rw-r--r--  fs/btrfs/qgroup.c            |   55
-rw-r--r--  fs/btrfs/raid56.c            | 2099
-rw-r--r--  fs/btrfs/raid56.h            |   51
-rw-r--r--  fs/btrfs/relocation.c        |    2
-rw-r--r--  fs/btrfs/scrub.c             |   10
-rw-r--r--  fs/btrfs/send.c              |   53
-rw-r--r--  fs/btrfs/send.h              |    1
-rw-r--r--  fs/btrfs/super.c             |   89
-rw-r--r--  fs/btrfs/sysfs.c             |    1
-rw-r--r--  fs/btrfs/transaction.c       |  151
-rw-r--r--  fs/btrfs/transaction.h       |    8
-rw-r--r--  fs/btrfs/tree-defrag.c       |   19
-rw-r--r--  fs/btrfs/tree-log.c          |  166
-rw-r--r--  fs/btrfs/ulist.c             |    2
-rw-r--r--  fs/btrfs/volumes.c           |  636
-rw-r--r--  fs/btrfs/volumes.h           |   11
46 files changed, 5421 insertions(+), 1518 deletions(-)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ccd25ba7a9ac..9a8622a5b867 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -5,6 +5,9 @@ config BTRFS_FS
 	select ZLIB_DEFLATE
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
+	select RAID6_PQ
+	select XOR_BLOCKS
+
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7df3e0f0ee51..3932224f99e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 04edf69be875..bd605c87adfd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -352,11 +352,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		err = __resolve_indirect_ref(fs_info, search_commit_root,
 					     time_seq, ref, parents,
 					     extent_item_pos);
-		if (err) {
-			if (ret == 0)
-				ret = err;
+		if (err)
 			continue;
-		}
 
 		/* we put the first parent into the ref at hand */
 		ULIST_ITER_INIT(&uiter);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index d61feca79455..310a7f6d09b1 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -19,7 +19,7 @@
 #ifndef __BTRFS_BACKREF__
 #define __BTRFS_BACKREF__
 
-#include "ioctl.h"
+#include <linux/btrfs.h>
 #include "ulist.h"
 #include "extent_io.h"
 
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2a8c242bc4f5..d9b97d4960e6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -40,6 +40,8 @@
 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
 #define BTRFS_INODE_NEEDS_FULL_SYNC		7
 #define BTRFS_INODE_COPY_EVERYTHING		8
+#define BTRFS_INODE_IN_DELALLOC_LIST		9
+#define BTRFS_INODE_READDIO_NEED_LOCK		10
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -216,4 +218,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	return 0;
 }
 
+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
+{
+	set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
+	smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
+{
+	smp_mb__before_clear_bit();
+	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+		  &BTRFS_I(inode)->runtime_flags);
+}
+
 #endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 11d47bfb62b4..18af6f48781a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -813,8 +813,7 @@ static int btrfsic_process_superblock_dev_mirror(
 		(bh->b_data + (dev_bytenr & 4095));
 
 	if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
-	    strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
-		    sizeof(super_tmp->magic)) ||
+	    super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) ||
 	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
 	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||
 	    btrfs_super_leafsize(super_tmp) != state->metablock_size ||
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 94ab2f80e7e3..15b94089abc4 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -372,7 +372,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		page = compressed_pages[pg_index];
 		page->mapping = inode->i_mapping;
 		if (bio->bi_size)
-			ret = io_tree->ops->merge_bio_hook(page, 0,
+			ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
 							   PAGE_CACHE_SIZE,
 							   bio, 0);
 		else
@@ -655,7 +655,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		page->index = em_start >> PAGE_CACHE_SHIFT;
 
 		if (comp_bio->bi_size)
-			ret = tree->ops->merge_bio_hook(page, 0,
+			ret = tree->ops->merge_bio_hook(READ, page, 0,
 							PAGE_CACHE_SIZE,
 							comp_bio, 0);
 		else
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index eea5da7a2b9a..ecd25a1b4e51 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1138,6 +1138,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 	switch (tm->op) {
 	case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
 		BUG_ON(tm->slot < n);
+		/* Fallthrough */
 	case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
 	case MOD_LOG_KEY_REMOVE:
 		btrfs_set_node_key(eb, &tm->key, tm->slot);
@@ -1222,7 +1223,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 
 	__tree_mod_log_rewind(eb_rewin, time_seq, tm);
 	WARN_ON(btrfs_header_nritems(eb_rewin) >
-		BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root));
+		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
 
 	return eb_rewin;
 }
@@ -1441,7 +1442,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
  */
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, int cache_only, u64 *last_ret,
+		       int start_slot, u64 *last_ret,
 		       struct btrfs_key *progress)
 {
 	struct extent_buffer *cur;
@@ -1461,8 +1462,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	struct btrfs_disk_key disk_key;
 
 	parent_level = btrfs_header_level(parent);
-	if (cache_only && parent_level != 1)
-		return 0;
 
 	WARN_ON(trans->transaction != root->fs_info->running_transaction);
 	WARN_ON(trans->transid != root->fs_info->generation);
@@ -1508,10 +1507,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		else
 			uptodate = 0;
 		if (!cur || !uptodate) {
-			if (cache_only) {
-				free_extent_buffer(cur);
-				continue;
-			}
 			if (!cur) {
 				cur = read_tree_block(root, blocknr,
 						      blocksize, gen);
@@ -4825,8 +4820,8 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 
 /*
  * A helper function to walk down the tree starting at min_key, and looking
- * for nodes or leaves that are either in cache or have a minimum
- * transaction id. This is used by the btree defrag code, and tree logging
+ * for nodes or leaves that are have a minimum transaction id.
+ * This is used by the btree defrag code, and tree logging
  *
  * This does not cow, but it does stuff the starting key it finds back
  * into min_key, so you can call btrfs_search_slot with cow=1 on the
@@ -4847,7 +4842,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  */
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 			 struct btrfs_key *max_key,
-			 struct btrfs_path *path, int cache_only,
+			 struct btrfs_path *path,
 			 u64 min_trans)
 {
 	struct extent_buffer *cur;
@@ -4887,15 +4882,12 @@ again:
 		if (sret && slot > 0)
 			slot--;
 		/*
-		 * check this node pointer against the cache_only and
-		 * min_trans parameters. If it isn't in cache or is too
-		 * old, skip to the next one.
+		 * check this node pointer against the min_trans parameters.
+		 * If it is too old, old, skip to the next one.
 		 */
 		while (slot < nritems) {
 			u64 blockptr;
 			u64 gen;
-			struct extent_buffer *tmp;
-			struct btrfs_disk_key disk_key;
 
 			blockptr = btrfs_node_blockptr(cur, slot);
 			gen = btrfs_node_ptr_generation(cur, slot);
@@ -4903,27 +4895,7 @@ again:
 				slot++;
 				continue;
 			}
-			if (!cache_only)
-				break;
-
-			if (max_key) {
-				btrfs_node_key(cur, &disk_key, slot);
-				if (comp_keys(&disk_key, max_key) >= 0) {
-					ret = 1;
-					goto out;
-				}
-			}
-
-			tmp = btrfs_find_tree_block(root, blockptr,
-					    btrfs_level_size(root, level - 1));
-
-			if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
-				free_extent_buffer(tmp);
-				break;
-			}
-			if (tmp)
-				free_extent_buffer(tmp);
-			slot++;
+			break;
 		}
 find_next_key:
 		/*
@@ -4934,7 +4906,7 @@ find_next_key:
 		path->slots[level] = slot;
 		btrfs_set_path_blocking(path);
 		sret = btrfs_find_next_key(root, path, min_key, level,
-					   cache_only, min_trans);
+					   min_trans);
 		if (sret == 0) {
 			btrfs_release_path(path);
 			goto again;
@@ -5399,8 +5371,7 @@ out:
 /*
  * this is similar to btrfs_next_leaf, but does not try to preserve
  * and fixup the path. It looks for and returns the next key in the
- * tree based on the current path and the cache_only and min_trans
- * parameters.
+ * tree based on the current path and the min_trans parameters.
  *
  * 0 is returned if another key is found, < 0 if there are any errors
  * and 1 is returned if there are no higher keys in the tree
@@ -5409,8 +5380,7 @@ out:
  * calling this function.
  */
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-			struct btrfs_key *key, int level,
-			int cache_only, u64 min_trans)
+			struct btrfs_key *key, int level, u64 min_trans)
 {
 	int slot;
 	struct extent_buffer *c;
@@ -5461,22 +5431,8 @@ next:
 		if (level == 0)
 			btrfs_item_key_to_cpu(c, key, slot);
 		else {
-			u64 blockptr = btrfs_node_blockptr(c, slot);
 			u64 gen = btrfs_node_ptr_generation(c, slot);
 
-			if (cache_only) {
-				struct extent_buffer *cur;
-				cur = btrfs_find_tree_block(root, blockptr,
-					    btrfs_level_size(root, level - 1));
-				if (!cur ||
-				    btrfs_buffer_uptodate(cur, gen, 1) <= 0) {
-					slot++;
-					if (cur)
-						free_extent_buffer(cur);
-					goto next;
-				}
-				free_extent_buffer(cur);
-			}
 			if (gen < min_trans) {
 				slot++;
 				goto next;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 547b7b05727f..0d82922179db 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -31,10 +31,10 @@
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
 #include <linux/pagemap.h>
+#include <linux/btrfs.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
-#include "ioctl.h"
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
@@ -46,7 +46,7 @@ extern struct kmem_cache *btrfs_path_cachep;
 extern struct kmem_cache *btrfs_free_space_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_BHRfS_M"
+#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
 
 #define BTRFS_MAX_MIRRORS 3
 
@@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 /* ioprio of readahead is set to idle */
 #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
 
+#define BTRFS_DIRTY_METADATA_THRESH	(32 * 1024 * 1024)
+
 /*
  * The key defines the order in the tree, and so it also defines (optimal)
  * block layout.
@@ -336,7 +338,10 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 /*
  * File system states
  */
+#define BTRFS_FS_STATE_ERROR		0
+#define BTRFS_FS_STATE_REMOUNTING	1
 
+/* Super block flags */
 /* Errors detected */
 #define BTRFS_SUPER_FLAG_ERROR		(1ULL << 2)
 
@@ -502,6 +507,7 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)
 
 #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
@@ -511,6 +517,7 @@ struct btrfs_super_block {
 	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
 	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
 	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
+	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
 	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
 
 /*
@@ -952,8 +959,20 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
 #define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
 #define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5		(1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6		(1 << 8)
 #define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE
-#define BTRFS_NR_RAID_TYPES		5
+
+enum btrfs_raid_types {
+	BTRFS_RAID_RAID10,
+	BTRFS_RAID_RAID1,
+	BTRFS_RAID_DUP,
+	BTRFS_RAID_RAID0,
+	BTRFS_RAID_SINGLE,
+	BTRFS_RAID_RAID5,
+	BTRFS_RAID_RAID6,
+	BTRFS_NR_RAID_TYPES
+};
 
 #define BTRFS_BLOCK_GROUP_TYPE_MASK	(BTRFS_BLOCK_GROUP_DATA |    \
 					 BTRFS_BLOCK_GROUP_SYSTEM |  \
@@ -961,6 +980,8 @@ struct btrfs_dev_replace_item {
 
 #define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \
 					 BTRFS_BLOCK_GROUP_RAID1 |   \
+					 BTRFS_BLOCK_GROUP_RAID5 |   \
+					 BTRFS_BLOCK_GROUP_RAID6 |   \
 					 BTRFS_BLOCK_GROUP_DUP |     \
 					 BTRFS_BLOCK_GROUP_RAID10)
 /*
@@ -1185,6 +1206,10 @@ struct btrfs_block_group_cache {
 	u64 flags;
 	u64 sectorsize;
 	u64 cache_generation;
+
+	/* for raid56, this is a full stripe, without parity */
+	unsigned long full_stripe_len;
+
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
@@ -1225,6 +1250,28 @@ struct seq_list {
 	u64 seq;
 };
 
+enum btrfs_orphan_cleanup_state {
+	ORPHAN_CLEANUP_STARTED	= 1,
+	ORPHAN_CLEANUP_DONE	= 2,
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+	struct list_head hash_list;
+	wait_queue_head_t wait;
+	spinlock_t lock;
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+	struct list_head stripe_cache;
+	spinlock_t cache_lock;
+	int cache_size;
+	struct btrfs_stripe_hash table[];
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
 /* fs_info */
 struct reloc_control;
 struct btrfs_device;
@@ -1250,6 +1297,7 @@ struct btrfs_fs_info {
 
 	/* block group cache stuff */
 	spinlock_t block_group_cache_lock;
+	u64 first_logical_byte;
 	struct rb_root block_group_cache_tree;
 
 	/* keep track of unallocated space */
@@ -1288,7 +1336,23 @@ struct btrfs_fs_info {
 	u64 last_trans_log_full_commit;
 	unsigned long mount_opt;
 	unsigned long compress_type:4;
+	/*
+	 * It is a suggestive number, the read side is safe even it gets a
+	 * wrong number because we will write out the data into a regular
+	 * extent. The write side(mount/remount) is under ->s_umount lock,
+	 * so it is also safe.
+	 */
 	u64 max_inline;
+	/*
+	 * Protected by ->chunk_mutex and sb->s_umount.
+	 *
+	 * The reason that we use two lock to protect it is because only
+	 * remount and mount operations can change it and these two operations
+	 * are under sb->s_umount, but the read side (chunk allocation) can not
+	 * acquire sb->s_umount or the deadlock would happen. So we use two
+	 * locks to protect it. On the write side, we must acquire two locks,
+	 * and on the read side, we just need acquire one of them.
	 */
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
@@ -1307,6 +1371,13 @@ struct btrfs_fs_info {
 	struct mutex cleaner_mutex;
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
+
+	/* this is used during read/modify/write to make sure
+	 * no two ios are trying to mod the same stripe at the same
+	 * time
+	 */
+	struct btrfs_stripe_hash_table *stripe_hash_table;
+
 	/*
 	 * this protects the ordered operations list only while we are
 	 * processing all of the entries on it. This way we make
@@ -1365,6 +1436,7 @@ struct btrfs_fs_info {
 	 */
 	struct list_head ordered_extents;
 
+	spinlock_t delalloc_lock;
 	/*
 	 * all of the inodes that have delalloc bytes. It is possible for
 	 * this list to be empty even when there is still dirty data=ordered
@@ -1373,13 +1445,6 @@ struct btrfs_fs_info {
 	struct list_head delalloc_inodes;
 
 	/*
-	 * special rename and truncate targets that must be on disk before
-	 * we're allowed to commit. This is basically the ext3 style
-	 * data=ordered list.
-	 */
-	struct list_head ordered_operations;
-
-	/*
 	 * there is a pool of worker threads for checksumming during writes
 	 * and a pool for checksumming after reads. This is because readers
 	 * can run with FS locks held, and the writers may be waiting for
@@ -1395,6 +1460,8 @@ struct btrfs_fs_info {
 	struct btrfs_workers flush_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_meta_workers;
+	struct btrfs_workers endio_raid56_workers;
+	struct btrfs_workers rmw_workers;
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers endio_freespace_worker;
@@ -1423,10 +1490,12 @@ struct btrfs_fs_info {
 
 	u64 total_pinned;
 
-	/* protected by the delalloc lock, used to keep from writing
-	 * metadata until there is a nice batch
-	 */
-	u64 dirty_metadata_bytes;
+	/* used to keep from writing metadata until there is a nice batch */
+	struct percpu_counter dirty_metadata_bytes;
+	struct percpu_counter delalloc_bytes;
+	s32 dirty_metadata_batch;
+	s32 delalloc_batch;
+
 	struct list_head dirty_cowonly_roots;
 
 	struct btrfs_fs_devices *fs_devices;
@@ -1442,9 +1511,6 @@ struct btrfs_fs_info {
 
 	struct reloc_control *reloc_ctl;
 
-	spinlock_t delalloc_lock;
-	u64 delalloc_bytes;
-
 	/* data_alloc_cluster is only used in ssd mode */
 	struct btrfs_free_cluster data_alloc_cluster;
 
@@ -1456,6 +1522,8 @@ struct btrfs_fs_info {
 	struct rb_root defrag_inodes;
 	atomic_t defrag_running;
 
+	/* Used to protect avail_{data, metadata, system}_alloc_bits */
+	seqlock_t profiles_lock;
 	/*
 	 * these three are in extended format (availability of single
 	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1520,7 +1588,7 @@ struct btrfs_fs_info {
 	u64 qgroup_seq;
 
 	/* filesystem state */
-	u64 fs_state;
+	unsigned long fs_state;
 
 	struct btrfs_delayed_root *delayed_root;
 
@@ -1623,6 +1691,9 @@ struct btrfs_root {
 
 	struct list_head root_list;
 
+	spinlock_t log_extents_lock[2];
+	struct list_head logged_list[2];
+
 	spinlock_t orphan_lock;
 	atomic_t orphan_inodes;
 	struct btrfs_block_rsv *orphan_block_rsv;
@@ -1832,6 +1903,7 @@ struct btrfs_ioctl_defrag_range_args {
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt)	((o) & BTRFS_MOUNT_##opt)
 #define btrfs_test_opt(root, opt)	((root)->fs_info->mount_opt & \
 					 BTRFS_MOUNT_##opt)
 /*
@@ -2936,8 +3008,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
 				    u64 bytenr, u64 num_bytes);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
@@ -3035,8 +3106,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 				  struct inode *inode);
 void btrfs_orphan_release_metadata(struct inode *inode);
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending);
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+				     struct btrfs_block_rsv *rsv,
+				     int nitems,
+				     u64 *qgroup_reserved);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+				      struct btrfs_block_rsv *rsv,
+				      u64 qgroup_reserved);
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
@@ -3092,10 +3168,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 			struct btrfs_key *key, int lowest_level,
-			int cache_only, u64 min_trans);
+			u64 min_trans);
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 			 struct btrfs_key *max_key,
-			 struct btrfs_path *path, int cache_only,
+			 struct btrfs_path *path,
 			 u64 min_trans);
 enum btrfs_compare_tree_result {
 	BTRFS_COMPARE_TREE_NEW,
@@ -3148,7 +3224,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
 			       int find_higher, int return_any);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, int cache_only, u64 *last_ret,
+		       int start_slot, u64 *last_ret,
 		       struct btrfs_key *progress);
 void btrfs_release_path(struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
@@ -3459,9 +3535,9 @@ int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *new_root, u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio, unsigned long bio_flags);
-
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+			 size_t size, struct bio *bio,
+			 unsigned long bio_flags);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
@@ -3543,7 +3619,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root, int cache_only);
+			struct btrfs_root *root);
 
 /* sysfs.c */
 int btrfs_init_sysfs(void);
@@ -3620,11 +3696,14 @@ __printf(5, 6)
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
 		   unsigned int line, int errno, const char *fmt, ...);
 
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic().  Otherwise we BUG() here.
+ */
 #define btrfs_panic(fs_info, errno, fmt, args...)			\
 do {									\
-	struct btrfs_fs_info *_i = (fs_info);				\
-	__btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args);	\
-	BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR));	\
+	__btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args);	\
+	BUG();								\
 } while (0)
 
 /* acl.c */
@@ -3745,4 +3824,11 @@ static inline int is_fstree(u64 rootid)
 		return 1;
 	return 0;
 }
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+	return signal_pending(current);
+}
+
+
 #endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 34836036f01b..0b278b117cbe 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -875,7 +875,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 				     struct btrfs_delayed_item *delayed_item)
 {
 	struct extent_buffer *leaf;
-	struct btrfs_item *item;
 	char *ptr;
 	int ret;
 
@@ -886,7 +885,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 
 	leaf = path->nodes[0];
 
-	item = btrfs_item_nr(leaf, path->slots[0]);
 	ptr = btrfs_item_ptr(leaf, path->slots[0], char);
 
 	write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
@@ -1065,32 +1063,25 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
 	}
 }
 
-static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
-				      struct btrfs_root *root,
-				      struct btrfs_path *path,
-				      struct btrfs_delayed_node *node)
+static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct btrfs_delayed_node *node)
 {
 	struct btrfs_key key;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
 	int ret;
 
-	mutex_lock(&node->mutex);
-	if (!node->inode_dirty) {
-		mutex_unlock(&node->mutex);
-		return 0;
-	}
-
 	key.objectid = node->inode_id;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
+
 	ret = btrfs_lookup_inode(trans, root, path, &key, 1);
 	if (ret > 0) {
 		btrfs_release_path(path);
-		mutex_unlock(&node->mutex);
 		return -ENOENT;
 	} else if (ret < 0) {
-		mutex_unlock(&node->mutex);
 		return ret;
 	}
 
@@ -1105,11 +1096,47 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 
 	btrfs_delayed_inode_release_metadata(root, node);
 	btrfs_release_delayed_inode(node);
-	mutex_unlock(&node->mutex);
 
 	return 0;
 }
 
+static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     struct btrfs_path *path,
+					     struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	mutex_lock(&node->mutex);
+	if (!node->inode_dirty) {
+		mutex_unlock(&node->mutex);
+		return 0;
+	}
+
+	ret = __btrfs_update_delayed_inode(trans, root, path, node);
+	mutex_unlock(&node->mutex);
+	return ret;
+}
+
+static inline int
+__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				   struct btrfs_path *path,
+				   struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_delete_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_update_delayed_inode(trans, node->root, path, node);
+	return ret;
+}
+
 /*
  * Called when committing the transaction.
  * Returns 0 on success.
@@ -1119,7 +1146,6 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root, int nr)
 {
-	struct btrfs_root *curr_root = root;
 	struct btrfs_delayed_root *delayed_root;
 	struct btrfs_delayed_node *curr_node, *prev_node;
 	struct btrfs_path *path;
@@ -1142,15 +1168,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 
 	curr_node = btrfs_first_delayed_node(delayed_root);
 	while (curr_node && (!count || (count && nr--))) {
-		curr_root = curr_node->root;
-		ret = btrfs_insert_delayed_items(trans, path, curr_root,
-						 curr_node);
-		if (!ret)
-			ret = btrfs_delete_delayed_items(trans, path,
-							 curr_root, curr_node);
-		if (!ret)
-			ret = btrfs_update_delayed_inode(trans, curr_root,
-							 path, curr_node);
+		ret = __btrfs_commit_inode_delayed_items(trans, path,
+							 curr_node);
 		if (ret) {
 			btrfs_release_delayed_node(curr_node);
 			curr_node = NULL;
@@ -1183,51 +1202,93 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
 	return __btrfs_run_delayed_items(trans, root, nr);
 }
 
-static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-					      struct btrfs_delayed_node *node)
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				     struct inode *inode)
 {
+	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
 	struct btrfs_path *path;
 	struct btrfs_block_rsv *block_rsv;
 	int ret;
 
+	if (!delayed_node)
+		return 0;
+
+	mutex_lock(&delayed_node->mutex);
+	if (!delayed_node->count) {
+		mutex_unlock(&delayed_node->mutex);
+		btrfs_release_delayed_node(delayed_node);
+		return 0;
+	}
+	mutex_unlock(&delayed_node->mutex);
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 	path->leave_spinning = 1;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
 
-	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
-	if (!ret)
-		ret = btrfs_delete_delayed_items(trans, path, node->root, node);
-	if (!ret)
-		ret = btrfs_update_delayed_inode(trans, node->root, path, node);
-	btrfs_free_path(path);
+	ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
 
+	btrfs_release_delayed_node(delayed_node);
+	btrfs_free_path(path);
 	trans->block_rsv = block_rsv;
+
 	return ret;
 }
 
-int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-				     struct inode *inode)
+int btrfs_commit_inode_delayed_inode(struct inode *inode)
 {
+	struct btrfs_trans_handle *trans;
 	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+	struct btrfs_path *path;
+	struct btrfs_block_rsv *block_rsv;
 	int ret;
 
 	if (!delayed_node)
 		return 0;
 
 	mutex_lock(&delayed_node->mutex);
-	if (!delayed_node->count) {
+	if (!delayed_node->inode_dirty) {
 		mutex_unlock(&delayed_node->mutex);
 		btrfs_release_delayed_node(delayed_node);
 		return 0;
 	}
 	mutex_unlock(&delayed_node->mutex);
 
-	ret = __btrfs_commit_inode_delayed_items(trans, delayed_node);
+	trans = btrfs_join_transaction(delayed_node->root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto trans_out;
+	}
+	path->leave_spinning = 1;
+
+	block_rsv = trans->block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+	mutex_lock(&delayed_node->mutex);
+	if (delayed_node->inode_dirty)
+		ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
+						   path, delayed_node);
+	else
+		ret = 0;
+	mutex_unlock(&delayed_node->mutex);
+
+	btrfs_free_path(path);
+	trans->block_rsv = block_rsv;
+trans_out:
+	btrfs_end_transaction(trans, delayed_node->root);
+	btrfs_btree_balance_dirty(delayed_node->root);
+out:
 	btrfs_release_delayed_node(delayed_node);
+
 	return ret;
 }
 
@@ -1258,7 +1319,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 	struct btrfs_root *root;
 	struct btrfs_block_rsv *block_rsv;
 	int need_requeue = 0;
-	int ret;
 
 	async_node = container_of(work, struct btrfs_async_delayed_node, work);
 
@@ -1277,14 +1337,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 	block_rsv = trans->block_rsv;
 	trans->block_rsv = &root->fs_info->delayed_block_rsv;
 
-	ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
-	if (!ret)
-		ret = btrfs_delete_delayed_items(trans, path, root,
-						 delayed_node);
-
-	if (!ret)
-		btrfs_update_delayed_inode(trans, root, path, delayed_node);
-
+	__btrfs_commit_inode_delayed_items(trans, path, delayed_node);
 	/*
 	 * Maybe new delayed items have been inserted, so we need requeue
 	 * the work. Besides that, we must dequeue the empty delayed nodes
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 4f808e1baeed..78b6ad0fc669 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -117,6 +117,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
 /* Used for evicting the inode. */
 void btrfs_remove_delayed_node(struct inode *inode);
 void btrfs_kill_delayed_inode_items(struct inode *inode);
+int btrfs_commit_inode_delayed_inode(struct inode *inode);
 
 
 int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ae9411773397..b7a0641ead77 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -23,6 +23,10 @@
23#include "delayed-ref.h" 23#include "delayed-ref.h"
24#include "transaction.h" 24#include "transaction.h"
25 25
26struct kmem_cache *btrfs_delayed_ref_head_cachep;
27struct kmem_cache *btrfs_delayed_tree_ref_cachep;
28struct kmem_cache *btrfs_delayed_data_ref_cachep;
29struct kmem_cache *btrfs_delayed_extent_op_cachep;
26/* 30/*
27 * delayed back reference update tracking. For subvolume trees 31 * delayed back reference update tracking. For subvolume trees
28 * we queue up extent allocations and backref maintenance for 32 * we queue up extent allocations and backref maintenance for
@@ -422,6 +426,14 @@ again:
422 return 1; 426 return 1;
423} 427}
424 428
429void btrfs_release_ref_cluster(struct list_head *cluster)
430{
431 struct list_head *pos, *q;
432
433 list_for_each_safe(pos, q, cluster)
434 list_del_init(pos);
435}
436
425/* 437/*
426 * helper function to update an extent delayed ref in the 438 * helper function to update an extent delayed ref in the
427 * rbtree. existing and update must both have the same 439 * rbtree. existing and update must both have the same
@@ -511,7 +523,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
511 ref->extent_op->flags_to_set; 523 ref->extent_op->flags_to_set;
512 existing_ref->extent_op->update_flags = 1; 524 existing_ref->extent_op->update_flags = 1;
513 } 525 }
514 kfree(ref->extent_op); 526 btrfs_free_delayed_extent_op(ref->extent_op);
515 } 527 }
516 } 528 }
517 /* 529 /*
@@ -592,7 +604,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
592 * we've updated the existing ref, free the newly 604 * we've updated the existing ref, free the newly
593 * allocated ref 605 * allocated ref
594 */ 606 */
595 kfree(head_ref); 607 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
596 } else { 608 } else {
597 delayed_refs->num_heads++; 609 delayed_refs->num_heads++;
598 delayed_refs->num_heads_ready++; 610 delayed_refs->num_heads_ready++;
@@ -653,7 +665,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
653 * we've updated the existing ref, free the newly 665 * we've updated the existing ref, free the newly
654 * allocated ref 666 * allocated ref
655 */ 667 */
656 kfree(full_ref); 668 kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
657 } else { 669 } else {
658 delayed_refs->num_entries++; 670 delayed_refs->num_entries++;
659 trans->delayed_ref_updates++; 671 trans->delayed_ref_updates++;
@@ -714,7 +726,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
714 * we've updated the existing ref, free the newly 726 * we've updated the existing ref, free the newly
715 * allocated ref 727 * allocated ref
716 */ 728 */
717 kfree(full_ref); 729 kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
718 } else { 730 } else {
719 delayed_refs->num_entries++; 731 delayed_refs->num_entries++;
720 trans->delayed_ref_updates++; 732 trans->delayed_ref_updates++;
@@ -738,13 +750,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
738 struct btrfs_delayed_ref_root *delayed_refs; 750 struct btrfs_delayed_ref_root *delayed_refs;
739 751
740 BUG_ON(extent_op && extent_op->is_data); 752 BUG_ON(extent_op && extent_op->is_data);
741 ref = kmalloc(sizeof(*ref), GFP_NOFS); 753 ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
742 if (!ref) 754 if (!ref)
743 return -ENOMEM; 755 return -ENOMEM;
744 756
745 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); 757 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
746 if (!head_ref) { 758 if (!head_ref) {
747 kfree(ref); 759 kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
748 return -ENOMEM; 760 return -ENOMEM;
749 } 761 }
750 762
@@ -786,13 +798,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
786 struct btrfs_delayed_ref_root *delayed_refs; 798 struct btrfs_delayed_ref_root *delayed_refs;
787 799
788 BUG_ON(extent_op && !extent_op->is_data); 800 BUG_ON(extent_op && !extent_op->is_data);
789 ref = kmalloc(sizeof(*ref), GFP_NOFS); 801 ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
790 if (!ref) 802 if (!ref)
791 return -ENOMEM; 803 return -ENOMEM;
792 804
793 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); 805 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
794 if (!head_ref) { 806 if (!head_ref) {
795 kfree(ref); 807 kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
796 return -ENOMEM; 808 return -ENOMEM;
797 } 809 }
798 810
@@ -826,7 +838,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
826 struct btrfs_delayed_ref_head *head_ref; 838 struct btrfs_delayed_ref_head *head_ref;
827 struct btrfs_delayed_ref_root *delayed_refs; 839 struct btrfs_delayed_ref_root *delayed_refs;
828 840
829 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); 841 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
830 if (!head_ref) 842 if (!head_ref)
831 return -ENOMEM; 843 return -ENOMEM;
832 844
@@ -860,3 +872,51 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
860 return btrfs_delayed_node_to_head(ref); 872 return btrfs_delayed_node_to_head(ref);
861 return NULL; 873 return NULL;
862} 874}
875
876void btrfs_delayed_ref_exit(void)
877{
878 if (btrfs_delayed_ref_head_cachep)
879 kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
880 if (btrfs_delayed_tree_ref_cachep)
881 kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
882 if (btrfs_delayed_data_ref_cachep)
883 kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
884 if (btrfs_delayed_extent_op_cachep)
885 kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
886}
887
888int btrfs_delayed_ref_init(void)
889{
890 btrfs_delayed_ref_head_cachep = kmem_cache_create(
891 "btrfs_delayed_ref_head",
892 sizeof(struct btrfs_delayed_ref_head), 0,
893 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
894 if (!btrfs_delayed_ref_head_cachep)
895 goto fail;
896
897 btrfs_delayed_tree_ref_cachep = kmem_cache_create(
898 "btrfs_delayed_tree_ref",
899 sizeof(struct btrfs_delayed_tree_ref), 0,
900 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
901 if (!btrfs_delayed_tree_ref_cachep)
902 goto fail;
903
904 btrfs_delayed_data_ref_cachep = kmem_cache_create(
905 "btrfs_delayed_data_ref",
906 sizeof(struct btrfs_delayed_data_ref), 0,
907 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
908 if (!btrfs_delayed_data_ref_cachep)
909 goto fail;
910
911 btrfs_delayed_extent_op_cachep = kmem_cache_create(
912 "btrfs_delayed_extent_op",
913 sizeof(struct btrfs_delayed_extent_op), 0,
914 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
915 if (!btrfs_delayed_extent_op_cachep)
916 goto fail;
917
918 return 0;
919fail:
920 btrfs_delayed_ref_exit();
921 return -ENOMEM;
922}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index c9d703693df0..f75fcaf79aeb 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -132,6 +132,15 @@ struct btrfs_delayed_ref_root {
 	unsigned long num_heads_ready;
 
 	/*
+	 * bumped when someone is making progress on the delayed
+	 * refs, so that other procs know they are just adding to
+	 * contention intead of helping
+	 */
+	atomic_t procs_running_refs;
+	atomic_t ref_seq;
+	wait_queue_head_t wait;
+
+	/*
 	 * set when the tree is flushing before a transaction commit,
 	 * used by the throttling code to decide if new updates need
 	 * to be run right away
@@ -141,12 +150,47 @@ struct btrfs_delayed_ref_root {
 	u64 run_delayed_start;
 };
 
+extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
+extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
+
+int btrfs_delayed_ref_init(void);
+void btrfs_delayed_ref_exit(void);
+
+static inline struct btrfs_delayed_extent_op *
+btrfs_alloc_delayed_extent_op(void)
+{
+	return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS);
+}
+
+static inline void
+btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
+{
+	if (op)
+		kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
+}
+
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
 {
 	WARN_ON(atomic_read(&ref->refs) == 0);
 	if (atomic_dec_and_test(&ref->refs)) {
 		WARN_ON(ref->in_tree);
-		kfree(ref);
+		switch (ref->type) {
+		case BTRFS_TREE_BLOCK_REF_KEY:
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY:
+		case BTRFS_SHARED_DATA_REF_KEY:
+			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+			break;
+		case 0:
+			kmem_cache_free(btrfs_delayed_ref_head_cachep, ref);
+			break;
+		default:
+			BUG();
+		}
 	}
 }
 
@@ -176,8 +220,14 @@ struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
 			   struct btrfs_delayed_ref_head *head);
+static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
+{
+	mutex_unlock(&head->mutex);
+}
+
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
 			   struct list_head *cluster, u64 search_start);
+void btrfs_release_ref_cluster(struct list_head *cluster);
 
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 			    struct btrfs_delayed_ref_root *delayed_refs,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 66dbc8dbddf7..7ba7b3900cb8 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -465,7 +465,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
465 * flush all outstanding I/O and inode extent mappings before the 465 * flush all outstanding I/O and inode extent mappings before the
466 * copy operation is declared as being finished 466 * copy operation is declared as being finished
467 */ 467 */
468 btrfs_start_delalloc_inodes(root, 0); 468 ret = btrfs_start_delalloc_inodes(root, 0);
469 if (ret) {
470 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
471 return ret;
472 }
469 btrfs_wait_ordered_extents(root, 0); 473 btrfs_wait_ordered_extents(root, 0);
470 474
471 trans = btrfs_start_transaction(root, 0); 475 trans = btrfs_start_transaction(root, 0);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a8f652dc940b..02369a3c162e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,7 @@
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h" 48#include "dev-replace.h"
49#include "raid56.h"
49 50
50#ifdef CONFIG_X86 51#ifdef CONFIG_X86
51#include <asm/cpufeature.h> 52#include <asm/cpufeature.h>
@@ -56,7 +57,8 @@ static void end_workqueue_fn(struct btrfs_work *work);
56static void free_fs_root(struct btrfs_root *root); 57static void free_fs_root(struct btrfs_root *root);
57static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, 58static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
58 int read_only); 59 int read_only);
59static void btrfs_destroy_ordered_operations(struct btrfs_root *root); 60static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
61 struct btrfs_root *root);
60static void btrfs_destroy_ordered_extents(struct btrfs_root *root); 62static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
61static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, 63static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
62 struct btrfs_root *root); 64 struct btrfs_root *root);
@@ -420,7 +422,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
420static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) 422static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
421{ 423{
422 struct extent_io_tree *tree; 424 struct extent_io_tree *tree;
423 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 425 u64 start = page_offset(page);
424 u64 found_start; 426 u64 found_start;
425 struct extent_buffer *eb; 427 struct extent_buffer *eb;
426 428
@@ -639,8 +641,15 @@ err:
639 btree_readahead_hook(root, eb, eb->start, ret); 641 btree_readahead_hook(root, eb, eb->start, ret);
640 } 642 }
641 643
642 if (ret) 644 if (ret) {
645 /*
646 * our io error hook is going to dec the io pages
 647 * again, so we have to make sure it has something
648 * to decrement
649 */
650 atomic_inc(&eb->io_pages);
643 clear_extent_buffer_uptodate(eb); 651 clear_extent_buffer_uptodate(eb);
652 }
644 free_extent_buffer(eb); 653 free_extent_buffer(eb);
645out: 654out:
646 return ret; 655 return ret;
@@ -654,6 +663,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
654 eb = (struct extent_buffer *)page->private; 663 eb = (struct extent_buffer *)page->private;
655 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 664 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
656 eb->read_mirror = failed_mirror; 665 eb->read_mirror = failed_mirror;
666 atomic_dec(&eb->io_pages);
657 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 667 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
658 btree_readahead_hook(root, eb, eb->start, -EIO); 668 btree_readahead_hook(root, eb, eb->start, -EIO);
659 return -EIO; /* we fixed nothing */ 669 return -EIO; /* we fixed nothing */
@@ -670,17 +680,23 @@ static void end_workqueue_bio(struct bio *bio, int err)
670 end_io_wq->work.flags = 0; 680 end_io_wq->work.flags = 0;
671 681
672 if (bio->bi_rw & REQ_WRITE) { 682 if (bio->bi_rw & REQ_WRITE) {
673 if (end_io_wq->metadata == 1) 683 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
674 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 684 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
675 &end_io_wq->work); 685 &end_io_wq->work);
676 else if (end_io_wq->metadata == 2) 686 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
677 btrfs_queue_worker(&fs_info->endio_freespace_worker, 687 btrfs_queue_worker(&fs_info->endio_freespace_worker,
678 &end_io_wq->work); 688 &end_io_wq->work);
689 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
690 btrfs_queue_worker(&fs_info->endio_raid56_workers,
691 &end_io_wq->work);
679 else 692 else
680 btrfs_queue_worker(&fs_info->endio_write_workers, 693 btrfs_queue_worker(&fs_info->endio_write_workers,
681 &end_io_wq->work); 694 &end_io_wq->work);
682 } else { 695 } else {
683 if (end_io_wq->metadata) 696 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
697 btrfs_queue_worker(&fs_info->endio_raid56_workers,
698 &end_io_wq->work);
699 else if (end_io_wq->metadata)
684 btrfs_queue_worker(&fs_info->endio_meta_workers, 700 btrfs_queue_worker(&fs_info->endio_meta_workers,
685 &end_io_wq->work); 701 &end_io_wq->work);
686 else 702 else
@@ -695,6 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
695 * 0 - if data 711 * 0 - if data
 696 * 1 - if normal metadata 712
697 * 2 - if writing to the free space cache area 713 * 2 - if writing to the free space cache area
714 * 3 - raid parity work
698 */ 715 */
699int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 716int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
700 int metadata) 717 int metadata)
@@ -946,18 +963,20 @@ static int btree_writepages(struct address_space *mapping,
946 struct writeback_control *wbc) 963 struct writeback_control *wbc)
947{ 964{
948 struct extent_io_tree *tree; 965 struct extent_io_tree *tree;
966 struct btrfs_fs_info *fs_info;
967 int ret;
968
949 tree = &BTRFS_I(mapping->host)->io_tree; 969 tree = &BTRFS_I(mapping->host)->io_tree;
950 if (wbc->sync_mode == WB_SYNC_NONE) { 970 if (wbc->sync_mode == WB_SYNC_NONE) {
951 struct btrfs_root *root = BTRFS_I(mapping->host)->root;
952 u64 num_dirty;
953 unsigned long thresh = 32 * 1024 * 1024;
954 971
955 if (wbc->for_kupdate) 972 if (wbc->for_kupdate)
956 return 0; 973 return 0;
957 974
975 fs_info = BTRFS_I(mapping->host)->root->fs_info;
958 /* this is a bit racy, but that's ok */ 976 /* this is a bit racy, but that's ok */
959 num_dirty = root->fs_info->dirty_metadata_bytes; 977 ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
960 if (num_dirty < thresh) 978 BTRFS_DIRTY_METADATA_THRESH);
979 if (ret < 0)
961 return 0; 980 return 0;
962 } 981 }
963 return btree_write_cache_pages(mapping, wbc); 982 return btree_write_cache_pages(mapping, wbc);
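
The writepages throttle above is the first user of the new dirty_metadata_bytes percpu counter: instead of reading a plain u64 under a spinlock, it asks percpu_counter_compare() whether the (approximate) total has crossed BTRFS_DIRTY_METADATA_THRESH. A generic sketch of the idiom, using placeholder names rather than the btrfs fields:

static struct percpu_counter dirty_bytes;	/* placeholder counter   */
#define DIRTY_THRESH	(32 * 1024 * 1024)	/* placeholder threshold */

static int example_setup(void)
{
	/* percpu_counter_init() can fail, so it needs a real error path */
	return percpu_counter_init(&dirty_bytes, 0);
}

static void example_account(s64 nbytes, s32 batch)
{
	/* additions stay CPU-local until they exceed the batch size */
	__percpu_counter_add(&dirty_bytes, nbytes, batch);
}

static bool example_over_threshold(void)
{
	/* approximate compare: only sums all CPUs when close to the limit */
	return percpu_counter_compare(&dirty_bytes, DIRTY_THRESH) > 0;
}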
@@ -1125,24 +1144,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
1125void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1144void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1126 struct extent_buffer *buf) 1145 struct extent_buffer *buf)
1127{ 1146{
1147 struct btrfs_fs_info *fs_info = root->fs_info;
1148
1128 if (btrfs_header_generation(buf) == 1149 if (btrfs_header_generation(buf) ==
1129 root->fs_info->running_transaction->transid) { 1150 fs_info->running_transaction->transid) {
1130 btrfs_assert_tree_locked(buf); 1151 btrfs_assert_tree_locked(buf);
1131 1152
1132 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { 1153 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
1133 spin_lock(&root->fs_info->delalloc_lock); 1154 __percpu_counter_add(&fs_info->dirty_metadata_bytes,
1134 if (root->fs_info->dirty_metadata_bytes >= buf->len) 1155 -buf->len,
1135 root->fs_info->dirty_metadata_bytes -= buf->len; 1156 fs_info->dirty_metadata_batch);
1136 else {
1137 spin_unlock(&root->fs_info->delalloc_lock);
1138 btrfs_panic(root->fs_info, -EOVERFLOW,
1139 "Can't clear %lu bytes from "
1140 " dirty_mdatadata_bytes (%llu)",
1141 buf->len,
1142 root->fs_info->dirty_metadata_bytes);
1143 }
1144 spin_unlock(&root->fs_info->delalloc_lock);
1145
1146 /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1157 /* ugh, clear_extent_buffer_dirty needs to lock the page */
1147 btrfs_set_lock_blocking(buf); 1158 btrfs_set_lock_blocking(buf);
1148 clear_extent_buffer_dirty(buf); 1159 clear_extent_buffer_dirty(buf);
@@ -1178,9 +1189,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1178 1189
1179 INIT_LIST_HEAD(&root->dirty_list); 1190 INIT_LIST_HEAD(&root->dirty_list);
1180 INIT_LIST_HEAD(&root->root_list); 1191 INIT_LIST_HEAD(&root->root_list);
1192 INIT_LIST_HEAD(&root->logged_list[0]);
1193 INIT_LIST_HEAD(&root->logged_list[1]);
1181 spin_lock_init(&root->orphan_lock); 1194 spin_lock_init(&root->orphan_lock);
1182 spin_lock_init(&root->inode_lock); 1195 spin_lock_init(&root->inode_lock);
1183 spin_lock_init(&root->accounting_lock); 1196 spin_lock_init(&root->accounting_lock);
1197 spin_lock_init(&root->log_extents_lock[0]);
1198 spin_lock_init(&root->log_extents_lock[1]);
1184 mutex_init(&root->objectid_mutex); 1199 mutex_init(&root->objectid_mutex);
1185 mutex_init(&root->log_mutex); 1200 mutex_init(&root->log_mutex);
1186 init_waitqueue_head(&root->log_writer_wait); 1201 init_waitqueue_head(&root->log_writer_wait);
@@ -2004,10 +2019,24 @@ int open_ctree(struct super_block *sb,
2004 goto fail_srcu; 2019 goto fail_srcu;
2005 } 2020 }
2006 2021
2022 ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
2023 if (ret) {
2024 err = ret;
2025 goto fail_bdi;
2026 }
2027 fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
2028 (1 + ilog2(nr_cpu_ids));
2029
2030 ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
2031 if (ret) {
2032 err = ret;
2033 goto fail_dirty_metadata_bytes;
2034 }
2035
2007 fs_info->btree_inode = new_inode(sb); 2036 fs_info->btree_inode = new_inode(sb);
2008 if (!fs_info->btree_inode) { 2037 if (!fs_info->btree_inode) {
2009 err = -ENOMEM; 2038 err = -ENOMEM;
2010 goto fail_bdi; 2039 goto fail_delalloc_bytes;
2011 } 2040 }
2012 2041
2013 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 2042 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2017,7 +2046,6 @@ int open_ctree(struct super_block *sb,
2017 INIT_LIST_HEAD(&fs_info->dead_roots); 2046 INIT_LIST_HEAD(&fs_info->dead_roots);
2018 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2047 INIT_LIST_HEAD(&fs_info->delayed_iputs);
2019 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 2048 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
2020 INIT_LIST_HEAD(&fs_info->ordered_operations);
2021 INIT_LIST_HEAD(&fs_info->caching_block_groups); 2049 INIT_LIST_HEAD(&fs_info->caching_block_groups);
2022 spin_lock_init(&fs_info->delalloc_lock); 2050 spin_lock_init(&fs_info->delalloc_lock);
2023 spin_lock_init(&fs_info->trans_lock); 2051 spin_lock_init(&fs_info->trans_lock);
@@ -2028,6 +2056,7 @@ int open_ctree(struct super_block *sb,
2028 spin_lock_init(&fs_info->tree_mod_seq_lock); 2056 spin_lock_init(&fs_info->tree_mod_seq_lock);
2029 rwlock_init(&fs_info->tree_mod_log_lock); 2057 rwlock_init(&fs_info->tree_mod_log_lock);
2030 mutex_init(&fs_info->reloc_mutex); 2058 mutex_init(&fs_info->reloc_mutex);
2059 seqlock_init(&fs_info->profiles_lock);
2031 2060
2032 init_completion(&fs_info->kobj_unregister); 2061 init_completion(&fs_info->kobj_unregister);
2033 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 2062 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2126,6 +2155,7 @@ int open_ctree(struct super_block *sb,
2126 2155
2127 spin_lock_init(&fs_info->block_group_cache_lock); 2156 spin_lock_init(&fs_info->block_group_cache_lock);
2128 fs_info->block_group_cache_tree = RB_ROOT; 2157 fs_info->block_group_cache_tree = RB_ROOT;
2158 fs_info->first_logical_byte = (u64)-1;
2129 2159
2130 extent_io_tree_init(&fs_info->freed_extents[0], 2160 extent_io_tree_init(&fs_info->freed_extents[0],
2131 fs_info->btree_inode->i_mapping); 2161 fs_info->btree_inode->i_mapping);
@@ -2165,6 +2195,12 @@ int open_ctree(struct super_block *sb,
2165 init_waitqueue_head(&fs_info->transaction_blocked_wait); 2195 init_waitqueue_head(&fs_info->transaction_blocked_wait);
2166 init_waitqueue_head(&fs_info->async_submit_wait); 2196 init_waitqueue_head(&fs_info->async_submit_wait);
2167 2197
2198 ret = btrfs_alloc_stripe_hash_table(fs_info);
2199 if (ret) {
2200 err = ret;
2201 goto fail_alloc;
2202 }
2203
2168 __setup_root(4096, 4096, 4096, 4096, tree_root, 2204 __setup_root(4096, 4096, 4096, 4096, tree_root,
2169 fs_info, BTRFS_ROOT_TREE_OBJECTID); 2205 fs_info, BTRFS_ROOT_TREE_OBJECTID);
2170 2206
@@ -2187,7 +2223,8 @@ int open_ctree(struct super_block *sb,
2187 goto fail_alloc; 2223 goto fail_alloc;
2188 2224
2189 /* check FS state, whether FS is broken. */ 2225 /* check FS state, whether FS is broken. */
2190 fs_info->fs_state |= btrfs_super_flags(disk_super); 2226 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
2227 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
2191 2228
2192 ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); 2229 ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
2193 if (ret) { 2230 if (ret) {
@@ -2261,6 +2298,8 @@ int open_ctree(struct super_block *sb,
2261 leafsize = btrfs_super_leafsize(disk_super); 2298 leafsize = btrfs_super_leafsize(disk_super);
2262 sectorsize = btrfs_super_sectorsize(disk_super); 2299 sectorsize = btrfs_super_sectorsize(disk_super);
2263 stripesize = btrfs_super_stripesize(disk_super); 2300 stripesize = btrfs_super_stripesize(disk_super);
2301 fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
2302 fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
2264 2303
2265 /* 2304 /*
2266 * mixed block groups end up with duplicate but slightly offset 2305 * mixed block groups end up with duplicate but slightly offset
@@ -2332,6 +2371,12 @@ int open_ctree(struct super_block *sb,
2332 btrfs_init_workers(&fs_info->endio_meta_write_workers, 2371 btrfs_init_workers(&fs_info->endio_meta_write_workers,
2333 "endio-meta-write", fs_info->thread_pool_size, 2372 "endio-meta-write", fs_info->thread_pool_size,
2334 &fs_info->generic_worker); 2373 &fs_info->generic_worker);
2374 btrfs_init_workers(&fs_info->endio_raid56_workers,
2375 "endio-raid56", fs_info->thread_pool_size,
2376 &fs_info->generic_worker);
2377 btrfs_init_workers(&fs_info->rmw_workers,
2378 "rmw", fs_info->thread_pool_size,
2379 &fs_info->generic_worker);
2335 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 2380 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
2336 fs_info->thread_pool_size, 2381 fs_info->thread_pool_size,
2337 &fs_info->generic_worker); 2382 &fs_info->generic_worker);
@@ -2350,6 +2395,8 @@ int open_ctree(struct super_block *sb,
2350 */ 2395 */
2351 fs_info->endio_workers.idle_thresh = 4; 2396 fs_info->endio_workers.idle_thresh = 4;
2352 fs_info->endio_meta_workers.idle_thresh = 4; 2397 fs_info->endio_meta_workers.idle_thresh = 4;
2398 fs_info->endio_raid56_workers.idle_thresh = 4;
2399 fs_info->rmw_workers.idle_thresh = 2;
2353 2400
2354 fs_info->endio_write_workers.idle_thresh = 2; 2401 fs_info->endio_write_workers.idle_thresh = 2;
2355 fs_info->endio_meta_write_workers.idle_thresh = 2; 2402 fs_info->endio_meta_write_workers.idle_thresh = 2;
@@ -2366,6 +2413,8 @@ int open_ctree(struct super_block *sb,
2366 ret |= btrfs_start_workers(&fs_info->fixup_workers); 2413 ret |= btrfs_start_workers(&fs_info->fixup_workers);
2367 ret |= btrfs_start_workers(&fs_info->endio_workers); 2414 ret |= btrfs_start_workers(&fs_info->endio_workers);
2368 ret |= btrfs_start_workers(&fs_info->endio_meta_workers); 2415 ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
2416 ret |= btrfs_start_workers(&fs_info->rmw_workers);
2417 ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
2369 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); 2418 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
2370 ret |= btrfs_start_workers(&fs_info->endio_write_workers); 2419 ret |= btrfs_start_workers(&fs_info->endio_write_workers);
2371 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); 2420 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
@@ -2390,8 +2439,7 @@ int open_ctree(struct super_block *sb,
2390 sb->s_blocksize = sectorsize; 2439 sb->s_blocksize = sectorsize;
2391 sb->s_blocksize_bits = blksize_bits(sectorsize); 2440 sb->s_blocksize_bits = blksize_bits(sectorsize);
2392 2441
2393 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, 2442 if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) {
2394 sizeof(disk_super->magic))) {
2395 printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); 2443 printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
2396 goto fail_sb_buffer; 2444 goto fail_sb_buffer;
2397 } 2445 }
@@ -2694,13 +2742,13 @@ fail_cleaner:
2694 * kthreads 2742 * kthreads
2695 */ 2743 */
2696 filemap_write_and_wait(fs_info->btree_inode->i_mapping); 2744 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
2697 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2698 2745
2699fail_block_groups: 2746fail_block_groups:
2700 btrfs_free_block_groups(fs_info); 2747 btrfs_free_block_groups(fs_info);
2701 2748
2702fail_tree_roots: 2749fail_tree_roots:
2703 free_root_pointers(fs_info, 1); 2750 free_root_pointers(fs_info, 1);
2751 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2704 2752
2705fail_sb_buffer: 2753fail_sb_buffer:
2706 btrfs_stop_workers(&fs_info->generic_worker); 2754 btrfs_stop_workers(&fs_info->generic_worker);
@@ -2710,6 +2758,8 @@ fail_sb_buffer:
2710 btrfs_stop_workers(&fs_info->workers); 2758 btrfs_stop_workers(&fs_info->workers);
2711 btrfs_stop_workers(&fs_info->endio_workers); 2759 btrfs_stop_workers(&fs_info->endio_workers);
2712 btrfs_stop_workers(&fs_info->endio_meta_workers); 2760 btrfs_stop_workers(&fs_info->endio_meta_workers);
2761 btrfs_stop_workers(&fs_info->endio_raid56_workers);
2762 btrfs_stop_workers(&fs_info->rmw_workers);
2713 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2763 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2714 btrfs_stop_workers(&fs_info->endio_write_workers); 2764 btrfs_stop_workers(&fs_info->endio_write_workers);
2715 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2765 btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -2721,13 +2771,17 @@ fail_alloc:
2721fail_iput: 2771fail_iput:
2722 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2772 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2723 2773
2724 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2725 iput(fs_info->btree_inode); 2774 iput(fs_info->btree_inode);
2775fail_delalloc_bytes:
2776 percpu_counter_destroy(&fs_info->delalloc_bytes);
2777fail_dirty_metadata_bytes:
2778 percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
2726fail_bdi: 2779fail_bdi:
2727 bdi_destroy(&fs_info->bdi); 2780 bdi_destroy(&fs_info->bdi);
2728fail_srcu: 2781fail_srcu:
2729 cleanup_srcu_struct(&fs_info->subvol_srcu); 2782 cleanup_srcu_struct(&fs_info->subvol_srcu);
2730fail: 2783fail:
2784 btrfs_free_stripe_hash_table(fs_info);
2731 btrfs_close_devices(fs_info->fs_devices); 2785 btrfs_close_devices(fs_info->fs_devices);
2732 return err; 2786 return err;
2733 2787
@@ -2795,8 +2849,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
2795 2849
2796 super = (struct btrfs_super_block *)bh->b_data; 2850 super = (struct btrfs_super_block *)bh->b_data;
2797 if (btrfs_super_bytenr(super) != bytenr || 2851 if (btrfs_super_bytenr(super) != bytenr ||
2798 strncmp((char *)(&super->magic), BTRFS_MAGIC, 2852 super->magic != cpu_to_le64(BTRFS_MAGIC)) {
2799 sizeof(super->magic))) {
2800 brelse(bh); 2853 brelse(bh);
2801 continue; 2854 continue;
2802 } 2855 }
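
Both superblock checks above (in open_ctree and btrfs_read_dev_super) now compare the on-disk magic as a single little-endian 64-bit value instead of strncmp'ing raw bytes. Because the disk field is stored little-endian, converting the host constant once with cpu_to_le64() keeps the test correct on big-endian machines and lets the compiler fold the conversion; a minimal illustration (the comparison is as in the hunks, the helper wrapping it is made up):

static bool magic_matches(const struct btrfs_super_block *sb)
{
	/* the constant is converted at compile time; no per-byte compare */
	return sb->magic == cpu_to_le64(BTRFS_MAGIC);
}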
@@ -3076,11 +3129,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
3076 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) 3129 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
3077 == 0))) 3130 == 0)))
3078 num_tolerated_disk_barrier_failures = 0; 3131 num_tolerated_disk_barrier_failures = 0;
3079 else if (num_tolerated_disk_barrier_failures > 1 3132 else if (num_tolerated_disk_barrier_failures > 1) {
3080 && 3133 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3081 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3134 BTRFS_BLOCK_GROUP_RAID5 |
3082 BTRFS_BLOCK_GROUP_RAID10))) 3135 BTRFS_BLOCK_GROUP_RAID10)) {
3083 num_tolerated_disk_barrier_failures = 1; 3136 num_tolerated_disk_barrier_failures = 1;
3137 } else if (flags &
3138 BTRFS_BLOCK_GROUP_RAID5) {
3139 num_tolerated_disk_barrier_failures = 2;
3140 }
3141 }
3084 } 3142 }
3085 } 3143 }
3086 up_read(&sinfo->groups_sem); 3144 up_read(&sinfo->groups_sem);
@@ -3195,6 +3253,11 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
3195 if (btrfs_root_refs(&root->root_item) == 0) 3253 if (btrfs_root_refs(&root->root_item) == 0)
3196 synchronize_srcu(&fs_info->subvol_srcu); 3254 synchronize_srcu(&fs_info->subvol_srcu);
3197 3255
3256 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
3257 btrfs_free_log(NULL, root);
3258 btrfs_free_log_root_tree(NULL, fs_info);
3259 }
3260
3198 __btrfs_remove_free_space_cache(root->free_ino_pinned); 3261 __btrfs_remove_free_space_cache(root->free_ino_pinned);
3199 __btrfs_remove_free_space_cache(root->free_ino_ctl); 3262 __btrfs_remove_free_space_cache(root->free_ino_ctl);
3200 free_fs_root(root); 3263 free_fs_root(root);
@@ -3339,7 +3402,7 @@ int close_ctree(struct btrfs_root *root)
3339 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 3402 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
3340 } 3403 }
3341 3404
3342 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 3405 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3343 btrfs_error_commit_super(root); 3406 btrfs_error_commit_super(root);
3344 3407
3345 btrfs_put_block_group_cache(fs_info); 3408 btrfs_put_block_group_cache(fs_info);
@@ -3352,9 +3415,9 @@ int close_ctree(struct btrfs_root *root)
3352 3415
3353 btrfs_free_qgroup_config(root->fs_info); 3416 btrfs_free_qgroup_config(root->fs_info);
3354 3417
3355 if (fs_info->delalloc_bytes) { 3418 if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
3356 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 3419 printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n",
3357 (unsigned long long)fs_info->delalloc_bytes); 3420 percpu_counter_sum(&fs_info->delalloc_bytes));
3358 } 3421 }
3359 3422
3360 free_extent_buffer(fs_info->extent_root->node); 3423 free_extent_buffer(fs_info->extent_root->node);
@@ -3384,6 +3447,8 @@ int close_ctree(struct btrfs_root *root)
3384 btrfs_stop_workers(&fs_info->workers); 3447 btrfs_stop_workers(&fs_info->workers);
3385 btrfs_stop_workers(&fs_info->endio_workers); 3448 btrfs_stop_workers(&fs_info->endio_workers);
3386 btrfs_stop_workers(&fs_info->endio_meta_workers); 3449 btrfs_stop_workers(&fs_info->endio_meta_workers);
3450 btrfs_stop_workers(&fs_info->endio_raid56_workers);
3451 btrfs_stop_workers(&fs_info->rmw_workers);
3387 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 3452 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
3388 btrfs_stop_workers(&fs_info->endio_write_workers); 3453 btrfs_stop_workers(&fs_info->endio_write_workers);
3389 btrfs_stop_workers(&fs_info->endio_freespace_worker); 3454 btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -3401,9 +3466,13 @@ int close_ctree(struct btrfs_root *root)
3401 btrfs_close_devices(fs_info->fs_devices); 3466 btrfs_close_devices(fs_info->fs_devices);
3402 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3467 btrfs_mapping_tree_free(&fs_info->mapping_tree);
3403 3468
3469 percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
3470 percpu_counter_destroy(&fs_info->delalloc_bytes);
3404 bdi_destroy(&fs_info->bdi); 3471 bdi_destroy(&fs_info->bdi);
3405 cleanup_srcu_struct(&fs_info->subvol_srcu); 3472 cleanup_srcu_struct(&fs_info->subvol_srcu);
3406 3473
3474 btrfs_free_stripe_hash_table(fs_info);
3475
3407 return 0; 3476 return 0;
3408} 3477}
3409 3478
@@ -3443,11 +3512,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3443 (unsigned long long)transid, 3512 (unsigned long long)transid,
3444 (unsigned long long)root->fs_info->generation); 3513 (unsigned long long)root->fs_info->generation);
3445 was_dirty = set_extent_buffer_dirty(buf); 3514 was_dirty = set_extent_buffer_dirty(buf);
3446 if (!was_dirty) { 3515 if (!was_dirty)
3447 spin_lock(&root->fs_info->delalloc_lock); 3516 __percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
3448 root->fs_info->dirty_metadata_bytes += buf->len; 3517 buf->len,
3449 spin_unlock(&root->fs_info->delalloc_lock); 3518 root->fs_info->dirty_metadata_batch);
3450 }
3451} 3519}
3452 3520
3453static void __btrfs_btree_balance_dirty(struct btrfs_root *root, 3521static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
@@ -3457,8 +3525,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3457 * looks as though older kernels can get into trouble with 3525 * looks as though older kernels can get into trouble with
3458 * this code, they end up stuck in balance_dirty_pages forever 3526 * this code, they end up stuck in balance_dirty_pages forever
3459 */ 3527 */
3460 u64 num_dirty; 3528 int ret;
3461 unsigned long thresh = 32 * 1024 * 1024;
3462 3529
3463 if (current->flags & PF_MEMALLOC) 3530 if (current->flags & PF_MEMALLOC)
3464 return; 3531 return;
@@ -3466,9 +3533,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3466 if (flush_delayed) 3533 if (flush_delayed)
3467 btrfs_balance_delayed_items(root); 3534 btrfs_balance_delayed_items(root);
3468 3535
3469 num_dirty = root->fs_info->dirty_metadata_bytes; 3536 ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
3470 3537 BTRFS_DIRTY_METADATA_THRESH);
3471 if (num_dirty > thresh) { 3538 if (ret > 0) {
3472 balance_dirty_pages_ratelimited( 3539 balance_dirty_pages_ratelimited(
3473 root->fs_info->btree_inode->i_mapping); 3540 root->fs_info->btree_inode->i_mapping);
3474 } 3541 }
@@ -3518,7 +3585,8 @@ void btrfs_error_commit_super(struct btrfs_root *root)
3518 btrfs_cleanup_transaction(root); 3585 btrfs_cleanup_transaction(root);
3519} 3586}
3520 3587
3521static void btrfs_destroy_ordered_operations(struct btrfs_root *root) 3588static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3589 struct btrfs_root *root)
3522{ 3590{
3523 struct btrfs_inode *btrfs_inode; 3591 struct btrfs_inode *btrfs_inode;
3524 struct list_head splice; 3592 struct list_head splice;
@@ -3528,7 +3596,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
3528 mutex_lock(&root->fs_info->ordered_operations_mutex); 3596 mutex_lock(&root->fs_info->ordered_operations_mutex);
3529 spin_lock(&root->fs_info->ordered_extent_lock); 3597 spin_lock(&root->fs_info->ordered_extent_lock);
3530 3598
3531 list_splice_init(&root->fs_info->ordered_operations, &splice); 3599 list_splice_init(&t->ordered_operations, &splice);
3532 while (!list_empty(&splice)) { 3600 while (!list_empty(&splice)) {
3533 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 3601 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
3534 ordered_operations); 3602 ordered_operations);
@@ -3544,35 +3612,16 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
3544 3612
3545static void btrfs_destroy_ordered_extents(struct btrfs_root *root) 3613static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
3546{ 3614{
3547 struct list_head splice;
3548 struct btrfs_ordered_extent *ordered; 3615 struct btrfs_ordered_extent *ordered;
3549 struct inode *inode;
3550
3551 INIT_LIST_HEAD(&splice);
3552 3616
3553 spin_lock(&root->fs_info->ordered_extent_lock); 3617 spin_lock(&root->fs_info->ordered_extent_lock);
3554 3618 /*
3555 list_splice_init(&root->fs_info->ordered_extents, &splice); 3619 * This will just short circuit the ordered completion stuff which will
3556 while (!list_empty(&splice)) { 3620 * make sure the ordered extent gets properly cleaned up.
3557 ordered = list_entry(splice.next, struct btrfs_ordered_extent, 3621 */
3558 root_extent_list); 3622 list_for_each_entry(ordered, &root->fs_info->ordered_extents,
3559 3623 root_extent_list)
3560 list_del_init(&ordered->root_extent_list); 3624 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
3561 atomic_inc(&ordered->refs);
3562
3563 /* the inode may be getting freed (in sys_unlink path). */
3564 inode = igrab(ordered->inode);
3565
3566 spin_unlock(&root->fs_info->ordered_extent_lock);
3567 if (inode)
3568 iput(inode);
3569
3570 atomic_set(&ordered->refs, 1);
3571 btrfs_put_ordered_extent(ordered);
3572
3573 spin_lock(&root->fs_info->ordered_extent_lock);
3574 }
3575
3576 spin_unlock(&root->fs_info->ordered_extent_lock); 3625 spin_unlock(&root->fs_info->ordered_extent_lock);
3577} 3626}
3578 3627
@@ -3594,11 +3643,11 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3594 } 3643 }
3595 3644
3596 while ((node = rb_first(&delayed_refs->root)) != NULL) { 3645 while ((node = rb_first(&delayed_refs->root)) != NULL) {
3597 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 3646 struct btrfs_delayed_ref_head *head = NULL;
3598 3647
3648 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3599 atomic_set(&ref->refs, 1); 3649 atomic_set(&ref->refs, 1);
3600 if (btrfs_delayed_ref_is_head(ref)) { 3650 if (btrfs_delayed_ref_is_head(ref)) {
3601 struct btrfs_delayed_ref_head *head;
3602 3651
3603 head = btrfs_delayed_node_to_head(ref); 3652 head = btrfs_delayed_node_to_head(ref);
3604 if (!mutex_trylock(&head->mutex)) { 3653 if (!mutex_trylock(&head->mutex)) {
@@ -3614,16 +3663,18 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3614 continue; 3663 continue;
3615 } 3664 }
3616 3665
3617 kfree(head->extent_op); 3666 btrfs_free_delayed_extent_op(head->extent_op);
3618 delayed_refs->num_heads--; 3667 delayed_refs->num_heads--;
3619 if (list_empty(&head->cluster)) 3668 if (list_empty(&head->cluster))
3620 delayed_refs->num_heads_ready--; 3669 delayed_refs->num_heads_ready--;
3621 list_del_init(&head->cluster); 3670 list_del_init(&head->cluster);
3622 } 3671 }
3672
3623 ref->in_tree = 0; 3673 ref->in_tree = 0;
3624 rb_erase(&ref->rb_node, &delayed_refs->root); 3674 rb_erase(&ref->rb_node, &delayed_refs->root);
3625 delayed_refs->num_entries--; 3675 delayed_refs->num_entries--;
3626 3676 if (head)
3677 mutex_unlock(&head->mutex);
3627 spin_unlock(&delayed_refs->lock); 3678 spin_unlock(&delayed_refs->lock);
3628 btrfs_put_delayed_ref(ref); 3679 btrfs_put_delayed_ref(ref);
3629 3680
@@ -3671,6 +3722,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
3671 delalloc_inodes); 3722 delalloc_inodes);
3672 3723
3673 list_del_init(&btrfs_inode->delalloc_inodes); 3724 list_del_init(&btrfs_inode->delalloc_inodes);
3725 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
3726 &btrfs_inode->runtime_flags);
3674 3727
3675 btrfs_invalidate_inodes(btrfs_inode->root); 3728 btrfs_invalidate_inodes(btrfs_inode->root);
3676 } 3729 }
@@ -3823,10 +3876,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
3823 3876
3824 while (!list_empty(&list)) { 3877 while (!list_empty(&list)) {
3825 t = list_entry(list.next, struct btrfs_transaction, list); 3878 t = list_entry(list.next, struct btrfs_transaction, list);
3826 if (!t)
3827 break;
3828 3879
3829 btrfs_destroy_ordered_operations(root); 3880 btrfs_destroy_ordered_operations(t, root);
3830 3881
3831 btrfs_destroy_ordered_extents(root); 3882 btrfs_destroy_ordered_extents(root);
3832 3883
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 305c33efb0e3..034d7dc552b2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,6 +25,13 @@
25#define BTRFS_SUPER_MIRROR_MAX 3 25#define BTRFS_SUPER_MIRROR_MAX 3
26#define BTRFS_SUPER_MIRROR_SHIFT 12 26#define BTRFS_SUPER_MIRROR_SHIFT 12
27 27
28enum {
29 BTRFS_WQ_ENDIO_DATA = 0,
30 BTRFS_WQ_ENDIO_METADATA = 1,
31 BTRFS_WQ_ENDIO_FREE_SPACE = 2,
32 BTRFS_WQ_ENDIO_RAID56 = 3,
33};
34
28static inline u64 btrfs_sb_offset(int mirror) 35static inline u64 btrfs_sb_offset(int mirror)
29{ 36{
30 u64 start = 16 * 1024; 37 u64 start = 16 * 1024;
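
The new enum gives names to the metadata values that btrfs_bio_wq_end_io() previously documented as bare 0/1/2 (see the comment updated in disk-io.c above) and adds BTRFS_WQ_ENDIO_RAID56 so parity work completions are pushed onto the dedicated endio_raid56_workers pool. A sketch of a submission-side call, reusing the signature shown in this patch; the surrounding submit path is assumed:

	/* route this bio's end_io work to the raid56 workqueue */
	ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
	if (ret)
		return ret;	/* typically -ENOMEM when the work item cannot be allocated */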
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cf54bdfee334..3e074dab2d57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
31#include "print-tree.h" 31#include "print-tree.h"
32#include "transaction.h" 32#include "transaction.h"
33#include "volumes.h" 33#include "volumes.h"
34#include "raid56.h"
34#include "locking.h" 35#include "locking.h"
35#include "free-space-cache.h" 36#include "free-space-cache.h"
36#include "math.h" 37#include "math.h"
@@ -72,8 +73,7 @@ enum {
72 RESERVE_ALLOC_NO_ACCOUNT = 2, 73 RESERVE_ALLOC_NO_ACCOUNT = 2,
73}; 74};
74 75
75static int update_block_group(struct btrfs_trans_handle *trans, 76static int update_block_group(struct btrfs_root *root,
76 struct btrfs_root *root,
77 u64 bytenr, u64 num_bytes, int alloc); 77 u64 bytenr, u64 num_bytes, int alloc);
78static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 78static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
79 struct btrfs_root *root, 79 struct btrfs_root *root,
@@ -103,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
103 int dump_block_groups); 103 int dump_block_groups);
104static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 104static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
105 u64 num_bytes, int reserve); 105 u64 num_bytes, int reserve);
106static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
107 u64 num_bytes);
106 108
107static noinline int 109static noinline int
108block_group_cache_done(struct btrfs_block_group_cache *cache) 110block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -162,6 +164,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
162 rb_link_node(&block_group->cache_node, parent, p); 164 rb_link_node(&block_group->cache_node, parent, p);
163 rb_insert_color(&block_group->cache_node, 165 rb_insert_color(&block_group->cache_node,
164 &info->block_group_cache_tree); 166 &info->block_group_cache_tree);
167
168 if (info->first_logical_byte > block_group->key.objectid)
169 info->first_logical_byte = block_group->key.objectid;
170
165 spin_unlock(&info->block_group_cache_lock); 171 spin_unlock(&info->block_group_cache_lock);
166 172
167 return 0; 173 return 0;
@@ -203,8 +209,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
203 break; 209 break;
204 } 210 }
205 } 211 }
206 if (ret) 212 if (ret) {
207 btrfs_get_block_group(ret); 213 btrfs_get_block_group(ret);
214 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
215 info->first_logical_byte = ret->key.objectid;
216 }
208 spin_unlock(&info->block_group_cache_lock); 217 spin_unlock(&info->block_group_cache_lock);
209 218
210 return ret; 219 return ret;
@@ -468,8 +477,6 @@ out:
468} 477}
469 478
470static int cache_block_group(struct btrfs_block_group_cache *cache, 479static int cache_block_group(struct btrfs_block_group_cache *cache,
471 struct btrfs_trans_handle *trans,
472 struct btrfs_root *root,
473 int load_cache_only) 480 int load_cache_only)
474{ 481{
475 DEFINE_WAIT(wait); 482 DEFINE_WAIT(wait);
@@ -527,12 +534,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
527 cache->cached = BTRFS_CACHE_FAST; 534 cache->cached = BTRFS_CACHE_FAST;
528 spin_unlock(&cache->lock); 535 spin_unlock(&cache->lock);
529 536
530 /*
531 * We can't do the read from on-disk cache during a commit since we need
532 * to have the normal tree locking. Also if we are currently trying to
533 * allocate blocks for the tree root we can't do the fast caching since
534 * we likely hold important locks.
535 */
536 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 537 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
537 ret = load_free_space_cache(fs_info, cache); 538 ret = load_free_space_cache(fs_info, cache);
538 539
@@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1852 *actual_bytes = discarded_bytes; 1853 *actual_bytes = discarded_bytes;
1853 1854
1854 1855
1856 if (ret == -EOPNOTSUPP)
1857 ret = 0;
1855 return ret; 1858 return ret;
1856} 1859}
1857 1860
@@ -2143,7 +2146,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2143 node->num_bytes); 2146 node->num_bytes);
2144 } 2147 }
2145 } 2148 }
2146 mutex_unlock(&head->mutex);
2147 return ret; 2149 return ret;
2148 } 2150 }
2149 2151
@@ -2258,7 +2260,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2258 * process of being added. Don't run this ref yet. 2260 * process of being added. Don't run this ref yet.
2259 */ 2261 */
2260 list_del_init(&locked_ref->cluster); 2262 list_del_init(&locked_ref->cluster);
2261 mutex_unlock(&locked_ref->mutex); 2263 btrfs_delayed_ref_unlock(locked_ref);
2262 locked_ref = NULL; 2264 locked_ref = NULL;
2263 delayed_refs->num_heads_ready++; 2265 delayed_refs->num_heads_ready++;
2264 spin_unlock(&delayed_refs->lock); 2266 spin_unlock(&delayed_refs->lock);
@@ -2285,7 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2285 ref = &locked_ref->node; 2287 ref = &locked_ref->node;
2286 2288
2287 if (extent_op && must_insert_reserved) { 2289 if (extent_op && must_insert_reserved) {
2288 kfree(extent_op); 2290 btrfs_free_delayed_extent_op(extent_op);
2289 extent_op = NULL; 2291 extent_op = NULL;
2290 } 2292 }
2291 2293
@@ -2294,28 +2296,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2294 2296
2295 ret = run_delayed_extent_op(trans, root, 2297 ret = run_delayed_extent_op(trans, root,
2296 ref, extent_op); 2298 ref, extent_op);
2297 kfree(extent_op); 2299 btrfs_free_delayed_extent_op(extent_op);
2298 2300
2299 if (ret) { 2301 if (ret) {
2300 list_del_init(&locked_ref->cluster); 2302 printk(KERN_DEBUG
2301 mutex_unlock(&locked_ref->mutex); 2303 "btrfs: run_delayed_extent_op "
2302 2304 "returned %d\n", ret);
2303 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2304 spin_lock(&delayed_refs->lock); 2305 spin_lock(&delayed_refs->lock);
2306 btrfs_delayed_ref_unlock(locked_ref);
2305 return ret; 2307 return ret;
2306 } 2308 }
2307 2309
2308 goto next; 2310 goto next;
2309 } 2311 }
2310
2311 list_del_init(&locked_ref->cluster);
2312 locked_ref = NULL;
2313 } 2312 }
2314 2313
2315 ref->in_tree = 0; 2314 ref->in_tree = 0;
2316 rb_erase(&ref->rb_node, &delayed_refs->root); 2315 rb_erase(&ref->rb_node, &delayed_refs->root);
2317 delayed_refs->num_entries--; 2316 delayed_refs->num_entries--;
2318 if (locked_ref) { 2317 if (!btrfs_delayed_ref_is_head(ref)) {
2319 /* 2318 /*
2320 * when we play the delayed ref, also correct the 2319 * when we play the delayed ref, also correct the
2321 * ref_mod on head 2320 * ref_mod on head
@@ -2337,20 +2336,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2337 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2336 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2338 must_insert_reserved); 2337 must_insert_reserved);
2339 2338
2340 btrfs_put_delayed_ref(ref); 2339 btrfs_free_delayed_extent_op(extent_op);
2341 kfree(extent_op);
2342 count++;
2343
2344 if (ret) { 2340 if (ret) {
2345 if (locked_ref) { 2341 btrfs_delayed_ref_unlock(locked_ref);
2346 list_del_init(&locked_ref->cluster); 2342 btrfs_put_delayed_ref(ref);
2347 mutex_unlock(&locked_ref->mutex); 2343 printk(KERN_DEBUG
2348 } 2344 "btrfs: run_one_delayed_ref returned %d\n", ret);
2349 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2350 spin_lock(&delayed_refs->lock); 2345 spin_lock(&delayed_refs->lock);
2351 return ret; 2346 return ret;
2352 } 2347 }
2353 2348
2349 /*
2350 * If this node is a head, that means all the refs in this head
2351 * have been dealt with, and we will pick the next head to deal
2352 * with, so we must unlock the head and drop it from the cluster
2353 * list before we release it.
2354 */
2355 if (btrfs_delayed_ref_is_head(ref)) {
2356 list_del_init(&locked_ref->cluster);
2357 btrfs_delayed_ref_unlock(locked_ref);
2358 locked_ref = NULL;
2359 }
2360 btrfs_put_delayed_ref(ref);
2361 count++;
2354next: 2362next:
2355 cond_resched(); 2363 cond_resched();
2356 spin_lock(&delayed_refs->lock); 2364 spin_lock(&delayed_refs->lock);
@@ -2435,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2435 return ret; 2443 return ret;
2436} 2444}
2437 2445
2446static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2447 int count)
2448{
2449 int val = atomic_read(&delayed_refs->ref_seq);
2450
2451 if (val < seq || val >= seq + count)
2452 return 1;
2453 return 0;
2454}
2455
2438/* 2456/*
2439 * this starts processing the delayed reference count updates and 2457 * this starts processing the delayed reference count updates and
2440 * extent insertions we have queued up so far. count can be 2458 * extent insertions we have queued up so far. count can be
@@ -2469,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2469 2487
2470 delayed_refs = &trans->transaction->delayed_refs; 2488 delayed_refs = &trans->transaction->delayed_refs;
2471 INIT_LIST_HEAD(&cluster); 2489 INIT_LIST_HEAD(&cluster);
2490 if (count == 0) {
2491 count = delayed_refs->num_entries * 2;
2492 run_most = 1;
2493 }
2494
2495 if (!run_all && !run_most) {
2496 int old;
2497 int seq = atomic_read(&delayed_refs->ref_seq);
2498
2499progress:
2500 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2501 if (old) {
2502 DEFINE_WAIT(__wait);
2503 if (delayed_refs->num_entries < 16348)
2504 return 0;
2505
2506 prepare_to_wait(&delayed_refs->wait, &__wait,
2507 TASK_UNINTERRUPTIBLE);
2508
2509 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2510 if (old) {
2511 schedule();
2512 finish_wait(&delayed_refs->wait, &__wait);
2513
2514 if (!refs_newer(delayed_refs, seq, 256))
2515 goto progress;
2516 else
2517 return 0;
2518 } else {
2519 finish_wait(&delayed_refs->wait, &__wait);
2520 goto again;
2521 }
2522 }
2523
2524 } else {
2525 atomic_inc(&delayed_refs->procs_running_refs);
2526 }
2527
2472again: 2528again:
2473 loops = 0; 2529 loops = 0;
2474 spin_lock(&delayed_refs->lock); 2530 spin_lock(&delayed_refs->lock);
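
The block added above gates delayed-ref processing so that, when the caller did not ask to run everything, only one task does the work at a time: the first caller wins the cmpxchg on procs_running_refs, and everyone else either returns immediately (small backlog) or sleeps on delayed_refs->wait until ref_seq shows enough progress. A stripped-down sketch of that single-runner-plus-waiters idiom, with generic names rather than the btrfs fields (do_the_work() is a placeholder):

static atomic_t running = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(wait_q);

static void run_or_wait(void)
{
	if (atomic_cmpxchg(&running, 0, 1) == 0) {
		do_the_work();			/* we are the only runner */
		atomic_set(&running, 0);
		wake_up(&wait_q);		/* hand off to a waiter   */
	} else {
		DEFINE_WAIT(wait);

		prepare_to_wait(&wait_q, &wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&running))	/* re-check after queueing */
			schedule();
		finish_wait(&wait_q, &wait);
	}
}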
@@ -2477,10 +2533,6 @@ again:
2477 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2533 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2478#endif 2534#endif
2479 2535
2480 if (count == 0) {
2481 count = delayed_refs->num_entries * 2;
2482 run_most = 1;
2483 }
2484 while (1) { 2536 while (1) {
2485 if (!(run_all || run_most) && 2537 if (!(run_all || run_most) &&
2486 delayed_refs->num_heads_ready < 64) 2538 delayed_refs->num_heads_ready < 64)
@@ -2500,11 +2552,15 @@ again:
2500 2552
2501 ret = run_clustered_refs(trans, root, &cluster); 2553 ret = run_clustered_refs(trans, root, &cluster);
2502 if (ret < 0) { 2554 if (ret < 0) {
2555 btrfs_release_ref_cluster(&cluster);
2503 spin_unlock(&delayed_refs->lock); 2556 spin_unlock(&delayed_refs->lock);
2504 btrfs_abort_transaction(trans, root, ret); 2557 btrfs_abort_transaction(trans, root, ret);
2558 atomic_dec(&delayed_refs->procs_running_refs);
2505 return ret; 2559 return ret;
2506 } 2560 }
2507 2561
2562 atomic_add(ret, &delayed_refs->ref_seq);
2563
2508 count -= min_t(unsigned long, ret, count); 2564 count -= min_t(unsigned long, ret, count);
2509 2565
2510 if (count == 0) 2566 if (count == 0)
@@ -2573,6 +2629,11 @@ again:
2573 goto again; 2629 goto again;
2574 } 2630 }
2575out: 2631out:
2632 atomic_dec(&delayed_refs->procs_running_refs);
2633 smp_mb();
2634 if (waitqueue_active(&delayed_refs->wait))
2635 wake_up(&delayed_refs->wait);
2636
2576 spin_unlock(&delayed_refs->lock); 2637 spin_unlock(&delayed_refs->lock);
2577 assert_qgroups_uptodate(trans); 2638 assert_qgroups_uptodate(trans);
2578 return 0; 2639 return 0;
@@ -2586,7 +2647,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2586 struct btrfs_delayed_extent_op *extent_op; 2647 struct btrfs_delayed_extent_op *extent_op;
2587 int ret; 2648 int ret;
2588 2649
2589 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 2650 extent_op = btrfs_alloc_delayed_extent_op();
2590 if (!extent_op) 2651 if (!extent_op)
2591 return -ENOMEM; 2652 return -ENOMEM;
2592 2653
@@ -2598,7 +2659,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2598 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 2659 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2599 num_bytes, extent_op); 2660 num_bytes, extent_op);
2600 if (ret) 2661 if (ret)
2601 kfree(extent_op); 2662 btrfs_free_delayed_extent_op(extent_op);
2602 return ret; 2663 return ret;
2603} 2664}
2604 2665
@@ -3223,12 +3284,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3223 u64 extra_flags = chunk_to_extended(flags) & 3284 u64 extra_flags = chunk_to_extended(flags) &
3224 BTRFS_EXTENDED_PROFILE_MASK; 3285 BTRFS_EXTENDED_PROFILE_MASK;
3225 3286
3287 write_seqlock(&fs_info->profiles_lock);
3226 if (flags & BTRFS_BLOCK_GROUP_DATA) 3288 if (flags & BTRFS_BLOCK_GROUP_DATA)
3227 fs_info->avail_data_alloc_bits |= extra_flags; 3289 fs_info->avail_data_alloc_bits |= extra_flags;
3228 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3290 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3229 fs_info->avail_metadata_alloc_bits |= extra_flags; 3291 fs_info->avail_metadata_alloc_bits |= extra_flags;
3230 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3292 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3231 fs_info->avail_system_alloc_bits |= extra_flags; 3293 fs_info->avail_system_alloc_bits |= extra_flags;
3294 write_sequnlock(&fs_info->profiles_lock);
3232} 3295}
3233 3296
3234/* 3297/*
@@ -3276,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3276 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3339 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3277 root->fs_info->fs_devices->missing_devices; 3340 root->fs_info->fs_devices->missing_devices;
3278 u64 target; 3341 u64 target;
3342 u64 tmp;
3279 3343
3280 /* 3344 /*
3281 * see if restripe for this chunk_type is in progress, if so 3345 * see if restripe for this chunk_type is in progress, if so
@@ -3292,40 +3356,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3292 } 3356 }
3293 spin_unlock(&root->fs_info->balance_lock); 3357 spin_unlock(&root->fs_info->balance_lock);
3294 3358
3359 /* First, mask out the RAID levels which aren't possible */
3295 if (num_devices == 1) 3360 if (num_devices == 1)
3296 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3361 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3362 BTRFS_BLOCK_GROUP_RAID5);
3363 if (num_devices < 3)
3364 flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3297 if (num_devices < 4) 3365 if (num_devices < 4)
3298 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3366 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3299 3367
3300 if ((flags & BTRFS_BLOCK_GROUP_DUP) && 3368 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3301 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3369 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3302 BTRFS_BLOCK_GROUP_RAID10))) { 3370 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3303 flags &= ~BTRFS_BLOCK_GROUP_DUP; 3371 flags &= ~tmp;
3304 }
3305
3306 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3307 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3308 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3309 }
3310 3372
3311 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3373 if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3312 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3374 tmp = BTRFS_BLOCK_GROUP_RAID6;
3313 (flags & BTRFS_BLOCK_GROUP_RAID10) | 3375 else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3314 (flags & BTRFS_BLOCK_GROUP_DUP))) { 3376 tmp = BTRFS_BLOCK_GROUP_RAID5;
3315 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3377 else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3316 } 3378 tmp = BTRFS_BLOCK_GROUP_RAID10;
3379 else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3380 tmp = BTRFS_BLOCK_GROUP_RAID1;
3381 else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3382 tmp = BTRFS_BLOCK_GROUP_RAID0;
3317 3383
3318 return extended_to_chunk(flags); 3384 return extended_to_chunk(flags | tmp);
3319} 3385}
3320 3386
3321static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3387static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3322{ 3388{
3323 if (flags & BTRFS_BLOCK_GROUP_DATA) 3389 unsigned seq;
3324 flags |= root->fs_info->avail_data_alloc_bits; 3390
3325 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3391 do {
3326 flags |= root->fs_info->avail_system_alloc_bits; 3392 seq = read_seqbegin(&root->fs_info->profiles_lock);
3327 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3393
3328 flags |= root->fs_info->avail_metadata_alloc_bits; 3394 if (flags & BTRFS_BLOCK_GROUP_DATA)
3395 flags |= root->fs_info->avail_data_alloc_bits;
3396 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3397 flags |= root->fs_info->avail_system_alloc_bits;
3398 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3399 flags |= root->fs_info->avail_metadata_alloc_bits;
3400 } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3329 3401
3330 return btrfs_reduce_alloc_profile(root, flags); 3402 return btrfs_reduce_alloc_profile(root, flags);
3331} 3403}
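
set_avail_alloc_bits() and get_alloc_profile() above form a classic seqlock pair: writers update the three avail_*_alloc_bits fields under write_seqlock(&profiles_lock), and readers sample them locklessly, retrying if a writer ran in between. A generic read-side sketch of the idiom (placeholder data, not the btrfs fields):

static DEFINE_SEQLOCK(example_lock);
static u64 shared_a, shared_b;

static void read_consistent(u64 *a, u64 *b)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&example_lock);
		*a = shared_a;		/* both loads belong to one snapshot */
		*b = shared_b;
	} while (read_seqretry(&example_lock, seq));
}

The rewritten btrfs_reduce_alloc_profile() then collapses whatever bits survive to a single level, preferring RAID6 over RAID5 over RAID10 over RAID1 over RAID0; for example, a RAID1|RAID10 request on four writable devices reduces to RAID10, while on two devices RAID10 is masked away first and RAID1 is kept.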
@@ -3333,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3333u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3405u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3334{ 3406{
3335 u64 flags; 3407 u64 flags;
3408 u64 ret;
3336 3409
3337 if (data) 3410 if (data)
3338 flags = BTRFS_BLOCK_GROUP_DATA; 3411 flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3341,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3341 else 3414 else
3342 flags = BTRFS_BLOCK_GROUP_METADATA; 3415 flags = BTRFS_BLOCK_GROUP_METADATA;
3343 3416
3344 return get_alloc_profile(root, flags); 3417 ret = get_alloc_profile(root, flags);
3418 return ret;
3345} 3419}
3346 3420
3347/* 3421/*
@@ -3357,7 +3431,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3357 int ret = 0, committed = 0, alloc_chunk = 1; 3431 int ret = 0, committed = 0, alloc_chunk = 1;
3358 3432
3359 /* make sure bytes are sectorsize aligned */ 3433 /* make sure bytes are sectorsize aligned */
3360 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3434 bytes = ALIGN(bytes, root->sectorsize);
3361 3435
3362 if (root == root->fs_info->tree_root || 3436 if (root == root->fs_info->tree_root ||
3363 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { 3437 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
@@ -3452,7 +3526,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3452 struct btrfs_space_info *data_sinfo; 3526 struct btrfs_space_info *data_sinfo;
3453 3527
3454 /* make sure bytes are sectorsize aligned */ 3528 /* make sure bytes are sectorsize aligned */
3455 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3529 bytes = ALIGN(bytes, root->sectorsize);
3456 3530
3457 data_sinfo = root->fs_info->data_sinfo; 3531 data_sinfo = root->fs_info->data_sinfo;
3458 spin_lock(&data_sinfo->lock); 3532 spin_lock(&data_sinfo->lock);
@@ -3516,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3516{ 3590{
3517 u64 num_dev; 3591 u64 num_dev;
3518 3592
3519 if (type & BTRFS_BLOCK_GROUP_RAID10 || 3593 if (type & (BTRFS_BLOCK_GROUP_RAID10 |
3520 type & BTRFS_BLOCK_GROUP_RAID0) 3594 BTRFS_BLOCK_GROUP_RAID0 |
3595 BTRFS_BLOCK_GROUP_RAID5 |
3596 BTRFS_BLOCK_GROUP_RAID6))
3521 num_dev = root->fs_info->fs_devices->rw_devices; 3597 num_dev = root->fs_info->fs_devices->rw_devices;
3522 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3598 else if (type & BTRFS_BLOCK_GROUP_RAID1)
3523 num_dev = 2; 3599 num_dev = 2;
@@ -3564,6 +3640,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3564 int wait_for_alloc = 0; 3640 int wait_for_alloc = 0;
3565 int ret = 0; 3641 int ret = 0;
3566 3642
3643 /* Don't re-enter if we're already allocating a chunk */
3644 if (trans->allocating_chunk)
3645 return -ENOSPC;
3646
3567 space_info = __find_space_info(extent_root->fs_info, flags); 3647 space_info = __find_space_info(extent_root->fs_info, flags);
3568 if (!space_info) { 3648 if (!space_info) {
3569 ret = update_space_info(extent_root->fs_info, flags, 3649 ret = update_space_info(extent_root->fs_info, flags,
@@ -3606,6 +3686,8 @@ again:
3606 goto again; 3686 goto again;
3607 } 3687 }
3608 3688
3689 trans->allocating_chunk = true;
3690
3609 /* 3691 /*
3610 * If we have mixed data/metadata chunks we want to make sure we keep 3692 * If we have mixed data/metadata chunks we want to make sure we keep
3611 * allocating mixed chunks instead of individual chunks. 3693 * allocating mixed chunks instead of individual chunks.
@@ -3632,19 +3714,20 @@ again:
3632 check_system_chunk(trans, extent_root, flags); 3714 check_system_chunk(trans, extent_root, flags);
3633 3715
3634 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3716 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3635 if (ret < 0 && ret != -ENOSPC) 3717 trans->allocating_chunk = false;
3636 goto out;
3637 3718
3638 spin_lock(&space_info->lock); 3719 spin_lock(&space_info->lock);
3720 if (ret < 0 && ret != -ENOSPC)
3721 goto out;
3639 if (ret) 3722 if (ret)
3640 space_info->full = 1; 3723 space_info->full = 1;
3641 else 3724 else
3642 ret = 1; 3725 ret = 1;
3643 3726
3644 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3727 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3728out:
3645 space_info->chunk_alloc = 0; 3729 space_info->chunk_alloc = 0;
3646 spin_unlock(&space_info->lock); 3730 spin_unlock(&space_info->lock);
3647out:
3648 mutex_unlock(&fs_info->chunk_mutex); 3731 mutex_unlock(&fs_info->chunk_mutex);
3649 return ret; 3732 return ret;
3650} 3733}
@@ -3653,13 +3736,31 @@ static int can_overcommit(struct btrfs_root *root,
3653 struct btrfs_space_info *space_info, u64 bytes, 3736 struct btrfs_space_info *space_info, u64 bytes,
3654 enum btrfs_reserve_flush_enum flush) 3737 enum btrfs_reserve_flush_enum flush)
3655{ 3738{
3739 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3656 u64 profile = btrfs_get_alloc_profile(root, 0); 3740 u64 profile = btrfs_get_alloc_profile(root, 0);
3741 u64 rsv_size = 0;
3657 u64 avail; 3742 u64 avail;
3658 u64 used; 3743 u64 used;
3744 u64 to_add;
3659 3745
3660 used = space_info->bytes_used + space_info->bytes_reserved + 3746 used = space_info->bytes_used + space_info->bytes_reserved +
3661 space_info->bytes_pinned + space_info->bytes_readonly + 3747 space_info->bytes_pinned + space_info->bytes_readonly;
3662 space_info->bytes_may_use; 3748
3749 spin_lock(&global_rsv->lock);
3750 rsv_size = global_rsv->size;
3751 spin_unlock(&global_rsv->lock);
3752
3753 /*
3754 * We only want to allow over committing if we have lots of actual space
3755 * free, but if we don't have enough space to handle the global reserve
3756 * space then we could end up having a real enospc problem when trying
3757 * to allocate a chunk or some other such important allocation.
3758 */
3759 rsv_size <<= 1;
3760 if (used + rsv_size >= space_info->total_bytes)
3761 return 0;
3762
3763 used += space_info->bytes_may_use;
3663 3764
3664 spin_lock(&root->fs_info->free_chunk_lock); 3765 spin_lock(&root->fs_info->free_chunk_lock);
3665 avail = root->fs_info->free_chunk_space; 3766 avail = root->fs_info->free_chunk_space;
@@ -3667,28 +3768,60 @@ static int can_overcommit(struct btrfs_root *root,
3667 3768
3668 /* 3769 /*
3669 * If we have dup, raid1 or raid10 then only half of the free 3770 * If we have dup, raid1 or raid10 then only half of the free
3670 * space is actually useable. 3771 * space is actually useable. For raid56, the space info used
3772 * doesn't include the parity drive, so we don't have to
3773 * change the math
3671 */ 3774 */
3672 if (profile & (BTRFS_BLOCK_GROUP_DUP | 3775 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3673 BTRFS_BLOCK_GROUP_RAID1 | 3776 BTRFS_BLOCK_GROUP_RAID1 |
3674 BTRFS_BLOCK_GROUP_RAID10)) 3777 BTRFS_BLOCK_GROUP_RAID10))
3675 avail >>= 1; 3778 avail >>= 1;
3676 3779
3780 to_add = space_info->total_bytes;
3781
3677 /* 3782 /*
3678 * If we aren't flushing all things, let us overcommit up to 3783 * If we aren't flushing all things, let us overcommit up to
3679 * 1/2th of the space. If we can flush, don't let us overcommit 3784 * 1/2th of the space. If we can flush, don't let us overcommit
3680 * too much, let it overcommit up to 1/8 of the space. 3785 * too much, let it overcommit up to 1/8 of the space.
3681 */ 3786 */
3682 if (flush == BTRFS_RESERVE_FLUSH_ALL) 3787 if (flush == BTRFS_RESERVE_FLUSH_ALL)
3683 avail >>= 3; 3788 to_add >>= 3;
3684 else 3789 else
3685 avail >>= 1; 3790 to_add >>= 1;
3686 3791
3687 if (used + bytes < space_info->total_bytes + avail) 3792 /*
3793 * Limit the overcommit to the amount of free space we could possibly
3794 * allocate for chunks.
3795 */
3796 to_add = min(avail, to_add);
3797
3798 if (used + bytes < space_info->total_bytes + to_add)
3688 return 1; 3799 return 1;
3689 return 0; 3800 return 0;
3690} 3801}
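The reworked can_overcommit() above makes two decisions: it refuses outright when used space plus twice the global reserve already reaches the space_info total, and it otherwise caps the overcommit by both a fraction of the total (1/8 when flushing is allowed, 1/2 otherwise) and the still-unallocated chunk space. A hedged, simplified model of that calculation, with stand-in parameter names:

#include <stdint.h>
#include <stdbool.h>

static bool can_overcommit_sketch(uint64_t total, uint64_t used, uint64_t may_use,
				  uint64_t global_rsv, uint64_t free_chunk,
				  uint64_t bytes, bool mirrored, bool flush_all)
{
	if (used + 2 * global_rsv >= total)	/* keep room for the global reserve */
		return false;
	used += may_use;
	if (mirrored)				/* DUP/RAID1/RAID10: half the raw space is usable */
		free_chunk >>= 1;
	uint64_t to_add = flush_all ? total >> 3 : total >> 1;
	if (to_add > free_chunk)		/* never overcommit past unallocated space */
		to_add = free_chunk;
	return used + bytes < total + to_add;
}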
3691 3802
3803void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3804 unsigned long nr_pages)
3805{
3806 struct super_block *sb = root->fs_info->sb;
3807 int started;
3808
 3809	/* If we cannot start writeback, just sync all the delalloc files. */
3810 started = try_to_writeback_inodes_sb_nr(sb, nr_pages,
3811 WB_REASON_FS_FREE_SPACE);
3812 if (!started) {
3813 /*
 3814		 * We need not worry about the filesystem going from r/w to r/o
 3815		 * even though we don't acquire the ->s_umount mutex, because the
 3816		 * filesystem should guarantee that its delalloc inode list is
 3817		 * empty once it is read-only (all dirty pages have been written
 3818		 * to disk).
3819 */
3820 btrfs_start_delalloc_inodes(root, 0);
3821 btrfs_wait_ordered_extents(root, 0);
3822 }
3823}
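btrfs_writeback_inodes_sb_nr() packages the "try the non-blocking writeback kick, fall back to a synchronous flush" behaviour used by shrink_delalloc() below. The shape of that pattern as a standalone sketch (both helpers here are hypothetical stand-ins):

#include <stdbool.h>

static bool try_async_flush(unsigned long nr_pages) { (void)nr_pages; return false; }
static void sync_flush_all(void) { }

static void flush_some(unsigned long nr_pages)
{
	/* Prefer the cheap asynchronous kick; fall back to a full synchronous flush. */
	if (!try_async_flush(nr_pages))
		sync_flush_all();
}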
3824
3692/* 3825/*
3693 * shrink metadata reservation for delalloc 3826 * shrink metadata reservation for delalloc
3694 */ 3827 */
@@ -3710,7 +3843,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3710 space_info = block_rsv->space_info; 3843 space_info = block_rsv->space_info;
3711 3844
3712 smp_mb(); 3845 smp_mb();
3713 delalloc_bytes = root->fs_info->delalloc_bytes; 3846 delalloc_bytes = percpu_counter_sum_positive(
3847 &root->fs_info->delalloc_bytes);
3714 if (delalloc_bytes == 0) { 3848 if (delalloc_bytes == 0) {
3715 if (trans) 3849 if (trans)
3716 return; 3850 return;
@@ -3721,10 +3855,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3721 while (delalloc_bytes && loops < 3) { 3855 while (delalloc_bytes && loops < 3) {
3722 max_reclaim = min(delalloc_bytes, to_reclaim); 3856 max_reclaim = min(delalloc_bytes, to_reclaim);
3723 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3857 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3724 try_to_writeback_inodes_sb_nr(root->fs_info->sb, 3858 btrfs_writeback_inodes_sb_nr(root, nr_pages);
3725 nr_pages,
3726 WB_REASON_FS_FREE_SPACE);
3727
3728 /* 3859 /*
3729 * We need to wait for the async pages to actually start before 3860 * We need to wait for the async pages to actually start before
3730 * we do anything. 3861 * we do anything.
@@ -3752,7 +3883,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3752 break; 3883 break;
3753 } 3884 }
3754 smp_mb(); 3885 smp_mb();
3755 delalloc_bytes = root->fs_info->delalloc_bytes; 3886 delalloc_bytes = percpu_counter_sum_positive(
3887 &root->fs_info->delalloc_bytes);
3756 } 3888 }
3757} 3889}
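Because delalloc_bytes is now a per-CPU counter, reading it means folding the per-CPU deltas into the shared count and clamping at zero, which is what percpu_counter_sum_positive() provides. An illustrative userspace model of that idea (an assumption-level sketch, not the kernel's implementation):

#include <stdint.h>

#define NR_CPUS_SKETCH 4

struct pcpu_counter {
	int64_t global;			/* shared, occasionally folded value */
	int64_t local[NR_CPUS_SKETCH];	/* per-CPU deltas, may be negative */
};

static int64_t pcpu_sum_positive(const struct pcpu_counter *c)
{
	int64_t sum = c->global;

	for (int i = 0; i < NR_CPUS_SKETCH; i++)
		sum += c->local[i];
	return sum > 0 ? sum : 0;	/* clamp transient negatives to zero */
}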
3758 3890
@@ -4016,6 +4148,15 @@ again:
4016 goto again; 4148 goto again;
4017 4149
4018out: 4150out:
4151 if (ret == -ENOSPC &&
4152 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
4153 struct btrfs_block_rsv *global_rsv =
4154 &root->fs_info->global_block_rsv;
4155
4156 if (block_rsv != global_rsv &&
4157 !block_rsv_use_bytes(global_rsv, orig_bytes))
4158 ret = 0;
4159 }
4019 if (flushing) { 4160 if (flushing) {
4020 spin_lock(&space_info->lock); 4161 spin_lock(&space_info->lock);
4021 space_info->flush = 0; 4162 space_info->flush = 0;
@@ -4402,19 +4543,60 @@ void btrfs_orphan_release_metadata(struct inode *inode)
4402 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4543 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4403} 4544}
4404 4545
4405int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, 4546/*
4406 struct btrfs_pending_snapshot *pending) 4547 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
4548 * root: the root of the parent directory
4549 * rsv: block reservation
 4550 * items: the number of items that we need to reserve space for
4551 * qgroup_reserved: used to return the reserved size in qgroup
4552 *
4553 * This function is used to reserve the space for snapshot/subvolume
 4554 * creation and deletion. Those operations differ from the common
 4555 * file/directory operations: they change two fs/file trees and the
 4556 * root tree, and the number of items the qgroup reserves differs
 4557 * from the free space reservation. So we cannot use the space
 4558 * reservation mechanism in start_transaction().
4559 */
4560int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
4561 struct btrfs_block_rsv *rsv,
4562 int items,
4563 u64 *qgroup_reserved)
4407{ 4564{
4408 struct btrfs_root *root = pending->root; 4565 u64 num_bytes;
4409 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4566 int ret;
4410 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; 4567
4411 /* 4568 if (root->fs_info->quota_enabled) {
4412 * two for root back/forward refs, two for directory entries, 4569 /* One for parent inode, two for dir entries */
4413 * one for root of the snapshot and one for parent inode. 4570 num_bytes = 3 * root->leafsize;
4414 */ 4571 ret = btrfs_qgroup_reserve(root, num_bytes);
4415 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); 4572 if (ret)
4416 dst_rsv->space_info = src_rsv->space_info; 4573 return ret;
4417 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4574 } else {
4575 num_bytes = 0;
4576 }
4577
4578 *qgroup_reserved = num_bytes;
4579
4580 num_bytes = btrfs_calc_trans_metadata_size(root, items);
4581 rsv->space_info = __find_space_info(root->fs_info,
4582 BTRFS_BLOCK_GROUP_METADATA);
4583 ret = btrfs_block_rsv_add(root, rsv, num_bytes,
4584 BTRFS_RESERVE_FLUSH_ALL);
4585 if (ret) {
4586 if (*qgroup_reserved)
4587 btrfs_qgroup_free(root, *qgroup_reserved);
4588 }
4589
4590 return ret;
4591}
4592
4593void btrfs_subvolume_release_metadata(struct btrfs_root *root,
4594 struct btrfs_block_rsv *rsv,
4595 u64 qgroup_reserved)
4596{
4597 btrfs_block_rsv_release(root, rsv, (u64)-1);
4598 if (qgroup_reserved)
4599 btrfs_qgroup_free(root, qgroup_reserved);
4418} 4600}
4419 4601
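btrfs_subvolume_reserve_metadata() reserves from two pools in order, qgroup bytes first and the block reservation second, and rolls the qgroup bytes back if the second step fails; btrfs_subvolume_release_metadata() later undoes both. The general reserve-A-then-B-with-rollback shape, as a self-contained sketch with hypothetical stub helpers:

#include <errno.h>
#include <stdint.h>

static int  quota_reserve(uint64_t bytes) { (void)bytes; return 0; }
static void quota_free(uint64_t bytes)    { (void)bytes; }
static int  space_reserve(uint64_t bytes) { (void)bytes; return -ENOSPC; }

static int reserve_both(uint64_t quota_bytes, uint64_t space_bytes)
{
	int ret = quota_reserve(quota_bytes);

	if (ret)
		return ret;
	ret = space_reserve(space_bytes);
	if (ret)
		quota_free(quota_bytes);	/* roll back the first reservation on failure */
	return ret;
}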
4420/** 4602/**
@@ -4522,6 +4704,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4522 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 4704 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4523 int ret = 0; 4705 int ret = 0;
4524 bool delalloc_lock = true; 4706 bool delalloc_lock = true;
4707 u64 to_free = 0;
4708 unsigned dropped;
4525 4709
4526 /* If we are a free space inode we need to not flush since we will be in 4710 /* If we are a free space inode we need to not flush since we will be in
4527 * the middle of a transaction commit. We also don't need the delalloc 4711 * the middle of a transaction commit. We also don't need the delalloc
@@ -4565,54 +4749,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4565 csum_bytes = BTRFS_I(inode)->csum_bytes; 4749 csum_bytes = BTRFS_I(inode)->csum_bytes;
4566 spin_unlock(&BTRFS_I(inode)->lock); 4750 spin_unlock(&BTRFS_I(inode)->lock);
4567 4751
4568 if (root->fs_info->quota_enabled) 4752 if (root->fs_info->quota_enabled) {
4569 ret = btrfs_qgroup_reserve(root, num_bytes + 4753 ret = btrfs_qgroup_reserve(root, num_bytes +
4570 nr_extents * root->leafsize); 4754 nr_extents * root->leafsize);
4755 if (ret)
4756 goto out_fail;
4757 }
4571 4758
4572 /* 4759 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4573 * ret != 0 here means the qgroup reservation failed, we go straight to 4760 if (unlikely(ret)) {
4574 * the shared error handling then. 4761 if (root->fs_info->quota_enabled)
4575 */
4576 if (ret == 0)
4577 ret = reserve_metadata_bytes(root, block_rsv,
4578 to_reserve, flush);
4579
4580 if (ret) {
4581 u64 to_free = 0;
4582 unsigned dropped;
4583
4584 spin_lock(&BTRFS_I(inode)->lock);
4585 dropped = drop_outstanding_extent(inode);
4586 /*
4587 * If the inodes csum_bytes is the same as the original
4588 * csum_bytes then we know we haven't raced with any free()ers
4589 * so we can just reduce our inodes csum bytes and carry on.
4590 * Otherwise we have to do the normal free thing to account for
4591 * the case that the free side didn't free up its reserve
4592 * because of this outstanding reservation.
4593 */
4594 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4595 calc_csum_metadata_size(inode, num_bytes, 0);
4596 else
4597 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4598 spin_unlock(&BTRFS_I(inode)->lock);
4599 if (dropped)
4600 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4601
4602 if (to_free) {
4603 btrfs_block_rsv_release(root, block_rsv, to_free);
4604 trace_btrfs_space_reservation(root->fs_info,
4605 "delalloc",
4606 btrfs_ino(inode),
4607 to_free, 0);
4608 }
4609 if (root->fs_info->quota_enabled) {
4610 btrfs_qgroup_free(root, num_bytes + 4762 btrfs_qgroup_free(root, num_bytes +
4611 nr_extents * root->leafsize); 4763 nr_extents * root->leafsize);
4612 } 4764 goto out_fail;
4613 if (delalloc_lock)
4614 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4615 return ret;
4616 } 4765 }
4617 4766
4618 spin_lock(&BTRFS_I(inode)->lock); 4767 spin_lock(&BTRFS_I(inode)->lock);
@@ -4633,6 +4782,34 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4633 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4782 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4634 4783
4635 return 0; 4784 return 0;
4785
4786out_fail:
4787 spin_lock(&BTRFS_I(inode)->lock);
4788 dropped = drop_outstanding_extent(inode);
4789 /*
4790 * If the inodes csum_bytes is the same as the original
4791 * csum_bytes then we know we haven't raced with any free()ers
4792 * so we can just reduce our inodes csum bytes and carry on.
4793 * Otherwise we have to do the normal free thing to account for
4794 * the case that the free side didn't free up its reserve
4795 * because of this outstanding reservation.
4796 */
4797 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4798 calc_csum_metadata_size(inode, num_bytes, 0);
4799 else
4800 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4801 spin_unlock(&BTRFS_I(inode)->lock);
4802 if (dropped)
4803 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4804
4805 if (to_free) {
4806 btrfs_block_rsv_release(root, block_rsv, to_free);
4807 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4808 btrfs_ino(inode), to_free, 0);
4809 }
4810 if (delalloc_lock)
4811 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4812 return ret;
4636} 4813}
4637 4814
4638/** 4815/**
@@ -4654,7 +4831,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4654 spin_lock(&BTRFS_I(inode)->lock); 4831 spin_lock(&BTRFS_I(inode)->lock);
4655 dropped = drop_outstanding_extent(inode); 4832 dropped = drop_outstanding_extent(inode);
4656 4833
4657 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 4834 if (num_bytes)
4835 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4658 spin_unlock(&BTRFS_I(inode)->lock); 4836 spin_unlock(&BTRFS_I(inode)->lock);
4659 if (dropped > 0) 4837 if (dropped > 0)
4660 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4838 to_free += btrfs_calc_trans_metadata_size(root, dropped);
@@ -4721,8 +4899,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4721 btrfs_free_reserved_data_space(inode, num_bytes); 4899 btrfs_free_reserved_data_space(inode, num_bytes);
4722} 4900}
4723 4901
4724static int update_block_group(struct btrfs_trans_handle *trans, 4902static int update_block_group(struct btrfs_root *root,
4725 struct btrfs_root *root,
4726 u64 bytenr, u64 num_bytes, int alloc) 4903 u64 bytenr, u64 num_bytes, int alloc)
4727{ 4904{
4728 struct btrfs_block_group_cache *cache = NULL; 4905 struct btrfs_block_group_cache *cache = NULL;
@@ -4759,7 +4936,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4759 * space back to the block group, otherwise we will leak space. 4936 * space back to the block group, otherwise we will leak space.
4760 */ 4937 */
4761 if (!alloc && cache->cached == BTRFS_CACHE_NO) 4938 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4762 cache_block_group(cache, trans, NULL, 1); 4939 cache_block_group(cache, 1);
4763 4940
4764 byte_in_group = bytenr - cache->key.objectid; 4941 byte_in_group = bytenr - cache->key.objectid;
4765 WARN_ON(byte_in_group > cache->key.offset); 4942 WARN_ON(byte_in_group > cache->key.offset);
@@ -4809,6 +4986,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4809 struct btrfs_block_group_cache *cache; 4986 struct btrfs_block_group_cache *cache;
4810 u64 bytenr; 4987 u64 bytenr;
4811 4988
4989 spin_lock(&root->fs_info->block_group_cache_lock);
4990 bytenr = root->fs_info->first_logical_byte;
4991 spin_unlock(&root->fs_info->block_group_cache_lock);
4992
4993 if (bytenr < (u64)-1)
4994 return bytenr;
4995
4812 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 4996 cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4813 if (!cache) 4997 if (!cache)
4814 return 0; 4998 return 0;
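first_logical_byte() now consults a value cached in fs_info under block_group_cache_lock and only falls back to the block-group tree walk when the cache holds the sentinel (u64)-1; the block group removal hunk further down resets the cache to that sentinel. A small sketch of the cache-with-sentinel pattern (names and the lookup stub are hypothetical, and locking is omitted):

#include <stdint.h>

#define UNSET ((uint64_t)-1)

static uint64_t cached_first = UNSET;	/* protected by a lock in the real code */

static uint64_t slow_lookup(void)	/* stand-in for the rbtree walk */
{
	return 4096;
}

static uint64_t first_logical(void)
{
	if (cached_first != UNSET)	/* fast path: cached value is still valid */
		return cached_first;
	cached_first = slow_lookup();
	return cached_first;
}

static void invalidate_first(void)	/* e.g. when the relevant block group goes away */
{
	cached_first = UNSET;
}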
@@ -4859,8 +5043,7 @@ int btrfs_pin_extent(struct btrfs_root *root,
4859/* 5043/*
4860 * this function must be called within transaction 5044 * this function must be called within transaction
4861 */ 5045 */
4862int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, 5046int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
4863 struct btrfs_root *root,
4864 u64 bytenr, u64 num_bytes) 5047 u64 bytenr, u64 num_bytes)
4865{ 5048{
4866 struct btrfs_block_group_cache *cache; 5049 struct btrfs_block_group_cache *cache;
@@ -4874,7 +5057,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4874 * to one because the slow code to read in the free extents does check 5057 * to one because the slow code to read in the free extents does check
4875 * the pinned extents. 5058 * the pinned extents.
4876 */ 5059 */
4877 cache_block_group(cache, trans, root, 1); 5060 cache_block_group(cache, 1);
4878 5061
4879 pin_down_extent(root, cache, bytenr, num_bytes, 0); 5062 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4880 5063
@@ -5271,7 +5454,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5271 } 5454 }
5272 } 5455 }
5273 5456
5274 ret = update_block_group(trans, root, bytenr, num_bytes, 0); 5457 ret = update_block_group(root, bytenr, num_bytes, 0);
5275 if (ret) { 5458 if (ret) {
5276 btrfs_abort_transaction(trans, extent_root, ret); 5459 btrfs_abort_transaction(trans, extent_root, ret);
5277 goto out; 5460 goto out;
@@ -5316,7 +5499,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5316 if (head->extent_op) { 5499 if (head->extent_op) {
5317 if (!head->must_insert_reserved) 5500 if (!head->must_insert_reserved)
5318 goto out; 5501 goto out;
5319 kfree(head->extent_op); 5502 btrfs_free_delayed_extent_op(head->extent_op);
5320 head->extent_op = NULL; 5503 head->extent_op = NULL;
5321 } 5504 }
5322 5505
@@ -5439,10 +5622,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5439 return ret; 5622 return ret;
5440} 5623}
5441 5624
5442static u64 stripe_align(struct btrfs_root *root, u64 val) 5625static u64 stripe_align(struct btrfs_root *root,
5626 struct btrfs_block_group_cache *cache,
5627 u64 val, u64 num_bytes)
5443{ 5628{
5444 u64 mask = ((u64)root->stripesize - 1); 5629 u64 ret = ALIGN(val, root->stripesize);
5445 u64 ret = (val + mask) & ~mask;
5446 return ret; 5630 return ret;
5447} 5631}
5448 5632
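stripe_align() now uses ALIGN(), which rounds a value up to the next multiple of a power-of-two boundary; the extra cache and num_bytes parameters are unused in this hunk and presumably feed the RAID5/6 support this series introduces. As a worked example, aligning 0x1f000 up to a 64KiB boundary gives 0x20000. A minimal reimplementation of the same rounding, for illustration only:

#include <stdint.h>

/* Round x up to the next multiple of a, where a is a power of two. */
static uint64_t align_up(uint64_t x, uint64_t a)
{
	return (x + a - 1) & ~(a - 1);	/* same arithmetic the old open-coded version used */
}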
@@ -5462,7 +5646,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5462 u64 num_bytes) 5646 u64 num_bytes)
5463{ 5647{
5464 struct btrfs_caching_control *caching_ctl; 5648 struct btrfs_caching_control *caching_ctl;
5465 DEFINE_WAIT(wait);
5466 5649
5467 caching_ctl = get_caching_control(cache); 5650 caching_ctl = get_caching_control(cache);
5468 if (!caching_ctl) 5651 if (!caching_ctl)
@@ -5479,7 +5662,6 @@ static noinline int
5479wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 5662wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5480{ 5663{
5481 struct btrfs_caching_control *caching_ctl; 5664 struct btrfs_caching_control *caching_ctl;
5482 DEFINE_WAIT(wait);
5483 5665
5484 caching_ctl = get_caching_control(cache); 5666 caching_ctl = get_caching_control(cache);
5485 if (!caching_ctl) 5667 if (!caching_ctl)
@@ -5493,20 +5675,20 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5493 5675
5494int __get_raid_index(u64 flags) 5676int __get_raid_index(u64 flags)
5495{ 5677{
5496 int index;
5497
5498 if (flags & BTRFS_BLOCK_GROUP_RAID10) 5678 if (flags & BTRFS_BLOCK_GROUP_RAID10)
5499 index = 0; 5679 return BTRFS_RAID_RAID10;
5500 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 5680 else if (flags & BTRFS_BLOCK_GROUP_RAID1)
5501 index = 1; 5681 return BTRFS_RAID_RAID1;
5502 else if (flags & BTRFS_BLOCK_GROUP_DUP) 5682 else if (flags & BTRFS_BLOCK_GROUP_DUP)
5503 index = 2; 5683 return BTRFS_RAID_DUP;
5504 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 5684 else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5505 index = 3; 5685 return BTRFS_RAID_RAID0;
5506 else 5686 else if (flags & BTRFS_BLOCK_GROUP_RAID5)
5507 index = 4; 5687 return BTRFS_RAID_RAID5;
5688 else if (flags & BTRFS_BLOCK_GROUP_RAID6)
5689 return BTRFS_RAID_RAID6;
5508 5690
5509 return index; 5691 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
5510} 5692}
5511 5693
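Replacing the bare 0..4 return values with named BTRFS_RAID_* constants makes callers like btrfs_can_relocate() further down self-describing, and it lets per-profile data live in tables indexed by the enum. A toy illustration of that second benefit (the enum names are abbreviated and the values are examples, not btrfs's):

enum raid_index { R_RAID10, R_RAID1, R_DUP, R_RAID0, R_RAID5, R_RAID6, R_SINGLE };

/* Designated initializers keyed by the enum read better than bare 0..4 indices.
 * The values below are purely illustrative. */
static const int min_devices[] = {
	[R_RAID10] = 4,
	[R_RAID1]  = 2,
	[R_DUP]    = 1,
	[R_RAID0]  = 2,
	[R_RAID5]  = 2,
	[R_RAID6]  = 3,
	[R_SINGLE] = 1,
};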
5512static int get_block_group_index(struct btrfs_block_group_cache *cache) 5694static int get_block_group_index(struct btrfs_block_group_cache *cache)
@@ -5649,6 +5831,8 @@ search:
5649 if (!block_group_bits(block_group, data)) { 5831 if (!block_group_bits(block_group, data)) {
5650 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5832 u64 extra = BTRFS_BLOCK_GROUP_DUP |
5651 BTRFS_BLOCK_GROUP_RAID1 | 5833 BTRFS_BLOCK_GROUP_RAID1 |
5834 BTRFS_BLOCK_GROUP_RAID5 |
5835 BTRFS_BLOCK_GROUP_RAID6 |
5652 BTRFS_BLOCK_GROUP_RAID10; 5836 BTRFS_BLOCK_GROUP_RAID10;
5653 5837
5654 /* 5838 /*
@@ -5664,8 +5848,7 @@ have_block_group:
5664 cached = block_group_cache_done(block_group); 5848 cached = block_group_cache_done(block_group);
5665 if (unlikely(!cached)) { 5849 if (unlikely(!cached)) {
5666 found_uncached_bg = true; 5850 found_uncached_bg = true;
5667 ret = cache_block_group(block_group, trans, 5851 ret = cache_block_group(block_group, 0);
5668 orig_root, 0);
5669 BUG_ON(ret < 0); 5852 BUG_ON(ret < 0);
5670 ret = 0; 5853 ret = 0;
5671 } 5854 }
@@ -5678,6 +5861,7 @@ have_block_group:
5678 * lets look there 5861 * lets look there
5679 */ 5862 */
5680 if (last_ptr) { 5863 if (last_ptr) {
5864 unsigned long aligned_cluster;
5681 /* 5865 /*
5682 * the refill lock keeps out other 5866 * the refill lock keeps out other
5683 * people trying to start a new cluster 5867 * people trying to start a new cluster
@@ -5744,11 +5928,15 @@ refill_cluster:
5744 goto unclustered_alloc; 5928 goto unclustered_alloc;
5745 } 5929 }
5746 5930
5931 aligned_cluster = max_t(unsigned long,
5932 empty_cluster + empty_size,
5933 block_group->full_stripe_len);
5934
5747 /* allocate a cluster in this block group */ 5935 /* allocate a cluster in this block group */
5748 ret = btrfs_find_space_cluster(trans, root, 5936 ret = btrfs_find_space_cluster(trans, root,
5749 block_group, last_ptr, 5937 block_group, last_ptr,
5750 search_start, num_bytes, 5938 search_start, num_bytes,
5751 empty_cluster + empty_size); 5939 aligned_cluster);
5752 if (ret == 0) { 5940 if (ret == 0) {
5753 /* 5941 /*
5754 * now pull our allocation out of this 5942 * now pull our allocation out of this
@@ -5819,7 +6007,8 @@ unclustered_alloc:
5819 goto loop; 6007 goto loop;
5820 } 6008 }
5821checks: 6009checks:
5822 search_start = stripe_align(root, offset); 6010 search_start = stripe_align(root, used_block_group,
6011 offset, num_bytes);
5823 6012
5824 /* move on to the next group */ 6013 /* move on to the next group */
5825 if (search_start + num_bytes > 6014 if (search_start + num_bytes >
@@ -5970,7 +6159,7 @@ again:
5970 if (ret == -ENOSPC) { 6159 if (ret == -ENOSPC) {
5971 if (!final_tried) { 6160 if (!final_tried) {
5972 num_bytes = num_bytes >> 1; 6161 num_bytes = num_bytes >> 1;
5973 num_bytes = num_bytes & ~(root->sectorsize - 1); 6162 num_bytes = round_down(num_bytes, root->sectorsize);
5974 num_bytes = max(num_bytes, min_alloc_size); 6163 num_bytes = max(num_bytes, min_alloc_size);
5975 if (num_bytes == min_alloc_size) 6164 if (num_bytes == min_alloc_size)
5976 final_tried = true; 6165 final_tried = true;
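The retry above halves the request on -ENOSPC and now rounds it down with round_down() so it stays sector aligned, bottoming out at min_alloc_size for one final attempt. A compact standalone model of that back-off loop (allocate() is a hypothetical stand-in that always fails here):

#include <errno.h>
#include <stdint.h>
#include <stdbool.h>

#define SECTORSIZE 4096ULL

static uint64_t round_down_to(uint64_t x, uint64_t a) { return x & ~(a - 1); }
static int allocate(uint64_t bytes) { (void)bytes; return -ENOSPC; }

static int alloc_with_backoff(uint64_t num_bytes, uint64_t min_alloc)
{
	bool final_tried = false;
	int ret;

	for (;;) {
		ret = allocate(num_bytes);
		if (ret != -ENOSPC || final_tried)
			return ret;
		num_bytes = round_down_to(num_bytes >> 1, SECTORSIZE);
		if (num_bytes < min_alloc)
			num_bytes = min_alloc;
		if (num_bytes == min_alloc)	/* smallest size: make one last attempt */
			final_tried = true;
	}
}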
@@ -6094,7 +6283,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6094 btrfs_mark_buffer_dirty(path->nodes[0]); 6283 btrfs_mark_buffer_dirty(path->nodes[0]);
6095 btrfs_free_path(path); 6284 btrfs_free_path(path);
6096 6285
6097 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 6286 ret = update_block_group(root, ins->objectid, ins->offset, 1);
6098 if (ret) { /* -ENOENT, logic error */ 6287 if (ret) { /* -ENOENT, logic error */
6099 printk(KERN_ERR "btrfs update block group failed for %llu " 6288 printk(KERN_ERR "btrfs update block group failed for %llu "
6100 "%llu\n", (unsigned long long)ins->objectid, 6289 "%llu\n", (unsigned long long)ins->objectid,
@@ -6158,7 +6347,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6158 btrfs_mark_buffer_dirty(leaf); 6347 btrfs_mark_buffer_dirty(leaf);
6159 btrfs_free_path(path); 6348 btrfs_free_path(path);
6160 6349
6161 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 6350 ret = update_block_group(root, ins->objectid, ins->offset, 1);
6162 if (ret) { /* -ENOENT, logic error */ 6351 if (ret) { /* -ENOENT, logic error */
6163 printk(KERN_ERR "btrfs update block group failed for %llu " 6352 printk(KERN_ERR "btrfs update block group failed for %llu "
6164 "%llu\n", (unsigned long long)ins->objectid, 6353 "%llu\n", (unsigned long long)ins->objectid,
@@ -6201,7 +6390,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6201 u64 num_bytes = ins->offset; 6390 u64 num_bytes = ins->offset;
6202 6391
6203 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 6392 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6204 cache_block_group(block_group, trans, NULL, 0); 6393 cache_block_group(block_group, 0);
6205 caching_ctl = get_caching_control(block_group); 6394 caching_ctl = get_caching_control(block_group);
6206 6395
6207 if (!caching_ctl) { 6396 if (!caching_ctl) {
@@ -6315,12 +6504,14 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6315 if (!ret) 6504 if (!ret)
6316 return block_rsv; 6505 return block_rsv;
6317 if (ret && !block_rsv->failfast) { 6506 if (ret && !block_rsv->failfast) {
6318 static DEFINE_RATELIMIT_STATE(_rs, 6507 if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6319 DEFAULT_RATELIMIT_INTERVAL, 6508 static DEFINE_RATELIMIT_STATE(_rs,
6320 /*DEFAULT_RATELIMIT_BURST*/ 2); 6509 DEFAULT_RATELIMIT_INTERVAL * 10,
6321 if (__ratelimit(&_rs)) 6510 /*DEFAULT_RATELIMIT_BURST*/ 1);
6322 WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", 6511 if (__ratelimit(&_rs))
6323 ret); 6512 WARN(1, KERN_DEBUG
6513 "btrfs: block rsv returned %d\n", ret);
6514 }
6324 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6515 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6325 BTRFS_RESERVE_NO_FLUSH); 6516 BTRFS_RESERVE_NO_FLUSH);
6326 if (!ret) { 6517 if (!ret) {
@@ -6386,7 +6577,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6386 6577
6387 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 6578 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6388 struct btrfs_delayed_extent_op *extent_op; 6579 struct btrfs_delayed_extent_op *extent_op;
6389 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 6580 extent_op = btrfs_alloc_delayed_extent_op();
6390 BUG_ON(!extent_op); /* -ENOMEM */ 6581 BUG_ON(!extent_op); /* -ENOMEM */
6391 if (key) 6582 if (key)
6392 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 6583 memcpy(&extent_op->key, key, sizeof(extent_op->key));
@@ -7189,6 +7380,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7189 root->fs_info->fs_devices->missing_devices; 7380 root->fs_info->fs_devices->missing_devices;
7190 7381
7191 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7382 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7383 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
7192 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7384 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7193 7385
7194 if (num_devices == 1) { 7386 if (num_devices == 1) {
@@ -7467,16 +7659,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7467 index = get_block_group_index(block_group); 7659 index = get_block_group_index(block_group);
7468 } 7660 }
7469 7661
7470 if (index == 0) { 7662 if (index == BTRFS_RAID_RAID10) {
7471 dev_min = 4; 7663 dev_min = 4;
7472 /* Divide by 2 */ 7664 /* Divide by 2 */
7473 min_free >>= 1; 7665 min_free >>= 1;
7474 } else if (index == 1) { 7666 } else if (index == BTRFS_RAID_RAID1) {
7475 dev_min = 2; 7667 dev_min = 2;
7476 } else if (index == 2) { 7668 } else if (index == BTRFS_RAID_DUP) {
7477 /* Multiply by 2 */ 7669 /* Multiply by 2 */
7478 min_free <<= 1; 7670 min_free <<= 1;
7479 } else if (index == 3) { 7671 } else if (index == BTRFS_RAID_RAID0) {
7480 dev_min = fs_devices->rw_devices; 7672 dev_min = fs_devices->rw_devices;
7481 do_div(min_free, dev_min); 7673 do_div(min_free, dev_min);
7482 } 7674 }
@@ -7637,11 +7829,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7637 space_info = list_entry(info->space_info.next, 7829 space_info = list_entry(info->space_info.next,
7638 struct btrfs_space_info, 7830 struct btrfs_space_info,
7639 list); 7831 list);
7640 if (space_info->bytes_pinned > 0 || 7832 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
7641 space_info->bytes_reserved > 0 || 7833 if (space_info->bytes_pinned > 0 ||
7642 space_info->bytes_may_use > 0) { 7834 space_info->bytes_reserved > 0 ||
7643 WARN_ON(1); 7835 space_info->bytes_may_use > 0) {
7644 dump_space_info(space_info, 0, 0); 7836 WARN_ON(1);
7837 dump_space_info(space_info, 0, 0);
7838 }
7645 } 7839 }
7646 list_del(&space_info->list); 7840 list_del(&space_info->list);
7647 kfree(space_info); 7841 kfree(space_info);
@@ -7740,7 +7934,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7740 btrfs_release_path(path); 7934 btrfs_release_path(path);
7741 cache->flags = btrfs_block_group_flags(&cache->item); 7935 cache->flags = btrfs_block_group_flags(&cache->item);
7742 cache->sectorsize = root->sectorsize; 7936 cache->sectorsize = root->sectorsize;
7743 7937 cache->full_stripe_len = btrfs_full_stripe_len(root,
7938 &root->fs_info->mapping_tree,
7939 found_key.objectid);
7744 btrfs_init_free_space_ctl(cache); 7940 btrfs_init_free_space_ctl(cache);
7745 7941
7746 /* 7942 /*
@@ -7794,6 +7990,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7794 if (!(get_alloc_profile(root, space_info->flags) & 7990 if (!(get_alloc_profile(root, space_info->flags) &
7795 (BTRFS_BLOCK_GROUP_RAID10 | 7991 (BTRFS_BLOCK_GROUP_RAID10 |
7796 BTRFS_BLOCK_GROUP_RAID1 | 7992 BTRFS_BLOCK_GROUP_RAID1 |
7993 BTRFS_BLOCK_GROUP_RAID5 |
7994 BTRFS_BLOCK_GROUP_RAID6 |
7797 BTRFS_BLOCK_GROUP_DUP))) 7995 BTRFS_BLOCK_GROUP_DUP)))
7798 continue; 7996 continue;
7799 /* 7997 /*
@@ -7869,6 +8067,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7869 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8067 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7870 cache->sectorsize = root->sectorsize; 8068 cache->sectorsize = root->sectorsize;
7871 cache->fs_info = root->fs_info; 8069 cache->fs_info = root->fs_info;
8070 cache->full_stripe_len = btrfs_full_stripe_len(root,
8071 &root->fs_info->mapping_tree,
8072 chunk_offset);
7872 8073
7873 atomic_set(&cache->count, 1); 8074 atomic_set(&cache->count, 1);
7874 spin_lock_init(&cache->lock); 8075 spin_lock_init(&cache->lock);
@@ -7918,12 +8119,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7918 u64 extra_flags = chunk_to_extended(flags) & 8119 u64 extra_flags = chunk_to_extended(flags) &
7919 BTRFS_EXTENDED_PROFILE_MASK; 8120 BTRFS_EXTENDED_PROFILE_MASK;
7920 8121
8122 write_seqlock(&fs_info->profiles_lock);
7921 if (flags & BTRFS_BLOCK_GROUP_DATA) 8123 if (flags & BTRFS_BLOCK_GROUP_DATA)
7922 fs_info->avail_data_alloc_bits &= ~extra_flags; 8124 fs_info->avail_data_alloc_bits &= ~extra_flags;
7923 if (flags & BTRFS_BLOCK_GROUP_METADATA) 8125 if (flags & BTRFS_BLOCK_GROUP_METADATA)
7924 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 8126 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7925 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 8127 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7926 fs_info->avail_system_alloc_bits &= ~extra_flags; 8128 fs_info->avail_system_alloc_bits &= ~extra_flags;
8129 write_sequnlock(&fs_info->profiles_lock);
7927} 8130}
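The avail_*_alloc_bits updates are now bracketed by write_seqlock()/write_sequnlock() on the new profiles_lock, so readers can sample all three fields without a lock and simply retry if a writer was active. A simplified model of the sequence-counter idea in C11 atomics (this glosses over the memory-ordering details of the real seqlock):

#include <stdint.h>
#include <stdatomic.h>

struct seq_guarded {
	atomic_uint seq;		/* odd while a writer is in progress */
	uint64_t data_bits, meta_bits, sys_bits;
};

static void writer_clear(struct seq_guarded *g, uint64_t clear)
{
	atomic_fetch_add(&g->seq, 1);	/* begin write: seq becomes odd */
	g->data_bits &= ~clear;
	g->meta_bits &= ~clear;
	g->sys_bits  &= ~clear;
	atomic_fetch_add(&g->seq, 1);	/* end write: seq becomes even again */
}

static uint64_t reader_snapshot(struct seq_guarded *g)
{
	unsigned start;
	uint64_t v;

	do {
		start = atomic_load(&g->seq);
		v = g->data_bits | g->meta_bits | g->sys_bits;
	} while ((start & 1) || atomic_load(&g->seq) != start);	/* retry on racing writer */
	return v;
}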
7928 8131
7929int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 8132int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
@@ -8022,6 +8225,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8022 spin_lock(&root->fs_info->block_group_cache_lock); 8225 spin_lock(&root->fs_info->block_group_cache_lock);
8023 rb_erase(&block_group->cache_node, 8226 rb_erase(&block_group->cache_node,
8024 &root->fs_info->block_group_cache_tree); 8227 &root->fs_info->block_group_cache_tree);
8228
8229 if (root->fs_info->first_logical_byte == block_group->key.objectid)
8230 root->fs_info->first_logical_byte = (u64)-1;
8025 spin_unlock(&root->fs_info->block_group_cache_lock); 8231 spin_unlock(&root->fs_info->block_group_cache_lock);
8026 8232
8027 down_write(&block_group->space_info->groups_sem); 8233 down_write(&block_group->space_info->groups_sem);
@@ -8144,7 +8350,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8144 8350
8145 if (end - start >= range->minlen) { 8351 if (end - start >= range->minlen) {
8146 if (!block_group_cache_done(cache)) { 8352 if (!block_group_cache_done(cache)) {
8147 ret = cache_block_group(cache, NULL, root, 0); 8353 ret = cache_block_group(cache, 0);
8148 if (!ret) 8354 if (!ret)
8149 wait_block_group_cache_done(cache); 8355 wait_block_group_cache_done(cache);
8150 } 8356 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1b319df29eee..f173c5af6461 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4,7 +4,6 @@
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/pagemap.h> 5#include <linux/pagemap.h>
6#include <linux/page-flags.h> 6#include <linux/page-flags.h>
7#include <linux/module.h>
8#include <linux/spinlock.h> 7#include <linux/spinlock.h>
9#include <linux/blkdev.h> 8#include <linux/blkdev.h>
10#include <linux/swap.h> 9#include <linux/swap.h>
@@ -1834,7 +1833,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1834 */ 1833 */
1835static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 1834static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1836{ 1835{
1837 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1836 u64 start = page_offset(page);
1838 u64 end = start + PAGE_CACHE_SIZE - 1; 1837 u64 end = start + PAGE_CACHE_SIZE - 1;
1839 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1838 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1840 SetPageUptodate(page); 1839 SetPageUptodate(page);
@@ -1846,7 +1845,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1846 */ 1845 */
1847static void check_page_locked(struct extent_io_tree *tree, struct page *page) 1846static void check_page_locked(struct extent_io_tree *tree, struct page *page)
1848{ 1847{
1849 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1848 u64 start = page_offset(page);
1850 u64 end = start + PAGE_CACHE_SIZE - 1; 1849 u64 end = start + PAGE_CACHE_SIZE - 1;
1851 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) 1850 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1852 unlock_page(page); 1851 unlock_page(page);
@@ -1895,13 +1894,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1895 if (ret) 1894 if (ret)
1896 err = ret; 1895 err = ret;
1897 1896
1898 if (did_repair) { 1897 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1899 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 1898 rec->start + rec->len - 1,
1900 rec->start + rec->len - 1, 1899 EXTENT_DAMAGED, GFP_NOFS);
1901 EXTENT_DAMAGED, GFP_NOFS); 1900 if (ret && !err)
1902 if (ret && !err) 1901 err = ret;
1903 err = ret;
1904 }
1905 1902
1906 kfree(rec); 1903 kfree(rec);
1907 return err; 1904 return err;
@@ -1932,10 +1929,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1932 u64 map_length = 0; 1929 u64 map_length = 0;
1933 u64 sector; 1930 u64 sector;
1934 struct btrfs_bio *bbio = NULL; 1931 struct btrfs_bio *bbio = NULL;
1932 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
1935 int ret; 1933 int ret;
1936 1934
1937 BUG_ON(!mirror_num); 1935 BUG_ON(!mirror_num);
1938 1936
1937 /* we can't repair anything in raid56 yet */
1938 if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
1939 return 0;
1940
1939 bio = bio_alloc(GFP_NOFS, 1); 1941 bio = bio_alloc(GFP_NOFS, 1);
1940 if (!bio) 1942 if (!bio)
1941 return -EIO; 1943 return -EIO;
@@ -1960,7 +1962,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1960 return -EIO; 1962 return -EIO;
1961 } 1963 }
1962 bio->bi_bdev = dev->bdev; 1964 bio->bi_bdev = dev->bdev;
1963 bio_add_page(bio, page, length, start-page_offset(page)); 1965 bio_add_page(bio, page, length, start - page_offset(page));
1964 btrfsic_submit_bio(WRITE_SYNC, bio); 1966 btrfsic_submit_bio(WRITE_SYNC, bio);
1965 wait_for_completion(&compl); 1967 wait_for_completion(&compl);
1966 1968
@@ -2052,6 +2054,7 @@ static int clean_io_failure(u64 start, struct page *page)
2052 failrec->failed_mirror); 2054 failrec->failed_mirror);
2053 did_repair = !ret; 2055 did_repair = !ret;
2054 } 2056 }
2057 ret = 0;
2055 } 2058 }
2056 2059
2057out: 2060out:
@@ -2293,8 +2296,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
2293 struct page *page = bvec->bv_page; 2296 struct page *page = bvec->bv_page;
2294 tree = &BTRFS_I(page->mapping->host)->io_tree; 2297 tree = &BTRFS_I(page->mapping->host)->io_tree;
2295 2298
2296 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2299 start = page_offset(page) + bvec->bv_offset;
2297 bvec->bv_offset;
2298 end = start + bvec->bv_len - 1; 2300 end = start + bvec->bv_len - 1;
2299 2301
2300 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 2302 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -2353,8 +2355,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2353 (long int)bio->bi_bdev); 2355 (long int)bio->bi_bdev);
2354 tree = &BTRFS_I(page->mapping->host)->io_tree; 2356 tree = &BTRFS_I(page->mapping->host)->io_tree;
2355 2357
2356 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2358 start = page_offset(page) + bvec->bv_offset;
2357 bvec->bv_offset;
2358 end = start + bvec->bv_len - 1; 2359 end = start + bvec->bv_len - 1;
2359 2360
2360 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 2361 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -2471,7 +2472,7 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
2471 struct extent_io_tree *tree = bio->bi_private; 2472 struct extent_io_tree *tree = bio->bi_private;
2472 u64 start; 2473 u64 start;
2473 2474
2474 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 2475 start = page_offset(page) + bvec->bv_offset;
2475 2476
2476 bio->bi_private = NULL; 2477 bio->bi_private = NULL;
2477 2478
@@ -2489,13 +2490,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
2489 return ret; 2490 return ret;
2490} 2491}
2491 2492
2492static int merge_bio(struct extent_io_tree *tree, struct page *page, 2493static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
2493 unsigned long offset, size_t size, struct bio *bio, 2494 unsigned long offset, size_t size, struct bio *bio,
2494 unsigned long bio_flags) 2495 unsigned long bio_flags)
2495{ 2496{
2496 int ret = 0; 2497 int ret = 0;
2497 if (tree->ops && tree->ops->merge_bio_hook) 2498 if (tree->ops && tree->ops->merge_bio_hook)
2498 ret = tree->ops->merge_bio_hook(page, offset, size, bio, 2499 ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
2499 bio_flags); 2500 bio_flags);
2500 BUG_ON(ret < 0); 2501 BUG_ON(ret < 0);
2501 return ret; 2502 return ret;
@@ -2530,7 +2531,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
2530 sector; 2531 sector;
2531 2532
2532 if (prev_bio_flags != bio_flags || !contig || 2533 if (prev_bio_flags != bio_flags || !contig ||
2533 merge_bio(tree, page, offset, page_size, bio, bio_flags) || 2534 merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
2534 bio_add_page(bio, page, page_size, offset) < page_size) { 2535 bio_add_page(bio, page, page_size, offset) < page_size) {
2535 ret = submit_one_bio(rw, bio, mirror_num, 2536 ret = submit_one_bio(rw, bio, mirror_num,
2536 prev_bio_flags); 2537 prev_bio_flags);
@@ -2595,7 +2596,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2595 unsigned long *bio_flags) 2596 unsigned long *bio_flags)
2596{ 2597{
2597 struct inode *inode = page->mapping->host; 2598 struct inode *inode = page->mapping->host;
2598 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2599 u64 start = page_offset(page);
2599 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2600 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2600 u64 end; 2601 u64 end;
2601 u64 cur = start; 2602 u64 cur = start;
@@ -2648,6 +2649,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2648 } 2649 }
2649 } 2650 }
2650 while (cur <= end) { 2651 while (cur <= end) {
2652 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2653
2651 if (cur >= last_byte) { 2654 if (cur >= last_byte) {
2652 char *userpage; 2655 char *userpage;
2653 struct extent_state *cached = NULL; 2656 struct extent_state *cached = NULL;
@@ -2682,7 +2685,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2682 2685
2683 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2686 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2684 cur_end = min(extent_map_end(em) - 1, end); 2687 cur_end = min(extent_map_end(em) - 1, end);
2685 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2688 iosize = ALIGN(iosize, blocksize);
2686 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2689 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2687 disk_io_size = em->block_len; 2690 disk_io_size = em->block_len;
2688 sector = em->block_start >> 9; 2691 sector = em->block_start >> 9;
@@ -2735,26 +2738,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2735 continue; 2738 continue;
2736 } 2739 }
2737 2740
2738 ret = 0; 2741 pnr -= page->index;
2739 if (tree->ops && tree->ops->readpage_io_hook) { 2742 ret = submit_extent_page(READ, tree, page,
2740 ret = tree->ops->readpage_io_hook(page, cur,
2741 cur + iosize - 1);
2742 }
2743 if (!ret) {
2744 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2745 pnr -= page->index;
2746 ret = submit_extent_page(READ, tree, page,
2747 sector, disk_io_size, pg_offset, 2743 sector, disk_io_size, pg_offset,
2748 bdev, bio, pnr, 2744 bdev, bio, pnr,
2749 end_bio_extent_readpage, mirror_num, 2745 end_bio_extent_readpage, mirror_num,
2750 *bio_flags, 2746 *bio_flags,
2751 this_bio_flag); 2747 this_bio_flag);
2752 if (!ret) { 2748 if (!ret) {
2753 nr++; 2749 nr++;
2754 *bio_flags = this_bio_flag; 2750 *bio_flags = this_bio_flag;
2755 } 2751 } else {
2756 }
2757 if (ret) {
2758 SetPageError(page); 2752 SetPageError(page);
2759 unlock_extent(tree, cur, cur + iosize - 1); 2753 unlock_extent(tree, cur, cur + iosize - 1);
2760 } 2754 }
@@ -2806,7 +2800,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2806 struct inode *inode = page->mapping->host; 2800 struct inode *inode = page->mapping->host;
2807 struct extent_page_data *epd = data; 2801 struct extent_page_data *epd = data;
2808 struct extent_io_tree *tree = epd->tree; 2802 struct extent_io_tree *tree = epd->tree;
2809 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2803 u64 start = page_offset(page);
2810 u64 delalloc_start; 2804 u64 delalloc_start;
2811 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2805 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2812 u64 end; 2806 u64 end;
@@ -2982,7 +2976,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2982 BUG_ON(extent_map_end(em) <= cur); 2976 BUG_ON(extent_map_end(em) <= cur);
2983 BUG_ON(end < cur); 2977 BUG_ON(end < cur);
2984 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2978 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2985 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2979 iosize = ALIGN(iosize, blocksize);
2986 sector = (em->block_start + extent_offset) >> 9; 2980 sector = (em->block_start + extent_offset) >> 9;
2987 bdev = em->bdev; 2981 bdev = em->bdev;
2988 block_start = em->block_start; 2982 block_start = em->block_start;
@@ -3124,12 +3118,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3124 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3118 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3125 spin_unlock(&eb->refs_lock); 3119 spin_unlock(&eb->refs_lock);
3126 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3120 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3127 spin_lock(&fs_info->delalloc_lock); 3121 __percpu_counter_add(&fs_info->dirty_metadata_bytes,
3128 if (fs_info->dirty_metadata_bytes >= eb->len) 3122 -eb->len,
3129 fs_info->dirty_metadata_bytes -= eb->len; 3123 fs_info->dirty_metadata_batch);
3130 else
3131 WARN_ON(1);
3132 spin_unlock(&fs_info->delalloc_lock);
3133 ret = 1; 3124 ret = 1;
3134 } else { 3125 } else {
3135 spin_unlock(&eb->refs_lock); 3126 spin_unlock(&eb->refs_lock);
@@ -3446,15 +3437,9 @@ retry:
3446 * swizzled back from swapper_space to tmpfs file 3437 * swizzled back from swapper_space to tmpfs file
3447 * mapping 3438 * mapping
3448 */ 3439 */
3449 if (tree->ops && 3440 if (!trylock_page(page)) {
3450 tree->ops->write_cache_pages_lock_hook) { 3441 flush_fn(data);
3451 tree->ops->write_cache_pages_lock_hook(page, 3442 lock_page(page);
3452 data, flush_fn);
3453 } else {
3454 if (!trylock_page(page)) {
3455 flush_fn(data);
3456 lock_page(page);
3457 }
3458 } 3443 }
3459 3444
3460 if (unlikely(page->mapping != mapping)) { 3445 if (unlikely(page->mapping != mapping)) {
@@ -3674,11 +3659,11 @@ int extent_invalidatepage(struct extent_io_tree *tree,
3674 struct page *page, unsigned long offset) 3659 struct page *page, unsigned long offset)
3675{ 3660{
3676 struct extent_state *cached_state = NULL; 3661 struct extent_state *cached_state = NULL;
3677 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 3662 u64 start = page_offset(page);
3678 u64 end = start + PAGE_CACHE_SIZE - 1; 3663 u64 end = start + PAGE_CACHE_SIZE - 1;
3679 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 3664 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
3680 3665
3681 start += (offset + blocksize - 1) & ~(blocksize - 1); 3666 start += ALIGN(offset, blocksize);
3682 if (start > end) 3667 if (start > end)
3683 return 0; 3668 return 0;
3684 3669
@@ -3700,7 +3685,7 @@ int try_release_extent_state(struct extent_map_tree *map,
3700 struct extent_io_tree *tree, struct page *page, 3685 struct extent_io_tree *tree, struct page *page,
3701 gfp_t mask) 3686 gfp_t mask)
3702{ 3687{
3703 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 3688 u64 start = page_offset(page);
3704 u64 end = start + PAGE_CACHE_SIZE - 1; 3689 u64 end = start + PAGE_CACHE_SIZE - 1;
3705 int ret = 1; 3690 int ret = 1;
3706 3691
@@ -3739,7 +3724,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
3739 gfp_t mask) 3724 gfp_t mask)
3740{ 3725{
3741 struct extent_map *em; 3726 struct extent_map *em;
3742 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 3727 u64 start = page_offset(page);
3743 u64 end = start + PAGE_CACHE_SIZE - 1; 3728 u64 end = start + PAGE_CACHE_SIZE - 1;
3744 3729
3745 if ((mask & __GFP_WAIT) && 3730 if ((mask & __GFP_WAIT) &&
@@ -3797,7 +3782,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
3797 len = last - offset; 3782 len = last - offset;
3798 if (len == 0) 3783 if (len == 0)
3799 break; 3784 break;
3800 len = (len + sectorsize - 1) & ~(sectorsize - 1); 3785 len = ALIGN(len, sectorsize);
3801 em = get_extent(inode, NULL, 0, offset, len, 0); 3786 em = get_extent(inode, NULL, 0, offset, len, 0);
3802 if (IS_ERR_OR_NULL(em)) 3787 if (IS_ERR_OR_NULL(em))
3803 return em; 3788 return em;
@@ -3995,8 +3980,6 @@ static void __free_extent_buffer(struct extent_buffer *eb)
3995 list_del(&eb->leak_list); 3980 list_del(&eb->leak_list);
3996 spin_unlock_irqrestore(&leak_lock, flags); 3981 spin_unlock_irqrestore(&leak_lock, flags);
3997#endif 3982#endif
3998 if (eb->pages && eb->pages != eb->inline_pages)
3999 kfree(eb->pages);
4000 kmem_cache_free(extent_buffer_cache, eb); 3983 kmem_cache_free(extent_buffer_cache, eb);
4001} 3984}
4002 3985
@@ -4037,19 +4020,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
4037 atomic_set(&eb->refs, 1); 4020 atomic_set(&eb->refs, 1);
4038 atomic_set(&eb->io_pages, 0); 4021 atomic_set(&eb->io_pages, 0);
4039 4022
4040 if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { 4023 /*
4041 struct page **pages; 4024 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
4042 int num_pages = (len + PAGE_CACHE_SIZE - 1) >> 4025 */
4043 PAGE_CACHE_SHIFT; 4026 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
4044 pages = kzalloc(num_pages, mask); 4027 > MAX_INLINE_EXTENT_BUFFER_SIZE);
4045 if (!pages) { 4028 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
4046 __free_extent_buffer(eb);
4047 return NULL;
4048 }
4049 eb->pages = pages;
4050 } else {
4051 eb->pages = eb->inline_pages;
4052 }
4053 4029
4054 return eb; 4030 return eb;
4055} 4031}
@@ -4180,6 +4156,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4180 4156
4181static void check_buffer_tree_ref(struct extent_buffer *eb) 4157static void check_buffer_tree_ref(struct extent_buffer *eb)
4182{ 4158{
4159 int refs;
4183 /* the ref bit is tricky. We have to make sure it is set 4160 /* the ref bit is tricky. We have to make sure it is set
4184 * if we have the buffer dirty. Otherwise the 4161 * if we have the buffer dirty. Otherwise the
4185 * code to free a buffer can end up dropping a dirty 4162 * code to free a buffer can end up dropping a dirty
@@ -4200,6 +4177,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
4200 * So bump the ref count first, then set the bit. If someone 4177 * So bump the ref count first, then set the bit. If someone
4201 * beat us to it, drop the ref we added. 4178 * beat us to it, drop the ref we added.
4202 */ 4179 */
4180 refs = atomic_read(&eb->refs);
4181 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4182 return;
4183
4203 spin_lock(&eb->refs_lock); 4184 spin_lock(&eb->refs_lock);
4204 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4185 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4205 atomic_inc(&eb->refs); 4186 atomic_inc(&eb->refs);
@@ -4401,9 +4382,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4401 4382
4402void free_extent_buffer(struct extent_buffer *eb) 4383void free_extent_buffer(struct extent_buffer *eb)
4403{ 4384{
4385 int refs;
4386 int old;
4404 if (!eb) 4387 if (!eb)
4405 return; 4388 return;
4406 4389
4390 while (1) {
4391 refs = atomic_read(&eb->refs);
4392 if (refs <= 3)
4393 break;
4394 old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
4395 if (old == refs)
4396 return;
4397 }
4398
4407 spin_lock(&eb->refs_lock); 4399 spin_lock(&eb->refs_lock);
4408 if (atomic_read(&eb->refs) == 2 && 4400 if (atomic_read(&eb->refs) == 2 &&
4409 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) 4401 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
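The new loop at the top of free_extent_buffer() drops a reference without taking refs_lock whenever the count is still above 3, i.e. high enough that this put cannot be the one that needs the locked slow path, and retries if another CPU changed the count in the meantime. An equivalent compare-and-swap loop in portable C11 atomics (the threshold comes from the patch, everything else is illustrative):

#include <stdatomic.h>
#include <stdbool.h>

/* Returns true if the reference was dropped on the lock-free fast path. */
static bool put_ref_fast(atomic_int *refs)
{
	int cur = atomic_load(refs);

	while (cur > 3) {	/* high enough that the locked slow path cannot be needed */
		if (atomic_compare_exchange_weak(refs, &cur, cur - 1))
			return true;
		/* cur was reloaded by the failed CAS; loop and re-check */
	}
	return false;		/* caller must fall back to the locked slow path */
}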
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2eacfabd3263..6068a1985560 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -72,10 +72,9 @@ struct extent_io_ops {
72 int (*writepage_start_hook)(struct page *page, u64 start, u64 end); 72 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
73 int (*writepage_io_hook)(struct page *page, u64 start, u64 end); 73 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
74 extent_submit_bio_hook_t *submit_bio_hook; 74 extent_submit_bio_hook_t *submit_bio_hook;
75 int (*merge_bio_hook)(struct page *page, unsigned long offset, 75 int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
76 size_t size, struct bio *bio, 76 size_t size, struct bio *bio,
77 unsigned long bio_flags); 77 unsigned long bio_flags);
78 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
79 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); 78 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
80 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, 79 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
81 struct extent_state *state, int mirror); 80 struct extent_state *state, int mirror);
@@ -90,8 +89,6 @@ struct extent_io_ops {
90 struct extent_state *other); 89 struct extent_state *other);
91 void (*split_extent_hook)(struct inode *inode, 90 void (*split_extent_hook)(struct inode *inode,
92 struct extent_state *orig, u64 split); 91 struct extent_state *orig, u64 split);
93 int (*write_cache_pages_lock_hook)(struct page *page, void *data,
94 void (*flush_fn)(void *));
95}; 92};
96 93
97struct extent_io_tree { 94struct extent_io_tree {
@@ -161,8 +158,7 @@ struct extent_buffer {
161 */ 158 */
162 wait_queue_head_t read_lock_wq; 159 wait_queue_head_t read_lock_wq;
163 wait_queue_head_t lock_wq; 160 wait_queue_head_t lock_wq;
164 struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; 161 struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
165 struct page **pages;
166}; 162};
167 163
168static inline void extent_set_compress_type(unsigned long *bio_flags, 164static inline void extent_set_compress_type(unsigned long *bio_flags,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index fdb7a8db3b57..2834ca5768ea 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,6 +1,5 @@
1#include <linux/err.h> 1#include <linux/err.h>
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/module.h>
4#include <linux/spinlock.h> 3#include <linux/spinlock.h>
5#include <linux/hardirq.h> 4#include <linux/hardirq.h>
6#include "ctree.h" 5#include "ctree.h"
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 94aa53b38721..ec160202be3e 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -684,6 +684,24 @@ out:
684 return ret; 684 return ret;
685} 685}
686 686
687static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
688 struct btrfs_sector_sum *sector_sum,
689 u64 total_bytes, u64 sectorsize)
690{
691 u64 tmp = sectorsize;
692 u64 next_sector = sector_sum->bytenr;
693 struct btrfs_sector_sum *next = sector_sum + 1;
694
695 while ((tmp + total_bytes) < sums->len) {
696 if (next_sector + sectorsize != next->bytenr)
697 break;
698 tmp += sectorsize;
699 next_sector = next->bytenr;
700 next++;
701 }
702 return tmp;
703}
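btrfs_sector_sum_left() measures how many checksum bytes, starting from the current sector, describe physically contiguous sectors, stopping at the first gap in bytenr or at the end of the ordered sum. The same run-length idea over a plain array of block numbers, as a hedged sketch:

#include <stddef.h>
#include <stdint.h>

/* Count how many entries from index i onward are physically contiguous.
 * Assumes i < n. */
static size_t contiguous_run(const uint64_t *bytenr, size_t n, size_t i,
			     uint64_t sectorsize)
{
	size_t run = 1;

	while (i + run < n &&
	       bytenr[i + run - 1] + sectorsize == bytenr[i + run])
		run++;
	return run;
}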
704
687int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 705int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
688 struct btrfs_root *root, 706 struct btrfs_root *root,
689 struct btrfs_ordered_sum *sums) 707 struct btrfs_ordered_sum *sums)
@@ -789,20 +807,32 @@ again:
789 goto insert; 807 goto insert;
790 } 808 }
791 809
792 if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / 810 if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
793 csum_size) { 811 csum_size) {
794 u32 diff = (csum_offset + 1) * csum_size; 812 int extend_nr;
813 u64 tmp;
814 u32 diff;
815 u32 free_space;
795 816
796 /* 817 if (btrfs_leaf_free_space(root, leaf) <
797 * is the item big enough already? we dropped our lock 818 sizeof(struct btrfs_item) + csum_size * 2)
798 * before and need to recheck 819 goto insert;
799 */ 820
800 if (diff < btrfs_item_size_nr(leaf, path->slots[0])) 821 free_space = btrfs_leaf_free_space(root, leaf) -
801 goto csum; 822 sizeof(struct btrfs_item) - csum_size;
823 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
824 root->sectorsize);
825 tmp >>= root->fs_info->sb->s_blocksize_bits;
826 WARN_ON(tmp < 1);
827
828 extend_nr = max_t(int, 1, (int)tmp);
829 diff = (csum_offset + extend_nr) * csum_size;
830 diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size);
802 831
803 diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); 832 diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
804 if (diff != csum_size) 833 diff = min(free_space, diff);
805 goto insert; 834 diff /= csum_size;
835 diff *= csum_size;
806 836
807 btrfs_extend_item(trans, root, path, diff); 837 btrfs_extend_item(trans, root, path, diff);
808 goto csum; 838 goto csum;
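The rewritten extension path sizes the btrfs_extend_item() call from three bounds: how many contiguous sectors still need checksums, how large a checksum item may grow (MAX_CSUM_ITEMS), and how much free space the leaf has left, with the result rounded down to whole checksum entries. A worked model of that clamping; parameter names are stand-ins and it assumes the item is not already at its maximum size:

#include <stdint.h>

static uint32_t extend_bytes(uint32_t csum_offset, uint32_t item_size,
			     uint32_t csum_size, uint64_t contig_sectors,
			     uint32_t max_item_csums, uint32_t leaf_free)
{
	uint32_t extend_nr = contig_sectors ? (uint32_t)contig_sectors : 1;
	uint32_t diff = (csum_offset + extend_nr) * csum_size;

	if (diff > max_item_csums * csum_size)	/* item may not exceed its maximum size */
		diff = max_item_csums * csum_size;
	diff -= item_size;			/* only the growth beyond the current size */
	if (diff > leaf_free)			/* and no more than the leaf can hold */
		diff = leaf_free;
	return (diff / csum_size) * csum_size;	/* whole checksum entries only */
}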
@@ -812,19 +842,14 @@ insert:
812 btrfs_release_path(path); 842 btrfs_release_path(path);
813 csum_offset = 0; 843 csum_offset = 0;
814 if (found_next) { 844 if (found_next) {
815 u64 tmp = total_bytes + root->sectorsize; 845 u64 tmp;
816 u64 next_sector = sector_sum->bytenr;
817 struct btrfs_sector_sum *next = sector_sum + 1;
818 846
819 while (tmp < sums->len) { 847 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
820 if (next_sector + root->sectorsize != next->bytenr) 848 root->sectorsize);
821 break;
822 tmp += root->sectorsize;
823 next_sector = next->bytenr;
824 next++;
825 }
826 tmp = min(tmp, next_offset - file_key.offset);
827 tmp >>= root->fs_info->sb->s_blocksize_bits; 849 tmp >>= root->fs_info->sb->s_blocksize_bits;
850 tmp = min(tmp, (next_offset - file_key.offset) >>
851 root->fs_info->sb->s_blocksize_bits);
852
828 tmp = max((u64)1, tmp); 853 tmp = max((u64)1, tmp);
829 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); 854 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
830 ins_size = csum_size * tmp; 855 ins_size = csum_size * tmp;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4b241fe9d2fe..af1d0605a5c1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -30,11 +30,11 @@
30#include <linux/statfs.h> 30#include <linux/statfs.h>
31#include <linux/compat.h> 31#include <linux/compat.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/btrfs.h>
33#include "ctree.h" 34#include "ctree.h"
34#include "disk-io.h" 35#include "disk-io.h"
35#include "transaction.h" 36#include "transaction.h"
36#include "btrfs_inode.h" 37#include "btrfs_inode.h"
37#include "ioctl.h"
38#include "print-tree.h" 38#include "print-tree.h"
39#include "tree-log.h" 39#include "tree-log.h"
40#include "locking.h" 40#include "locking.h"
@@ -374,6 +374,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
374 374
375 atomic_inc(&fs_info->defrag_running); 375 atomic_inc(&fs_info->defrag_running);
376 while(1) { 376 while(1) {
377 /* Pause the auto defragger. */
378 if (test_bit(BTRFS_FS_STATE_REMOUNTING,
379 &fs_info->fs_state))
380 break;
381
377 if (!__need_auto_defrag(fs_info->tree_root)) 382 if (!__need_auto_defrag(fs_info->tree_root))
378 break; 383 break;
379 384
@@ -505,8 +510,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
505 loff_t isize = i_size_read(inode); 510 loff_t isize = i_size_read(inode);
506 511
507 start_pos = pos & ~((u64)root->sectorsize - 1); 512 start_pos = pos & ~((u64)root->sectorsize - 1);
508 num_bytes = (write_bytes + pos - start_pos + 513 num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
509 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
510 514
511 end_of_last_block = start_pos + num_bytes - 1; 515 end_of_last_block = start_pos + num_bytes - 1;
512 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 516 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
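
This and several later hunks replace the open-coded round-up "(x + sectorsize - 1) & ~(sectorsize - 1)" with the kernel's ALIGN() macro; for power-of-two sizes the two forms are identical. A small user-space check of that identity (the macro is redefined locally purely for illustration):

    #include <assert.h>
    #include <stdint.h>

    #define MY_ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
            uint64_t sectorsize = 4096;
            uint64_t pos = 5000, write_bytes = 4000;

            /* same computation as btrfs_dirty_pages() above */
            uint64_t start_pos = pos & ~(sectorsize - 1);
            uint64_t num_bytes = MY_ALIGN(write_bytes + pos - start_pos, sectorsize);

            assert(start_pos == 4096 && num_bytes == 8192); /* covers [4096, 12288) */
            return 0;
    }
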
@@ -1544,7 +1548,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1544 * although we have opened a file as writable, we have 1548 * although we have opened a file as writable, we have
1545 * to stop this write operation to ensure FS consistency. 1549 * to stop this write operation to ensure FS consistency.
1546 */ 1550 */
1547 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 1551 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
1548 mutex_unlock(&inode->i_mutex); 1552 mutex_unlock(&inode->i_mutex);
1549 err = -EROFS; 1553 err = -EROFS;
1550 goto out; 1554 goto out;
@@ -1627,7 +1631,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1627 */ 1631 */
1628 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 1632 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1629 &BTRFS_I(inode)->runtime_flags)) { 1633 &BTRFS_I(inode)->runtime_flags)) {
1630 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); 1634 struct btrfs_trans_handle *trans;
1635 struct btrfs_root *root = BTRFS_I(inode)->root;
1636
1637 /*
1638 * We need to block on a committing transaction to keep us from
 1639 * throwing an ordered operation onto the list and causing
1640 * something like sync to deadlock trying to flush out this
1641 * inode.
1642 */
1643 trans = btrfs_start_transaction(root, 0);
1644 if (IS_ERR(trans))
1645 return PTR_ERR(trans);
1646 btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
1647 btrfs_end_transaction(trans, root);
1631 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 1648 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1632 filemap_flush(inode->i_mapping); 1649 filemap_flush(inode->i_mapping);
1633 } 1650 }
@@ -1654,16 +1671,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1654 struct btrfs_root *root = BTRFS_I(inode)->root; 1671 struct btrfs_root *root = BTRFS_I(inode)->root;
1655 int ret = 0; 1672 int ret = 0;
1656 struct btrfs_trans_handle *trans; 1673 struct btrfs_trans_handle *trans;
1674 bool full_sync = 0;
1657 1675
1658 trace_btrfs_sync_file(file, datasync); 1676 trace_btrfs_sync_file(file, datasync);
1659 1677
1660 /* 1678 /*
1661 * We write the dirty pages in the range and wait until they complete 1679 * We write the dirty pages in the range and wait until they complete
 1662 * outside of the ->i_mutex, so we can flush the dirty pages from 1680 * outside of the ->i_mutex, so we can flush the dirty pages from
 1663 * multiple tasks and improve performance. 1681 * multiple tasks and improve performance. See
1682 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1664 */ 1683 */
1665 atomic_inc(&BTRFS_I(inode)->sync_writers); 1684 atomic_inc(&BTRFS_I(inode)->sync_writers);
1666 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1685 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1686 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1687 &BTRFS_I(inode)->runtime_flags))
1688 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1667 atomic_dec(&BTRFS_I(inode)->sync_writers); 1689 atomic_dec(&BTRFS_I(inode)->sync_writers);
1668 if (ret) 1690 if (ret)
1669 return ret; 1691 return ret;
@@ -1675,7 +1697,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1675 * range being left. 1697 * range being left.
1676 */ 1698 */
1677 atomic_inc(&root->log_batch); 1699 atomic_inc(&root->log_batch);
1678 btrfs_wait_ordered_range(inode, start, end - start + 1); 1700 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1701 &BTRFS_I(inode)->runtime_flags);
1702 if (full_sync)
1703 btrfs_wait_ordered_range(inode, start, end - start + 1);
1679 atomic_inc(&root->log_batch); 1704 atomic_inc(&root->log_batch);
1680 1705
1681 /* 1706 /*
@@ -1742,13 +1767,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1742 1767
1743 if (ret != BTRFS_NO_LOG_SYNC) { 1768 if (ret != BTRFS_NO_LOG_SYNC) {
1744 if (ret > 0) { 1769 if (ret > 0) {
1770 /*
1771 * If we didn't already wait for ordered extents we need
1772 * to do that now.
1773 */
1774 if (!full_sync)
1775 btrfs_wait_ordered_range(inode, start,
1776 end - start + 1);
1745 ret = btrfs_commit_transaction(trans, root); 1777 ret = btrfs_commit_transaction(trans, root);
1746 } else { 1778 } else {
1747 ret = btrfs_sync_log(trans, root); 1779 ret = btrfs_sync_log(trans, root);
1748 if (ret == 0) 1780 if (ret == 0) {
1749 ret = btrfs_end_transaction(trans, root); 1781 ret = btrfs_end_transaction(trans, root);
1750 else 1782 } else {
1783 if (!full_sync)
1784 btrfs_wait_ordered_range(inode, start,
1785 end -
1786 start + 1);
1751 ret = btrfs_commit_transaction(trans, root); 1787 ret = btrfs_commit_transaction(trans, root);
1788 }
1752 } 1789 }
1753 } else { 1790 } else {
1754 ret = btrfs_end_transaction(trans, root); 1791 ret = btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0be7a8742a43..1f84fc09c1a8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1356,6 +1356,8 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1356 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 1356 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1357 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1357 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1358 1358
1359 max_bitmaps = max(max_bitmaps, 1);
1360
1359 BUG_ON(ctl->total_bitmaps > max_bitmaps); 1361 BUG_ON(ctl->total_bitmaps > max_bitmaps);
1360 1362
1361 /* 1363 /*
@@ -1463,10 +1465,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1463} 1465}
1464 1466
1465static struct btrfs_free_space * 1467static struct btrfs_free_space *
1466find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) 1468find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1469 unsigned long align)
1467{ 1470{
1468 struct btrfs_free_space *entry; 1471 struct btrfs_free_space *entry;
1469 struct rb_node *node; 1472 struct rb_node *node;
1473 u64 ctl_off;
1474 u64 tmp;
1475 u64 align_off;
1470 int ret; 1476 int ret;
1471 1477
1472 if (!ctl->free_space_offset.rb_node) 1478 if (!ctl->free_space_offset.rb_node)
@@ -1481,15 +1487,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
1481 if (entry->bytes < *bytes) 1487 if (entry->bytes < *bytes)
1482 continue; 1488 continue;
1483 1489
1490 /* make sure the space returned is big enough
1491 * to match our requested alignment
1492 */
1493 if (*bytes >= align) {
1494 ctl_off = entry->offset - ctl->start;
 1495 tmp = ctl_off + align - 1;
1496 do_div(tmp, align);
1497 tmp = tmp * align + ctl->start;
1498 align_off = tmp - entry->offset;
1499 } else {
1500 align_off = 0;
1501 tmp = entry->offset;
1502 }
1503
1504 if (entry->bytes < *bytes + align_off)
1505 continue;
1506
1484 if (entry->bitmap) { 1507 if (entry->bitmap) {
1485 ret = search_bitmap(ctl, entry, offset, bytes); 1508 ret = search_bitmap(ctl, entry, &tmp, bytes);
1486 if (!ret) 1509 if (!ret) {
1510 *offset = tmp;
1487 return entry; 1511 return entry;
1512 }
1488 continue; 1513 continue;
1489 } 1514 }
1490 1515
1491 *offset = entry->offset; 1516 *offset = tmp;
1492 *bytes = entry->bytes; 1517 *bytes = entry->bytes - align_off;
1493 return entry; 1518 return entry;
1494 } 1519 }
1495 1520
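
The new find_free_space() alignment code rounds an entry's offset up to the next boundary that is ctl->start plus a multiple of the requested alignment (do_div() divides tmp by align in place), then re-checks that the entry still has enough bytes after the gap. A user-space sketch of that round-up, with illustrative names:

    #include <stdio.h>
    #include <stdint.h>

    /* Smallest offset >= entry_offset of the form base + k * align. */
    static uint64_t align_up_from_base(uint64_t entry_offset, uint64_t base,
                                       uint64_t align)
    {
            uint64_t off = entry_offset - base;
            uint64_t k = (off + align - 1) / align;  /* round up in align units */

            return base + k * align;
    }

    int main(void)
    {
            /* block group starts at 1 MiB, free entry at 1 MiB + 100 KiB,
             * full stripe length of 256 KiB -> aligned offset is 1.25 MiB */
            uint64_t aligned = align_up_from_base(1048576 + 102400, 1048576, 262144);

            printf("%llu\n", (unsigned long long)aligned); /* 1310720 */
            return 0;
    }
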
@@ -1636,10 +1661,14 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1636 } 1661 }
1637 1662
1638 /* 1663 /*
1639 * some block groups are so tiny they can't be enveloped by a bitmap, so 1664 * The original block groups from mkfs can be really small, like 8
1640 * don't even bother to create a bitmap for this 1665 * megabytes, so don't bother with a bitmap for those entries. However
1666 * some block groups can be smaller than what a bitmap would cover but
1667 * are still large enough that they could overflow the 32k memory limit,
 1668 * so those block groups are still allowed to have a bitmap
1669 * entry.
1641 */ 1670 */
1642 if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) 1671 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
1643 return false; 1672 return false;
1644 1673
1645 return true; 1674 return true;
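
The use_bitmap() change halves the size below which a block group is refused bitmap entries. Assuming a 4 KiB unit and 32768 bits per bitmap (the common values with 4 KiB pages; both are assumptions, not read from this diff), one bitmap spans 128 MiB, so the cutoff drops from 128 MiB to 64 MiB: the tiny 8 MiB groups created by mkfs still use plain extent entries, while mid-sized groups may now use bitmaps. A quick check of the arithmetic:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long unit = 4096;             /* assumed sector unit */
            unsigned long long bits_per_bitmap = 32768; /* assumed: 4 KiB page * 8 */
            unsigned long long span = bits_per_bitmap * unit;

            printf("bitmap span %llu MiB, old cutoff %llu MiB, new cutoff %llu MiB\n",
                   span >> 20, span >> 20, (span >> 1) >> 20); /* 128, 128, 64 */
            return 0;
    }
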
@@ -2095,9 +2124,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2095 struct btrfs_free_space *entry = NULL; 2124 struct btrfs_free_space *entry = NULL;
2096 u64 bytes_search = bytes + empty_size; 2125 u64 bytes_search = bytes + empty_size;
2097 u64 ret = 0; 2126 u64 ret = 0;
2127 u64 align_gap = 0;
2128 u64 align_gap_len = 0;
2098 2129
2099 spin_lock(&ctl->tree_lock); 2130 spin_lock(&ctl->tree_lock);
2100 entry = find_free_space(ctl, &offset, &bytes_search); 2131 entry = find_free_space(ctl, &offset, &bytes_search,
2132 block_group->full_stripe_len);
2101 if (!entry) 2133 if (!entry)
2102 goto out; 2134 goto out;
2103 2135
@@ -2107,9 +2139,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2107 if (!entry->bytes) 2139 if (!entry->bytes)
2108 free_bitmap(ctl, entry); 2140 free_bitmap(ctl, entry);
2109 } else { 2141 } else {
2142
2110 unlink_free_space(ctl, entry); 2143 unlink_free_space(ctl, entry);
2111 entry->offset += bytes; 2144 align_gap_len = offset - entry->offset;
2112 entry->bytes -= bytes; 2145 align_gap = entry->offset;
2146
2147 entry->offset = offset + bytes;
2148 WARN_ON(entry->bytes < bytes + align_gap_len);
2149
2150 entry->bytes -= bytes + align_gap_len;
2113 if (!entry->bytes) 2151 if (!entry->bytes)
2114 kmem_cache_free(btrfs_free_space_cachep, entry); 2152 kmem_cache_free(btrfs_free_space_cachep, entry);
2115 else 2153 else
@@ -2119,6 +2157,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2119out: 2157out:
2120 spin_unlock(&ctl->tree_lock); 2158 spin_unlock(&ctl->tree_lock);
2121 2159
2160 if (align_gap_len)
2161 __btrfs_add_free_space(ctl, align_gap, align_gap_len);
2122 return ret; 2162 return ret;
2123} 2163}
2124 2164
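
Because the allocation may now start at an aligned offset inside the entry, btrfs_find_space_for_alloc() moves the entry's start past the allocated chunk and, once the tree lock is dropped, gives the skipped-over bytes (align_gap) back via __btrfs_add_free_space(). A sketch of the bookkeeping for carving an aligned chunk out of a free extent (names are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    /* entry covers [offset, offset + bytes); carve 'want' bytes at 'aligned'.
     * Reports the leading gap to return to the pool and what is left behind. */
    static int carve_aligned(uint64_t offset, uint64_t bytes, uint64_t aligned,
                             uint64_t want, uint64_t *gap_len,
                             uint64_t *rest_off, uint64_t *rest_len)
    {
            if (aligned < offset || aligned + want > offset + bytes)
                    return -1;                       /* does not fit */
            *gap_len = aligned - offset;             /* goes back to the pool */
            *rest_off = aligned + want;              /* remainder of the entry */
            *rest_len = offset + bytes - *rest_off;
            return 0;
    }

    int main(void)
    {
            uint64_t gap, rest_off, rest_len;

            if (!carve_aligned(1150976, 524288, 1310720, 131072,
                               &gap, &rest_off, &rest_len))
                    printf("gap=%llu rest=[%llu,+%llu)\n",
                           (unsigned long long)gap,
                           (unsigned long long)rest_off,
                           (unsigned long long)rest_len);
            return 0;
    }
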
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 55c07b650378..c226daefd65d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -39,12 +39,13 @@
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/ratelimit.h> 40#include <linux/ratelimit.h>
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/btrfs.h>
43#include <linux/blkdev.h>
42#include "compat.h" 44#include "compat.h"
43#include "ctree.h" 45#include "ctree.h"
44#include "disk-io.h" 46#include "disk-io.h"
45#include "transaction.h" 47#include "transaction.h"
46#include "btrfs_inode.h" 48#include "btrfs_inode.h"
47#include "ioctl.h"
48#include "print-tree.h" 49#include "print-tree.h"
49#include "ordered-data.h" 50#include "ordered-data.h"
50#include "xattr.h" 51#include "xattr.h"
@@ -54,6 +55,7 @@
54#include "locking.h" 55#include "locking.h"
55#include "free-space-cache.h" 56#include "free-space-cache.h"
56#include "inode-map.h" 57#include "inode-map.h"
58#include "backref.h"
57 59
58struct btrfs_iget_args { 60struct btrfs_iget_args {
59 u64 ino; 61 u64 ino;
@@ -231,8 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
231 u64 isize = i_size_read(inode); 233 u64 isize = i_size_read(inode);
232 u64 actual_end = min(end + 1, isize); 234 u64 actual_end = min(end + 1, isize);
233 u64 inline_len = actual_end - start; 235 u64 inline_len = actual_end - start;
234 u64 aligned_end = (end + root->sectorsize - 1) & 236 u64 aligned_end = ALIGN(end, root->sectorsize);
235 ~((u64)root->sectorsize - 1);
236 u64 data_len = inline_len; 237 u64 data_len = inline_len;
237 int ret; 238 int ret;
238 239
@@ -265,6 +266,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
265 return 1; 266 return 1;
266 } 267 }
267 268
269 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
268 btrfs_delalloc_release_metadata(inode, end + 1 - start); 270 btrfs_delalloc_release_metadata(inode, end + 1 - start);
269 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 271 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
270 return 0; 272 return 0;
@@ -389,7 +391,7 @@ again:
389 * a compressed extent to 128k. 391 * a compressed extent to 128k.
390 */ 392 */
391 total_compressed = min(total_compressed, max_uncompressed); 393 total_compressed = min(total_compressed, max_uncompressed);
392 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 394 num_bytes = ALIGN(end - start + 1, blocksize);
393 num_bytes = max(blocksize, num_bytes); 395 num_bytes = max(blocksize, num_bytes);
394 total_in = 0; 396 total_in = 0;
395 ret = 0; 397 ret = 0;
@@ -488,15 +490,13 @@ cont:
488 * up to a block size boundary so the allocator does sane 490 * up to a block size boundary so the allocator does sane
489 * things 491 * things
490 */ 492 */
491 total_compressed = (total_compressed + blocksize - 1) & 493 total_compressed = ALIGN(total_compressed, blocksize);
492 ~(blocksize - 1);
493 494
494 /* 495 /*
495 * one last check to make sure the compression is really a 496 * one last check to make sure the compression is really a
496 * win, compare the page count read with the blocks on disk 497 * win, compare the page count read with the blocks on disk
497 */ 498 */
498 total_in = (total_in + PAGE_CACHE_SIZE - 1) & 499 total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
499 ~(PAGE_CACHE_SIZE - 1);
500 if (total_compressed >= total_in) { 500 if (total_compressed >= total_in) {
501 will_compress = 0; 501 will_compress = 0;
502 } else { 502 } else {
@@ -608,7 +608,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
608 if (list_empty(&async_cow->extents)) 608 if (list_empty(&async_cow->extents))
609 return 0; 609 return 0;
610 610
611 611again:
612 while (!list_empty(&async_cow->extents)) { 612 while (!list_empty(&async_cow->extents)) {
613 async_extent = list_entry(async_cow->extents.next, 613 async_extent = list_entry(async_cow->extents.next,
614 struct async_extent, list); 614 struct async_extent, list);
@@ -648,6 +648,8 @@ retry:
648 async_extent->ram_size - 1, 648 async_extent->ram_size - 1,
649 btrfs_get_extent, 649 btrfs_get_extent,
650 WB_SYNC_ALL); 650 WB_SYNC_ALL);
651 else if (ret)
652 unlock_page(async_cow->locked_page);
651 kfree(async_extent); 653 kfree(async_extent);
652 cond_resched(); 654 cond_resched();
653 continue; 655 continue;
@@ -672,6 +674,7 @@ retry:
672 674
673 if (ret) { 675 if (ret) {
674 int i; 676 int i;
677
675 for (i = 0; i < async_extent->nr_pages; i++) { 678 for (i = 0; i < async_extent->nr_pages; i++) {
676 WARN_ON(async_extent->pages[i]->mapping); 679 WARN_ON(async_extent->pages[i]->mapping);
677 page_cache_release(async_extent->pages[i]); 680 page_cache_release(async_extent->pages[i]);
@@ -679,12 +682,10 @@ retry:
679 kfree(async_extent->pages); 682 kfree(async_extent->pages);
680 async_extent->nr_pages = 0; 683 async_extent->nr_pages = 0;
681 async_extent->pages = NULL; 684 async_extent->pages = NULL;
682 unlock_extent(io_tree, async_extent->start, 685
683 async_extent->start +
684 async_extent->ram_size - 1);
685 if (ret == -ENOSPC) 686 if (ret == -ENOSPC)
686 goto retry; 687 goto retry;
687 goto out_free; /* JDM: Requeue? */ 688 goto out_free;
688 } 689 }
689 690
690 /* 691 /*
@@ -696,10 +697,13 @@ retry:
696 async_extent->ram_size - 1, 0); 697 async_extent->ram_size - 1, 0);
697 698
698 em = alloc_extent_map(); 699 em = alloc_extent_map();
699 BUG_ON(!em); /* -ENOMEM */ 700 if (!em)
701 goto out_free_reserve;
700 em->start = async_extent->start; 702 em->start = async_extent->start;
701 em->len = async_extent->ram_size; 703 em->len = async_extent->ram_size;
702 em->orig_start = em->start; 704 em->orig_start = em->start;
705 em->mod_start = em->start;
706 em->mod_len = em->len;
703 707
704 em->block_start = ins.objectid; 708 em->block_start = ins.objectid;
705 em->block_len = ins.offset; 709 em->block_len = ins.offset;
@@ -726,6 +730,9 @@ retry:
726 async_extent->ram_size - 1, 0); 730 async_extent->ram_size - 1, 0);
727 } 731 }
728 732
733 if (ret)
734 goto out_free_reserve;
735
729 ret = btrfs_add_ordered_extent_compress(inode, 736 ret = btrfs_add_ordered_extent_compress(inode,
730 async_extent->start, 737 async_extent->start,
731 ins.objectid, 738 ins.objectid,
@@ -733,7 +740,8 @@ retry:
733 ins.offset, 740 ins.offset,
734 BTRFS_ORDERED_COMPRESSED, 741 BTRFS_ORDERED_COMPRESSED,
735 async_extent->compress_type); 742 async_extent->compress_type);
736 BUG_ON(ret); /* -ENOMEM */ 743 if (ret)
744 goto out_free_reserve;
737 745
738 /* 746 /*
739 * clear dirty, set writeback and unlock the pages. 747 * clear dirty, set writeback and unlock the pages.
@@ -754,18 +762,30 @@ retry:
754 ins.objectid, 762 ins.objectid,
755 ins.offset, async_extent->pages, 763 ins.offset, async_extent->pages,
756 async_extent->nr_pages); 764 async_extent->nr_pages);
757
758 BUG_ON(ret); /* -ENOMEM */
759 alloc_hint = ins.objectid + ins.offset; 765 alloc_hint = ins.objectid + ins.offset;
760 kfree(async_extent); 766 kfree(async_extent);
767 if (ret)
768 goto out;
761 cond_resched(); 769 cond_resched();
762 } 770 }
763 ret = 0; 771 ret = 0;
764out: 772out:
765 return ret; 773 return ret;
774out_free_reserve:
775 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
766out_free: 776out_free:
777 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
778 async_extent->start,
779 async_extent->start +
780 async_extent->ram_size - 1,
781 NULL, EXTENT_CLEAR_UNLOCK_PAGE |
782 EXTENT_CLEAR_UNLOCK |
783 EXTENT_CLEAR_DELALLOC |
784 EXTENT_CLEAR_DIRTY |
785 EXTENT_SET_WRITEBACK |
786 EXTENT_END_WRITEBACK);
767 kfree(async_extent); 787 kfree(async_extent);
768 goto out; 788 goto again;
769} 789}
770 790
771static u64 get_extent_allocation_hint(struct inode *inode, u64 start, 791static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
@@ -834,7 +854,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
834 854
835 BUG_ON(btrfs_is_free_space_inode(inode)); 855 BUG_ON(btrfs_is_free_space_inode(inode));
836 856
837 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 857 num_bytes = ALIGN(end - start + 1, blocksize);
838 num_bytes = max(blocksize, num_bytes); 858 num_bytes = max(blocksize, num_bytes);
839 disk_num_bytes = num_bytes; 859 disk_num_bytes = num_bytes;
840 860
@@ -892,6 +912,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
892 em->orig_start = em->start; 912 em->orig_start = em->start;
893 ram_size = ins.offset; 913 ram_size = ins.offset;
894 em->len = ins.offset; 914 em->len = ins.offset;
915 em->mod_start = em->start;
916 em->mod_len = em->len;
895 917
896 em->block_start = ins.objectid; 918 em->block_start = ins.objectid;
897 em->block_len = ins.offset; 919 em->block_len = ins.offset;
@@ -1338,6 +1360,8 @@ out_check:
1338 em->block_start = disk_bytenr; 1360 em->block_start = disk_bytenr;
1339 em->orig_block_len = disk_num_bytes; 1361 em->orig_block_len = disk_num_bytes;
1340 em->bdev = root->fs_info->fs_devices->latest_bdev; 1362 em->bdev = root->fs_info->fs_devices->latest_bdev;
1363 em->mod_start = em->start;
1364 em->mod_len = em->len;
1341 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1365 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1342 set_bit(EXTENT_FLAG_FILLING, &em->flags); 1366 set_bit(EXTENT_FLAG_FILLING, &em->flags);
1343 em->generation = -1; 1367 em->generation = -1;
@@ -1508,14 +1532,22 @@ static void btrfs_set_bit_hook(struct inode *inode,
1508 spin_unlock(&BTRFS_I(inode)->lock); 1532 spin_unlock(&BTRFS_I(inode)->lock);
1509 } 1533 }
1510 1534
1511 spin_lock(&root->fs_info->delalloc_lock); 1535 __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1536 root->fs_info->delalloc_batch);
1537 spin_lock(&BTRFS_I(inode)->lock);
1512 BTRFS_I(inode)->delalloc_bytes += len; 1538 BTRFS_I(inode)->delalloc_bytes += len;
1513 root->fs_info->delalloc_bytes += len; 1539 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1514 if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1540 &BTRFS_I(inode)->runtime_flags)) {
1515 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1541 spin_lock(&root->fs_info->delalloc_lock);
1516 &root->fs_info->delalloc_inodes); 1542 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1543 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1544 &root->fs_info->delalloc_inodes);
1545 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1546 &BTRFS_I(inode)->runtime_flags);
1547 }
1548 spin_unlock(&root->fs_info->delalloc_lock);
1517 } 1549 }
1518 spin_unlock(&root->fs_info->delalloc_lock); 1550 spin_unlock(&BTRFS_I(inode)->lock);
1519 } 1551 }
1520} 1552}
1521 1553
@@ -1550,15 +1582,22 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1550 && do_list) 1582 && do_list)
1551 btrfs_free_reserved_data_space(inode, len); 1583 btrfs_free_reserved_data_space(inode, len);
1552 1584
1553 spin_lock(&root->fs_info->delalloc_lock); 1585 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1554 root->fs_info->delalloc_bytes -= len; 1586 root->fs_info->delalloc_batch);
1587 spin_lock(&BTRFS_I(inode)->lock);
1555 BTRFS_I(inode)->delalloc_bytes -= len; 1588 BTRFS_I(inode)->delalloc_bytes -= len;
1556
1557 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && 1589 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1558 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1590 test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1559 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1591 &BTRFS_I(inode)->runtime_flags)) {
1592 spin_lock(&root->fs_info->delalloc_lock);
1593 if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1594 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1595 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1596 &BTRFS_I(inode)->runtime_flags);
1597 }
1598 spin_unlock(&root->fs_info->delalloc_lock);
1560 } 1599 }
1561 spin_unlock(&root->fs_info->delalloc_lock); 1600 spin_unlock(&BTRFS_I(inode)->lock);
1562 } 1601 }
1563} 1602}
1564 1603
@@ -1566,7 +1605,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1566 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure 1605 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1567 * we don't create bios that span stripes or chunks 1606 * we don't create bios that span stripes or chunks
1568 */ 1607 */
1569int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 1608int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
1570 size_t size, struct bio *bio, 1609 size_t size, struct bio *bio,
1571 unsigned long bio_flags) 1610 unsigned long bio_flags)
1572{ 1611{
@@ -1581,7 +1620,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1581 1620
1582 length = bio->bi_size; 1621 length = bio->bi_size;
1583 map_length = length; 1622 map_length = length;
1584 ret = btrfs_map_block(root->fs_info, READ, logical, 1623 ret = btrfs_map_block(root->fs_info, rw, logical,
1585 &map_length, NULL, 0); 1624 &map_length, NULL, 0);
1586 /* Will always return 0 with map_multi == NULL */ 1625 /* Will always return 0 with map_multi == NULL */
1587 BUG_ON(ret < 0); 1626 BUG_ON(ret < 0);
@@ -1892,6 +1931,640 @@ out:
1892 return ret; 1931 return ret;
1893} 1932}
1894 1933
1934/* snapshot-aware defrag */
1935struct sa_defrag_extent_backref {
1936 struct rb_node node;
1937 struct old_sa_defrag_extent *old;
1938 u64 root_id;
1939 u64 inum;
1940 u64 file_pos;
1941 u64 extent_offset;
1942 u64 num_bytes;
1943 u64 generation;
1944};
1945
1946struct old_sa_defrag_extent {
1947 struct list_head list;
1948 struct new_sa_defrag_extent *new;
1949
1950 u64 extent_offset;
1951 u64 bytenr;
1952 u64 offset;
1953 u64 len;
1954 int count;
1955};
1956
1957struct new_sa_defrag_extent {
1958 struct rb_root root;
1959 struct list_head head;
1960 struct btrfs_path *path;
1961 struct inode *inode;
1962 u64 file_pos;
1963 u64 len;
1964 u64 bytenr;
1965 u64 disk_len;
1966 u8 compress_type;
1967};
1968
1969static int backref_comp(struct sa_defrag_extent_backref *b1,
1970 struct sa_defrag_extent_backref *b2)
1971{
1972 if (b1->root_id < b2->root_id)
1973 return -1;
1974 else if (b1->root_id > b2->root_id)
1975 return 1;
1976
1977 if (b1->inum < b2->inum)
1978 return -1;
1979 else if (b1->inum > b2->inum)
1980 return 1;
1981
1982 if (b1->file_pos < b2->file_pos)
1983 return -1;
1984 else if (b1->file_pos > b2->file_pos)
1985 return 1;
1986
1987 /*
1988 * [------------------------------] ===> (a range of space)
1989 * |<--->| |<---->| =============> (fs/file tree A)
1990 * |<---------------------------->| ===> (fs/file tree B)
1991 *
1992 * A range of space can refer to two file extents in one tree while
 1993 * referring to only one file extent in another tree.
1994 *
 1995 * So we may process a disk offset more than once (two extents in A)
 1996 * that both land on the same extent (one extent in B), and then insert
 1997 * two identical backrefs (both referring to the extent in B).
1998 */
1999 return 0;
2000}
2001
2002static void backref_insert(struct rb_root *root,
2003 struct sa_defrag_extent_backref *backref)
2004{
2005 struct rb_node **p = &root->rb_node;
2006 struct rb_node *parent = NULL;
2007 struct sa_defrag_extent_backref *entry;
2008 int ret;
2009
2010 while (*p) {
2011 parent = *p;
2012 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2013
2014 ret = backref_comp(backref, entry);
2015 if (ret < 0)
2016 p = &(*p)->rb_left;
2017 else
2018 p = &(*p)->rb_right;
2019 }
2020
2021 rb_link_node(&backref->node, parent, p);
2022 rb_insert_color(&backref->node, root);
2023}
2024
2025/*
 2026 * Note the backref might have changed, and in this case we just return 0.
2027 */
2028static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2029 void *ctx)
2030{
2031 struct btrfs_file_extent_item *extent;
2032 struct btrfs_fs_info *fs_info;
2033 struct old_sa_defrag_extent *old = ctx;
2034 struct new_sa_defrag_extent *new = old->new;
2035 struct btrfs_path *path = new->path;
2036 struct btrfs_key key;
2037 struct btrfs_root *root;
2038 struct sa_defrag_extent_backref *backref;
2039 struct extent_buffer *leaf;
2040 struct inode *inode = new->inode;
2041 int slot;
2042 int ret;
2043 u64 extent_offset;
2044 u64 num_bytes;
2045
2046 if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2047 inum == btrfs_ino(inode))
2048 return 0;
2049
2050 key.objectid = root_id;
2051 key.type = BTRFS_ROOT_ITEM_KEY;
2052 key.offset = (u64)-1;
2053
2054 fs_info = BTRFS_I(inode)->root->fs_info;
2055 root = btrfs_read_fs_root_no_name(fs_info, &key);
2056 if (IS_ERR(root)) {
2057 if (PTR_ERR(root) == -ENOENT)
2058 return 0;
2059 WARN_ON(1);
2060 pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
2061 inum, offset, root_id);
2062 return PTR_ERR(root);
2063 }
2064
2065 key.objectid = inum;
2066 key.type = BTRFS_EXTENT_DATA_KEY;
2067 if (offset > (u64)-1 << 32)
2068 key.offset = 0;
2069 else
2070 key.offset = offset;
2071
2072 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2073 if (ret < 0) {
2074 WARN_ON(1);
2075 return ret;
2076 }
2077
2078 while (1) {
2079 cond_resched();
2080
2081 leaf = path->nodes[0];
2082 slot = path->slots[0];
2083
2084 if (slot >= btrfs_header_nritems(leaf)) {
2085 ret = btrfs_next_leaf(root, path);
2086 if (ret < 0) {
2087 goto out;
2088 } else if (ret > 0) {
2089 ret = 0;
2090 goto out;
2091 }
2092 continue;
2093 }
2094
2095 path->slots[0]++;
2096
2097 btrfs_item_key_to_cpu(leaf, &key, slot);
2098
2099 if (key.objectid > inum)
2100 goto out;
2101
2102 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2103 continue;
2104
2105 extent = btrfs_item_ptr(leaf, slot,
2106 struct btrfs_file_extent_item);
2107
2108 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2109 continue;
2110
2111 extent_offset = btrfs_file_extent_offset(leaf, extent);
2112 if (key.offset - extent_offset != offset)
2113 continue;
2114
2115 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2116 if (extent_offset >= old->extent_offset + old->offset +
2117 old->len || extent_offset + num_bytes <=
2118 old->extent_offset + old->offset)
2119 continue;
2120
2121 break;
2122 }
2123
2124 backref = kmalloc(sizeof(*backref), GFP_NOFS);
2125 if (!backref) {
 2126 ret = -ENOMEM;
2127 goto out;
2128 }
2129
2130 backref->root_id = root_id;
2131 backref->inum = inum;
2132 backref->file_pos = offset + extent_offset;
2133 backref->num_bytes = num_bytes;
2134 backref->extent_offset = extent_offset;
2135 backref->generation = btrfs_file_extent_generation(leaf, extent);
2136 backref->old = old;
2137 backref_insert(&new->root, backref);
2138 old->count++;
2139out:
2140 btrfs_release_path(path);
2141 WARN_ON(ret);
2142 return ret;
2143}
2144
2145static noinline bool record_extent_backrefs(struct btrfs_path *path,
2146 struct new_sa_defrag_extent *new)
2147{
2148 struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2149 struct old_sa_defrag_extent *old, *tmp;
2150 int ret;
2151
2152 new->path = path;
2153
2154 list_for_each_entry_safe(old, tmp, &new->head, list) {
2155 ret = iterate_inodes_from_logical(old->bytenr, fs_info,
2156 path, record_one_backref,
2157 old);
2158 BUG_ON(ret < 0 && ret != -ENOENT);
2159
2160 /* no backref to be processed for this extent */
2161 if (!old->count) {
2162 list_del(&old->list);
2163 kfree(old);
2164 }
2165 }
2166
2167 if (list_empty(&new->head))
2168 return false;
2169
2170 return true;
2171}
2172
2173static int relink_is_mergable(struct extent_buffer *leaf,
2174 struct btrfs_file_extent_item *fi,
2175 u64 disk_bytenr)
2176{
2177 if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr)
2178 return 0;
2179
2180 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2181 return 0;
2182
2183 if (btrfs_file_extent_compression(leaf, fi) ||
2184 btrfs_file_extent_encryption(leaf, fi) ||
2185 btrfs_file_extent_other_encoding(leaf, fi))
2186 return 0;
2187
2188 return 1;
2189}
2190
2191/*
 2192 * Note the backref might have changed, and in this case we just return 0.
2193 */
2194static noinline int relink_extent_backref(struct btrfs_path *path,
2195 struct sa_defrag_extent_backref *prev,
2196 struct sa_defrag_extent_backref *backref)
2197{
2198 struct btrfs_file_extent_item *extent;
2199 struct btrfs_file_extent_item *item;
2200 struct btrfs_ordered_extent *ordered;
2201 struct btrfs_trans_handle *trans;
2202 struct btrfs_fs_info *fs_info;
2203 struct btrfs_root *root;
2204 struct btrfs_key key;
2205 struct extent_buffer *leaf;
2206 struct old_sa_defrag_extent *old = backref->old;
2207 struct new_sa_defrag_extent *new = old->new;
2208 struct inode *src_inode = new->inode;
2209 struct inode *inode;
2210 struct extent_state *cached = NULL;
2211 int ret = 0;
2212 u64 start;
2213 u64 len;
2214 u64 lock_start;
2215 u64 lock_end;
2216 bool merge = false;
2217 int index;
2218
2219 if (prev && prev->root_id == backref->root_id &&
2220 prev->inum == backref->inum &&
2221 prev->file_pos + prev->num_bytes == backref->file_pos)
2222 merge = true;
2223
2224 /* step 1: get root */
2225 key.objectid = backref->root_id;
2226 key.type = BTRFS_ROOT_ITEM_KEY;
2227 key.offset = (u64)-1;
2228
2229 fs_info = BTRFS_I(src_inode)->root->fs_info;
2230 index = srcu_read_lock(&fs_info->subvol_srcu);
2231
2232 root = btrfs_read_fs_root_no_name(fs_info, &key);
2233 if (IS_ERR(root)) {
2234 srcu_read_unlock(&fs_info->subvol_srcu, index);
2235 if (PTR_ERR(root) == -ENOENT)
2236 return 0;
2237 return PTR_ERR(root);
2238 }
2239 if (btrfs_root_refs(&root->root_item) == 0) {
2240 srcu_read_unlock(&fs_info->subvol_srcu, index);
 2241 /* treat ENOENT as 0 */
2242 return 0;
2243 }
2244
2245 /* step 2: get inode */
2246 key.objectid = backref->inum;
2247 key.type = BTRFS_INODE_ITEM_KEY;
2248 key.offset = 0;
2249
2250 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2251 if (IS_ERR(inode)) {
2252 srcu_read_unlock(&fs_info->subvol_srcu, index);
2253 return 0;
2254 }
2255
2256 srcu_read_unlock(&fs_info->subvol_srcu, index);
2257
2258 /* step 3: relink backref */
2259 lock_start = backref->file_pos;
2260 lock_end = backref->file_pos + backref->num_bytes - 1;
2261 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2262 0, &cached);
2263
2264 ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2265 if (ordered) {
2266 btrfs_put_ordered_extent(ordered);
2267 goto out_unlock;
2268 }
2269
2270 trans = btrfs_join_transaction(root);
2271 if (IS_ERR(trans)) {
2272 ret = PTR_ERR(trans);
2273 goto out_unlock;
2274 }
2275
2276 key.objectid = backref->inum;
2277 key.type = BTRFS_EXTENT_DATA_KEY;
2278 key.offset = backref->file_pos;
2279
2280 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2281 if (ret < 0) {
2282 goto out_free_path;
2283 } else if (ret > 0) {
2284 ret = 0;
2285 goto out_free_path;
2286 }
2287
2288 extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2289 struct btrfs_file_extent_item);
2290
2291 if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2292 backref->generation)
2293 goto out_free_path;
2294
2295 btrfs_release_path(path);
2296
2297 start = backref->file_pos;
2298 if (backref->extent_offset < old->extent_offset + old->offset)
2299 start += old->extent_offset + old->offset -
2300 backref->extent_offset;
2301
2302 len = min(backref->extent_offset + backref->num_bytes,
2303 old->extent_offset + old->offset + old->len);
2304 len -= max(backref->extent_offset, old->extent_offset + old->offset);
2305
2306 ret = btrfs_drop_extents(trans, root, inode, start,
2307 start + len, 1);
2308 if (ret)
2309 goto out_free_path;
2310again:
2311 key.objectid = btrfs_ino(inode);
2312 key.type = BTRFS_EXTENT_DATA_KEY;
2313 key.offset = start;
2314
2315 if (merge) {
2316 struct btrfs_file_extent_item *fi;
2317 u64 extent_len;
2318 struct btrfs_key found_key;
2319
2320 ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
2321 if (ret < 0)
2322 goto out_free_path;
2323
2324 path->slots[0]--;
2325 leaf = path->nodes[0];
2326 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2327
2328 fi = btrfs_item_ptr(leaf, path->slots[0],
2329 struct btrfs_file_extent_item);
2330 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2331
2332 if (relink_is_mergable(leaf, fi, new->bytenr) &&
2333 extent_len + found_key.offset == start) {
2334 btrfs_set_file_extent_num_bytes(leaf, fi,
2335 extent_len + len);
2336 btrfs_mark_buffer_dirty(leaf);
2337 inode_add_bytes(inode, len);
2338
2339 ret = 1;
2340 goto out_free_path;
2341 } else {
2342 merge = false;
2343 btrfs_release_path(path);
2344 goto again;
2345 }
2346 }
2347
2348 ret = btrfs_insert_empty_item(trans, root, path, &key,
2349 sizeof(*extent));
2350 if (ret) {
2351 btrfs_abort_transaction(trans, root, ret);
2352 goto out_free_path;
2353 }
2354
2355 leaf = path->nodes[0];
2356 item = btrfs_item_ptr(leaf, path->slots[0],
2357 struct btrfs_file_extent_item);
2358 btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2359 btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2360 btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2361 btrfs_set_file_extent_num_bytes(leaf, item, len);
2362 btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2363 btrfs_set_file_extent_generation(leaf, item, trans->transid);
2364 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2365 btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2366 btrfs_set_file_extent_encryption(leaf, item, 0);
2367 btrfs_set_file_extent_other_encoding(leaf, item, 0);
2368
2369 btrfs_mark_buffer_dirty(leaf);
2370 inode_add_bytes(inode, len);
2371
2372 ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2373 new->disk_len, 0,
2374 backref->root_id, backref->inum,
2375 new->file_pos, 0); /* start - extent_offset */
2376 if (ret) {
2377 btrfs_abort_transaction(trans, root, ret);
2378 goto out_free_path;
2379 }
2380
2381 ret = 1;
2382out_free_path:
2383 btrfs_release_path(path);
2384 btrfs_end_transaction(trans, root);
2385out_unlock:
2386 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2387 &cached, GFP_NOFS);
2388 iput(inode);
2389 return ret;
2390}
2391
2392static void relink_file_extents(struct new_sa_defrag_extent *new)
2393{
2394 struct btrfs_path *path;
2395 struct old_sa_defrag_extent *old, *tmp;
2396 struct sa_defrag_extent_backref *backref;
2397 struct sa_defrag_extent_backref *prev = NULL;
2398 struct inode *inode;
2399 struct btrfs_root *root;
2400 struct rb_node *node;
2401 int ret;
2402
2403 inode = new->inode;
2404 root = BTRFS_I(inode)->root;
2405
2406 path = btrfs_alloc_path();
2407 if (!path)
2408 return;
2409
2410 if (!record_extent_backrefs(path, new)) {
2411 btrfs_free_path(path);
2412 goto out;
2413 }
2414 btrfs_release_path(path);
2415
2416 while (1) {
2417 node = rb_first(&new->root);
2418 if (!node)
2419 break;
2420 rb_erase(node, &new->root);
2421
2422 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2423
2424 ret = relink_extent_backref(path, prev, backref);
2425 WARN_ON(ret < 0);
2426
2427 kfree(prev);
2428
2429 if (ret == 1)
2430 prev = backref;
2431 else
2432 prev = NULL;
2433 cond_resched();
2434 }
2435 kfree(prev);
2436
2437 btrfs_free_path(path);
2438
2439 list_for_each_entry_safe(old, tmp, &new->head, list) {
2440 list_del(&old->list);
2441 kfree(old);
2442 }
2443out:
2444 atomic_dec(&root->fs_info->defrag_running);
2445 wake_up(&root->fs_info->transaction_wait);
2446
2447 kfree(new);
2448}
2449
2450static struct new_sa_defrag_extent *
2451record_old_file_extents(struct inode *inode,
2452 struct btrfs_ordered_extent *ordered)
2453{
2454 struct btrfs_root *root = BTRFS_I(inode)->root;
2455 struct btrfs_path *path;
2456 struct btrfs_key key;
2457 struct old_sa_defrag_extent *old, *tmp;
2458 struct new_sa_defrag_extent *new;
2459 int ret;
2460
2461 new = kmalloc(sizeof(*new), GFP_NOFS);
2462 if (!new)
2463 return NULL;
2464
2465 new->inode = inode;
2466 new->file_pos = ordered->file_offset;
2467 new->len = ordered->len;
2468 new->bytenr = ordered->start;
2469 new->disk_len = ordered->disk_len;
2470 new->compress_type = ordered->compress_type;
2471 new->root = RB_ROOT;
2472 INIT_LIST_HEAD(&new->head);
2473
2474 path = btrfs_alloc_path();
2475 if (!path)
2476 goto out_kfree;
2477
2478 key.objectid = btrfs_ino(inode);
2479 key.type = BTRFS_EXTENT_DATA_KEY;
2480 key.offset = new->file_pos;
2481
2482 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2483 if (ret < 0)
2484 goto out_free_path;
2485 if (ret > 0 && path->slots[0] > 0)
2486 path->slots[0]--;
2487
2488 /* find out all the old extents for the file range */
2489 while (1) {
2490 struct btrfs_file_extent_item *extent;
2491 struct extent_buffer *l;
2492 int slot;
2493 u64 num_bytes;
2494 u64 offset;
2495 u64 end;
2496 u64 disk_bytenr;
2497 u64 extent_offset;
2498
2499 l = path->nodes[0];
2500 slot = path->slots[0];
2501
2502 if (slot >= btrfs_header_nritems(l)) {
2503 ret = btrfs_next_leaf(root, path);
2504 if (ret < 0)
2505 goto out_free_list;
2506 else if (ret > 0)
2507 break;
2508 continue;
2509 }
2510
2511 btrfs_item_key_to_cpu(l, &key, slot);
2512
2513 if (key.objectid != btrfs_ino(inode))
2514 break;
2515 if (key.type != BTRFS_EXTENT_DATA_KEY)
2516 break;
2517 if (key.offset >= new->file_pos + new->len)
2518 break;
2519
2520 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2521
2522 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2523 if (key.offset + num_bytes < new->file_pos)
2524 goto next;
2525
2526 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2527 if (!disk_bytenr)
2528 goto next;
2529
2530 extent_offset = btrfs_file_extent_offset(l, extent);
2531
2532 old = kmalloc(sizeof(*old), GFP_NOFS);
2533 if (!old)
2534 goto out_free_list;
2535
2536 offset = max(new->file_pos, key.offset);
2537 end = min(new->file_pos + new->len, key.offset + num_bytes);
2538
2539 old->bytenr = disk_bytenr;
2540 old->extent_offset = extent_offset;
2541 old->offset = offset - key.offset;
2542 old->len = end - offset;
2543 old->new = new;
2544 old->count = 0;
2545 list_add_tail(&old->list, &new->head);
2546next:
2547 path->slots[0]++;
2548 cond_resched();
2549 }
2550
2551 btrfs_free_path(path);
2552 atomic_inc(&root->fs_info->defrag_running);
2553
2554 return new;
2555
2556out_free_list:
2557 list_for_each_entry_safe(old, tmp, &new->head, list) {
2558 list_del(&old->list);
2559 kfree(old);
2560 }
2561out_free_path:
2562 btrfs_free_path(path);
2563out_kfree:
2564 kfree(new);
2565 return NULL;
2566}
2567
1895/* 2568/*
1896 * helper function for btrfs_finish_ordered_io, this 2569 * helper function for btrfs_finish_ordered_io, this
1897 * just reads in some of the csum leaves to prime them into ram 2570 * just reads in some of the csum leaves to prime them into ram
@@ -1909,6 +2582,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1909 struct btrfs_trans_handle *trans = NULL; 2582 struct btrfs_trans_handle *trans = NULL;
1910 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2583 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1911 struct extent_state *cached_state = NULL; 2584 struct extent_state *cached_state = NULL;
2585 struct new_sa_defrag_extent *new = NULL;
1912 int compress_type = 0; 2586 int compress_type = 0;
1913 int ret; 2587 int ret;
1914 bool nolock; 2588 bool nolock;
@@ -1943,6 +2617,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1943 ordered_extent->file_offset + ordered_extent->len - 1, 2617 ordered_extent->file_offset + ordered_extent->len - 1,
1944 0, &cached_state); 2618 0, &cached_state);
1945 2619
2620 ret = test_range_bit(io_tree, ordered_extent->file_offset,
2621 ordered_extent->file_offset + ordered_extent->len - 1,
2622 EXTENT_DEFRAG, 1, cached_state);
2623 if (ret) {
2624 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2625 if (last_snapshot >= BTRFS_I(inode)->generation)
2626 /* the inode is shared */
2627 new = record_old_file_extents(inode, ordered_extent);
2628
2629 clear_extent_bit(io_tree, ordered_extent->file_offset,
2630 ordered_extent->file_offset + ordered_extent->len - 1,
2631 EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2632 }
2633
1946 if (nolock) 2634 if (nolock)
1947 trans = btrfs_join_transaction_nolock(root); 2635 trans = btrfs_join_transaction_nolock(root);
1948 else 2636 else
@@ -2001,17 +2689,33 @@ out:
2001 if (trans) 2689 if (trans)
2002 btrfs_end_transaction(trans, root); 2690 btrfs_end_transaction(trans, root);
2003 2691
2004 if (ret) 2692 if (ret) {
2005 clear_extent_uptodate(io_tree, ordered_extent->file_offset, 2693 clear_extent_uptodate(io_tree, ordered_extent->file_offset,
2006 ordered_extent->file_offset + 2694 ordered_extent->file_offset +
2007 ordered_extent->len - 1, NULL, GFP_NOFS); 2695 ordered_extent->len - 1, NULL, GFP_NOFS);
2008 2696
2697 /*
2698 * If the ordered extent had an IOERR or something else went
2699 * wrong we need to return the space for this ordered extent
2700 * back to the allocator.
2701 */
2702 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2703 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2704 btrfs_free_reserved_extent(root, ordered_extent->start,
2705 ordered_extent->disk_len);
2706 }
2707
2708
2009 /* 2709 /*
2010 * This needs to be done to make sure anybody waiting knows we are done 2710 * This needs to be done to make sure anybody waiting knows we are done
2011 * updating everything for this ordered extent. 2711 * updating everything for this ordered extent.
2012 */ 2712 */
2013 btrfs_remove_ordered_extent(inode, ordered_extent); 2713 btrfs_remove_ordered_extent(inode, ordered_extent);
2014 2714
2715 /* for snapshot-aware defrag */
2716 if (new)
2717 relink_file_extents(new);
2718
2015 /* once for us */ 2719 /* once for us */
2016 btrfs_put_ordered_extent(ordered_extent); 2720 btrfs_put_ordered_extent(ordered_extent);
2017 /* once for the tree */ 2721 /* once for the tree */
@@ -2062,7 +2766,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2062static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 2766static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
2063 struct extent_state *state, int mirror) 2767 struct extent_state *state, int mirror)
2064{ 2768{
2065 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); 2769 size_t offset = start - page_offset(page);
2066 struct inode *inode = page->mapping->host; 2770 struct inode *inode = page->mapping->host;
2067 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2771 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2068 char *kaddr; 2772 char *kaddr;
@@ -2167,11 +2871,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2167 } 2871 }
2168} 2872}
2169 2873
2170enum btrfs_orphan_cleanup_state {
2171 ORPHAN_CLEANUP_STARTED = 1,
2172 ORPHAN_CLEANUP_DONE = 2,
2173};
2174
2175/* 2874/*
2176 * This is called in transaction commit time. If there are no orphan 2875 * This is called in transaction commit time. If there are no orphan
2177 * files in the subvolume, it removes orphan item and frees block_rsv 2876 * files in the subvolume, it removes orphan item and frees block_rsv
@@ -2469,6 +3168,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2469 */ 3168 */
2470 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3169 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2471 &BTRFS_I(inode)->runtime_flags); 3170 &BTRFS_I(inode)->runtime_flags);
3171 atomic_inc(&root->orphan_inodes);
2472 3172
2473 /* if we have links, this was a truncate, lets do that */ 3173 /* if we have links, this was a truncate, lets do that */
2474 if (inode->i_nlink) { 3174 if (inode->i_nlink) {
@@ -2491,6 +3191,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2491 goto out; 3191 goto out;
2492 3192
2493 ret = btrfs_truncate(inode); 3193 ret = btrfs_truncate(inode);
3194 if (ret)
3195 btrfs_orphan_del(NULL, inode);
2494 } else { 3196 } else {
2495 nr_unlink++; 3197 nr_unlink++;
2496 } 3198 }
@@ -2709,34 +3411,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2709 struct btrfs_inode_item *item, 3411 struct btrfs_inode_item *item,
2710 struct inode *inode) 3412 struct inode *inode)
2711{ 3413{
2712 btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); 3414 struct btrfs_map_token token;
2713 btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); 3415
2714 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 3416 btrfs_init_map_token(&token);
2715 btrfs_set_inode_mode(leaf, item, inode->i_mode); 3417
2716 btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 3418 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3419 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3420 btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3421 &token);
3422 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3423 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
2717 3424
2718 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), 3425 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
2719 inode->i_atime.tv_sec); 3426 inode->i_atime.tv_sec, &token);
2720 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), 3427 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
2721 inode->i_atime.tv_nsec); 3428 inode->i_atime.tv_nsec, &token);
2722 3429
2723 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), 3430 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
2724 inode->i_mtime.tv_sec); 3431 inode->i_mtime.tv_sec, &token);
2725 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), 3432 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
2726 inode->i_mtime.tv_nsec); 3433 inode->i_mtime.tv_nsec, &token);
2727 3434
2728 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), 3435 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
2729 inode->i_ctime.tv_sec); 3436 inode->i_ctime.tv_sec, &token);
2730 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), 3437 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
2731 inode->i_ctime.tv_nsec); 3438 inode->i_ctime.tv_nsec, &token);
2732 3439
2733 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 3440 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
2734 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 3441 &token);
2735 btrfs_set_inode_sequence(leaf, item, inode->i_version); 3442 btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
2736 btrfs_set_inode_transid(leaf, item, trans->transid); 3443 &token);
2737 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 3444 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
2738 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 3445 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
2739 btrfs_set_inode_block_group(leaf, item, 0); 3446 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3447 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3448 btrfs_set_token_inode_block_group(leaf, item, 0, &token);
2740} 3449}
2741 3450
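
fill_inode_item() now goes through the btrfs_set_token_* setters, which thread a btrfs_map_token so that back-to-back writes landing in the same extent-buffer page can reuse the previous mapping instead of re-resolving it each time. A generic sketch of that "remember the last mapped page" idea (struct and field names are invented for illustration, not the btrfs types):

    #include <stdint.h>
    #include <stddef.h>

    struct map_token {
            const uint8_t *page_start; /* start of the currently "mapped" region */
            size_t page_size;
    };

    /* Return a pointer to buf[offset], refreshing the token only on a miss. */
    static const uint8_t *map_offset(struct map_token *tok, const uint8_t *buf,
                                     size_t offset, size_t page_size)
    {
            const uint8_t *base = buf + (offset / page_size) * page_size;

            if (tok->page_start != base) {
                    tok->page_start = base;     /* the expensive "kmap" step */
                    tok->page_size = page_size;
            }
            return tok->page_start + (offset % page_size);
    }

    int main(void)
    {
            uint8_t buf[8192] = { 0 };
            struct map_token tok = { 0, 0 };

            map_offset(&tok, buf, 100, 4096);  /* maps the first page */
            map_offset(&tok, buf, 200, 4096);  /* hit: same page, no remap */
            return 0;
    }
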
2742/* 3451/*
@@ -3304,7 +4013,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3304 u64 extent_num_bytes = 0; 4013 u64 extent_num_bytes = 0;
3305 u64 extent_offset = 0; 4014 u64 extent_offset = 0;
3306 u64 item_end = 0; 4015 u64 item_end = 0;
3307 u64 mask = root->sectorsize - 1;
3308 u32 found_type = (u8)-1; 4016 u32 found_type = (u8)-1;
3309 int found_extent; 4017 int found_extent;
3310 int del_item; 4018 int del_item;
@@ -3328,7 +4036,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3328 * extent just the way it is. 4036 * extent just the way it is.
3329 */ 4037 */
3330 if (root->ref_cows || root == root->fs_info->tree_root) 4038 if (root->ref_cows || root == root->fs_info->tree_root)
3331 btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); 4039 btrfs_drop_extent_cache(inode, ALIGN(new_size,
4040 root->sectorsize), (u64)-1, 0);
3332 4041
3333 /* 4042 /*
3334 * This function is also used to drop the items in the log tree before 4043 * This function is also used to drop the items in the log tree before
@@ -3407,10 +4116,9 @@ search_again:
3407 if (!del_item) { 4116 if (!del_item) {
3408 u64 orig_num_bytes = 4117 u64 orig_num_bytes =
3409 btrfs_file_extent_num_bytes(leaf, fi); 4118 btrfs_file_extent_num_bytes(leaf, fi);
3410 extent_num_bytes = new_size - 4119 extent_num_bytes = ALIGN(new_size -
3411 found_key.offset + root->sectorsize - 1; 4120 found_key.offset,
3412 extent_num_bytes = extent_num_bytes & 4121 root->sectorsize);
3413 ~((u64)root->sectorsize - 1);
3414 btrfs_set_file_extent_num_bytes(leaf, fi, 4122 btrfs_set_file_extent_num_bytes(leaf, fi,
3415 extent_num_bytes); 4123 extent_num_bytes);
3416 num_dec = (orig_num_bytes - 4124 num_dec = (orig_num_bytes -
@@ -3646,9 +4354,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3646 struct extent_map *em = NULL; 4354 struct extent_map *em = NULL;
3647 struct extent_state *cached_state = NULL; 4355 struct extent_state *cached_state = NULL;
3648 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4356 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3649 u64 mask = root->sectorsize - 1; 4357 u64 hole_start = ALIGN(oldsize, root->sectorsize);
3650 u64 hole_start = (oldsize + mask) & ~mask; 4358 u64 block_end = ALIGN(size, root->sectorsize);
3651 u64 block_end = (size + mask) & ~mask;
3652 u64 last_byte; 4359 u64 last_byte;
3653 u64 cur_offset; 4360 u64 cur_offset;
3654 u64 hole_size; 4361 u64 hole_size;
@@ -3681,7 +4388,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3681 break; 4388 break;
3682 } 4389 }
3683 last_byte = min(extent_map_end(em), block_end); 4390 last_byte = min(extent_map_end(em), block_end);
 3684 last_byte = (last_byte + mask) & ~mask; 4391 last_byte = ALIGN(last_byte, root->sectorsize);
3685 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 4392 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3686 struct extent_map *hole_em; 4393 struct extent_map *hole_em;
3687 hole_size = last_byte - cur_offset; 4394 hole_size = last_byte - cur_offset;
@@ -3832,6 +4539,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
3832 4539
3833 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 4540 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3834 truncate_setsize(inode, newsize); 4541 truncate_setsize(inode, newsize);
4542
 4543 /* Disable nonlocked read DIO to avoid the endless truncate */
4544 btrfs_inode_block_unlocked_dio(inode);
4545 inode_dio_wait(inode);
4546 btrfs_inode_resume_unlocked_dio(inode);
4547
3835 ret = btrfs_truncate(inode); 4548 ret = btrfs_truncate(inode);
3836 if (ret && inode->i_nlink) 4549 if (ret && inode->i_nlink)
3837 btrfs_orphan_del(NULL, inode); 4550 btrfs_orphan_del(NULL, inode);
@@ -3904,6 +4617,12 @@ void btrfs_evict_inode(struct inode *inode)
3904 goto no_delete; 4617 goto no_delete;
3905 } 4618 }
3906 4619
4620 ret = btrfs_commit_inode_delayed_inode(inode);
4621 if (ret) {
4622 btrfs_orphan_del(NULL, inode);
4623 goto no_delete;
4624 }
4625
3907 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 4626 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3908 if (!rsv) { 4627 if (!rsv) {
3909 btrfs_orphan_del(NULL, inode); 4628 btrfs_orphan_del(NULL, inode);
@@ -3941,7 +4660,7 @@ void btrfs_evict_inode(struct inode *inode)
3941 goto no_delete; 4660 goto no_delete;
3942 } 4661 }
3943 4662
3944 trans = btrfs_start_transaction_lflush(root, 1); 4663 trans = btrfs_join_transaction(root);
3945 if (IS_ERR(trans)) { 4664 if (IS_ERR(trans)) {
3946 btrfs_orphan_del(NULL, inode); 4665 btrfs_orphan_del(NULL, inode);
3947 btrfs_free_block_rsv(root, rsv); 4666 btrfs_free_block_rsv(root, rsv);
@@ -3955,9 +4674,6 @@ void btrfs_evict_inode(struct inode *inode)
3955 break; 4674 break;
3956 4675
3957 trans->block_rsv = &root->fs_info->trans_block_rsv; 4676 trans->block_rsv = &root->fs_info->trans_block_rsv;
3958 ret = btrfs_update_inode(trans, root, inode);
3959 BUG_ON(ret);
3960
3961 btrfs_end_transaction(trans, root); 4677 btrfs_end_transaction(trans, root);
3962 trans = NULL; 4678 trans = NULL;
3963 btrfs_btree_balance_dirty(root); 4679 btrfs_btree_balance_dirty(root);
@@ -4854,7 +5570,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4854 if (btrfs_test_opt(root, NODATASUM)) 5570 if (btrfs_test_opt(root, NODATASUM))
4855 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 5571 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4856 if (btrfs_test_opt(root, NODATACOW)) 5572 if (btrfs_test_opt(root, NODATACOW))
4857 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 5573 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
5574 BTRFS_INODE_NODATASUM;
4858 } 5575 }
4859 5576
4860 insert_inode_hash(inode); 5577 insert_inode_hash(inode);
@@ -5006,12 +5723,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5006 goto out_unlock; 5723 goto out_unlock;
5007 } 5724 }
5008 5725
5009 err = btrfs_update_inode(trans, root, inode);
5010 if (err) {
5011 drop_inode = 1;
5012 goto out_unlock;
5013 }
5014
5015 /* 5726 /*
5016 * If the active LSM wants to access the inode during 5727 * If the active LSM wants to access the inode during
5017 * d_instantiate it needs these. Smack checks to see 5728 * d_instantiate it needs these. Smack checks to see
@@ -5396,8 +6107,7 @@ again:
5396 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6107 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
5397 size_t size; 6108 size_t size;
5398 size = btrfs_file_extent_inline_len(leaf, item); 6109 size = btrfs_file_extent_inline_len(leaf, item);
5399 extent_end = (extent_start + size + root->sectorsize - 1) & 6110 extent_end = ALIGN(extent_start + size, root->sectorsize);
5400 ~((u64)root->sectorsize - 1);
5401 } 6111 }
5402 6112
5403 if (start >= extent_end) { 6113 if (start >= extent_end) {
@@ -5469,8 +6179,7 @@ again:
5469 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 6179 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
5470 size - extent_offset); 6180 size - extent_offset);
5471 em->start = extent_start + extent_offset; 6181 em->start = extent_start + extent_offset;
5472 em->len = (copy_size + root->sectorsize - 1) & 6182 em->len = ALIGN(copy_size, root->sectorsize);
5473 ~((u64)root->sectorsize - 1);
5474 em->orig_block_len = em->len; 6183 em->orig_block_len = em->len;
5475 em->orig_start = em->start; 6184 em->orig_start = em->start;
5476 if (compress_type) { 6185 if (compress_type) {
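
Editor's note: the two hunks above replace the open-coded round-up (x + sectorsize - 1) & ~(sectorsize - 1) with the kernel's ALIGN() macro; for a power-of-two sectorsize the result is identical. A minimal standalone check of that equivalence (ALIGN is restated locally here so the program compiles outside the kernel):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* same shape as the kernel macro, restated for a userspace test */
    #define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

    int main(void)
    {
        uint64_t sectorsize = 4096;          /* power of two, as btrfs requires */
        uint64_t sizes[] = { 0, 1, 4095, 4096, 4097, 123456 };
        unsigned i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
            uint64_t open_coded =
                (sizes[i] + sectorsize - 1) & ~(sectorsize - 1);
            assert(ALIGN(sizes[i], sectorsize) == open_coded);
            printf("%llu -> %llu\n",
                   (unsigned long long)sizes[i],
                   (unsigned long long)ALIGN(sizes[i], sectorsize));
        }
        return 0;
    }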
@@ -5949,6 +6658,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5949 6658
5950 em->start = start; 6659 em->start = start;
5951 em->orig_start = orig_start; 6660 em->orig_start = orig_start;
6661 em->mod_start = start;
6662 em->mod_len = len;
5952 em->len = len; 6663 em->len = len;
5953 em->block_len = block_len; 6664 em->block_len = block_len;
5954 em->block_start = block_start; 6665 em->block_start = block_start;
@@ -5990,16 +6701,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5990 u64 len = bh_result->b_size; 6701 u64 len = bh_result->b_size;
5991 struct btrfs_trans_handle *trans; 6702 struct btrfs_trans_handle *trans;
5992 int unlock_bits = EXTENT_LOCKED; 6703 int unlock_bits = EXTENT_LOCKED;
5993 int ret; 6704 int ret = 0;
5994 6705
5995 if (create) { 6706 if (create)
5996 ret = btrfs_delalloc_reserve_space(inode, len);
5997 if (ret)
5998 return ret;
5999 unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; 6707 unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
6000 } else { 6708 else
6001 len = min_t(u64, len, root->sectorsize); 6709 len = min_t(u64, len, root->sectorsize);
6002 }
6003 6710
6004 lockstart = start; 6711 lockstart = start;
6005 lockend = start + len - 1; 6712 lockend = start + len - 1;
@@ -6011,14 +6718,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6011 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) 6718 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
6012 return -ENOTBLK; 6719 return -ENOTBLK;
6013 6720
6014 if (create) {
6015 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6016 lockend, EXTENT_DELALLOC, NULL,
6017 &cached_state, GFP_NOFS);
6018 if (ret)
6019 goto unlock_err;
6020 }
6021
6022 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 6721 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
6023 if (IS_ERR(em)) { 6722 if (IS_ERR(em)) {
6024 ret = PTR_ERR(em); 6723 ret = PTR_ERR(em);
@@ -6050,7 +6749,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6050 if (!create && (em->block_start == EXTENT_MAP_HOLE || 6749 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
6051 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 6750 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
6052 free_extent_map(em); 6751 free_extent_map(em);
6053 ret = 0;
6054 goto unlock_err; 6752 goto unlock_err;
6055 } 6753 }
6056 6754
@@ -6148,6 +6846,15 @@ unlock:
6148 */ 6846 */
6149 if (start + len > i_size_read(inode)) 6847 if (start + len > i_size_read(inode))
6150 i_size_write(inode, start + len); 6848 i_size_write(inode, start + len);
6849
6850 spin_lock(&BTRFS_I(inode)->lock);
6851 BTRFS_I(inode)->outstanding_extents++;
6852 spin_unlock(&BTRFS_I(inode)->lock);
6853
6854 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6855 lockstart + len - 1, EXTENT_DELALLOC, NULL,
6856 &cached_state, GFP_NOFS);
6857 BUG_ON(ret);
6151 } 6858 }
6152 6859
6153 /* 6860 /*
@@ -6156,24 +6863,9 @@ unlock:
6156 * aren't using if there is any left over space. 6863 * aren't using if there is any left over space.
6157 */ 6864 */
6158 if (lockstart < lockend) { 6865 if (lockstart < lockend) {
6159 if (create && len < lockend - lockstart) { 6866 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6160 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6867 lockend, unlock_bits, 1, 0,
6161 lockstart + len - 1, 6868 &cached_state, GFP_NOFS);
6162 unlock_bits | EXTENT_DEFRAG, 1, 0,
6163 &cached_state, GFP_NOFS);
6164 /*
6165 * Beside unlock, we also need to cleanup reserved space
6166 * for the left range by attaching EXTENT_DO_ACCOUNTING.
6167 */
6168 clear_extent_bit(&BTRFS_I(inode)->io_tree,
6169 lockstart + len, lockend,
6170 unlock_bits | EXTENT_DO_ACCOUNTING |
6171 EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
6172 } else {
6173 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6174 lockend, unlock_bits, 1, 0,
6175 &cached_state, GFP_NOFS);
6176 }
6177 } else { 6869 } else {
6178 free_extent_state(cached_state); 6870 free_extent_state(cached_state);
6179 } 6871 }
@@ -6183,9 +6875,6 @@ unlock:
6183 return 0; 6875 return 0;
6184 6876
6185unlock_err: 6877unlock_err:
6186 if (create)
6187 unlock_bits |= EXTENT_DO_ACCOUNTING;
6188
6189 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6878 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6190 unlock_bits, 1, 0, &cached_state, GFP_NOFS); 6879 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
6191 return ret; 6880 return ret;
@@ -6426,19 +7115,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6426 int async_submit = 0; 7115 int async_submit = 0;
6427 7116
6428 map_length = orig_bio->bi_size; 7117 map_length = orig_bio->bi_size;
6429 ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, 7118 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
6430 &map_length, NULL, 0); 7119 &map_length, NULL, 0);
6431 if (ret) { 7120 if (ret) {
6432 bio_put(orig_bio); 7121 bio_put(orig_bio);
6433 return -EIO; 7122 return -EIO;
6434 } 7123 }
6435
6436 if (map_length >= orig_bio->bi_size) { 7124 if (map_length >= orig_bio->bi_size) {
6437 bio = orig_bio; 7125 bio = orig_bio;
6438 goto submit; 7126 goto submit;
6439 } 7127 }
6440 7128
6441 async_submit = 1; 7129 /* async crcs make it difficult to collect full stripe writes. */
7130 if (btrfs_get_alloc_profile(root, 1) &
7131 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
7132 async_submit = 0;
7133 else
7134 async_submit = 1;
7135
6442 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 7136 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
6443 if (!bio) 7137 if (!bio)
6444 return -ENOMEM; 7138 return -ENOMEM;
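
Editor's note: the hunk above keeps checksum generation synchronous whenever the data allocation profile includes a parity RAID level, so full stripe writes can be collected in submission order. A small sketch of the flag-mask test; the two bit values below are illustrative placeholders, not authoritative copies of the btrfs on-disk constants:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* illustrative values only -- the real ones live in the btrfs headers */
    #define BLOCK_GROUP_RAID5 (1ULL << 7)
    #define BLOCK_GROUP_RAID6 (1ULL << 8)

    /* mirrors the decision in the hunk: parity RAID disables async submit */
    static bool use_async_submit(uint64_t alloc_profile)
    {
        return !(alloc_profile & (BLOCK_GROUP_RAID5 | BLOCK_GROUP_RAID6));
    }

    int main(void)
    {
        printf("plain profile: async=%d\n", use_async_submit(0));
        printf("raid6 profile: async=%d\n",
               use_async_submit(BLOCK_GROUP_RAID6));
        return 0;
    }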
@@ -6480,7 +7174,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6480 bio->bi_end_io = btrfs_end_dio_bio; 7174 bio->bi_end_io = btrfs_end_dio_bio;
6481 7175
6482 map_length = orig_bio->bi_size; 7176 map_length = orig_bio->bi_size;
6483 ret = btrfs_map_block(root->fs_info, READ, 7177 ret = btrfs_map_block(root->fs_info, rw,
6484 start_sector << 9, 7178 start_sector << 9,
6485 &map_length, NULL, 0); 7179 &map_length, NULL, 0);
6486 if (ret) { 7180 if (ret) {
@@ -6623,15 +7317,60 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
6623{ 7317{
6624 struct file *file = iocb->ki_filp; 7318 struct file *file = iocb->ki_filp;
6625 struct inode *inode = file->f_mapping->host; 7319 struct inode *inode = file->f_mapping->host;
7320 size_t count = 0;
7321 int flags = 0;
7322 bool wakeup = true;
7323 bool relock = false;
7324 ssize_t ret;
6626 7325
6627 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, 7326 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
6628 offset, nr_segs)) 7327 offset, nr_segs))
6629 return 0; 7328 return 0;
6630 7329
6631 return __blockdev_direct_IO(rw, iocb, inode, 7330 atomic_inc(&inode->i_dio_count);
6632 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, 7331 smp_mb__after_atomic_inc();
6633 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, 7332
6634 btrfs_submit_direct, 0); 7333 if (rw & WRITE) {
7334 count = iov_length(iov, nr_segs);
7335 /*
 7336 * If the write DIO is beyond the EOF, we need to update
 7337 * the isize, but it is protected by i_mutex, so we cannot
 7338 * unlock the i_mutex in this case.
7339 */
7340 if (offset + count <= inode->i_size) {
7341 mutex_unlock(&inode->i_mutex);
7342 relock = true;
7343 }
7344 ret = btrfs_delalloc_reserve_space(inode, count);
7345 if (ret)
7346 goto out;
7347 } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
7348 &BTRFS_I(inode)->runtime_flags))) {
7349 inode_dio_done(inode);
7350 flags = DIO_LOCKING | DIO_SKIP_HOLES;
7351 wakeup = false;
7352 }
7353
7354 ret = __blockdev_direct_IO(rw, iocb, inode,
7355 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
7356 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
7357 btrfs_submit_direct, flags);
7358 if (rw & WRITE) {
7359 if (ret < 0 && ret != -EIOCBQUEUED)
7360 btrfs_delalloc_release_space(inode, count);
7361 else if (ret >= 0 && (size_t)ret < count)
7362 btrfs_delalloc_release_space(inode,
7363 count - (size_t)ret);
7364 else
7365 btrfs_delalloc_release_metadata(inode, 0);
7366 }
7367out:
7368 if (wakeup)
7369 inode_dio_done(inode);
7370 if (relock)
7371 mutex_lock(&inode->i_mutex);
7372
7373 return ret;
6635} 7374}
6636 7375
6637#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) 7376#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
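
Editor's note: the rewritten btrfs_direct_IO() reserves delalloc space for the whole write before calling __blockdev_direct_IO() and afterwards gives back whatever was not consumed: everything on failure (the async -EIOCBQUEUED case is treated like success), the unwritten tail on a short write, and only the metadata portion on a full write. A hedged userspace sketch of that reserve-then-trim accounting; the helpers are stand-ins, not the kernel functions:

    #include <stdio.h>
    #include <sys/types.h>

    static long long reserved;               /* stand-in for the delalloc reservation */

    static int  reserve_space(size_t count) { reserved += count; return 0; }
    static void release_space(size_t count) { reserved -= count; }

    /* sketch of the post-I/O settlement in the hunk above */
    static void settle_write_reservation(ssize_t ret, size_t count)
    {
        if (ret < 0)                      /* nothing written: drop it all */
            release_space(count);
        else if ((size_t)ret < count)     /* short write: drop the unused tail */
            release_space(count - (size_t)ret);
        /* full write: the data reservation was consumed; the real code only
         * releases the metadata part here */
    }

    int main(void)
    {
        size_t count = 1 << 20;
        reserve_space(count);
        settle_write_reservation(512 * 1024, count);  /* pretend a short write */
        printf("still reserved: %lld bytes\n", reserved);
        return 0;
    }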
@@ -6735,8 +7474,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6735 return; 7474 return;
6736 } 7475 }
6737 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 7476 lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
6738 ordered = btrfs_lookup_ordered_extent(inode, 7477 ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
6739 page_offset(page));
6740 if (ordered) { 7478 if (ordered) {
6741 /* 7479 /*
6742 * IO on this page will never be started, so we need 7480 * IO on this page will never be started, so we need
@@ -7216,8 +7954,9 @@ int btrfs_drop_inode(struct inode *inode)
7216{ 7954{
7217 struct btrfs_root *root = BTRFS_I(inode)->root; 7955 struct btrfs_root *root = BTRFS_I(inode)->root;
7218 7956
 7957 /* the snap/subvol tree is being deleted */
7219 if (btrfs_root_refs(&root->root_item) == 0 && 7958 if (btrfs_root_refs(&root->root_item) == 0 &&
7220 !btrfs_is_free_space_inode(inode)) 7959 root != root->fs_info->tree_root)
7221 return 1; 7960 return 1;
7222 else 7961 else
7223 return generic_drop_inode(inode); 7962 return generic_drop_inode(inode);
@@ -7299,40 +8038,22 @@ fail:
7299static int btrfs_getattr(struct vfsmount *mnt, 8038static int btrfs_getattr(struct vfsmount *mnt,
7300 struct dentry *dentry, struct kstat *stat) 8039 struct dentry *dentry, struct kstat *stat)
7301{ 8040{
8041 u64 delalloc_bytes;
7302 struct inode *inode = dentry->d_inode; 8042 struct inode *inode = dentry->d_inode;
7303 u32 blocksize = inode->i_sb->s_blocksize; 8043 u32 blocksize = inode->i_sb->s_blocksize;
7304 8044
7305 generic_fillattr(inode, stat); 8045 generic_fillattr(inode, stat);
7306 stat->dev = BTRFS_I(inode)->root->anon_dev; 8046 stat->dev = BTRFS_I(inode)->root->anon_dev;
7307 stat->blksize = PAGE_CACHE_SIZE; 8047 stat->blksize = PAGE_CACHE_SIZE;
8048
8049 spin_lock(&BTRFS_I(inode)->lock);
8050 delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
8051 spin_unlock(&BTRFS_I(inode)->lock);
7308 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + 8052 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
7309 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; 8053 ALIGN(delalloc_bytes, blocksize)) >> 9;
7310 return 0; 8054 return 0;
7311} 8055}
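
Editor's note: btrfs_getattr() now samples delalloc_bytes under the inode spinlock and folds it into st_blocks, which is reported in 512-byte sectors. A quick arithmetic check of the formula with hypothetical byte counts:

    #include <stdint.h>
    #include <stdio.h>

    #define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

    int main(void)
    {
        uint64_t blocksize      = 4096;
        uint64_t on_disk_bytes  = 10000;   /* what inode_get_bytes() would return */
        uint64_t delalloc_bytes = 3000;    /* dirty data not yet allocated */

        /* same expression as btrfs_getattr(): both components rounded up to
         * the fs block size, result expressed in 512-byte sectors */
        uint64_t blocks = (ALIGN(on_disk_bytes, blocksize) +
                           ALIGN(delalloc_bytes, blocksize)) >> 9;

        printf("stat.st_blocks = %llu\n", (unsigned long long)blocks);  /* 32 */
        return 0;
    }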
7312 8056
7313/*
7314 * If a file is moved, it will inherit the cow and compression flags of the new
7315 * directory.
7316 */
7317static void fixup_inode_flags(struct inode *dir, struct inode *inode)
7318{
7319 struct btrfs_inode *b_dir = BTRFS_I(dir);
7320 struct btrfs_inode *b_inode = BTRFS_I(inode);
7321
7322 if (b_dir->flags & BTRFS_INODE_NODATACOW)
7323 b_inode->flags |= BTRFS_INODE_NODATACOW;
7324 else
7325 b_inode->flags &= ~BTRFS_INODE_NODATACOW;
7326
7327 if (b_dir->flags & BTRFS_INODE_COMPRESS) {
7328 b_inode->flags |= BTRFS_INODE_COMPRESS;
7329 b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
7330 } else {
7331 b_inode->flags &= ~(BTRFS_INODE_COMPRESS |
7332 BTRFS_INODE_NOCOMPRESS);
7333 }
7334}
7335
7336static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 8057static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7337 struct inode *new_dir, struct dentry *new_dentry) 8058 struct inode *new_dir, struct dentry *new_dentry)
7338{ 8059{
@@ -7498,8 +8219,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7498 } 8219 }
7499 } 8220 }
7500 8221
7501 fixup_inode_flags(new_dir, old_inode);
7502
7503 ret = btrfs_add_link(trans, new_dir, old_inode, 8222 ret = btrfs_add_link(trans, new_dir, old_inode,
7504 new_dentry->d_name.name, 8223 new_dentry->d_name.name,
7505 new_dentry->d_name.len, 0, index); 8224 new_dentry->d_name.len, 0, index);
@@ -7583,7 +8302,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7583 8302
7584 INIT_LIST_HEAD(&works); 8303 INIT_LIST_HEAD(&works);
7585 INIT_LIST_HEAD(&splice); 8304 INIT_LIST_HEAD(&splice);
7586again: 8305
7587 spin_lock(&root->fs_info->delalloc_lock); 8306 spin_lock(&root->fs_info->delalloc_lock);
7588 list_splice_init(&root->fs_info->delalloc_inodes, &splice); 8307 list_splice_init(&root->fs_info->delalloc_inodes, &splice);
7589 while (!list_empty(&splice)) { 8308 while (!list_empty(&splice)) {
@@ -7593,8 +8312,11 @@ again:
7593 list_del_init(&binode->delalloc_inodes); 8312 list_del_init(&binode->delalloc_inodes);
7594 8313
7595 inode = igrab(&binode->vfs_inode); 8314 inode = igrab(&binode->vfs_inode);
7596 if (!inode) 8315 if (!inode) {
8316 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
8317 &binode->runtime_flags);
7597 continue; 8318 continue;
8319 }
7598 8320
7599 list_add_tail(&binode->delalloc_inodes, 8321 list_add_tail(&binode->delalloc_inodes,
7600 &root->fs_info->delalloc_inodes); 8322 &root->fs_info->delalloc_inodes);
@@ -7619,13 +8341,6 @@ again:
7619 btrfs_wait_and_free_delalloc_work(work); 8341 btrfs_wait_and_free_delalloc_work(work);
7620 } 8342 }
7621 8343
7622 spin_lock(&root->fs_info->delalloc_lock);
7623 if (!list_empty(&root->fs_info->delalloc_inodes)) {
7624 spin_unlock(&root->fs_info->delalloc_lock);
7625 goto again;
7626 }
7627 spin_unlock(&root->fs_info->delalloc_lock);
7628
7629 /* the filemap_flush will queue IO into the worker threads, but 8344 /* the filemap_flush will queue IO into the worker threads, but
7630 * we have to make sure the IO is actually started and that 8345 * we have to make sure the IO is actually started and that
7631 * ordered extents get created before we return 8346 * ordered extents get created before we return
@@ -7801,8 +8516,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7801 } 8516 }
7802 } 8517 }
7803 8518
7804 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, 8519 ret = btrfs_reserve_extent(trans, root,
7805 0, *alloc_hint, &ins, 1); 8520 min(num_bytes, 256ULL * 1024 * 1024),
8521 min_size, 0, *alloc_hint, &ins, 1);
7806 if (ret) { 8522 if (ret) {
7807 if (own_trans) 8523 if (own_trans)
7808 btrfs_end_transaction(trans, root); 8524 btrfs_end_transaction(trans, root);
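
Editor's note: the prealloc hunk above caps each extent reservation at 256MiB, so a large fallocate request is satisfied as a sequence of smaller extents rather than one huge allocation. A toy sketch of chunking a request that way; the loop structure is illustrative, the real code advances by the size the allocator actually returned:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_EXTENT (256ULL * 1024 * 1024)

    int main(void)
    {
        uint64_t num_bytes = 1ULL << 30;   /* 1 GiB preallocation request */
        uint64_t min_size  = 4096;

        while (num_bytes >= min_size) {
            uint64_t cur = num_bytes < MAX_EXTENT ? num_bytes : MAX_EXTENT;
            printf("reserve extent of %llu bytes\n", (unsigned long long)cur);
            num_bytes -= cur;              /* the kernel uses ins.offset here */
        }
        return 0;
    }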
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c3f09f71bedd..c83086fdda05 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -42,12 +42,12 @@
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/blkdev.h> 43#include <linux/blkdev.h>
44#include <linux/uuid.h> 44#include <linux/uuid.h>
45#include <linux/btrfs.h>
45#include "compat.h" 46#include "compat.h"
46#include "ctree.h" 47#include "ctree.h"
47#include "disk-io.h" 48#include "disk-io.h"
48#include "transaction.h" 49#include "transaction.h"
49#include "btrfs_inode.h" 50#include "btrfs_inode.h"
50#include "ioctl.h"
51#include "print-tree.h" 51#include "print-tree.h"
52#include "volumes.h" 52#include "volumes.h"
53#include "locking.h" 53#include "locking.h"
@@ -363,46 +363,52 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
363 return 0; 363 return 0;
364} 364}
365 365
366static noinline int create_subvol(struct btrfs_root *root, 366static noinline int create_subvol(struct inode *dir,
367 struct dentry *dentry, 367 struct dentry *dentry,
368 char *name, int namelen, 368 char *name, int namelen,
369 u64 *async_transid, 369 u64 *async_transid,
370 struct btrfs_qgroup_inherit **inherit) 370 struct btrfs_qgroup_inherit *inherit)
371{ 371{
372 struct btrfs_trans_handle *trans; 372 struct btrfs_trans_handle *trans;
373 struct btrfs_key key; 373 struct btrfs_key key;
374 struct btrfs_root_item root_item; 374 struct btrfs_root_item root_item;
375 struct btrfs_inode_item *inode_item; 375 struct btrfs_inode_item *inode_item;
376 struct extent_buffer *leaf; 376 struct extent_buffer *leaf;
377 struct btrfs_root *root = BTRFS_I(dir)->root;
377 struct btrfs_root *new_root; 378 struct btrfs_root *new_root;
378 struct dentry *parent = dentry->d_parent; 379 struct btrfs_block_rsv block_rsv;
379 struct inode *dir;
380 struct timespec cur_time = CURRENT_TIME; 380 struct timespec cur_time = CURRENT_TIME;
381 int ret; 381 int ret;
382 int err; 382 int err;
383 u64 objectid; 383 u64 objectid;
384 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 384 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
385 u64 index = 0; 385 u64 index = 0;
386 u64 qgroup_reserved;
386 uuid_le new_uuid; 387 uuid_le new_uuid;
387 388
388 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); 389 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
389 if (ret) 390 if (ret)
390 return ret; 391 return ret;
391 392
392 dir = parent->d_inode; 393 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
393
394 /* 394 /*
395 * 1 - inode item 395 * The same as the snapshot creation, please see the comment
396 * 2 - refs 396 * of create_snapshot().
397 * 1 - root item
398 * 2 - dir items
399 */ 397 */
400 trans = btrfs_start_transaction(root, 6); 398 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
401 if (IS_ERR(trans)) 399 7, &qgroup_reserved);
402 return PTR_ERR(trans); 400 if (ret)
401 return ret;
402
403 trans = btrfs_start_transaction(root, 0);
404 if (IS_ERR(trans)) {
405 ret = PTR_ERR(trans);
406 goto out;
407 }
408 trans->block_rsv = &block_rsv;
409 trans->bytes_reserved = block_rsv.size;
403 410
404 ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, 411 ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
405 inherit ? *inherit : NULL);
406 if (ret) 412 if (ret)
407 goto fail; 413 goto fail;
408 414
@@ -516,6 +522,8 @@ static noinline int create_subvol(struct btrfs_root *root,
516 BUG_ON(ret); 522 BUG_ON(ret);
517 523
518fail: 524fail:
525 trans->block_rsv = NULL;
526 trans->bytes_reserved = 0;
519 if (async_transid) { 527 if (async_transid) {
520 *async_transid = trans->transid; 528 *async_transid = trans->transid;
521 err = btrfs_commit_transaction_async(trans, root, 1); 529 err = btrfs_commit_transaction_async(trans, root, 1);
@@ -527,13 +535,15 @@ fail:
527 535
528 if (!ret) 536 if (!ret)
529 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 537 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
530 538out:
539 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
531 return ret; 540 return ret;
532} 541}
533 542
534static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 543static int create_snapshot(struct btrfs_root *root, struct inode *dir,
535 char *name, int namelen, u64 *async_transid, 544 struct dentry *dentry, char *name, int namelen,
536 bool readonly, struct btrfs_qgroup_inherit **inherit) 545 u64 *async_transid, bool readonly,
546 struct btrfs_qgroup_inherit *inherit)
537{ 547{
538 struct inode *inode; 548 struct inode *inode;
539 struct btrfs_pending_snapshot *pending_snapshot; 549 struct btrfs_pending_snapshot *pending_snapshot;
@@ -549,23 +559,31 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
549 559
550 btrfs_init_block_rsv(&pending_snapshot->block_rsv, 560 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
551 BTRFS_BLOCK_RSV_TEMP); 561 BTRFS_BLOCK_RSV_TEMP);
562 /*
563 * 1 - parent dir inode
564 * 2 - dir entries
565 * 1 - root item
566 * 2 - root ref/backref
567 * 1 - root of snapshot
568 */
569 ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
570 &pending_snapshot->block_rsv, 7,
571 &pending_snapshot->qgroup_reserved);
572 if (ret)
573 goto out;
574
552 pending_snapshot->dentry = dentry; 575 pending_snapshot->dentry = dentry;
553 pending_snapshot->root = root; 576 pending_snapshot->root = root;
554 pending_snapshot->readonly = readonly; 577 pending_snapshot->readonly = readonly;
555 if (inherit) { 578 pending_snapshot->dir = dir;
556 pending_snapshot->inherit = *inherit; 579 pending_snapshot->inherit = inherit;
557 *inherit = NULL; /* take responsibility to free it */
558 }
559 580
560 trans = btrfs_start_transaction(root->fs_info->extent_root, 6); 581 trans = btrfs_start_transaction(root, 0);
561 if (IS_ERR(trans)) { 582 if (IS_ERR(trans)) {
562 ret = PTR_ERR(trans); 583 ret = PTR_ERR(trans);
563 goto fail; 584 goto fail;
564 } 585 }
565 586
566 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
567 BUG_ON(ret);
568
569 spin_lock(&root->fs_info->trans_lock); 587 spin_lock(&root->fs_info->trans_lock);
570 list_add(&pending_snapshot->list, 588 list_add(&pending_snapshot->list,
571 &trans->transaction->pending_snapshots); 589 &trans->transaction->pending_snapshots);
@@ -602,6 +620,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
602 d_instantiate(dentry, inode); 620 d_instantiate(dentry, inode);
603 ret = 0; 621 ret = 0;
604fail: 622fail:
623 btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
624 &pending_snapshot->block_rsv,
625 pending_snapshot->qgroup_reserved);
626out:
605 kfree(pending_snapshot); 627 kfree(pending_snapshot);
606 return ret; 628 return ret;
607} 629}
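
Editor's note: create_subvol() and create_snapshot() now reserve the metadata they will dirty (seven tree items: parent dir inode, two dir entries, root item, two root refs, the new root) via btrfs_subvolume_reserve_metadata() before starting a transaction, and release whatever is left on the way out. A minimal sketch of that reserve-before-transaction pattern, with hypothetical stand-in helpers:

    #include <stdio.h>

    /* hypothetical stand-ins for the reservation helpers used above */
    static int reserve_metadata(int items, long long *reserved)
    {
        *reserved = items * 16384LL;      /* pretend each item costs one leaf */
        return 0;
    }

    static void release_metadata(long long reserved)
    {
        printf("released %lld bytes\n", reserved);
    }

    static int do_subvol_style_op(void)
    {
        long long reserved = 0;
        int ret;

        /* reserve everything the operation can dirty up front */
        ret = reserve_metadata(7, &reserved);
        if (ret)
            return ret;

        /* ... start the transaction and charge it against the reservation ... */
        ret = 0;                          /* pretend the work succeeded */

        /* always hand back what was not consumed, success or failure */
        release_metadata(reserved);
        return ret;
    }

    int main(void) { return do_subvol_style_op(); }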
@@ -695,7 +717,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
695 char *name, int namelen, 717 char *name, int namelen,
696 struct btrfs_root *snap_src, 718 struct btrfs_root *snap_src,
697 u64 *async_transid, bool readonly, 719 u64 *async_transid, bool readonly,
698 struct btrfs_qgroup_inherit **inherit) 720 struct btrfs_qgroup_inherit *inherit)
699{ 721{
700 struct inode *dir = parent->dentry->d_inode; 722 struct inode *dir = parent->dentry->d_inode;
701 struct dentry *dentry; 723 struct dentry *dentry;
@@ -732,11 +754,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
732 goto out_up_read; 754 goto out_up_read;
733 755
734 if (snap_src) { 756 if (snap_src) {
735 error = create_snapshot(snap_src, dentry, name, namelen, 757 error = create_snapshot(snap_src, dir, dentry, name, namelen,
736 async_transid, readonly, inherit); 758 async_transid, readonly, inherit);
737 } else { 759 } else {
738 error = create_subvol(BTRFS_I(dir)->root, dentry, 760 error = create_subvol(dir, dentry, name, namelen,
739 name, namelen, async_transid, inherit); 761 async_transid, inherit);
740 } 762 }
741 if (!error) 763 if (!error)
742 fsnotify_mkdir(dir, dentry); 764 fsnotify_mkdir(dir, dentry);
@@ -818,7 +840,7 @@ static int find_new_extents(struct btrfs_root *root,
818 840
819 while(1) { 841 while(1) {
820 ret = btrfs_search_forward(root, &min_key, &max_key, 842 ret = btrfs_search_forward(root, &min_key, &max_key,
821 path, 0, newer_than); 843 path, newer_than);
822 if (ret != 0) 844 if (ret != 0)
823 goto none; 845 goto none;
824 if (min_key.objectid != ino) 846 if (min_key.objectid != ino)
@@ -1206,6 +1228,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1206 if (!(inode->i_sb->s_flags & MS_ACTIVE)) 1228 if (!(inode->i_sb->s_flags & MS_ACTIVE))
1207 break; 1229 break;
1208 1230
1231 if (btrfs_defrag_cancelled(root->fs_info)) {
1232 printk(KERN_DEBUG "btrfs: defrag_file cancelled\n");
1233 ret = -EAGAIN;
1234 break;
1235 }
1236
1209 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, 1237 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
1210 extent_thresh, &last_len, &skip, 1238 extent_thresh, &last_len, &skip,
1211 &defrag_end, range->flags & 1239 &defrag_end, range->flags &
@@ -1329,9 +1357,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1329 int ret = 0; 1357 int ret = 0;
1330 int mod = 0; 1358 int mod = 0;
1331 1359
1332 if (root->fs_info->sb->s_flags & MS_RDONLY)
1333 return -EROFS;
1334
1335 if (!capable(CAP_SYS_ADMIN)) 1360 if (!capable(CAP_SYS_ADMIN))
1336 return -EPERM; 1361 return -EPERM;
1337 1362
@@ -1363,6 +1388,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1363 *devstr = '\0'; 1388 *devstr = '\0';
1364 devstr = vol_args->name; 1389 devstr = vol_args->name;
1365 devid = simple_strtoull(devstr, &end, 10); 1390 devid = simple_strtoull(devstr, &end, 10);
1391 if (!devid) {
1392 ret = -EINVAL;
1393 goto out_free;
1394 }
1366 printk(KERN_INFO "btrfs: resizing devid %llu\n", 1395 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1367 (unsigned long long)devid); 1396 (unsigned long long)devid);
1368 } 1397 }
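
Editor's note: devid 0 is never valid, so the parsed value is rejected before the device lookup; later hunks also return -ENODEV when the device is not found and -EPERM for read-only or replace-target devices instead of a blanket -EINVAL. A small userspace sketch of validating a "devid:amount" string the way the resize argument is roughly shaped:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* reject a zero or unparseable device id before doing anything else */
    static int parse_devid(const char *devstr, unsigned long long *devid)
    {
        char *end;

        errno = 0;
        *devid = strtoull(devstr, &end, 10);
        if (errno || end == devstr || *end != ':' || *devid == 0)
            return -EINVAL;
        return 0;
    }

    int main(void)
    {
        unsigned long long devid;

        printf("\"2:+1g\" -> %d\n", parse_devid("2:+1g", &devid));   /* 0 */
        printf("\"0:max\" -> %d\n", parse_devid("0:max", &devid));   /* -EINVAL */
        return 0;
    }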
@@ -1371,7 +1400,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1371 if (!device) { 1400 if (!device) {
1372 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1401 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1373 (unsigned long long)devid); 1402 (unsigned long long)devid);
1374 ret = -EINVAL; 1403 ret = -ENODEV;
1375 goto out_free; 1404 goto out_free;
1376 } 1405 }
1377 1406
@@ -1379,7 +1408,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1379 printk(KERN_INFO "btrfs: resizer unable to apply on " 1408 printk(KERN_INFO "btrfs: resizer unable to apply on "
1380 "readonly device %llu\n", 1409 "readonly device %llu\n",
1381 (unsigned long long)devid); 1410 (unsigned long long)devid);
1382 ret = -EINVAL; 1411 ret = -EPERM;
1383 goto out_free; 1412 goto out_free;
1384 } 1413 }
1385 1414
@@ -1401,7 +1430,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1401 } 1430 }
1402 1431
1403 if (device->is_tgtdev_for_dev_replace) { 1432 if (device->is_tgtdev_for_dev_replace) {
1404 ret = -EINVAL; 1433 ret = -EPERM;
1405 goto out_free; 1434 goto out_free;
1406 } 1435 }
1407 1436
@@ -1457,7 +1486,7 @@ out:
1457static noinline int btrfs_ioctl_snap_create_transid(struct file *file, 1486static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1458 char *name, unsigned long fd, int subvol, 1487 char *name, unsigned long fd, int subvol,
1459 u64 *transid, bool readonly, 1488 u64 *transid, bool readonly,
1460 struct btrfs_qgroup_inherit **inherit) 1489 struct btrfs_qgroup_inherit *inherit)
1461{ 1490{
1462 int namelen; 1491 int namelen;
1463 int ret = 0; 1492 int ret = 0;
@@ -1566,7 +1595,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1566 1595
1567 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1596 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1568 vol_args->fd, subvol, ptr, 1597 vol_args->fd, subvol, ptr,
1569 readonly, &inherit); 1598 readonly, inherit);
1570 1599
1571 if (ret == 0 && ptr && 1600 if (ret == 0 && ptr &&
1572 copy_to_user(arg + 1601 copy_to_user(arg +
@@ -1863,7 +1892,7 @@ static noinline int search_ioctl(struct inode *inode,
1863 path->keep_locks = 1; 1892 path->keep_locks = 1;
1864 1893
1865 while(1) { 1894 while(1) {
1866 ret = btrfs_search_forward(root, &key, &max_key, path, 0, 1895 ret = btrfs_search_forward(root, &key, &max_key, path,
1867 sk->min_transid); 1896 sk->min_transid);
1868 if (ret != 0) { 1897 if (ret != 0) {
1869 if (ret > 0) 1898 if (ret > 0)
@@ -2035,6 +2064,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2035 struct btrfs_root *dest = NULL; 2064 struct btrfs_root *dest = NULL;
2036 struct btrfs_ioctl_vol_args *vol_args; 2065 struct btrfs_ioctl_vol_args *vol_args;
2037 struct btrfs_trans_handle *trans; 2066 struct btrfs_trans_handle *trans;
2067 struct btrfs_block_rsv block_rsv;
2068 u64 qgroup_reserved;
2038 int namelen; 2069 int namelen;
2039 int ret; 2070 int ret;
2040 int err = 0; 2071 int err = 0;
@@ -2124,12 +2155,23 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2124 if (err) 2155 if (err)
2125 goto out_up_write; 2156 goto out_up_write;
2126 2157
2158 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
2159 /*
2160 * One for dir inode, two for dir entries, two for root
2161 * ref/backref.
2162 */
2163 err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
2164 5, &qgroup_reserved);
2165 if (err)
2166 goto out_up_write;
2167
2127 trans = btrfs_start_transaction(root, 0); 2168 trans = btrfs_start_transaction(root, 0);
2128 if (IS_ERR(trans)) { 2169 if (IS_ERR(trans)) {
2129 err = PTR_ERR(trans); 2170 err = PTR_ERR(trans);
2130 goto out_up_write; 2171 goto out_release;
2131 } 2172 }
2132 trans->block_rsv = &root->fs_info->global_block_rsv; 2173 trans->block_rsv = &block_rsv;
2174 trans->bytes_reserved = block_rsv.size;
2133 2175
2134 ret = btrfs_unlink_subvol(trans, root, dir, 2176 ret = btrfs_unlink_subvol(trans, root, dir,
2135 dest->root_key.objectid, 2177 dest->root_key.objectid,
@@ -2159,10 +2201,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2159 } 2201 }
2160 } 2202 }
2161out_end_trans: 2203out_end_trans:
2204 trans->block_rsv = NULL;
2205 trans->bytes_reserved = 0;
2162 ret = btrfs_end_transaction(trans, root); 2206 ret = btrfs_end_transaction(trans, root);
2163 if (ret && !err) 2207 if (ret && !err)
2164 err = ret; 2208 err = ret;
2165 inode->i_flags |= S_DEAD; 2209 inode->i_flags |= S_DEAD;
2210out_release:
2211 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
2166out_up_write: 2212out_up_write:
2167 up_write(&root->fs_info->subvol_sem); 2213 up_write(&root->fs_info->subvol_sem);
2168out_unlock: 2214out_unlock:
@@ -2171,6 +2217,12 @@ out_unlock:
2171 shrink_dcache_sb(root->fs_info->sb); 2217 shrink_dcache_sb(root->fs_info->sb);
2172 btrfs_invalidate_inodes(dest); 2218 btrfs_invalidate_inodes(dest);
2173 d_delete(dentry); 2219 d_delete(dentry);
2220
2221 /* the last ref */
2222 if (dest->cache_inode) {
2223 iput(dest->cache_inode);
2224 dest->cache_inode = NULL;
2225 }
2174 } 2226 }
2175out_dput: 2227out_dput:
2176 dput(dentry); 2228 dput(dentry);
@@ -2211,10 +2263,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2211 ret = -EPERM; 2263 ret = -EPERM;
2212 goto out; 2264 goto out;
2213 } 2265 }
2214 ret = btrfs_defrag_root(root, 0); 2266 ret = btrfs_defrag_root(root);
2215 if (ret) 2267 if (ret)
2216 goto out; 2268 goto out;
2217 ret = btrfs_defrag_root(root->fs_info->extent_root, 0); 2269 ret = btrfs_defrag_root(root->fs_info->extent_root);
2218 break; 2270 break;
2219 case S_IFREG: 2271 case S_IFREG:
2220 if (!(file->f_mode & FMODE_WRITE)) { 2272 if (!(file->f_mode & FMODE_WRITE)) {
@@ -3111,7 +3163,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
3111 u64 transid; 3163 u64 transid;
3112 int ret; 3164 int ret;
3113 3165
3114 trans = btrfs_attach_transaction(root); 3166 trans = btrfs_attach_transaction_barrier(root);
3115 if (IS_ERR(trans)) { 3167 if (IS_ERR(trans)) {
3116 if (PTR_ERR(trans) != -ENOENT) 3168 if (PTR_ERR(trans) != -ENOENT)
3117 return PTR_ERR(trans); 3169 return PTR_ERR(trans);
@@ -3289,7 +3341,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3289 struct inode_fs_paths *ipath = NULL; 3341 struct inode_fs_paths *ipath = NULL;
3290 struct btrfs_path *path; 3342 struct btrfs_path *path;
3291 3343
3292 if (!capable(CAP_SYS_ADMIN)) 3344 if (!capable(CAP_DAC_READ_SEARCH))
3293 return -EPERM; 3345 return -EPERM;
3294 3346
3295 path = btrfs_alloc_path(); 3347 path = btrfs_alloc_path();
@@ -3914,6 +3966,65 @@ out:
3914 return ret; 3966 return ret;
3915} 3967}
3916 3968
3969static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
3970{
3971 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3972 const char *label = root->fs_info->super_copy->label;
3973 size_t len = strnlen(label, BTRFS_LABEL_SIZE);
3974 int ret;
3975
3976 if (len == BTRFS_LABEL_SIZE) {
3977 pr_warn("btrfs: label is too long, return the first %zu bytes\n",
3978 --len);
3979 }
3980
3981 mutex_lock(&root->fs_info->volume_mutex);
3982 ret = copy_to_user(arg, label, len);
3983 mutex_unlock(&root->fs_info->volume_mutex);
3984
3985 return ret ? -EFAULT : 0;
3986}
3987
3988static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
3989{
3990 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3991 struct btrfs_super_block *super_block = root->fs_info->super_copy;
3992 struct btrfs_trans_handle *trans;
3993 char label[BTRFS_LABEL_SIZE];
3994 int ret;
3995
3996 if (!capable(CAP_SYS_ADMIN))
3997 return -EPERM;
3998
3999 if (copy_from_user(label, arg, sizeof(label)))
4000 return -EFAULT;
4001
4002 if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
4003 pr_err("btrfs: unable to set label with more than %d bytes\n",
4004 BTRFS_LABEL_SIZE - 1);
4005 return -EINVAL;
4006 }
4007
4008 ret = mnt_want_write_file(file);
4009 if (ret)
4010 return ret;
4011
4012 mutex_lock(&root->fs_info->volume_mutex);
4013 trans = btrfs_start_transaction(root, 0);
4014 if (IS_ERR(trans)) {
4015 ret = PTR_ERR(trans);
4016 goto out_unlock;
4017 }
4018
4019 strcpy(super_block->label, label);
4020 ret = btrfs_end_transaction(trans, root);
4021
4022out_unlock:
4023 mutex_unlock(&root->fs_info->volume_mutex);
4024 mnt_drop_write_file(file);
4025 return ret;
4026}
4027
3917long btrfs_ioctl(struct file *file, unsigned int 4028long btrfs_ioctl(struct file *file, unsigned int
3918 cmd, unsigned long arg) 4029 cmd, unsigned long arg)
3919{ 4030{
@@ -4014,6 +4125,10 @@ long btrfs_ioctl(struct file *file, unsigned int
4014 return btrfs_ioctl_qgroup_limit(file, argp); 4125 return btrfs_ioctl_qgroup_limit(file, argp);
4015 case BTRFS_IOC_DEV_REPLACE: 4126 case BTRFS_IOC_DEV_REPLACE:
4016 return btrfs_ioctl_dev_replace(root, argp); 4127 return btrfs_ioctl_dev_replace(root, argp);
4128 case BTRFS_IOC_GET_FSLABEL:
4129 return btrfs_ioctl_get_fslabel(file, argp);
4130 case BTRFS_IOC_SET_FSLABEL:
4131 return btrfs_ioctl_set_fslabel(file, argp);
4017 } 4132 }
4018 4133
4019 return -ENOTTY; 4134 return -ENOTTY;
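
Editor's note: with the two handlers above wired into the dispatcher, the filesystem label can be read and set from userspace on any file descriptor inside the mount. A minimal sketch, assuming a kernel and <linux/btrfs.h> that already export BTRFS_IOC_GET_FSLABEL, BTRFS_IOC_SET_FSLABEL and BTRFS_LABEL_SIZE; error handling is trimmed:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/btrfs.h>

    int main(int argc, char **argv)
    {
        char label[BTRFS_LABEL_SIZE] = { 0 };
        int fd;

        if (argc < 2) {
            fprintf(stderr, "usage: %s <mountpoint> [newlabel]\n", argv[0]);
            return 1;
        }

        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
            perror("open");
            return 1;
        }

        if (argc == 2) {
            if (ioctl(fd, BTRFS_IOC_GET_FSLABEL, label) == 0)
                printf("label: %s\n", label);
        } else {
            strncpy(label, argv[2], BTRFS_LABEL_SIZE - 1);
            if (ioctl(fd, BTRFS_IOC_SET_FSLABEL, label) != 0)
                perror("BTRFS_IOC_SET_FSLABEL");
        }

        close(fd);
        return 0;
    }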
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
deleted file mode 100644
index dabca9cc8c2e..000000000000
--- a/fs/btrfs/ioctl.h
+++ /dev/null
@@ -1,502 +0,0 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __IOCTL_
20#define __IOCTL_
21#include <linux/ioctl.h>
22
23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255
25
26/* this should be 4k */
27#define BTRFS_PATH_NAME_MAX 4087
28struct btrfs_ioctl_vol_args {
29 __s64 fd;
30 char name[BTRFS_PATH_NAME_MAX + 1];
31};
32
33#define BTRFS_DEVICE_PATH_NAME_MAX 1024
34
35#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
36#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
37#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
38#define BTRFS_FSID_SIZE 16
39#define BTRFS_UUID_SIZE 16
40
41#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0)
42
43struct btrfs_qgroup_limit {
44 __u64 flags;
45 __u64 max_rfer;
46 __u64 max_excl;
47 __u64 rsv_rfer;
48 __u64 rsv_excl;
49};
50
51struct btrfs_qgroup_inherit {
52 __u64 flags;
53 __u64 num_qgroups;
54 __u64 num_ref_copies;
55 __u64 num_excl_copies;
56 struct btrfs_qgroup_limit lim;
57 __u64 qgroups[0];
58};
59
60struct btrfs_ioctl_qgroup_limit_args {
61 __u64 qgroupid;
62 struct btrfs_qgroup_limit lim;
63};
64
65#define BTRFS_SUBVOL_NAME_MAX 4039
66struct btrfs_ioctl_vol_args_v2 {
67 __s64 fd;
68 __u64 transid;
69 __u64 flags;
70 union {
71 struct {
72 __u64 size;
73 struct btrfs_qgroup_inherit __user *qgroup_inherit;
74 };
75 __u64 unused[4];
76 };
77 char name[BTRFS_SUBVOL_NAME_MAX + 1];
78};
79
80/*
81 * structure to report errors and progress to userspace, either as a
82 * result of a finished scrub, a canceled scrub or a progress inquiry
83 */
84struct btrfs_scrub_progress {
85 __u64 data_extents_scrubbed; /* # of data extents scrubbed */
86 __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */
87 __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */
88 __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */
89 __u64 read_errors; /* # of read errors encountered (EIO) */
90 __u64 csum_errors; /* # of failed csum checks */
91 __u64 verify_errors; /* # of occurences, where the metadata
92 * of a tree block did not match the
93 * expected values, like generation or
94 * logical */
95 __u64 no_csum; /* # of 4k data block for which no csum
96 * is present, probably the result of
97 * data written with nodatasum */
98 __u64 csum_discards; /* # of csum for which no data was found
99 * in the extent tree. */
100 __u64 super_errors; /* # of bad super blocks encountered */
101 __u64 malloc_errors; /* # of internal kmalloc errors. These
102 * will likely cause an incomplete
103 * scrub */
104 __u64 uncorrectable_errors; /* # of errors where either no intact
105 * copy was found or the writeback
106 * failed */
107 __u64 corrected_errors; /* # of errors corrected */
108 __u64 last_physical; /* last physical address scrubbed. In
109 * case a scrub was aborted, this can
110 * be used to restart the scrub */
111 __u64 unverified_errors; /* # of occurences where a read for a
112 * full (64k) bio failed, but the re-
113 * check succeeded for each 4k piece.
114 * Intermittent error. */
115};
116
117#define BTRFS_SCRUB_READONLY 1
118struct btrfs_ioctl_scrub_args {
119 __u64 devid; /* in */
120 __u64 start; /* in */
121 __u64 end; /* in */
122 __u64 flags; /* in */
123 struct btrfs_scrub_progress progress; /* out */
124 /* pad to 1k */
125 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
126};
127
128#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
129#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
130struct btrfs_ioctl_dev_replace_start_params {
131 __u64 srcdevid; /* in, if 0, use srcdev_name instead */
132 __u64 cont_reading_from_srcdev_mode; /* in, see #define
133 * above */
134 __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
135 __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
136};
137
138#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0
139#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1
140#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2
141#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3
142#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4
143struct btrfs_ioctl_dev_replace_status_params {
144 __u64 replace_state; /* out, see #define above */
145 __u64 progress_1000; /* out, 0 <= x <= 1000 */
146 __u64 time_started; /* out, seconds since 1-Jan-1970 */
147 __u64 time_stopped; /* out, seconds since 1-Jan-1970 */
148 __u64 num_write_errors; /* out */
149 __u64 num_uncorrectable_read_errors; /* out */
150};
151
152#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0
153#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1
154#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2
155#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0
156#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1
157#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2
158struct btrfs_ioctl_dev_replace_args {
159 __u64 cmd; /* in */
160 __u64 result; /* out */
161
162 union {
163 struct btrfs_ioctl_dev_replace_start_params start;
164 struct btrfs_ioctl_dev_replace_status_params status;
165 }; /* in/out */
166
167 __u64 spare[64];
168};
169
170struct btrfs_ioctl_dev_info_args {
171 __u64 devid; /* in/out */
172 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
173 __u64 bytes_used; /* out */
174 __u64 total_bytes; /* out */
175 __u64 unused[379]; /* pad to 4k */
176 __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */
177};
178
179struct btrfs_ioctl_fs_info_args {
180 __u64 max_id; /* out */
181 __u64 num_devices; /* out */
182 __u8 fsid[BTRFS_FSID_SIZE]; /* out */
183 __u64 reserved[124]; /* pad to 1k */
184};
185
186/* balance control ioctl modes */
187#define BTRFS_BALANCE_CTL_PAUSE 1
188#define BTRFS_BALANCE_CTL_CANCEL 2
189
190/*
191 * this is packed, because it should be exactly the same as its disk
192 * byte order counterpart (struct btrfs_disk_balance_args)
193 */
194struct btrfs_balance_args {
195 __u64 profiles;
196 __u64 usage;
197 __u64 devid;
198 __u64 pstart;
199 __u64 pend;
200 __u64 vstart;
201 __u64 vend;
202
203 __u64 target;
204
205 __u64 flags;
206
207 __u64 unused[8];
208} __attribute__ ((__packed__));
209
210/* report balance progress to userspace */
211struct btrfs_balance_progress {
212 __u64 expected; /* estimated # of chunks that will be
213 * relocated to fulfill the request */
214 __u64 considered; /* # of chunks we have considered so far */
215 __u64 completed; /* # of chunks relocated so far */
216};
217
218#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0)
219#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1)
220#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
221
222struct btrfs_ioctl_balance_args {
223 __u64 flags; /* in/out */
224 __u64 state; /* out */
225
226 struct btrfs_balance_args data; /* in/out */
227 struct btrfs_balance_args meta; /* in/out */
228 struct btrfs_balance_args sys; /* in/out */
229
230 struct btrfs_balance_progress stat; /* out */
231
232 __u64 unused[72]; /* pad to 1k */
233};
234
235#define BTRFS_INO_LOOKUP_PATH_MAX 4080
236struct btrfs_ioctl_ino_lookup_args {
237 __u64 treeid;
238 __u64 objectid;
239 char name[BTRFS_INO_LOOKUP_PATH_MAX];
240};
241
242struct btrfs_ioctl_search_key {
243 /* which root are we searching. 0 is the tree of tree roots */
244 __u64 tree_id;
245
246 /* keys returned will be >= min and <= max */
247 __u64 min_objectid;
248 __u64 max_objectid;
249
250 /* keys returned will be >= min and <= max */
251 __u64 min_offset;
252 __u64 max_offset;
253
254 /* max and min transids to search for */
255 __u64 min_transid;
256 __u64 max_transid;
257
258 /* keys returned will be >= min and <= max */
259 __u32 min_type;
260 __u32 max_type;
261
262 /*
263 * how many items did userland ask for, and how many are we
264 * returning
265 */
266 __u32 nr_items;
267
268 /* align to 64 bits */
269 __u32 unused;
270
271 /* some extra for later */
272 __u64 unused1;
273 __u64 unused2;
274 __u64 unused3;
275 __u64 unused4;
276};
277
278struct btrfs_ioctl_search_header {
279 __u64 transid;
280 __u64 objectid;
281 __u64 offset;
282 __u32 type;
283 __u32 len;
284};
285
286#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
287/*
288 * the buf is an array of search headers where
289 * each header is followed by the actual item
290 * the type field is expanded to 32 bits for alignment
291 */
292struct btrfs_ioctl_search_args {
293 struct btrfs_ioctl_search_key key;
294 char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
295};
296
297struct btrfs_ioctl_clone_range_args {
298 __s64 src_fd;
299 __u64 src_offset, src_length;
300 __u64 dest_offset;
301};
302
303/* flags for the defrag range ioctl */
304#define BTRFS_DEFRAG_RANGE_COMPRESS 1
305#define BTRFS_DEFRAG_RANGE_START_IO 2
306
307struct btrfs_ioctl_space_info {
308 __u64 flags;
309 __u64 total_bytes;
310 __u64 used_bytes;
311};
312
313struct btrfs_ioctl_space_args {
314 __u64 space_slots;
315 __u64 total_spaces;
316 struct btrfs_ioctl_space_info spaces[0];
317};
318
319struct btrfs_data_container {
320 __u32 bytes_left; /* out -- bytes not needed to deliver output */
321 __u32 bytes_missing; /* out -- additional bytes needed for result */
322 __u32 elem_cnt; /* out */
323 __u32 elem_missed; /* out */
324 __u64 val[0]; /* out */
325};
326
327struct btrfs_ioctl_ino_path_args {
328 __u64 inum; /* in */
329 __u64 size; /* in */
330 __u64 reserved[4];
331 /* struct btrfs_data_container *fspath; out */
332 __u64 fspath; /* out */
333};
334
335struct btrfs_ioctl_logical_ino_args {
336 __u64 logical; /* in */
337 __u64 size; /* in */
338 __u64 reserved[4];
339 /* struct btrfs_data_container *inodes; out */
340 __u64 inodes;
341};
342
343enum btrfs_dev_stat_values {
344 /* disk I/O failure stats */
345 BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */
346 BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */
347 BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */
348
349 /* stats for indirect indications for I/O failures */
350 BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or
351 * contents is illegal: this is an
352 * indication that the block was damaged
353 * during read or write, or written to
354 * wrong location or read from wrong
355 * location */
356 BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not
357 * been written */
358
359 BTRFS_DEV_STAT_VALUES_MAX
360};
361
362/* Reset statistics after reading; needs SYS_ADMIN capability */
363#define BTRFS_DEV_STATS_RESET (1ULL << 0)
364
365struct btrfs_ioctl_get_dev_stats {
366 __u64 devid; /* in */
367 __u64 nr_items; /* in/out */
368 __u64 flags; /* in/out */
369
370 /* out values: */
371 __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
372
373 __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
374};
375
376#define BTRFS_QUOTA_CTL_ENABLE 1
377#define BTRFS_QUOTA_CTL_DISABLE 2
378#define BTRFS_QUOTA_CTL_RESCAN 3
379struct btrfs_ioctl_quota_ctl_args {
380 __u64 cmd;
381 __u64 status;
382};
383
384struct btrfs_ioctl_qgroup_assign_args {
385 __u64 assign;
386 __u64 src;
387 __u64 dst;
388};
389
390struct btrfs_ioctl_qgroup_create_args {
391 __u64 create;
392 __u64 qgroupid;
393};
394struct btrfs_ioctl_timespec {
395 __u64 sec;
396 __u32 nsec;
397};
398
399struct btrfs_ioctl_received_subvol_args {
400 char uuid[BTRFS_UUID_SIZE]; /* in */
401 __u64 stransid; /* in */
402 __u64 rtransid; /* out */
403 struct btrfs_ioctl_timespec stime; /* in */
404 struct btrfs_ioctl_timespec rtime; /* out */
405 __u64 flags; /* in */
406 __u64 reserved[16]; /* in */
407};
408
409struct btrfs_ioctl_send_args {
410 __s64 send_fd; /* in */
411 __u64 clone_sources_count; /* in */
412 __u64 __user *clone_sources; /* in */
413 __u64 parent_root; /* in */
414 __u64 flags; /* in */
415 __u64 reserved[4]; /* in */
416};
417
418#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
419 struct btrfs_ioctl_vol_args)
420#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
421 struct btrfs_ioctl_vol_args)
422#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
423 struct btrfs_ioctl_vol_args)
424#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
425 struct btrfs_ioctl_vol_args)
426/* trans start and trans end are dangerous, and only for
427 * use by applications that know how to avoid the
428 * resulting deadlocks
429 */
430#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
431#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
432#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
433
434#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
435#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
436 struct btrfs_ioctl_vol_args)
437#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
438 struct btrfs_ioctl_vol_args)
439#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
440 struct btrfs_ioctl_vol_args)
441
442#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
443 struct btrfs_ioctl_clone_range_args)
444
445#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
446 struct btrfs_ioctl_vol_args)
447#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
448 struct btrfs_ioctl_vol_args)
449#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
450 struct btrfs_ioctl_defrag_range_args)
451#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
452 struct btrfs_ioctl_search_args)
453#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
454 struct btrfs_ioctl_ino_lookup_args)
455#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
456#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
457 struct btrfs_ioctl_space_args)
458#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
459#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
460#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
461 struct btrfs_ioctl_vol_args_v2)
462#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
463 struct btrfs_ioctl_vol_args_v2)
464#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
465#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
466#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
467 struct btrfs_ioctl_scrub_args)
468#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28)
469#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \
470 struct btrfs_ioctl_scrub_args)
471#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \
472 struct btrfs_ioctl_dev_info_args)
473#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
474 struct btrfs_ioctl_fs_info_args)
475#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
476 struct btrfs_ioctl_balance_args)
477#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
478#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
479 struct btrfs_ioctl_balance_args)
480#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
481 struct btrfs_ioctl_ino_path_args)
482#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
483 struct btrfs_ioctl_ino_path_args)
484#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \
485 struct btrfs_ioctl_received_subvol_args)
486#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args)
487#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \
488 struct btrfs_ioctl_vol_args)
489#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \
490 struct btrfs_ioctl_quota_ctl_args)
491#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \
492 struct btrfs_ioctl_qgroup_assign_args)
493#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \
494 struct btrfs_ioctl_qgroup_create_args)
495#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \
496 struct btrfs_ioctl_qgroup_limit_args)
497#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
498 struct btrfs_ioctl_get_dev_stats)
499#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
500 struct btrfs_ioctl_dev_replace_args)
501
502#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 2a1762c66041..e95df435d897 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -113,11 +113,10 @@ again:
113 read_unlock(&eb->lock); 113 read_unlock(&eb->lock);
114 return; 114 return;
115 } 115 }
116 read_unlock(&eb->lock);
117 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
118 read_lock(&eb->lock);
119 if (atomic_read(&eb->blocking_writers)) { 116 if (atomic_read(&eb->blocking_writers)) {
120 read_unlock(&eb->lock); 117 read_unlock(&eb->lock);
118 wait_event(eb->write_lock_wq,
119 atomic_read(&eb->blocking_writers) == 0);
121 goto again; 120 goto again;
122 } 121 }
123 atomic_inc(&eb->read_locks); 122 atomic_inc(&eb->read_locks);
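
Editor's note: the locking.c hunk avoids an unconditional drop/wait/retake on every read lock attempt; the reader now only releases the lock and waits when a blocking writer is actually present, then retries from the top. A condition-variable analogue of that check / wait / recheck loop (this is an illustration of the pattern, not the kernel's wait_event machinery):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  writers_gone = PTHREAD_COND_INITIALIZER;
    static int blocking_writers;   /* analogue of eb->blocking_writers */
    static int read_locks;

    /* take the lock, recheck the condition, and only wait when a blocking
     * writer is actually present; the wait drops the lock and the loop
     * rechecks after waking, just like the "goto again" above */
    static void tree_read_lock(void)
    {
        pthread_mutex_lock(&lock);
        while (blocking_writers)
            pthread_cond_wait(&writers_gone, &lock);
        read_locks++;
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        tree_read_lock();
        printf("read_locks=%d\n", read_locks);
        return 0;
    }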
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e5ed56729607..dc08d77b717e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -196,6 +196,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
196 entry->file_offset = file_offset; 196 entry->file_offset = file_offset;
197 entry->start = start; 197 entry->start = start;
198 entry->len = len; 198 entry->len = len;
199 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
200 !(type == BTRFS_ORDERED_NOCOW))
201 entry->csum_bytes_left = disk_len;
199 entry->disk_len = disk_len; 202 entry->disk_len = disk_len;
200 entry->bytes_left = len; 203 entry->bytes_left = len;
201 entry->inode = igrab(inode); 204 entry->inode = igrab(inode);
@@ -213,6 +216,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
213 INIT_LIST_HEAD(&entry->root_extent_list); 216 INIT_LIST_HEAD(&entry->root_extent_list);
214 INIT_LIST_HEAD(&entry->work_list); 217 INIT_LIST_HEAD(&entry->work_list);
215 init_completion(&entry->completion); 218 init_completion(&entry->completion);
219 INIT_LIST_HEAD(&entry->log_list);
216 220
217 trace_btrfs_ordered_extent_add(inode, entry); 221 trace_btrfs_ordered_extent_add(inode, entry);
218 222
@@ -270,6 +274,10 @@ void btrfs_add_ordered_sum(struct inode *inode,
270 tree = &BTRFS_I(inode)->ordered_tree; 274 tree = &BTRFS_I(inode)->ordered_tree;
271 spin_lock_irq(&tree->lock); 275 spin_lock_irq(&tree->lock);
272 list_add_tail(&sum->list, &entry->list); 276 list_add_tail(&sum->list, &entry->list);
277 WARN_ON(entry->csum_bytes_left < sum->len);
278 entry->csum_bytes_left -= sum->len;
279 if (entry->csum_bytes_left == 0)
280 wake_up(&entry->wait);
273 spin_unlock_irq(&tree->lock); 281 spin_unlock_irq(&tree->lock);
274} 282}
275 283
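
Editor's note: each ordered extent now tracks how many checksum bytes are still outstanding; btrfs_add_ordered_sum() subtracts as csums arrive and wakes waiters once the count hits zero, while NODATASUM and NOCOW extents never expect csums and so start at zero. A small countdown-and-wake analogue using a condition variable; names are illustrative:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
    static unsigned long csum_bytes_left = 8192;   /* set to disk_len at creation */

    /* analogue of btrfs_add_ordered_sum(): account a finished csum chunk and
     * wake waiters when nothing is outstanding */
    static void add_ordered_sum(unsigned long len)
    {
        pthread_mutex_lock(&lock);
        csum_bytes_left -= len;
        if (csum_bytes_left == 0)
            pthread_cond_broadcast(&done);
        pthread_mutex_unlock(&lock);
    }

    static void wait_for_csums(void)
    {
        pthread_mutex_lock(&lock);
        while (csum_bytes_left)
            pthread_cond_wait(&done, &lock);
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        add_ordered_sum(4096);
        add_ordered_sum(4096);
        wait_for_csums();
        printf("all checksums accounted for\n");
        return 0;
    }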
@@ -405,6 +413,66 @@ out:
405 return ret == 0; 413 return ret == 0;
406} 414}
407 415
416/* Needs to either be called under a log transaction or the log_mutex */
417void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
418{
419 struct btrfs_ordered_inode_tree *tree;
420 struct btrfs_ordered_extent *ordered;
421 struct rb_node *n;
422 int index = log->log_transid % 2;
423
424 tree = &BTRFS_I(inode)->ordered_tree;
425 spin_lock_irq(&tree->lock);
426 for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
427 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
428 spin_lock(&log->log_extents_lock[index]);
429 if (list_empty(&ordered->log_list)) {
430 list_add_tail(&ordered->log_list, &log->logged_list[index]);
431 atomic_inc(&ordered->refs);
432 }
433 spin_unlock(&log->log_extents_lock[index]);
434 }
435 spin_unlock_irq(&tree->lock);
436}
437
438void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
439{
440 struct btrfs_ordered_extent *ordered;
441 int index = transid % 2;
442
443 spin_lock_irq(&log->log_extents_lock[index]);
444 while (!list_empty(&log->logged_list[index])) {
445 ordered = list_first_entry(&log->logged_list[index],
446 struct btrfs_ordered_extent,
447 log_list);
448 list_del_init(&ordered->log_list);
449 spin_unlock_irq(&log->log_extents_lock[index]);
450 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
451 &ordered->flags));
452 btrfs_put_ordered_extent(ordered);
453 spin_lock_irq(&log->log_extents_lock[index]);
454 }
455 spin_unlock_irq(&log->log_extents_lock[index]);
456}
457
458void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
459{
460 struct btrfs_ordered_extent *ordered;
461 int index = transid % 2;
462
463 spin_lock_irq(&log->log_extents_lock[index]);
464 while (!list_empty(&log->logged_list[index])) {
465 ordered = list_first_entry(&log->logged_list[index],
466 struct btrfs_ordered_extent,
467 log_list);
468 list_del_init(&ordered->log_list);
469 spin_unlock_irq(&log->log_extents_lock[index]);
470 btrfs_put_ordered_extent(ordered);
471 spin_lock_irq(&log->log_extents_lock[index]);
472 }
473 spin_unlock_irq(&log->log_extents_lock[index]);
474}
475
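
Editor's note: the tree-log now keeps two lists of logged ordered extents, selected by log_transid % 2, so extents belonging to the log transaction being committed never mix with those queued for the next one; one index is drained while the other keeps filling. A toy sketch of that double-buffered indexing, with plain arrays standing in for the kernel list heads:

    #include <stdio.h>

    #define MAX_LOGGED 16

    /* two independent buckets, selected by transid parity, so work queued
     * for transaction N never has to wait on work queued for N+1 */
    static int logged[2][MAX_LOGGED];
    static int nr_logged[2];

    static void log_extent(unsigned long long transid, int extent)
    {
        int index = transid % 2;
        logged[index][nr_logged[index]++] = extent;
    }

    static void drain_logged(unsigned long long transid)
    {
        int index = transid % 2;
        int i;

        for (i = 0; i < nr_logged[index]; i++)
            printf("transid %llu: finish extent %d\n", transid,
                   logged[index][i]);
        nr_logged[index] = 0;
    }

    int main(void)
    {
        log_extent(10, 100);
        log_extent(11, 200);   /* lands in the other bucket */
        drain_logged(10);      /* does not touch extent 200 */
        return 0;
    }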
408/* 476/*
409 * used to drop a reference on an ordered extent. This will free 477 * used to drop a reference on an ordered extent. This will free
410 * the extent if the last reference is dropped 478 * the extent if the last reference is dropped
@@ -544,10 +612,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
544 * extra check to make sure the ordered operation list really is empty 612 * extra check to make sure the ordered operation list really is empty
545 * before we return 613 * before we return
546 */ 614 */
547int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) 615int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
616 struct btrfs_root *root, int wait)
548{ 617{
549 struct btrfs_inode *btrfs_inode; 618 struct btrfs_inode *btrfs_inode;
550 struct inode *inode; 619 struct inode *inode;
620 struct btrfs_transaction *cur_trans = trans->transaction;
551 struct list_head splice; 621 struct list_head splice;
552 struct list_head works; 622 struct list_head works;
553 struct btrfs_delalloc_work *work, *next; 623 struct btrfs_delalloc_work *work, *next;
@@ -558,14 +628,10 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
558 628
559 mutex_lock(&root->fs_info->ordered_operations_mutex); 629 mutex_lock(&root->fs_info->ordered_operations_mutex);
560 spin_lock(&root->fs_info->ordered_extent_lock); 630 spin_lock(&root->fs_info->ordered_extent_lock);
561again: 631 list_splice_init(&cur_trans->ordered_operations, &splice);
562 list_splice_init(&root->fs_info->ordered_operations, &splice);
563
564 while (!list_empty(&splice)) { 632 while (!list_empty(&splice)) {
565
566 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 633 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
567 ordered_operations); 634 ordered_operations);
568
569 inode = &btrfs_inode->vfs_inode; 635 inode = &btrfs_inode->vfs_inode;
570 636
571 list_del_init(&btrfs_inode->ordered_operations); 637 list_del_init(&btrfs_inode->ordered_operations);
@@ -574,24 +640,22 @@ again:
574 * the inode may be getting freed (in sys_unlink path). 640 * the inode may be getting freed (in sys_unlink path).
575 */ 641 */
576 inode = igrab(inode); 642 inode = igrab(inode);
577
578 if (!wait && inode) {
579 list_add_tail(&BTRFS_I(inode)->ordered_operations,
580 &root->fs_info->ordered_operations);
581 }
582
583 if (!inode) 643 if (!inode)
584 continue; 644 continue;
645
646 if (!wait)
647 list_add_tail(&BTRFS_I(inode)->ordered_operations,
648 &cur_trans->ordered_operations);
585 spin_unlock(&root->fs_info->ordered_extent_lock); 649 spin_unlock(&root->fs_info->ordered_extent_lock);
586 650
587 work = btrfs_alloc_delalloc_work(inode, wait, 1); 651 work = btrfs_alloc_delalloc_work(inode, wait, 1);
588 if (!work) { 652 if (!work) {
653 spin_lock(&root->fs_info->ordered_extent_lock);
589 if (list_empty(&BTRFS_I(inode)->ordered_operations)) 654 if (list_empty(&BTRFS_I(inode)->ordered_operations))
590 list_add_tail(&btrfs_inode->ordered_operations, 655 list_add_tail(&btrfs_inode->ordered_operations,
591 &splice); 656 &splice);
592 spin_lock(&root->fs_info->ordered_extent_lock);
593 list_splice_tail(&splice, 657 list_splice_tail(&splice,
594 &root->fs_info->ordered_operations); 658 &cur_trans->ordered_operations);
595 spin_unlock(&root->fs_info->ordered_extent_lock); 659 spin_unlock(&root->fs_info->ordered_extent_lock);
596 ret = -ENOMEM; 660 ret = -ENOMEM;
597 goto out; 661 goto out;
@@ -603,9 +667,6 @@ again:
603 cond_resched(); 667 cond_resched();
604 spin_lock(&root->fs_info->ordered_extent_lock); 668 spin_lock(&root->fs_info->ordered_extent_lock);
605 } 669 }
606 if (wait && !list_empty(&root->fs_info->ordered_operations))
607 goto again;
608
609 spin_unlock(&root->fs_info->ordered_extent_lock); 670 spin_unlock(&root->fs_info->ordered_extent_lock);
610out: 671out:
611 list_for_each_entry_safe(work, next, &works, list) { 672 list_for_each_entry_safe(work, next, &works, list) {
@@ -974,6 +1035,7 @@ out:
974void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 1035void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
975 struct btrfs_root *root, struct inode *inode) 1036 struct btrfs_root *root, struct inode *inode)
976{ 1037{
1038 struct btrfs_transaction *cur_trans = trans->transaction;
977 u64 last_mod; 1039 u64 last_mod;
978 1040
979 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); 1041 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
@@ -988,7 +1050,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
988 spin_lock(&root->fs_info->ordered_extent_lock); 1050 spin_lock(&root->fs_info->ordered_extent_lock);
989 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 1051 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
990 list_add_tail(&BTRFS_I(inode)->ordered_operations, 1052 list_add_tail(&BTRFS_I(inode)->ordered_operations,
991 &root->fs_info->ordered_operations); 1053 &cur_trans->ordered_operations);
992 } 1054 }
993 spin_unlock(&root->fs_info->ordered_extent_lock); 1055 spin_unlock(&root->fs_info->ordered_extent_lock);
994} 1056}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f29d4bf5fbe7..8eadfe406cdd 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -79,6 +79,8 @@ struct btrfs_ordered_sum {
79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent 79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
80 * has done its due diligence in updating 80 * has done its due diligence in updating
81 * the isize. */ 81 * the isize. */
82#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered
 83 extent */
82 84
83struct btrfs_ordered_extent { 85struct btrfs_ordered_extent {
84 /* logical offset in the file */ 86 /* logical offset in the file */
@@ -96,6 +98,9 @@ struct btrfs_ordered_extent {
96 /* number of bytes that still need writing */ 98 /* number of bytes that still need writing */
97 u64 bytes_left; 99 u64 bytes_left;
98 100
101 /* number of bytes that still need csumming */
102 u64 csum_bytes_left;
103
99 /* 104 /*
100 * the end of the ordered extent which is behind it but 105 * the end of the ordered extent which is behind it but
101 * didn't update disk_i_size. Please see the comment of 106 * didn't update disk_i_size. Please see the comment of
@@ -118,6 +123,9 @@ struct btrfs_ordered_extent {
118 /* list of checksums for insertion when the extent io is done */ 123 /* list of checksums for insertion when the extent io is done */
119 struct list_head list; 124 struct list_head list;
120 125
126 /* If we need to wait on this to be done */
127 struct list_head log_list;
128
121 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ 129 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
122 wait_queue_head_t wait; 130 wait_queue_head_t wait;
123 131
@@ -189,11 +197,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
189int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 197int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
190 struct btrfs_ordered_extent *ordered); 198 struct btrfs_ordered_extent *ordered);
191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 199int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
192int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 200int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
201 struct btrfs_root *root, int wait);
193void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 202void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
194 struct btrfs_root *root, 203 struct btrfs_root *root,
195 struct inode *inode); 204 struct inode *inode);
196void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); 205void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
206void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
207void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
208void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
197int __init ordered_data_init(void); 209int __init ordered_data_init(void);
198void ordered_data_exit(void); 210void ordered_data_exit(void);
199#endif 211#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 50d95fd190a5..920957ecb27e 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -294,6 +294,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
294 btrfs_dev_extent_chunk_offset(l, dev_extent), 294 btrfs_dev_extent_chunk_offset(l, dev_extent),
295 (unsigned long long) 295 (unsigned long long)
296 btrfs_dev_extent_length(l, dev_extent)); 296 btrfs_dev_extent_length(l, dev_extent));
297 break;
297 case BTRFS_DEV_STATS_KEY: 298 case BTRFS_DEV_STATS_KEY:
298 printk(KERN_INFO "\t\tdevice stats\n"); 299 printk(KERN_INFO "\t\tdevice stats\n");
299 break; 300 break;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a5c856234323..aee4b1cc3d98 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -23,13 +23,13 @@
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/workqueue.h> 25#include <linux/workqueue.h>
26#include <linux/btrfs.h>
26 27
27#include "ctree.h" 28#include "ctree.h"
28#include "transaction.h" 29#include "transaction.h"
29#include "disk-io.h" 30#include "disk-io.h"
30#include "locking.h" 31#include "locking.h"
31#include "ulist.h" 32#include "ulist.h"
32#include "ioctl.h"
33#include "backref.h" 33#include "backref.h"
34 34
35/* TODO XXX FIXME 35/* TODO XXX FIXME
@@ -620,7 +620,9 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
620 key.offset = qgroupid; 620 key.offset = qgroupid;
621 621
622 path = btrfs_alloc_path(); 622 path = btrfs_alloc_path();
623 BUG_ON(!path); 623 if (!path)
624 return -ENOMEM;
625
624 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 626 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
625 if (ret > 0) 627 if (ret > 0)
626 ret = -ENOENT; 628 ret = -ENOENT;
@@ -661,7 +663,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
661 key.offset = qgroup->qgroupid; 663 key.offset = qgroup->qgroupid;
662 664
663 path = btrfs_alloc_path(); 665 path = btrfs_alloc_path();
664 BUG_ON(!path); 666 if (!path)
667 return -ENOMEM;
668
665 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 669 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
666 if (ret > 0) 670 if (ret > 0)
667 ret = -ENOENT; 671 ret = -ENOENT;
@@ -702,7 +706,9 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
702 key.offset = 0; 706 key.offset = 0;
703 707
704 path = btrfs_alloc_path(); 708 path = btrfs_alloc_path();
705 BUG_ON(!path); 709 if (!path)
710 return -ENOMEM;
711
706 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 712 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
707 if (ret > 0) 713 if (ret > 0)
708 ret = -ENOENT; 714 ret = -ENOENT;
@@ -732,33 +738,38 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
732{ 738{
733 struct btrfs_path *path; 739 struct btrfs_path *path;
734 struct btrfs_key key; 740 struct btrfs_key key;
741 struct extent_buffer *leaf = NULL;
735 int ret; 742 int ret;
736 743 int nr = 0;
737 if (!root)
738 return -EINVAL;
739 744
740 path = btrfs_alloc_path(); 745 path = btrfs_alloc_path();
741 if (!path) 746 if (!path)
742 return -ENOMEM; 747 return -ENOMEM;
743 748
744 while (1) { 749 path->leave_spinning = 1;
745 key.objectid = 0;
746 key.offset = 0;
747 key.type = 0;
748 750
749 path->leave_spinning = 1; 751 key.objectid = 0;
752 key.offset = 0;
753 key.type = 0;
754
755 while (1) {
750 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 756 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
751 if (ret > 0) { 757 if (ret < 0)
752 if (path->slots[0] == 0) 758 goto out;
753 break; 759 leaf = path->nodes[0];
754 path->slots[0]--; 760 nr = btrfs_header_nritems(leaf);
755 } else if (ret < 0) { 761 if (!nr)
756 break; 762 break;
757 } 763 /*
758 764 * delete the leaf one by one
759 ret = btrfs_del_item(trans, root, path); 765 * since the whole tree is going
766 * to be deleted.
767 */
768 path->slots[0] = 0;
769 ret = btrfs_del_items(trans, root, path, 0, nr);
760 if (ret) 770 if (ret)
761 goto out; 771 goto out;
772
762 btrfs_release_path(path); 773 btrfs_release_path(path);
763 } 774 }
764 ret = 0; 775 ret = 0;
@@ -847,6 +858,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
847 int ret = 0; 858 int ret = 0;
848 859
849 spin_lock(&fs_info->qgroup_lock); 860 spin_lock(&fs_info->qgroup_lock);
861 if (!fs_info->quota_root) {
862 spin_unlock(&fs_info->qgroup_lock);
863 return 0;
864 }
850 fs_info->quota_enabled = 0; 865 fs_info->quota_enabled = 0;
851 fs_info->pending_quota_state = 0; 866 fs_info->pending_quota_state = 0;
852 quota_root = fs_info->quota_root; 867 quota_root = fs_info->quota_root;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
new file mode 100644
index 000000000000..07222053c7d8
--- /dev/null
+++ b/fs/btrfs/raid56.c
@@ -0,0 +1,2099 @@
1/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19#include <linux/sched.h>
20#include <linux/wait.h>
21#include <linux/bio.h>
22#include <linux/slab.h>
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/random.h>
26#include <linux/iocontext.h>
27#include <linux/capability.h>
28#include <linux/ratelimit.h>
29#include <linux/kthread.h>
30#include <linux/raid/pq.h>
31#include <linux/hash.h>
32#include <linux/list_sort.h>
33#include <linux/raid/xor.h>
34#include <asm/div64.h>
35#include "compat.h"
36#include "ctree.h"
37#include "extent_map.h"
38#include "disk-io.h"
39#include "transaction.h"
40#include "print-tree.h"
41#include "volumes.h"
42#include "raid56.h"
43#include "async-thread.h"
44#include "check-integrity.h"
45#include "rcu-string.h"
46
47/* set when additional merges to this rbio are not allowed */
48#define RBIO_RMW_LOCKED_BIT 1
49
50/*
51 * set when this rbio is sitting in the hash, but it is just a cache
52 * of past RMW
53 */
54#define RBIO_CACHE_BIT 2
55
56/*
57 * set when it is safe to trust the stripe_pages for caching
58 */
59#define RBIO_CACHE_READY_BIT 3
60
61
62#define RBIO_CACHE_SIZE 1024
63
64struct btrfs_raid_bio {
65 struct btrfs_fs_info *fs_info;
66 struct btrfs_bio *bbio;
67
68 /*
69 * logical block numbers for the start of each stripe
70 * The last one or two are p/q. These are sorted,
71 * so raid_map[0] is the start of our full stripe
72 */
73 u64 *raid_map;
74
75 /* while we're doing rmw on a stripe
76 * we put it into a hash table so we can
77 * lock the stripe and merge more rbios
78 * into it.
79 */
80 struct list_head hash_list;
81
82 /*
83 * LRU list for the stripe cache
84 */
85 struct list_head stripe_cache;
86
87 /*
88 * for scheduling work in the helper threads
89 */
90 struct btrfs_work work;
91
92 /*
93 * bio list and bio_list_lock are used
94 * to add more bios into the stripe
95 * in hopes of avoiding the full rmw
96 */
97 struct bio_list bio_list;
98 spinlock_t bio_list_lock;
99
100 /* also protected by the bio_list_lock, the
101 * plug list is used by the plugging code
102 * to collect partial bios while plugged. The
103 * stripe locking code also uses it to hand off
104 * the stripe lock to the next pending IO
105 */
106 struct list_head plug_list;
107
108 /*
109 * flags that tell us if it is safe to
110 * merge with this bio
111 */
112 unsigned long flags;
113
114 /* size of each individual stripe on disk */
115 int stripe_len;
116
117 /* number of data stripes (no p/q) */
118 int nr_data;
119
120 /*
121 * set if we're doing a parity rebuild
122 * for a read from higher up, which is handled
123 * differently from a parity rebuild as part of
124 * rmw
125 */
126 int read_rebuild;
127
128 /* first bad stripe */
129 int faila;
130
131 /* second bad stripe (for raid6 use) */
132 int failb;
133
134 /*
135 * number of pages needed to represent the full
136 * stripe
137 */
138 int nr_pages;
139
140 /*
141 * size of all the bios in the bio_list. This
142 * helps us decide if the rbio maps to a full
143 * stripe or not
144 */
145 int bio_list_bytes;
146
147 atomic_t refs;
148
149 /*
150 * these are two arrays of pointers. We allocate the
151 * rbio big enough to hold them both and setup their
152 * locations when the rbio is allocated
153 */
154
155 /* pointers to pages that we allocated for
156 * reading/writing stripes directly from the disk (including P/Q)
157 */
158 struct page **stripe_pages;
159
160 /*
161 * pointers to the pages in the bio_list. Stored
162 * here for faster lookup
163 */
164 struct page **bio_pages;
165};
166
167static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
168static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
169static void rmw_work(struct btrfs_work *work);
170static void read_rebuild_work(struct btrfs_work *work);
171static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
172static void async_read_rebuild(struct btrfs_raid_bio *rbio);
173static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
174static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
175static void __free_raid_bio(struct btrfs_raid_bio *rbio);
176static void index_rbio_pages(struct btrfs_raid_bio *rbio);
177static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
178
179/*
180 * the stripe hash table is used for locking, and to collect
181 * bios in hopes of making a full stripe
182 */
183int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
184{
185 struct btrfs_stripe_hash_table *table;
186 struct btrfs_stripe_hash_table *x;
187 struct btrfs_stripe_hash *cur;
188 struct btrfs_stripe_hash *h;
189 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
190 int i;
191 int table_size;
192
193 if (info->stripe_hash_table)
194 return 0;
195
196 /*
197 * The table is large, starting with order 4 and can go as high as
198 * order 7 in case lock debugging is turned on.
199 *
200 * Try harder to allocate and fallback to vmalloc to lower the chance
201 * of a failing mount.
202 */
203 table_size = sizeof(*table) + sizeof(*h) * num_entries;
204 table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
205 if (!table) {
206 table = vzalloc(table_size);
207 if (!table)
208 return -ENOMEM;
209 }
210
211 spin_lock_init(&table->cache_lock);
212 INIT_LIST_HEAD(&table->stripe_cache);
213
214 h = table->table;
215
216 for (i = 0; i < num_entries; i++) {
217 cur = h + i;
218 INIT_LIST_HEAD(&cur->hash_list);
219 spin_lock_init(&cur->lock);
220 init_waitqueue_head(&cur->wait);
221 }
222
223 x = cmpxchg(&info->stripe_hash_table, NULL, table);
224 if (x) {
225 if (is_vmalloc_addr(x))
226 vfree(x);
227 else
228 kfree(x);
229 }
230 return 0;
231}
232
233/*
234 * caching an rbio means to copy anything from the
235 * bio_pages array into the stripe_pages array. We
236 * use the page uptodate bit in the stripe cache array
237 * to indicate if it has valid data
238 *
239 * once the caching is done, we set the cache ready
240 * bit.
241 */
242static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
243{
244 int i;
245 char *s;
246 char *d;
247 int ret;
248
249 ret = alloc_rbio_pages(rbio);
250 if (ret)
251 return;
252
253 for (i = 0; i < rbio->nr_pages; i++) {
254 if (!rbio->bio_pages[i])
255 continue;
256
257 s = kmap(rbio->bio_pages[i]);
258 d = kmap(rbio->stripe_pages[i]);
259
260 memcpy(d, s, PAGE_CACHE_SIZE);
261
262 kunmap(rbio->bio_pages[i]);
263 kunmap(rbio->stripe_pages[i]);
264 SetPageUptodate(rbio->stripe_pages[i]);
265 }
266 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
267}
268
269/*
270 * we hash on the first logical address of the stripe
271 */
272static int rbio_bucket(struct btrfs_raid_bio *rbio)
273{
274 u64 num = rbio->raid_map[0];
275
276 /*
277 * we shift down quite a bit. We're using byte
278 * addressing, and most of the lower bits are zeros.
279 * This tends to upset hash_64, and it consistently
280 * returns just one or two different values.
281 *
282 * shifting off the lower bits fixes things.
283 */
284 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
285}
286
287/*
288 * stealing an rbio means taking all the uptodate pages from the stripe
289 * array in the source rbio and putting them into the destination rbio
290 */
291static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
292{
293 int i;
294 struct page *s;
295 struct page *d;
296
297 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
298 return;
299
300 for (i = 0; i < dest->nr_pages; i++) {
301 s = src->stripe_pages[i];
302 if (!s || !PageUptodate(s)) {
303 continue;
304 }
305
306 d = dest->stripe_pages[i];
307 if (d)
308 __free_page(d);
309
310 dest->stripe_pages[i] = s;
311 src->stripe_pages[i] = NULL;
312 }
313}
314
315/*
316 * merging means we take the bio_list from the victim and
317 * splice it into the destination. The victim should
318 * be discarded afterwards.
319 *
320 * must be called with dest->rbio_list_lock held
321 */
322static void merge_rbio(struct btrfs_raid_bio *dest,
323 struct btrfs_raid_bio *victim)
324{
325 bio_list_merge(&dest->bio_list, &victim->bio_list);
326 dest->bio_list_bytes += victim->bio_list_bytes;
327 bio_list_init(&victim->bio_list);
328}
329
330/*
331 * used to prune items that are in the cache. The caller
332 * must hold the hash table lock.
333 */
334static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
335{
336 int bucket = rbio_bucket(rbio);
337 struct btrfs_stripe_hash_table *table;
338 struct btrfs_stripe_hash *h;
339 int freeit = 0;
340
341 /*
342 * check the bit again under the hash table lock.
343 */
344 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
345 return;
346
347 table = rbio->fs_info->stripe_hash_table;
348 h = table->table + bucket;
349
350 /* hold the lock for the bucket because we may be
351 * removing it from the hash table
352 */
353 spin_lock(&h->lock);
354
355 /*
356 * hold the lock for the bio list because we need
357 * to make sure the bio list is empty
358 */
359 spin_lock(&rbio->bio_list_lock);
360
361 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
362 list_del_init(&rbio->stripe_cache);
363 table->cache_size -= 1;
364 freeit = 1;
365
366 /* if the bio list isn't empty, this rbio is
367 * still involved in an IO. We take it out
368 * of the cache list, and drop the ref that
369 * was held for the list.
370 *
371 * If the bio_list was empty, we also remove
372 * the rbio from the hash_table, and drop
373 * the corresponding ref
374 */
375 if (bio_list_empty(&rbio->bio_list)) {
376 if (!list_empty(&rbio->hash_list)) {
377 list_del_init(&rbio->hash_list);
378 atomic_dec(&rbio->refs);
379 BUG_ON(!list_empty(&rbio->plug_list));
380 }
381 }
382 }
383
384 spin_unlock(&rbio->bio_list_lock);
385 spin_unlock(&h->lock);
386
387 if (freeit)
388 __free_raid_bio(rbio);
389}
390
391/*
392 * prune a given rbio from the cache
393 */
394static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
395{
396 struct btrfs_stripe_hash_table *table;
397 unsigned long flags;
398
399 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
400 return;
401
402 table = rbio->fs_info->stripe_hash_table;
403
404 spin_lock_irqsave(&table->cache_lock, flags);
405 __remove_rbio_from_cache(rbio);
406 spin_unlock_irqrestore(&table->cache_lock, flags);
407}
408
409/*
410 * remove everything in the cache
411 */
412void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
413{
414 struct btrfs_stripe_hash_table *table;
415 unsigned long flags;
416 struct btrfs_raid_bio *rbio;
417
418 table = info->stripe_hash_table;
419
420 spin_lock_irqsave(&table->cache_lock, flags);
421 while (!list_empty(&table->stripe_cache)) {
422 rbio = list_entry(table->stripe_cache.next,
423 struct btrfs_raid_bio,
424 stripe_cache);
425 __remove_rbio_from_cache(rbio);
426 }
427 spin_unlock_irqrestore(&table->cache_lock, flags);
428}
429
430/*
431 * remove all cached entries and free the hash table
432 * used by unmount
433 */
434void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
435{
436 if (!info->stripe_hash_table)
437 return;
438 btrfs_clear_rbio_cache(info);
439 if (is_vmalloc_addr(info->stripe_hash_table))
440 vfree(info->stripe_hash_table);
441 else
442 kfree(info->stripe_hash_table);
443 info->stripe_hash_table = NULL;
444}
445
446/*
447 * insert an rbio into the stripe cache. It
448 * must have already been prepared by calling
449 * cache_rbio_pages
450 *
451 * If this rbio was already cached, it gets
452 * moved to the front of the lru.
453 *
454 * If the size of the rbio cache is too big, we
455 * prune an item.
456 */
457static void cache_rbio(struct btrfs_raid_bio *rbio)
458{
459 struct btrfs_stripe_hash_table *table;
460 unsigned long flags;
461
462 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
463 return;
464
465 table = rbio->fs_info->stripe_hash_table;
466
467 spin_lock_irqsave(&table->cache_lock, flags);
468 spin_lock(&rbio->bio_list_lock);
469
470 /* bump our ref if we were not in the list before */
471 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
472 atomic_inc(&rbio->refs);
473
474 if (!list_empty(&rbio->stripe_cache)){
475 list_move(&rbio->stripe_cache, &table->stripe_cache);
476 } else {
477 list_add(&rbio->stripe_cache, &table->stripe_cache);
478 table->cache_size += 1;
479 }
480
481 spin_unlock(&rbio->bio_list_lock);
482
483 if (table->cache_size > RBIO_CACHE_SIZE) {
484 struct btrfs_raid_bio *found;
485
486 found = list_entry(table->stripe_cache.prev,
487 struct btrfs_raid_bio,
488 stripe_cache);
489
490 if (found != rbio)
491 __remove_rbio_from_cache(found);
492 }
493
494 spin_unlock_irqrestore(&table->cache_lock, flags);
495 return;
496}
497
498/*
499 * helper function to run the xor_blocks api. It is only
500 * able to do MAX_XOR_BLOCKS at a time, so we need to
501 * loop through.
502 */
503static void run_xor(void **pages, int src_cnt, ssize_t len)
504{
505 int src_off = 0;
506 int xor_src_cnt = 0;
507 void *dest = pages[src_cnt];
508
509 while(src_cnt > 0) {
510 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
511 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
512
513 src_cnt -= xor_src_cnt;
514 src_off += xor_src_cnt;
515 }
516}
517
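A sketch only, mirroring the way finish_rmw() uses run_xor() further down in this file: the destination buffer sits one past the last source, so RAID5 parity for one page position is produced by seeding the parity page with the first data page and xoring the rest in. The helper name example_raid5_parity_page is hypothetical.

static void example_raid5_parity_page(void **pointers, int nr_data)
{
	/* pointers[0..nr_data-1]: kmapped data pages, pointers[nr_data]: parity */
	memcpy(pointers[nr_data], pointers[0], PAGE_CACHE_SIZE);

	/* xor data pages 1..nr_data-1 into the parity page */
	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
}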
518/*
519 * returns true if the bio list inside this rbio
520 * covers an entire stripe (no rmw required).
521 * Must be called with the bio list lock held, or
522 * at a time when you know it is impossible to add
523 * new bios into the list
524 */
525static int __rbio_is_full(struct btrfs_raid_bio *rbio)
526{
527 unsigned long size = rbio->bio_list_bytes;
528 int ret = 1;
529
530 if (size != rbio->nr_data * rbio->stripe_len)
531 ret = 0;
532
533 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
534 return ret;
535}
536
537static int rbio_is_full(struct btrfs_raid_bio *rbio)
538{
539 unsigned long flags;
540 int ret;
541
542 spin_lock_irqsave(&rbio->bio_list_lock, flags);
543 ret = __rbio_is_full(rbio);
544 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
545 return ret;
546}
547
548/*
549 * returns 1 if it is safe to merge two rbios together.
550 * The merging is safe if the two rbios correspond to
551 * the same stripe and if they are both going in the same
552 * direction (read vs write), and if neither one is
553 * locked for final IO
554 *
555 * The caller is responsible for locking such that
556 * rmw_locked is safe to test
557 */
558static int rbio_can_merge(struct btrfs_raid_bio *last,
559 struct btrfs_raid_bio *cur)
560{
561 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
562 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
563 return 0;
564
565 /*
566 * we can't merge with cached rbios, since the
567 * idea is that when we merge the destination
568 * rbio is going to run our IO for us. We can
 569 * steal from cached rbios, though; other functions
570 * handle that.
571 */
572 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
573 test_bit(RBIO_CACHE_BIT, &cur->flags))
574 return 0;
575
576 if (last->raid_map[0] !=
577 cur->raid_map[0])
578 return 0;
579
580 /* reads can't merge with writes */
581 if (last->read_rebuild !=
582 cur->read_rebuild) {
583 return 0;
584 }
585
586 return 1;
587}
588
589/*
590 * helper to index into the pstripe
591 */
592static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
593{
594 index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
595 return rbio->stripe_pages[index];
596}
597
598/*
599 * helper to index into the qstripe, returns null
600 * if there is no qstripe
601 */
602static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
603{
604 if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
605 return NULL;
606
607 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
608 PAGE_CACHE_SHIFT;
609 return rbio->stripe_pages[index];
610}
611
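A worked example of the index arithmetic, using illustrative values (the stripe size is not taken from this hunk):

/*
 * Example: stripe_len = 64KiB and PAGE_CACHE_SIZE = 4KiB give 16 pages
 * per stripe.  On a RAID6 rbio with nr_data == 2:
 *
 *   rbio_pstripe_page(rbio, 3) -> stripe_pages[2 * 16 + 3] = stripe_pages[35]
 *   rbio_qstripe_page(rbio, 3) -> stripe_pages[3 * 16 + 3] = stripe_pages[51]
 *
 * On RAID5 (nr_data + 1 == num_stripes) there is no q stripe and
 * rbio_qstripe_page() returns NULL.
 */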
612/*
613 * The first stripe in the table for a logical address
614 * has the lock. rbios are added in one of three ways:
615 *
616 * 1) Nobody has the stripe locked yet. The rbio is given
617 * the lock and 0 is returned. The caller must start the IO
618 * themselves.
619 *
620 * 2) Someone has the stripe locked, but we're able to merge
621 * with the lock owner. The rbio is freed and the IO will
622 * start automatically along with the existing rbio. 1 is returned.
623 *
624 * 3) Someone has the stripe locked, but we're not able to merge.
625 * The rbio is added to the lock owner's plug list, or merged into
626 * an rbio already on the plug list. When the lock owner unlocks,
627 * the next rbio on the list is run and the IO is started automatically.
628 * 1 is returned
629 *
630 * If we return 0, the caller still owns the rbio and must continue with
631 * IO submission. If we return 1, the caller must assume the rbio has
632 * already been freed.
633 */
634static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
635{
636 int bucket = rbio_bucket(rbio);
637 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
638 struct btrfs_raid_bio *cur;
639 struct btrfs_raid_bio *pending;
640 unsigned long flags;
641 DEFINE_WAIT(wait);
642 struct btrfs_raid_bio *freeit = NULL;
643 struct btrfs_raid_bio *cache_drop = NULL;
644 int ret = 0;
645 int walk = 0;
646
647 spin_lock_irqsave(&h->lock, flags);
648 list_for_each_entry(cur, &h->hash_list, hash_list) {
649 walk++;
650 if (cur->raid_map[0] == rbio->raid_map[0]) {
651 spin_lock(&cur->bio_list_lock);
652
653 /* can we steal this cached rbio's pages? */
654 if (bio_list_empty(&cur->bio_list) &&
655 list_empty(&cur->plug_list) &&
656 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
657 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
658 list_del_init(&cur->hash_list);
659 atomic_dec(&cur->refs);
660
661 steal_rbio(cur, rbio);
662 cache_drop = cur;
663 spin_unlock(&cur->bio_list_lock);
664
665 goto lockit;
666 }
667
668 /* can we merge into the lock owner? */
669 if (rbio_can_merge(cur, rbio)) {
670 merge_rbio(cur, rbio);
671 spin_unlock(&cur->bio_list_lock);
672 freeit = rbio;
673 ret = 1;
674 goto out;
675 }
676
677
678 /*
679 * we couldn't merge with the running
680 * rbio, see if we can merge with the
681 * pending ones. We don't have to
682 * check for rmw_locked because there
683 * is no way they are inside finish_rmw
684 * right now
685 */
686 list_for_each_entry(pending, &cur->plug_list,
687 plug_list) {
688 if (rbio_can_merge(pending, rbio)) {
689 merge_rbio(pending, rbio);
690 spin_unlock(&cur->bio_list_lock);
691 freeit = rbio;
692 ret = 1;
693 goto out;
694 }
695 }
696
697 /* no merging, put us on the tail of the plug list,
 698 * our rbio will be started when the currently
699 * running rbio unlocks
700 */
701 list_add_tail(&rbio->plug_list, &cur->plug_list);
702 spin_unlock(&cur->bio_list_lock);
703 ret = 1;
704 goto out;
705 }
706 }
707lockit:
708 atomic_inc(&rbio->refs);
709 list_add(&rbio->hash_list, &h->hash_list);
710out:
711 spin_unlock_irqrestore(&h->lock, flags);
712 if (cache_drop)
713 remove_rbio_from_cache(cache_drop);
714 if (freeit)
715 __free_raid_bio(freeit);
716 return ret;
717}
718
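A minimal sketch of the caller contract spelled out in the comment above; the real callers are full_stripe_write() and partial_stripe_write() later in this file, and the name example_submit_rbio is hypothetical.

static int example_submit_rbio(struct btrfs_raid_bio *rbio)
{
	if (lock_stripe_add(rbio) == 0) {
		/* we own the stripe lock: kick off the rmw ourselves */
		async_rmw_stripe(rbio);
		return 0;
	}
	/* merged into or queued behind the lock owner; the rbio may
	 * already have been freed, so it must not be touched again
	 */
	return 0;
}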
719/*
720 * called as rmw or parity rebuild is completed. If the plug list has more
721 * rbios waiting for this stripe, the next one on the list will be started
722 */
723static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
724{
725 int bucket;
726 struct btrfs_stripe_hash *h;
727 unsigned long flags;
728 int keep_cache = 0;
729
730 bucket = rbio_bucket(rbio);
731 h = rbio->fs_info->stripe_hash_table->table + bucket;
732
733 if (list_empty(&rbio->plug_list))
734 cache_rbio(rbio);
735
736 spin_lock_irqsave(&h->lock, flags);
737 spin_lock(&rbio->bio_list_lock);
738
739 if (!list_empty(&rbio->hash_list)) {
740 /*
741 * if we're still cached and there is no other IO
742 * to perform, just leave this rbio here for others
743 * to steal from later
744 */
745 if (list_empty(&rbio->plug_list) &&
746 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
747 keep_cache = 1;
748 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
749 BUG_ON(!bio_list_empty(&rbio->bio_list));
750 goto done;
751 }
752
753 list_del_init(&rbio->hash_list);
754 atomic_dec(&rbio->refs);
755
756 /*
757 * we use the plug list to hold all the rbios
758 * waiting for the chance to lock this stripe.
759 * hand the lock over to one of them.
760 */
761 if (!list_empty(&rbio->plug_list)) {
762 struct btrfs_raid_bio *next;
763 struct list_head *head = rbio->plug_list.next;
764
765 next = list_entry(head, struct btrfs_raid_bio,
766 plug_list);
767
768 list_del_init(&rbio->plug_list);
769
770 list_add(&next->hash_list, &h->hash_list);
771 atomic_inc(&next->refs);
772 spin_unlock(&rbio->bio_list_lock);
773 spin_unlock_irqrestore(&h->lock, flags);
774
775 if (next->read_rebuild)
776 async_read_rebuild(next);
777 else {
778 steal_rbio(rbio, next);
779 async_rmw_stripe(next);
780 }
781
782 goto done_nolock;
783 } else if (waitqueue_active(&h->wait)) {
784 spin_unlock(&rbio->bio_list_lock);
785 spin_unlock_irqrestore(&h->lock, flags);
786 wake_up(&h->wait);
787 goto done_nolock;
788 }
789 }
790done:
791 spin_unlock(&rbio->bio_list_lock);
792 spin_unlock_irqrestore(&h->lock, flags);
793
794done_nolock:
795 if (!keep_cache)
796 remove_rbio_from_cache(rbio);
797}
798
799static void __free_raid_bio(struct btrfs_raid_bio *rbio)
800{
801 int i;
802
803 WARN_ON(atomic_read(&rbio->refs) < 0);
804 if (!atomic_dec_and_test(&rbio->refs))
805 return;
806
807 WARN_ON(!list_empty(&rbio->stripe_cache));
808 WARN_ON(!list_empty(&rbio->hash_list));
809 WARN_ON(!bio_list_empty(&rbio->bio_list));
810
811 for (i = 0; i < rbio->nr_pages; i++) {
812 if (rbio->stripe_pages[i]) {
813 __free_page(rbio->stripe_pages[i]);
814 rbio->stripe_pages[i] = NULL;
815 }
816 }
817 kfree(rbio->raid_map);
818 kfree(rbio->bbio);
819 kfree(rbio);
820}
821
822static void free_raid_bio(struct btrfs_raid_bio *rbio)
823{
824 unlock_stripe(rbio);
825 __free_raid_bio(rbio);
826}
827
828/*
829 * this frees the rbio and runs through all the bios in the
830 * bio_list and calls end_io on them
831 */
832static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
833{
834 struct bio *cur = bio_list_get(&rbio->bio_list);
835 struct bio *next;
836 free_raid_bio(rbio);
837
838 while (cur) {
839 next = cur->bi_next;
840 cur->bi_next = NULL;
841 if (uptodate)
842 set_bit(BIO_UPTODATE, &cur->bi_flags);
843 bio_endio(cur, err);
844 cur = next;
845 }
846}
847
848/*
849 * end io function used by finish_rmw. When we finally
850 * get here, we've written a full stripe
851 */
852static void raid_write_end_io(struct bio *bio, int err)
853{
854 struct btrfs_raid_bio *rbio = bio->bi_private;
855
856 if (err)
857 fail_bio_stripe(rbio, bio);
858
859 bio_put(bio);
860
861 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
862 return;
863
864 err = 0;
865
 866 /* OK, all the writes for this stripe are done. */
867 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
868 err = -EIO;
869
870 rbio_orig_end_io(rbio, err, 0);
871 return;
872}
873
874/*
875 * the read/modify/write code wants to use the original bio for
876 * any pages it included, and then use the rbio for everything
877 * else. This function decides if a given index (stripe number)
878 * and page number in that stripe fall inside the original bio
879 * or the rbio.
880 *
881 * if you set bio_list_only, you'll get a NULL back for any ranges
882 * that are outside the bio_list
883 *
884 * This doesn't take any refs on anything, you get a bare page pointer
885 * and the caller must bump refs as required.
886 *
887 * You must call index_rbio_pages once before you can trust
888 * the answers from this function.
889 */
890static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
891 int index, int pagenr, int bio_list_only)
892{
893 int chunk_page;
894 struct page *p = NULL;
895
896 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
897
898 spin_lock_irq(&rbio->bio_list_lock);
899 p = rbio->bio_pages[chunk_page];
900 spin_unlock_irq(&rbio->bio_list_lock);
901
902 if (p || bio_list_only)
903 return p;
904
905 return rbio->stripe_pages[chunk_page];
906}
907
908/*
909 * number of pages we need for the entire stripe across all the
910 * drives
911 */
912static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
913{
914 unsigned long nr = stripe_len * nr_stripes;
915 return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
916}
917
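A quick worked example with illustrative numbers:

/*
 * Example: stripe_len = 64KiB and nr_stripes = 3 (two data + parity)
 * cover 192KiB, which with 4KiB pages is
 * (196608 + 4095) >> 12 = 48 pages for the whole rbio.
 */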
918/*
 919 * allocation and initial setup for the btrfs_raid_bio. Note that
 920 * this does not allocate any pages for rbio->stripe_pages.
921 */
922static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
923 struct btrfs_bio *bbio, u64 *raid_map,
924 u64 stripe_len)
925{
926 struct btrfs_raid_bio *rbio;
927 int nr_data = 0;
928 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
929 void *p;
930
931 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
932 GFP_NOFS);
933 if (!rbio) {
934 kfree(raid_map);
935 kfree(bbio);
936 return ERR_PTR(-ENOMEM);
937 }
938
939 bio_list_init(&rbio->bio_list);
940 INIT_LIST_HEAD(&rbio->plug_list);
941 spin_lock_init(&rbio->bio_list_lock);
942 INIT_LIST_HEAD(&rbio->stripe_cache);
943 INIT_LIST_HEAD(&rbio->hash_list);
944 rbio->bbio = bbio;
945 rbio->raid_map = raid_map;
946 rbio->fs_info = root->fs_info;
947 rbio->stripe_len = stripe_len;
948 rbio->nr_pages = num_pages;
949 rbio->faila = -1;
950 rbio->failb = -1;
951 atomic_set(&rbio->refs, 1);
952
953 /*
954 * the stripe_pages and bio_pages array point to the extra
955 * memory we allocated past the end of the rbio
956 */
957 p = rbio + 1;
958 rbio->stripe_pages = p;
959 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
960
961 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
962 nr_data = bbio->num_stripes - 2;
963 else
964 nr_data = bbio->num_stripes - 1;
965
966 rbio->nr_data = nr_data;
967 return rbio;
968}
969
970/* allocate pages for all the stripes in the bio, including parity */
971static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
972{
973 int i;
974 struct page *page;
975
976 for (i = 0; i < rbio->nr_pages; i++) {
977 if (rbio->stripe_pages[i])
978 continue;
979 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
980 if (!page)
981 return -ENOMEM;
982 rbio->stripe_pages[i] = page;
983 ClearPageUptodate(page);
984 }
985 return 0;
986}
987
988/* allocate pages for just the p/q stripes */
989static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
990{
991 int i;
992 struct page *page;
993
994 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
995
996 for (; i < rbio->nr_pages; i++) {
997 if (rbio->stripe_pages[i])
998 continue;
999 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1000 if (!page)
1001 return -ENOMEM;
1002 rbio->stripe_pages[i] = page;
1003 }
1004 return 0;
1005}
1006
1007/*
 1008 * add a single page from a specific stripe into our list of bios for IO.
1009 * this will try to merge into existing bios if possible, and returns
1010 * zero if all went well.
1011 */
1012int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1013 struct bio_list *bio_list,
1014 struct page *page,
1015 int stripe_nr,
1016 unsigned long page_index,
1017 unsigned long bio_max_len)
1018{
1019 struct bio *last = bio_list->tail;
1020 u64 last_end = 0;
1021 int ret;
1022 struct bio *bio;
1023 struct btrfs_bio_stripe *stripe;
1024 u64 disk_start;
1025
1026 stripe = &rbio->bbio->stripes[stripe_nr];
1027 disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1028
1029 /* if the device is missing, just fail this stripe */
1030 if (!stripe->dev->bdev)
1031 return fail_rbio_index(rbio, stripe_nr);
1032
1033 /* see if we can add this page onto our existing bio */
1034 if (last) {
1035 last_end = (u64)last->bi_sector << 9;
1036 last_end += last->bi_size;
1037
1038 /*
1039 * we can't merge these if they are from different
1040 * devices or if they are not contiguous
1041 */
1042 if (last_end == disk_start && stripe->dev->bdev &&
1043 test_bit(BIO_UPTODATE, &last->bi_flags) &&
1044 last->bi_bdev == stripe->dev->bdev) {
1045 ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1046 if (ret == PAGE_CACHE_SIZE)
1047 return 0;
1048 }
1049 }
1050
1051 /* put a new bio on the list */
1052 bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
1053 if (!bio)
1054 return -ENOMEM;
1055
1056 bio->bi_size = 0;
1057 bio->bi_bdev = stripe->dev->bdev;
1058 bio->bi_sector = disk_start >> 9;
1059 set_bit(BIO_UPTODATE, &bio->bi_flags);
1060
1061 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1062 bio_list_add(bio_list, bio);
1063 return 0;
1064}
1065
1066/*
1067 * while we're doing the read/modify/write cycle, we could
1068 * have errors in reading pages off the disk. This checks
1069 * for errors and if we're not able to read the page it'll
1070 * trigger parity reconstruction. The rmw will be finished
1071 * after we've reconstructed the failed stripes
1072 */
1073static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1074{
1075 if (rbio->faila >= 0 || rbio->failb >= 0) {
1076 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
1077 __raid56_parity_recover(rbio);
1078 } else {
1079 finish_rmw(rbio);
1080 }
1081}
1082
1083/*
1084 * these are just the pages from the rbio array, not from anything
1085 * the FS sent down to us
1086 */
1087static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1088{
1089 int index;
1090 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1091 index += page;
1092 return rbio->stripe_pages[index];
1093}
1094
1095/*
1096 * helper function to walk our bio list and populate the bio_pages array with
1097 * the result. This seems expensive, but it is faster than constantly
 1098 * searching through the bio list as we set up the IO in finish_rmw or stripe
1099 * reconstruction.
1100 *
1101 * This must be called before you trust the answers from page_in_rbio
1102 */
1103static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1104{
1105 struct bio *bio;
1106 u64 start;
1107 unsigned long stripe_offset;
1108 unsigned long page_index;
1109 struct page *p;
1110 int i;
1111
1112 spin_lock_irq(&rbio->bio_list_lock);
1113 bio_list_for_each(bio, &rbio->bio_list) {
1114 start = (u64)bio->bi_sector << 9;
1115 stripe_offset = start - rbio->raid_map[0];
1116 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1117
1118 for (i = 0; i < bio->bi_vcnt; i++) {
1119 p = bio->bi_io_vec[i].bv_page;
1120 rbio->bio_pages[page_index + i] = p;
1121 }
1122 }
1123 spin_unlock_irq(&rbio->bio_list_lock);
1124}
1125
1126/*
1127 * this is called from one of two situations. We either
1128 * have a full stripe from the higher layers, or we've read all
1129 * the missing bits off disk.
1130 *
1131 * This will calculate the parity and then send down any
1132 * changed blocks.
1133 */
1134static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1135{
1136 struct btrfs_bio *bbio = rbio->bbio;
1137 void *pointers[bbio->num_stripes];
1138 int stripe_len = rbio->stripe_len;
1139 int nr_data = rbio->nr_data;
1140 int stripe;
1141 int pagenr;
1142 int p_stripe = -1;
1143 int q_stripe = -1;
1144 struct bio_list bio_list;
1145 struct bio *bio;
1146 int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1147 int ret;
1148
1149 bio_list_init(&bio_list);
1150
1151 if (bbio->num_stripes - rbio->nr_data == 1) {
1152 p_stripe = bbio->num_stripes - 1;
1153 } else if (bbio->num_stripes - rbio->nr_data == 2) {
1154 p_stripe = bbio->num_stripes - 2;
1155 q_stripe = bbio->num_stripes - 1;
1156 } else {
1157 BUG();
1158 }
1159
1160 /* at this point we either have a full stripe,
1161 * or we've read the full stripe from the drive.
1162 * recalculate the parity and write the new results.
1163 *
1164 * We're not allowed to add any new bios to the
 1165 * bio list here; anyone else who wants to
1166 * change this stripe needs to do their own rmw.
1167 */
1168 spin_lock_irq(&rbio->bio_list_lock);
1169 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1170 spin_unlock_irq(&rbio->bio_list_lock);
1171
1172 atomic_set(&rbio->bbio->error, 0);
1173
1174 /*
1175 * now that we've set rmw_locked, run through the
1176 * bio list one last time and map the page pointers
1177 *
1178 * We don't cache full rbios because we're assuming
1179 * the higher layers are unlikely to use this area of
1180 * the disk again soon. If they do use it again,
1181 * hopefully they will send another full bio.
1182 */
1183 index_rbio_pages(rbio);
1184 if (!rbio_is_full(rbio))
1185 cache_rbio_pages(rbio);
1186 else
1187 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1188
1189 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1190 struct page *p;
1191 /* first collect one page from each data stripe */
1192 for (stripe = 0; stripe < nr_data; stripe++) {
1193 p = page_in_rbio(rbio, stripe, pagenr, 0);
1194 pointers[stripe] = kmap(p);
1195 }
1196
1197 /* then add the parity stripe */
1198 p = rbio_pstripe_page(rbio, pagenr);
1199 SetPageUptodate(p);
1200 pointers[stripe++] = kmap(p);
1201
1202 if (q_stripe != -1) {
1203
1204 /*
1205 * raid6, add the qstripe and call the
1206 * library function to fill in our p/q
1207 */
1208 p = rbio_qstripe_page(rbio, pagenr);
1209 SetPageUptodate(p);
1210 pointers[stripe++] = kmap(p);
1211
1212 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
1213 pointers);
1214 } else {
1215 /* raid5 */
1216 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1217 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1218 }
1219
1220
1221 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
1222 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1223 }
1224
1225 /*
1226 * time to start writing. Make bios for everything from the
1227 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1228 * everything else.
1229 */
1230 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1231 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1232 struct page *page;
1233 if (stripe < rbio->nr_data) {
1234 page = page_in_rbio(rbio, stripe, pagenr, 1);
1235 if (!page)
1236 continue;
1237 } else {
1238 page = rbio_stripe_page(rbio, stripe, pagenr);
1239 }
1240
1241 ret = rbio_add_io_page(rbio, &bio_list,
1242 page, stripe, pagenr, rbio->stripe_len);
1243 if (ret)
1244 goto cleanup;
1245 }
1246 }
1247
1248 atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
1249 BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
1250
1251 while (1) {
1252 bio = bio_list_pop(&bio_list);
1253 if (!bio)
1254 break;
1255
1256 bio->bi_private = rbio;
1257 bio->bi_end_io = raid_write_end_io;
1258 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1259 submit_bio(WRITE, bio);
1260 }
1261 return;
1262
1263cleanup:
1264 rbio_orig_end_io(rbio, -EIO, 0);
1265}
1266
1267/*
1268 * helper to find the stripe number for a given bio. Used to figure out which
1269 * stripe has failed. This expects the bio to correspond to a physical disk,
1270 * so it looks up based on physical sector numbers.
1271 */
1272static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1273 struct bio *bio)
1274{
1275 u64 physical = bio->bi_sector;
1276 u64 stripe_start;
1277 int i;
1278 struct btrfs_bio_stripe *stripe;
1279
1280 physical <<= 9;
1281
1282 for (i = 0; i < rbio->bbio->num_stripes; i++) {
1283 stripe = &rbio->bbio->stripes[i];
1284 stripe_start = stripe->physical;
1285 if (physical >= stripe_start &&
1286 physical < stripe_start + rbio->stripe_len) {
1287 return i;
1288 }
1289 }
1290 return -1;
1291}
1292
1293/*
1294 * helper to find the stripe number for a given
1295 * bio (before mapping). Used to figure out which stripe has
1296 * failed. This looks up based on logical block numbers.
1297 */
1298static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1299 struct bio *bio)
1300{
1301 u64 logical = bio->bi_sector;
1302 u64 stripe_start;
1303 int i;
1304
1305 logical <<= 9;
1306
1307 for (i = 0; i < rbio->nr_data; i++) {
1308 stripe_start = rbio->raid_map[i];
1309 if (logical >= stripe_start &&
1310 logical < stripe_start + rbio->stripe_len) {
1311 return i;
1312 }
1313 }
1314 return -1;
1315}
1316
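A worked example with made-up addresses: because the data entries of raid_map[] are the sorted logical starts of the data stripes, the lookup is a simple range check.

/*
 * Example (illustrative values): stripe_len = 64KiB and
 * raid_map = { 10MiB, 10MiB + 64KiB, <p/q marker entries> }.  A bio
 * whose bi_sector corresponds to logical 10MiB + 80KiB lands inside
 * the second data stripe's range, so find_logical_bio_stripe()
 * returns 1.
 */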
1317/*
1318 * returns -EIO if we had too many failures
1319 */
1320static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1321{
1322 unsigned long flags;
1323 int ret = 0;
1324
1325 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1326
1327 /* we already know this stripe is bad, move on */
1328 if (rbio->faila == failed || rbio->failb == failed)
1329 goto out;
1330
1331 if (rbio->faila == -1) {
1332 /* first failure on this rbio */
1333 rbio->faila = failed;
1334 atomic_inc(&rbio->bbio->error);
1335 } else if (rbio->failb == -1) {
1336 /* second failure on this rbio */
1337 rbio->failb = failed;
1338 atomic_inc(&rbio->bbio->error);
1339 } else {
1340 ret = -EIO;
1341 }
1342out:
1343 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1344
1345 return ret;
1346}
1347
1348/*
1349 * helper to fail a stripe based on a physical disk
1350 * bio.
1351 */
1352static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1353 struct bio *bio)
1354{
1355 int failed = find_bio_stripe(rbio, bio);
1356
1357 if (failed < 0)
1358 return -EIO;
1359
1360 return fail_rbio_index(rbio, failed);
1361}
1362
1363/*
1364 * this sets each page in the bio uptodate. It should only be used on private
1365 * rbio pages, nothing that comes in from the higher layers
1366 */
1367static void set_bio_pages_uptodate(struct bio *bio)
1368{
1369 int i;
1370 struct page *p;
1371
1372 for (i = 0; i < bio->bi_vcnt; i++) {
1373 p = bio->bi_io_vec[i].bv_page;
1374 SetPageUptodate(p);
1375 }
1376}
1377
1378/*
1379 * end io for the read phase of the rmw cycle. All the bios here are physical
1380 * stripe bios we've read from the disk so we can recalculate the parity of the
1381 * stripe.
1382 *
1383 * This will usually kick off finish_rmw once all the bios are read in, but it
1384 * may trigger parity reconstruction if we had any errors along the way
1385 */
1386static void raid_rmw_end_io(struct bio *bio, int err)
1387{
1388 struct btrfs_raid_bio *rbio = bio->bi_private;
1389
1390 if (err)
1391 fail_bio_stripe(rbio, bio);
1392 else
1393 set_bio_pages_uptodate(bio);
1394
1395 bio_put(bio);
1396
1397 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1398 return;
1399
1400 err = 0;
1401 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1402 goto cleanup;
1403
1404 /*
1405 * this will normally call finish_rmw to start our write
1406 * but if there are any failed stripes we'll reconstruct
1407 * from parity first
1408 */
1409 validate_rbio_for_rmw(rbio);
1410 return;
1411
1412cleanup:
1413
1414 rbio_orig_end_io(rbio, -EIO, 0);
1415}
1416
1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1418{
1419 rbio->work.flags = 0;
1420 rbio->work.func = rmw_work;
1421
1422 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1423 &rbio->work);
1424}
1425
1426static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1427{
1428 rbio->work.flags = 0;
1429 rbio->work.func = read_rebuild_work;
1430
1431 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1432 &rbio->work);
1433}
1434
1435/*
1436 * the stripe must be locked by the caller. It will
1437 * unlock after all the writes are done
1438 */
1439static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1440{
1441 int bios_to_read = 0;
1442 struct btrfs_bio *bbio = rbio->bbio;
1443 struct bio_list bio_list;
1444 int ret;
1445 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1446 int pagenr;
1447 int stripe;
1448 struct bio *bio;
1449
1450 bio_list_init(&bio_list);
1451
1452 ret = alloc_rbio_pages(rbio);
1453 if (ret)
1454 goto cleanup;
1455
1456 index_rbio_pages(rbio);
1457
1458 atomic_set(&rbio->bbio->error, 0);
1459 /*
1460 * build a list of bios to read all the missing parts of this
1461 * stripe
1462 */
1463 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1464 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1465 struct page *page;
1466 /*
1467 * we want to find all the pages missing from
1468 * the rbio and read them from the disk. If
1469 * page_in_rbio finds a page in the bio list
1470 * we don't need to read it off the stripe.
1471 */
1472 page = page_in_rbio(rbio, stripe, pagenr, 1);
1473 if (page)
1474 continue;
1475
1476 page = rbio_stripe_page(rbio, stripe, pagenr);
1477 /*
1478 * the bio cache may have handed us an uptodate
1479 * page. If so, be happy and use it
1480 */
1481 if (PageUptodate(page))
1482 continue;
1483
1484 ret = rbio_add_io_page(rbio, &bio_list, page,
1485 stripe, pagenr, rbio->stripe_len);
1486 if (ret)
1487 goto cleanup;
1488 }
1489 }
1490
1491 bios_to_read = bio_list_size(&bio_list);
1492 if (!bios_to_read) {
1493 /*
1494 * this can happen if others have merged with
1495 * us, it means there is nothing left to read.
1496 * But if there are missing devices it may not be
1497 * safe to do the full stripe write yet.
1498 */
1499 goto finish;
1500 }
1501
1502 /*
1503 * the bbio may be freed once we submit the last bio. Make sure
1504 * not to touch it after that
1505 */
1506 atomic_set(&bbio->stripes_pending, bios_to_read);
1507 while (1) {
1508 bio = bio_list_pop(&bio_list);
1509 if (!bio)
1510 break;
1511
1512 bio->bi_private = rbio;
1513 bio->bi_end_io = raid_rmw_end_io;
1514
1515 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1516 BTRFS_WQ_ENDIO_RAID56);
1517
1518 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1519 submit_bio(READ, bio);
1520 }
1521 /* the actual write will happen once the reads are done */
1522 return 0;
1523
1524cleanup:
1525 rbio_orig_end_io(rbio, -EIO, 0);
1526 return -EIO;
1527
1528finish:
1529 validate_rbio_for_rmw(rbio);
1530 return 0;
1531}
1532
1533/*
1534 * if the upper layers pass in a full stripe, we thank them by only allocating
1535 * enough pages to hold the parity, and sending it all down quickly.
1536 */
1537static int full_stripe_write(struct btrfs_raid_bio *rbio)
1538{
1539 int ret;
1540
1541 ret = alloc_rbio_parity_pages(rbio);
1542 if (ret)
1543 return ret;
1544
1545 ret = lock_stripe_add(rbio);
1546 if (ret == 0)
1547 finish_rmw(rbio);
1548 return 0;
1549}
1550
1551/*
1552 * partial stripe writes get handed over to async helpers.
1553 * We're really hoping to merge a few more writes into this
1554 * rbio before calculating new parity
1555 */
1556static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1557{
1558 int ret;
1559
1560 ret = lock_stripe_add(rbio);
1561 if (ret == 0)
1562 async_rmw_stripe(rbio);
1563 return 0;
1564}
1565
1566/*
1567 * sometimes while we were reading from the drive to
1568 * recalculate parity, enough new bios come into create
1569 * a full stripe. So we do a check here to see if we can
1570 * go directly to finish_rmw
1571 */
1572static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1573{
1574 /* head off into rmw land if we don't have a full stripe */
1575 if (!rbio_is_full(rbio))
1576 return partial_stripe_write(rbio);
1577 return full_stripe_write(rbio);
1578}
1579
1580/*
1581 * We use plugging call backs to collect full stripes.
1582 * Any time we get a partial stripe write while plugged
1583 * we collect it into a list. When the unplug comes down,
1584 * we sort the list by logical block number and merge
1585 * everything we can into the same rbios
1586 */
1587struct btrfs_plug_cb {
1588 struct blk_plug_cb cb;
1589 struct btrfs_fs_info *info;
1590 struct list_head rbio_list;
1591 struct btrfs_work work;
1592};
1593
1594/*
1595 * rbios on the plug list are sorted for easier merging.
1596 */
1597static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1598{
1599 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1600 plug_list);
1601 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1602 plug_list);
1603 u64 a_sector = ra->bio_list.head->bi_sector;
1604 u64 b_sector = rb->bio_list.head->bi_sector;
1605
1606 if (a_sector < b_sector)
1607 return -1;
1608 if (a_sector > b_sector)
1609 return 1;
1610 return 0;
1611}
1612
1613static void run_plug(struct btrfs_plug_cb *plug)
1614{
1615 struct btrfs_raid_bio *cur;
1616 struct btrfs_raid_bio *last = NULL;
1617
1618 /*
1619 * sort our plug list then try to merge
1620 * everything we can in hopes of creating full
1621 * stripes.
1622 */
1623 list_sort(NULL, &plug->rbio_list, plug_cmp);
1624 while (!list_empty(&plug->rbio_list)) {
1625 cur = list_entry(plug->rbio_list.next,
1626 struct btrfs_raid_bio, plug_list);
1627 list_del_init(&cur->plug_list);
1628
1629 if (rbio_is_full(cur)) {
1630 /* we have a full stripe, send it down */
1631 full_stripe_write(cur);
1632 continue;
1633 }
1634 if (last) {
1635 if (rbio_can_merge(last, cur)) {
1636 merge_rbio(last, cur);
1637 __free_raid_bio(cur);
1638 continue;
1639
1640 }
1641 __raid56_parity_write(last);
1642 }
1643 last = cur;
1644 }
1645 if (last) {
1646 __raid56_parity_write(last);
1647 }
1648 kfree(plug);
1649}
1650
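The plug path above boils down to: sort the queued rbios by starting sector, then merge neighbours that land in the same full stripe before parity is computed. A minimal userspace sketch of that sort-then-merge idea (the sector numbers, lengths and struct here are invented for illustration, not btrfs structures):

#include <stdio.h>
#include <stdlib.h>

struct pending_write {             /* stand-in for a queued rbio */
	unsigned long long sector;     /* first sector covered by the write */
	unsigned int sectors;          /* length in sectors */
};

/* same ordering rule as plug_cmp: ascending start sector */
static int write_cmp(const void *a, const void *b)
{
	const struct pending_write *wa = a, *wb = b;

	if (wa->sector < wb->sector)
		return -1;
	if (wa->sector > wb->sector)
		return 1;
	return 0;
}

int main(void)
{
	struct pending_write q[] = {
		{ 2048, 128 }, { 0, 128 }, { 128, 128 }, { 256, 128 },
	};
	int i, n = sizeof(q) / sizeof(q[0]);

	/* sort so adjacent writes end up next to each other ... */
	qsort(q, n, sizeof(q[0]), write_cmp);

	/* ... then merge runs that are physically contiguous */
	for (i = 1; i < n; i++) {
		if (q[i].sector == q[i - 1].sector + q[i - 1].sectors)
			printf("merge write at %llu into previous\n", q[i].sector);
		else
			printf("write at %llu starts a new rbio\n", q[i].sector);
	}
	return 0;
}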
1651/*
1652 * if the unplug comes from schedule, we have to push the
1653 * work off to a helper thread
1654 */
1655static void unplug_work(struct btrfs_work *work)
1656{
1657 struct btrfs_plug_cb *plug;
1658 plug = container_of(work, struct btrfs_plug_cb, work);
1659 run_plug(plug);
1660}
1661
1662static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1663{
1664 struct btrfs_plug_cb *plug;
1665 plug = container_of(cb, struct btrfs_plug_cb, cb);
1666
1667 if (from_schedule) {
1668 plug->work.flags = 0;
1669 plug->work.func = unplug_work;
1670 btrfs_queue_worker(&plug->info->rmw_workers,
1671 &plug->work);
1672 return;
1673 }
1674 run_plug(plug);
1675}
1676
1677/*
1678 * our main entry point for writes from the rest of the FS.
1679 */
1680int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1681 struct btrfs_bio *bbio, u64 *raid_map,
1682 u64 stripe_len)
1683{
1684 struct btrfs_raid_bio *rbio;
1685 struct btrfs_plug_cb *plug = NULL;
1686 struct blk_plug_cb *cb;
1687
1688 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1689 if (IS_ERR(rbio)) {
1690 kfree(raid_map);
1691 kfree(bbio);
1692 return PTR_ERR(rbio);
1693 }
1694 bio_list_add(&rbio->bio_list, bio);
1695 rbio->bio_list_bytes = bio->bi_size;
1696
1697 /*
1698 * don't plug on full rbios, just get them out the door
1699 * as quickly as we can
1700 */
1701 if (rbio_is_full(rbio))
1702 return full_stripe_write(rbio);
1703
1704 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1705 sizeof(*plug));
1706 if (cb) {
1707 plug = container_of(cb, struct btrfs_plug_cb, cb);
1708 if (!plug->info) {
1709 plug->info = root->fs_info;
1710 INIT_LIST_HEAD(&plug->rbio_list);
1711 }
1712 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1713 } else {
1714 return __raid56_parity_write(rbio);
1715 }
1716 return 0;
1717}
1718
1719/*
1720 * all parity reconstruction happens here. We've read in everything
1721 * we can find from the drives and this does the heavy lifting of
1722 * sorting the good from the bad.
1723 */
1724static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1725{
1726 int pagenr, stripe;
1727 void **pointers;
1728 int faila = -1, failb = -1;
1729 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1730 struct page *page;
1731 int err;
1732 int i;
1733
1734 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1735 GFP_NOFS);
1736 if (!pointers) {
1737 err = -ENOMEM;
1738 goto cleanup_io;
1739 }
1740
1741 faila = rbio->faila;
1742 failb = rbio->failb;
1743
1744 if (rbio->read_rebuild) {
1745 spin_lock_irq(&rbio->bio_list_lock);
1746 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1747 spin_unlock_irq(&rbio->bio_list_lock);
1748 }
1749
1750 index_rbio_pages(rbio);
1751
1752 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1753 /* setup our array of pointers with pages
1754 * from each stripe
1755 */
1756 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1757 /*
1758 * if we're rebuilding a read, we have to use
1759 * pages from the bio list
1760 */
1761 if (rbio->read_rebuild &&
1762 (stripe == faila || stripe == failb)) {
1763 page = page_in_rbio(rbio, stripe, pagenr, 0);
1764 } else {
1765 page = rbio_stripe_page(rbio, stripe, pagenr);
1766 }
1767 pointers[stripe] = kmap(page);
1768 }
1769
1770 /* all raid6 handling here */
1771 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1772 RAID6_Q_STRIPE) {
1773
1774 /*
1775 * single failure, rebuild from parity raid5
1776 * style
1777 */
1778 if (failb < 0) {
1779 if (faila == rbio->nr_data) {
1780 /*
1781 * Just the P stripe has failed, without
1782 * a bad data or Q stripe.
1783 * TODO, we should redo the xor here.
1784 */
1785 err = -EIO;
1786 goto cleanup;
1787 }
1788 /*
1789 * a single failure in raid6 is rebuilt
1790 * in the pstripe code below
1791 */
1792 goto pstripe;
1793 }
1794
1795 /* make sure our ps and qs are in order */
1796 if (faila > failb) {
1797 int tmp = failb;
1798 failb = faila;
1799 faila = tmp;
1800 }
1801
1802 /* if the q stripe has failed, do a pstripe reconstruction
1803 * from the xors.
1804 * If both the q stripe and the P stripe have failed, we're
1805 * here due to a crc mismatch and we can't give them the
1806 * data they want
1807 */
1808 if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1809 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1810 err = -EIO;
1811 goto cleanup;
1812 }
1813 /*
1814 * otherwise we have one bad data stripe and
1815 * a good P stripe. raid5!
1816 */
1817 goto pstripe;
1818 }
1819
1820 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1821 raid6_datap_recov(rbio->bbio->num_stripes,
1822 PAGE_SIZE, faila, pointers);
1823 } else {
1824 raid6_2data_recov(rbio->bbio->num_stripes,
1825 PAGE_SIZE, faila, failb,
1826 pointers);
1827 }
1828 } else {
1829 void *p;
1830
1831 /* rebuild from P stripe here (raid5 or raid6) */
1832 BUG_ON(failb != -1);
1833pstripe:
1834 /* Copy parity block into failed block to start with */
1835 memcpy(pointers[faila],
1836 pointers[rbio->nr_data],
1837 PAGE_CACHE_SIZE);
1838
1839 /* rearrange the pointer array */
1840 p = pointers[faila];
1841 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1842 pointers[stripe] = pointers[stripe + 1];
1843 pointers[rbio->nr_data - 1] = p;
1844
1845 /* xor in the rest */
1846 run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1847 }
1848 /* if we're doing this rebuild as part of an rmw, go through
1849 * and set all of our private rbio pages in the
1850 * failed stripes as uptodate. This way finish_rmw will
1851 * know they can be trusted. If this was a read reconstruction,
1852 * other endio functions will fiddle the uptodate bits
1853 */
1854 if (!rbio->read_rebuild) {
1855 for (i = 0; i < nr_pages; i++) {
1856 if (faila != -1) {
1857 page = rbio_stripe_page(rbio, faila, i);
1858 SetPageUptodate(page);
1859 }
1860 if (failb != -1) {
1861 page = rbio_stripe_page(rbio, failb, i);
1862 SetPageUptodate(page);
1863 }
1864 }
1865 }
1866 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1867 /*
1868 * if we're rebuilding a read, we have to use
1869 * pages from the bio list
1870 */
1871 if (rbio->read_rebuild &&
1872 (stripe == faila || stripe == failb)) {
1873 page = page_in_rbio(rbio, stripe, pagenr, 0);
1874 } else {
1875 page = rbio_stripe_page(rbio, stripe, pagenr);
1876 }
1877 kunmap(page);
1878 }
1879 }
1880
1881 err = 0;
1882cleanup:
1883 kfree(pointers);
1884
1885cleanup_io:
1886
1887 if (rbio->read_rebuild) {
1888 if (err == 0)
1889 cache_rbio_pages(rbio);
1890 else
1891 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1892
1893 rbio_orig_end_io(rbio, err, err == 0);
1894 } else if (err == 0) {
1895 rbio->faila = -1;
1896 rbio->failb = -1;
1897 finish_rmw(rbio);
1898 } else {
1899 rbio_orig_end_io(rbio, err, 0);
1900 }
1901}
1902
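The pstripe path above is plain RAID5 algebra: parity is the XOR of all data blocks, so a single missing block is the XOR of the parity with the surviving data blocks. A stand-alone sketch of that rebuild (block count and size are arbitrary toy values, nothing btrfs-specific):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define NBLOCKS 3   /* data blocks per stripe in this toy example */
#define BLKSZ   16

/* parity = XOR of every data block, as written by a full-stripe write */
static void make_parity(uint8_t data[NBLOCKS][BLKSZ], uint8_t parity[BLKSZ])
{
	int i, j;

	memset(parity, 0, BLKSZ);
	for (i = 0; i < NBLOCKS; i++)
		for (j = 0; j < BLKSZ; j++)
			parity[j] ^= data[i][j];
}

/* rebuild one lost data block: start from parity, XOR in the survivors */
static void rebuild(uint8_t data[NBLOCKS][BLKSZ], uint8_t parity[BLKSZ],
		    int lost, uint8_t out[BLKSZ])
{
	int i, j;

	memcpy(out, parity, BLKSZ);
	for (i = 0; i < NBLOCKS; i++) {
		if (i == lost)
			continue;
		for (j = 0; j < BLKSZ; j++)
			out[j] ^= data[i][j];
	}
}

int main(void)
{
	uint8_t data[NBLOCKS][BLKSZ] = { "alpha", "bravo", "charlie" };
	uint8_t parity[BLKSZ], recovered[BLKSZ];

	make_parity(data, parity);
	rebuild(data, parity, 1, recovered);      /* pretend block 1 was lost */
	assert(memcmp(recovered, data[1], BLKSZ) == 0);
	return 0;
}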
1903/*
1904 * This is called only for stripes we've read from disk to
1905 * reconstruct the parity.
1906 */
1907static void raid_recover_end_io(struct bio *bio, int err)
1908{
1909 struct btrfs_raid_bio *rbio = bio->bi_private;
1910
1911 /*
1912 * we only read stripe pages off the disk, set them
1913 * up to date if there were no errors
1914 */
1915 if (err)
1916 fail_bio_stripe(rbio, bio);
1917 else
1918 set_bio_pages_uptodate(bio);
1919 bio_put(bio);
1920
1921 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1922 return;
1923
1924 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1925 rbio_orig_end_io(rbio, -EIO, 0);
1926 else
1927 __raid_recover_end_io(rbio);
1928}
1929
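raid_recover_end_io, like the write and rmw end_io handlers earlier in the file, relies on a completion-count pattern: stripes_pending is set to the number of bios submitted, every completion decrements it, and whichever completion reaches zero kicks off the next phase. A hedged stand-alone sketch of that pattern using C11 atomics (names are invented for illustration; the kernel uses atomic_set/atomic_dec_and_test):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int pending;        /* plays the role of bbio->stripes_pending */

static void one_bio_done(void)
{
	/* the completion that drops the count to zero runs the next phase */
	if (atomic_fetch_sub(&pending, 1) == 1)
		printf("last read finished, start reconstruction\n");
	else
		printf("still waiting for other reads\n");
}

int main(void)
{
	int i, bios_to_read = 3;

	atomic_store(&pending, bios_to_read);  /* like atomic_set() before submit */
	for (i = 0; i < bios_to_read; i++)
		one_bio_done();                /* normally called from end_io */
	return 0;
}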
1930/*
1931 * reads everything we need off the disk to reconstruct
1932 * the parity. endio handlers trigger final reconstruction
1933 * when the IO is done.
1934 *
1935 * This is used both for reads from the higher layers and for
1936 * parity construction required to finish a rmw cycle.
1937 */
1938static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1939{
1940 int bios_to_read = 0;
1941 struct btrfs_bio *bbio = rbio->bbio;
1942 struct bio_list bio_list;
1943 int ret;
1944 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1945 int pagenr;
1946 int stripe;
1947 struct bio *bio;
1948
1949 bio_list_init(&bio_list);
1950
1951 ret = alloc_rbio_pages(rbio);
1952 if (ret)
1953 goto cleanup;
1954
1955 atomic_set(&rbio->bbio->error, 0);
1956
1957 /*
1958 * read everything that hasn't failed. Thanks to the
1959 * stripe cache, it is possible that some or all of these
1960 * pages are going to be uptodate.
1961 */
1962 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1963 if (rbio->faila == stripe ||
1964 rbio->failb == stripe)
1965 continue;
1966
1967 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1968 struct page *p;
1969
1970 /*
1971 * the rmw code may have already read this
1972 * page in
1973 */
1974 p = rbio_stripe_page(rbio, stripe, pagenr);
1975 if (PageUptodate(p))
1976 continue;
1977
1978 ret = rbio_add_io_page(rbio, &bio_list,
1979 rbio_stripe_page(rbio, stripe, pagenr),
1980 stripe, pagenr, rbio->stripe_len);
1981 if (ret < 0)
1982 goto cleanup;
1983 }
1984 }
1985
1986 bios_to_read = bio_list_size(&bio_list);
1987 if (!bios_to_read) {
1988 /*
1989 * we might have no bios to read just because the pages
1990 * were up to date, or we might have no bios to read because
1991 * the devices were gone.
1992 */
1993 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
1994 __raid_recover_end_io(rbio);
1995 goto out;
1996 } else {
1997 goto cleanup;
1998 }
1999 }
2000
2001 /*
2002 * the bbio may be freed once we submit the last bio. Make sure
2003 * not to touch it after that
2004 */
2005 atomic_set(&bbio->stripes_pending, bios_to_read);
2006 while (1) {
2007 bio = bio_list_pop(&bio_list);
2008 if (!bio)
2009 break;
2010
2011 bio->bi_private = rbio;
2012 bio->bi_end_io = raid_recover_end_io;
2013
2014 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2015 BTRFS_WQ_ENDIO_RAID56);
2016
2017 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2018 submit_bio(READ, bio);
2019 }
2020out:
2021 return 0;
2022
2023cleanup:
2024 if (rbio->read_rebuild)
2025 rbio_orig_end_io(rbio, -EIO, 0);
2026 return -EIO;
2027}
2028
2029/*
2030 * the main entry point for reads from the higher layers. This
2031 * is really only called when the normal read path had a failure,
2032 * so we assume the bio they send down corresponds to a failed part
2033 * of the drive.
2034 */
2035int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2036 struct btrfs_bio *bbio, u64 *raid_map,
2037 u64 stripe_len, int mirror_num)
2038{
2039 struct btrfs_raid_bio *rbio;
2040 int ret;
2041
2042 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2043 if (IS_ERR(rbio)) {
2044 return PTR_ERR(rbio);
2045 }
2046
2047 rbio->read_rebuild = 1;
2048 bio_list_add(&rbio->bio_list, bio);
2049 rbio->bio_list_bytes = bio->bi_size;
2050
2051 rbio->faila = find_logical_bio_stripe(rbio, bio);
2052 if (rbio->faila == -1) {
2053 BUG();
2054 kfree(rbio);
2055 return -EIO;
2056 }
2057
2058 /*
2059 * reconstruct from the q stripe if they are
2060 * asking for mirror 3
2061 */
2062 if (mirror_num == 3)
2063 rbio->failb = bbio->num_stripes - 2;
2064
2065 ret = lock_stripe_add(rbio);
2066
2067 /*
2068 * __raid56_parity_recover will end the bio with
2069 * any errors it hits. We don't want to return
2070 * its error value up the stack because our caller
2071 * will end up calling bio_endio with any nonzero
2072 * return
2073 */
2074 if (ret == 0)
2075 __raid56_parity_recover(rbio);
2076 /*
2077 * our rbio has been added to the list of
2078 * rbios that will be handled after the
2079 * current lock owner is done
2080 */
2081 return 0;
2082
2083}
2084
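The mirror_num handling above gives RAID6 reads up to three retries: mirror 1 is the plain read, mirror 2 rebuilds the bad block from the P stripe, and mirror 3 additionally marks the P stripe (index num_stripes - 2) as failed so the rebuild is forced to go through Q. A toy sketch of that mapping (pure illustration, not kernel code):

#include <stdio.h>

/* decide which extra stripe to treat as failed for a given retry */
static int extra_failed_stripe(int mirror_num, int num_stripes)
{
	if (mirror_num == 3)
		return num_stripes - 2;   /* the P stripe, so Q must be used */
	return -1;                        /* no second failure assumed */
}

int main(void)
{
	int num_stripes = 4;              /* e.g. 2 data stripes + P + Q */
	int mirror;

	for (mirror = 1; mirror <= 3; mirror++)
		printf("mirror %d -> extra failed stripe = %d\n", mirror,
		       extra_failed_stripe(mirror, num_stripes));
	return 0;
}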
2085static void rmw_work(struct btrfs_work *work)
2086{
2087 struct btrfs_raid_bio *rbio;
2088
2089 rbio = container_of(work, struct btrfs_raid_bio, work);
2090 raid56_rmw_stripe(rbio);
2091}
2092
2093static void read_rebuild_work(struct btrfs_work *work)
2094{
2095 struct btrfs_raid_bio *rbio;
2096
2097 rbio = container_of(work, struct btrfs_raid_bio, work);
2098 __raid56_parity_recover(rbio);
2099}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
new file mode 100644
index 000000000000..ea5d73bfdfbe
--- /dev/null
+++ b/fs/btrfs/raid56.h
@@ -0,0 +1,51 @@
1/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19
20#ifndef __BTRFS_RAID56__
21#define __BTRFS_RAID56__
22static inline int nr_parity_stripes(struct map_lookup *map)
23{
24 if (map->type & BTRFS_BLOCK_GROUP_RAID5)
25 return 1;
26 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
27 return 2;
28 else
29 return 0;
30}
31
32static inline int nr_data_stripes(struct map_lookup *map)
33{
34 return map->num_stripes - nr_parity_stripes(map);
35}
36#define RAID5_P_STRIPE ((u64)-2)
37#define RAID6_Q_STRIPE ((u64)-1)
38
39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
40 ((x) == RAID6_Q_STRIPE))
41
42int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
43 struct btrfs_bio *bbio, u64 *raid_map,
44 u64 stripe_len, int mirror_num);
45int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
46 struct btrfs_bio *bbio, u64 *raid_map,
47 u64 stripe_len);
48
49int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
50void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
51#endif
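The header above leans on a small convention: raid_map holds the logical address of each data stripe, while the P and Q slots at the tail are tagged with the RAID5_P_STRIPE and RAID6_Q_STRIPE sentinels ((u64)-2 and (u64)-1, so they always sort last), and is_parity_stripe() classifies an entry. A small self-contained demo of that layout (the logical addresses are made up):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

#define RAID5_P_STRIPE ((u64)-2)
#define RAID6_Q_STRIPE ((u64)-1)
#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || ((x) == RAID6_Q_STRIPE))

int main(void)
{
	/* two data stripes followed by P and Q, as a raid6 raid_map might look */
	u64 raid_map[] = { 1048576, 1114112, RAID5_P_STRIPE, RAID6_Q_STRIPE };
	int i, n = sizeof(raid_map) / sizeof(raid_map[0]);

	for (i = 0; i < n; i++) {
		if (is_parity_stripe(raid_map[i]))
			printf("stripe %d: parity\n", i);
		else
			printf("stripe %d: data at logical %llu\n", i,
			       (unsigned long long)raid_map[i]);
	}
	return 0;
}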
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 17c306bf177a..50695dc5e2ab 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3017,7 +3017,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
3017 } 3017 }
3018 } 3018 }
3019 3019
3020 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 3020 page_start = page_offset(page);
3021 page_end = page_start + PAGE_CACHE_SIZE - 1; 3021 page_end = page_start + PAGE_CACHE_SIZE - 1;
3022 3022
3023 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); 3023 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 67783e03d121..53c3501fa4ca 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -28,6 +28,7 @@
28#include "dev-replace.h" 28#include "dev-replace.h"
29#include "check-integrity.h" 29#include "check-integrity.h"
30#include "rcu-string.h" 30#include "rcu-string.h"
31#include "raid56.h"
31 32
32/* 33/*
33 * This is only the first step towards a full-features scrub. It reads all 34 * This is only the first step towards a full-features scrub. It reads all
@@ -2254,6 +2255,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2254 struct btrfs_device *extent_dev; 2255 struct btrfs_device *extent_dev;
2255 int extent_mirror_num; 2256 int extent_mirror_num;
2256 2257
2258 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2259 BTRFS_BLOCK_GROUP_RAID6)) {
2260 if (num >= nr_data_stripes(map)) {
2261 return 0;
2262 }
2263 }
2264
2257 nstripes = length; 2265 nstripes = length;
2258 offset = 0; 2266 offset = 0;
2259 do_div(nstripes, map->stripe_len); 2267 do_div(nstripes, map->stripe_len);
@@ -2708,7 +2716,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2708 int ret; 2716 int ret;
2709 struct btrfs_root *root = sctx->dev_root; 2717 struct btrfs_root *root = sctx->dev_root;
2710 2718
2711 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2719 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
2712 return -EIO; 2720 return -EIO;
2713 2721
2714 gen = root->fs_info->last_trans_committed; 2722 gen = root->fs_info->last_trans_committed;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f4ab7a9260eb..f7a8b861058b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -85,6 +85,7 @@ struct send_ctx {
85 u32 send_max_size; 85 u32 send_max_size;
86 u64 total_send_size; 86 u64 total_send_size;
87 u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; 87 u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
88 u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
88 89
89 struct vfsmount *mnt; 90 struct vfsmount *mnt;
90 91
@@ -3709,6 +3710,39 @@ out:
3709 return ret; 3710 return ret;
3710} 3711}
3711 3712
3713/*
3714 * Send an update extent command to user space.
3715 */
3716static int send_update_extent(struct send_ctx *sctx,
3717 u64 offset, u32 len)
3718{
3719 int ret = 0;
3720 struct fs_path *p;
3721
3722 p = fs_path_alloc(sctx);
3723 if (!p)
3724 return -ENOMEM;
3725
3726 ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
3727 if (ret < 0)
3728 goto out;
3729
3730 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
3731 if (ret < 0)
3732 goto out;
3733
3734 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
3735 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
3736 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
3737
3738 ret = send_cmd(sctx);
3739
3740tlv_put_failure:
3741out:
3742 fs_path_free(sctx, p);
3743 return ret;
3744}
3745
3712static int send_write_or_clone(struct send_ctx *sctx, 3746static int send_write_or_clone(struct send_ctx *sctx,
3713 struct btrfs_path *path, 3747 struct btrfs_path *path,
3714 struct btrfs_key *key, 3748 struct btrfs_key *key,
@@ -3744,7 +3778,11 @@ static int send_write_or_clone(struct send_ctx *sctx,
3744 goto out; 3778 goto out;
3745 } 3779 }
3746 3780
3747 if (!clone_root) { 3781 if (clone_root) {
3782 ret = send_clone(sctx, offset, len, clone_root);
3783 } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
3784 ret = send_update_extent(sctx, offset, len);
3785 } else {
3748 while (pos < len) { 3786 while (pos < len) {
3749 l = len - pos; 3787 l = len - pos;
3750 if (l > BTRFS_SEND_READ_SIZE) 3788 if (l > BTRFS_SEND_READ_SIZE)
@@ -3757,10 +3795,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
3757 pos += ret; 3795 pos += ret;
3758 } 3796 }
3759 ret = 0; 3797 ret = 0;
3760 } else {
3761 ret = send_clone(sctx, offset, len, clone_root);
3762 } 3798 }
3763
3764out: 3799out:
3765 return ret; 3800 return ret;
3766} 3801}
@@ -4536,7 +4571,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4536 struct btrfs_fs_info *fs_info; 4571 struct btrfs_fs_info *fs_info;
4537 struct btrfs_ioctl_send_args *arg = NULL; 4572 struct btrfs_ioctl_send_args *arg = NULL;
4538 struct btrfs_key key; 4573 struct btrfs_key key;
4539 struct file *filp = NULL;
4540 struct send_ctx *sctx = NULL; 4574 struct send_ctx *sctx = NULL;
4541 u32 i; 4575 u32 i;
4542 u64 *clone_sources_tmp = NULL; 4576 u64 *clone_sources_tmp = NULL;
@@ -4561,6 +4595,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4561 goto out; 4595 goto out;
4562 } 4596 }
4563 4597
4598 if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) {
4599 ret = -EINVAL;
4600 goto out;
4601 }
4602
4564 sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); 4603 sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
4565 if (!sctx) { 4604 if (!sctx) {
4566 ret = -ENOMEM; 4605 ret = -ENOMEM;
@@ -4572,6 +4611,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4572 INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); 4611 INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
4573 INIT_LIST_HEAD(&sctx->name_cache_list); 4612 INIT_LIST_HEAD(&sctx->name_cache_list);
4574 4613
4614 sctx->flags = arg->flags;
4615
4575 sctx->send_filp = fget(arg->send_fd); 4616 sctx->send_filp = fget(arg->send_fd);
4576 if (IS_ERR(sctx->send_filp)) { 4617 if (IS_ERR(sctx->send_filp)) {
4577 ret = PTR_ERR(sctx->send_filp); 4618 ret = PTR_ERR(sctx->send_filp);
@@ -4673,8 +4714,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4673 goto out; 4714 goto out;
4674 4715
4675out: 4716out:
4676 if (filp)
4677 fput(filp);
4678 kfree(arg); 4717 kfree(arg);
4679 vfree(clone_sources_tmp); 4718 vfree(clone_sources_tmp);
4680 4719
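With the flags check above, user space can request a metadata-only stream: file contents are replaced by UPDATE_EXTENT commands. A rough sketch of how a caller might set BTRFS_SEND_FLAG_NO_FILE_DATA through the send ioctl; the struct fields and ioctl name follow the uapi header as introduced around this series, so treat the details as an assumption to verify rather than a documented interface:

/* sketch: invoke BTRFS_IOC_SEND with the no-file-data flag set.
 * Field names and the header path are assumptions; check linux/btrfs.h.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

static int send_metadata_only(int subvol_fd, int pipe_fd)
{
	struct btrfs_ioctl_send_args args;

	memset(&args, 0, sizeof(args));
	args.send_fd = pipe_fd;                     /* where the stream goes */
	args.flags = BTRFS_SEND_FLAG_NO_FILE_DATA;  /* emit UPDATE_EXTENT only */

	return ioctl(subvol_fd, BTRFS_IOC_SEND, &args);
}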
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 1bf4f32fd4ef..8bb18f7ccaa6 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -86,6 +86,7 @@ enum btrfs_send_cmd {
86 BTRFS_SEND_C_UTIMES, 86 BTRFS_SEND_C_UTIMES,
87 87
88 BTRFS_SEND_C_END, 88 BTRFS_SEND_C_END,
89 BTRFS_SEND_C_UPDATE_EXTENT,
89 __BTRFS_SEND_C_MAX, 90 __BTRFS_SEND_C_MAX,
90}; 91};
91#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) 92#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d8982e9601d3..68a29a1ea068 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,13 +41,13 @@
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/ratelimit.h> 43#include <linux/ratelimit.h>
44#include <linux/btrfs.h>
44#include "compat.h" 45#include "compat.h"
45#include "delayed-inode.h" 46#include "delayed-inode.h"
46#include "ctree.h" 47#include "ctree.h"
47#include "disk-io.h" 48#include "disk-io.h"
48#include "transaction.h" 49#include "transaction.h"
49#include "btrfs_inode.h" 50#include "btrfs_inode.h"
50#include "ioctl.h"
51#include "print-tree.h" 51#include "print-tree.h"
52#include "xattr.h" 52#include "xattr.h"
53#include "volumes.h" 53#include "volumes.h"
@@ -63,8 +63,7 @@
63static const struct super_operations btrfs_super_ops; 63static const struct super_operations btrfs_super_ops;
64static struct file_system_type btrfs_fs_type; 64static struct file_system_type btrfs_fs_type;
65 65
66static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 66static const char *btrfs_decode_error(int errno, char nbuf[16])
67 char nbuf[16])
68{ 67{
69 char *errstr = NULL; 68 char *errstr = NULL;
70 69
@@ -98,7 +97,7 @@ static void __save_error_info(struct btrfs_fs_info *fs_info)
98 * today we only save the error info into ram. Long term we'll 97 * today we only save the error info into ram. Long term we'll
99 * also send it down to the disk 98 * also send it down to the disk
100 */ 99 */
101 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; 100 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
102} 101}
103 102
104static void save_error_info(struct btrfs_fs_info *fs_info) 103static void save_error_info(struct btrfs_fs_info *fs_info)
@@ -114,7 +113,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
114 if (sb->s_flags & MS_RDONLY) 113 if (sb->s_flags & MS_RDONLY)
115 return; 114 return;
116 115
117 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 116 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
118 sb->s_flags |= MS_RDONLY; 117 sb->s_flags |= MS_RDONLY;
119 printk(KERN_INFO "btrfs is forced readonly\n"); 118 printk(KERN_INFO "btrfs is forced readonly\n");
120 /* 119 /*
@@ -142,8 +141,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
142 struct super_block *sb = fs_info->sb; 141 struct super_block *sb = fs_info->sb;
143 char nbuf[16]; 142 char nbuf[16];
144 const char *errstr; 143 const char *errstr;
145 va_list args;
146 va_start(args, fmt);
147 144
148 /* 145 /*
149 * Special case: if the error is EROFS, and we're already 146 * Special case: if the error is EROFS, and we're already
@@ -152,15 +149,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
152 if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) 149 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
153 return; 150 return;
154 151
155 errstr = btrfs_decode_error(fs_info, errno, nbuf); 152 errstr = btrfs_decode_error(errno, nbuf);
156 if (fmt) { 153 if (fmt) {
157 struct va_format vaf = { 154 struct va_format vaf;
158 .fmt = fmt, 155 va_list args;
159 .va = &args, 156
160 }; 157 va_start(args, fmt);
158 vaf.fmt = fmt;
159 vaf.va = &args;
161 160
162 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", 161 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n",
163 sb->s_id, function, line, errstr, &vaf); 162 sb->s_id, function, line, errstr, &vaf);
163 va_end(args);
164 } else { 164 } else {
165 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", 165 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
166 sb->s_id, function, line, errstr); 166 sb->s_id, function, line, errstr);
@@ -171,7 +171,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
171 save_error_info(fs_info); 171 save_error_info(fs_info);
172 btrfs_handle_error(fs_info); 172 btrfs_handle_error(fs_info);
173 } 173 }
174 va_end(args);
175} 174}
176 175
177static const char * const logtypes[] = { 176static const char * const logtypes[] = {
@@ -261,7 +260,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
261 char nbuf[16]; 260 char nbuf[16];
262 const char *errstr; 261 const char *errstr;
263 262
264 errstr = btrfs_decode_error(root->fs_info, errno, nbuf); 263 errstr = btrfs_decode_error(errno, nbuf);
265 btrfs_printk(root->fs_info, 264 btrfs_printk(root->fs_info,
266 "%s:%d: Aborting unused transaction(%s).\n", 265 "%s:%d: Aborting unused transaction(%s).\n",
267 function, line, errstr); 266 function, line, errstr);
@@ -289,8 +288,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
289 va_start(args, fmt); 288 va_start(args, fmt);
290 vaf.va = &args; 289 vaf.va = &args;
291 290
292 errstr = btrfs_decode_error(fs_info, errno, nbuf); 291 errstr = btrfs_decode_error(errno, nbuf);
293 if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR) 292 if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
294 panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", 293 panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
295 s_id, function, line, &vaf, errstr); 294 s_id, function, line, &vaf, errstr);
296 295
@@ -438,6 +437,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
438 case Opt_compress_force: 437 case Opt_compress_force:
439 case Opt_compress_force_type: 438 case Opt_compress_force_type:
440 compress_force = true; 439 compress_force = true;
440 /* Fallthrough */
441 case Opt_compress: 441 case Opt_compress:
442 case Opt_compress_type: 442 case Opt_compress_type:
443 if (token == Opt_compress || 443 if (token == Opt_compress ||
@@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
519 case Opt_alloc_start: 519 case Opt_alloc_start:
520 num = match_strdup(&args[0]); 520 num = match_strdup(&args[0]);
521 if (num) { 521 if (num) {
522 mutex_lock(&info->chunk_mutex);
522 info->alloc_start = memparse(num, NULL); 523 info->alloc_start = memparse(num, NULL);
524 mutex_unlock(&info->chunk_mutex);
523 kfree(num); 525 kfree(num);
524 printk(KERN_INFO 526 printk(KERN_INFO
525 "btrfs: allocations start at %llu\n", 527 "btrfs: allocations start at %llu\n",
@@ -876,7 +878,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
876 878
877 btrfs_wait_ordered_extents(root, 0); 879 btrfs_wait_ordered_extents(root, 0);
878 880
879 trans = btrfs_attach_transaction(root); 881 trans = btrfs_attach_transaction_barrier(root);
880 if (IS_ERR(trans)) { 882 if (IS_ERR(trans)) {
881 /* no transaction, don't bother */ 883 /* no transaction, don't bother */
882 if (PTR_ERR(trans) == -ENOENT) 884 if (PTR_ERR(trans) == -ENOENT)
@@ -1200,6 +1202,38 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1200 new_pool_size); 1202 new_pool_size);
1201} 1203}
1202 1204
1205static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info,
1206 unsigned long old_opts, int flags)
1207{
1208 set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
1209
1210 if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
1211 (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
1212 (flags & MS_RDONLY))) {
1213 /* wait for any defraggers to finish */
1214 wait_event(fs_info->transaction_wait,
1215 (atomic_read(&fs_info->defrag_running) == 0));
1216 if (flags & MS_RDONLY)
1217 sync_filesystem(fs_info->sb);
1218 }
1219}
1220
1221static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
1222 unsigned long old_opts)
1223{
1224 /*
1225 * We need to clean up all defraggable inodes if autodefrag is
1226 * turned off or the fs is remounted R/O.
1227 */
1228 if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
1229 (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
1230 (fs_info->sb->s_flags & MS_RDONLY))) {
1231 btrfs_cleanup_defrag_inodes(fs_info);
1232 }
1233
1234 clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
1235}
1236
1203static int btrfs_remount(struct super_block *sb, int *flags, char *data) 1237static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1204{ 1238{
1205 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 1239 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -1213,6 +1247,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1213 unsigned int old_metadata_ratio = fs_info->metadata_ratio; 1247 unsigned int old_metadata_ratio = fs_info->metadata_ratio;
1214 int ret; 1248 int ret;
1215 1249
1250 btrfs_remount_prepare(fs_info, old_opts, *flags);
1251
1216 ret = btrfs_parse_options(root, data); 1252 ret = btrfs_parse_options(root, data);
1217 if (ret) { 1253 if (ret) {
1218 ret = -EINVAL; 1254 ret = -EINVAL;
@@ -1223,7 +1259,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1223 fs_info->thread_pool_size, old_thread_pool_size); 1259 fs_info->thread_pool_size, old_thread_pool_size);
1224 1260
1225 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 1261 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
1226 return 0; 1262 goto out;
1227 1263
1228 if (*flags & MS_RDONLY) { 1264 if (*flags & MS_RDONLY) {
1229 /* 1265 /*
@@ -1278,7 +1314,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1278 } 1314 }
1279 sb->s_flags &= ~MS_RDONLY; 1315 sb->s_flags &= ~MS_RDONLY;
1280 } 1316 }
1281 1317out:
1318 btrfs_remount_cleanup(fs_info, old_opts);
1282 return 0; 1319 return 0;
1283 1320
1284restore: 1321restore:
@@ -1289,10 +1326,13 @@ restore:
1289 fs_info->mount_opt = old_opts; 1326 fs_info->mount_opt = old_opts;
1290 fs_info->compress_type = old_compress_type; 1327 fs_info->compress_type = old_compress_type;
1291 fs_info->max_inline = old_max_inline; 1328 fs_info->max_inline = old_max_inline;
1329 mutex_lock(&fs_info->chunk_mutex);
1292 fs_info->alloc_start = old_alloc_start; 1330 fs_info->alloc_start = old_alloc_start;
1331 mutex_unlock(&fs_info->chunk_mutex);
1293 btrfs_resize_thread_pool(fs_info, 1332 btrfs_resize_thread_pool(fs_info,
1294 old_thread_pool_size, fs_info->thread_pool_size); 1333 old_thread_pool_size, fs_info->thread_pool_size);
1295 fs_info->metadata_ratio = old_metadata_ratio; 1334 fs_info->metadata_ratio = old_metadata_ratio;
1335 btrfs_remount_cleanup(fs_info, old_opts);
1296 return ret; 1336 return ret;
1297} 1337}
1298 1338
@@ -1559,7 +1599,7 @@ static int btrfs_freeze(struct super_block *sb)
1559 struct btrfs_trans_handle *trans; 1599 struct btrfs_trans_handle *trans;
1560 struct btrfs_root *root = btrfs_sb(sb)->tree_root; 1600 struct btrfs_root *root = btrfs_sb(sb)->tree_root;
1561 1601
1562 trans = btrfs_attach_transaction(root); 1602 trans = btrfs_attach_transaction_barrier(root);
1563 if (IS_ERR(trans)) { 1603 if (IS_ERR(trans)) {
1564 /* no transaction, don't bother */ 1604 /* no transaction, don't bother */
1565 if (PTR_ERR(trans) == -ENOENT) 1605 if (PTR_ERR(trans) == -ENOENT)
@@ -1684,10 +1724,14 @@ static int __init init_btrfs_fs(void)
1684 if (err) 1724 if (err)
1685 goto free_delayed_inode; 1725 goto free_delayed_inode;
1686 1726
1687 err = btrfs_interface_init(); 1727 err = btrfs_delayed_ref_init();
1688 if (err) 1728 if (err)
1689 goto free_auto_defrag; 1729 goto free_auto_defrag;
1690 1730
1731 err = btrfs_interface_init();
1732 if (err)
1733 goto free_delayed_ref;
1734
1691 err = register_filesystem(&btrfs_fs_type); 1735 err = register_filesystem(&btrfs_fs_type);
1692 if (err) 1736 if (err)
1693 goto unregister_ioctl; 1737 goto unregister_ioctl;
@@ -1699,6 +1743,8 @@ static int __init init_btrfs_fs(void)
1699 1743
1700unregister_ioctl: 1744unregister_ioctl:
1701 btrfs_interface_exit(); 1745 btrfs_interface_exit();
1746free_delayed_ref:
1747 btrfs_delayed_ref_exit();
1702free_auto_defrag: 1748free_auto_defrag:
1703 btrfs_auto_defrag_exit(); 1749 btrfs_auto_defrag_exit();
1704free_delayed_inode: 1750free_delayed_inode:
@@ -1720,6 +1766,7 @@ free_compress:
1720static void __exit exit_btrfs_fs(void) 1766static void __exit exit_btrfs_fs(void)
1721{ 1767{
1722 btrfs_destroy_cachep(); 1768 btrfs_destroy_cachep();
1769 btrfs_delayed_ref_exit();
1723 btrfs_auto_defrag_exit(); 1770 btrfs_auto_defrag_exit();
1724 btrfs_delayed_inode_exit(); 1771 btrfs_delayed_inode_exit();
1725 ordered_data_exit(); 1772 ordered_data_exit();
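A recurring change in this file (and in scrub.c and transaction.c) is replacing the plain fs_state assignment with set_bit()/test_bit() on bit numbers such as BTRFS_FS_STATE_ERROR and BTRFS_FS_STATE_REMOUNTING, so several independent state bits can share one word without clobbering each other. A stand-alone illustration of the idiom using C11 atomics in place of the kernel's bitops (the names here are placeholders, not the btrfs constants):

#include <stdatomic.h>
#include <stdio.h>

enum { STATE_ERROR, STATE_REMOUNTING };   /* bit numbers, like the new enum */

static atomic_ulong fs_state;             /* plays the role of fs_info->fs_state */

static void set_state_bit(int bit)   { atomic_fetch_or(&fs_state, 1UL << bit); }
static void clear_state_bit(int bit) { atomic_fetch_and(&fs_state, ~(1UL << bit)); }
static int  test_state_bit(int bit)  { return (atomic_load(&fs_state) >> bit) & 1; }

int main(void)
{
	set_state_bit(STATE_REMOUNTING);
	set_state_bit(STATE_ERROR);       /* does not clobber the remount bit */
	printf("error=%d remounting=%d\n",
	       test_state_bit(STATE_ERROR), test_state_bit(STATE_REMOUNTING));
	clear_state_bit(STATE_REMOUNTING);
	printf("error=%d remounting=%d\n",
	       test_state_bit(STATE_ERROR), test_state_bit(STATE_REMOUNTING));
	return 0;
}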
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index daac9ae6d731..5b326cd60a4a 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -21,7 +21,6 @@
21#include <linux/spinlock.h> 21#include <linux/spinlock.h>
22#include <linux/completion.h> 22#include <linux/completion.h>
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/module.h>
25#include <linux/kobject.h> 24#include <linux/kobject.h>
26 25
27#include "ctree.h" 26#include "ctree.h"
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4c0067c4f76d..e52da6fb1165 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -40,7 +40,6 @@ void put_transaction(struct btrfs_transaction *transaction)
40 if (atomic_dec_and_test(&transaction->use_count)) { 40 if (atomic_dec_and_test(&transaction->use_count)) {
41 BUG_ON(!list_empty(&transaction->list)); 41 BUG_ON(!list_empty(&transaction->list));
42 WARN_ON(transaction->delayed_refs.root.rb_node); 42 WARN_ON(transaction->delayed_refs.root.rb_node);
43 memset(transaction, 0, sizeof(*transaction));
44 kmem_cache_free(btrfs_transaction_cachep, transaction); 43 kmem_cache_free(btrfs_transaction_cachep, transaction);
45 } 44 }
46} 45}
@@ -51,6 +50,14 @@ static noinline void switch_commit_root(struct btrfs_root *root)
51 root->commit_root = btrfs_root_node(root); 50 root->commit_root = btrfs_root_node(root);
52} 51}
53 52
53static inline int can_join_transaction(struct btrfs_transaction *trans,
54 int type)
55{
56 return !(trans->in_commit &&
57 type != TRANS_JOIN &&
58 type != TRANS_JOIN_NOLOCK);
59}
60
54/* 61/*
55 * either allocate a new transaction or hop into the existing one 62 * either allocate a new transaction or hop into the existing one
56 */ 63 */
@@ -62,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type)
62 spin_lock(&fs_info->trans_lock); 69 spin_lock(&fs_info->trans_lock);
63loop: 70loop:
64 /* The file system has been taken offline. No new transactions. */ 71 /* The file system has been taken offline. No new transactions. */
65 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 72 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
66 spin_unlock(&fs_info->trans_lock); 73 spin_unlock(&fs_info->trans_lock);
67 return -EROFS; 74 return -EROFS;
68 } 75 }
@@ -86,6 +93,10 @@ loop:
86 spin_unlock(&fs_info->trans_lock); 93 spin_unlock(&fs_info->trans_lock);
87 return cur_trans->aborted; 94 return cur_trans->aborted;
88 } 95 }
96 if (!can_join_transaction(cur_trans, type)) {
97 spin_unlock(&fs_info->trans_lock);
98 return -EBUSY;
99 }
89 atomic_inc(&cur_trans->use_count); 100 atomic_inc(&cur_trans->use_count);
90 atomic_inc(&cur_trans->num_writers); 101 atomic_inc(&cur_trans->num_writers);
91 cur_trans->num_joined++; 102 cur_trans->num_joined++;
@@ -113,7 +124,7 @@ loop:
113 */ 124 */
114 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 125 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
115 goto loop; 126 goto loop;
116 } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 127 } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
117 spin_unlock(&fs_info->trans_lock); 128 spin_unlock(&fs_info->trans_lock);
118 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 129 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
119 return -EROFS; 130 return -EROFS;
@@ -155,8 +166,12 @@ loop:
155 166
156 spin_lock_init(&cur_trans->commit_lock); 167 spin_lock_init(&cur_trans->commit_lock);
157 spin_lock_init(&cur_trans->delayed_refs.lock); 168 spin_lock_init(&cur_trans->delayed_refs.lock);
169 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
170 atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
171 init_waitqueue_head(&cur_trans->delayed_refs.wait);
158 172
159 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 173 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
174 INIT_LIST_HEAD(&cur_trans->ordered_operations);
160 list_add_tail(&cur_trans->list, &fs_info->trans_list); 175 list_add_tail(&cur_trans->list, &fs_info->trans_list);
161 extent_io_tree_init(&cur_trans->dirty_pages, 176 extent_io_tree_init(&cur_trans->dirty_pages,
162 fs_info->btree_inode->i_mapping); 177 fs_info->btree_inode->i_mapping);
@@ -301,7 +316,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
301 int ret; 316 int ret;
302 u64 qgroup_reserved = 0; 317 u64 qgroup_reserved = 0;
303 318
304 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 319 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
305 return ERR_PTR(-EROFS); 320 return ERR_PTR(-EROFS);
306 321
307 if (current->journal_info) { 322 if (current->journal_info) {
@@ -359,8 +374,11 @@ again:
359 374
360 do { 375 do {
361 ret = join_transaction(root, type); 376 ret = join_transaction(root, type);
362 if (ret == -EBUSY) 377 if (ret == -EBUSY) {
363 wait_current_trans(root); 378 wait_current_trans(root);
379 if (unlikely(type == TRANS_ATTACH))
380 ret = -ENOENT;
381 }
364 } while (ret == -EBUSY); 382 } while (ret == -EBUSY);
365 383
366 if (ret < 0) { 384 if (ret < 0) {
@@ -382,9 +400,10 @@ again:
382 h->block_rsv = NULL; 400 h->block_rsv = NULL;
383 h->orig_rsv = NULL; 401 h->orig_rsv = NULL;
384 h->aborted = 0; 402 h->aborted = 0;
385 h->qgroup_reserved = qgroup_reserved; 403 h->qgroup_reserved = 0;
386 h->delayed_ref_elem.seq = 0; 404 h->delayed_ref_elem.seq = 0;
387 h->type = type; 405 h->type = type;
406 h->allocating_chunk = false;
388 INIT_LIST_HEAD(&h->qgroup_ref_list); 407 INIT_LIST_HEAD(&h->qgroup_ref_list);
389 INIT_LIST_HEAD(&h->new_bgs); 408 INIT_LIST_HEAD(&h->new_bgs);
390 409
@@ -400,6 +419,7 @@ again:
400 h->block_rsv = &root->fs_info->trans_block_rsv; 419 h->block_rsv = &root->fs_info->trans_block_rsv;
401 h->bytes_reserved = num_bytes; 420 h->bytes_reserved = num_bytes;
402 } 421 }
422 h->qgroup_reserved = qgroup_reserved;
403 423
404got_it: 424got_it:
405 btrfs_record_root_in_trans(h, root); 425 btrfs_record_root_in_trans(h, root);
@@ -451,11 +471,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
451 return start_transaction(root, 0, TRANS_USERSPACE, 0); 471 return start_transaction(root, 0, TRANS_USERSPACE, 0);
452} 472}
453 473
474/*
475 * btrfs_attach_transaction() - catch the running transaction
476 *
477 * It is used when we want to commit the current transaction, but
478 * don't want to start a new one.
479 *
480 * Note: if this function returns -ENOENT, it just means there is no
481 * running transaction. It is still possible that an inactive transaction
482 * is in memory but not yet fully on disk. If you need to be sure there is
483 * no such inactive transaction in the fs when -ENOENT is returned, you
484 * should invoke
485 * btrfs_attach_transaction_barrier()
486 */
454struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) 487struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
455{ 488{
456 return start_transaction(root, 0, TRANS_ATTACH, 0); 489 return start_transaction(root, 0, TRANS_ATTACH, 0);
457} 490}
458 491
492/*
493 * btrfs_attach_transaction_barrier() - catch the running transaction
494 *
495 * It is similar to the above function, the difference being that this one
496 * will also wait for any inactive (committing) transaction to fully
497 * complete before returning.
498 */
499struct btrfs_trans_handle *
500btrfs_attach_transaction_barrier(struct btrfs_root *root)
501{
502 struct btrfs_trans_handle *trans;
503
504 trans = start_transaction(root, 0, TRANS_ATTACH, 0);
505 if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
506 btrfs_wait_for_commit(root, 0);
507
508 return trans;
509}
510
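The barrier variant matters to callers like btrfs_sync_fs() and btrfs_freeze() in super.c above: they want either to commit the running transaction or to know the last commit is fully on disk. A sketch of the expected calling pattern, using only functions this patch touches (kernel context, so not a standalone program; the function name is invented):

static int example_sync(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;

	trans = btrfs_attach_transaction_barrier(root);
	if (IS_ERR(trans)) {
		/* -ENOENT: nothing running, and any committing transaction
		 * has already been waited for, so all changes are on disk */
		if (PTR_ERR(trans) == -ENOENT)
			return 0;
		return PTR_ERR(trans);
	}
	return btrfs_commit_transaction(trans, root);
}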
459/* wait for a transaction commit to be fully complete */ 511/* wait for a transaction commit to be fully complete */
460static noinline void wait_for_commit(struct btrfs_root *root, 512static noinline void wait_for_commit(struct btrfs_root *root,
461 struct btrfs_transaction *commit) 513 struct btrfs_transaction *commit)
@@ -587,7 +639,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
587 if (!list_empty(&trans->new_bgs)) 639 if (!list_empty(&trans->new_bgs))
588 btrfs_create_pending_block_groups(trans, root); 640 btrfs_create_pending_block_groups(trans, root);
589 641
590 while (count < 2) { 642 while (count < 1) {
591 unsigned long cur = trans->delayed_ref_updates; 643 unsigned long cur = trans->delayed_ref_updates;
592 trans->delayed_ref_updates = 0; 644 trans->delayed_ref_updates = 0;
593 if (cur && 645 if (cur &&
@@ -599,6 +651,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
599 } 651 }
600 count++; 652 count++;
601 } 653 }
654
602 btrfs_trans_release_metadata(trans, root); 655 btrfs_trans_release_metadata(trans, root);
603 trans->block_rsv = NULL; 656 trans->block_rsv = NULL;
604 657
@@ -644,12 +697,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
644 btrfs_run_delayed_iputs(root); 697 btrfs_run_delayed_iputs(root);
645 698
646 if (trans->aborted || 699 if (trans->aborted ||
647 root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 700 test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
648 err = -EIO; 701 err = -EIO;
649 }
650 assert_qgroups_uptodate(trans); 702 assert_qgroups_uptodate(trans);
651 703
652 memset(trans, 0, sizeof(*trans));
653 kmem_cache_free(btrfs_trans_handle_cachep, trans); 704 kmem_cache_free(btrfs_trans_handle_cachep, trans);
654 return err; 705 return err;
655} 706}
@@ -696,7 +747,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
696 struct extent_state *cached_state = NULL; 747 struct extent_state *cached_state = NULL;
697 u64 start = 0; 748 u64 start = 0;
698 u64 end; 749 u64 end;
750 struct blk_plug plug;
699 751
752 blk_start_plug(&plug);
700 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 753 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
701 mark, &cached_state)) { 754 mark, &cached_state)) {
702 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 755 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -710,6 +763,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
710 } 763 }
711 if (err) 764 if (err)
712 werr = err; 765 werr = err;
766 blk_finish_plug(&plug);
713 return werr; 767 return werr;
714} 768}
715 769
@@ -960,10 +1014,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
960} 1014}
961 1015
962/* 1016/*
963 * defrag a given btree. If cacheonly == 1, this won't read from the disk, 1017 * defrag a given btree.
964 * otherwise every leaf in the btree is read and defragged. 1018 * Every leaf in the btree is read and defragged.
965 */ 1019 */
966int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 1020int btrfs_defrag_root(struct btrfs_root *root)
967{ 1021{
968 struct btrfs_fs_info *info = root->fs_info; 1022 struct btrfs_fs_info *info = root->fs_info;
969 struct btrfs_trans_handle *trans; 1023 struct btrfs_trans_handle *trans;
@@ -977,7 +1031,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
977 if (IS_ERR(trans)) 1031 if (IS_ERR(trans))
978 return PTR_ERR(trans); 1032 return PTR_ERR(trans);
979 1033
980 ret = btrfs_defrag_leaves(trans, root, cacheonly); 1034 ret = btrfs_defrag_leaves(trans, root);
981 1035
982 btrfs_end_transaction(trans, root); 1036 btrfs_end_transaction(trans, root);
983 btrfs_btree_balance_dirty(info->tree_root); 1037 btrfs_btree_balance_dirty(info->tree_root);
@@ -985,6 +1039,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
985 1039
986 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) 1040 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
987 break; 1041 break;
1042
1043 if (btrfs_defrag_cancelled(root->fs_info)) {
1044 printk(KERN_DEBUG "btrfs: defrag_root cancelled\n");
1045 ret = -EAGAIN;
1046 break;
1047 }
988 } 1048 }
989 root->defrag_running = 0; 1049 root->defrag_running = 0;
990 return ret; 1050 return ret;
@@ -1007,7 +1067,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1007 struct inode *parent_inode; 1067 struct inode *parent_inode;
1008 struct btrfs_path *path; 1068 struct btrfs_path *path;
1009 struct btrfs_dir_item *dir_item; 1069 struct btrfs_dir_item *dir_item;
1010 struct dentry *parent;
1011 struct dentry *dentry; 1070 struct dentry *dentry;
1012 struct extent_buffer *tmp; 1071 struct extent_buffer *tmp;
1013 struct extent_buffer *old; 1072 struct extent_buffer *old;
@@ -1022,7 +1081,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1022 path = btrfs_alloc_path(); 1081 path = btrfs_alloc_path();
1023 if (!path) { 1082 if (!path) {
1024 ret = pending->error = -ENOMEM; 1083 ret = pending->error = -ENOMEM;
1025 goto path_alloc_fail; 1084 return ret;
1026 } 1085 }
1027 1086
1028 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 1087 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
@@ -1062,10 +1121,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1062 1121
1063 rsv = trans->block_rsv; 1122 rsv = trans->block_rsv;
1064 trans->block_rsv = &pending->block_rsv; 1123 trans->block_rsv = &pending->block_rsv;
1124 trans->bytes_reserved = trans->block_rsv->reserved;
1065 1125
1066 dentry = pending->dentry; 1126 dentry = pending->dentry;
1067 parent = dget_parent(dentry); 1127 parent_inode = pending->dir;
1068 parent_inode = parent->d_inode;
1069 parent_root = BTRFS_I(parent_inode)->root; 1128 parent_root = BTRFS_I(parent_inode)->root;
1070 record_root_in_trans(trans, parent_root); 1129 record_root_in_trans(trans, parent_root);
1071 1130
@@ -1213,14 +1272,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1213 if (ret) 1272 if (ret)
1214 btrfs_abort_transaction(trans, root, ret); 1273 btrfs_abort_transaction(trans, root, ret);
1215fail: 1274fail:
1216 dput(parent);
1217 trans->block_rsv = rsv; 1275 trans->block_rsv = rsv;
1276 trans->bytes_reserved = 0;
1218no_free_objectid: 1277no_free_objectid:
1219 kfree(new_root_item); 1278 kfree(new_root_item);
1220root_item_alloc_fail: 1279root_item_alloc_fail:
1221 btrfs_free_path(path); 1280 btrfs_free_path(path);
1222path_alloc_fail:
1223 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
1224 return ret; 1281 return ret;
1225} 1282}
1226 1283
@@ -1306,13 +1363,13 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1306struct btrfs_async_commit { 1363struct btrfs_async_commit {
1307 struct btrfs_trans_handle *newtrans; 1364 struct btrfs_trans_handle *newtrans;
1308 struct btrfs_root *root; 1365 struct btrfs_root *root;
1309 struct delayed_work work; 1366 struct work_struct work;
1310}; 1367};
1311 1368
1312static void do_async_commit(struct work_struct *work) 1369static void do_async_commit(struct work_struct *work)
1313{ 1370{
1314 struct btrfs_async_commit *ac = 1371 struct btrfs_async_commit *ac =
1315 container_of(work, struct btrfs_async_commit, work.work); 1372 container_of(work, struct btrfs_async_commit, work);
1316 1373
1317 /* 1374 /*
1318 * We've got freeze protection passed with the transaction. 1375 * We've got freeze protection passed with the transaction.
@@ -1340,7 +1397,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1340 if (!ac) 1397 if (!ac)
1341 return -ENOMEM; 1398 return -ENOMEM;
1342 1399
1343 INIT_DELAYED_WORK(&ac->work, do_async_commit); 1400 INIT_WORK(&ac->work, do_async_commit);
1344 ac->root = root; 1401 ac->root = root;
1345 ac->newtrans = btrfs_join_transaction(root); 1402 ac->newtrans = btrfs_join_transaction(root);
1346 if (IS_ERR(ac->newtrans)) { 1403 if (IS_ERR(ac->newtrans)) {
@@ -1364,7 +1421,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1364 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1421 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1365 1, _THIS_IP_); 1422 1, _THIS_IP_);
1366 1423
1367 schedule_delayed_work(&ac->work, 0); 1424 schedule_work(&ac->work);
1368 1425
1369 /* wait for transaction to start and unblock */ 1426 /* wait for transaction to start and unblock */
1370 if (wait_for_unblock) 1427 if (wait_for_unblock)
@@ -1384,6 +1441,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1384 struct btrfs_root *root, int err) 1441 struct btrfs_root *root, int err)
1385{ 1442{
1386 struct btrfs_transaction *cur_trans = trans->transaction; 1443 struct btrfs_transaction *cur_trans = trans->transaction;
1444 DEFINE_WAIT(wait);
1387 1445
1388 WARN_ON(trans->use_count > 1); 1446 WARN_ON(trans->use_count > 1);
1389 1447
@@ -1392,8 +1450,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1392 spin_lock(&root->fs_info->trans_lock); 1450 spin_lock(&root->fs_info->trans_lock);
1393 list_del_init(&cur_trans->list); 1451 list_del_init(&cur_trans->list);
1394 if (cur_trans == root->fs_info->running_transaction) { 1452 if (cur_trans == root->fs_info->running_transaction) {
1453 root->fs_info->trans_no_join = 1;
1454 spin_unlock(&root->fs_info->trans_lock);
1455 wait_event(cur_trans->writer_wait,
1456 atomic_read(&cur_trans->num_writers) == 1);
1457
1458 spin_lock(&root->fs_info->trans_lock);
1395 root->fs_info->running_transaction = NULL; 1459 root->fs_info->running_transaction = NULL;
1396 root->fs_info->trans_no_join = 0;
1397 } 1460 }
1398 spin_unlock(&root->fs_info->trans_lock); 1461 spin_unlock(&root->fs_info->trans_lock);
1399 1462
@@ -1427,7 +1490,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1427 } 1490 }
1428 1491
1429 if (flush_on_commit || snap_pending) { 1492 if (flush_on_commit || snap_pending) {
1430 btrfs_start_delalloc_inodes(root, 1); 1493 ret = btrfs_start_delalloc_inodes(root, 1);
1494 if (ret)
1495 return ret;
1431 btrfs_wait_ordered_extents(root, 1); 1496 btrfs_wait_ordered_extents(root, 1);
1432 } 1497 }
1433 1498
@@ -1449,9 +1514,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1449 * it here and know for sure that nothing new will be added 1514 * it here and know for sure that nothing new will be added
1450 * to the list 1515 * to the list
1451 */ 1516 */
1452 btrfs_run_ordered_operations(root, 1); 1517 ret = btrfs_run_ordered_operations(trans, root, 1);
1453 1518
1454 return 0; 1519 return ret;
1455} 1520}
1456 1521
1457/* 1522/*
@@ -1472,27 +1537,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1472 int should_grow = 0; 1537 int should_grow = 0;
1473 unsigned long now = get_seconds(); 1538 unsigned long now = get_seconds();
1474 1539
1475 ret = btrfs_run_ordered_operations(root, 0); 1540 ret = btrfs_run_ordered_operations(trans, root, 0);
1476 if (ret) { 1541 if (ret) {
1477 btrfs_abort_transaction(trans, root, ret); 1542 btrfs_abort_transaction(trans, root, ret);
1478 goto cleanup_transaction; 1543 btrfs_end_transaction(trans, root);
1544 return ret;
1479 } 1545 }
1480 1546
1481 /* Stop the commit early if ->aborted is set */ 1547 /* Stop the commit early if ->aborted is set */
1482 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { 1548 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1483 ret = cur_trans->aborted; 1549 ret = cur_trans->aborted;
1484 goto cleanup_transaction; 1550 btrfs_end_transaction(trans, root);
1551 return ret;
1485 } 1552 }
1486 1553
1487 /* make a pass through all the delayed refs we have so far 1554 /* make a pass through all the delayed refs we have so far
1488 * any runnings procs may add more while we are here 1555 * any runnings procs may add more while we are here
1489 */ 1556 */
1490 ret = btrfs_run_delayed_refs(trans, root, 0); 1557 ret = btrfs_run_delayed_refs(trans, root, 0);
1491 if (ret) 1558 if (ret) {
1492 goto cleanup_transaction; 1559 btrfs_end_transaction(trans, root);
1560 return ret;
1561 }
1493 1562
1494 btrfs_trans_release_metadata(trans, root); 1563 btrfs_trans_release_metadata(trans, root);
1495 trans->block_rsv = NULL; 1564 trans->block_rsv = NULL;
1565 if (trans->qgroup_reserved) {
1566 btrfs_qgroup_free(root, trans->qgroup_reserved);
1567 trans->qgroup_reserved = 0;
1568 }
1496 1569
1497 cur_trans = trans->transaction; 1570 cur_trans = trans->transaction;
1498 1571
@@ -1506,8 +1579,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1506 btrfs_create_pending_block_groups(trans, root); 1579 btrfs_create_pending_block_groups(trans, root);
1507 1580
1508 ret = btrfs_run_delayed_refs(trans, root, 0); 1581 ret = btrfs_run_delayed_refs(trans, root, 0);
1509 if (ret) 1582 if (ret) {
1510 goto cleanup_transaction; 1583 btrfs_end_transaction(trans, root);
1584 return ret;
1585 }
1511 1586
1512 spin_lock(&cur_trans->commit_lock); 1587 spin_lock(&cur_trans->commit_lock);
1513 if (cur_trans->in_commit) { 1588 if (cur_trans->in_commit) {
@@ -1771,6 +1846,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1771cleanup_transaction: 1846cleanup_transaction:
1772 btrfs_trans_release_metadata(trans, root); 1847 btrfs_trans_release_metadata(trans, root);
1773 trans->block_rsv = NULL; 1848 trans->block_rsv = NULL;
1849 if (trans->qgroup_reserved) {
1850 btrfs_qgroup_free(root, trans->qgroup_reserved);
1851 trans->qgroup_reserved = 0;
1852 }
1774 btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); 1853 btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
1775// WARN_ON(1); 1854// WARN_ON(1);
1776 if (current->journal_info == trans) 1855 if (current->journal_info == trans)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0e8aa1e6c287..3c8e0d25c8e4 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -43,6 +43,7 @@ struct btrfs_transaction {
43 wait_queue_head_t writer_wait; 43 wait_queue_head_t writer_wait;
44 wait_queue_head_t commit_wait; 44 wait_queue_head_t commit_wait;
45 struct list_head pending_snapshots; 45 struct list_head pending_snapshots;
46 struct list_head ordered_operations;
46 struct btrfs_delayed_ref_root delayed_refs; 47 struct btrfs_delayed_ref_root delayed_refs;
47 int aborted; 48 int aborted;
48}; 49};
@@ -68,6 +69,7 @@ struct btrfs_trans_handle {
68 struct btrfs_block_rsv *orig_rsv; 69 struct btrfs_block_rsv *orig_rsv;
69 short aborted; 70 short aborted;
70 short adding_csums; 71 short adding_csums;
72 bool allocating_chunk;
71 enum btrfs_trans_type type; 73 enum btrfs_trans_type type;
72 /* 74 /*
73 * this root is only needed to validate that the root passed to 75 * this root is only needed to validate that the root passed to
@@ -82,11 +84,13 @@ struct btrfs_trans_handle {
82 84
83struct btrfs_pending_snapshot { 85struct btrfs_pending_snapshot {
84 struct dentry *dentry; 86 struct dentry *dentry;
87 struct inode *dir;
85 struct btrfs_root *root; 88 struct btrfs_root *root;
86 struct btrfs_root *snap; 89 struct btrfs_root *snap;
87 struct btrfs_qgroup_inherit *inherit; 90 struct btrfs_qgroup_inherit *inherit;
88 /* block reservation for the operation */ 91 /* block reservation for the operation */
89 struct btrfs_block_rsv block_rsv; 92 struct btrfs_block_rsv block_rsv;
93 u64 qgroup_reserved;
90 /* extra metadata reservation for relocation */ 94 /* extra metadata reservation for relocation */
91 int error; 95 int error;
92 bool readonly; 96 bool readonly;
@@ -110,13 +114,15 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush(
110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 114struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 115struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
112struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); 116struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
117struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
118 struct btrfs_root *root);
113struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); 119struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
114int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); 120int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
115int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 121int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
116 struct btrfs_root *root); 122 struct btrfs_root *root);
117 123
118int btrfs_add_dead_root(struct btrfs_root *root); 124int btrfs_add_dead_root(struct btrfs_root *root);
119int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); 125int btrfs_defrag_root(struct btrfs_root *root);
120int btrfs_clean_old_snapshots(struct btrfs_root *root); 126int btrfs_clean_old_snapshots(struct btrfs_root *root);
121int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 127int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
122 struct btrfs_root *root); 128 struct btrfs_root *root);
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3b580ee8ab1d..94e05c1f118a 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -23,13 +23,14 @@
23#include "transaction.h" 23#include "transaction.h"
24#include "locking.h" 24#include "locking.h"
25 25
26/* defrag all the leaves in a given btree. If cache_only == 1, don't read 26/*
27 * things from disk, otherwise read all the leaves and try to get key order to 27 * Defrag all the leaves in a given btree.
28 * Read all the leaves and try to get key order to
28 * better reflect disk order 29 * better reflect disk order
29 */ 30 */
30 31
31int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, 32int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, int cache_only) 33 struct btrfs_root *root)
33{ 34{
34 struct btrfs_path *path = NULL; 35 struct btrfs_path *path = NULL;
35 struct btrfs_key key; 36 struct btrfs_key key;
@@ -41,9 +42,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
41 u64 last_ret = 0; 42 u64 last_ret = 0;
42 u64 min_trans = 0; 43 u64 min_trans = 0;
43 44
44 if (cache_only)
45 goto out;
46
47 if (root->fs_info->extent_root == root) { 45 if (root->fs_info->extent_root == root) {
48 /* 46 /*
49 * there's recursion here right now in the tree locking, 47 * there's recursion here right now in the tree locking,
@@ -86,11 +84,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
86 } 84 }
87 85
88 path->keep_locks = 1; 86 path->keep_locks = 1;
89 if (cache_only)
90 min_trans = root->defrag_trans_start;
91 87
92 ret = btrfs_search_forward(root, &key, NULL, path, 88 ret = btrfs_search_forward(root, &key, NULL, path, min_trans);
93 cache_only, min_trans);
94 if (ret < 0) 89 if (ret < 0)
95 goto out; 90 goto out;
96 if (ret > 0) { 91 if (ret > 0) {
@@ -109,11 +104,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
109 goto out; 104 goto out;
110 } 105 }
111 path->slots[1] = btrfs_header_nritems(path->nodes[1]); 106 path->slots[1] = btrfs_header_nritems(path->nodes[1]);
112 next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, 107 next_key_ret = btrfs_find_next_key(root, path, &key, 1,
113 min_trans); 108 min_trans);
114 ret = btrfs_realloc_node(trans, root, 109 ret = btrfs_realloc_node(trans, root,
115 path->nodes[1], 0, 110 path->nodes[1], 0,
116 cache_only, &last_ret, 111 &last_ret,
117 &root->defrag_progress); 112 &root->defrag_progress);
118 if (ret) { 113 if (ret) {
119 WARN_ON(ret == -EAGAIN); 114 WARN_ON(ret == -EAGAIN);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9027bb1e7466..c7ef569eb22a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -278,8 +278,7 @@ static int process_one_buffer(struct btrfs_root *log,
278 struct walk_control *wc, u64 gen) 278 struct walk_control *wc, u64 gen)
279{ 279{
280 if (wc->pin) 280 if (wc->pin)
281 btrfs_pin_extent_for_log_replay(wc->trans, 281 btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
282 log->fs_info->extent_root,
283 eb->start, eb->len); 282 eb->start, eb->len);
284 283
285 if (btrfs_buffer_uptodate(eb, gen, 0)) { 284 if (btrfs_buffer_uptodate(eb, gen, 0)) {
@@ -485,7 +484,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
485 struct btrfs_key *key) 484 struct btrfs_key *key)
486{ 485{
487 int found_type; 486 int found_type;
488 u64 mask = root->sectorsize - 1;
489 u64 extent_end; 487 u64 extent_end;
490 u64 start = key->offset; 488 u64 start = key->offset;
491 u64 saved_nbytes; 489 u64 saved_nbytes;
@@ -502,7 +500,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
502 extent_end = start + btrfs_file_extent_num_bytes(eb, item); 500 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
503 else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 501 else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
504 size = btrfs_file_extent_inline_len(eb, item); 502 size = btrfs_file_extent_inline_len(eb, item);
505 extent_end = (start + size + mask) & ~mask; 503 extent_end = ALIGN(start + size, root->sectorsize);
506 } else { 504 } else {
507 ret = 0; 505 ret = 0;
508 goto out; 506 goto out;
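
The hunk above swaps the open-coded mask arithmetic for ALIGN(). A standalone userspace sketch of the same round-up, with hypothetical extent values (not the kernel macro itself):

#include <stdint.h>
#include <stdio.h>

/* Round x up to the next multiple of a power-of-two alignment; this is the
 * computation ALIGN(start + size, root->sectorsize) performs in the hunk. */
static uint64_t align_up(uint64_t x, uint64_t alignment)
{
	uint64_t mask = alignment - 1;
	return (x + mask) & ~mask;
}

int main(void)
{
	/* hypothetical inline extent: starts at 8192, 100 bytes, 4K sectors */
	uint64_t start = 8192, size = 100, sectorsize = 4096;

	printf("extent_end = %llu\n",
	       (unsigned long long)align_up(start + size, sectorsize)); /* 12288 */
	return 0;
}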
@@ -2281,6 +2279,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2281 unsigned long log_transid = 0; 2279 unsigned long log_transid = 0;
2282 2280
2283 mutex_lock(&root->log_mutex); 2281 mutex_lock(&root->log_mutex);
2282 log_transid = root->log_transid;
2284 index1 = root->log_transid % 2; 2283 index1 = root->log_transid % 2;
2285 if (atomic_read(&root->log_commit[index1])) { 2284 if (atomic_read(&root->log_commit[index1])) {
2286 wait_log_commit(trans, root, root->log_transid); 2285 wait_log_commit(trans, root, root->log_transid);
@@ -2308,11 +2307,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2308 /* bail out if we need to do a full commit */ 2307 /* bail out if we need to do a full commit */
2309 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2308 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2310 ret = -EAGAIN; 2309 ret = -EAGAIN;
2310 btrfs_free_logged_extents(log, log_transid);
2311 mutex_unlock(&root->log_mutex); 2311 mutex_unlock(&root->log_mutex);
2312 goto out; 2312 goto out;
2313 } 2313 }
2314 2314
2315 log_transid = root->log_transid;
2316 if (log_transid % 2 == 0) 2315 if (log_transid % 2 == 0)
2317 mark = EXTENT_DIRTY; 2316 mark = EXTENT_DIRTY;
2318 else 2317 else
@@ -2324,6 +2323,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2324 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2323 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2325 if (ret) { 2324 if (ret) {
2326 btrfs_abort_transaction(trans, root, ret); 2325 btrfs_abort_transaction(trans, root, ret);
2326 btrfs_free_logged_extents(log, log_transid);
2327 mutex_unlock(&root->log_mutex); 2327 mutex_unlock(&root->log_mutex);
2328 goto out; 2328 goto out;
2329 } 2329 }
@@ -2363,6 +2363,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2363 } 2363 }
2364 root->fs_info->last_trans_log_full_commit = trans->transid; 2364 root->fs_info->last_trans_log_full_commit = trans->transid;
2365 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2365 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2366 btrfs_free_logged_extents(log, log_transid);
2366 mutex_unlock(&log_root_tree->log_mutex); 2367 mutex_unlock(&log_root_tree->log_mutex);
2367 ret = -EAGAIN; 2368 ret = -EAGAIN;
2368 goto out; 2369 goto out;
@@ -2373,6 +2374,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2373 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2374 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2374 wait_log_commit(trans, log_root_tree, 2375 wait_log_commit(trans, log_root_tree,
2375 log_root_tree->log_transid); 2376 log_root_tree->log_transid);
2377 btrfs_free_logged_extents(log, log_transid);
2376 mutex_unlock(&log_root_tree->log_mutex); 2378 mutex_unlock(&log_root_tree->log_mutex);
2377 ret = 0; 2379 ret = 0;
2378 goto out; 2380 goto out;
@@ -2392,6 +2394,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2392 */ 2394 */
2393 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2395 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2394 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2396 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2397 btrfs_free_logged_extents(log, log_transid);
2395 mutex_unlock(&log_root_tree->log_mutex); 2398 mutex_unlock(&log_root_tree->log_mutex);
2396 ret = -EAGAIN; 2399 ret = -EAGAIN;
2397 goto out_wake_log_root; 2400 goto out_wake_log_root;
@@ -2402,10 +2405,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2402 EXTENT_DIRTY | EXTENT_NEW); 2405 EXTENT_DIRTY | EXTENT_NEW);
2403 if (ret) { 2406 if (ret) {
2404 btrfs_abort_transaction(trans, root, ret); 2407 btrfs_abort_transaction(trans, root, ret);
2408 btrfs_free_logged_extents(log, log_transid);
2405 mutex_unlock(&log_root_tree->log_mutex); 2409 mutex_unlock(&log_root_tree->log_mutex);
2406 goto out_wake_log_root; 2410 goto out_wake_log_root;
2407 } 2411 }
2408 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2412 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2413 btrfs_wait_logged_extents(log, log_transid);
2409 2414
2410 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2415 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2411 log_root_tree->node->start); 2416 log_root_tree->node->start);
@@ -2461,8 +2466,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
2461 .process_func = process_one_buffer 2466 .process_func = process_one_buffer
2462 }; 2467 };
2463 2468
2464 ret = walk_log_tree(trans, log, &wc); 2469 if (trans) {
2465 BUG_ON(ret); 2470 ret = walk_log_tree(trans, log, &wc);
2471 BUG_ON(ret);
2472 }
2466 2473
2467 while (1) { 2474 while (1) {
2468 ret = find_first_extent_bit(&log->dirty_log_pages, 2475 ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -2475,6 +2482,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
2475 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2482 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2476 } 2483 }
2477 2484
2485 /*
2486 * We may have short-circuited the log tree with the full commit logic
2487 * and left ordered extents on our list, so clear these out to keep us
2488 * from leaking inodes and memory.
2489 */
2490 btrfs_free_logged_extents(log, 0);
2491 btrfs_free_logged_extents(log, 1);
2492
2478 free_extent_buffer(log->node); 2493 free_extent_buffer(log->node);
2479 kfree(log); 2494 kfree(log);
2480} 2495}
@@ -2724,7 +2739,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2724 path->keep_locks = 1; 2739 path->keep_locks = 1;
2725 2740
2726 ret = btrfs_search_forward(root, &min_key, &max_key, 2741 ret = btrfs_search_forward(root, &min_key, &max_key,
2727 path, 0, trans->transid); 2742 path, trans->transid);
2728 2743
2729 /* 2744 /*
2730 * we didn't find anything from this transaction, see if there 2745 * we didn't find anything from this transaction, see if there
@@ -3271,16 +3286,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3271 struct btrfs_root *log = root->log_root; 3286 struct btrfs_root *log = root->log_root;
3272 struct btrfs_file_extent_item *fi; 3287 struct btrfs_file_extent_item *fi;
3273 struct extent_buffer *leaf; 3288 struct extent_buffer *leaf;
3289 struct btrfs_ordered_extent *ordered;
3274 struct list_head ordered_sums; 3290 struct list_head ordered_sums;
3275 struct btrfs_map_token token; 3291 struct btrfs_map_token token;
3276 struct btrfs_key key; 3292 struct btrfs_key key;
3277 u64 csum_offset = em->mod_start - em->start; 3293 u64 mod_start = em->mod_start;
3278 u64 csum_len = em->mod_len; 3294 u64 mod_len = em->mod_len;
3295 u64 csum_offset;
3296 u64 csum_len;
3279 u64 extent_offset = em->start - em->orig_start; 3297 u64 extent_offset = em->start - em->orig_start;
3280 u64 block_len; 3298 u64 block_len;
3281 int ret; 3299 int ret;
3300 int index = log->log_transid % 2;
3282 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3301 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3283 3302
3303insert:
3284 INIT_LIST_HEAD(&ordered_sums); 3304 INIT_LIST_HEAD(&ordered_sums);
3285 btrfs_init_map_token(&token); 3305 btrfs_init_map_token(&token);
3286 key.objectid = btrfs_ino(inode); 3306 key.objectid = btrfs_ino(inode);
@@ -3296,6 +3316,23 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3296 leaf = path->nodes[0]; 3316 leaf = path->nodes[0];
3297 fi = btrfs_item_ptr(leaf, path->slots[0], 3317 fi = btrfs_item_ptr(leaf, path->slots[0],
3298 struct btrfs_file_extent_item); 3318 struct btrfs_file_extent_item);
3319
3320 /*
3321 * If we are overwriting an inline extent with a real one then we need
3322 * to just delete the inline extent as it may not be large enough to
3323 * have the entire file_extent_item.
3324 */
3325 if (ret && btrfs_token_file_extent_type(leaf, fi, &token) ==
3326 BTRFS_FILE_EXTENT_INLINE) {
3327 ret = btrfs_del_item(trans, log, path);
3328 btrfs_release_path(path);
3329 if (ret) {
3330 path->really_keep_locks = 0;
3331 return ret;
3332 }
3333 goto insert;
3334 }
3335
3299 btrfs_set_token_file_extent_generation(leaf, fi, em->generation, 3336 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3300 &token); 3337 &token);
3301 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3338 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
@@ -3362,6 +3399,92 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3362 csum_len = block_len; 3399 csum_len = block_len;
3363 } 3400 }
3364 3401
3402 /*
3403 * First check and see if our csums are on our outstanding ordered
3404 * extents.
3405 */
3406again:
3407 spin_lock_irq(&log->log_extents_lock[index]);
3408 list_for_each_entry(ordered, &log->logged_list[index], log_list) {
3409 struct btrfs_ordered_sum *sum;
3410
3411 if (!mod_len)
3412 break;
3413
3414 if (ordered->inode != inode)
3415 continue;
3416
3417 if (ordered->file_offset + ordered->len <= mod_start ||
3418 mod_start + mod_len <= ordered->file_offset)
3419 continue;
3420
3421 /*
3422 * We are going to copy all the csums on this ordered extent, so
3423 * go ahead and adjust mod_start and mod_len in case this
3424 * ordered extent has already been logged.
3425 */
3426 if (ordered->file_offset > mod_start) {
3427 if (ordered->file_offset + ordered->len >=
3428 mod_start + mod_len)
3429 mod_len = ordered->file_offset - mod_start;
3430 /*
3431 * If we have this case
3432 *
3433 * |--------- logged extent ---------|
3434 * |----- ordered extent ----|
3435 *
3436 * Just don't mess with mod_start and mod_len, we'll
3437 * just end up logging more csums than we need and it
3438 * will be ok.
3439 */
3440 } else {
3441 if (ordered->file_offset + ordered->len <
3442 mod_start + mod_len) {
3443 mod_len = (mod_start + mod_len) -
3444 (ordered->file_offset + ordered->len);
3445 mod_start = ordered->file_offset +
3446 ordered->len;
3447 } else {
3448 mod_len = 0;
3449 }
3450 }
3451
3452 /*
3453 * To keep us from looping for the above case of an ordered
3454 * extent that falls inside of the logged extent.
3455 */
3456 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
3457 &ordered->flags))
3458 continue;
3459 atomic_inc(&ordered->refs);
3460 spin_unlock_irq(&log->log_extents_lock[index]);
3461 /*
3462 * we've dropped the lock, we must either break or
3463 * start over after this.
3464 */
3465
3466 wait_event(ordered->wait, ordered->csum_bytes_left == 0);
3467
3468 list_for_each_entry(sum, &ordered->list, list) {
3469 ret = btrfs_csum_file_blocks(trans, log, sum);
3470 if (ret) {
3471 btrfs_put_ordered_extent(ordered);
3472 goto unlocked;
3473 }
3474 }
3475 btrfs_put_ordered_extent(ordered);
3476 goto again;
3477
3478 }
3479 spin_unlock_irq(&log->log_extents_lock[index]);
3480unlocked:
3481
3482 if (!mod_len || ret)
3483 return ret;
3484
3485 csum_offset = mod_start - em->start;
3486 csum_len = mod_len;
3487
3365 /* block start is already adjusted for the file extent offset. */ 3488 /* block start is already adjusted for the file extent offset. */
3366 ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 3489 ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3367 em->block_start + csum_offset, 3490 em->block_start + csum_offset,
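
The loop added above walks the ordered extents still pending for this log and shrinks the logged range so csums are only looked up for the part no ordered extent already covers. A minimal userspace sketch of just that interval trimming, with hypothetical offsets:

#include <stdint.h>
#include <stdio.h>

/* Trim the logged range [*mod_start, *mod_start + *mod_len) against an
 * ordered extent [file_offset, file_offset + len); mirrors the adjustment
 * made in log_one_extent() before the csum lookup. */
static void trim_against_ordered(uint64_t file_offset, uint64_t len,
				 uint64_t *mod_start, uint64_t *mod_len)
{
	if (file_offset + len <= *mod_start ||
	    *mod_start + *mod_len <= file_offset)
		return;					/* no overlap */

	if (file_offset > *mod_start) {
		if (file_offset + len >= *mod_start + *mod_len)
			*mod_len = file_offset - *mod_start;
		/* ordered extent fully inside the logged range: left as-is */
	} else if (file_offset + len < *mod_start + *mod_len) {
		*mod_len = (*mod_start + *mod_len) - (file_offset + len);
		*mod_start = file_offset + len;
	} else {
		*mod_len = 0;				/* fully covered */
	}
}

int main(void)
{
	uint64_t mod_start = 0, mod_len = 16384;	/* hypothetical extent */

	trim_against_ordered(8192, 8192, &mod_start, &mod_len);
	printf("%llu +%llu\n", (unsigned long long)mod_start,
	       (unsigned long long)mod_len);		/* 0 +8192 */
	return 0;
}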
@@ -3393,6 +3516,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3393 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3516 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3394 u64 test_gen; 3517 u64 test_gen;
3395 int ret = 0; 3518 int ret = 0;
3519 int num = 0;
3396 3520
3397 INIT_LIST_HEAD(&extents); 3521 INIT_LIST_HEAD(&extents);
3398 3522
@@ -3401,16 +3525,31 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3401 3525
3402 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 3526 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
3403 list_del_init(&em->list); 3527 list_del_init(&em->list);
3528
3529 /*
3530 * Just an arbitrary number, this can be really CPU intensive
3531 * once we start getting a lot of extents, and really once we
3532 * have a bunch of extents we just want to commit since it will
3533 * be faster.
3534 */
3535 if (++num > 32768) {
3536 list_del_init(&tree->modified_extents);
3537 ret = -EFBIG;
3538 goto process;
3539 }
3540
3404 if (em->generation <= test_gen) 3541 if (em->generation <= test_gen)
3405 continue; 3542 continue;
3406 /* Need a ref to keep it from getting evicted from cache */ 3543 /* Need a ref to keep it from getting evicted from cache */
3407 atomic_inc(&em->refs); 3544 atomic_inc(&em->refs);
3408 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 3545 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
3409 list_add_tail(&em->list, &extents); 3546 list_add_tail(&em->list, &extents);
3547 num++;
3410 } 3548 }
3411 3549
3412 list_sort(NULL, &extents, extent_cmp); 3550 list_sort(NULL, &extents, extent_cmp);
3413 3551
3552process:
3414 while (!list_empty(&extents)) { 3553 while (!list_empty(&extents)) {
3415 em = list_entry(extents.next, struct extent_map, list); 3554 em = list_entry(extents.next, struct extent_map, list);
3416 3555
@@ -3513,6 +3652,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3513 3652
3514 mutex_lock(&BTRFS_I(inode)->log_mutex); 3653 mutex_lock(&BTRFS_I(inode)->log_mutex);
3515 3654
3655 btrfs_get_logged_extents(log, inode);
3656
3516 /* 3657 /*
3517 * a brute force approach to making sure we get the most uptodate 3658 * a brute force approach to making sure we get the most uptodate
3518 * copies of everything. 3659 * copies of everything.
@@ -3558,7 +3699,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3558 while (1) { 3699 while (1) {
3559 ins_nr = 0; 3700 ins_nr = 0;
3560 ret = btrfs_search_forward(root, &min_key, &max_key, 3701 ret = btrfs_search_forward(root, &min_key, &max_key,
3561 path, 0, trans->transid); 3702 path, trans->transid);
3562 if (ret != 0) 3703 if (ret != 0)
3563 break; 3704 break;
3564again: 3705again:
@@ -3656,6 +3797,8 @@ log_extents:
3656 BTRFS_I(inode)->logged_trans = trans->transid; 3797 BTRFS_I(inode)->logged_trans = trans->transid;
3657 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 3798 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
3658out_unlock: 3799out_unlock:
3800 if (err)
3801 btrfs_free_logged_extents(log, log->log_transid);
3659 mutex_unlock(&BTRFS_I(inode)->log_mutex); 3802 mutex_unlock(&BTRFS_I(inode)->log_mutex);
3660 3803
3661 btrfs_free_path(path); 3804 btrfs_free_path(path);
@@ -3822,7 +3965,6 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3822end_trans: 3965end_trans:
3823 dput(old_parent); 3966 dput(old_parent);
3824 if (ret < 0) { 3967 if (ret < 0) {
3825 WARN_ON(ret != -ENOSPC);
3826 root->fs_info->last_trans_log_full_commit = trans->transid; 3968 root->fs_info->last_trans_log_full_commit = trans->transid;
3827 ret = 1; 3969 ret = 1;
3828 } 3970 }
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 99be4c138db6..ddc61cad0080 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -5,7 +5,7 @@
5 */ 5 */
6 6
7#include <linux/slab.h> 7#include <linux/slab.h>
8#include <linux/module.h> 8#include <linux/export.h>
9#include "ulist.h" 9#include "ulist.h"
10 10
11/* 11/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5cbb7f4b1672..35bb2d4ed29f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,6 +25,8 @@
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/raid/pq.h>
29#include <asm/div64.h>
28#include "compat.h" 30#include "compat.h"
29#include "ctree.h" 31#include "ctree.h"
30#include "extent_map.h" 32#include "extent_map.h"
@@ -32,6 +34,7 @@
32#include "transaction.h" 34#include "transaction.h"
33#include "print-tree.h" 35#include "print-tree.h"
34#include "volumes.h" 36#include "volumes.h"
37#include "raid56.h"
35#include "async-thread.h" 38#include "async-thread.h"
36#include "check-integrity.h" 39#include "check-integrity.h"
37#include "rcu-string.h" 40#include "rcu-string.h"
@@ -647,6 +650,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
647 new_device->writeable = 0; 650 new_device->writeable = 0;
648 new_device->in_fs_metadata = 0; 651 new_device->in_fs_metadata = 0;
649 new_device->can_discard = 0; 652 new_device->can_discard = 0;
653 spin_lock_init(&new_device->io_lock);
650 list_replace_rcu(&device->dev_list, &new_device->dev_list); 654 list_replace_rcu(&device->dev_list, &new_device->dev_list);
651 655
652 call_rcu(&device->rcu, free_device); 656 call_rcu(&device->rcu, free_device);
@@ -792,26 +796,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
792 return ret; 796 return ret;
793} 797}
794 798
799/*
800 * Look for a btrfs signature on a device. This may be called out of the mount path
801 * and we are not allowed to call set_blocksize during the scan. The superblock
802 * is read via pagecache
803 */
795int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 804int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
796 struct btrfs_fs_devices **fs_devices_ret) 805 struct btrfs_fs_devices **fs_devices_ret)
797{ 806{
798 struct btrfs_super_block *disk_super; 807 struct btrfs_super_block *disk_super;
799 struct block_device *bdev; 808 struct block_device *bdev;
800 struct buffer_head *bh; 809 struct page *page;
801 int ret; 810 void *p;
811 int ret = -EINVAL;
802 u64 devid; 812 u64 devid;
803 u64 transid; 813 u64 transid;
804 u64 total_devices; 814 u64 total_devices;
815 u64 bytenr;
816 pgoff_t index;
805 817
818 /*
819 * we would like to check all the supers, but that would make
820 * a btrfs mount succeed after a mkfs from a different FS.
821 * So, we need to add a special mount option to scan for
822 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
823 */
824 bytenr = btrfs_sb_offset(0);
806 flags |= FMODE_EXCL; 825 flags |= FMODE_EXCL;
807 mutex_lock(&uuid_mutex); 826 mutex_lock(&uuid_mutex);
808 ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); 827
809 if (ret) 828 bdev = blkdev_get_by_path(path, flags, holder);
829
830 if (IS_ERR(bdev)) {
831 ret = PTR_ERR(bdev);
810 goto error; 832 goto error;
811 disk_super = (struct btrfs_super_block *)bh->b_data; 833 }
834
835 /* make sure our super fits in the device */
836 if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
837 goto error_bdev_put;
838
839 /* make sure our super fits in the page */
840 if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
841 goto error_bdev_put;
842
843 /* make sure our super doesn't straddle pages on disk */
844 index = bytenr >> PAGE_CACHE_SHIFT;
845 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
846 goto error_bdev_put;
847
848 /* pull in the page with our super */
849 page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
850 index, GFP_NOFS);
851
852 if (IS_ERR_OR_NULL(page))
853 goto error_bdev_put;
854
855 p = kmap(page);
856
857 /* align our pointer to the offset of the super block */
858 disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
859
860 if (btrfs_super_bytenr(disk_super) != bytenr ||
861 disk_super->magic != cpu_to_le64(BTRFS_MAGIC))
862 goto error_unmap;
863
812 devid = btrfs_stack_device_id(&disk_super->dev_item); 864 devid = btrfs_stack_device_id(&disk_super->dev_item);
813 transid = btrfs_super_generation(disk_super); 865 transid = btrfs_super_generation(disk_super);
814 total_devices = btrfs_super_num_devices(disk_super); 866 total_devices = btrfs_super_num_devices(disk_super);
867
815 if (disk_super->label[0]) { 868 if (disk_super->label[0]) {
816 if (disk_super->label[BTRFS_LABEL_SIZE - 1]) 869 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
817 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; 870 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
@@ -819,12 +872,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
819 } else { 872 } else {
820 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 873 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
821 } 874 }
875
822 printk(KERN_CONT "devid %llu transid %llu %s\n", 876 printk(KERN_CONT "devid %llu transid %llu %s\n",
823 (unsigned long long)devid, (unsigned long long)transid, path); 877 (unsigned long long)devid, (unsigned long long)transid, path);
878
824 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 879 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
825 if (!ret && fs_devices_ret) 880 if (!ret && fs_devices_ret)
826 (*fs_devices_ret)->total_devices = total_devices; 881 (*fs_devices_ret)->total_devices = total_devices;
827 brelse(bh); 882
883error_unmap:
884 kunmap(page);
885 page_cache_release(page);
886
887error_bdev_put:
828 blkdev_put(bdev, flags); 888 blkdev_put(bdev, flags);
829error: 889error:
830 mutex_unlock(&uuid_mutex); 890 mutex_unlock(&uuid_mutex);
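
btrfs_scan_one_device() now reads the primary super through the page cache instead of a buffer head, so it first proves the super fits in the device, fits in one page, and does not straddle a page boundary. A standalone sketch of those three checks; the 64K super offset and 4K super size used here are assumptions for illustration:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_	4096ULL
#define SUPER_OFFSET	(64ULL * 1024)	/* assumed: btrfs_sb_offset(0) */
#define SUPER_SIZE	4096ULL		/* assumed super block size */

/* Return 1 if a super block at 'bytenr' can be read through a single
 * page-cache page on a device of 'dev_size' bytes. */
static int super_readable(uint64_t bytenr, uint64_t dev_size)
{
	uint64_t index = bytenr / PAGE_SIZE_;

	if (bytenr + PAGE_SIZE_ >= dev_size)		/* fits in the device */
		return 0;
	if (SUPER_SIZE > PAGE_SIZE_)			/* fits in one page */
		return 0;
	if ((bytenr + SUPER_SIZE - 1) / PAGE_SIZE_ != index)
		return 0;				/* no page straddle */
	return 1;
}

int main(void)
{
	printf("%d\n", super_readable(SUPER_OFFSET, 1ULL << 30));	/* 1 */
	printf("%d\n", super_readable(SUPER_OFFSET, 64 * 1024));	/* 0 */
	return 0;
}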
@@ -1372,14 +1432,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1372 u64 devid; 1432 u64 devid;
1373 u64 num_devices; 1433 u64 num_devices;
1374 u8 *dev_uuid; 1434 u8 *dev_uuid;
1435 unsigned seq;
1375 int ret = 0; 1436 int ret = 0;
1376 bool clear_super = false; 1437 bool clear_super = false;
1377 1438
1378 mutex_lock(&uuid_mutex); 1439 mutex_lock(&uuid_mutex);
1379 1440
1380 all_avail = root->fs_info->avail_data_alloc_bits | 1441 do {
1381 root->fs_info->avail_system_alloc_bits | 1442 seq = read_seqbegin(&root->fs_info->profiles_lock);
1382 root->fs_info->avail_metadata_alloc_bits; 1443
1444 all_avail = root->fs_info->avail_data_alloc_bits |
1445 root->fs_info->avail_system_alloc_bits |
1446 root->fs_info->avail_metadata_alloc_bits;
1447 } while (read_seqretry(&root->fs_info->profiles_lock, seq));
1383 1448
1384 num_devices = root->fs_info->fs_devices->num_devices; 1449 num_devices = root->fs_info->fs_devices->num_devices;
1385 btrfs_dev_replace_lock(&root->fs_info->dev_replace); 1450 btrfs_dev_replace_lock(&root->fs_info->dev_replace);
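
Both this function and btrfs_balance() below now sample the avail_*_alloc_bits fields inside a read_seqbegin()/read_seqretry() loop on fs_info->profiles_lock, retrying if a writer raced with the read. A single-threaded userspace imitation of that reader pattern (a hand-rolled counter stands in for seqlock_t; the real seqlock also issues memory barriers):

#include <stdint.h>
#include <stdio.h>

/* Readers retry whenever the sequence count was odd (writer active) or
 * changed while they were reading. */
struct seq_sample { unsigned seq; uint64_t data, sys, meta; };

static unsigned read_seqbegin_(const struct seq_sample *s)
{
	return s->seq & ~1u;
}

static int read_seqretry_(const struct seq_sample *s, unsigned start)
{
	return (s->seq & 1) || s->seq != start;
}

int main(void)
{
	struct seq_sample s = { .seq = 2, .data = 1, .sys = 2, .meta = 4 };
	uint64_t all_avail;
	unsigned start;

	do {
		start = read_seqbegin_(&s);
		all_avail = s.data | s.sys | s.meta;	/* the protected read */
	} while (read_seqretry_(&s, start));

	printf("all_avail = 0x%llx\n", (unsigned long long)all_avail);
	return 0;
}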
@@ -1403,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1403 goto out; 1468 goto out;
1404 } 1469 }
1405 1470
1471 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1472 root->fs_info->fs_devices->rw_devices <= 2) {
1473 printk(KERN_ERR "btrfs: unable to go below two "
1474 "devices on raid5\n");
1475 ret = -EINVAL;
1476 goto out;
1477 }
1478 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1479 root->fs_info->fs_devices->rw_devices <= 3) {
1480 printk(KERN_ERR "btrfs: unable to go below three "
1481 "devices on raid6\n");
1482 ret = -EINVAL;
1483 goto out;
1484 }
1485
1406 if (strcmp(device_path, "missing") == 0) { 1486 if (strcmp(device_path, "missing") == 0) {
1407 struct list_head *devices; 1487 struct list_head *devices;
1408 struct btrfs_device *tmp; 1488 struct btrfs_device *tmp;
@@ -2616,7 +2696,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2616 chunk_used = btrfs_block_group_used(&cache->item); 2696 chunk_used = btrfs_block_group_used(&cache->item);
2617 2697
2618 if (bargs->usage == 0) 2698 if (bargs->usage == 0)
2619 user_thresh = 0; 2699 user_thresh = 1;
2620 else if (bargs->usage > 100) 2700 else if (bargs->usage > 100)
2621 user_thresh = cache->key.offset; 2701 user_thresh = cache->key.offset;
2622 else 2702 else
@@ -2664,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
2664 return 0; 2744 return 0;
2665 2745
2666 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 2746 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2667 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) 2747 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
2668 factor = 2; 2748 factor = num_stripes / 2;
2669 else 2749 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
2670 factor = 1; 2750 factor = num_stripes - 1;
2671 factor = num_stripes / factor; 2751 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
2752 factor = num_stripes - 2;
2753 } else {
2754 factor = num_stripes;
2755 }
2672 2756
2673 for (i = 0; i < num_stripes; i++) { 2757 for (i = 0; i < num_stripes; i++) {
2674 stripe = btrfs_stripe_nr(chunk, i); 2758 stripe = btrfs_stripe_nr(chunk, i);
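
chunk_drange_filter() now derives the per-device stripe length by dividing the chunk's stripe count by a profile-dependent factor; for RAID5/6 that factor is the stripe count minus the parity stripes rather than a simple halving. A small sketch of the mapping with hypothetical stripe counts:

#include <stdio.h>

enum profile { SINGLE, DUP, RAID0, RAID1, RAID10, RAID5, RAID6 };

/* Number of stripes carrying distinct data in one chunk, matching the
 * factor computed in the hunk above. */
static int data_stripe_factor(enum profile p, int num_stripes)
{
	switch (p) {
	case DUP:
	case RAID1:
	case RAID10:	return num_stripes / 2;
	case RAID5:	return num_stripes - 1;
	case RAID6:	return num_stripes - 2;
	default:	return num_stripes;
	}
}

int main(void)
{
	printf("raid10/4 -> %d\n", data_stripe_factor(RAID10, 4));	/* 2 */
	printf("raid6/6  -> %d\n", data_stripe_factor(RAID6, 6));	/* 4 */
	return 0;
}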
@@ -2985,6 +3069,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2985 int mixed = 0; 3069 int mixed = 0;
2986 int ret; 3070 int ret;
2987 u64 num_devices; 3071 u64 num_devices;
3072 unsigned seq;
2988 3073
2989 if (btrfs_fs_closing(fs_info) || 3074 if (btrfs_fs_closing(fs_info) ||
2990 atomic_read(&fs_info->balance_pause_req) || 3075 atomic_read(&fs_info->balance_pause_req) ||
@@ -3027,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3027 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3112 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3028 else 3113 else
3029 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3114 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
3030 BTRFS_BLOCK_GROUP_RAID10); 3115 BTRFS_BLOCK_GROUP_RAID10 |
3116 BTRFS_BLOCK_GROUP_RAID5 |
3117 BTRFS_BLOCK_GROUP_RAID6);
3031 3118
3032 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3119 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3033 (!alloc_profile_is_valid(bctl->data.target, 1) || 3120 (!alloc_profile_is_valid(bctl->data.target, 1) ||
@@ -3067,23 +3154,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3067 3154
3068 /* allow to reduce meta or sys integrity only if force set */ 3155 /* allow to reduce meta or sys integrity only if force set */
3069 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3156 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3070 BTRFS_BLOCK_GROUP_RAID10; 3157 BTRFS_BLOCK_GROUP_RAID10 |
3071 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3158 BTRFS_BLOCK_GROUP_RAID5 |
3072 (fs_info->avail_system_alloc_bits & allowed) && 3159 BTRFS_BLOCK_GROUP_RAID6;
3073 !(bctl->sys.target & allowed)) || 3160 do {
3074 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3161 seq = read_seqbegin(&fs_info->profiles_lock);
3075 (fs_info->avail_metadata_alloc_bits & allowed) && 3162
3076 !(bctl->meta.target & allowed))) { 3163 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3077 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3164 (fs_info->avail_system_alloc_bits & allowed) &&
3078 printk(KERN_INFO "btrfs: force reducing metadata " 3165 !(bctl->sys.target & allowed)) ||
3079 "integrity\n"); 3166 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3080 } else { 3167 (fs_info->avail_metadata_alloc_bits & allowed) &&
3081 printk(KERN_ERR "btrfs: balance will reduce metadata " 3168 !(bctl->meta.target & allowed))) {
3082 "integrity, use force if you want this\n"); 3169 if (bctl->flags & BTRFS_BALANCE_FORCE) {
3083 ret = -EINVAL; 3170 printk(KERN_INFO "btrfs: force reducing metadata "
3084 goto out; 3171 "integrity\n");
3172 } else {
3173 printk(KERN_ERR "btrfs: balance will reduce metadata "
3174 "integrity, use force if you want this\n");
3175 ret = -EINVAL;
3176 goto out;
3177 }
3085 } 3178 }
3086 } 3179 } while (read_seqretry(&fs_info->profiles_lock, seq));
3087 3180
3088 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3181 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3089 int num_tolerated_disk_barrier_failures; 3182 int num_tolerated_disk_barrier_failures;
@@ -3127,21 +3220,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3127 mutex_lock(&fs_info->balance_mutex); 3220 mutex_lock(&fs_info->balance_mutex);
3128 atomic_dec(&fs_info->balance_running); 3221 atomic_dec(&fs_info->balance_running);
3129 3222
3130 if (bargs) {
3131 memset(bargs, 0, sizeof(*bargs));
3132 update_ioctl_balance_args(fs_info, 0, bargs);
3133 }
3134
3135 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3136 balance_need_close(fs_info)) {
3137 __cancel_balance(fs_info);
3138 }
3139
3140 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3223 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3141 fs_info->num_tolerated_disk_barrier_failures = 3224 fs_info->num_tolerated_disk_barrier_failures =
3142 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3225 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3143 } 3226 }
3144 3227
3228 if (bargs) {
3229 memset(bargs, 0, sizeof(*bargs));
3230 update_ioctl_balance_args(fs_info, 0, bargs);
3231 }
3232
3145 wake_up(&fs_info->balance_wait_q); 3233 wake_up(&fs_info->balance_wait_q);
3146 3234
3147 return ret; 3235 return ret;
@@ -3504,13 +3592,86 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
3504} 3592}
3505 3593
3506struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 3594struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3507 { 2, 1, 0, 4, 2, 2 /* raid10 */ }, 3595 [BTRFS_RAID_RAID10] = {
3508 { 1, 1, 2, 2, 2, 2 /* raid1 */ }, 3596 .sub_stripes = 2,
3509 { 1, 2, 1, 1, 1, 2 /* dup */ }, 3597 .dev_stripes = 1,
3510 { 1, 1, 0, 2, 1, 1 /* raid0 */ }, 3598 .devs_max = 0, /* 0 == as many as possible */
3511 { 1, 1, 1, 1, 1, 1 /* single */ }, 3599 .devs_min = 4,
3600 .devs_increment = 2,
3601 .ncopies = 2,
3602 },
3603 [BTRFS_RAID_RAID1] = {
3604 .sub_stripes = 1,
3605 .dev_stripes = 1,
3606 .devs_max = 2,
3607 .devs_min = 2,
3608 .devs_increment = 2,
3609 .ncopies = 2,
3610 },
3611 [BTRFS_RAID_DUP] = {
3612 .sub_stripes = 1,
3613 .dev_stripes = 2,
3614 .devs_max = 1,
3615 .devs_min = 1,
3616 .devs_increment = 1,
3617 .ncopies = 2,
3618 },
3619 [BTRFS_RAID_RAID0] = {
3620 .sub_stripes = 1,
3621 .dev_stripes = 1,
3622 .devs_max = 0,
3623 .devs_min = 2,
3624 .devs_increment = 1,
3625 .ncopies = 1,
3626 },
3627 [BTRFS_RAID_SINGLE] = {
3628 .sub_stripes = 1,
3629 .dev_stripes = 1,
3630 .devs_max = 1,
3631 .devs_min = 1,
3632 .devs_increment = 1,
3633 .ncopies = 1,
3634 },
3635 [BTRFS_RAID_RAID5] = {
3636 .sub_stripes = 1,
3637 .dev_stripes = 1,
3638 .devs_max = 0,
3639 .devs_min = 2,
3640 .devs_increment = 1,
3641 .ncopies = 2,
3642 },
3643 [BTRFS_RAID_RAID6] = {
3644 .sub_stripes = 1,
3645 .dev_stripes = 1,
3646 .devs_max = 0,
3647 .devs_min = 3,
3648 .devs_increment = 1,
3649 .ncopies = 3,
3650 },
3512}; 3651};
3513 3652
3653static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
3654{
3655 /* TODO allow them to set a preferred stripe size */
3656 return 64 * 1024;
3657}
3658
3659static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
3660{
3661 u64 features;
3662
3663 if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
3664 return;
3665
3666 features = btrfs_super_incompat_flags(info->super_copy);
3667 if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
3668 return;
3669
3670 features |= BTRFS_FEATURE_INCOMPAT_RAID56;
3671 btrfs_set_super_incompat_flags(info->super_copy, features);
3672 printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
3673}
3674
3514static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3675static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3515 struct btrfs_root *extent_root, 3676 struct btrfs_root *extent_root,
3516 struct map_lookup **map_ret, 3677 struct map_lookup **map_ret,
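
The attribute table is now indexed by a BTRFS_RAID_* enum, so allocation code can look up devs_min, devs_increment and ncopies per profile instead of open-coding them. A minimal sketch of a table of the same shape (values copied from the hunk above) plus a hypothetical helper that rejects profiles with too few devices:

#include <stdio.h>

/* Same layout as btrfs_raid_array; values copied from the hunk above. */
struct raid_attr { int sub_stripes, dev_stripes, devs_max, devs_min,
		       devs_increment, ncopies; };

enum { R_RAID10, R_RAID1, R_DUP, R_RAID0, R_SINGLE, R_RAID5, R_RAID6, R_NR };

static const struct raid_attr raid_attr[R_NR] = {
	[R_RAID10] = { 2, 1, 0, 4, 2, 2 },
	[R_RAID1]  = { 1, 1, 2, 2, 2, 2 },
	[R_DUP]    = { 1, 2, 1, 1, 1, 2 },
	[R_RAID0]  = { 1, 1, 0, 2, 1, 1 },
	[R_SINGLE] = { 1, 1, 1, 1, 1, 1 },
	[R_RAID5]  = { 1, 1, 0, 2, 1, 2 },
	[R_RAID6]  = { 1, 1, 0, 3, 1, 3 },
};

/* Can a chunk of this profile be allocated across 'ndevs' writable devices? */
static int enough_devices(int profile, int ndevs)
{
	return ndevs >= raid_attr[profile].devs_min;
}

int main(void)
{
	printf("raid6 on 2 devs: %d\n", enough_devices(R_RAID6, 2));	/* 0 */
	printf("raid6 on 4 devs: %d\n", enough_devices(R_RAID6, 4));	/* 1 */
	return 0;
}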
@@ -3526,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3526 struct btrfs_device_info *devices_info = NULL; 3687 struct btrfs_device_info *devices_info = NULL;
3527 u64 total_avail; 3688 u64 total_avail;
3528 int num_stripes; /* total number of stripes to allocate */ 3689 int num_stripes; /* total number of stripes to allocate */
3690 int data_stripes; /* number of stripes that count for
3691 block group size */
3529 int sub_stripes; /* sub_stripes info for map */ 3692 int sub_stripes; /* sub_stripes info for map */
3530 int dev_stripes; /* stripes per dev */ 3693 int dev_stripes; /* stripes per dev */
3531 int devs_max; /* max devs to use */ 3694 int devs_max; /* max devs to use */
@@ -3537,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3537 u64 max_chunk_size; 3700 u64 max_chunk_size;
3538 u64 stripe_size; 3701 u64 stripe_size;
3539 u64 num_bytes; 3702 u64 num_bytes;
3703 u64 raid_stripe_len = BTRFS_STRIPE_LEN;
3540 int ndevs; 3704 int ndevs;
3541 int i; 3705 int i;
3542 int j; 3706 int j;
@@ -3631,12 +3795,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3631 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 3795 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
3632 continue; 3796 continue;
3633 3797
3798 if (ndevs == fs_devices->rw_devices) {
3799 WARN(1, "%s: found more than %llu devices\n",
3800 __func__, fs_devices->rw_devices);
3801 break;
3802 }
3634 devices_info[ndevs].dev_offset = dev_offset; 3803 devices_info[ndevs].dev_offset = dev_offset;
3635 devices_info[ndevs].max_avail = max_avail; 3804 devices_info[ndevs].max_avail = max_avail;
3636 devices_info[ndevs].total_avail = total_avail; 3805 devices_info[ndevs].total_avail = total_avail;
3637 devices_info[ndevs].dev = device; 3806 devices_info[ndevs].dev = device;
3638 ++ndevs; 3807 ++ndevs;
3639 WARN_ON(ndevs > fs_devices->rw_devices);
3640 } 3808 }
3641 3809
3642 /* 3810 /*
@@ -3662,16 +3830,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3662 stripe_size = devices_info[ndevs-1].max_avail; 3830 stripe_size = devices_info[ndevs-1].max_avail;
3663 num_stripes = ndevs * dev_stripes; 3831 num_stripes = ndevs * dev_stripes;
3664 3832
3665 if (stripe_size * ndevs > max_chunk_size * ncopies) { 3833 /*
3666 stripe_size = max_chunk_size * ncopies; 3834 * this will have to be fixed for RAID1 and RAID10 over
3667 do_div(stripe_size, ndevs); 3835 * more drives
3836 */
3837 data_stripes = num_stripes / ncopies;
3838
3839 if (type & BTRFS_BLOCK_GROUP_RAID5) {
3840 raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
3841 btrfs_super_stripesize(info->super_copy));
3842 data_stripes = num_stripes - 1;
3843 }
3844 if (type & BTRFS_BLOCK_GROUP_RAID6) {
3845 raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
3846 btrfs_super_stripesize(info->super_copy));
3847 data_stripes = num_stripes - 2;
3848 }
3849
3850 /*
3851 * Use the number of data stripes to figure out how big this chunk
3852 * is really going to be in terms of logical address space,
3853 * and compare that answer with the max chunk size
3854 */
3855 if (stripe_size * data_stripes > max_chunk_size) {
3856 u64 mask = (1ULL << 24) - 1;
3857 stripe_size = max_chunk_size;
3858 do_div(stripe_size, data_stripes);
3859
3860 /* bump the answer up to a 16MB boundary */
3861 stripe_size = (stripe_size + mask) & ~mask;
3862
3863 /* but don't go higher than the limits we found
3864 * while searching for free extents
3865 */
3866 if (stripe_size > devices_info[ndevs-1].max_avail)
3867 stripe_size = devices_info[ndevs-1].max_avail;
3668 } 3868 }
3669 3869
3670 do_div(stripe_size, dev_stripes); 3870 do_div(stripe_size, dev_stripes);
3671 3871
3672 /* align to BTRFS_STRIPE_LEN */ 3872 /* align to BTRFS_STRIPE_LEN */
3673 do_div(stripe_size, BTRFS_STRIPE_LEN); 3873 do_div(stripe_size, raid_stripe_len);
3674 stripe_size *= BTRFS_STRIPE_LEN; 3874 stripe_size *= raid_stripe_len;
3675 3875
3676 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3876 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3677 if (!map) { 3877 if (!map) {
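
With parity profiles the chunk's logical size is stripe_size times the number of data stripes, so the clamp against max_chunk_size divides by data_stripes and then bumps the per-device stripe back up to a 16 MB boundary, capped by the free space found earlier. A worked sketch of that arithmetic with hypothetical inputs:

#include <stdint.h>
#include <stdio.h>

/* Clamp the per-device stripe so stripe_size * data_stripes stays within
 * max_chunk_size, then round up to 16 MB as the hunk above does. */
static uint64_t clamp_stripe_size(uint64_t stripe_size, uint64_t max_avail,
				  int data_stripes, uint64_t max_chunk_size)
{
	const uint64_t mask = (1ULL << 24) - 1;		/* 16 MB - 1 */

	if (stripe_size * data_stripes > max_chunk_size) {
		stripe_size = max_chunk_size / data_stripes;
		stripe_size = (stripe_size + mask) & ~mask;	/* 16 MB bump */
		if (stripe_size > max_avail)
			stripe_size = max_avail;	/* free-space limit */
	}
	return stripe_size;
}

int main(void)
{
	/* hypothetical RAID6 over 6 devices: 4 data stripes, 10 GB chunk cap */
	uint64_t s = clamp_stripe_size(5ULL << 30, 4ULL << 30, 4, 10ULL << 30);

	printf("stripe_size = %llu MiB\n", (unsigned long long)(s >> 20)); /* 2560 */
	return 0;
}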
@@ -3689,14 +3889,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3689 } 3889 }
3690 } 3890 }
3691 map->sector_size = extent_root->sectorsize; 3891 map->sector_size = extent_root->sectorsize;
3692 map->stripe_len = BTRFS_STRIPE_LEN; 3892 map->stripe_len = raid_stripe_len;
3693 map->io_align = BTRFS_STRIPE_LEN; 3893 map->io_align = raid_stripe_len;
3694 map->io_width = BTRFS_STRIPE_LEN; 3894 map->io_width = raid_stripe_len;
3695 map->type = type; 3895 map->type = type;
3696 map->sub_stripes = sub_stripes; 3896 map->sub_stripes = sub_stripes;
3697 3897
3698 *map_ret = map; 3898 *map_ret = map;
3699 num_bytes = stripe_size * (num_stripes / ncopies); 3899 num_bytes = stripe_size * data_stripes;
3700 3900
3701 *stripe_size_out = stripe_size; 3901 *stripe_size_out = stripe_size;
3702 *num_bytes_out = num_bytes; 3902 *num_bytes_out = num_bytes;
@@ -3718,15 +3918,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3718 write_lock(&em_tree->lock); 3918 write_lock(&em_tree->lock);
3719 ret = add_extent_mapping(em_tree, em); 3919 ret = add_extent_mapping(em_tree, em);
3720 write_unlock(&em_tree->lock); 3920 write_unlock(&em_tree->lock);
3721 free_extent_map(em); 3921 if (ret) {
3722 if (ret) 3922 free_extent_map(em);
3723 goto error;
3724
3725 ret = btrfs_make_block_group(trans, extent_root, 0, type,
3726 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3727 start, num_bytes);
3728 if (ret)
3729 goto error; 3923 goto error;
3924 }
3730 3925
3731 for (i = 0; i < map->num_stripes; ++i) { 3926 for (i = 0; i < map->num_stripes; ++i) {
3732 struct btrfs_device *device; 3927 struct btrfs_device *device;
@@ -3739,15 +3934,44 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3739 info->chunk_root->root_key.objectid, 3934 info->chunk_root->root_key.objectid,
3740 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3935 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3741 start, dev_offset, stripe_size); 3936 start, dev_offset, stripe_size);
3742 if (ret) { 3937 if (ret)
3743 btrfs_abort_transaction(trans, extent_root, ret); 3938 goto error_dev_extent;
3744 goto error; 3939 }
3745 } 3940
3941 ret = btrfs_make_block_group(trans, extent_root, 0, type,
3942 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3943 start, num_bytes);
3944 if (ret) {
3945 i = map->num_stripes - 1;
3946 goto error_dev_extent;
3746 } 3947 }
3747 3948
3949 free_extent_map(em);
3950 check_raid56_incompat_flag(extent_root->fs_info, type);
3951
3748 kfree(devices_info); 3952 kfree(devices_info);
3749 return 0; 3953 return 0;
3750 3954
3955error_dev_extent:
3956 for (; i >= 0; i--) {
3957 struct btrfs_device *device;
3958 int err;
3959
3960 device = map->stripes[i].dev;
3961 err = btrfs_free_dev_extent(trans, device, start);
3962 if (err) {
3963 btrfs_abort_transaction(trans, extent_root, err);
3964 break;
3965 }
3966 }
3967 write_lock(&em_tree->lock);
3968 remove_extent_mapping(em_tree, em);
3969 write_unlock(&em_tree->lock);
3970
3971 /* One for our allocation */
3972 free_extent_map(em);
3973 /* One for the tree reference */
3974 free_extent_map(em);
3751error: 3975error:
3752 kfree(map); 3976 kfree(map);
3753 kfree(devices_info); 3977 kfree(devices_info);
@@ -3887,10 +4111,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3887 if (ret) 4111 if (ret)
3888 return ret; 4112 return ret;
3889 4113
3890 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 4114 alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
3891 fs_info->avail_metadata_alloc_bits;
3892 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3893
3894 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 4115 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3895 &stripe_size, chunk_offset, alloc_profile); 4116 &stripe_size, chunk_offset, alloc_profile);
3896 if (ret) 4117 if (ret)
@@ -3898,10 +4119,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3898 4119
3899 sys_chunk_offset = chunk_offset + chunk_size; 4120 sys_chunk_offset = chunk_offset + chunk_size;
3900 4121
3901 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 4122 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
3902 fs_info->avail_system_alloc_bits;
3903 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3904
3905 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 4123 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3906 &sys_chunk_size, &sys_stripe_size, 4124 &sys_chunk_size, &sys_stripe_size,
3907 sys_chunk_offset, alloc_profile); 4125 sys_chunk_offset, alloc_profile);
@@ -4014,6 +4232,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4014 ret = map->num_stripes; 4232 ret = map->num_stripes;
4015 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4233 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4016 ret = map->sub_stripes; 4234 ret = map->sub_stripes;
4235 else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
4236 ret = 2;
4237 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4238 ret = 3;
4017 else 4239 else
4018 ret = 1; 4240 ret = 1;
4019 free_extent_map(em); 4241 free_extent_map(em);
@@ -4026,6 +4248,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4026 return ret; 4248 return ret;
4027} 4249}
4028 4250
4251unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
4252 struct btrfs_mapping_tree *map_tree,
4253 u64 logical)
4254{
4255 struct extent_map *em;
4256 struct map_lookup *map;
4257 struct extent_map_tree *em_tree = &map_tree->map_tree;
4258 unsigned long len = root->sectorsize;
4259
4260 read_lock(&em_tree->lock);
4261 em = lookup_extent_mapping(em_tree, logical, len);
4262 read_unlock(&em_tree->lock);
4263 BUG_ON(!em);
4264
4265 BUG_ON(em->start > logical || em->start + em->len < logical);
4266 map = (struct map_lookup *)em->bdev;
4267 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4268 BTRFS_BLOCK_GROUP_RAID6)) {
4269 len = map->stripe_len * nr_data_stripes(map);
4270 }
4271 free_extent_map(em);
4272 return len;
4273}
4274
4275int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
4276 u64 logical, u64 len, int mirror_num)
4277{
4278 struct extent_map *em;
4279 struct map_lookup *map;
4280 struct extent_map_tree *em_tree = &map_tree->map_tree;
4281 int ret = 0;
4282
4283 read_lock(&em_tree->lock);
4284 em = lookup_extent_mapping(em_tree, logical, len);
4285 read_unlock(&em_tree->lock);
4286 BUG_ON(!em);
4287
4288 BUG_ON(em->start > logical || em->start + em->len < logical);
4289 map = (struct map_lookup *)em->bdev;
4290 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4291 BTRFS_BLOCK_GROUP_RAID6))
4292 ret = 1;
4293 free_extent_map(em);
4294 return ret;
4295}
4296
4029static int find_live_mirror(struct btrfs_fs_info *fs_info, 4297static int find_live_mirror(struct btrfs_fs_info *fs_info,
4030 struct map_lookup *map, int first, int num, 4298 struct map_lookup *map, int first, int num,
4031 int optimal, int dev_replace_is_ongoing) 4299 int optimal, int dev_replace_is_ongoing)
@@ -4063,10 +4331,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
4063 return optimal; 4331 return optimal;
4064} 4332}
4065 4333
4334static inline int parity_smaller(u64 a, u64 b)
4335{
4336 return a > b;
4337}
4338
4339/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
4340static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4341{
4342 struct btrfs_bio_stripe s;
4343 int i;
4344 u64 l;
4345 int again = 1;
4346
4347 while (again) {
4348 again = 0;
4349 for (i = 0; i < bbio->num_stripes - 1; i++) {
4350 if (parity_smaller(raid_map[i], raid_map[i+1])) {
4351 s = bbio->stripes[i];
4352 l = raid_map[i];
4353 bbio->stripes[i] = bbio->stripes[i+1];
4354 raid_map[i] = raid_map[i+1];
4355 bbio->stripes[i+1] = s;
4356 raid_map[i+1] = l;
4357 again = 1;
4358 }
4359 }
4360 }
4361}
4362
4066static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4363static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4067 u64 logical, u64 *length, 4364 u64 logical, u64 *length,
4068 struct btrfs_bio **bbio_ret, 4365 struct btrfs_bio **bbio_ret,
4069 int mirror_num) 4366 int mirror_num, u64 **raid_map_ret)
4070{ 4367{
4071 struct extent_map *em; 4368 struct extent_map *em;
4072 struct map_lookup *map; 4369 struct map_lookup *map;
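
sort_parity_stripes() works because the parity slots in raid_map hold sentinel addresses larger than any real logical address, so an ordinary sort by raid_map entry pushes P and Q behind every data stripe. A standalone sketch; the sentinel values used here (the top two u64 values) are an assumption for illustration:

#include <stdint.h>
#include <stdio.h>

#define RAID6_Q_STRIPE_	((uint64_t)-1)	/* assumed sentinel values */
#define RAID5_P_STRIPE_	((uint64_t)-2)

/* Bubble-sort stripes by their raid_map address; the P/Q sentinels,
 * being the largest values, always end up last. */
static void sort_parity(uint64_t *raid_map, int *stripe, int n)
{
	int again = 1, i;

	while (again) {
		again = 0;
		for (i = 0; i < n - 1; i++) {
			if (raid_map[i] > raid_map[i + 1]) {
				uint64_t l = raid_map[i];
				int s = stripe[i];

				raid_map[i] = raid_map[i + 1];
				stripe[i] = stripe[i + 1];
				raid_map[i + 1] = l;
				stripe[i + 1] = s;
				again = 1;
			}
		}
	}
}

int main(void)
{
	uint64_t raid_map[4] = { RAID5_P_STRIPE_, 0, 65536, 131072 };
	int stripe[4] = { 0, 1, 2, 3 };
	int i;

	sort_parity(raid_map, stripe, 4);
	for (i = 0; i < 4; i++)		/* parity (device stripe 0) comes out last */
		printf("slot %d -> device stripe %d\n", i, stripe[i]);
	return 0;
}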
@@ -4078,6 +4375,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4078 u64 stripe_nr; 4375 u64 stripe_nr;
4079 u64 stripe_nr_orig; 4376 u64 stripe_nr_orig;
4080 u64 stripe_nr_end; 4377 u64 stripe_nr_end;
4378 u64 stripe_len;
4379 u64 *raid_map = NULL;
4081 int stripe_index; 4380 int stripe_index;
4082 int i; 4381 int i;
4083 int ret = 0; 4382 int ret = 0;
@@ -4089,6 +4388,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4089 int num_alloc_stripes; 4388 int num_alloc_stripes;
4090 int patch_the_first_stripe_for_dev_replace = 0; 4389 int patch_the_first_stripe_for_dev_replace = 0;
4091 u64 physical_to_patch_in_first_stripe = 0; 4390 u64 physical_to_patch_in_first_stripe = 0;
4391 u64 raid56_full_stripe_start = (u64)-1;
4092 4392
4093 read_lock(&em_tree->lock); 4393 read_lock(&em_tree->lock);
4094 em = lookup_extent_mapping(em_tree, logical, *length); 4394 em = lookup_extent_mapping(em_tree, logical, *length);
@@ -4105,29 +4405,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4105 map = (struct map_lookup *)em->bdev; 4405 map = (struct map_lookup *)em->bdev;
4106 offset = logical - em->start; 4406 offset = logical - em->start;
4107 4407
4408 if (mirror_num > map->num_stripes)
4409 mirror_num = 0;
4410
4411 stripe_len = map->stripe_len;
4108 stripe_nr = offset; 4412 stripe_nr = offset;
4109 /* 4413 /*
4110 * stripe_nr counts the total number of stripes we have to stride 4414 * stripe_nr counts the total number of stripes we have to stride
4111 * to get to this block 4415 * to get to this block
4112 */ 4416 */
4113 do_div(stripe_nr, map->stripe_len); 4417 do_div(stripe_nr, stripe_len);
4114 4418
4115 stripe_offset = stripe_nr * map->stripe_len; 4419 stripe_offset = stripe_nr * stripe_len;
4116 BUG_ON(offset < stripe_offset); 4420 BUG_ON(offset < stripe_offset);
4117 4421
4118 /* stripe_offset is the offset of this block in its stripe*/ 4422 /* stripe_offset is the offset of this block in its stripe*/
4119 stripe_offset = offset - stripe_offset; 4423 stripe_offset = offset - stripe_offset;
4120 4424
4121 if (rw & REQ_DISCARD) 4425 /* if we're here for raid56, we need to know the stripe aligned start */
4426 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4427 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
4428 raid56_full_stripe_start = offset;
4429
4430 /* allow a write of a full stripe, but make sure we don't
4431 * allow straddling of stripes
4432 */
4433 do_div(raid56_full_stripe_start, full_stripe_len);
4434 raid56_full_stripe_start *= full_stripe_len;
4435 }
4436
4437 if (rw & REQ_DISCARD) {
4438 /* we don't discard raid56 yet */
4439 if (map->type &
4440 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4441 ret = -EOPNOTSUPP;
4442 goto out;
4443 }
4122 *length = min_t(u64, em->len - offset, *length); 4444 *length = min_t(u64, em->len - offset, *length);
4123 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 4445 } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
4124 /* we limit the length of each bio to what fits in a stripe */ 4446 u64 max_len;
4125 *length = min_t(u64, em->len - offset, 4447 /* For writes to RAID[56], allow a full stripeset across all disks.
4126 map->stripe_len - stripe_offset); 4448 For other RAID types and for RAID[56] reads, just allow a single
4449 stripe (on a single disk). */
4450 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
4451 (rw & REQ_WRITE)) {
4452 max_len = stripe_len * nr_data_stripes(map) -
4453 (offset - raid56_full_stripe_start);
4454 } else {
4455 /* we limit the length of each bio to what fits in a stripe */
4456 max_len = stripe_len - stripe_offset;
4457 }
4458 *length = min_t(u64, em->len - offset, max_len);
4127 } else { 4459 } else {
4128 *length = em->len - offset; 4460 *length = em->len - offset;
4129 } 4461 }
4130 4462
4463 /* This is for when we're called from btrfs_merge_bio_hook() and all
4464 it cares about is the length */
4131 if (!bbio_ret) 4465 if (!bbio_ret)
4132 goto out; 4466 goto out;
4133 4467
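
For RAID5/6 the bio length limit now depends on direction: a write may extend to the end of the current full stripe, while a read (like every other profile) stays within one device stripe. A worked sketch of that clamp, reusing the full-stripe-start rounding from the hunk, with hypothetical geometry:

#include <stdint.h>
#include <stdio.h>

/* Largest bio length allowed at 'offset' inside a RAID5/6 chunk: a write
 * may run to the end of the full stripe, a read only to the end of the
 * current device stripe (mirrors the limits in the hunk above). */
static uint64_t max_bio_len(uint64_t offset, uint64_t stripe_len,
			    int data_stripes, int is_write)
{
	uint64_t full = stripe_len * data_stripes;
	uint64_t full_stripe_start = (offset / full) * full;
	uint64_t stripe_offset = offset % stripe_len;

	if (is_write)
		return full - (offset - full_stripe_start);
	return stripe_len - stripe_offset;
}

int main(void)
{
	/* hypothetical RAID5 over 4 devices: 3 data stripes of 64K */
	printf("write: %llu\n", (unsigned long long)
	       max_bio_len(70 * 1024, 64 * 1024, 3, 1));	/* 124928 */
	printf("read:  %llu\n", (unsigned long long)
	       max_bio_len(70 * 1024, 64 * 1024, 3, 0));	/* 59392 */
	return 0;
}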
@@ -4160,7 +4494,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4160 u64 physical_of_found = 0; 4494 u64 physical_of_found = 0;
4161 4495
4162 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 4496 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4163 logical, &tmp_length, &tmp_bbio, 0); 4497 logical, &tmp_length, &tmp_bbio, 0, NULL);
4164 if (ret) { 4498 if (ret) {
4165 WARN_ON(tmp_bbio != NULL); 4499 WARN_ON(tmp_bbio != NULL);
4166 goto out; 4500 goto out;
@@ -4221,11 +4555,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4221 num_stripes = 1; 4555 num_stripes = 1;
4222 stripe_index = 0; 4556 stripe_index = 0;
4223 stripe_nr_orig = stripe_nr; 4557 stripe_nr_orig = stripe_nr;
4224 stripe_nr_end = (offset + *length + map->stripe_len - 1) & 4558 stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
4225 (~(map->stripe_len - 1));
4226 do_div(stripe_nr_end, map->stripe_len); 4559 do_div(stripe_nr_end, map->stripe_len);
4227 stripe_end_offset = stripe_nr_end * map->stripe_len - 4560 stripe_end_offset = stripe_nr_end * map->stripe_len -
4228 (offset + *length); 4561 (offset + *length);
4562
4229 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4563 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4230 if (rw & REQ_DISCARD) 4564 if (rw & REQ_DISCARD)
4231 num_stripes = min_t(u64, map->num_stripes, 4565 num_stripes = min_t(u64, map->num_stripes,
@@ -4276,6 +4610,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4276 dev_replace_is_ongoing); 4610 dev_replace_is_ongoing);
4277 mirror_num = stripe_index - old_stripe_index + 1; 4611 mirror_num = stripe_index - old_stripe_index + 1;
4278 } 4612 }
4613
4614 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4615 BTRFS_BLOCK_GROUP_RAID6)) {
4616 u64 tmp;
4617
4618 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
4619 && raid_map_ret) {
4620 int i, rot;
4621
4622 /* push stripe_nr back to the start of the full stripe */
4623 stripe_nr = raid56_full_stripe_start;
4624 do_div(stripe_nr, stripe_len);
4625
4626 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4627
4628 /* RAID[56] write or recovery. Return all stripes */
4629 num_stripes = map->num_stripes;
4630 max_errors = nr_parity_stripes(map);
4631
4632 raid_map = kmalloc(sizeof(u64) * num_stripes,
4633 GFP_NOFS);
4634 if (!raid_map) {
4635 ret = -ENOMEM;
4636 goto out;
4637 }
4638
4639 /* Work out the disk rotation on this stripe-set */
4640 tmp = stripe_nr;
4641 rot = do_div(tmp, num_stripes);
4642
4643 /* Fill in the logical address of each stripe */
4644 tmp = stripe_nr * nr_data_stripes(map);
4645 for (i = 0; i < nr_data_stripes(map); i++)
4646 raid_map[(i+rot) % num_stripes] =
4647 em->start + (tmp + i) * map->stripe_len;
4648
4649 raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
4650 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4651 raid_map[(i+rot+1) % num_stripes] =
4652 RAID6_Q_STRIPE;
4653
4654 *length = map->stripe_len;
4655 stripe_index = 0;
4656 stripe_offset = 0;
4657 } else {
4658 /*
4659 * Mirror #0 or #1 means the original data block.
4660 * Mirror #2 is RAID5 parity block.
4661 * Mirror #3 is RAID6 Q block.
4662 */
4663 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4664 if (mirror_num > 1)
4665 stripe_index = nr_data_stripes(map) +
4666 mirror_num - 2;
4667
4668 /* We distribute the parity blocks across stripes */
4669 tmp = stripe_nr + stripe_index;
4670 stripe_index = do_div(tmp, map->num_stripes);
4671 }
4279 } else { 4672 } else {
4280 /* 4673 /*
4281 * after this do_div call, stripe_nr is the number of stripes 4674 * after this do_div call, stripe_nr is the number of stripes
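
For a RAID5/6 write or recovery the mapping returns every stripe and records in raid_map which logical address each device stripe carries; the parity position rotates by one slot per full stripe, so data stripe i lands at (i + rot) % num_stripes and the remaining slots get the P/Q sentinels. A minimal sketch of that layout with hypothetical parameters (sentinel values assumed):

#include <stdint.h>
#include <stdio.h>

#define P_STRIPE ((uint64_t)-2)		/* assumed parity sentinels */
#define Q_STRIPE ((uint64_t)-1)

/* Fill raid_map for one full stripe: 'stripe_nr' full stripes into the
 * chunk that starts at 'em_start', 'data' data stripes plus P (and Q). */
static void fill_raid_map(uint64_t *raid_map, int num_stripes, int data,
			  uint64_t em_start, uint64_t stripe_nr,
			  uint64_t stripe_len, int is_raid6)
{
	int rot = stripe_nr % num_stripes;	/* parity rotation */
	uint64_t tmp = stripe_nr * data;
	int i;

	for (i = 0; i < data; i++)
		raid_map[(i + rot) % num_stripes] =
			em_start + (tmp + i) * stripe_len;

	raid_map[(i + rot) % num_stripes] = P_STRIPE;
	if (is_raid6)
		raid_map[(i + rot + 1) % num_stripes] = Q_STRIPE;
}

int main(void)
{
	uint64_t map[4];
	int i;

	/* hypothetical RAID6 over 4 devices (2 data + P + Q), 2nd full stripe */
	fill_raid_map(map, 4, 2, 0, 1, 64 * 1024, 1);
	for (i = 0; i < 4; i++)		/* P/Q print as the huge sentinel values */
		printf("stripe %d: %llu\n", i, (unsigned long long)map[i]);
	return 0;
}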
@@ -4384,8 +4777,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4384 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 4777 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
4385 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4778 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4386 BTRFS_BLOCK_GROUP_RAID10 | 4779 BTRFS_BLOCK_GROUP_RAID10 |
4780 BTRFS_BLOCK_GROUP_RAID5 |
4387 BTRFS_BLOCK_GROUP_DUP)) { 4781 BTRFS_BLOCK_GROUP_DUP)) {
4388 max_errors = 1; 4782 max_errors = 1;
4783 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4784 max_errors = 2;
4389 } 4785 }
4390 } 4786 }
4391 4787
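
The max_errors change above encodes write-failure tolerance per profile: one failed stripe for RAID1/RAID10/DUP/RAID5, two for RAID6. A standalone sketch of that rule; the BG_* bits reproduce the BTRFS_BLOCK_GROUP_* flag values as I understand them from ctree.h, and the helper itself is not from the patch:

#include <stdio.h>

#define BG_RAID1   (1ULL << 4)		/* assumed BTRFS_BLOCK_GROUP_* bits */
#define BG_DUP     (1ULL << 5)
#define BG_RAID10  (1ULL << 6)
#define BG_RAID5   (1ULL << 7)
#define BG_RAID6   (1ULL << 8)

static int max_errors_for(unsigned long long type)
{
	if (type & (BG_RAID1 | BG_RAID10 | BG_RAID5 | BG_DUP))
		return 1;	/* one redundant copy or one parity stripe */
	if (type & BG_RAID6)
		return 2;	/* P and Q: any two stripes may fail */
	return 0;		/* single / RAID0: no redundancy */
}

int main(void)
{
	printf("RAID5 tolerates %d, RAID6 tolerates %d\n",
	       max_errors_for(BG_RAID5), max_errors_for(BG_RAID6));
	return 0;
}
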
@@ -4486,6 +4882,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4486 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 4882 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4487 bbio->mirror_num = map->num_stripes + 1; 4883 bbio->mirror_num = map->num_stripes + 1;
4488 } 4884 }
4885 if (raid_map) {
4886 sort_parity_stripes(bbio, raid_map);
4887 *raid_map_ret = raid_map;
4888 }
4489out: 4889out:
4490 if (dev_replace_is_ongoing) 4890 if (dev_replace_is_ongoing)
4491 btrfs_dev_replace_unlock(dev_replace); 4891 btrfs_dev_replace_unlock(dev_replace);
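
sort_parity_stripes() itself is not part of this hunk; the intent is to co-sort bbio->stripes with raid_map so the data stripes sit in logical order and the P/Q markers, being (u64)-2 and (u64)-1, sort to the end. A plausible, purely illustrative userspace sketch of such a co-sort, fed with the rot = 1 full stripe from the earlier sketch (Q on device 0, data on devices 1-3, P on device 4):

#include <stdint.h>
#include <stdio.h>

/* co-sort an index array with its raid_map keys (insertion sort) */
static void sort_stripes_by_map(uint64_t *raid_map, int *stripes, int n)
{
	for (int i = 1; i < n; i++) {
		uint64_t key = raid_map[i];
		int s = stripes[i];
		int j = i - 1;

		while (j >= 0 && raid_map[j] > key) {
			raid_map[j + 1] = raid_map[j];
			stripes[j + 1] = stripes[j];
			j--;
		}
		raid_map[j + 1] = key;
		stripes[j + 1] = s;
	}
}

int main(void)
{
	uint64_t raid_map[5] = { (uint64_t)-1, 196608, 262144, 327680,
				 (uint64_t)-2 };
	int stripe_idx[5] = { 0, 1, 2, 3, 4 };	/* device stripe numbers */

	sort_stripes_by_map(raid_map, stripe_idx, 5);
	for (int i = 0; i < 5; i++)
		printf("slot %d -> device stripe %d\n", i, stripe_idx[i]);
	return 0;
}

After the sort the three data slots come first in logical order, then P, then Q, so the raid56 code can index stripes positionally.
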
@@ -4498,7 +4898,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4498 struct btrfs_bio **bbio_ret, int mirror_num) 4898 struct btrfs_bio **bbio_ret, int mirror_num)
4499{ 4899{
4500 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 4900 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
4501 mirror_num); 4901 mirror_num, NULL);
4502} 4902}
4503 4903
4504int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 4904int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -4512,6 +4912,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4512 u64 bytenr; 4912 u64 bytenr;
4513 u64 length; 4913 u64 length;
4514 u64 stripe_nr; 4914 u64 stripe_nr;
4915 u64 rmap_len;
4515 int i, j, nr = 0; 4916 int i, j, nr = 0;
4516 4917
4517 read_lock(&em_tree->lock); 4918 read_lock(&em_tree->lock);
@@ -4522,10 +4923,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4522 map = (struct map_lookup *)em->bdev; 4923 map = (struct map_lookup *)em->bdev;
4523 4924
4524 length = em->len; 4925 length = em->len;
4926 rmap_len = map->stripe_len;
4927
4525 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4928 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4526 do_div(length, map->num_stripes / map->sub_stripes); 4929 do_div(length, map->num_stripes / map->sub_stripes);
4527 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 4930 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
4528 do_div(length, map->num_stripes); 4931 do_div(length, map->num_stripes);
4932 else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4933 BTRFS_BLOCK_GROUP_RAID6)) {
4934 do_div(length, nr_data_stripes(map));
4935 rmap_len = map->stripe_len * nr_data_stripes(map);
4936 }
4529 4937
4530 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 4938 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
4531 BUG_ON(!buf); /* -ENOMEM */ 4939 BUG_ON(!buf); /* -ENOMEM */
@@ -4545,8 +4953,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4545 do_div(stripe_nr, map->sub_stripes); 4953 do_div(stripe_nr, map->sub_stripes);
4546 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4954 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4547 stripe_nr = stripe_nr * map->num_stripes + i; 4955 stripe_nr = stripe_nr * map->num_stripes + i;
4548 } 4956 } /* else if RAID[56], multiply by nr_data_stripes().
4549 bytenr = chunk_start + stripe_nr * map->stripe_len; 4957 * Alternatively, just use rmap_len below instead of
4958 * map->stripe_len */
4959
4960 bytenr = chunk_start + stripe_nr * rmap_len;
4550 WARN_ON(nr >= map->num_stripes); 4961 WARN_ON(nr >= map->num_stripes);
4551 for (j = 0; j < nr; j++) { 4962 for (j = 0; j < nr; j++) {
4552 if (buf[j] == bytenr) 4963 if (buf[j] == bytenr)
@@ -4560,7 +4971,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4560 4971
4561 *logical = buf; 4972 *logical = buf;
4562 *naddrs = nr; 4973 *naddrs = nr;
4563 *stripe_len = map->stripe_len; 4974 *stripe_len = rmap_len;
4564 4975
4565 free_extent_map(em); 4976 free_extent_map(em);
4566 return 0; 4977 return 0;
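
For RAID5/6 each device stripe reverse-maps to a full data stripe's worth of logical space, hence rmap_len = stripe_len * nr_data_stripes rather than a single stripe_len. A worked example with illustrative numbers only (6-device RAID6, 64K stripes):

#include <stdio.h>

int main(void)
{
	unsigned long long stripe_len = 64 * 1024;
	int num_stripes = 6, nr_parity = 2;		/* RAID6 */
	int nr_data = num_stripes - nr_parity;

	/* one 64K device stripe covers 256K of logical chunk space */
	unsigned long long rmap_len = stripe_len * nr_data;

	printf("rmap_len = %lluK\n", rmap_len / 1024);
	return 0;
}
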
@@ -4634,7 +5045,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
4634 bio->bi_bdev = (struct block_device *) 5045 bio->bi_bdev = (struct block_device *)
4635 (unsigned long)bbio->mirror_num; 5046 (unsigned long)bbio->mirror_num;
4636 /* only send an error to the higher layers if it is 5047 /* only send an error to the higher layers if it is
4637 * beyond the tolerance of the multi-bio 5048 * beyond the tolerance of the btrfs bio
4638 */ 5049 */
4639 if (atomic_read(&bbio->error) > bbio->max_errors) { 5050 if (atomic_read(&bbio->error) > bbio->max_errors) {
4640 err = -EIO; 5051 err = -EIO;
@@ -4668,13 +5079,18 @@ struct async_sched {
4668 * This will add one bio to the pending list for a device and make sure 5079 * This will add one bio to the pending list for a device and make sure
4669 * the work struct is scheduled. 5080 * the work struct is scheduled.
4670 */ 5081 */
4671static noinline void schedule_bio(struct btrfs_root *root, 5082noinline void btrfs_schedule_bio(struct btrfs_root *root,
4672 struct btrfs_device *device, 5083 struct btrfs_device *device,
4673 int rw, struct bio *bio) 5084 int rw, struct bio *bio)
4674{ 5085{
4675 int should_queue = 1; 5086 int should_queue = 1;
4676 struct btrfs_pending_bios *pending_bios; 5087 struct btrfs_pending_bios *pending_bios;
4677 5088
5089 if (device->missing || !device->bdev) {
5090 bio_endio(bio, -EIO);
5091 return;
5092 }
5093
4678 /* don't bother with additional async steps for reads, right now */ 5094 /* don't bother with additional async steps for reads, right now */
4679 if (!(rw & REQ_WRITE)) { 5095 if (!(rw & REQ_WRITE)) {
4680 bio_get(bio); 5096 bio_get(bio);
@@ -4772,7 +5188,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4772#endif 5188#endif
4773 bio->bi_bdev = dev->bdev; 5189 bio->bi_bdev = dev->bdev;
4774 if (async) 5190 if (async)
4775 schedule_bio(root, dev, rw, bio); 5191 btrfs_schedule_bio(root, dev, rw, bio);
4776 else 5192 else
4777 btrfsic_submit_bio(rw, bio); 5193 btrfsic_submit_bio(rw, bio);
4778} 5194}
@@ -4831,6 +5247,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4831 u64 logical = (u64)bio->bi_sector << 9; 5247 u64 logical = (u64)bio->bi_sector << 9;
4832 u64 length = 0; 5248 u64 length = 0;
4833 u64 map_length; 5249 u64 map_length;
5250 u64 *raid_map = NULL;
4834 int ret; 5251 int ret;
4835 int dev_nr = 0; 5252 int dev_nr = 0;
4836 int total_devs = 1; 5253 int total_devs = 1;
@@ -4839,12 +5256,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4839 length = bio->bi_size; 5256 length = bio->bi_size;
4840 map_length = length; 5257 map_length = length;
4841 5258
4842 ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5259 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
4843 mirror_num); 5260 mirror_num, &raid_map);
4844 if (ret) 5261 if (ret) /* -ENOMEM */
4845 return ret; 5262 return ret;
4846 5263
4847 total_devs = bbio->num_stripes; 5264 total_devs = bbio->num_stripes;
5265 bbio->orig_bio = first_bio;
5266 bbio->private = first_bio->bi_private;
5267 bbio->end_io = first_bio->bi_end_io;
5268 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5269
5270 if (raid_map) {
5271 /* In this case, map_length has been set to the length of
5272 a single stripe; not the whole write */
5273 if (rw & WRITE) {
5274 return raid56_parity_write(root, bio, bbio,
5275 raid_map, map_length);
5276 } else {
5277 return raid56_parity_recover(root, bio, bbio,
5278 raid_map, map_length,
5279 mirror_num);
5280 }
5281 }
5282
4848 if (map_length < length) { 5283 if (map_length < length) {
4849 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " 5284 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
4850 "len %llu\n", (unsigned long long)logical, 5285 "len %llu\n", (unsigned long long)logical,
@@ -4853,11 +5288,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4853 BUG(); 5288 BUG();
4854 } 5289 }
4855 5290
4856 bbio->orig_bio = first_bio;
4857 bbio->private = first_bio->bi_private;
4858 bbio->end_io = first_bio->bi_end_io;
4859 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4860
4861 while (dev_nr < total_devs) { 5291 while (dev_nr < total_devs) {
4862 dev = bbio->stripes[dev_nr].dev; 5292 dev = bbio->stripes[dev_nr].dev;
4863 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 5293 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
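
The reordering above matters because, for RAID5/6, __btrfs_map_block reports map_length as a single stripe (see the *length = map->stripe_len assignment earlier) even though the bio may span the chunk's whole data width, so the map_length < length check must not fire before the raid56 hand-off. Illustrative numbers for a 3-device RAID5 chunk with 64K stripes:

#include <stdio.h>

int main(void)
{
	unsigned long long stripe_len = 64 * 1024;
	int num_stripes = 3, nr_parity = 1;		/* RAID5 */
	int nr_data = num_stripes - nr_parity;

	unsigned long long map_length = stripe_len;		  /* per the hunk */
	unsigned long long full_stripe = stripe_len * nr_data;   /* 128K data */

	printf("map_length = %lluK, data per full stripe = %lluK\n",
	       map_length / 1024, full_stripe / 1024);
	return 0;
}

The fan-out across devices and the parity computation then happen in raid56_parity_write()/raid56_parity_recover(), not in the per-stripe submit loop below.
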
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d3c3939ac751..062d8604d35b 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -21,8 +21,8 @@
21 21
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h> 23#include <linux/sort.h>
24#include <linux/btrfs.h>
24#include "async-thread.h" 25#include "async-thread.h"
25#include "ioctl.h"
26 26
27#define BTRFS_STRIPE_LEN (64 * 1024) 27#define BTRFS_STRIPE_LEN (64 * 1024)
28 28
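
The volumes.c hunks above lean on nr_parity_stripes()/nr_data_stripes(), which this series adds in raid56.h rather than in this header. A standalone sketch of the presumed definitions; struct map_lookup is stubbed out and the flag bits are assumptions, only the arithmetic is the point:

#include <stdio.h>

#define BG_RAID5 (1ULL << 7)	/* assumed BTRFS_BLOCK_GROUP_RAID5 */
#define BG_RAID6 (1ULL << 8)	/* assumed BTRFS_BLOCK_GROUP_RAID6 */

struct map_lookup_stub {	/* stand-in for struct map_lookup */
	unsigned long long type;
	int num_stripes;
};

static inline int nr_parity_stripes(struct map_lookup_stub *map)
{
	if (map->type & BG_RAID5)
		return 1;
	if (map->type & BG_RAID6)
		return 2;
	return 0;
}

static inline int nr_data_stripes(struct map_lookup_stub *map)
{
	return map->num_stripes - nr_parity_stripes(map);
}

int main(void)
{
	struct map_lookup_stub m = { .type = BG_RAID6, .num_stripes = 6 };

	printf("parity=%d data=%d\n",
	       nr_parity_stripes(&m), nr_data_stripes(&m));
	return 0;
}
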
@@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
322 struct btrfs_device *tgtdev); 322 struct btrfs_device *tgtdev);
323int btrfs_scratch_superblock(struct btrfs_device *device); 323int btrfs_scratch_superblock(struct btrfs_device *device);
324 324void btrfs_schedule_bio(struct btrfs_root *root,
325 struct btrfs_device *device,
326 int rw, struct bio *bio);
327int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
328 u64 logical, u64 len, int mirror_num);
329unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
330 struct btrfs_mapping_tree *map_tree,
331 u64 logical);
325static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 332static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
326 int index) 333 int index)
327{ 334{