Diffstat (limited to 'fs')
 46 files changed, 5421 insertions(+), 1518 deletions(-)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ccd25ba7a9ac..9a8622a5b867 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -5,6 +5,9 @@ config BTRFS_FS
 	select ZLIB_DEFLATE
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
+	select RAID6_PQ
+	select XOR_BLOCKS
+
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7df3e0f0ee51..3932224f99e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 04edf69be875..bd605c87adfd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -352,11 +352,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		err = __resolve_indirect_ref(fs_info, search_commit_root,
 					     time_seq, ref, parents,
 					     extent_item_pos);
-		if (err) {
-			if (ret == 0)
-				ret = err;
+		if (err)
 			continue;
-		}
 
 		/* we put the first parent into the ref at hand */
 		ULIST_ITER_INIT(&uiter);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index d61feca79455..310a7f6d09b1 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -19,7 +19,7 @@
 #ifndef __BTRFS_BACKREF__
 #define __BTRFS_BACKREF__
 
-#include "ioctl.h"
+#include <linux/btrfs.h>
 #include "ulist.h"
 #include "extent_io.h"
 
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2a8c242bc4f5..d9b97d4960e6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -40,6 +40,8 @@
 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
 #define BTRFS_INODE_NEEDS_FULL_SYNC		7
 #define BTRFS_INODE_COPY_EVERYTHING		8
+#define BTRFS_INODE_IN_DELALLOC_LIST		9
+#define BTRFS_INODE_READDIO_NEED_LOCK		10
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -216,4 +218,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	return 0;
 }
 
+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
+{
+	set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
+	smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
+{
+	smp_mb__before_clear_bit();
+	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+		  &BTRFS_I(inode)->runtime_flags);
+}
+
 #endif
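
The two helpers added above pair the flag bit with explicit memory barriers: the blocking side sets the bit and issues smp_mb() so the bit is visible before any subsequent drain of in-flight direct IO, and the resume side orders its prior stores before clearing the bit. A minimal sketch of the truncate-side usage this enables, assuming an inode_dio_wait()-style drain; example_truncate_prep() is a hypothetical caller, not part of the patch:

#include <linux/fs.h>

/* Sketch only: a hypothetical truncate preparation path. */
static void example_truncate_prep(struct inode *inode)
{
	/*
	 * Force DIO reads that start after this point to take i_mutex;
	 * the smp_mb() in the helper orders the set_bit() against the
	 * reader's test_bit().
	 */
	btrfs_inode_block_unlocked_dio(inode);

	/* Drain DIO reads that were already in flight. */
	inode_dio_wait(inode);

	/* ... do the actual truncate work under i_mutex ... */

	/* Let later DIO reads go lockless again. */
	btrfs_inode_resume_unlocked_dio(inode);
}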
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 11d47bfb62b4..18af6f48781a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -813,8 +813,7 @@ static int btrfsic_process_superblock_dev_mirror(
 	    (bh->b_data + (dev_bytenr & 4095));
 
 	if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
-	    strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
-		    sizeof(super_tmp->magic)) ||
+	    super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) ||
 	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
 	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||
 	    btrfs_super_leafsize(super_tmp) != state->metablock_size ||
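
The rewritten magic check compares the 8-byte on-disk superblock magic as a single little-endian integer instead of strncmp()ing a string constant that carries no NUL terminator. A small sketch of the equivalent test, using the integer BTRFS_MAGIC that the ctree.h hunk below introduces:

#include <linux/types.h>
#include <asm/byteorder.h>

#define BTRFS_MAGIC 0x4D5F53665248425FULL	/* ascii _BHRfS_M, no null */

/* One integer compare replaces the old 8-byte strncmp(). */
static inline bool example_super_magic_ok(__le64 disk_magic)
{
	return disk_magic == cpu_to_le64(BTRFS_MAGIC);
}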
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 94ab2f80e7e3..15b94089abc4 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -372,7 +372,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		page = compressed_pages[pg_index];
 		page->mapping = inode->i_mapping;
 		if (bio->bi_size)
-			ret = io_tree->ops->merge_bio_hook(page, 0,
+			ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
 						   PAGE_CACHE_SIZE,
 						   bio, 0);
 		else
@@ -655,7 +655,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		page->index = em_start >> PAGE_CACHE_SHIFT;
 
 		if (comp_bio->bi_size)
-			ret = tree->ops->merge_bio_hook(page, 0,
+			ret = tree->ops->merge_bio_hook(READ, page, 0,
 							PAGE_CACHE_SIZE,
 							comp_bio, 0);
 		else
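
merge_bio_hook() now takes the IO direction as its first argument, and the compressed read/write paths pass READ or WRITE accordingly. The direction matters once RAID5/6 exists: a write must not be allowed to grow across a full-stripe boundary, while a read may map differently. A hedged sketch of a direction-aware hook; example_map_block() is a hypothetical stand-in for the chunk-mapping query, not a real btrfs function:

#include <linux/bio.h>

/* Hypothetical: reports in *map_length how far an IO starting at
 * `logical` may extend for the given direction. */
int example_map_block(int rw, u64 logical, u64 *map_length);

static int example_merge_bio_hook(int rw, struct page *page,
				  unsigned long offset, size_t size,
				  struct bio *bio, unsigned long bio_flags)
{
	u64 logical = (u64)bio->bi_sector << 9;
	u64 length = bio->bi_size + size;
	u64 map_length = length;
	int ret;

	ret = example_map_block(rw, logical, &map_length);
	if (ret < 0)
		return ret;
	/* Nonzero means "do not merge": the bio would cross a boundary. */
	return map_length < length;
}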
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index eea5da7a2b9a..ecd25a1b4e51 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1138,6 +1138,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 	switch (tm->op) {
 	case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
 		BUG_ON(tm->slot < n);
+		/* Fallthrough */
 	case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
 	case MOD_LOG_KEY_REMOVE:
 		btrfs_set_node_key(eb, &tm->key, tm->slot);
@@ -1222,7 +1223,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 
 	__tree_mod_log_rewind(eb_rewin, time_seq, tm);
 	WARN_ON(btrfs_header_nritems(eb_rewin) >
-		BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root));
+		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
 
 	return eb_rewin;
 }
@@ -1441,7 +1442,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
  */
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, int cache_only, u64 *last_ret,
+		       int start_slot, u64 *last_ret,
 		       struct btrfs_key *progress)
 {
 	struct extent_buffer *cur;
@@ -1461,8 +1462,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	struct btrfs_disk_key disk_key;
 
 	parent_level = btrfs_header_level(parent);
-	if (cache_only && parent_level != 1)
-		return 0;
 
 	WARN_ON(trans->transaction != root->fs_info->running_transaction);
 	WARN_ON(trans->transid != root->fs_info->generation);
@@ -1508,10 +1507,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		else
 			uptodate = 0;
 		if (!cur || !uptodate) {
-			if (cache_only) {
-				free_extent_buffer(cur);
-				continue;
-			}
 			if (!cur) {
 				cur = read_tree_block(root, blocknr,
 						      blocksize, gen);
@@ -4825,8 +4820,8 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 
 /*
  * A helper function to walk down the tree starting at min_key, and looking
- * for nodes or leaves that are either in cache or have a minimum
- * transaction id. This is used by the btree defrag code, and tree logging
+ * for nodes or leaves that have a minimum transaction id.
+ * This is used by the btree defrag code, and tree logging
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
@@ -4847,7 +4842,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  */
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 			 struct btrfs_key *max_key,
-			 struct btrfs_path *path, int cache_only,
+			 struct btrfs_path *path,
 			 u64 min_trans)
 {
 	struct extent_buffer *cur;
@@ -4887,15 +4882,12 @@ again:
 		if (sret && slot > 0)
 			slot--;
 		/*
-		 * check this node pointer against the cache_only and
-		 * min_trans parameters. If it isn't in cache or is too
-		 * old, skip to the next one.
+		 * check this node pointer against the min_trans parameter.
+		 * If it is too old, skip to the next one.
 		 */
 		while (slot < nritems) {
 			u64 blockptr;
 			u64 gen;
-			struct extent_buffer *tmp;
-			struct btrfs_disk_key disk_key;
 
 			blockptr = btrfs_node_blockptr(cur, slot);
 			gen = btrfs_node_ptr_generation(cur, slot);
@@ -4903,27 +4895,7 @@ again:
 				slot++;
 				continue;
 			}
-			if (!cache_only)
-				break;
-
-			if (max_key) {
-				btrfs_node_key(cur, &disk_key, slot);
-				if (comp_keys(&disk_key, max_key) >= 0) {
-					ret = 1;
-					goto out;
-				}
-			}
-
-			tmp = btrfs_find_tree_block(root, blockptr,
-					    btrfs_level_size(root, level - 1));
-
-			if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
-				free_extent_buffer(tmp);
-				break;
-			}
-			if (tmp)
-				free_extent_buffer(tmp);
-			slot++;
+			break;
 		}
 find_next_key:
 		/*
@@ -4934,7 +4906,7 @@ find_next_key:
 			path->slots[level] = slot;
 			btrfs_set_path_blocking(path);
 			sret = btrfs_find_next_key(root, path, min_key, level,
-						  cache_only, min_trans);
+						  min_trans);
 			if (sret == 0) {
 				btrfs_release_path(path);
 				goto again;
@@ -5399,8 +5371,7 @@ out:
 /*
  * this is similar to btrfs_next_leaf, but does not try to preserve
  * and fixup the path.  It looks for and returns the next key in the
- * tree based on the current path and the cache_only and min_trans
- * parameters.
+ * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
@@ -5409,8 +5380,7 @@ out:
  * calling this function.
  */
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-			struct btrfs_key *key, int level,
-			int cache_only, u64 min_trans)
+			struct btrfs_key *key, int level, u64 min_trans)
 {
 	int slot;
 	struct extent_buffer *c;
@@ -5461,22 +5431,8 @@ next:
 		if (level == 0)
 			btrfs_item_key_to_cpu(c, key, slot);
 		else {
-			u64 blockptr = btrfs_node_blockptr(c, slot);
 			u64 gen = btrfs_node_ptr_generation(c, slot);
 
-			if (cache_only) {
-				struct extent_buffer *cur;
-				cur = btrfs_find_tree_block(root, blockptr,
-					    btrfs_level_size(root, level - 1));
-				if (!cur ||
-				    btrfs_buffer_uptodate(cur, gen, 1) <= 0) {
-					slot++;
-					if (cur)
-						free_extent_buffer(cur);
-					goto next;
-				}
-				free_extent_buffer(cur);
-			}
 			if (gen < min_trans) {
 				slot++;
 				goto next;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 547b7b05727f..0d82922179db 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -31,10 +31,10 @@
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
 #include <linux/pagemap.h>
+#include <linux/btrfs.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
-#include "ioctl.h"
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
@@ -46,7 +46,7 @@ extern struct kmem_cache *btrfs_path_cachep;
 extern struct kmem_cache *btrfs_free_space_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_BHRfS_M"
+#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
 
 #define BTRFS_MAX_MIRRORS 3
 
@@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 /* ioprio of readahead is set to idle */
 #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
 
+#define BTRFS_DIRTY_METADATA_THRESH	(32 * 1024 * 1024)
+
 /*
  * The key defines the order in the tree, and so it also defines (optimal)
  * block layout.
@@ -336,7 +338,10 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 /*
  * File system states
  */
+#define BTRFS_FS_STATE_ERROR		0
+#define BTRFS_FS_STATE_REMOUNTING	1
 
+/* Super block flags */
 /* Errors detected */
 #define BTRFS_SUPER_FLAG_ERROR		(1ULL << 2)
 
@@ -502,6 +507,7 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)
 
 #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
@@ -511,6 +517,7 @@ struct btrfs_super_block {
 	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
 	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
 	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
+	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
 	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
 
 /*
@@ -952,8 +959,20 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
 #define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
 #define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5		(1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6		(1 << 8)
 #define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE
-#define BTRFS_NR_RAID_TYPES		5
+
+enum btrfs_raid_types {
+	BTRFS_RAID_RAID10,
+	BTRFS_RAID_RAID1,
+	BTRFS_RAID_DUP,
+	BTRFS_RAID_RAID0,
+	BTRFS_RAID_SINGLE,
+	BTRFS_RAID_RAID5,
+	BTRFS_RAID_RAID6,
+	BTRFS_NR_RAID_TYPES
+};
 
 #define BTRFS_BLOCK_GROUP_TYPE_MASK	(BTRFS_BLOCK_GROUP_DATA |    \
 					 BTRFS_BLOCK_GROUP_SYSTEM |  \
@@ -961,6 +980,8 @@ struct btrfs_dev_replace_item {
 
 #define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \
 					 BTRFS_BLOCK_GROUP_RAID1 |   \
+					 BTRFS_BLOCK_GROUP_RAID5 |   \
+					 BTRFS_BLOCK_GROUP_RAID6 |   \
 					 BTRFS_BLOCK_GROUP_DUP |     \
 					 BTRFS_BLOCK_GROUP_RAID10)
 /*
@@ -1185,6 +1206,10 @@ struct btrfs_block_group_cache {
 	u64 flags;
 	u64 sectorsize;
 	u64 cache_generation;
+
+	/* for raid56, this is a full stripe, without parity */
+	unsigned long full_stripe_len;
+
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
@@ -1225,6 +1250,28 @@ struct seq_list {
 	u64 seq;
 };
 
+enum btrfs_orphan_cleanup_state {
+	ORPHAN_CLEANUP_STARTED	= 1,
+	ORPHAN_CLEANUP_DONE	= 2,
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+	struct list_head hash_list;
+	wait_queue_head_t wait;
+	spinlock_t lock;
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+	struct list_head stripe_cache;
+	spinlock_t cache_lock;
+	int cache_size;
+	struct btrfs_stripe_hash table[];
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
 /* fs_info */
 struct reloc_control;
 struct btrfs_device;
@@ -1250,6 +1297,7 @@ struct btrfs_fs_info {
 
 	/* block group cache stuff */
 	spinlock_t block_group_cache_lock;
+	u64 first_logical_byte;
 	struct rb_root block_group_cache_tree;
 
 	/* keep track of unallocated space */
1255 | /* keep track of unallocated space */ | 1303 | /* keep track of unallocated space */ |
@@ -1288,7 +1336,23 @@ struct btrfs_fs_info { | |||
1288 | u64 last_trans_log_full_commit; | 1336 | u64 last_trans_log_full_commit; |
1289 | unsigned long mount_opt; | 1337 | unsigned long mount_opt; |
1290 | unsigned long compress_type:4; | 1338 | unsigned long compress_type:4; |
1339 | /* | ||
1340 | * It is a suggestive number, the read side is safe even it gets a | ||
1341 | * wrong number because we will write out the data into a regular | ||
1342 | * extent. The write side(mount/remount) is under ->s_umount lock, | ||
1343 | * so it is also safe. | ||
1344 | */ | ||
1291 | u64 max_inline; | 1345 | u64 max_inline; |
1346 | /* | ||
1347 | * Protected by ->chunk_mutex and sb->s_umount. | ||
1348 | * | ||
1349 | * The reason that we use two lock to protect it is because only | ||
1350 | * remount and mount operations can change it and these two operations | ||
1351 | * are under sb->s_umount, but the read side (chunk allocation) can not | ||
1352 | * acquire sb->s_umount or the deadlock would happen. So we use two | ||
1353 | * locks to protect it. On the write side, we must acquire two locks, | ||
1354 | * and on the read side, we just need acquire one of them. | ||
1355 | */ | ||
1292 | u64 alloc_start; | 1356 | u64 alloc_start; |
1293 | struct btrfs_transaction *running_transaction; | 1357 | struct btrfs_transaction *running_transaction; |
1294 | wait_queue_head_t transaction_throttle; | 1358 | wait_queue_head_t transaction_throttle; |
@@ -1307,6 +1371,13 @@ struct btrfs_fs_info {
 	struct mutex cleaner_mutex;
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
+
+	/* this is used during read/modify/write to make sure
+	 * no two ios are trying to mod the same stripe at the same
+	 * time
+	 */
+	struct btrfs_stripe_hash_table *stripe_hash_table;
+
 	/*
 	 * this protects the ordered operations list only while we are
 	 * processing all of the entries on it. This way we make
@@ -1365,6 +1436,7 @@ struct btrfs_fs_info {
 	 */
 	struct list_head ordered_extents;
 
+	spinlock_t delalloc_lock;
 	/*
 	 * all of the inodes that have delalloc bytes.  It is possible for
 	 * this list to be empty even when there is still dirty data=ordered
@@ -1373,13 +1445,6 @@ struct btrfs_fs_info {
 	struct list_head delalloc_inodes;
 
 	/*
-	 * special rename and truncate targets that must be on disk before
-	 * we're allowed to commit.  This is basically the ext3 style
-	 * data=ordered list.
-	 */
-	struct list_head ordered_operations;
-
-	/*
 	 * there is a pool of worker threads for checksumming during writes
 	 * and a pool for checksumming after reads.  This is because readers
 	 * can run with FS locks held, and the writers may be waiting for
@@ -1395,6 +1460,8 @@ struct btrfs_fs_info {
 	struct btrfs_workers flush_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_meta_workers;
+	struct btrfs_workers endio_raid56_workers;
+	struct btrfs_workers rmw_workers;
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers endio_freespace_worker;
@@ -1423,10 +1490,12 @@ struct btrfs_fs_info {
 
 	u64 total_pinned;
 
-	/* protected by the delalloc lock, used to keep from writing
-	 * metadata until there is a nice batch
-	 */
-	u64 dirty_metadata_bytes;
+	/* used to keep from writing metadata until there is a nice batch */
+	struct percpu_counter dirty_metadata_bytes;
+	struct percpu_counter delalloc_bytes;
+	s32 dirty_metadata_batch;
+	s32 delalloc_batch;
+
 	struct list_head dirty_cowonly_roots;
 
 	struct btrfs_fs_devices *fs_devices;
@@ -1442,9 +1511,6 @@ struct btrfs_fs_info {
 
 	struct reloc_control *reloc_ctl;
 
-	spinlock_t delalloc_lock;
-	u64 delalloc_bytes;
-
 	/* data_alloc_cluster is only used in ssd mode */
 	struct btrfs_free_cluster data_alloc_cluster;
 
@@ -1456,6 +1522,8 @@ struct btrfs_fs_info {
 	struct rb_root defrag_inodes;
 	atomic_t defrag_running;
 
+	/* Used to protect avail_{data, metadata, system}_alloc_bits */
+	seqlock_t profiles_lock;
 	/*
 	 * these three are in extended format (availability of single
 	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1520,7 +1588,7 @@ struct btrfs_fs_info {
 	u64 qgroup_seq;
 
 	/* filesystem state */
-	u64 fs_state;
+	unsigned long fs_state;
 
 	struct btrfs_delayed_root *delayed_root;
 
@@ -1623,6 +1691,9 @@ struct btrfs_root {
 
 	struct list_head root_list;
 
+	spinlock_t log_extents_lock[2];
+	struct list_head logged_list[2];
+
 	spinlock_t orphan_lock;
 	atomic_t orphan_inodes;
 	struct btrfs_block_rsv *orphan_block_rsv;
@@ -1832,6 +1903,7 @@ struct btrfs_ioctl_defrag_range_args {
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt)	((o) & BTRFS_MOUNT_##opt)
 #define btrfs_test_opt(root, opt)	((root)->fs_info->mount_opt & \
 					 BTRFS_MOUNT_##opt)
 /*
@@ -2936,8 +3008,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
 				    u64 bytenr, u64 num_bytes);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
@@ -3035,8 +3106,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 				  struct inode *inode);
 void btrfs_orphan_release_metadata(struct inode *inode);
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending);
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+				     struct btrfs_block_rsv *rsv,
+				     int nitems,
+				     u64 *qgroup_reserved);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+				      struct btrfs_block_rsv *rsv,
+				      u64 qgroup_reserved);
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
@@ -3092,10 +3168,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 			struct btrfs_key *key, int lowest_level,
-			int cache_only, u64 min_trans);
+			u64 min_trans);
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 			 struct btrfs_key *max_key,
-			 struct btrfs_path *path, int cache_only,
+			 struct btrfs_path *path,
 			 u64 min_trans);
 enum btrfs_compare_tree_result {
 	BTRFS_COMPARE_TREE_NEW,
@@ -3148,7 +3224,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
 			       int find_higher, int return_any);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, int cache_only, u64 *last_ret,
+		       int start_slot, u64 *last_ret,
 		       struct btrfs_key *progress);
 void btrfs_release_path(struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
@@ -3459,9 +3535,9 @@ int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *new_root, u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio, unsigned long bio_flags);
-
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+			 size_t size, struct bio *bio,
+			 unsigned long bio_flags);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
@@ -3543,7 +3619,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root, int cache_only);
+			struct btrfs_root *root);
 
 /* sysfs.c */
 int btrfs_init_sysfs(void);
@@ -3620,11 +3696,14 @@ __printf(5, 6)
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
 		   unsigned int line, int errno, const char *fmt, ...);
 
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic().  Otherwise we BUG() here.
+ */
 #define btrfs_panic(fs_info, errno, fmt, args...)			\
 do {									\
-	struct btrfs_fs_info *_i = (fs_info);				\
-	__btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args);	\
-	BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR));	\
+	__btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args);	\
+	BUG();								\
 } while (0)
 
 /* acl.c */
@@ -3745,4 +3824,11 @@ static inline int is_fstree(u64 rootid)
 		return 1;
 	return 0;
 }
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+	return signal_pending(current);
+}
+
+
 #endif
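
Worth pausing on the fs_info changes above: dirty_metadata_bytes and delalloc_bytes turn from plain u64s guarded by delalloc_lock into per-cpu counters with a batch size, so hot paths update a per-cpu delta and only fold into the shared count every batch's worth of bytes. A sketch of the accounting pattern, assuming the fields declared above; example_writeback_kick() is hypothetical:

#include <linux/percpu_counter.h>

void example_writeback_kick(void);	/* hypothetical */

/* Cheap on the hot path: __percpu_counter_add() touches only the
 * per-cpu delta until it exceeds the batch. */
static void example_dirty_metadata(struct btrfs_fs_info *fs_info, s64 bytes)
{
	__percpu_counter_add(&fs_info->dirty_metadata_bytes, bytes,
			     fs_info->dirty_metadata_batch);

	/* percpu_counter_compare() is only precise near the threshold,
	 * which is all this writeback heuristic needs. */
	if (percpu_counter_compare(&fs_info->dirty_metadata_bytes,
				   BTRFS_DIRTY_METADATA_THRESH) > 0)
		example_writeback_kick();
}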
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 34836036f01b..0b278b117cbe 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -875,7 +875,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 				     struct btrfs_delayed_item *delayed_item)
 {
 	struct extent_buffer *leaf;
-	struct btrfs_item *item;
 	char *ptr;
 	int ret;
 
@@ -886,7 +885,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 
 	leaf = path->nodes[0];
 
-	item = btrfs_item_nr(leaf, path->slots[0]);
 	ptr = btrfs_item_ptr(leaf, path->slots[0], char);
 
 	write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
@@ -1065,32 +1063,25 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
 	}
 }
 
-static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
-				      struct btrfs_root *root,
-				      struct btrfs_path *path,
-				      struct btrfs_delayed_node *node)
+static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct btrfs_delayed_node *node)
 {
 	struct btrfs_key key;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
 	int ret;
 
-	mutex_lock(&node->mutex);
-	if (!node->inode_dirty) {
-		mutex_unlock(&node->mutex);
-		return 0;
-	}
-
 	key.objectid = node->inode_id;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
+
 	ret = btrfs_lookup_inode(trans, root, path, &key, 1);
 	if (ret > 0) {
 		btrfs_release_path(path);
-		mutex_unlock(&node->mutex);
 		return -ENOENT;
 	} else if (ret < 0) {
-		mutex_unlock(&node->mutex);
 		return ret;
 	}
 
@@ -1105,11 +1096,47 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 
 	btrfs_delayed_inode_release_metadata(root, node);
 	btrfs_release_delayed_inode(node);
-	mutex_unlock(&node->mutex);
 
 	return 0;
 }
 
+static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     struct btrfs_path *path,
+					     struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	mutex_lock(&node->mutex);
+	if (!node->inode_dirty) {
+		mutex_unlock(&node->mutex);
+		return 0;
+	}
+
+	ret = __btrfs_update_delayed_inode(trans, root, path, node);
+	mutex_unlock(&node->mutex);
+	return ret;
+}
+
+static inline int
+__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				   struct btrfs_path *path,
+				   struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_delete_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_update_delayed_inode(trans, node->root, path, node);
+	return ret;
+}
+
 /*
  * Called when committing the transaction.
  * Returns 0 on success.
@@ -1119,7 +1146,6 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root, int nr)
 {
-	struct btrfs_root *curr_root = root;
 	struct btrfs_delayed_root *delayed_root;
 	struct btrfs_delayed_node *curr_node, *prev_node;
 	struct btrfs_path *path;
@@ -1142,15 +1168,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 
 	curr_node = btrfs_first_delayed_node(delayed_root);
 	while (curr_node && (!count || (count && nr--))) {
-		curr_root = curr_node->root;
-		ret = btrfs_insert_delayed_items(trans, path, curr_root,
-						 curr_node);
-		if (!ret)
-			ret = btrfs_delete_delayed_items(trans, path,
-							 curr_root, curr_node);
-		if (!ret)
-			ret = btrfs_update_delayed_inode(trans, curr_root,
-							 path, curr_node);
+		ret = __btrfs_commit_inode_delayed_items(trans, path,
+							 curr_node);
 		if (ret) {
 			btrfs_release_delayed_node(curr_node);
 			curr_node = NULL;
@@ -1183,51 +1202,93 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
 	return __btrfs_run_delayed_items(trans, root, nr);
 }
 
-static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-					      struct btrfs_delayed_node *node)
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				     struct inode *inode)
 {
+	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
 	struct btrfs_path *path;
 	struct btrfs_block_rsv *block_rsv;
 	int ret;
 
+	if (!delayed_node)
+		return 0;
+
+	mutex_lock(&delayed_node->mutex);
+	if (!delayed_node->count) {
+		mutex_unlock(&delayed_node->mutex);
+		btrfs_release_delayed_node(delayed_node);
+		return 0;
+	}
+	mutex_unlock(&delayed_node->mutex);
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 	path->leave_spinning = 1;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
 
-	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
-	if (!ret)
-		ret = btrfs_delete_delayed_items(trans, path, node->root, node);
-	if (!ret)
-		ret = btrfs_update_delayed_inode(trans, node->root, path, node);
-	btrfs_free_path(path);
+	ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
 
+	btrfs_release_delayed_node(delayed_node);
+	btrfs_free_path(path);
 	trans->block_rsv = block_rsv;
+
 	return ret;
 }
 
-int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-				     struct inode *inode)
+int btrfs_commit_inode_delayed_inode(struct inode *inode)
 {
+	struct btrfs_trans_handle *trans;
 	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+	struct btrfs_path *path;
+	struct btrfs_block_rsv *block_rsv;
 	int ret;
 
 	if (!delayed_node)
 		return 0;
 
 	mutex_lock(&delayed_node->mutex);
-	if (!delayed_node->count) {
+	if (!delayed_node->inode_dirty) {
 		mutex_unlock(&delayed_node->mutex);
 		btrfs_release_delayed_node(delayed_node);
 		return 0;
 	}
 	mutex_unlock(&delayed_node->mutex);
 
-	ret = __btrfs_commit_inode_delayed_items(trans, delayed_node);
+	trans = btrfs_join_transaction(delayed_node->root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto trans_out;
+	}
+	path->leave_spinning = 1;
+
+	block_rsv = trans->block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+	mutex_lock(&delayed_node->mutex);
+	if (delayed_node->inode_dirty)
+		ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
+						   path, delayed_node);
+	else
+		ret = 0;
+	mutex_unlock(&delayed_node->mutex);
+
+	btrfs_free_path(path);
+	trans->block_rsv = block_rsv;
+trans_out:
+	btrfs_end_transaction(trans, delayed_node->root);
+	btrfs_btree_balance_dirty(delayed_node->root);
+out:
 	btrfs_release_delayed_node(delayed_node);
+
 	return ret;
 }
 
@@ -1258,7 +1319,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 	struct btrfs_root *root;
 	struct btrfs_block_rsv *block_rsv;
 	int need_requeue = 0;
-	int ret;
 
 	async_node = container_of(work, struct btrfs_async_delayed_node, work);
 
@@ -1277,14 +1337,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 	block_rsv = trans->block_rsv;
 	trans->block_rsv = &root->fs_info->delayed_block_rsv;
 
-	ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
-	if (!ret)
-		ret = btrfs_delete_delayed_items(trans, path, root,
-						 delayed_node);
-
-	if (!ret)
-		btrfs_update_delayed_inode(trans, root, path, delayed_node);
-
+	__btrfs_commit_inode_delayed_items(trans, path, delayed_node);
 	/*
 	 * Maybe new delayed items have been inserted, so we need requeue
 	 * the work. Besides that, we must dequeue the empty delayed nodes
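
The delayed-inode rework above follows the kernel's usual lock-split convention: __btrfs_update_delayed_inode() does the btree work and assumes the caller holds delayed_node->mutex, while the wrappers take the mutex and check the dirty state, and the three-step commit sequence (insert items, delete items, update the inode item) now lives in one helper instead of being open-coded three times. A generic, hedged sketch of that idiom with hypothetical names, not btrfs code:

#include <linux/mutex.h>

struct example {
	struct mutex mutex;
	int dirty;
};

/* Bare worker: caller must hold e->mutex (the "__" convention). */
static int __example_flush(struct example *e)
{
	e->dirty = 0;		/* ... the real btree work goes here ... */
	return 0;
}

/* Locking wrapper for callers that do not hold the mutex. */
static int example_flush(struct example *e)
{
	int ret;

	mutex_lock(&e->mutex);
	if (!e->dirty) {
		mutex_unlock(&e->mutex);
		return 0;
	}
	ret = __example_flush(e);
	mutex_unlock(&e->mutex);
	return ret;
}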
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 4f808e1baeed..78b6ad0fc669 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -117,6 +117,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
 /* Used for evicting the inode. */
 void btrfs_remove_delayed_node(struct inode *inode);
 void btrfs_kill_delayed_inode_items(struct inode *inode);
+int btrfs_commit_inode_delayed_inode(struct inode *inode);
 
 
 int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ae9411773397..b7a0641ead77 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c | |||
@@ -23,6 +23,10 @@ | |||
23 | #include "delayed-ref.h" | 23 | #include "delayed-ref.h" |
24 | #include "transaction.h" | 24 | #include "transaction.h" |
25 | 25 | ||
26 | struct kmem_cache *btrfs_delayed_ref_head_cachep; | ||
27 | struct kmem_cache *btrfs_delayed_tree_ref_cachep; | ||
28 | struct kmem_cache *btrfs_delayed_data_ref_cachep; | ||
29 | struct kmem_cache *btrfs_delayed_extent_op_cachep; | ||
26 | /* | 30 | /* |
27 | * delayed back reference update tracking. For subvolume trees | 31 | * delayed back reference update tracking. For subvolume trees |
28 | * we queue up extent allocations and backref maintenance for | 32 | * we queue up extent allocations and backref maintenance for |
@@ -422,6 +426,14 @@ again: | |||
422 | return 1; | 426 | return 1; |
423 | } | 427 | } |
424 | 428 | ||
429 | void btrfs_release_ref_cluster(struct list_head *cluster) | ||
430 | { | ||
431 | struct list_head *pos, *q; | ||
432 | |||
433 | list_for_each_safe(pos, q, cluster) | ||
434 | list_del_init(pos); | ||
435 | } | ||
436 | |||
425 | /* | 437 | /* |
426 | * helper function to update an extent delayed ref in the | 438 | * helper function to update an extent delayed ref in the |
427 | * rbtree. existing and update must both have the same | 439 | * rbtree. existing and update must both have the same |
@@ -511,7 +523,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, | |||
511 | ref->extent_op->flags_to_set; | 523 | ref->extent_op->flags_to_set; |
512 | existing_ref->extent_op->update_flags = 1; | 524 | existing_ref->extent_op->update_flags = 1; |
513 | } | 525 | } |
514 | kfree(ref->extent_op); | 526 | btrfs_free_delayed_extent_op(ref->extent_op); |
515 | } | 527 | } |
516 | } | 528 | } |
517 | /* | 529 | /* |
@@ -592,7 +604,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, | |||
592 | * we've updated the existing ref, free the newly | 604 | * we've updated the existing ref, free the newly |
593 | * allocated ref | 605 | * allocated ref |
594 | */ | 606 | */ |
595 | kfree(head_ref); | 607 | kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); |
596 | } else { | 608 | } else { |
597 | delayed_refs->num_heads++; | 609 | delayed_refs->num_heads++; |
598 | delayed_refs->num_heads_ready++; | 610 | delayed_refs->num_heads_ready++; |
@@ -653,7 +665,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, | |||
653 | * we've updated the existing ref, free the newly | 665 | * we've updated the existing ref, free the newly |
654 | * allocated ref | 666 | * allocated ref |
655 | */ | 667 | */ |
656 | kfree(full_ref); | 668 | kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); |
657 | } else { | 669 | } else { |
658 | delayed_refs->num_entries++; | 670 | delayed_refs->num_entries++; |
659 | trans->delayed_ref_updates++; | 671 | trans->delayed_ref_updates++; |
@@ -714,7 +726,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, | |||
714 | * we've updated the existing ref, free the newly | 726 | * we've updated the existing ref, free the newly |
715 | * allocated ref | 727 | * allocated ref |
716 | */ | 728 | */ |
717 | kfree(full_ref); | 729 | kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); |
718 | } else { | 730 | } else { |
719 | delayed_refs->num_entries++; | 731 | delayed_refs->num_entries++; |
720 | trans->delayed_ref_updates++; | 732 | trans->delayed_ref_updates++; |
@@ -738,13 +750,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, | |||
738 | struct btrfs_delayed_ref_root *delayed_refs; | 750 | struct btrfs_delayed_ref_root *delayed_refs; |
739 | 751 | ||
740 | BUG_ON(extent_op && extent_op->is_data); | 752 | BUG_ON(extent_op && extent_op->is_data); |
741 | ref = kmalloc(sizeof(*ref), GFP_NOFS); | 753 | ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); |
742 | if (!ref) | 754 | if (!ref) |
743 | return -ENOMEM; | 755 | return -ENOMEM; |
744 | 756 | ||
745 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); | 757 | head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); |
746 | if (!head_ref) { | 758 | if (!head_ref) { |
747 | kfree(ref); | 759 | kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); |
748 | return -ENOMEM; | 760 | return -ENOMEM; |
749 | } | 761 | } |
750 | 762 | ||
@@ -786,13 +798,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, | |||
786 | struct btrfs_delayed_ref_root *delayed_refs; | 798 | struct btrfs_delayed_ref_root *delayed_refs; |
787 | 799 | ||
788 | BUG_ON(extent_op && !extent_op->is_data); | 800 | BUG_ON(extent_op && !extent_op->is_data); |
789 | ref = kmalloc(sizeof(*ref), GFP_NOFS); | 801 | ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); |
790 | if (!ref) | 802 | if (!ref) |
791 | return -ENOMEM; | 803 | return -ENOMEM; |
792 | 804 | ||
793 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); | 805 | head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); |
794 | if (!head_ref) { | 806 | if (!head_ref) { |
795 | kfree(ref); | 807 | kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); |
796 | return -ENOMEM; | 808 | return -ENOMEM; |
797 | } | 809 | } |
798 | 810 | ||
@@ -826,7 +838,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, | |||
826 | struct btrfs_delayed_ref_head *head_ref; | 838 | struct btrfs_delayed_ref_head *head_ref; |
827 | struct btrfs_delayed_ref_root *delayed_refs; | 839 | struct btrfs_delayed_ref_root *delayed_refs; |
828 | 840 | ||
829 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); | 841 | head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); |
830 | if (!head_ref) | 842 | if (!head_ref) |
831 | return -ENOMEM; | 843 | return -ENOMEM; |
832 | 844 | ||
@@ -860,3 +872,51 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) | |||
860 | return btrfs_delayed_node_to_head(ref); | 872 | return btrfs_delayed_node_to_head(ref); |
861 | return NULL; | 873 | return NULL; |
862 | } | 874 | } |
875 | |||
876 | void btrfs_delayed_ref_exit(void) | ||
877 | { | ||
878 | if (btrfs_delayed_ref_head_cachep) | ||
879 | kmem_cache_destroy(btrfs_delayed_ref_head_cachep); | ||
880 | if (btrfs_delayed_tree_ref_cachep) | ||
881 | kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); | ||
882 | if (btrfs_delayed_data_ref_cachep) | ||
883 | kmem_cache_destroy(btrfs_delayed_data_ref_cachep); | ||
884 | if (btrfs_delayed_extent_op_cachep) | ||
885 | kmem_cache_destroy(btrfs_delayed_extent_op_cachep); | ||
886 | } | ||
887 | |||
888 | int btrfs_delayed_ref_init(void) | ||
889 | { | ||
890 | btrfs_delayed_ref_head_cachep = kmem_cache_create( | ||
891 | "btrfs_delayed_ref_head", | ||
892 | sizeof(struct btrfs_delayed_ref_head), 0, | ||
893 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); | ||
894 | if (!btrfs_delayed_ref_head_cachep) | ||
895 | goto fail; | ||
896 | |||
897 | btrfs_delayed_tree_ref_cachep = kmem_cache_create( | ||
898 | "btrfs_delayed_tree_ref", | ||
899 | sizeof(struct btrfs_delayed_tree_ref), 0, | ||
900 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); | ||
901 | if (!btrfs_delayed_tree_ref_cachep) | ||
902 | goto fail; | ||
903 | |||
904 | btrfs_delayed_data_ref_cachep = kmem_cache_create( | ||
905 | "btrfs_delayed_data_ref", | ||
906 | sizeof(struct btrfs_delayed_data_ref), 0, | ||
907 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); | ||
908 | if (!btrfs_delayed_data_ref_cachep) | ||
909 | goto fail; | ||
910 | |||
911 | btrfs_delayed_extent_op_cachep = kmem_cache_create( | ||
912 | "btrfs_delayed_extent_op", | ||
913 | sizeof(struct btrfs_delayed_extent_op), 0, | ||
914 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); | ||
915 | if (!btrfs_delayed_extent_op_cachep) | ||
916 | goto fail; | ||
917 | |||
918 | return 0; | ||
919 | fail: | ||
920 | btrfs_delayed_ref_exit(); | ||
921 | return -ENOMEM; | ||
922 | } | ||
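These four caches move the delayed-ref objects off generic kmalloc buckets: they are small, fixed-size and churn heavily, so dedicated slabs pack tighter and show up by name in /proc/slabinfo. The init/exit pair is presumably wired into the module's init_btrfs_fs()/exit_btrfs_fs() (not part of this hunk). The lifecycle, reduced to a self-contained sketch with placeholder names (foo, foo_cachep):

	struct foo { u64 bytenr; };
	static struct kmem_cache *foo_cachep;

	static int __init foo_init(void)
	{
		foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
		return foo_cachep ? 0 : -ENOMEM;
	}

	static void __exit foo_exit(void)
	{
		if (foo_cachep)
			kmem_cache_destroy(foo_cachep);
	}

	/* per-object churn then becomes:
	 *	f = kmem_cache_alloc(foo_cachep, GFP_NOFS);
	 *	kmem_cache_free(foo_cachep, f);
	 */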
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c9d703693df0..f75fcaf79aeb 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h | |||
@@ -132,6 +132,15 @@ struct btrfs_delayed_ref_root { | |||
132 | unsigned long num_heads_ready; | 132 | unsigned long num_heads_ready; |
133 | 133 | ||
134 | /* | 134 | /* |
135 | * bumped when someone is making progress on the delayed | ||
136 | * refs, so that other procs know they are just adding to | ||
137 | * contention instead of helping | ||
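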
138 | */ | ||
139 | atomic_t procs_running_refs; | ||
140 | atomic_t ref_seq; | ||
141 | wait_queue_head_t wait; | ||
142 | |||
143 | /* | ||
135 | * set when the tree is flushing before a transaction commit, | 144 | * set when the tree is flushing before a transaction commit, |
136 | * used by the throttling code to decide if new updates need | 145 | * used by the throttling code to decide if new updates need |
137 | * to be run right away | 146 | * to be run right away |
@@ -141,12 +150,47 @@ struct btrfs_delayed_ref_root { | |||
141 | u64 run_delayed_start; | 150 | u64 run_delayed_start; |
142 | }; | 151 | }; |
143 | 152 | ||
153 | extern struct kmem_cache *btrfs_delayed_ref_head_cachep; | ||
154 | extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; | ||
155 | extern struct kmem_cache *btrfs_delayed_data_ref_cachep; | ||
156 | extern struct kmem_cache *btrfs_delayed_extent_op_cachep; | ||
157 | |||
158 | int btrfs_delayed_ref_init(void); | ||
159 | void btrfs_delayed_ref_exit(void); | ||
160 | |||
161 | static inline struct btrfs_delayed_extent_op * | ||
162 | btrfs_alloc_delayed_extent_op(void) | ||
163 | { | ||
164 | return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS); | ||
165 | } | ||
166 | |||
167 | static inline void | ||
168 | btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) | ||
169 | { | ||
170 | if (op) | ||
171 | kmem_cache_free(btrfs_delayed_extent_op_cachep, op); | ||
172 | } | ||
173 | |||
144 | static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) | 174 | static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) |
145 | { | 175 | { |
146 | WARN_ON(atomic_read(&ref->refs) == 0); | 176 | WARN_ON(atomic_read(&ref->refs) == 0); |
147 | if (atomic_dec_and_test(&ref->refs)) { | 177 | if (atomic_dec_and_test(&ref->refs)) { |
148 | WARN_ON(ref->in_tree); | 178 | WARN_ON(ref->in_tree); |
149 | kfree(ref); | 179 | switch (ref->type) { |
180 | case BTRFS_TREE_BLOCK_REF_KEY: | ||
181 | case BTRFS_SHARED_BLOCK_REF_KEY: | ||
182 | kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); | ||
183 | break; | ||
184 | case BTRFS_EXTENT_DATA_REF_KEY: | ||
185 | case BTRFS_SHARED_DATA_REF_KEY: | ||
186 | kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); | ||
187 | break; | ||
188 | case 0: | ||
189 | kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); | ||
190 | break; | ||
191 | default: | ||
192 | BUG(); | ||
193 | } | ||
150 | } | 194 | } |
151 | } | 195 | } |
152 | 196 | ||
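The switch keys off the on-disk backref key types. Head nodes are never assigned a key type, so ref->type stays 0 and the case 0 arm returns the node embedded in a btrfs_delayed_ref_head to the head cache; any other value means a corrupted node, hence the BUG().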
@@ -176,8 +220,14 @@ struct btrfs_delayed_ref_head * | |||
176 | btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); | 220 | btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); |
177 | int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, | 221 | int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, |
178 | struct btrfs_delayed_ref_head *head); | 222 | struct btrfs_delayed_ref_head *head); |
223 | static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) | ||
224 | { | ||
225 | mutex_unlock(&head->mutex); | ||
226 | } | ||
227 | |||
179 | int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, | 228 | int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, |
180 | struct list_head *cluster, u64 search_start); | 229 | struct list_head *cluster, u64 search_start); |
230 | void btrfs_release_ref_cluster(struct list_head *cluster); | ||
181 | 231 | ||
182 | int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, | 232 | int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, |
183 | struct btrfs_delayed_ref_root *delayed_refs, | 233 | struct btrfs_delayed_ref_root *delayed_refs, |
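btrfs_delayed_ref_unlock() is the trivial counterpart to btrfs_delayed_ref_lock(); wrapping the mutex keeps the pairs greppable now that unlock sites spread across extent-tree.c. The intended pairing, assuming the lock helper keeps its 0 / -EAGAIN convention:

	ret = btrfs_delayed_ref_lock(trans, locked_ref);
	if (ret) {
		/* the head was run or freed under us, pick another */
		locked_ref = NULL;
	} else {
		/* ... run the refs hanging off this head ... */
		btrfs_delayed_ref_unlock(locked_ref);
	}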
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 66dbc8dbddf7..7ba7b3900cb8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c | |||
@@ -465,7 +465,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, | |||
465 | * flush all outstanding I/O and inode extent mappings before the | 465 | * flush all outstanding I/O and inode extent mappings before the |
466 | * copy operation is declared as being finished | 466 | * copy operation is declared as being finished |
467 | */ | 467 | */ |
468 | btrfs_start_delalloc_inodes(root, 0); | 468 | ret = btrfs_start_delalloc_inodes(root, 0); |
469 | if (ret) { | ||
470 | mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); | ||
471 | return ret; | ||
472 | } | ||
469 | btrfs_wait_ordered_extents(root, 0); | 473 | btrfs_wait_ordered_extents(root, 0); |
470 | 474 | ||
471 | trans = btrfs_start_transaction(root, 0); | 475 | trans = btrfs_start_transaction(root, 0); |
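The fix is as much about the mutex as the return code: every early return taken after lock_finishing_cancel_unmount has been acquired must drop it, or a later dev-replace cancel or unmount deadlocks. The shape being enforced, with do_flush() standing in for the delalloc call:

	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
	ret = do_flush();
	if (ret) {
		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
		return ret;
	}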
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a8f652dc940b..02369a3c162e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
@@ -46,6 +46,7 @@ | |||
46 | #include "check-integrity.h" | 46 | #include "check-integrity.h" |
47 | #include "rcu-string.h" | 47 | #include "rcu-string.h" |
48 | #include "dev-replace.h" | 48 | #include "dev-replace.h" |
49 | #include "raid56.h" | ||
49 | 50 | ||
50 | #ifdef CONFIG_X86 | 51 | #ifdef CONFIG_X86 |
51 | #include <asm/cpufeature.h> | 52 | #include <asm/cpufeature.h> |
@@ -56,7 +57,8 @@ static void end_workqueue_fn(struct btrfs_work *work); | |||
56 | static void free_fs_root(struct btrfs_root *root); | 57 | static void free_fs_root(struct btrfs_root *root); |
57 | static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, | 58 | static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, |
58 | int read_only); | 59 | int read_only); |
59 | static void btrfs_destroy_ordered_operations(struct btrfs_root *root); | 60 | static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, |
61 | struct btrfs_root *root); | ||
60 | static void btrfs_destroy_ordered_extents(struct btrfs_root *root); | 62 | static void btrfs_destroy_ordered_extents(struct btrfs_root *root); |
61 | static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, | 63 | static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, |
62 | struct btrfs_root *root); | 64 | struct btrfs_root *root); |
@@ -420,7 +422,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, | |||
420 | static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) | 422 | static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) |
421 | { | 423 | { |
422 | struct extent_io_tree *tree; | 424 | struct extent_io_tree *tree; |
423 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 425 | u64 start = page_offset(page); |
424 | u64 found_start; | 426 | u64 found_start; |
425 | struct extent_buffer *eb; | 427 | struct extent_buffer *eb; |
426 | 428 | ||
@@ -639,8 +641,15 @@ err: | |||
639 | btree_readahead_hook(root, eb, eb->start, ret); | 641 | btree_readahead_hook(root, eb, eb->start, ret); |
640 | } | 642 | } |
641 | 643 | ||
642 | if (ret) | 644 | if (ret) { |
645 | /* | ||
646 | * our io error hook is going to dec the io pages | ||
647 | * again, we have to make sure it has something | ||
648 | * to decrement | ||
649 | */ | ||
650 | atomic_inc(&eb->io_pages); | ||
643 | clear_extent_buffer_uptodate(eb); | 651 | clear_extent_buffer_uptodate(eb); |
652 | } | ||
644 | free_extent_buffer(eb); | 653 | free_extent_buffer(eb); |
645 | out: | 654 | out: |
646 | return ret; | 655 | return ret; |
@@ -654,6 +663,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) | |||
654 | eb = (struct extent_buffer *)page->private; | 663 | eb = (struct extent_buffer *)page->private; |
655 | set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); | 664 | set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); |
656 | eb->read_mirror = failed_mirror; | 665 | eb->read_mirror = failed_mirror; |
666 | atomic_dec(&eb->io_pages); | ||
657 | if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) | 667 | if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) |
658 | btree_readahead_hook(root, eb, eb->start, -EIO); | 668 | btree_readahead_hook(root, eb, eb->start, -EIO); |
659 | return -EIO; /* we fixed nothing */ | 669 | return -EIO; /* we fixed nothing */ |
@@ -670,17 +680,23 @@ static void end_workqueue_bio(struct bio *bio, int err) | |||
670 | end_io_wq->work.flags = 0; | 680 | end_io_wq->work.flags = 0; |
671 | 681 | ||
672 | if (bio->bi_rw & REQ_WRITE) { | 682 | if (bio->bi_rw & REQ_WRITE) { |
673 | if (end_io_wq->metadata == 1) | 683 | if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) |
674 | btrfs_queue_worker(&fs_info->endio_meta_write_workers, | 684 | btrfs_queue_worker(&fs_info->endio_meta_write_workers, |
675 | &end_io_wq->work); | 685 | &end_io_wq->work); |
676 | else if (end_io_wq->metadata == 2) | 686 | else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) |
677 | btrfs_queue_worker(&fs_info->endio_freespace_worker, | 687 | btrfs_queue_worker(&fs_info->endio_freespace_worker, |
678 | &end_io_wq->work); | 688 | &end_io_wq->work); |
689 | else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) | ||
690 | btrfs_queue_worker(&fs_info->endio_raid56_workers, | ||
691 | &end_io_wq->work); | ||
679 | else | 692 | else |
680 | btrfs_queue_worker(&fs_info->endio_write_workers, | 693 | btrfs_queue_worker(&fs_info->endio_write_workers, |
681 | &end_io_wq->work); | 694 | &end_io_wq->work); |
682 | } else { | 695 | } else { |
683 | if (end_io_wq->metadata) | 696 | if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) |
697 | btrfs_queue_worker(&fs_info->endio_raid56_workers, | ||
698 | &end_io_wq->work); | ||
699 | else if (end_io_wq->metadata) | ||
684 | btrfs_queue_worker(&fs_info->endio_meta_workers, | 700 | btrfs_queue_worker(&fs_info->endio_meta_workers, |
685 | &end_io_wq->work); | 701 | &end_io_wq->work); |
686 | else | 702 | else |
@@ -695,6 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err) | |||
695 | * 0 - if data | 711 | * 0 - if data |
696 | * 1 - if normal metadata | 712 |
697 | * 2 - if writing to the free space cache area | 713 | * 2 - if writing to the free space cache area |
714 | * 3 - raid parity work | ||
698 | */ | 715 | */ |
699 | int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, | 716 | int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, |
700 | int metadata) | 717 | int metadata) |
@@ -946,18 +963,20 @@ static int btree_writepages(struct address_space *mapping, | |||
946 | struct writeback_control *wbc) | 963 | struct writeback_control *wbc) |
947 | { | 964 | { |
948 | struct extent_io_tree *tree; | 965 | struct extent_io_tree *tree; |
966 | struct btrfs_fs_info *fs_info; | ||
967 | int ret; | ||
968 | |||
949 | tree = &BTRFS_I(mapping->host)->io_tree; | 969 | tree = &BTRFS_I(mapping->host)->io_tree; |
950 | if (wbc->sync_mode == WB_SYNC_NONE) { | 970 | if (wbc->sync_mode == WB_SYNC_NONE) { |
951 | struct btrfs_root *root = BTRFS_I(mapping->host)->root; | ||
952 | u64 num_dirty; | ||
953 | unsigned long thresh = 32 * 1024 * 1024; | ||
954 | 971 | ||
955 | if (wbc->for_kupdate) | 972 | if (wbc->for_kupdate) |
956 | return 0; | 973 | return 0; |
957 | 974 | ||
975 | fs_info = BTRFS_I(mapping->host)->root->fs_info; | ||
958 | /* this is a bit racy, but that's ok */ | 976 | /* this is a bit racy, but that's ok */ |
959 | num_dirty = root->fs_info->dirty_metadata_bytes; | 977 | ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes, |
960 | if (num_dirty < thresh) | 978 | BTRFS_DIRTY_METADATA_THRESH); |
979 | if (ret < 0) | ||
961 | return 0; | 980 | return 0; |
962 | } | 981 | } |
963 | return btree_write_cache_pages(mapping, wbc); | 982 | return btree_write_cache_pages(mapping, wbc); |
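dirty_metadata_bytes becomes a struct percpu_counter so the hot paths stop serializing on delalloc_lock; BTRFS_DIRTY_METADATA_THRESH presumably carries the old 32 * 1024 * 1024 literal into ctree.h. The pattern, using the 3.8-era API (percpu_counter_init() takes no gfp argument yet); batch, THRESH and writeback() are illustrative:

	struct percpu_counter ctr;
	int err;

	err = percpu_counter_init(&ctr, 0);
	if (err)
		return err;

	/* adds stay CPU-local until one CPU's drift exceeds 'batch' */
	__percpu_counter_add(&ctr, 4096, batch);

	/* cheap, approximate compare against a threshold */
	if (percpu_counter_compare(&ctr, THRESH) > 0)
		writeback();

	/* exact (and expensive) sum, reserved for slow paths like unmount */
	pr_info("residual: %lld\n", percpu_counter_sum(&ctr));

	percpu_counter_destroy(&ctr);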
@@ -1125,24 +1144,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, | |||
1125 | void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 1144 | void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
1126 | struct extent_buffer *buf) | 1145 | struct extent_buffer *buf) |
1127 | { | 1146 | { |
1147 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
1148 | |||
1128 | if (btrfs_header_generation(buf) == | 1149 | if (btrfs_header_generation(buf) == |
1129 | root->fs_info->running_transaction->transid) { | 1150 | fs_info->running_transaction->transid) { |
1130 | btrfs_assert_tree_locked(buf); | 1151 | btrfs_assert_tree_locked(buf); |
1131 | 1152 | ||
1132 | if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { | 1153 | if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { |
1133 | spin_lock(&root->fs_info->delalloc_lock); | 1154 | __percpu_counter_add(&fs_info->dirty_metadata_bytes, |
1134 | if (root->fs_info->dirty_metadata_bytes >= buf->len) | 1155 | -buf->len, |
1135 | root->fs_info->dirty_metadata_bytes -= buf->len; | 1156 | fs_info->dirty_metadata_batch); |
1136 | else { | ||
1137 | spin_unlock(&root->fs_info->delalloc_lock); | ||
1138 | btrfs_panic(root->fs_info, -EOVERFLOW, | ||
1139 | "Can't clear %lu bytes from " | ||
1140 | " dirty_mdatadata_bytes (%llu)", | ||
1141 | buf->len, | ||
1142 | root->fs_info->dirty_metadata_bytes); | ||
1143 | } | ||
1144 | spin_unlock(&root->fs_info->delalloc_lock); | ||
1145 | |||
1146 | /* ugh, clear_extent_buffer_dirty needs to lock the page */ | 1157 | /* ugh, clear_extent_buffer_dirty needs to lock the page */ |
1147 | btrfs_set_lock_blocking(buf); | 1158 | btrfs_set_lock_blocking(buf); |
1148 | clear_extent_buffer_dirty(buf); | 1159 | clear_extent_buffer_dirty(buf); |
@@ -1178,9 +1189,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, | |||
1178 | 1189 | ||
1179 | INIT_LIST_HEAD(&root->dirty_list); | 1190 | INIT_LIST_HEAD(&root->dirty_list); |
1180 | INIT_LIST_HEAD(&root->root_list); | 1191 | INIT_LIST_HEAD(&root->root_list); |
1192 | INIT_LIST_HEAD(&root->logged_list[0]); | ||
1193 | INIT_LIST_HEAD(&root->logged_list[1]); | ||
1181 | spin_lock_init(&root->orphan_lock); | 1194 | spin_lock_init(&root->orphan_lock); |
1182 | spin_lock_init(&root->inode_lock); | 1195 | spin_lock_init(&root->inode_lock); |
1183 | spin_lock_init(&root->accounting_lock); | 1196 | spin_lock_init(&root->accounting_lock); |
1197 | spin_lock_init(&root->log_extents_lock[0]); | ||
1198 | spin_lock_init(&root->log_extents_lock[1]); | ||
1184 | mutex_init(&root->objectid_mutex); | 1199 | mutex_init(&root->objectid_mutex); |
1185 | mutex_init(&root->log_mutex); | 1200 | mutex_init(&root->log_mutex); |
1186 | init_waitqueue_head(&root->log_writer_wait); | 1201 | init_waitqueue_head(&root->log_writer_wait); |
@@ -2004,10 +2019,24 @@ int open_ctree(struct super_block *sb, | |||
2004 | goto fail_srcu; | 2019 | goto fail_srcu; |
2005 | } | 2020 | } |
2006 | 2021 | ||
2022 | ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); | ||
2023 | if (ret) { | ||
2024 | err = ret; | ||
2025 | goto fail_bdi; | ||
2026 | } | ||
2027 | fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * | ||
2028 | (1 + ilog2(nr_cpu_ids)); | ||
2029 | |||
2030 | ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); | ||
2031 | if (ret) { | ||
2032 | err = ret; | ||
2033 | goto fail_dirty_metadata_bytes; | ||
2034 | } | ||
2035 | |||
2007 | fs_info->btree_inode = new_inode(sb); | 2036 | fs_info->btree_inode = new_inode(sb); |
2008 | if (!fs_info->btree_inode) { | 2037 | if (!fs_info->btree_inode) { |
2009 | err = -ENOMEM; | 2038 | err = -ENOMEM; |
2010 | goto fail_bdi; | 2039 | goto fail_delalloc_bytes; |
2011 | } | 2040 | } |
2012 | 2041 | ||
2013 | mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); | 2042 | mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); |
@@ -2017,7 +2046,6 @@ int open_ctree(struct super_block *sb, | |||
2017 | INIT_LIST_HEAD(&fs_info->dead_roots); | 2046 | INIT_LIST_HEAD(&fs_info->dead_roots); |
2018 | INIT_LIST_HEAD(&fs_info->delayed_iputs); | 2047 | INIT_LIST_HEAD(&fs_info->delayed_iputs); |
2019 | INIT_LIST_HEAD(&fs_info->delalloc_inodes); | 2048 | INIT_LIST_HEAD(&fs_info->delalloc_inodes); |
2020 | INIT_LIST_HEAD(&fs_info->ordered_operations); | ||
2021 | INIT_LIST_HEAD(&fs_info->caching_block_groups); | 2049 | INIT_LIST_HEAD(&fs_info->caching_block_groups); |
2022 | spin_lock_init(&fs_info->delalloc_lock); | 2050 | spin_lock_init(&fs_info->delalloc_lock); |
2023 | spin_lock_init(&fs_info->trans_lock); | 2051 | spin_lock_init(&fs_info->trans_lock); |
@@ -2028,6 +2056,7 @@ int open_ctree(struct super_block *sb, | |||
2028 | spin_lock_init(&fs_info->tree_mod_seq_lock); | 2056 | spin_lock_init(&fs_info->tree_mod_seq_lock); |
2029 | rwlock_init(&fs_info->tree_mod_log_lock); | 2057 | rwlock_init(&fs_info->tree_mod_log_lock); |
2030 | mutex_init(&fs_info->reloc_mutex); | 2058 | mutex_init(&fs_info->reloc_mutex); |
2059 | seqlock_init(&fs_info->profiles_lock); | ||
2031 | 2060 | ||
2032 | init_completion(&fs_info->kobj_unregister); | 2061 | init_completion(&fs_info->kobj_unregister); |
2033 | INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); | 2062 | INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); |
@@ -2126,6 +2155,7 @@ int open_ctree(struct super_block *sb, | |||
2126 | 2155 | ||
2127 | spin_lock_init(&fs_info->block_group_cache_lock); | 2156 | spin_lock_init(&fs_info->block_group_cache_lock); |
2128 | fs_info->block_group_cache_tree = RB_ROOT; | 2157 | fs_info->block_group_cache_tree = RB_ROOT; |
2158 | fs_info->first_logical_byte = (u64)-1; | ||
2129 | 2159 | ||
2130 | extent_io_tree_init(&fs_info->freed_extents[0], | 2160 | extent_io_tree_init(&fs_info->freed_extents[0], |
2131 | fs_info->btree_inode->i_mapping); | 2161 | fs_info->btree_inode->i_mapping); |
@@ -2165,6 +2195,12 @@ int open_ctree(struct super_block *sb, | |||
2165 | init_waitqueue_head(&fs_info->transaction_blocked_wait); | 2195 | init_waitqueue_head(&fs_info->transaction_blocked_wait); |
2166 | init_waitqueue_head(&fs_info->async_submit_wait); | 2196 | init_waitqueue_head(&fs_info->async_submit_wait); |
2167 | 2197 | ||
2198 | ret = btrfs_alloc_stripe_hash_table(fs_info); | ||
2199 | if (ret) { | ||
2200 | err = ret; | ||
2201 | goto fail_alloc; | ||
2202 | } | ||
2203 | |||
2168 | __setup_root(4096, 4096, 4096, 4096, tree_root, | 2204 | __setup_root(4096, 4096, 4096, 4096, tree_root, |
2169 | fs_info, BTRFS_ROOT_TREE_OBJECTID); | 2205 | fs_info, BTRFS_ROOT_TREE_OBJECTID); |
2170 | 2206 | ||
@@ -2187,7 +2223,8 @@ int open_ctree(struct super_block *sb, | |||
2187 | goto fail_alloc; | 2223 | goto fail_alloc; |
2188 | 2224 | ||
2189 | /* check FS state, whether FS is broken. */ | 2225 | /* check FS state, whether FS is broken. */ |
2190 | fs_info->fs_state |= btrfs_super_flags(disk_super); | 2226 | if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) |
2227 | set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); | ||
2191 | 2228 | ||
2192 | ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); | 2229 | ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); |
2193 | if (ret) { | 2230 | if (ret) { |
@@ -2261,6 +2298,8 @@ int open_ctree(struct super_block *sb, | |||
2261 | leafsize = btrfs_super_leafsize(disk_super); | 2298 | leafsize = btrfs_super_leafsize(disk_super); |
2262 | sectorsize = btrfs_super_sectorsize(disk_super); | 2299 | sectorsize = btrfs_super_sectorsize(disk_super); |
2263 | stripesize = btrfs_super_stripesize(disk_super); | 2300 | stripesize = btrfs_super_stripesize(disk_super); |
2301 | fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids)); | ||
2302 | fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); | ||
2264 | 2303 | ||
2265 | /* | 2304 | /* |
2266 | * mixed block groups end up with duplicate but slightly offset | 2305 | * mixed block groups end up with duplicate but slightly offset |
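The batch values bound how far those counters may drift, worked through for 4K leaves and sectors on an 8-CPU box:

	/*	ilog2(8) = 3
	 *	dirty_metadata_batch = 4096 * (1 + 3)       = 16 KiB
	 *	delalloc_batch       = 4096 * 512 * (1 + 3)  = 8 MiB
	 * each CPU may accumulate roughly one batch before folding into
	 * the shared count, which is why the compare in btree_writepages()
	 * above is "a bit racy, but that's ok".
	 */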
@@ -2332,6 +2371,12 @@ int open_ctree(struct super_block *sb, | |||
2332 | btrfs_init_workers(&fs_info->endio_meta_write_workers, | 2371 | btrfs_init_workers(&fs_info->endio_meta_write_workers, |
2333 | "endio-meta-write", fs_info->thread_pool_size, | 2372 | "endio-meta-write", fs_info->thread_pool_size, |
2334 | &fs_info->generic_worker); | 2373 | &fs_info->generic_worker); |
2374 | btrfs_init_workers(&fs_info->endio_raid56_workers, | ||
2375 | "endio-raid56", fs_info->thread_pool_size, | ||
2376 | &fs_info->generic_worker); | ||
2377 | btrfs_init_workers(&fs_info->rmw_workers, | ||
2378 | "rmw", fs_info->thread_pool_size, | ||
2379 | &fs_info->generic_worker); | ||
2335 | btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", | 2380 | btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", |
2336 | fs_info->thread_pool_size, | 2381 | fs_info->thread_pool_size, |
2337 | &fs_info->generic_worker); | 2382 | &fs_info->generic_worker); |
@@ -2350,6 +2395,8 @@ int open_ctree(struct super_block *sb, | |||
2350 | */ | 2395 | */ |
2351 | fs_info->endio_workers.idle_thresh = 4; | 2396 | fs_info->endio_workers.idle_thresh = 4; |
2352 | fs_info->endio_meta_workers.idle_thresh = 4; | 2397 | fs_info->endio_meta_workers.idle_thresh = 4; |
2398 | fs_info->endio_raid56_workers.idle_thresh = 4; | ||
2399 | fs_info->rmw_workers.idle_thresh = 2; | ||
2353 | 2400 | ||
2354 | fs_info->endio_write_workers.idle_thresh = 2; | 2401 | fs_info->endio_write_workers.idle_thresh = 2; |
2355 | fs_info->endio_meta_write_workers.idle_thresh = 2; | 2402 | fs_info->endio_meta_write_workers.idle_thresh = 2; |
@@ -2366,6 +2413,8 @@ int open_ctree(struct super_block *sb, | |||
2366 | ret |= btrfs_start_workers(&fs_info->fixup_workers); | 2413 | ret |= btrfs_start_workers(&fs_info->fixup_workers); |
2367 | ret |= btrfs_start_workers(&fs_info->endio_workers); | 2414 | ret |= btrfs_start_workers(&fs_info->endio_workers); |
2368 | ret |= btrfs_start_workers(&fs_info->endio_meta_workers); | 2415 | ret |= btrfs_start_workers(&fs_info->endio_meta_workers); |
2416 | ret |= btrfs_start_workers(&fs_info->rmw_workers); | ||
2417 | ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); | ||
2369 | ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); | 2418 | ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); |
2370 | ret |= btrfs_start_workers(&fs_info->endio_write_workers); | 2419 | ret |= btrfs_start_workers(&fs_info->endio_write_workers); |
2371 | ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); | 2420 | ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); |
@@ -2390,8 +2439,7 @@ int open_ctree(struct super_block *sb, | |||
2390 | sb->s_blocksize = sectorsize; | 2439 | sb->s_blocksize = sectorsize; |
2391 | sb->s_blocksize_bits = blksize_bits(sectorsize); | 2440 | sb->s_blocksize_bits = blksize_bits(sectorsize); |
2392 | 2441 | ||
2393 | if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, | 2442 | if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) { |
2394 | sizeof(disk_super->magic))) { | ||
2395 | printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); | 2443 | printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); |
2396 | goto fail_sb_buffer; | 2444 | goto fail_sb_buffer; |
2397 | } | 2445 | } |
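The strncmp() goes away because BTRFS_MAGIC is redefined in this series as a numeric u64, presumably along these lines:

	#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */

cpu_to_le64() on the constant side folds at compile time, so the superblock bytes are compared raw with no per-mount byte swapping.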
@@ -2694,13 +2742,13 @@ fail_cleaner: | |||
2694 | * kthreads | 2742 | * kthreads |
2695 | */ | 2743 | */ |
2696 | filemap_write_and_wait(fs_info->btree_inode->i_mapping); | 2744 | filemap_write_and_wait(fs_info->btree_inode->i_mapping); |
2697 | invalidate_inode_pages2(fs_info->btree_inode->i_mapping); | ||
2698 | 2745 | ||
2699 | fail_block_groups: | 2746 | fail_block_groups: |
2700 | btrfs_free_block_groups(fs_info); | 2747 | btrfs_free_block_groups(fs_info); |
2701 | 2748 | ||
2702 | fail_tree_roots: | 2749 | fail_tree_roots: |
2703 | free_root_pointers(fs_info, 1); | 2750 | free_root_pointers(fs_info, 1); |
2751 | invalidate_inode_pages2(fs_info->btree_inode->i_mapping); | ||
2704 | 2752 | ||
2705 | fail_sb_buffer: | 2753 | fail_sb_buffer: |
2706 | btrfs_stop_workers(&fs_info->generic_worker); | 2754 | btrfs_stop_workers(&fs_info->generic_worker); |
@@ -2710,6 +2758,8 @@ fail_sb_buffer: | |||
2710 | btrfs_stop_workers(&fs_info->workers); | 2758 | btrfs_stop_workers(&fs_info->workers); |
2711 | btrfs_stop_workers(&fs_info->endio_workers); | 2759 | btrfs_stop_workers(&fs_info->endio_workers); |
2712 | btrfs_stop_workers(&fs_info->endio_meta_workers); | 2760 | btrfs_stop_workers(&fs_info->endio_meta_workers); |
2761 | btrfs_stop_workers(&fs_info->endio_raid56_workers); | ||
2762 | btrfs_stop_workers(&fs_info->rmw_workers); | ||
2713 | btrfs_stop_workers(&fs_info->endio_meta_write_workers); | 2763 | btrfs_stop_workers(&fs_info->endio_meta_write_workers); |
2714 | btrfs_stop_workers(&fs_info->endio_write_workers); | 2764 | btrfs_stop_workers(&fs_info->endio_write_workers); |
2715 | btrfs_stop_workers(&fs_info->endio_freespace_worker); | 2765 | btrfs_stop_workers(&fs_info->endio_freespace_worker); |
@@ -2721,13 +2771,17 @@ fail_alloc: | |||
2721 | fail_iput: | 2771 | fail_iput: |
2722 | btrfs_mapping_tree_free(&fs_info->mapping_tree); | 2772 | btrfs_mapping_tree_free(&fs_info->mapping_tree); |
2723 | 2773 | ||
2724 | invalidate_inode_pages2(fs_info->btree_inode->i_mapping); | ||
2725 | iput(fs_info->btree_inode); | 2774 | iput(fs_info->btree_inode); |
2775 | fail_delalloc_bytes: | ||
2776 | percpu_counter_destroy(&fs_info->delalloc_bytes); | ||
2777 | fail_dirty_metadata_bytes: | ||
2778 | percpu_counter_destroy(&fs_info->dirty_metadata_bytes); | ||
2726 | fail_bdi: | 2779 | fail_bdi: |
2727 | bdi_destroy(&fs_info->bdi); | 2780 | bdi_destroy(&fs_info->bdi); |
2728 | fail_srcu: | 2781 | fail_srcu: |
2729 | cleanup_srcu_struct(&fs_info->subvol_srcu); | 2782 | cleanup_srcu_struct(&fs_info->subvol_srcu); |
2730 | fail: | 2783 | fail: |
2784 | btrfs_free_stripe_hash_table(fs_info); | ||
2731 | btrfs_close_devices(fs_info->fs_devices); | 2785 | btrfs_close_devices(fs_info->fs_devices); |
2732 | return err; | 2786 | return err; |
2733 | 2787 | ||
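The two new labels slot into open_ctree()'s unwind chain in strict reverse order of initialization, so each failure frees exactly what was set up before it:

	/* failing step                            jumps to
	 * percpu_counter_init(delalloc_bytes)  -> fail_dirty_metadata_bytes
	 * new_inode(btree_inode)               -> fail_delalloc_bytes
	 * each label destroys its counter and falls through to fail_bdi.
	 */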
@@ -2795,8 +2849,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) | |||
2795 | 2849 | ||
2796 | super = (struct btrfs_super_block *)bh->b_data; | 2850 | super = (struct btrfs_super_block *)bh->b_data; |
2797 | if (btrfs_super_bytenr(super) != bytenr || | 2851 | if (btrfs_super_bytenr(super) != bytenr || |
2798 | strncmp((char *)(&super->magic), BTRFS_MAGIC, | 2852 | super->magic != cpu_to_le64(BTRFS_MAGIC)) { |
2799 | sizeof(super->magic))) { | ||
2800 | brelse(bh); | 2853 | brelse(bh); |
2801 | continue; | 2854 | continue; |
2802 | } | 2855 | } |
@@ -3076,11 +3129,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( | |||
3076 | ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) | 3129 | ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) |
3077 | == 0))) | 3130 | == 0))) |
3078 | num_tolerated_disk_barrier_failures = 0; | 3131 | num_tolerated_disk_barrier_failures = 0; |
3079 | else if (num_tolerated_disk_barrier_failures > 1 | 3132 | else if (num_tolerated_disk_barrier_failures > 1) { |
3080 | && | 3133 | if (flags & (BTRFS_BLOCK_GROUP_RAID1 | |
3081 | (flags & (BTRFS_BLOCK_GROUP_RAID1 | | 3134 | BTRFS_BLOCK_GROUP_RAID5 | |
3082 | BTRFS_BLOCK_GROUP_RAID10))) | 3135 | BTRFS_BLOCK_GROUP_RAID10)) { |
3083 | num_tolerated_disk_barrier_failures = 1; | 3136 | num_tolerated_disk_barrier_failures = 1; |
3137 | } else if (flags & | ||
3138 | BTRFS_BLOCK_GROUP_RAID6) { | ||
3139 | num_tolerated_disk_barrier_failures = 2; | ||
3140 | } | ||
3141 | } | ||
3084 | } | 3142 | } |
3085 | } | 3143 | } |
3086 | up_read(&sinfo->groups_sem); | 3144 | up_read(&sinfo->groups_sem); |
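Net effect of the rewritten branch once RAID5/6 block groups exist:

	/* when the earlier checks would otherwise allow more than one:
	 *	RAID1, RAID5, RAID10 stripes cap the fs at 1 failed barrier
	 *	RAID6 stripes allow 2
	 * (profiles with no redundancy already forced the count to 0 above)
	 */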
@@ -3195,6 +3253,11 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) | |||
3195 | if (btrfs_root_refs(&root->root_item) == 0) | 3253 | if (btrfs_root_refs(&root->root_item) == 0) |
3196 | synchronize_srcu(&fs_info->subvol_srcu); | 3254 | synchronize_srcu(&fs_info->subvol_srcu); |
3197 | 3255 | ||
3256 | if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | ||
3257 | btrfs_free_log(NULL, root); | ||
3258 | btrfs_free_log_root_tree(NULL, fs_info); | ||
3259 | } | ||
3260 | |||
3198 | __btrfs_remove_free_space_cache(root->free_ino_pinned); | 3261 | __btrfs_remove_free_space_cache(root->free_ino_pinned); |
3199 | __btrfs_remove_free_space_cache(root->free_ino_ctl); | 3262 | __btrfs_remove_free_space_cache(root->free_ino_ctl); |
3200 | free_fs_root(root); | 3263 | free_fs_root(root); |
@@ -3339,7 +3402,7 @@ int close_ctree(struct btrfs_root *root) | |||
3339 | printk(KERN_ERR "btrfs: commit super ret %d\n", ret); | 3402 | printk(KERN_ERR "btrfs: commit super ret %d\n", ret); |
3340 | } | 3403 | } |
3341 | 3404 | ||
3342 | if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) | 3405 | if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) |
3343 | btrfs_error_commit_super(root); | 3406 | btrfs_error_commit_super(root); |
3344 | 3407 | ||
3345 | btrfs_put_block_group_cache(fs_info); | 3408 | btrfs_put_block_group_cache(fs_info); |
@@ -3352,9 +3415,9 @@ int close_ctree(struct btrfs_root *root) | |||
3352 | 3415 | ||
3353 | btrfs_free_qgroup_config(root->fs_info); | 3416 | btrfs_free_qgroup_config(root->fs_info); |
3354 | 3417 | ||
3355 | if (fs_info->delalloc_bytes) { | 3418 | if (percpu_counter_sum(&fs_info->delalloc_bytes)) { |
3356 | printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", | 3419 | printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n", |
3357 | (unsigned long long)fs_info->delalloc_bytes); | 3420 | percpu_counter_sum(&fs_info->delalloc_bytes)); |
3358 | } | 3421 | } |
3359 | 3422 | ||
3360 | free_extent_buffer(fs_info->extent_root->node); | 3423 | free_extent_buffer(fs_info->extent_root->node); |
@@ -3384,6 +3447,8 @@ int close_ctree(struct btrfs_root *root) | |||
3384 | btrfs_stop_workers(&fs_info->workers); | 3447 | btrfs_stop_workers(&fs_info->workers); |
3385 | btrfs_stop_workers(&fs_info->endio_workers); | 3448 | btrfs_stop_workers(&fs_info->endio_workers); |
3386 | btrfs_stop_workers(&fs_info->endio_meta_workers); | 3449 | btrfs_stop_workers(&fs_info->endio_meta_workers); |
3450 | btrfs_stop_workers(&fs_info->endio_raid56_workers); | ||
3451 | btrfs_stop_workers(&fs_info->rmw_workers); | ||
3387 | btrfs_stop_workers(&fs_info->endio_meta_write_workers); | 3452 | btrfs_stop_workers(&fs_info->endio_meta_write_workers); |
3388 | btrfs_stop_workers(&fs_info->endio_write_workers); | 3453 | btrfs_stop_workers(&fs_info->endio_write_workers); |
3389 | btrfs_stop_workers(&fs_info->endio_freespace_worker); | 3454 | btrfs_stop_workers(&fs_info->endio_freespace_worker); |
@@ -3401,9 +3466,13 @@ int close_ctree(struct btrfs_root *root) | |||
3401 | btrfs_close_devices(fs_info->fs_devices); | 3466 | btrfs_close_devices(fs_info->fs_devices); |
3402 | btrfs_mapping_tree_free(&fs_info->mapping_tree); | 3467 | btrfs_mapping_tree_free(&fs_info->mapping_tree); |
3403 | 3468 | ||
3469 | percpu_counter_destroy(&fs_info->dirty_metadata_bytes); | ||
3470 | percpu_counter_destroy(&fs_info->delalloc_bytes); | ||
3404 | bdi_destroy(&fs_info->bdi); | 3471 | bdi_destroy(&fs_info->bdi); |
3405 | cleanup_srcu_struct(&fs_info->subvol_srcu); | 3472 | cleanup_srcu_struct(&fs_info->subvol_srcu); |
3406 | 3473 | ||
3474 | btrfs_free_stripe_hash_table(fs_info); | ||
3475 | |||
3407 | return 0; | 3476 | return 0; |
3408 | } | 3477 | } |
3409 | 3478 | ||
@@ -3443,11 +3512,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) | |||
3443 | (unsigned long long)transid, | 3512 | (unsigned long long)transid, |
3444 | (unsigned long long)root->fs_info->generation); | 3513 | (unsigned long long)root->fs_info->generation); |
3445 | was_dirty = set_extent_buffer_dirty(buf); | 3514 | was_dirty = set_extent_buffer_dirty(buf); |
3446 | if (!was_dirty) { | 3515 | if (!was_dirty) |
3447 | spin_lock(&root->fs_info->delalloc_lock); | 3516 | __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, |
3448 | root->fs_info->dirty_metadata_bytes += buf->len; | 3517 | buf->len, |
3449 | spin_unlock(&root->fs_info->delalloc_lock); | 3518 | root->fs_info->dirty_metadata_batch); |
3450 | } | ||
3451 | } | 3519 | } |
3452 | 3520 | ||
3453 | static void __btrfs_btree_balance_dirty(struct btrfs_root *root, | 3521 | static void __btrfs_btree_balance_dirty(struct btrfs_root *root, |
@@ -3457,8 +3525,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, | |||
3457 | * looks as though older kernels can get into trouble with | 3525 | * looks as though older kernels can get into trouble with |
3458 | * this code, they end up stuck in balance_dirty_pages forever | 3526 | * this code, they end up stuck in balance_dirty_pages forever |
3459 | */ | 3527 | */ |
3460 | u64 num_dirty; | 3528 | int ret; |
3461 | unsigned long thresh = 32 * 1024 * 1024; | ||
3462 | 3529 | ||
3463 | if (current->flags & PF_MEMALLOC) | 3530 | if (current->flags & PF_MEMALLOC) |
3464 | return; | 3531 | return; |
@@ -3466,9 +3533,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, | |||
3466 | if (flush_delayed) | 3533 | if (flush_delayed) |
3467 | btrfs_balance_delayed_items(root); | 3534 | btrfs_balance_delayed_items(root); |
3468 | 3535 | ||
3469 | num_dirty = root->fs_info->dirty_metadata_bytes; | 3536 | ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes, |
3470 | 3537 | BTRFS_DIRTY_METADATA_THRESH); | |
3471 | if (num_dirty > thresh) { | 3538 | if (ret > 0) { |
3472 | balance_dirty_pages_ratelimited( | 3539 | balance_dirty_pages_ratelimited( |
3473 | root->fs_info->btree_inode->i_mapping); | 3540 | root->fs_info->btree_inode->i_mapping); |
3474 | } | 3541 | } |
@@ -3518,7 +3585,8 @@ void btrfs_error_commit_super(struct btrfs_root *root) | |||
3518 | btrfs_cleanup_transaction(root); | 3585 | btrfs_cleanup_transaction(root); |
3519 | } | 3586 | } |
3520 | 3587 | ||
3521 | static void btrfs_destroy_ordered_operations(struct btrfs_root *root) | 3588 | static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, |
3589 | struct btrfs_root *root) | ||
3522 | { | 3590 | { |
3523 | struct btrfs_inode *btrfs_inode; | 3591 | struct btrfs_inode *btrfs_inode; |
3524 | struct list_head splice; | 3592 | struct list_head splice; |
@@ -3528,7 +3596,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root) | |||
3528 | mutex_lock(&root->fs_info->ordered_operations_mutex); | 3596 | mutex_lock(&root->fs_info->ordered_operations_mutex); |
3529 | spin_lock(&root->fs_info->ordered_extent_lock); | 3597 | spin_lock(&root->fs_info->ordered_extent_lock); |
3530 | 3598 | ||
3531 | list_splice_init(&root->fs_info->ordered_operations, &splice); | 3599 | list_splice_init(&t->ordered_operations, &splice); |
3532 | while (!list_empty(&splice)) { | 3600 | while (!list_empty(&splice)) { |
3533 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, | 3601 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, |
3534 | ordered_operations); | 3602 | ordered_operations); |
@@ -3544,35 +3612,16 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root) | |||
3544 | 3612 | ||
3545 | static void btrfs_destroy_ordered_extents(struct btrfs_root *root) | 3613 | static void btrfs_destroy_ordered_extents(struct btrfs_root *root) |
3546 | { | 3614 | { |
3547 | struct list_head splice; | ||
3548 | struct btrfs_ordered_extent *ordered; | 3615 | struct btrfs_ordered_extent *ordered; |
3549 | struct inode *inode; | ||
3550 | |||
3551 | INIT_LIST_HEAD(&splice); | ||
3552 | 3616 | ||
3553 | spin_lock(&root->fs_info->ordered_extent_lock); | 3617 | spin_lock(&root->fs_info->ordered_extent_lock); |
3554 | 3618 | /* | |
3555 | list_splice_init(&root->fs_info->ordered_extents, &splice); | 3619 | * This will just short circuit the ordered completion stuff which will |
3556 | while (!list_empty(&splice)) { | 3620 | * make sure the ordered extent gets properly cleaned up. |
3557 | ordered = list_entry(splice.next, struct btrfs_ordered_extent, | 3621 | */ |
3558 | root_extent_list); | 3622 | list_for_each_entry(ordered, &root->fs_info->ordered_extents, |
3559 | 3623 | root_extent_list) | |
3560 | list_del_init(&ordered->root_extent_list); | 3624 | set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); |
3561 | atomic_inc(&ordered->refs); | ||
3562 | |||
3563 | /* the inode may be getting freed (in sys_unlink path). */ | ||
3564 | inode = igrab(ordered->inode); | ||
3565 | |||
3566 | spin_unlock(&root->fs_info->ordered_extent_lock); | ||
3567 | if (inode) | ||
3568 | iput(inode); | ||
3569 | |||
3570 | atomic_set(&ordered->refs, 1); | ||
3571 | btrfs_put_ordered_extent(ordered); | ||
3572 | |||
3573 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
3574 | } | ||
3575 | |||
3576 | spin_unlock(&root->fs_info->ordered_extent_lock); | 3625 | spin_unlock(&root->fs_info->ordered_extent_lock); |
3577 | } | 3626 | } |
3578 | 3627 | ||
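The old loop pinned each inode with igrab(), forced the ordered extent's refcount to 1 and put it, freeing objects out from under anyone else still holding them. Setting BTRFS_ORDERED_IOERR instead leaves teardown to the normal completion path, which already knows how to drop every reference safely.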
@@ -3594,11 +3643,11 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, | |||
3594 | } | 3643 | } |
3595 | 3644 | ||
3596 | while ((node = rb_first(&delayed_refs->root)) != NULL) { | 3645 | while ((node = rb_first(&delayed_refs->root)) != NULL) { |
3597 | ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); | 3646 | struct btrfs_delayed_ref_head *head = NULL; |
3598 | 3647 | ||
3648 | ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); | ||
3599 | atomic_set(&ref->refs, 1); | 3649 | atomic_set(&ref->refs, 1); |
3600 | if (btrfs_delayed_ref_is_head(ref)) { | 3650 | if (btrfs_delayed_ref_is_head(ref)) { |
3601 | struct btrfs_delayed_ref_head *head; | ||
3602 | 3651 | ||
3603 | head = btrfs_delayed_node_to_head(ref); | 3652 | head = btrfs_delayed_node_to_head(ref); |
3604 | if (!mutex_trylock(&head->mutex)) { | 3653 | if (!mutex_trylock(&head->mutex)) { |
@@ -3614,16 +3663,18 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, | |||
3614 | continue; | 3663 | continue; |
3615 | } | 3664 | } |
3616 | 3665 | ||
3617 | kfree(head->extent_op); | 3666 | btrfs_free_delayed_extent_op(head->extent_op); |
3618 | delayed_refs->num_heads--; | 3667 | delayed_refs->num_heads--; |
3619 | if (list_empty(&head->cluster)) | 3668 | if (list_empty(&head->cluster)) |
3620 | delayed_refs->num_heads_ready--; | 3669 | delayed_refs->num_heads_ready--; |
3621 | list_del_init(&head->cluster); | 3670 | list_del_init(&head->cluster); |
3622 | } | 3671 | } |
3672 | |||
3623 | ref->in_tree = 0; | 3673 | ref->in_tree = 0; |
3624 | rb_erase(&ref->rb_node, &delayed_refs->root); | 3674 | rb_erase(&ref->rb_node, &delayed_refs->root); |
3625 | delayed_refs->num_entries--; | 3675 | delayed_refs->num_entries--; |
3626 | 3676 | if (head) | |
3677 | mutex_unlock(&head->mutex); | ||
3627 | spin_unlock(&delayed_refs->lock); | 3678 | spin_unlock(&delayed_refs->lock); |
3628 | btrfs_put_delayed_ref(ref); | 3679 | btrfs_put_delayed_ref(ref); |
3629 | 3680 | ||
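Hoisting head out of the if block fixes a lock leak in the old code: the mutex taken with mutex_trylock() was never released before btrfs_put_delayed_ref(), which may free the structure. With the hoisted local, the new if (head) mutex_unlock() drops the lock exactly when the erased node was a head.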
@@ -3671,6 +3722,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) | |||
3671 | delalloc_inodes); | 3722 | delalloc_inodes); |
3672 | 3723 | ||
3673 | list_del_init(&btrfs_inode->delalloc_inodes); | 3724 | list_del_init(&btrfs_inode->delalloc_inodes); |
3725 | clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, | ||
3726 | &btrfs_inode->runtime_flags); | ||
3674 | 3727 | ||
3675 | btrfs_invalidate_inodes(btrfs_inode->root); | 3728 | btrfs_invalidate_inodes(btrfs_inode->root); |
3676 | } | 3729 | } |
@@ -3823,10 +3876,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) | |||
3823 | 3876 | ||
3824 | while (!list_empty(&list)) { | 3877 | while (!list_empty(&list)) { |
3825 | t = list_entry(list.next, struct btrfs_transaction, list); | 3878 | t = list_entry(list.next, struct btrfs_transaction, list); |
3826 | if (!t) | ||
3827 | break; | ||
3828 | 3879 | ||
3829 | btrfs_destroy_ordered_operations(root); | 3880 | btrfs_destroy_ordered_operations(t, root); |
3830 | 3881 | ||
3831 | btrfs_destroy_ordered_extents(root); | 3882 | btrfs_destroy_ordered_extents(root); |
3832 | 3883 | ||
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 305c33efb0e3..034d7dc552b2 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h | |||
@@ -25,6 +25,13 @@ | |||
25 | #define BTRFS_SUPER_MIRROR_MAX 3 | 25 | #define BTRFS_SUPER_MIRROR_MAX 3 |
26 | #define BTRFS_SUPER_MIRROR_SHIFT 12 | 26 | #define BTRFS_SUPER_MIRROR_SHIFT 12 |
27 | 27 | ||
28 | enum { | ||
29 | BTRFS_WQ_ENDIO_DATA = 0, | ||
30 | BTRFS_WQ_ENDIO_METADATA = 1, | ||
31 | BTRFS_WQ_ENDIO_FREE_SPACE = 2, | ||
32 | BTRFS_WQ_ENDIO_RAID56 = 3, | ||
33 | }; | ||
34 | |||
28 | static inline u64 btrfs_sb_offset(int mirror) | 35 | static inline u64 btrfs_sb_offset(int mirror) |
29 | { | 36 | { |
30 | u64 start = 16 * 1024; | 37 | u64 start = 16 * 1024; |
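The enum names the bare 0/1/2 that callers of btrfs_bio_wq_end_io() were passing (and that its comment block documents), and adds the raid56 class. A parity-stripe submitter would route its completions to the new workers roughly like this (the real call sites land in raid56.c):

	ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
	if (ret)
		return ret;	/* bio endio now runs in endio_raid56_workers */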
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cf54bdfee334..3e074dab2d57 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include "print-tree.h" | 31 | #include "print-tree.h" |
32 | #include "transaction.h" | 32 | #include "transaction.h" |
33 | #include "volumes.h" | 33 | #include "volumes.h" |
34 | #include "raid56.h" | ||
34 | #include "locking.h" | 35 | #include "locking.h" |
35 | #include "free-space-cache.h" | 36 | #include "free-space-cache.h" |
36 | #include "math.h" | 37 | #include "math.h" |
@@ -72,8 +73,7 @@ enum { | |||
72 | RESERVE_ALLOC_NO_ACCOUNT = 2, | 73 | RESERVE_ALLOC_NO_ACCOUNT = 2, |
73 | }; | 74 | }; |
74 | 75 | ||
75 | static int update_block_group(struct btrfs_trans_handle *trans, | 76 | static int update_block_group(struct btrfs_root *root, |
76 | struct btrfs_root *root, | ||
77 | u64 bytenr, u64 num_bytes, int alloc); | 77 | u64 bytenr, u64 num_bytes, int alloc); |
78 | static int __btrfs_free_extent(struct btrfs_trans_handle *trans, | 78 | static int __btrfs_free_extent(struct btrfs_trans_handle *trans, |
79 | struct btrfs_root *root, | 79 | struct btrfs_root *root, |
@@ -103,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, | |||
103 | int dump_block_groups); | 103 | int dump_block_groups); |
104 | static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, | 104 | static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, |
105 | u64 num_bytes, int reserve); | 105 | u64 num_bytes, int reserve); |
106 | static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, | ||
107 | u64 num_bytes); | ||
106 | 108 | ||
107 | static noinline int | 109 | static noinline int |
108 | block_group_cache_done(struct btrfs_block_group_cache *cache) | 110 | block_group_cache_done(struct btrfs_block_group_cache *cache) |
@@ -162,6 +164,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, | |||
162 | rb_link_node(&block_group->cache_node, parent, p); | 164 | rb_link_node(&block_group->cache_node, parent, p); |
163 | rb_insert_color(&block_group->cache_node, | 165 | rb_insert_color(&block_group->cache_node, |
164 | &info->block_group_cache_tree); | 166 | &info->block_group_cache_tree); |
167 | |||
168 | if (info->first_logical_byte > block_group->key.objectid) | ||
169 | info->first_logical_byte = block_group->key.objectid; | ||
170 | |||
165 | spin_unlock(&info->block_group_cache_lock); | 171 | spin_unlock(&info->block_group_cache_lock); |
166 | 172 | ||
167 | return 0; | 173 | return 0; |
@@ -203,8 +209,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, | |||
203 | break; | 209 | break; |
204 | } | 210 | } |
205 | } | 211 | } |
206 | if (ret) | 212 | if (ret) { |
207 | btrfs_get_block_group(ret); | 213 | btrfs_get_block_group(ret); |
214 | if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) | ||
215 | info->first_logical_byte = ret->key.objectid; | ||
216 | } | ||
208 | spin_unlock(&info->block_group_cache_lock); | 217 | spin_unlock(&info->block_group_cache_lock); |
209 | 218 | ||
210 | return ret; | 219 | return ret; |
@@ -468,8 +477,6 @@ out: | |||
468 | } | 477 | } |
469 | 478 | ||
470 | static int cache_block_group(struct btrfs_block_group_cache *cache, | 479 | static int cache_block_group(struct btrfs_block_group_cache *cache, |
471 | struct btrfs_trans_handle *trans, | ||
472 | struct btrfs_root *root, | ||
473 | int load_cache_only) | 480 | int load_cache_only) |
474 | { | 481 | { |
475 | DEFINE_WAIT(wait); | 482 | DEFINE_WAIT(wait); |
@@ -527,12 +534,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, | |||
527 | cache->cached = BTRFS_CACHE_FAST; | 534 | cache->cached = BTRFS_CACHE_FAST; |
528 | spin_unlock(&cache->lock); | 535 | spin_unlock(&cache->lock); |
529 | 536 | ||
530 | /* | ||
531 | * We can't do the read from on-disk cache during a commit since we need | ||
532 | * to have the normal tree locking. Also if we are currently trying to | ||
533 | * allocate blocks for the tree root we can't do the fast caching since | ||
534 | * we likely hold important locks. | ||
535 | */ | ||
536 | if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { | 537 | if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { |
537 | ret = load_free_space_cache(fs_info, cache); | 538 | ret = load_free_space_cache(fs_info, cache); |
538 | 539 | ||
@@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, | |||
1852 | *actual_bytes = discarded_bytes; | 1853 | *actual_bytes = discarded_bytes; |
1853 | 1854 | ||
1854 | 1855 | ||
1856 | if (ret == -EOPNOTSUPP) | ||
1857 | ret = 0; | ||
1855 | return ret; | 1858 | return ret; |
1856 | } | 1859 | } |
1857 | 1860 | ||
@@ -2143,7 +2146,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, | |||
2143 | node->num_bytes); | 2146 | node->num_bytes); |
2144 | } | 2147 | } |
2145 | } | 2148 | } |
2146 | mutex_unlock(&head->mutex); | ||
2147 | return ret; | 2149 | return ret; |
2148 | } | 2150 | } |
2149 | 2151 | ||
@@ -2258,7 +2260,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
2258 | * process of being added. Don't run this ref yet. | 2260 | * process of being added. Don't run this ref yet. |
2259 | */ | 2261 | */ |
2260 | list_del_init(&locked_ref->cluster); | 2262 | list_del_init(&locked_ref->cluster); |
2261 | mutex_unlock(&locked_ref->mutex); | 2263 | btrfs_delayed_ref_unlock(locked_ref); |
2262 | locked_ref = NULL; | 2264 | locked_ref = NULL; |
2263 | delayed_refs->num_heads_ready++; | 2265 | delayed_refs->num_heads_ready++; |
2264 | spin_unlock(&delayed_refs->lock); | 2266 | spin_unlock(&delayed_refs->lock); |
@@ -2285,7 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
2285 | ref = &locked_ref->node; | 2287 | ref = &locked_ref->node; |
2286 | 2288 | ||
2287 | if (extent_op && must_insert_reserved) { | 2289 | if (extent_op && must_insert_reserved) { |
2288 | kfree(extent_op); | 2290 | btrfs_free_delayed_extent_op(extent_op); |
2289 | extent_op = NULL; | 2291 | extent_op = NULL; |
2290 | } | 2292 | } |
2291 | 2293 | ||
@@ -2294,28 +2296,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
2294 | 2296 | ||
2295 | ret = run_delayed_extent_op(trans, root, | 2297 | ret = run_delayed_extent_op(trans, root, |
2296 | ref, extent_op); | 2298 | ref, extent_op); |
2297 | kfree(extent_op); | 2299 | btrfs_free_delayed_extent_op(extent_op); |
2298 | 2300 | ||
2299 | if (ret) { | 2301 | if (ret) { |
2300 | list_del_init(&locked_ref->cluster); | 2302 | printk(KERN_DEBUG |
2301 | mutex_unlock(&locked_ref->mutex); | 2303 | "btrfs: run_delayed_extent_op " |
2302 | 2304 | "returned %d\n", ret); | |
2303 | printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); | ||
2304 | spin_lock(&delayed_refs->lock); | 2305 | spin_lock(&delayed_refs->lock); |
2306 | btrfs_delayed_ref_unlock(locked_ref); | ||
2305 | return ret; | 2307 | return ret; |
2306 | } | 2308 | } |
2307 | 2309 | ||
2308 | goto next; | 2310 | goto next; |
2309 | } | 2311 | } |
2310 | |||
2311 | list_del_init(&locked_ref->cluster); | ||
2312 | locked_ref = NULL; | ||
2313 | } | 2312 | } |
2314 | 2313 | ||
2315 | ref->in_tree = 0; | 2314 | ref->in_tree = 0; |
2316 | rb_erase(&ref->rb_node, &delayed_refs->root); | 2315 | rb_erase(&ref->rb_node, &delayed_refs->root); |
2317 | delayed_refs->num_entries--; | 2316 | delayed_refs->num_entries--; |
2318 | if (locked_ref) { | 2317 | if (!btrfs_delayed_ref_is_head(ref)) { |
2319 | /* | 2318 | /* |
2320 | * when we play the delayed ref, also correct the | 2319 | * when we play the delayed ref, also correct the |
2321 | * ref_mod on head | 2320 | * ref_mod on head |
@@ -2337,20 +2336,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
2337 | ret = run_one_delayed_ref(trans, root, ref, extent_op, | 2336 | ret = run_one_delayed_ref(trans, root, ref, extent_op, |
2338 | must_insert_reserved); | 2337 | must_insert_reserved); |
2339 | 2338 | ||
2340 | btrfs_put_delayed_ref(ref); | 2339 | btrfs_free_delayed_extent_op(extent_op); |
2341 | kfree(extent_op); | ||
2342 | count++; | ||
2343 | |||
2344 | if (ret) { | 2340 | if (ret) { |
2345 | if (locked_ref) { | 2341 | btrfs_delayed_ref_unlock(locked_ref); |
2346 | list_del_init(&locked_ref->cluster); | 2342 | btrfs_put_delayed_ref(ref); |
2347 | mutex_unlock(&locked_ref->mutex); | 2343 | printk(KERN_DEBUG |
2348 | } | 2344 | "btrfs: run_one_delayed_ref returned %d\n", ret); |
2349 | printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); | ||
2350 | spin_lock(&delayed_refs->lock); | 2345 | spin_lock(&delayed_refs->lock); |
2351 | return ret; | 2346 | return ret; |
2352 | } | 2347 | } |
2353 | 2348 | ||
2349 | /* | ||
2350 | * If this node is a head, that means all the refs in this head | ||
2351 | * have been dealt with, and we will pick the next head to deal | ||
2352 | * with, so we must unlock the head and drop it from the cluster | ||
2353 | * list before we release it. | ||
2354 | */ | ||
2355 | if (btrfs_delayed_ref_is_head(ref)) { | ||
2356 | list_del_init(&locked_ref->cluster); | ||
2357 | btrfs_delayed_ref_unlock(locked_ref); | ||
2358 | locked_ref = NULL; | ||
2359 | } | ||
2360 | btrfs_put_delayed_ref(ref); | ||
2361 | count++; | ||
2354 | next: | 2362 | next: |
2355 | cond_resched(); | 2363 | cond_resched(); |
2356 | spin_lock(&delayed_refs->lock); | 2364 | spin_lock(&delayed_refs->lock); |
@@ -2435,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, | |||
2435 | return ret; | 2443 | return ret; |
2436 | } | 2444 | } |
2437 | 2445 | ||
2446 | static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, | ||
2447 | int count) | ||
2448 | { | ||
2449 | int val = atomic_read(&delayed_refs->ref_seq); | ||
2450 | |||
2451 | if (val < seq || val >= seq + count) | ||
2452 | return 1; | ||
2453 | return 0; | ||
2454 | } | ||
2455 | |||
2438 | /* | 2456 | /* |
2439 | * this starts processing the delayed reference count updates and | 2457 | * this starts processing the delayed reference count updates and |
2440 | * extent insertions we have queued up so far. count can be | 2458 | * extent insertions we have queued up so far. count can be |
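refs_newer() reads as: ref_seq has left the window [seq, seq + count), meaning at least count refs were processed since seq was sampled (or the counter wrapped). The waiters below call it with count = 256 to decide whether sleeping actually bought them progress.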
@@ -2469,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, | |||
2469 | 2487 | ||
2470 | delayed_refs = &trans->transaction->delayed_refs; | 2488 | delayed_refs = &trans->transaction->delayed_refs; |
2471 | INIT_LIST_HEAD(&cluster); | 2489 | INIT_LIST_HEAD(&cluster); |
2490 | if (count == 0) { | ||
2491 | count = delayed_refs->num_entries * 2; | ||
2492 | run_most = 1; | ||
2493 | } | ||
2494 | |||
2495 | if (!run_all && !run_most) { | ||
2496 | int old; | ||
2497 | int seq = atomic_read(&delayed_refs->ref_seq); | ||
2498 | |||
2499 | progress: | ||
2500 | old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); | ||
2501 | if (old) { | ||
2502 | DEFINE_WAIT(__wait); | ||
2503 | if (delayed_refs->num_entries < 16348) | ||
2504 | return 0; | ||
2505 | |||
2506 | prepare_to_wait(&delayed_refs->wait, &__wait, | ||
2507 | TASK_UNINTERRUPTIBLE); | ||
2508 | |||
2509 | old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); | ||
2510 | if (old) { | ||
2511 | schedule(); | ||
2512 | finish_wait(&delayed_refs->wait, &__wait); | ||
2513 | |||
2514 | if (!refs_newer(delayed_refs, seq, 256)) | ||
2515 | goto progress; | ||
2516 | else | ||
2517 | return 0; | ||
2518 | } else { | ||
2519 | finish_wait(&delayed_refs->wait, &__wait); | ||
2520 | goto again; | ||
2521 | } | ||
2522 | } | ||
2523 | |||
2524 | } else { | ||
2525 | atomic_inc(&delayed_refs->procs_running_refs); | ||
2526 | } | ||
2527 | |||
2472 | again: | 2528 | again: |
2473 | loops = 0; | 2529 | loops = 0; |
2474 | spin_lock(&delayed_refs->lock); | 2530 | spin_lock(&delayed_refs->lock); |
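The gate above admits one ref-runner at a time: atomic_cmpxchg(&procs_running_refs, 0, 1) claims the slot. A loser with a small backlog (under ~16k entries) returns immediately, since piling in would only add contention; a loser with a big backlog sleeps on delayed_refs->wait and re-checks with refs_newer() after waking. The matching release, atomic_dec() plus wake_up(), sits in the out: path further down.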
@@ -2477,10 +2533,6 @@ again: | |||
2477 | delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); | 2533 | delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); |
2478 | #endif | 2534 | #endif |
2479 | 2535 | ||
2480 | if (count == 0) { | ||
2481 | count = delayed_refs->num_entries * 2; | ||
2482 | run_most = 1; | ||
2483 | } | ||
2484 | while (1) { | 2536 | while (1) { |
2485 | if (!(run_all || run_most) && | 2537 | if (!(run_all || run_most) && |
2486 | delayed_refs->num_heads_ready < 64) | 2538 | delayed_refs->num_heads_ready < 64) |
@@ -2500,11 +2552,15 @@ again: | |||
2500 | 2552 | ||
2501 | ret = run_clustered_refs(trans, root, &cluster); | 2553 | ret = run_clustered_refs(trans, root, &cluster); |
2502 | if (ret < 0) { | 2554 | if (ret < 0) { |
2555 | btrfs_release_ref_cluster(&cluster); | ||
2503 | spin_unlock(&delayed_refs->lock); | 2556 | spin_unlock(&delayed_refs->lock); |
2504 | btrfs_abort_transaction(trans, root, ret); | 2557 | btrfs_abort_transaction(trans, root, ret); |
2558 | atomic_dec(&delayed_refs->procs_running_refs); | ||
2505 | return ret; | 2559 | return ret; |
2506 | } | 2560 | } |
2507 | 2561 | ||
2562 | atomic_add(ret, &delayed_refs->ref_seq); | ||
2563 | |||
2508 | count -= min_t(unsigned long, ret, count); | 2564 | count -= min_t(unsigned long, ret, count); |
2509 | 2565 | ||
2510 | if (count == 0) | 2566 | if (count == 0) |
@@ -2573,6 +2629,11 @@ again: | |||
2573 | goto again; | 2629 | goto again; |
2574 | } | 2630 | } |
2575 | out: | 2631 | out: |
2632 | atomic_dec(&delayed_refs->procs_running_refs); | ||
2633 | smp_mb(); | ||
2634 | if (waitqueue_active(&delayed_refs->wait)) | ||
2635 | wake_up(&delayed_refs->wait); | ||
2636 | |||
2576 | spin_unlock(&delayed_refs->lock); | 2637 | spin_unlock(&delayed_refs->lock); |
2577 | assert_qgroups_uptodate(trans); | 2638 | assert_qgroups_uptodate(trans); |
2578 | return 0; | 2639 | return 0; |
@@ -2586,7 +2647,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, | |||
2586 | struct btrfs_delayed_extent_op *extent_op; | 2647 | struct btrfs_delayed_extent_op *extent_op; |
2587 | int ret; | 2648 | int ret; |
2588 | 2649 | ||
2589 | extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); | 2650 | extent_op = btrfs_alloc_delayed_extent_op(); |
2590 | if (!extent_op) | 2651 | if (!extent_op) |
2591 | return -ENOMEM; | 2652 | return -ENOMEM; |
2592 | 2653 | ||
@@ -2598,7 +2659,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, | |||
2598 | ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, | 2659 | ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, |
2599 | num_bytes, extent_op); | 2660 | num_bytes, extent_op); |
2600 | if (ret) | 2661 | if (ret) |
2601 | kfree(extent_op); | 2662 | btrfs_free_delayed_extent_op(extent_op); |
2602 | return ret; | 2663 | return ret; |
2603 | } | 2664 | } |
2604 | 2665 | ||
@@ -3223,12 +3284,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) | |||
3223 | u64 extra_flags = chunk_to_extended(flags) & | 3284 | u64 extra_flags = chunk_to_extended(flags) & |
3224 | BTRFS_EXTENDED_PROFILE_MASK; | 3285 | BTRFS_EXTENDED_PROFILE_MASK; |
3225 | 3286 | ||
3287 | write_seqlock(&fs_info->profiles_lock); | ||
3226 | if (flags & BTRFS_BLOCK_GROUP_DATA) | 3288 | if (flags & BTRFS_BLOCK_GROUP_DATA) |
3227 | fs_info->avail_data_alloc_bits |= extra_flags; | 3289 | fs_info->avail_data_alloc_bits |= extra_flags; |
3228 | if (flags & BTRFS_BLOCK_GROUP_METADATA) | 3290 | if (flags & BTRFS_BLOCK_GROUP_METADATA) |
3229 | fs_info->avail_metadata_alloc_bits |= extra_flags; | 3291 | fs_info->avail_metadata_alloc_bits |= extra_flags; |
3230 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | 3292 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) |
3231 | fs_info->avail_system_alloc_bits |= extra_flags; | 3293 | fs_info->avail_system_alloc_bits |= extra_flags; |
3294 | write_sequnlock(&fs_info->profiles_lock); | ||
3232 | } | 3295 | } |
3233 | 3296 | ||
3234 | /* | 3297 | /* |
@@ -3276,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) | |||
3276 | u64 num_devices = root->fs_info->fs_devices->rw_devices + | 3339 | u64 num_devices = root->fs_info->fs_devices->rw_devices + |
3277 | root->fs_info->fs_devices->missing_devices; | 3340 | root->fs_info->fs_devices->missing_devices; |
3278 | u64 target; | 3341 | u64 target; |
3342 | u64 tmp; | ||
3279 | 3343 | ||
3280 | /* | 3344 | /* |
3281 | * see if restripe for this chunk_type is in progress, if so | 3345 | * see if restripe for this chunk_type is in progress, if so |
@@ -3292,40 +3356,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) | |||
3292 | } | 3356 | } |
3293 | spin_unlock(&root->fs_info->balance_lock); | 3357 | spin_unlock(&root->fs_info->balance_lock); |
3294 | 3358 | ||
3359 | /* First, mask out the RAID levels which aren't possible */ | ||
3295 | if (num_devices == 1) | 3360 | if (num_devices == 1) |
3296 | flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); | 3361 | flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | |
3362 | BTRFS_BLOCK_GROUP_RAID5); | ||
3363 | if (num_devices < 3) | ||
3364 | flags &= ~BTRFS_BLOCK_GROUP_RAID6; | ||
3297 | if (num_devices < 4) | 3365 | if (num_devices < 4) |
3298 | flags &= ~BTRFS_BLOCK_GROUP_RAID10; | 3366 | flags &= ~BTRFS_BLOCK_GROUP_RAID10; |
3299 | 3367 | ||
3300 | if ((flags & BTRFS_BLOCK_GROUP_DUP) && | 3368 | tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | |
3301 | (flags & (BTRFS_BLOCK_GROUP_RAID1 | | 3369 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | |
3302 | BTRFS_BLOCK_GROUP_RAID10))) { | 3370 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); |
3303 | flags &= ~BTRFS_BLOCK_GROUP_DUP; | 3371 | flags &= ~tmp; |
3304 | } | ||
3305 | |||
3306 | if ((flags & BTRFS_BLOCK_GROUP_RAID1) && | ||
3307 | (flags & BTRFS_BLOCK_GROUP_RAID10)) { | ||
3308 | flags &= ~BTRFS_BLOCK_GROUP_RAID1; | ||
3309 | } | ||
3310 | 3372 | ||
3311 | if ((flags & BTRFS_BLOCK_GROUP_RAID0) && | 3373 | if (tmp & BTRFS_BLOCK_GROUP_RAID6) |
3312 | ((flags & BTRFS_BLOCK_GROUP_RAID1) | | 3374 | tmp = BTRFS_BLOCK_GROUP_RAID6; |
3313 | (flags & BTRFS_BLOCK_GROUP_RAID10) | | 3375 | else if (tmp & BTRFS_BLOCK_GROUP_RAID5) |
3314 | (flags & BTRFS_BLOCK_GROUP_DUP))) { | 3376 | tmp = BTRFS_BLOCK_GROUP_RAID5; |
3315 | flags &= ~BTRFS_BLOCK_GROUP_RAID0; | 3377 | else if (tmp & BTRFS_BLOCK_GROUP_RAID10) |
3316 | } | 3378 | tmp = BTRFS_BLOCK_GROUP_RAID10; |
3379 | else if (tmp & BTRFS_BLOCK_GROUP_RAID1) | ||
3380 | tmp = BTRFS_BLOCK_GROUP_RAID1; | ||
3381 | else if (tmp & BTRFS_BLOCK_GROUP_RAID0) | ||
3382 | tmp = BTRFS_BLOCK_GROUP_RAID0; | ||
3317 | 3383 | ||
3318 | return extended_to_chunk(flags); | 3384 | return extended_to_chunk(flags | tmp); |
3319 | } | 3385 | } |
3320 | 3386 | ||
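The rewritten btrfs_reduce_alloc_profile() first masks out the profiles the device count cannot support, then keeps only the first remaining profile bit in the order the if/else chain tests them: RAID6, then RAID5, RAID10, RAID1, RAID0. A standalone model of that priority chain (the bit values here are illustrative, not the on-disk btrfs flags):

	#include <stdio.h>

	#define RAID0  (1u << 0)
	#define RAID1  (1u << 1)
	#define RAID5  (1u << 2)
	#define RAID6  (1u << 3)
	#define RAID10 (1u << 4)

	/* keep only the highest-priority profile bit that is still set */
	static unsigned reduce(unsigned tmp)
	{
		if (tmp & RAID6)
			return RAID6;
		if (tmp & RAID5)
			return RAID5;
		if (tmp & RAID10)
			return RAID10;
		if (tmp & RAID1)
			return RAID1;
		if (tmp & RAID0)
			return RAID0;
		return 0;
	}

	int main(void)
	{
		/* RAID1 | RAID6 requested: RAID6 wins the chain */
		printf("0x%x\n", reduce(RAID1 | RAID6)); /* prints 0x8 */
		return 0;
	}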
3321 | static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) | 3387 | static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) |
3322 | { | 3388 | { |
3323 | if (flags & BTRFS_BLOCK_GROUP_DATA) | 3389 | unsigned seq; |
3324 | flags |= root->fs_info->avail_data_alloc_bits; | 3390 | |
3325 | else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | 3391 | do { |
3326 | flags |= root->fs_info->avail_system_alloc_bits; | 3392 | seq = read_seqbegin(&root->fs_info->profiles_lock); |
3327 | else if (flags & BTRFS_BLOCK_GROUP_METADATA) | 3393 | |
3328 | flags |= root->fs_info->avail_metadata_alloc_bits; | 3394 | if (flags & BTRFS_BLOCK_GROUP_DATA) |
3395 | flags |= root->fs_info->avail_data_alloc_bits; | ||
3396 | else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | ||
3397 | flags |= root->fs_info->avail_system_alloc_bits; | ||
3398 | else if (flags & BTRFS_BLOCK_GROUP_METADATA) | ||
3399 | flags |= root->fs_info->avail_metadata_alloc_bits; | ||
3400 | } while (read_seqretry(&root->fs_info->profiles_lock, seq)); | ||
3329 | 3401 | ||
3330 | return btrfs_reduce_alloc_profile(root, flags); | 3402 | return btrfs_reduce_alloc_profile(root, flags); |
3331 | } | 3403 | } |
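get_alloc_profile() now samples the avail_*_alloc_bits under the new profiles_lock seqlock: the read loops until no writer (set_avail_alloc_bits()/clear_avail_alloc_bits(), which take write_seqlock()) raced with it. A minimal single-writer userspace model of that read side (illustrative only; a real seqlock also handles memory ordering):

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_uint seq;      /* even: stable, odd: write in progress */
	static unsigned avail_bits;  /* the data the seqlock protects */

	static unsigned read_begin(void)
	{
		unsigned s;

		while ((s = atomic_load(&seq)) & 1)
			;   /* writer active: wait for an even count */
		return s;
	}

	static int read_retry(unsigned s)
	{
		return atomic_load(&seq) != s;  /* count moved: a writer ran, retry */
	}

	int main(void)
	{
		unsigned s, bits;

		do {
			s = read_begin();
			bits = avail_bits;  /* the protected read */
		} while (read_retry(s));
		printf("avail bits: 0x%x\n", bits);
		return 0;
	}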
@@ -3333,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) | |||
3333 | u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) | 3405 | u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) |
3334 | { | 3406 | { |
3335 | u64 flags; | 3407 | u64 flags; |
3408 | u64 ret; | ||
3336 | 3409 | ||
3337 | if (data) | 3410 | if (data) |
3338 | flags = BTRFS_BLOCK_GROUP_DATA; | 3411 | flags = BTRFS_BLOCK_GROUP_DATA; |
@@ -3341,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) | |||
3341 | else | 3414 | else |
3342 | flags = BTRFS_BLOCK_GROUP_METADATA; | 3415 | flags = BTRFS_BLOCK_GROUP_METADATA; |
3343 | 3416 | ||
3344 | return get_alloc_profile(root, flags); | 3417 | ret = get_alloc_profile(root, flags); |
3418 | return ret; | ||
3345 | } | 3419 | } |
3346 | 3420 | ||
3347 | /* | 3421 | /* |
@@ -3357,7 +3431,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) | |||
3357 | int ret = 0, committed = 0, alloc_chunk = 1; | 3431 | int ret = 0, committed = 0, alloc_chunk = 1; |
3358 | 3432 | ||
3359 | /* make sure bytes are sectorsize aligned */ | 3433 | /* make sure bytes are sectorsize aligned */ |
3360 | bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); | 3434 | bytes = ALIGN(bytes, root->sectorsize); |
3361 | 3435 | ||
3362 | if (root == root->fs_info->tree_root || | 3436 | if (root == root->fs_info->tree_root || |
3363 | BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { | 3437 | BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { |
@@ -3452,7 +3526,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) | |||
3452 | struct btrfs_space_info *data_sinfo; | 3526 | struct btrfs_space_info *data_sinfo; |
3453 | 3527 | ||
3454 | /* make sure bytes are sectorsize aligned */ | 3528 | /* make sure bytes are sectorsize aligned */ |
3455 | bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); | 3529 | bytes = ALIGN(bytes, root->sectorsize); |
3456 | 3530 | ||
3457 | data_sinfo = root->fs_info->data_sinfo; | 3531 | data_sinfo = root->fs_info->data_sinfo; |
3458 | spin_lock(&data_sinfo->lock); | 3532 | spin_lock(&data_sinfo->lock); |
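Both hunks above swap the open-coded round-up mask for ALIGN(), which relies on the (x + a - 1) & ~(a - 1) identity for power-of-two a. A quick standalone check of the arithmetic (a 4096-byte sectorsize assumed):

	#include <stdio.h>

	#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long long)(a) - 1))

	int main(void)
	{
		printf("%llu\n", ALIGN(5000ULL, 4096)); /* rounds up to 8192 */
		printf("%llu\n", ALIGN(8192ULL, 4096)); /* already aligned: 8192 */
		return 0;
	}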
@@ -3516,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) | |||
3516 | { | 3590 | { |
3517 | u64 num_dev; | 3591 | u64 num_dev; |
3518 | 3592 | ||
3519 | if (type & BTRFS_BLOCK_GROUP_RAID10 || | 3593 | if (type & (BTRFS_BLOCK_GROUP_RAID10 | |
3520 | type & BTRFS_BLOCK_GROUP_RAID0) | 3594 | BTRFS_BLOCK_GROUP_RAID0 | |
3595 | BTRFS_BLOCK_GROUP_RAID5 | | ||
3596 | BTRFS_BLOCK_GROUP_RAID6)) | ||
3521 | num_dev = root->fs_info->fs_devices->rw_devices; | 3597 | num_dev = root->fs_info->fs_devices->rw_devices; |
3522 | else if (type & BTRFS_BLOCK_GROUP_RAID1) | 3598 | else if (type & BTRFS_BLOCK_GROUP_RAID1) |
3523 | num_dev = 2; | 3599 | num_dev = 2; |
@@ -3564,6 +3640,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, | |||
3564 | int wait_for_alloc = 0; | 3640 | int wait_for_alloc = 0; |
3565 | int ret = 0; | 3641 | int ret = 0; |
3566 | 3642 | ||
3643 | /* Don't re-enter if we're already allocating a chunk */ | ||
3644 | if (trans->allocating_chunk) | ||
3645 | return -ENOSPC; | ||
3646 | |||
3567 | space_info = __find_space_info(extent_root->fs_info, flags); | 3647 | space_info = __find_space_info(extent_root->fs_info, flags); |
3568 | if (!space_info) { | 3648 | if (!space_info) { |
3569 | ret = update_space_info(extent_root->fs_info, flags, | 3649 | ret = update_space_info(extent_root->fs_info, flags, |
@@ -3606,6 +3686,8 @@ again: | |||
3606 | goto again; | 3686 | goto again; |
3607 | } | 3687 | } |
3608 | 3688 | ||
3689 | trans->allocating_chunk = true; | ||
3690 | |||
3609 | /* | 3691 | /* |
3610 | * If we have mixed data/metadata chunks we want to make sure we keep | 3692 | * If we have mixed data/metadata chunks we want to make sure we keep |
3611 | * allocating mixed chunks instead of individual chunks. | 3693 | * allocating mixed chunks instead of individual chunks. |
@@ -3632,19 +3714,20 @@ again: | |||
3632 | check_system_chunk(trans, extent_root, flags); | 3714 | check_system_chunk(trans, extent_root, flags); |
3633 | 3715 | ||
3634 | ret = btrfs_alloc_chunk(trans, extent_root, flags); | 3716 | ret = btrfs_alloc_chunk(trans, extent_root, flags); |
3635 | if (ret < 0 && ret != -ENOSPC) | 3717 | trans->allocating_chunk = false; |
3636 | goto out; | ||
3637 | 3718 | ||
3638 | spin_lock(&space_info->lock); | 3719 | spin_lock(&space_info->lock); |
3720 | if (ret < 0 && ret != -ENOSPC) | ||
3721 | goto out; | ||
3639 | if (ret) | 3722 | if (ret) |
3640 | space_info->full = 1; | 3723 | space_info->full = 1; |
3641 | else | 3724 | else |
3642 | ret = 1; | 3725 | ret = 1; |
3643 | 3726 | ||
3644 | space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; | 3727 | space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; |
3728 | out: | ||
3645 | space_info->chunk_alloc = 0; | 3729 | space_info->chunk_alloc = 0; |
3646 | spin_unlock(&space_info->lock); | 3730 | spin_unlock(&space_info->lock); |
3647 | out: | ||
3648 | mutex_unlock(&fs_info->chunk_mutex); | 3731 | mutex_unlock(&fs_info->chunk_mutex); |
3649 | return ret; | 3732 | return ret; |
3650 | } | 3733 | } |
@@ -3653,13 +3736,31 @@ static int can_overcommit(struct btrfs_root *root, | |||
3653 | struct btrfs_space_info *space_info, u64 bytes, | 3736 | struct btrfs_space_info *space_info, u64 bytes, |
3654 | enum btrfs_reserve_flush_enum flush) | 3737 | enum btrfs_reserve_flush_enum flush) |
3655 | { | 3738 | { |
3739 | struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; | ||
3656 | u64 profile = btrfs_get_alloc_profile(root, 0); | 3740 | u64 profile = btrfs_get_alloc_profile(root, 0); |
3741 | u64 rsv_size = 0; | ||
3657 | u64 avail; | 3742 | u64 avail; |
3658 | u64 used; | 3743 | u64 used; |
3744 | u64 to_add; | ||
3659 | 3745 | ||
3660 | used = space_info->bytes_used + space_info->bytes_reserved + | 3746 | used = space_info->bytes_used + space_info->bytes_reserved + |
3661 | space_info->bytes_pinned + space_info->bytes_readonly + | 3747 | space_info->bytes_pinned + space_info->bytes_readonly; |
3662 | space_info->bytes_may_use; | 3748 | |
3749 | spin_lock(&global_rsv->lock); | ||
3750 | rsv_size = global_rsv->size; | ||
3751 | spin_unlock(&global_rsv->lock); | ||
3752 | |||
3753 | /* | ||
3754 | * We only want to allow overcommitting if we have lots of actual space | ||
3755 | * free, but if we don't have enough space to handle the global reserve | ||
3756 | * space then we could end up having a real enospc problem when trying | ||
3757 | * to allocate a chunk or some other such important allocation. | ||
3758 | */ | ||
3759 | rsv_size <<= 1; | ||
3760 | if (used + rsv_size >= space_info->total_bytes) | ||
3761 | return 0; | ||
3762 | |||
3763 | used += space_info->bytes_may_use; | ||
3663 | 3764 | ||
3664 | spin_lock(&root->fs_info->free_chunk_lock); | 3765 | spin_lock(&root->fs_info->free_chunk_lock); |
3665 | avail = root->fs_info->free_chunk_space; | 3766 | avail = root->fs_info->free_chunk_space; |
@@ -3667,28 +3768,60 @@ static int can_overcommit(struct btrfs_root *root, | |||
3667 | 3768 | ||
3668 | /* | 3769 | /* |
3669 | * If we have dup, raid1 or raid10 then only half of the free | 3770 | * If we have dup, raid1 or raid10 then only half of the free |
3670 | * space is actually useable. | 3771 | * space is actually useable. For raid56, the space info used |
3772 | * doesn't include the parity drive, so we don't have to | ||
3773 | * change the math | ||
3671 | */ | 3774 | */ |
3672 | if (profile & (BTRFS_BLOCK_GROUP_DUP | | 3775 | if (profile & (BTRFS_BLOCK_GROUP_DUP | |
3673 | BTRFS_BLOCK_GROUP_RAID1 | | 3776 | BTRFS_BLOCK_GROUP_RAID1 | |
3674 | BTRFS_BLOCK_GROUP_RAID10)) | 3777 | BTRFS_BLOCK_GROUP_RAID10)) |
3675 | avail >>= 1; | 3778 | avail >>= 1; |
3676 | 3779 | ||
3780 | to_add = space_info->total_bytes; | ||
3781 | |||
3677 | /* | 3782 | /* |
3678 | * If we aren't flushing all things, let us overcommit up to | 3783 | * If we aren't flushing all things, let us overcommit up to |
3679 | * 1/2 of the space. If we can flush, don't let us overcommit | 3784 | * 1/2 of the space. If we can flush, don't let us overcommit |
3680 | * too much, let it overcommit up to 1/8 of the space. | 3785 | * too much, let it overcommit up to 1/8 of the space. |
3681 | */ | 3786 | */ |
3682 | if (flush == BTRFS_RESERVE_FLUSH_ALL) | 3787 | if (flush == BTRFS_RESERVE_FLUSH_ALL) |
3683 | avail >>= 3; | 3788 | to_add >>= 3; |
3684 | else | 3789 | else |
3685 | avail >>= 1; | 3790 | to_add >>= 1; |
3686 | 3791 | ||
3687 | if (used + bytes < space_info->total_bytes + avail) | 3792 | /* |
3793 | * Limit the overcommit to the amount of free space we could possibly | ||
3794 | * allocate for chunks. | ||
3795 | */ | ||
3796 | to_add = min(avail, to_add); | ||
3797 | |||
3798 | if (used + bytes < space_info->total_bytes + to_add) | ||
3688 | return 1; | 3799 | return 1; |
3689 | return 0; | 3800 | return 0; |
3690 | } | 3801 | } |
3691 | 3802 | ||
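The reworked can_overcommit() applies three bounds: refuse outright if used space plus twice the global reserve already reaches total_bytes, cap the overcommit at 1/8 of total_bytes for FLUSH_ALL (1/2 otherwise), and never exceed the unallocated chunk space. A worked example with made-up numbers (bytes_may_use is folded into used here for brevity):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long total = 100ULL << 30;  /* 100 GiB space_info */
		unsigned long long used  = 40ULL << 30;
		unsigned long long rsv   = 2ULL << 30;    /* global block rsv size */
		unsigned long long avail = 30ULL << 30;   /* unallocated chunk space */
		unsigned long long bytes = 5ULL << 30;    /* reservation request */
		unsigned long long to_add;

		if (used + (rsv << 1) >= total) {         /* 44 GiB < 100 GiB: pass */
			puts("no overcommit");
			return 0;
		}
		to_add = total >> 3;                      /* FLUSH_ALL: 1/8 of total */
		if (to_add > avail)
			to_add = avail;                   /* min(avail, to_add) */
		puts(used + bytes < total + to_add ? "overcommit ok" : "no overcommit");
		return 0;
	}

With these numbers to_add is 12.5 GiB, so a 5 GiB request on a 100 GiB space_info with 40 GiB used is allowed to overcommit.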
3803 | void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, | ||
3804 | unsigned long nr_pages) | ||
3805 | { | ||
3806 | struct super_block *sb = root->fs_info->sb; | ||
3807 | int started; | ||
3808 | |||
3809 | /* If we cannot start writeback, just sync all the delalloc files. */ | ||
3810 | started = try_to_writeback_inodes_sb_nr(sb, nr_pages, | ||
3811 | WB_REASON_FS_FREE_SPACE); | ||
3812 | if (!started) { | ||
3813 | /* | ||
3814 | * We needn't worry about the filesystem going from r/w to r/o even | ||
3815 | * though we don't acquire the ->s_umount mutex, because the filesystem | ||
3816 | * should guarantee that the delalloc inode list is empty after | ||
3817 | * the filesystem becomes read-only (all dirty pages are written to | ||
3818 | * the disk). | ||
3819 | */ | ||
3820 | btrfs_start_delalloc_inodes(root, 0); | ||
3821 | btrfs_wait_ordered_extents(root, 0); | ||
3822 | } | ||
3823 | } | ||
3824 | |||
3692 | /* | 3825 | /* |
3693 | * shrink metadata reservation for delalloc | 3826 | * shrink metadata reservation for delalloc |
3694 | */ | 3827 | */ |
@@ -3710,7 +3843,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, | |||
3710 | space_info = block_rsv->space_info; | 3843 | space_info = block_rsv->space_info; |
3711 | 3844 | ||
3712 | smp_mb(); | 3845 | smp_mb(); |
3713 | delalloc_bytes = root->fs_info->delalloc_bytes; | 3846 | delalloc_bytes = percpu_counter_sum_positive( |
3847 | &root->fs_info->delalloc_bytes); | ||
3714 | if (delalloc_bytes == 0) { | 3848 | if (delalloc_bytes == 0) { |
3715 | if (trans) | 3849 | if (trans) |
3716 | return; | 3850 | return; |
@@ -3721,10 +3855,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, | |||
3721 | while (delalloc_bytes && loops < 3) { | 3855 | while (delalloc_bytes && loops < 3) { |
3722 | max_reclaim = min(delalloc_bytes, to_reclaim); | 3856 | max_reclaim = min(delalloc_bytes, to_reclaim); |
3723 | nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; | 3857 | nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; |
3724 | try_to_writeback_inodes_sb_nr(root->fs_info->sb, | 3858 | btrfs_writeback_inodes_sb_nr(root, nr_pages); |
3725 | nr_pages, | ||
3726 | WB_REASON_FS_FREE_SPACE); | ||
3727 | |||
3728 | /* | 3859 | /* |
3729 | * We need to wait for the async pages to actually start before | 3860 | * We need to wait for the async pages to actually start before |
3730 | * we do anything. | 3861 | * we do anything. |
@@ -3752,7 +3883,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, | |||
3752 | break; | 3883 | break; |
3753 | } | 3884 | } |
3754 | smp_mb(); | 3885 | smp_mb(); |
3755 | delalloc_bytes = root->fs_info->delalloc_bytes; | 3886 | delalloc_bytes = percpu_counter_sum_positive( |
3887 | &root->fs_info->delalloc_bytes); | ||
3756 | } | 3888 | } |
3757 | } | 3889 | } |
3758 | 3890 | ||
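delalloc_bytes is now a per-cpu counter, so readers must use percpu_counter_sum_positive(): individual per-cpu deltas can be negative even when the logical total is non-negative, and the sum is clamped at zero. A userspace illustration of why the clamp matters:

	#include <stdio.h>

	int main(void)
	{
		/* per-cpu deltas that sum to zero despite negative entries */
		long cpu_delta[4] = { 4096, -2048, -4096, 2048 };
		long sum = 0;
		int i;

		for (i = 0; i < 4; i++)
			sum += cpu_delta[i];
		if (sum < 0)
			sum = 0;  /* clamp, as percpu_counter_sum_positive() does */
		printf("delalloc_bytes = %ld\n", sum);
		return 0;
	}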
@@ -4016,6 +4148,15 @@ again: | |||
4016 | goto again; | 4148 | goto again; |
4017 | 4149 | ||
4018 | out: | 4150 | out: |
4151 | if (ret == -ENOSPC && | ||
4152 | unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { | ||
4153 | struct btrfs_block_rsv *global_rsv = | ||
4154 | &root->fs_info->global_block_rsv; | ||
4155 | |||
4156 | if (block_rsv != global_rsv && | ||
4157 | !block_rsv_use_bytes(global_rsv, orig_bytes)) | ||
4158 | ret = 0; | ||
4159 | } | ||
4019 | if (flushing) { | 4160 | if (flushing) { |
4020 | spin_lock(&space_info->lock); | 4161 | spin_lock(&space_info->lock); |
4021 | space_info->flush = 0; | 4162 | space_info->flush = 0; |
@@ -4402,19 +4543,60 @@ void btrfs_orphan_release_metadata(struct inode *inode) | |||
4402 | btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); | 4543 | btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); |
4403 | } | 4544 | } |
4404 | 4545 | ||
4405 | int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, | 4546 | /* |
4406 | struct btrfs_pending_snapshot *pending) | 4547 | * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation |
4548 | * root: the root of the parent directory | ||
4549 | * rsv: block reservation | ||
4550 | * items: the number of items that we need do reservation | ||
4551 | * qgroup_reserved: used to return the reserved size in qgroup | ||
4552 | * | ||
4553 | * This function is used to reserve the space for snapshot/subvolume | ||
4554 | * creation and deletion. Those operations are different with the | ||
4555 | * common file/directory operations, they change two fs/file trees | ||
4556 | * and root tree, the number of items that the qgroup reserves is | ||
4557 | * different with the free space reservation. So we can not use | ||
4558 | * the space reseravtion mechanism in start_transaction(). | ||
4559 | */ | ||
4560 | int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, | ||
4561 | struct btrfs_block_rsv *rsv, | ||
4562 | int items, | ||
4563 | u64 *qgroup_reserved) | ||
4407 | { | 4564 | { |
4408 | struct btrfs_root *root = pending->root; | 4565 | u64 num_bytes; |
4409 | struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); | 4566 | int ret; |
4410 | struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; | 4567 | |
4411 | /* | 4568 | if (root->fs_info->quota_enabled) { |
4412 | * two for root back/forward refs, two for directory entries, | 4569 | /* One for parent inode, two for dir entries */ |
4413 | * one for root of the snapshot and one for parent inode. | 4570 | num_bytes = 3 * root->leafsize; |
4414 | */ | 4571 | ret = btrfs_qgroup_reserve(root, num_bytes); |
4415 | u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); | 4572 | if (ret) |
4416 | dst_rsv->space_info = src_rsv->space_info; | 4573 | return ret; |
4417 | return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); | 4574 | } else { |
4575 | num_bytes = 0; | ||
4576 | } | ||
4577 | |||
4578 | *qgroup_reserved = num_bytes; | ||
4579 | |||
4580 | num_bytes = btrfs_calc_trans_metadata_size(root, items); | ||
4581 | rsv->space_info = __find_space_info(root->fs_info, | ||
4582 | BTRFS_BLOCK_GROUP_METADATA); | ||
4583 | ret = btrfs_block_rsv_add(root, rsv, num_bytes, | ||
4584 | BTRFS_RESERVE_FLUSH_ALL); | ||
4585 | if (ret) { | ||
4586 | if (*qgroup_reserved) | ||
4587 | btrfs_qgroup_free(root, *qgroup_reserved); | ||
4588 | } | ||
4589 | |||
4590 | return ret; | ||
4591 | } | ||
4592 | |||
4593 | void btrfs_subvolume_release_metadata(struct btrfs_root *root, | ||
4594 | struct btrfs_block_rsv *rsv, | ||
4595 | u64 qgroup_reserved) | ||
4596 | { | ||
4597 | btrfs_block_rsv_release(root, rsv, (u64)-1); | ||
4598 | if (qgroup_reserved) | ||
4599 | btrfs_qgroup_free(root, qgroup_reserved); | ||
4418 | } | 4600 | } |
4419 | 4601 | ||
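The new pair btrfs_subvolume_reserve_metadata()/btrfs_subvolume_release_metadata() is meant to bracket snapshot and subvolume creation or deletion: reserve the tree items (plus three leaf-sized qgroup items when quotas are on) up front, and release both reservations on every exit path. A userspace sketch of the intended pairing (reserve() and release() are stand-ins, not the kernel functions):

	#include <stdio.h>

	typedef unsigned long long u64;

	static int reserve(int items, u64 *qgroup_reserved)
	{
		*qgroup_reserved = 3 * 4096ULL;  /* parent inode + two dir entries */
		printf("reserved space for %d tree items\n", items);
		return 0;                        /* 0 == success */
	}

	static void release(u64 qgroup_reserved)
	{
		printf("released block rsv and %llu qgroup bytes\n", qgroup_reserved);
	}

	int main(void)
	{
		u64 qgroup_reserved;

		if (reserve(8, &qgroup_reserved))
			return 1;
		/* ... create the snapshot/subvolume, commit the transaction ... */
		release(qgroup_reserved);
		return 0;
	}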
4420 | /** | 4602 | /** |
@@ -4522,6 +4704,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4522 | enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; | 4704 | enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; |
4523 | int ret = 0; | 4705 | int ret = 0; |
4524 | bool delalloc_lock = true; | 4706 | bool delalloc_lock = true; |
4707 | u64 to_free = 0; | ||
4708 | unsigned dropped; | ||
4525 | 4709 | ||
4526 | /* If we are a free space inode we need to not flush since we will be in | 4710 | /* If we are a free space inode we need to not flush since we will be in |
4527 | * the middle of a transaction commit. We also don't need the delalloc | 4711 | * the middle of a transaction commit. We also don't need the delalloc |
@@ -4565,54 +4749,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4565 | csum_bytes = BTRFS_I(inode)->csum_bytes; | 4749 | csum_bytes = BTRFS_I(inode)->csum_bytes; |
4566 | spin_unlock(&BTRFS_I(inode)->lock); | 4750 | spin_unlock(&BTRFS_I(inode)->lock); |
4567 | 4751 | ||
4568 | if (root->fs_info->quota_enabled) | 4752 | if (root->fs_info->quota_enabled) { |
4569 | ret = btrfs_qgroup_reserve(root, num_bytes + | 4753 | ret = btrfs_qgroup_reserve(root, num_bytes + |
4570 | nr_extents * root->leafsize); | 4754 | nr_extents * root->leafsize); |
4755 | if (ret) | ||
4756 | goto out_fail; | ||
4757 | } | ||
4571 | 4758 | ||
4572 | /* | 4759 | ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); |
4573 | * ret != 0 here means the qgroup reservation failed, we go straight to | 4760 | if (unlikely(ret)) { |
4574 | * the shared error handling then. | 4761 | if (root->fs_info->quota_enabled) |
4575 | */ | ||
4576 | if (ret == 0) | ||
4577 | ret = reserve_metadata_bytes(root, block_rsv, | ||
4578 | to_reserve, flush); | ||
4579 | |||
4580 | if (ret) { | ||
4581 | u64 to_free = 0; | ||
4582 | unsigned dropped; | ||
4583 | |||
4584 | spin_lock(&BTRFS_I(inode)->lock); | ||
4585 | dropped = drop_outstanding_extent(inode); | ||
4586 | /* | ||
4587 | * If the inode's csum_bytes is the same as the original | ||
4588 | * csum_bytes then we know we haven't raced with any free()ers | ||
4589 | * so we can just reduce our inode's csum bytes and carry on. | ||
4590 | * Otherwise we have to do the normal free thing to account for | ||
4591 | * the case that the free side didn't free up its reserve | ||
4592 | * because of this outstanding reservation. | ||
4593 | */ | ||
4594 | if (BTRFS_I(inode)->csum_bytes == csum_bytes) | ||
4595 | calc_csum_metadata_size(inode, num_bytes, 0); | ||
4596 | else | ||
4597 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | ||
4598 | spin_unlock(&BTRFS_I(inode)->lock); | ||
4599 | if (dropped) | ||
4600 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | ||
4601 | |||
4602 | if (to_free) { | ||
4603 | btrfs_block_rsv_release(root, block_rsv, to_free); | ||
4604 | trace_btrfs_space_reservation(root->fs_info, | ||
4605 | "delalloc", | ||
4606 | btrfs_ino(inode), | ||
4607 | to_free, 0); | ||
4608 | } | ||
4609 | if (root->fs_info->quota_enabled) { | ||
4610 | btrfs_qgroup_free(root, num_bytes + | 4762 | btrfs_qgroup_free(root, num_bytes + |
4611 | nr_extents * root->leafsize); | 4763 | nr_extents * root->leafsize); |
4612 | } | 4764 | goto out_fail; |
4613 | if (delalloc_lock) | ||
4614 | mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); | ||
4615 | return ret; | ||
4616 | } | 4765 | } |
4617 | 4766 | ||
4618 | spin_lock(&BTRFS_I(inode)->lock); | 4767 | spin_lock(&BTRFS_I(inode)->lock); |
@@ -4633,6 +4782,34 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4633 | block_rsv_add_bytes(block_rsv, to_reserve, 1); | 4782 | block_rsv_add_bytes(block_rsv, to_reserve, 1); |
4634 | 4783 | ||
4635 | return 0; | 4784 | return 0; |
4785 | |||
4786 | out_fail: | ||
4787 | spin_lock(&BTRFS_I(inode)->lock); | ||
4788 | dropped = drop_outstanding_extent(inode); | ||
4789 | /* | ||
4790 | * If the inode's csum_bytes is the same as the original | ||
4791 | * csum_bytes then we know we haven't raced with any free()ers | ||
4792 | * so we can just reduce our inode's csum bytes and carry on. | ||
4793 | * Otherwise we have to do the normal free thing to account for | ||
4794 | * the case that the free side didn't free up its reserve | ||
4795 | * because of this outstanding reservation. | ||
4796 | */ | ||
4797 | if (BTRFS_I(inode)->csum_bytes == csum_bytes) | ||
4798 | calc_csum_metadata_size(inode, num_bytes, 0); | ||
4799 | else | ||
4800 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | ||
4801 | spin_unlock(&BTRFS_I(inode)->lock); | ||
4802 | if (dropped) | ||
4803 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | ||
4804 | |||
4805 | if (to_free) { | ||
4806 | btrfs_block_rsv_release(root, block_rsv, to_free); | ||
4807 | trace_btrfs_space_reservation(root->fs_info, "delalloc", | ||
4808 | btrfs_ino(inode), to_free, 0); | ||
4809 | } | ||
4810 | if (delalloc_lock) | ||
4811 | mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); | ||
4812 | return ret; | ||
4636 | } | 4813 | } |
4637 | 4814 | ||
4638 | /** | 4815 | /** |
@@ -4654,7 +4831,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) | |||
4654 | spin_lock(&BTRFS_I(inode)->lock); | 4831 | spin_lock(&BTRFS_I(inode)->lock); |
4655 | dropped = drop_outstanding_extent(inode); | 4832 | dropped = drop_outstanding_extent(inode); |
4656 | 4833 | ||
4657 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | 4834 | if (num_bytes) |
4835 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | ||
4658 | spin_unlock(&BTRFS_I(inode)->lock); | 4836 | spin_unlock(&BTRFS_I(inode)->lock); |
4659 | if (dropped > 0) | 4837 | if (dropped > 0) |
4660 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | 4838 | to_free += btrfs_calc_trans_metadata_size(root, dropped); |
@@ -4721,8 +4899,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) | |||
4721 | btrfs_free_reserved_data_space(inode, num_bytes); | 4899 | btrfs_free_reserved_data_space(inode, num_bytes); |
4722 | } | 4900 | } |
4723 | 4901 | ||
4724 | static int update_block_group(struct btrfs_trans_handle *trans, | 4902 | static int update_block_group(struct btrfs_root *root, |
4725 | struct btrfs_root *root, | ||
4726 | u64 bytenr, u64 num_bytes, int alloc) | 4903 | u64 bytenr, u64 num_bytes, int alloc) |
4727 | { | 4904 | { |
4728 | struct btrfs_block_group_cache *cache = NULL; | 4905 | struct btrfs_block_group_cache *cache = NULL; |
@@ -4759,7 +4936,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, | |||
4759 | * space back to the block group, otherwise we will leak space. | 4936 | * space back to the block group, otherwise we will leak space. |
4760 | */ | 4937 | */ |
4761 | if (!alloc && cache->cached == BTRFS_CACHE_NO) | 4938 | if (!alloc && cache->cached == BTRFS_CACHE_NO) |
4762 | cache_block_group(cache, trans, NULL, 1); | 4939 | cache_block_group(cache, 1); |
4763 | 4940 | ||
4764 | byte_in_group = bytenr - cache->key.objectid; | 4941 | byte_in_group = bytenr - cache->key.objectid; |
4765 | WARN_ON(byte_in_group > cache->key.offset); | 4942 | WARN_ON(byte_in_group > cache->key.offset); |
@@ -4809,6 +4986,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) | |||
4809 | struct btrfs_block_group_cache *cache; | 4986 | struct btrfs_block_group_cache *cache; |
4810 | u64 bytenr; | 4987 | u64 bytenr; |
4811 | 4988 | ||
4989 | spin_lock(&root->fs_info->block_group_cache_lock); | ||
4990 | bytenr = root->fs_info->first_logical_byte; | ||
4991 | spin_unlock(&root->fs_info->block_group_cache_lock); | ||
4992 | |||
4993 | if (bytenr < (u64)-1) | ||
4994 | return bytenr; | ||
4995 | |||
4812 | cache = btrfs_lookup_first_block_group(root->fs_info, search_start); | 4996 | cache = btrfs_lookup_first_block_group(root->fs_info, search_start); |
4813 | if (!cache) | 4997 | if (!cache) |
4814 | return 0; | 4998 | return 0; |
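first_logical_byte() now consults a cached value first: fs_info->first_logical_byte holds the objectid of the lowest block group, with (u64)-1 as the "unknown, walk the tree" sentinel (the block-group removal hunk later in this patch resets it). A tiny model of the sentinel check:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long first_logical_byte = (unsigned long long)-1;

		if (first_logical_byte < (unsigned long long)-1)
			printf("cache hit: %llu\n", first_logical_byte);
		else
			puts("cache miss: walk the block group tree");

		first_logical_byte = 1048576;  /* cached after the first walk */
		if (first_logical_byte < (unsigned long long)-1)
			printf("cache hit: %llu\n", first_logical_byte);
		return 0;
	}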
@@ -4859,8 +5043,7 @@ int btrfs_pin_extent(struct btrfs_root *root, | |||
4859 | /* | 5043 | /* |
4860 | * this function must be called within transaction | 5044 | * this function must be called within transaction |
4861 | */ | 5045 | */ |
4862 | int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, | 5046 | int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, |
4863 | struct btrfs_root *root, | ||
4864 | u64 bytenr, u64 num_bytes) | 5047 | u64 bytenr, u64 num_bytes) |
4865 | { | 5048 | { |
4866 | struct btrfs_block_group_cache *cache; | 5049 | struct btrfs_block_group_cache *cache; |
@@ -4874,7 +5057,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, | |||
4874 | * to one because the slow code to read in the free extents does check | 5057 | * to one because the slow code to read in the free extents does check |
4875 | * the pinned extents. | 5058 | * the pinned extents. |
4876 | */ | 5059 | */ |
4877 | cache_block_group(cache, trans, root, 1); | 5060 | cache_block_group(cache, 1); |
4878 | 5061 | ||
4879 | pin_down_extent(root, cache, bytenr, num_bytes, 0); | 5062 | pin_down_extent(root, cache, bytenr, num_bytes, 0); |
4880 | 5063 | ||
@@ -5271,7 +5454,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, | |||
5271 | } | 5454 | } |
5272 | } | 5455 | } |
5273 | 5456 | ||
5274 | ret = update_block_group(trans, root, bytenr, num_bytes, 0); | 5457 | ret = update_block_group(root, bytenr, num_bytes, 0); |
5275 | if (ret) { | 5458 | if (ret) { |
5276 | btrfs_abort_transaction(trans, extent_root, ret); | 5459 | btrfs_abort_transaction(trans, extent_root, ret); |
5277 | goto out; | 5460 | goto out; |
@@ -5316,7 +5499,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, | |||
5316 | if (head->extent_op) { | 5499 | if (head->extent_op) { |
5317 | if (!head->must_insert_reserved) | 5500 | if (!head->must_insert_reserved) |
5318 | goto out; | 5501 | goto out; |
5319 | kfree(head->extent_op); | 5502 | btrfs_free_delayed_extent_op(head->extent_op); |
5320 | head->extent_op = NULL; | 5503 | head->extent_op = NULL; |
5321 | } | 5504 | } |
5322 | 5505 | ||
@@ -5439,10 +5622,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
5439 | return ret; | 5622 | return ret; |
5440 | } | 5623 | } |
5441 | 5624 | ||
5442 | static u64 stripe_align(struct btrfs_root *root, u64 val) | 5625 | static u64 stripe_align(struct btrfs_root *root, |
5626 | struct btrfs_block_group_cache *cache, | ||
5627 | u64 val, u64 num_bytes) | ||
5443 | { | 5628 | { |
5444 | u64 mask = ((u64)root->stripesize - 1); | 5629 | u64 ret = ALIGN(val, root->stripesize); |
5445 | u64 ret = (val + mask) & ~mask; | ||
5446 | return ret; | 5630 | return ret; |
5447 | } | 5631 | } |
5448 | 5632 | ||
@@ -5462,7 +5646,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, | |||
5462 | u64 num_bytes) | 5646 | u64 num_bytes) |
5463 | { | 5647 | { |
5464 | struct btrfs_caching_control *caching_ctl; | 5648 | struct btrfs_caching_control *caching_ctl; |
5465 | DEFINE_WAIT(wait); | ||
5466 | 5649 | ||
5467 | caching_ctl = get_caching_control(cache); | 5650 | caching_ctl = get_caching_control(cache); |
5468 | if (!caching_ctl) | 5651 | if (!caching_ctl) |
@@ -5479,7 +5662,6 @@ static noinline int | |||
5479 | wait_block_group_cache_done(struct btrfs_block_group_cache *cache) | 5662 | wait_block_group_cache_done(struct btrfs_block_group_cache *cache) |
5480 | { | 5663 | { |
5481 | struct btrfs_caching_control *caching_ctl; | 5664 | struct btrfs_caching_control *caching_ctl; |
5482 | DEFINE_WAIT(wait); | ||
5483 | 5665 | ||
5484 | caching_ctl = get_caching_control(cache); | 5666 | caching_ctl = get_caching_control(cache); |
5485 | if (!caching_ctl) | 5667 | if (!caching_ctl) |
@@ -5493,20 +5675,20 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) | |||
5493 | 5675 | ||
5494 | int __get_raid_index(u64 flags) | 5676 | int __get_raid_index(u64 flags) |
5495 | { | 5677 | { |
5496 | int index; | ||
5497 | |||
5498 | if (flags & BTRFS_BLOCK_GROUP_RAID10) | 5678 | if (flags & BTRFS_BLOCK_GROUP_RAID10) |
5499 | index = 0; | 5679 | return BTRFS_RAID_RAID10; |
5500 | else if (flags & BTRFS_BLOCK_GROUP_RAID1) | 5680 | else if (flags & BTRFS_BLOCK_GROUP_RAID1) |
5501 | index = 1; | 5681 | return BTRFS_RAID_RAID1; |
5502 | else if (flags & BTRFS_BLOCK_GROUP_DUP) | 5682 | else if (flags & BTRFS_BLOCK_GROUP_DUP) |
5503 | index = 2; | 5683 | return BTRFS_RAID_DUP; |
5504 | else if (flags & BTRFS_BLOCK_GROUP_RAID0) | 5684 | else if (flags & BTRFS_BLOCK_GROUP_RAID0) |
5505 | index = 3; | 5685 | return BTRFS_RAID_RAID0; |
5506 | else | 5686 | else if (flags & BTRFS_BLOCK_GROUP_RAID5) |
5507 | index = 4; | 5687 | return BTRFS_RAID_RAID5; |
5688 | else if (flags & BTRFS_BLOCK_GROUP_RAID6) | ||
5689 | return BTRFS_RAID_RAID6; | ||
5508 | 5690 | ||
5509 | return index; | 5691 | return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ |
5510 | } | 5692 | } |
5511 | 5693 | ||
5512 | static int get_block_group_index(struct btrfs_block_group_cache *cache) | 5694 | static int get_block_group_index(struct btrfs_block_group_cache *cache) |
@@ -5649,6 +5831,8 @@ search: | |||
5649 | if (!block_group_bits(block_group, data)) { | 5831 | if (!block_group_bits(block_group, data)) { |
5650 | u64 extra = BTRFS_BLOCK_GROUP_DUP | | 5832 | u64 extra = BTRFS_BLOCK_GROUP_DUP | |
5651 | BTRFS_BLOCK_GROUP_RAID1 | | 5833 | BTRFS_BLOCK_GROUP_RAID1 | |
5834 | BTRFS_BLOCK_GROUP_RAID5 | | ||
5835 | BTRFS_BLOCK_GROUP_RAID6 | | ||
5652 | BTRFS_BLOCK_GROUP_RAID10; | 5836 | BTRFS_BLOCK_GROUP_RAID10; |
5653 | 5837 | ||
5654 | /* | 5838 | /* |
@@ -5664,8 +5848,7 @@ have_block_group: | |||
5664 | cached = block_group_cache_done(block_group); | 5848 | cached = block_group_cache_done(block_group); |
5665 | if (unlikely(!cached)) { | 5849 | if (unlikely(!cached)) { |
5666 | found_uncached_bg = true; | 5850 | found_uncached_bg = true; |
5667 | ret = cache_block_group(block_group, trans, | 5851 | ret = cache_block_group(block_group, 0); |
5668 | orig_root, 0); | ||
5669 | BUG_ON(ret < 0); | 5852 | BUG_ON(ret < 0); |
5670 | ret = 0; | 5853 | ret = 0; |
5671 | } | 5854 | } |
@@ -5678,6 +5861,7 @@ have_block_group: | |||
5678 | * lets look there | 5861 | * lets look there |
5679 | */ | 5862 | */ |
5680 | if (last_ptr) { | 5863 | if (last_ptr) { |
5864 | unsigned long aligned_cluster; | ||
5681 | /* | 5865 | /* |
5682 | * the refill lock keeps out other | 5866 | * the refill lock keeps out other |
5683 | * people trying to start a new cluster | 5867 | * people trying to start a new cluster |
@@ -5744,11 +5928,15 @@ refill_cluster: | |||
5744 | goto unclustered_alloc; | 5928 | goto unclustered_alloc; |
5745 | } | 5929 | } |
5746 | 5930 | ||
5931 | aligned_cluster = max_t(unsigned long, | ||
5932 | empty_cluster + empty_size, | ||
5933 | block_group->full_stripe_len); | ||
5934 | |||
5747 | /* allocate a cluster in this block group */ | 5935 | /* allocate a cluster in this block group */ |
5748 | ret = btrfs_find_space_cluster(trans, root, | 5936 | ret = btrfs_find_space_cluster(trans, root, |
5749 | block_group, last_ptr, | 5937 | block_group, last_ptr, |
5750 | search_start, num_bytes, | 5938 | search_start, num_bytes, |
5751 | empty_cluster + empty_size); | 5939 | aligned_cluster); |
5752 | if (ret == 0) { | 5940 | if (ret == 0) { |
5753 | /* | 5941 | /* |
5754 | * now pull our allocation out of this | 5942 | * now pull our allocation out of this |
@@ -5819,7 +6007,8 @@ unclustered_alloc: | |||
5819 | goto loop; | 6007 | goto loop; |
5820 | } | 6008 | } |
5821 | checks: | 6009 | checks: |
5822 | search_start = stripe_align(root, offset); | 6010 | search_start = stripe_align(root, used_block_group, |
6011 | offset, num_bytes); | ||
5823 | 6012 | ||
5824 | /* move on to the next group */ | 6013 | /* move on to the next group */ |
5825 | if (search_start + num_bytes > | 6014 | if (search_start + num_bytes > |
@@ -5970,7 +6159,7 @@ again: | |||
5970 | if (ret == -ENOSPC) { | 6159 | if (ret == -ENOSPC) { |
5971 | if (!final_tried) { | 6160 | if (!final_tried) { |
5972 | num_bytes = num_bytes >> 1; | 6161 | num_bytes = num_bytes >> 1; |
5973 | num_bytes = num_bytes & ~(root->sectorsize - 1); | 6162 | num_bytes = round_down(num_bytes, root->sectorsize); |
5974 | num_bytes = max(num_bytes, min_alloc_size); | 6163 | num_bytes = max(num_bytes, min_alloc_size); |
5975 | if (num_bytes == min_alloc_size) | 6164 | if (num_bytes == min_alloc_size) |
5976 | final_tried = true; | 6165 | final_tried = true; |
@@ -6094,7 +6283,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, | |||
6094 | btrfs_mark_buffer_dirty(path->nodes[0]); | 6283 | btrfs_mark_buffer_dirty(path->nodes[0]); |
6095 | btrfs_free_path(path); | 6284 | btrfs_free_path(path); |
6096 | 6285 | ||
6097 | ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); | 6286 | ret = update_block_group(root, ins->objectid, ins->offset, 1); |
6098 | if (ret) { /* -ENOENT, logic error */ | 6287 | if (ret) { /* -ENOENT, logic error */ |
6099 | printk(KERN_ERR "btrfs update block group failed for %llu " | 6288 | printk(KERN_ERR "btrfs update block group failed for %llu " |
6100 | "%llu\n", (unsigned long long)ins->objectid, | 6289 | "%llu\n", (unsigned long long)ins->objectid, |
@@ -6158,7 +6347,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, | |||
6158 | btrfs_mark_buffer_dirty(leaf); | 6347 | btrfs_mark_buffer_dirty(leaf); |
6159 | btrfs_free_path(path); | 6348 | btrfs_free_path(path); |
6160 | 6349 | ||
6161 | ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); | 6350 | ret = update_block_group(root, ins->objectid, ins->offset, 1); |
6162 | if (ret) { /* -ENOENT, logic error */ | 6351 | if (ret) { /* -ENOENT, logic error */ |
6163 | printk(KERN_ERR "btrfs update block group failed for %llu " | 6352 | printk(KERN_ERR "btrfs update block group failed for %llu " |
6164 | "%llu\n", (unsigned long long)ins->objectid, | 6353 | "%llu\n", (unsigned long long)ins->objectid, |
@@ -6201,7 +6390,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, | |||
6201 | u64 num_bytes = ins->offset; | 6390 | u64 num_bytes = ins->offset; |
6202 | 6391 | ||
6203 | block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); | 6392 | block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); |
6204 | cache_block_group(block_group, trans, NULL, 0); | 6393 | cache_block_group(block_group, 0); |
6205 | caching_ctl = get_caching_control(block_group); | 6394 | caching_ctl = get_caching_control(block_group); |
6206 | 6395 | ||
6207 | if (!caching_ctl) { | 6396 | if (!caching_ctl) { |
@@ -6315,12 +6504,14 @@ use_block_rsv(struct btrfs_trans_handle *trans, | |||
6315 | if (!ret) | 6504 | if (!ret) |
6316 | return block_rsv; | 6505 | return block_rsv; |
6317 | if (ret && !block_rsv->failfast) { | 6506 | if (ret && !block_rsv->failfast) { |
6318 | static DEFINE_RATELIMIT_STATE(_rs, | 6507 | if (btrfs_test_opt(root, ENOSPC_DEBUG)) { |
6319 | DEFAULT_RATELIMIT_INTERVAL, | 6508 | static DEFINE_RATELIMIT_STATE(_rs, |
6320 | /*DEFAULT_RATELIMIT_BURST*/ 2); | 6509 | DEFAULT_RATELIMIT_INTERVAL * 10, |
6321 | if (__ratelimit(&_rs)) | 6510 | /*DEFAULT_RATELIMIT_BURST*/ 1); |
6322 | WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", | 6511 | if (__ratelimit(&_rs)) |
6323 | ret); | 6512 | WARN(1, KERN_DEBUG |
6513 | "btrfs: block rsv returned %d\n", ret); | ||
6514 | } | ||
6324 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, | 6515 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, |
6325 | BTRFS_RESERVE_NO_FLUSH); | 6516 | BTRFS_RESERVE_NO_FLUSH); |
6326 | if (!ret) { | 6517 | if (!ret) { |
@@ -6386,7 +6577,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, | |||
6386 | 6577 | ||
6387 | if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { | 6578 | if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { |
6388 | struct btrfs_delayed_extent_op *extent_op; | 6579 | struct btrfs_delayed_extent_op *extent_op; |
6389 | extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); | 6580 | extent_op = btrfs_alloc_delayed_extent_op(); |
6390 | BUG_ON(!extent_op); /* -ENOMEM */ | 6581 | BUG_ON(!extent_op); /* -ENOMEM */ |
6391 | if (key) | 6582 | if (key) |
6392 | memcpy(&extent_op->key, key, sizeof(extent_op->key)); | 6583 | memcpy(&extent_op->key, key, sizeof(extent_op->key)); |
@@ -7189,6 +7380,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) | |||
7189 | root->fs_info->fs_devices->missing_devices; | 7380 | root->fs_info->fs_devices->missing_devices; |
7190 | 7381 | ||
7191 | stripped = BTRFS_BLOCK_GROUP_RAID0 | | 7382 | stripped = BTRFS_BLOCK_GROUP_RAID0 | |
7383 | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | | ||
7192 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; | 7384 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; |
7193 | 7385 | ||
7194 | if (num_devices == 1) { | 7386 | if (num_devices == 1) { |
@@ -7467,16 +7659,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) | |||
7467 | index = get_block_group_index(block_group); | 7659 | index = get_block_group_index(block_group); |
7468 | } | 7660 | } |
7469 | 7661 | ||
7470 | if (index == 0) { | 7662 | if (index == BTRFS_RAID_RAID10) { |
7471 | dev_min = 4; | 7663 | dev_min = 4; |
7472 | /* Divide by 2 */ | 7664 | /* Divide by 2 */ |
7473 | min_free >>= 1; | 7665 | min_free >>= 1; |
7474 | } else if (index == 1) { | 7666 | } else if (index == BTRFS_RAID_RAID1) { |
7475 | dev_min = 2; | 7667 | dev_min = 2; |
7476 | } else if (index == 2) { | 7668 | } else if (index == BTRFS_RAID_DUP) { |
7477 | /* Multiply by 2 */ | 7669 | /* Multiply by 2 */ |
7478 | min_free <<= 1; | 7670 | min_free <<= 1; |
7479 | } else if (index == 3) { | 7671 | } else if (index == BTRFS_RAID_RAID0) { |
7480 | dev_min = fs_devices->rw_devices; | 7672 | dev_min = fs_devices->rw_devices; |
7481 | do_div(min_free, dev_min); | 7673 | do_div(min_free, dev_min); |
7482 | } | 7674 | } |
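btrfs_can_relocate() scales its per-device free-space requirement by profile: RAID10 stripes over mirrored pairs (four devices, half of min_free each), RAID1 needs two devices, DUP stores two copies on one device, and RAID0 divides the data evenly across all rw devices. A worked example with illustrative numbers:

	#include <stdio.h>

	enum { RAID10, RAID1, DUP, RAID0 };

	int main(void)
	{
		unsigned long long min_free = 1ULL << 30;  /* 1 GiB to relocate */
		int index = RAID0, dev_min = 1, rw_devices = 6;

		if (index == RAID10) {
			dev_min = 4;
			min_free >>= 1;        /* striped over mirrored pairs */
		} else if (index == RAID1) {
			dev_min = 2;
		} else if (index == DUP) {
			min_free <<= 1;        /* two copies on one device */
		} else if (index == RAID0) {
			dev_min = rw_devices;
			min_free /= dev_min;   /* spread across all devices */
		}
		printf("need %llu bytes free on each of %d devices\n",
		       min_free, dev_min);
		return 0;
	}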
@@ -7637,11 +7829,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) | |||
7637 | space_info = list_entry(info->space_info.next, | 7829 | space_info = list_entry(info->space_info.next, |
7638 | struct btrfs_space_info, | 7830 | struct btrfs_space_info, |
7639 | list); | 7831 | list); |
7640 | if (space_info->bytes_pinned > 0 || | 7832 | if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { |
7641 | space_info->bytes_reserved > 0 || | 7833 | if (space_info->bytes_pinned > 0 || |
7642 | space_info->bytes_may_use > 0) { | 7834 | space_info->bytes_reserved > 0 || |
7643 | WARN_ON(1); | 7835 | space_info->bytes_may_use > 0) { |
7644 | dump_space_info(space_info, 0, 0); | 7836 | WARN_ON(1); |
7837 | dump_space_info(space_info, 0, 0); | ||
7838 | } | ||
7645 | } | 7839 | } |
7646 | list_del(&space_info->list); | 7840 | list_del(&space_info->list); |
7647 | kfree(space_info); | 7841 | kfree(space_info); |
@@ -7740,7 +7934,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) | |||
7740 | btrfs_release_path(path); | 7934 | btrfs_release_path(path); |
7741 | cache->flags = btrfs_block_group_flags(&cache->item); | 7935 | cache->flags = btrfs_block_group_flags(&cache->item); |
7742 | cache->sectorsize = root->sectorsize; | 7936 | cache->sectorsize = root->sectorsize; |
7743 | 7937 | cache->full_stripe_len = btrfs_full_stripe_len(root, | |
7938 | &root->fs_info->mapping_tree, | ||
7939 | found_key.objectid); | ||
7744 | btrfs_init_free_space_ctl(cache); | 7940 | btrfs_init_free_space_ctl(cache); |
7745 | 7941 | ||
7746 | /* | 7942 | /* |
@@ -7794,6 +7990,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) | |||
7794 | if (!(get_alloc_profile(root, space_info->flags) & | 7990 | if (!(get_alloc_profile(root, space_info->flags) & |
7795 | (BTRFS_BLOCK_GROUP_RAID10 | | 7991 | (BTRFS_BLOCK_GROUP_RAID10 | |
7796 | BTRFS_BLOCK_GROUP_RAID1 | | 7992 | BTRFS_BLOCK_GROUP_RAID1 | |
7993 | BTRFS_BLOCK_GROUP_RAID5 | | ||
7994 | BTRFS_BLOCK_GROUP_RAID6 | | ||
7797 | BTRFS_BLOCK_GROUP_DUP))) | 7995 | BTRFS_BLOCK_GROUP_DUP))) |
7798 | continue; | 7996 | continue; |
7799 | /* | 7997 | /* |
@@ -7869,6 +8067,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, | |||
7869 | cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; | 8067 | cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; |
7870 | cache->sectorsize = root->sectorsize; | 8068 | cache->sectorsize = root->sectorsize; |
7871 | cache->fs_info = root->fs_info; | 8069 | cache->fs_info = root->fs_info; |
8070 | cache->full_stripe_len = btrfs_full_stripe_len(root, | ||
8071 | &root->fs_info->mapping_tree, | ||
8072 | chunk_offset); | ||
7872 | 8073 | ||
7873 | atomic_set(&cache->count, 1); | 8074 | atomic_set(&cache->count, 1); |
7874 | spin_lock_init(&cache->lock); | 8075 | spin_lock_init(&cache->lock); |
@@ -7918,12 +8119,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) | |||
7918 | u64 extra_flags = chunk_to_extended(flags) & | 8119 | u64 extra_flags = chunk_to_extended(flags) & |
7919 | BTRFS_EXTENDED_PROFILE_MASK; | 8120 | BTRFS_EXTENDED_PROFILE_MASK; |
7920 | 8121 | ||
8122 | write_seqlock(&fs_info->profiles_lock); | ||
7921 | if (flags & BTRFS_BLOCK_GROUP_DATA) | 8123 | if (flags & BTRFS_BLOCK_GROUP_DATA) |
7922 | fs_info->avail_data_alloc_bits &= ~extra_flags; | 8124 | fs_info->avail_data_alloc_bits &= ~extra_flags; |
7923 | if (flags & BTRFS_BLOCK_GROUP_METADATA) | 8125 | if (flags & BTRFS_BLOCK_GROUP_METADATA) |
7924 | fs_info->avail_metadata_alloc_bits &= ~extra_flags; | 8126 | fs_info->avail_metadata_alloc_bits &= ~extra_flags; |
7925 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | 8127 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) |
7926 | fs_info->avail_system_alloc_bits &= ~extra_flags; | 8128 | fs_info->avail_system_alloc_bits &= ~extra_flags; |
8129 | write_sequnlock(&fs_info->profiles_lock); | ||
7927 | } | 8130 | } |
7928 | 8131 | ||
7929 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | 8132 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, |
@@ -8022,6 +8225,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
8022 | spin_lock(&root->fs_info->block_group_cache_lock); | 8225 | spin_lock(&root->fs_info->block_group_cache_lock); |
8023 | rb_erase(&block_group->cache_node, | 8226 | rb_erase(&block_group->cache_node, |
8024 | &root->fs_info->block_group_cache_tree); | 8227 | &root->fs_info->block_group_cache_tree); |
8228 | |||
8229 | if (root->fs_info->first_logical_byte == block_group->key.objectid) | ||
8230 | root->fs_info->first_logical_byte = (u64)-1; | ||
8025 | spin_unlock(&root->fs_info->block_group_cache_lock); | 8231 | spin_unlock(&root->fs_info->block_group_cache_lock); |
8026 | 8232 | ||
8027 | down_write(&block_group->space_info->groups_sem); | 8233 | down_write(&block_group->space_info->groups_sem); |
@@ -8144,7 +8350,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) | |||
8144 | 8350 | ||
8145 | if (end - start >= range->minlen) { | 8351 | if (end - start >= range->minlen) { |
8146 | if (!block_group_cache_done(cache)) { | 8352 | if (!block_group_cache_done(cache)) { |
8147 | ret = cache_block_group(cache, NULL, root, 0); | 8353 | ret = cache_block_group(cache, 0); |
8148 | if (!ret) | 8354 | if (!ret) |
8149 | wait_block_group_cache_done(cache); | 8355 | wait_block_group_cache_done(cache); |
8150 | } | 8356 | } |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1b319df29eee..f173c5af6461 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -4,7 +4,6 @@ | |||
4 | #include <linux/mm.h> | 4 | #include <linux/mm.h> |
5 | #include <linux/pagemap.h> | 5 | #include <linux/pagemap.h> |
6 | #include <linux/page-flags.h> | 6 | #include <linux/page-flags.h> |
7 | #include <linux/module.h> | ||
8 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
9 | #include <linux/blkdev.h> | 8 | #include <linux/blkdev.h> |
10 | #include <linux/swap.h> | 9 | #include <linux/swap.h> |
@@ -1834,7 +1833,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, | |||
1834 | */ | 1833 | */ |
1835 | static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) | 1834 | static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) |
1836 | { | 1835 | { |
1837 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 1836 | u64 start = page_offset(page); |
1838 | u64 end = start + PAGE_CACHE_SIZE - 1; | 1837 | u64 end = start + PAGE_CACHE_SIZE - 1; |
1839 | if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) | 1838 | if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) |
1840 | SetPageUptodate(page); | 1839 | SetPageUptodate(page); |
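This and the following extent_io.c hunks replace the open-coded (u64)page->index << PAGE_CACHE_SHIFT with the equivalent page_offset() helper, the byte offset of a page within its file. The arithmetic, as a standalone check (4 KiB pages assumed):

	#include <stdio.h>

	#define PAGE_CACHE_SHIFT 12  /* 4096-byte pages */

	int main(void)
	{
		unsigned long index = 3;  /* the fourth page of the file */
		unsigned long long start =
			(unsigned long long)index << PAGE_CACHE_SHIFT;

		printf("%llu\n", start);  /* 12288 */
		return 0;
	}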
@@ -1846,7 +1845,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) | |||
1846 | */ | 1845 | */ |
1847 | static void check_page_locked(struct extent_io_tree *tree, struct page *page) | 1846 | static void check_page_locked(struct extent_io_tree *tree, struct page *page) |
1848 | { | 1847 | { |
1849 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 1848 | u64 start = page_offset(page); |
1850 | u64 end = start + PAGE_CACHE_SIZE - 1; | 1849 | u64 end = start + PAGE_CACHE_SIZE - 1; |
1851 | if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) | 1850 | if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) |
1852 | unlock_page(page); | 1851 | unlock_page(page); |
@@ -1895,13 +1894,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec, | |||
1895 | if (ret) | 1894 | if (ret) |
1896 | err = ret; | 1895 | err = ret; |
1897 | 1896 | ||
1898 | if (did_repair) { | 1897 | ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, |
1899 | ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, | 1898 | rec->start + rec->len - 1, |
1900 | rec->start + rec->len - 1, | 1899 | EXTENT_DAMAGED, GFP_NOFS); |
1901 | EXTENT_DAMAGED, GFP_NOFS); | 1900 | if (ret && !err) |
1902 | if (ret && !err) | 1901 | err = ret; |
1903 | err = ret; | ||
1904 | } | ||
1905 | 1902 | ||
1906 | kfree(rec); | 1903 | kfree(rec); |
1907 | return err; | 1904 | return err; |
@@ -1932,10 +1929,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, | |||
1932 | u64 map_length = 0; | 1929 | u64 map_length = 0; |
1933 | u64 sector; | 1930 | u64 sector; |
1934 | struct btrfs_bio *bbio = NULL; | 1931 | struct btrfs_bio *bbio = NULL; |
1932 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; | ||
1935 | int ret; | 1933 | int ret; |
1936 | 1934 | ||
1937 | BUG_ON(!mirror_num); | 1935 | BUG_ON(!mirror_num); |
1938 | 1936 | ||
1937 | /* we can't repair anything in raid56 yet */ | ||
1938 | if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) | ||
1939 | return 0; | ||
1940 | |||
1939 | bio = bio_alloc(GFP_NOFS, 1); | 1941 | bio = bio_alloc(GFP_NOFS, 1); |
1940 | if (!bio) | 1942 | if (!bio) |
1941 | return -EIO; | 1943 | return -EIO; |
@@ -1960,7 +1962,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, | |||
1960 | return -EIO; | 1962 | return -EIO; |
1961 | } | 1963 | } |
1962 | bio->bi_bdev = dev->bdev; | 1964 | bio->bi_bdev = dev->bdev; |
1963 | bio_add_page(bio, page, length, start-page_offset(page)); | 1965 | bio_add_page(bio, page, length, start - page_offset(page)); |
1964 | btrfsic_submit_bio(WRITE_SYNC, bio); | 1966 | btrfsic_submit_bio(WRITE_SYNC, bio); |
1965 | wait_for_completion(&compl); | 1967 | wait_for_completion(&compl); |
1966 | 1968 | ||
@@ -2052,6 +2054,7 @@ static int clean_io_failure(u64 start, struct page *page) | |||
2052 | failrec->failed_mirror); | 2054 | failrec->failed_mirror); |
2053 | did_repair = !ret; | 2055 | did_repair = !ret; |
2054 | } | 2056 | } |
2057 | ret = 0; | ||
2055 | } | 2058 | } |
2056 | 2059 | ||
2057 | out: | 2060 | out: |
@@ -2293,8 +2296,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err) | |||
2293 | struct page *page = bvec->bv_page; | 2296 | struct page *page = bvec->bv_page; |
2294 | tree = &BTRFS_I(page->mapping->host)->io_tree; | 2297 | tree = &BTRFS_I(page->mapping->host)->io_tree; |
2295 | 2298 | ||
2296 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + | 2299 | start = page_offset(page) + bvec->bv_offset; |
2297 | bvec->bv_offset; | ||
2298 | end = start + bvec->bv_len - 1; | 2300 | end = start + bvec->bv_len - 1; |
2299 | 2301 | ||
2300 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) | 2302 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) |
@@ -2353,8 +2355,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) | |||
2353 | (long int)bio->bi_bdev); | 2355 | (long int)bio->bi_bdev); |
2354 | tree = &BTRFS_I(page->mapping->host)->io_tree; | 2356 | tree = &BTRFS_I(page->mapping->host)->io_tree; |
2355 | 2357 | ||
2356 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + | 2358 | start = page_offset(page) + bvec->bv_offset; |
2357 | bvec->bv_offset; | ||
2358 | end = start + bvec->bv_len - 1; | 2359 | end = start + bvec->bv_len - 1; |
2359 | 2360 | ||
2360 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) | 2361 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) |
@@ -2471,7 +2472,7 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, | |||
2471 | struct extent_io_tree *tree = bio->bi_private; | 2472 | struct extent_io_tree *tree = bio->bi_private; |
2472 | u64 start; | 2473 | u64 start; |
2473 | 2474 | ||
2474 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; | 2475 | start = page_offset(page) + bvec->bv_offset; |
2475 | 2476 | ||
2476 | bio->bi_private = NULL; | 2477 | bio->bi_private = NULL; |
2477 | 2478 | ||
@@ -2489,13 +2490,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, | |||
2489 | return ret; | 2490 | return ret; |
2490 | } | 2491 | } |
2491 | 2492 | ||
2492 | static int merge_bio(struct extent_io_tree *tree, struct page *page, | 2493 | static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, |
2493 | unsigned long offset, size_t size, struct bio *bio, | 2494 | unsigned long offset, size_t size, struct bio *bio, |
2494 | unsigned long bio_flags) | 2495 | unsigned long bio_flags) |
2495 | { | 2496 | { |
2496 | int ret = 0; | 2497 | int ret = 0; |
2497 | if (tree->ops && tree->ops->merge_bio_hook) | 2498 | if (tree->ops && tree->ops->merge_bio_hook) |
2498 | ret = tree->ops->merge_bio_hook(page, offset, size, bio, | 2499 | ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, |
2499 | bio_flags); | 2500 | bio_flags); |
2500 | BUG_ON(ret < 0); | 2501 | BUG_ON(ret < 0); |
2501 | return ret; | 2502 | return ret; |
@@ -2530,7 +2531,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, | |||
2530 | sector; | 2531 | sector; |
2531 | 2532 | ||
2532 | if (prev_bio_flags != bio_flags || !contig || | 2533 | if (prev_bio_flags != bio_flags || !contig || |
2533 | merge_bio(tree, page, offset, page_size, bio, bio_flags) || | 2534 | merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || |
2534 | bio_add_page(bio, page, page_size, offset) < page_size) { | 2535 | bio_add_page(bio, page, page_size, offset) < page_size) { |
2535 | ret = submit_one_bio(rw, bio, mirror_num, | 2536 | ret = submit_one_bio(rw, bio, mirror_num, |
2536 | prev_bio_flags); | 2537 | prev_bio_flags); |
@@ -2595,7 +2596,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
2595 | unsigned long *bio_flags) | 2596 | unsigned long *bio_flags) |
2596 | { | 2597 | { |
2597 | struct inode *inode = page->mapping->host; | 2598 | struct inode *inode = page->mapping->host; |
2598 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 2599 | u64 start = page_offset(page); |
2599 | u64 page_end = start + PAGE_CACHE_SIZE - 1; | 2600 | u64 page_end = start + PAGE_CACHE_SIZE - 1; |
2600 | u64 end; | 2601 | u64 end; |
2601 | u64 cur = start; | 2602 | u64 cur = start; |
@@ -2648,6 +2649,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
2648 | } | 2649 | } |
2649 | } | 2650 | } |
2650 | while (cur <= end) { | 2651 | while (cur <= end) { |
2652 | unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; | ||
2653 | |||
2651 | if (cur >= last_byte) { | 2654 | if (cur >= last_byte) { |
2652 | char *userpage; | 2655 | char *userpage; |
2653 | struct extent_state *cached = NULL; | 2656 | struct extent_state *cached = NULL; |
@@ -2682,7 +2685,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
2682 | 2685 | ||
2683 | iosize = min(extent_map_end(em) - cur, end - cur + 1); | 2686 | iosize = min(extent_map_end(em) - cur, end - cur + 1); |
2684 | cur_end = min(extent_map_end(em) - 1, end); | 2687 | cur_end = min(extent_map_end(em) - 1, end); |
2685 | iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); | 2688 | iosize = ALIGN(iosize, blocksize); |
2686 | if (this_bio_flag & EXTENT_BIO_COMPRESSED) { | 2689 | if (this_bio_flag & EXTENT_BIO_COMPRESSED) { |
2687 | disk_io_size = em->block_len; | 2690 | disk_io_size = em->block_len; |
2688 | sector = em->block_start >> 9; | 2691 | sector = em->block_start >> 9; |
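The ALIGN() conversions in this file (and in file.c and inode.c below) are pure readability changes: ALIGN(x, a) rounds x up to the next multiple of the power-of-two a, which is exactly what the removed mask arithmetic did. A quick demonstration:

    #include <assert.h>
    #include <stdint.h>

    /* behaves like the kernel macro for power-of-two 'a' */
    #define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

    int main(void)
    {
        assert(ALIGN(5000, 4096) == 8192);   /* rounded up */
        assert(ALIGN(8192, 4096) == 8192);   /* already aligned */
        return 0;
    }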
@@ -2735,26 +2738,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
2735 | continue; | 2738 | continue; |
2736 | } | 2739 | } |
2737 | 2740 | ||
2738 | ret = 0; | 2741 | pnr -= page->index; |
2739 | if (tree->ops && tree->ops->readpage_io_hook) { | 2742 | ret = submit_extent_page(READ, tree, page, |
2740 | ret = tree->ops->readpage_io_hook(page, cur, | ||
2741 | cur + iosize - 1); | ||
2742 | } | ||
2743 | if (!ret) { | ||
2744 | unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; | ||
2745 | pnr -= page->index; | ||
2746 | ret = submit_extent_page(READ, tree, page, | ||
2747 | sector, disk_io_size, pg_offset, | 2743 | sector, disk_io_size, pg_offset, |
2748 | bdev, bio, pnr, | 2744 | bdev, bio, pnr, |
2749 | end_bio_extent_readpage, mirror_num, | 2745 | end_bio_extent_readpage, mirror_num, |
2750 | *bio_flags, | 2746 | *bio_flags, |
2751 | this_bio_flag); | 2747 | this_bio_flag); |
2752 | if (!ret) { | 2748 | if (!ret) { |
2753 | nr++; | 2749 | nr++; |
2754 | *bio_flags = this_bio_flag; | 2750 | *bio_flags = this_bio_flag; |
2755 | } | 2751 | } else { |
2756 | } | ||
2757 | if (ret) { | ||
2758 | SetPageError(page); | 2752 | SetPageError(page); |
2759 | unlock_extent(tree, cur, cur + iosize - 1); | 2753 | unlock_extent(tree, cur, cur + iosize - 1); |
2760 | } | 2754 | } |
@@ -2806,7 +2800,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
2806 | struct inode *inode = page->mapping->host; | 2800 | struct inode *inode = page->mapping->host; |
2807 | struct extent_page_data *epd = data; | 2801 | struct extent_page_data *epd = data; |
2808 | struct extent_io_tree *tree = epd->tree; | 2802 | struct extent_io_tree *tree = epd->tree; |
2809 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 2803 | u64 start = page_offset(page); |
2810 | u64 delalloc_start; | 2804 | u64 delalloc_start; |
2811 | u64 page_end = start + PAGE_CACHE_SIZE - 1; | 2805 | u64 page_end = start + PAGE_CACHE_SIZE - 1; |
2812 | u64 end; | 2806 | u64 end; |
@@ -2982,7 +2976,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
2982 | BUG_ON(extent_map_end(em) <= cur); | 2976 | BUG_ON(extent_map_end(em) <= cur); |
2983 | BUG_ON(end < cur); | 2977 | BUG_ON(end < cur); |
2984 | iosize = min(extent_map_end(em) - cur, end - cur + 1); | 2978 | iosize = min(extent_map_end(em) - cur, end - cur + 1); |
2985 | iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); | 2979 | iosize = ALIGN(iosize, blocksize); |
2986 | sector = (em->block_start + extent_offset) >> 9; | 2980 | sector = (em->block_start + extent_offset) >> 9; |
2987 | bdev = em->bdev; | 2981 | bdev = em->bdev; |
2988 | block_start = em->block_start; | 2982 | block_start = em->block_start; |
@@ -3124,12 +3118,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, | |||
3124 | set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); | 3118 | set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); |
3125 | spin_unlock(&eb->refs_lock); | 3119 | spin_unlock(&eb->refs_lock); |
3126 | btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); | 3120 | btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); |
3127 | spin_lock(&fs_info->delalloc_lock); | 3121 | __percpu_counter_add(&fs_info->dirty_metadata_bytes, |
3128 | if (fs_info->dirty_metadata_bytes >= eb->len) | 3122 | -eb->len, |
3129 | fs_info->dirty_metadata_bytes -= eb->len; | 3123 | fs_info->dirty_metadata_batch); |
3130 | else | ||
3131 | WARN_ON(1); | ||
3132 | spin_unlock(&fs_info->delalloc_lock); | ||
3133 | ret = 1; | 3124 | ret = 1; |
3134 | } else { | 3125 | } else { |
3135 | spin_unlock(&eb->refs_lock); | 3126 | spin_unlock(&eb->refs_lock); |
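Dropping delalloc_lock here works because dirty_metadata_bytes is now a percpu counter: each CPU accumulates changes locally and folds them into the shared total only once the local delta passes dirty_metadata_batch, so the hot path never touches a shared cacheline. A rough userspace model of that batching, using a thread-local slot in place of real per-CPU data (names assumed):

    #include <stdatomic.h>

    static atomic_long total;                 /* the shared count */
    static _Thread_local long local_delta;    /* per-"CPU" accumulator */

    static void counter_add(long amount, long batch)
    {
        local_delta += amount;
        if (local_delta >= batch || local_delta <= -batch) {
            atomic_fetch_add(&total, local_delta);   /* rare shared write */
            local_delta = 0;
        }
    }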
@@ -3446,15 +3437,9 @@ retry: | |||
3446 | * swizzled back from swapper_space to tmpfs file | 3437 | * swizzled back from swapper_space to tmpfs file |
3447 | * mapping | 3438 | * mapping |
3448 | */ | 3439 | */ |
3449 | if (tree->ops && | 3440 | if (!trylock_page(page)) { |
3450 | tree->ops->write_cache_pages_lock_hook) { | 3441 | flush_fn(data); |
3451 | tree->ops->write_cache_pages_lock_hook(page, | 3442 | lock_page(page); |
3452 | data, flush_fn); | ||
3453 | } else { | ||
3454 | if (!trylock_page(page)) { | ||
3455 | flush_fn(data); | ||
3456 | lock_page(page); | ||
3457 | } | ||
3458 | } | 3443 | } |
3459 | 3444 | ||
3460 | if (unlikely(page->mapping != mapping)) { | 3445 | if (unlikely(page->mapping != mapping)) { |
@@ -3674,11 +3659,11 @@ int extent_invalidatepage(struct extent_io_tree *tree, | |||
3674 | struct page *page, unsigned long offset) | 3659 | struct page *page, unsigned long offset) |
3675 | { | 3660 | { |
3676 | struct extent_state *cached_state = NULL; | 3661 | struct extent_state *cached_state = NULL; |
3677 | u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); | 3662 | u64 start = page_offset(page); |
3678 | u64 end = start + PAGE_CACHE_SIZE - 1; | 3663 | u64 end = start + PAGE_CACHE_SIZE - 1; |
3679 | size_t blocksize = page->mapping->host->i_sb->s_blocksize; | 3664 | size_t blocksize = page->mapping->host->i_sb->s_blocksize; |
3680 | 3665 | ||
3681 | start += (offset + blocksize - 1) & ~(blocksize - 1); | 3666 | start += ALIGN(offset, blocksize); |
3682 | if (start > end) | 3667 | if (start > end) |
3683 | return 0; | 3668 | return 0; |
3684 | 3669 | ||
@@ -3700,7 +3685,7 @@ int try_release_extent_state(struct extent_map_tree *map, | |||
3700 | struct extent_io_tree *tree, struct page *page, | 3685 | struct extent_io_tree *tree, struct page *page, |
3701 | gfp_t mask) | 3686 | gfp_t mask) |
3702 | { | 3687 | { |
3703 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 3688 | u64 start = page_offset(page); |
3704 | u64 end = start + PAGE_CACHE_SIZE - 1; | 3689 | u64 end = start + PAGE_CACHE_SIZE - 1; |
3705 | int ret = 1; | 3690 | int ret = 1; |
3706 | 3691 | ||
@@ -3739,7 +3724,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, | |||
3739 | gfp_t mask) | 3724 | gfp_t mask) |
3740 | { | 3725 | { |
3741 | struct extent_map *em; | 3726 | struct extent_map *em; |
3742 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 3727 | u64 start = page_offset(page); |
3743 | u64 end = start + PAGE_CACHE_SIZE - 1; | 3728 | u64 end = start + PAGE_CACHE_SIZE - 1; |
3744 | 3729 | ||
3745 | if ((mask & __GFP_WAIT) && | 3730 | if ((mask & __GFP_WAIT) && |
@@ -3797,7 +3782,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, | |||
3797 | len = last - offset; | 3782 | len = last - offset; |
3798 | if (len == 0) | 3783 | if (len == 0) |
3799 | break; | 3784 | break; |
3800 | len = (len + sectorsize - 1) & ~(sectorsize - 1); | 3785 | len = ALIGN(len, sectorsize); |
3801 | em = get_extent(inode, NULL, 0, offset, len, 0); | 3786 | em = get_extent(inode, NULL, 0, offset, len, 0); |
3802 | if (IS_ERR_OR_NULL(em)) | 3787 | if (IS_ERR_OR_NULL(em)) |
3803 | return em; | 3788 | return em; |
@@ -3995,8 +3980,6 @@ static void __free_extent_buffer(struct extent_buffer *eb) | |||
3995 | list_del(&eb->leak_list); | 3980 | list_del(&eb->leak_list); |
3996 | spin_unlock_irqrestore(&leak_lock, flags); | 3981 | spin_unlock_irqrestore(&leak_lock, flags); |
3997 | #endif | 3982 | #endif |
3998 | if (eb->pages && eb->pages != eb->inline_pages) | ||
3999 | kfree(eb->pages); | ||
4000 | kmem_cache_free(extent_buffer_cache, eb); | 3983 | kmem_cache_free(extent_buffer_cache, eb); |
4001 | } | 3984 | } |
4002 | 3985 | ||
@@ -4037,19 +4020,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, | |||
4037 | atomic_set(&eb->refs, 1); | 4020 | atomic_set(&eb->refs, 1); |
4038 | atomic_set(&eb->io_pages, 0); | 4021 | atomic_set(&eb->io_pages, 0); |
4039 | 4022 | ||
4040 | if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { | 4023 | /* |
4041 | struct page **pages; | 4024 | * Sanity checks, currently the maximum is 64k covered by 16x 4k pages |
4042 | int num_pages = (len + PAGE_CACHE_SIZE - 1) >> | 4025 | */ |
4043 | PAGE_CACHE_SHIFT; | 4026 | BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE |
4044 | pages = kzalloc(num_pages, mask); | 4027 | > MAX_INLINE_EXTENT_BUFFER_SIZE); |
4045 | if (!pages) { | 4028 | BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); |
4046 | __free_extent_buffer(eb); | ||
4047 | return NULL; | ||
4048 | } | ||
4049 | eb->pages = pages; | ||
4050 | } else { | ||
4051 | eb->pages = eb->inline_pages; | ||
4052 | } | ||
4053 | 4029 | ||
4054 | return eb; | 4030 | return eb; |
4055 | } | 4031 | } |
@@ -4180,6 +4156,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) | |||
4180 | 4156 | ||
4181 | static void check_buffer_tree_ref(struct extent_buffer *eb) | 4157 | static void check_buffer_tree_ref(struct extent_buffer *eb) |
4182 | { | 4158 | { |
4159 | int refs; | ||
4183 | /* the ref bit is tricky. We have to make sure it is set | 4160 | /* the ref bit is tricky. We have to make sure it is set |
4184 | * if we have the buffer dirty. Otherwise the | 4161 | * if we have the buffer dirty. Otherwise the |
4185 | * code to free a buffer can end up dropping a dirty | 4162 | * code to free a buffer can end up dropping a dirty |
@@ -4200,6 +4177,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) | |||
4200 | * So bump the ref count first, then set the bit. If someone | 4177 | * So bump the ref count first, then set the bit. If someone |
4201 | * beat us to it, drop the ref we added. | 4178 | * beat us to it, drop the ref we added. |
4202 | */ | 4179 | */ |
4180 | refs = atomic_read(&eb->refs); | ||
4181 | if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) | ||
4182 | return; | ||
4183 | |||
4203 | spin_lock(&eb->refs_lock); | 4184 | spin_lock(&eb->refs_lock); |
4204 | if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) | 4185 | if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) |
4205 | atomic_inc(&eb->refs); | 4186 | atomic_inc(&eb->refs); |
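The added atomic_read() gives check_buffer_tree_ref() a lock-free exit in the common case: if the tree-ref bit is already set and at least two references are held, nothing remains to be done. Only the slow path takes refs_lock, so setting the bit and bumping the count stay atomic with respect to the release path. The shape of the pattern, sketched with C11 atomics and a pthread mutex:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int refs = 2;
    static atomic_bool tree_ref;
    static pthread_mutex_t refs_lock = PTHREAD_MUTEX_INITIALIZER;

    static void check_tree_ref(void)
    {
        /* fast path: flag already set and refs pinned, skip the lock */
        if (atomic_load(&refs) >= 2 && atomic_load(&tree_ref))
            return;

        pthread_mutex_lock(&refs_lock);
        if (!atomic_exchange(&tree_ref, true))
            atomic_fetch_add(&refs, 1);   /* first setter owns the ref */
        pthread_mutex_unlock(&refs_lock);
    }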
@@ -4401,9 +4382,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask) | |||
4401 | 4382 | ||
4402 | void free_extent_buffer(struct extent_buffer *eb) | 4383 | void free_extent_buffer(struct extent_buffer *eb) |
4403 | { | 4384 | { |
4385 | int refs; | ||
4386 | int old; | ||
4404 | if (!eb) | 4387 | if (!eb) |
4405 | return; | 4388 | return; |
4406 | 4389 | ||
4390 | while (1) { | ||
4391 | refs = atomic_read(&eb->refs); | ||
4392 | if (refs <= 3) | ||
4393 | break; | ||
4394 | old = atomic_cmpxchg(&eb->refs, refs, refs - 1); | ||
4395 | if (old == refs) | ||
4396 | return; | ||
4397 | } | ||
4398 | |||
4407 | spin_lock(&eb->refs_lock); | 4399 | spin_lock(&eb->refs_lock); |
4408 | if (atomic_read(&eb->refs) == 2 && | 4400 | if (atomic_read(&eb->refs) == 2 && |
4409 | test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) | 4401 | test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) |
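The new loop in free_extent_buffer() drops a reference without taking refs_lock whenever more than three references would remain; atomic_cmpxchg() simply retries if another task raced a change in between, and low counts fall through to the locked release path. Equivalent userspace logic:

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Returns true if the reference was dropped without the lock. */
    static bool put_ref_fast(atomic_int *refs)
    {
        int cur = atomic_load(refs);

        while (cur > 3) {
            /* on failure, cur is reloaded with the current count */
            if (atomic_compare_exchange_weak(refs, &cur, cur - 1))
                return true;
        }
        return false;   /* caller must take refs_lock and recheck */
    }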
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2eacfabd3263..6068a1985560 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h | |||
@@ -72,10 +72,9 @@ struct extent_io_ops { | |||
72 | int (*writepage_start_hook)(struct page *page, u64 start, u64 end); | 72 | int (*writepage_start_hook)(struct page *page, u64 start, u64 end); |
73 | int (*writepage_io_hook)(struct page *page, u64 start, u64 end); | 73 | int (*writepage_io_hook)(struct page *page, u64 start, u64 end); |
74 | extent_submit_bio_hook_t *submit_bio_hook; | 74 | extent_submit_bio_hook_t *submit_bio_hook; |
75 | int (*merge_bio_hook)(struct page *page, unsigned long offset, | 75 | int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset, |
76 | size_t size, struct bio *bio, | 76 | size_t size, struct bio *bio, |
77 | unsigned long bio_flags); | 77 | unsigned long bio_flags); |
78 | int (*readpage_io_hook)(struct page *page, u64 start, u64 end); | ||
79 | int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); | 78 | int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); |
80 | int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, | 79 | int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, |
81 | struct extent_state *state, int mirror); | 80 | struct extent_state *state, int mirror); |
@@ -90,8 +89,6 @@ struct extent_io_ops { | |||
90 | struct extent_state *other); | 89 | struct extent_state *other); |
91 | void (*split_extent_hook)(struct inode *inode, | 90 | void (*split_extent_hook)(struct inode *inode, |
92 | struct extent_state *orig, u64 split); | 91 | struct extent_state *orig, u64 split); |
93 | int (*write_cache_pages_lock_hook)(struct page *page, void *data, | ||
94 | void (*flush_fn)(void *)); | ||
95 | }; | 92 | }; |
96 | 93 | ||
97 | struct extent_io_tree { | 94 | struct extent_io_tree { |
@@ -161,8 +158,7 @@ struct extent_buffer { | |||
161 | */ | 158 | */ |
162 | wait_queue_head_t read_lock_wq; | 159 | wait_queue_head_t read_lock_wq; |
163 | wait_queue_head_t lock_wq; | 160 | wait_queue_head_t lock_wq; |
164 | struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; | 161 | struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; |
165 | struct page **pages; | ||
166 | }; | 162 | }; |
167 | 163 | ||
168 | static inline void extent_set_compress_type(unsigned long *bio_flags, | 164 | static inline void extent_set_compress_type(unsigned long *bio_flags, |
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index fdb7a8db3b57..2834ca5768ea 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c | |||
@@ -1,6 +1,5 @@ | |||
1 | #include <linux/err.h> | 1 | #include <linux/err.h> |
2 | #include <linux/slab.h> | 2 | #include <linux/slab.h> |
3 | #include <linux/module.h> | ||
4 | #include <linux/spinlock.h> | 3 | #include <linux/spinlock.h> |
5 | #include <linux/hardirq.h> | 4 | #include <linux/hardirq.h> |
6 | #include "ctree.h" | 5 | #include "ctree.h" |
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 94aa53b38721..ec160202be3e 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c | |||
@@ -684,6 +684,24 @@ out: | |||
684 | return ret; | 684 | return ret; |
685 | } | 685 | } |
686 | 686 | ||
687 | static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums, | ||
688 | struct btrfs_sector_sum *sector_sum, | ||
689 | u64 total_bytes, u64 sectorsize) | ||
690 | { | ||
691 | u64 tmp = sectorsize; | ||
692 | u64 next_sector = sector_sum->bytenr; | ||
693 | struct btrfs_sector_sum *next = sector_sum + 1; | ||
694 | |||
695 | while ((tmp + total_bytes) < sums->len) { | ||
696 | if (next_sector + sectorsize != next->bytenr) | ||
697 | break; | ||
698 | tmp += sectorsize; | ||
699 | next_sector = next->bytenr; | ||
700 | next++; | ||
701 | } | ||
702 | return tmp; | ||
703 | } | ||
704 | |||
687 | int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, | 705 | int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, |
688 | struct btrfs_root *root, | 706 | struct btrfs_root *root, |
689 | struct btrfs_ordered_sum *sums) | 707 | struct btrfs_ordered_sum *sums) |
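The factored-out helper answers one question: starting from the current checksum, how many bytes of physically contiguous sectors are still pending in this ordered sum? Both call sites below use it to size the checksum item. A standalone model of the same walk (struct and names simplified; as in the kernel code, the caller guarantees a next entry exists while the length check passes):

    #include <stdint.h>

    struct sector_sum { uint64_t bytenr; };

    static uint64_t sum_bytes_left(const struct sector_sum *cur,
                                   uint64_t total_bytes, uint64_t sums_len,
                                   uint64_t sectorsize)
    {
        uint64_t tmp = sectorsize;

        while (tmp + total_bytes < sums_len) {
            if (cur[0].bytenr + sectorsize != cur[1].bytenr)
                break;              /* next sector is not contiguous */
            tmp += sectorsize;
            cur++;
        }
        return tmp;
    }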
@@ -789,20 +807,32 @@ again: | |||
789 | goto insert; | 807 | goto insert; |
790 | } | 808 | } |
791 | 809 | ||
792 | if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / | 810 | if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) / |
793 | csum_size) { | 811 | csum_size) { |
794 | u32 diff = (csum_offset + 1) * csum_size; | 812 | int extend_nr; |
813 | u64 tmp; | ||
814 | u32 diff; | ||
815 | u32 free_space; | ||
795 | 816 | ||
796 | /* | 817 | if (btrfs_leaf_free_space(root, leaf) < |
797 | * is the item big enough already? we dropped our lock | 818 | sizeof(struct btrfs_item) + csum_size * 2) |
798 | * before and need to recheck | 819 | goto insert; |
799 | */ | 820 | |
800 | if (diff < btrfs_item_size_nr(leaf, path->slots[0])) | 821 | free_space = btrfs_leaf_free_space(root, leaf) - |
801 | goto csum; | 822 | sizeof(struct btrfs_item) - csum_size; |
823 | tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, | ||
824 | root->sectorsize); | ||
825 | tmp >>= root->fs_info->sb->s_blocksize_bits; | ||
826 | WARN_ON(tmp < 1); | ||
827 | |||
828 | extend_nr = max_t(int, 1, (int)tmp); | ||
829 | diff = (csum_offset + extend_nr) * csum_size; | ||
830 | diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size); | ||
802 | 831 | ||
803 | diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); | 832 | diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); |
804 | if (diff != csum_size) | 833 | diff = min(free_space, diff); |
805 | goto insert; | 834 | diff /= csum_size; |
835 | diff *= csum_size; | ||
806 | 836 | ||
807 | btrfs_extend_item(trans, root, path, diff); | 837 | btrfs_extend_item(trans, root, path, diff); |
808 | goto csum; | 838 | goto csum; |
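The extension size is now derived from how many contiguous checksums are actually pending, then clamped to both the per-item maximum and the leaf's free space, and finally rounded down to a whole number of checksums so the item never ends mid-csum. The clamping step in isolation, simplified into plain arithmetic:

    #include <stdint.h>

    static uint32_t clamp_extension(uint32_t wanted, uint32_t item_max,
                                    uint32_t free_space, uint32_t csum_size)
    {
        uint32_t diff = wanted < item_max ? wanted : item_max;

        if (diff > free_space)
            diff = free_space;
        return diff / csum_size * csum_size;   /* whole checksums only */
    }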
@@ -812,19 +842,14 @@ insert: | |||
812 | btrfs_release_path(path); | 842 | btrfs_release_path(path); |
813 | csum_offset = 0; | 843 | csum_offset = 0; |
814 | if (found_next) { | 844 | if (found_next) { |
815 | u64 tmp = total_bytes + root->sectorsize; | 845 | u64 tmp; |
816 | u64 next_sector = sector_sum->bytenr; | ||
817 | struct btrfs_sector_sum *next = sector_sum + 1; | ||
818 | 846 | ||
819 | while (tmp < sums->len) { | 847 | tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, |
820 | if (next_sector + root->sectorsize != next->bytenr) | 848 | root->sectorsize); |
821 | break; | ||
822 | tmp += root->sectorsize; | ||
823 | next_sector = next->bytenr; | ||
824 | next++; | ||
825 | } | ||
826 | tmp = min(tmp, next_offset - file_key.offset); | ||
827 | tmp >>= root->fs_info->sb->s_blocksize_bits; | 849 | tmp >>= root->fs_info->sb->s_blocksize_bits; |
850 | tmp = min(tmp, (next_offset - file_key.offset) >> | ||
851 | root->fs_info->sb->s_blocksize_bits); | ||
852 | |||
828 | tmp = max((u64)1, tmp); | 853 | tmp = max((u64)1, tmp); |
829 | tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); | 854 | tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); |
830 | ins_size = csum_size * tmp; | 855 | ins_size = csum_size * tmp; |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4b241fe9d2fe..af1d0605a5c1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -30,11 +30,11 @@ | |||
30 | #include <linux/statfs.h> | 30 | #include <linux/statfs.h> |
31 | #include <linux/compat.h> | 31 | #include <linux/compat.h> |
32 | #include <linux/slab.h> | 32 | #include <linux/slab.h> |
33 | #include <linux/btrfs.h> | ||
33 | #include "ctree.h" | 34 | #include "ctree.h" |
34 | #include "disk-io.h" | 35 | #include "disk-io.h" |
35 | #include "transaction.h" | 36 | #include "transaction.h" |
36 | #include "btrfs_inode.h" | 37 | #include "btrfs_inode.h" |
37 | #include "ioctl.h" | ||
38 | #include "print-tree.h" | 38 | #include "print-tree.h" |
39 | #include "tree-log.h" | 39 | #include "tree-log.h" |
40 | #include "locking.h" | 40 | #include "locking.h" |
@@ -374,6 +374,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) | |||
374 | 374 | ||
375 | atomic_inc(&fs_info->defrag_running); | 375 | atomic_inc(&fs_info->defrag_running); |
376 | while(1) { | 376 | while(1) { |
377 | /* Pause the auto defragger. */ | ||
378 | if (test_bit(BTRFS_FS_STATE_REMOUNTING, | ||
379 | &fs_info->fs_state)) | ||
380 | break; | ||
381 | |||
377 | if (!__need_auto_defrag(fs_info->tree_root)) | 382 | if (!__need_auto_defrag(fs_info->tree_root)) |
378 | break; | 383 | break; |
379 | 384 | ||
@@ -505,8 +510,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, | |||
505 | loff_t isize = i_size_read(inode); | 510 | loff_t isize = i_size_read(inode); |
506 | 511 | ||
507 | start_pos = pos & ~((u64)root->sectorsize - 1); | 512 | start_pos = pos & ~((u64)root->sectorsize - 1); |
508 | num_bytes = (write_bytes + pos - start_pos + | 513 | num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); |
509 | root->sectorsize - 1) & ~((u64)root->sectorsize - 1); | ||
510 | 514 | ||
511 | end_of_last_block = start_pos + num_bytes - 1; | 515 | end_of_last_block = start_pos + num_bytes - 1; |
512 | err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, | 516 | err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, |
@@ -1544,7 +1548,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, | |||
1544 | * although we have opened a file as writable, we have | 1548 | * although we have opened a file as writable, we have |
1545 | * to stop this write operation to ensure FS consistency. | 1549 | * to stop this write operation to ensure FS consistency. |
1546 | */ | 1550 | */ |
1547 | if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 1551 | if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { |
1548 | mutex_unlock(&inode->i_mutex); | 1552 | mutex_unlock(&inode->i_mutex); |
1549 | err = -EROFS; | 1553 | err = -EROFS; |
1550 | goto out; | 1554 | goto out; |
@@ -1627,7 +1631,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp) | |||
1627 | */ | 1631 | */ |
1628 | if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, | 1632 | if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, |
1629 | &BTRFS_I(inode)->runtime_flags)) { | 1633 | &BTRFS_I(inode)->runtime_flags)) { |
1630 | btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); | 1634 | struct btrfs_trans_handle *trans; |
1635 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1636 | |||
1637 | /* | ||
1638 | * We need to block on a committing transaction to keep us from | ||
1639 | * throwing a ordered operation on to the list and causing | ||
1640 | * something like sync to deadlock trying to flush out this | ||
1641 | * inode. | ||
1642 | */ | ||
1643 | trans = btrfs_start_transaction(root, 0); | ||
1644 | if (IS_ERR(trans)) | ||
1645 | return PTR_ERR(trans); | ||
1646 | btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); | ||
1647 | btrfs_end_transaction(trans, root); | ||
1631 | if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) | 1648 | if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) |
1632 | filemap_flush(inode->i_mapping); | 1649 | filemap_flush(inode->i_mapping); |
1633 | } | 1650 | } |
@@ -1654,16 +1671,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
1654 | struct btrfs_root *root = BTRFS_I(inode)->root; | 1671 | struct btrfs_root *root = BTRFS_I(inode)->root; |
1655 | int ret = 0; | 1672 | int ret = 0; |
1656 | struct btrfs_trans_handle *trans; | 1673 | struct btrfs_trans_handle *trans; |
1674 | bool full_sync = 0; | ||
1657 | 1675 | ||
1658 | trace_btrfs_sync_file(file, datasync); | 1676 | trace_btrfs_sync_file(file, datasync); |
1659 | 1677 | ||
1660 | /* | 1678 | /* |
1661 | * We write the dirty pages in the range and wait until they complete | 1679 | * We write the dirty pages in the range and wait until they complete |
1662 | * out of the ->i_mutex, so multiple tasks can flush dirty pages | 1680 | * out of the ->i_mutex, so multiple tasks can flush dirty pages |
1663 | * concurrently and improve performance. | 1681 | * concurrently and improve performance. See |
1682 | * btrfs_wait_ordered_range for an explanation of the ASYNC check. | ||
1664 | */ | 1683 | */ |
1665 | atomic_inc(&BTRFS_I(inode)->sync_writers); | 1684 | atomic_inc(&BTRFS_I(inode)->sync_writers); |
1666 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); | 1685 | ret = filemap_fdatawrite_range(inode->i_mapping, start, end); |
1686 | if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, | ||
1687 | &BTRFS_I(inode)->runtime_flags)) | ||
1688 | ret = filemap_fdatawrite_range(inode->i_mapping, start, end); | ||
1667 | atomic_dec(&BTRFS_I(inode)->sync_writers); | 1689 | atomic_dec(&BTRFS_I(inode)->sync_writers); |
1668 | if (ret) | 1690 | if (ret) |
1669 | return ret; | 1691 | return ret; |
@@ -1675,7 +1697,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
1675 | * range being left. | 1697 | * range being left. |
1676 | */ | 1698 | */ |
1677 | atomic_inc(&root->log_batch); | 1699 | atomic_inc(&root->log_batch); |
1678 | btrfs_wait_ordered_range(inode, start, end - start + 1); | 1700 | full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, |
1701 | &BTRFS_I(inode)->runtime_flags); | ||
1702 | if (full_sync) | ||
1703 | btrfs_wait_ordered_range(inode, start, end - start + 1); | ||
1679 | atomic_inc(&root->log_batch); | 1704 | atomic_inc(&root->log_batch); |
1680 | 1705 | ||
1681 | /* | 1706 | /* |
@@ -1742,13 +1767,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
1742 | 1767 | ||
1743 | if (ret != BTRFS_NO_LOG_SYNC) { | 1768 | if (ret != BTRFS_NO_LOG_SYNC) { |
1744 | if (ret > 0) { | 1769 | if (ret > 0) { |
1770 | /* | ||
1771 | * If we didn't already wait for ordered extents we need | ||
1772 | * to do that now. | ||
1773 | */ | ||
1774 | if (!full_sync) | ||
1775 | btrfs_wait_ordered_range(inode, start, | ||
1776 | end - start + 1); | ||
1745 | ret = btrfs_commit_transaction(trans, root); | 1777 | ret = btrfs_commit_transaction(trans, root); |
1746 | } else { | 1778 | } else { |
1747 | ret = btrfs_sync_log(trans, root); | 1779 | ret = btrfs_sync_log(trans, root); |
1748 | if (ret == 0) | 1780 | if (ret == 0) { |
1749 | ret = btrfs_end_transaction(trans, root); | 1781 | ret = btrfs_end_transaction(trans, root); |
1750 | else | 1782 | } else { |
1783 | if (!full_sync) | ||
1784 | btrfs_wait_ordered_range(inode, start, | ||
1785 | end - | ||
1786 | start + 1); | ||
1751 | ret = btrfs_commit_transaction(trans, root); | 1787 | ret = btrfs_commit_transaction(trans, root); |
1788 | } | ||
1752 | } | 1789 | } |
1753 | } else { | 1790 | } else { |
1754 | ret = btrfs_end_transaction(trans, root); | 1791 | ret = btrfs_end_transaction(trans, root); |
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0be7a8742a43..1f84fc09c1a8 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
@@ -1356,6 +1356,8 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) | |||
1356 | u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; | 1356 | u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; |
1357 | int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); | 1357 | int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); |
1358 | 1358 | ||
1359 | max_bitmaps = max(max_bitmaps, 1); | ||
1360 | |||
1359 | BUG_ON(ctl->total_bitmaps > max_bitmaps); | 1361 | BUG_ON(ctl->total_bitmaps > max_bitmaps); |
1360 | 1362 | ||
1361 | /* | 1363 | /* |
@@ -1463,10 +1465,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, | |||
1463 | } | 1465 | } |
1464 | 1466 | ||
1465 | static struct btrfs_free_space * | 1467 | static struct btrfs_free_space * |
1466 | find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) | 1468 | find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, |
1469 | unsigned long align) | ||
1467 | { | 1470 | { |
1468 | struct btrfs_free_space *entry; | 1471 | struct btrfs_free_space *entry; |
1469 | struct rb_node *node; | 1472 | struct rb_node *node; |
1473 | u64 ctl_off; | ||
1474 | u64 tmp; | ||
1475 | u64 align_off; | ||
1470 | int ret; | 1476 | int ret; |
1471 | 1477 | ||
1472 | if (!ctl->free_space_offset.rb_node) | 1478 | if (!ctl->free_space_offset.rb_node) |
@@ -1481,15 +1487,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) | |||
1481 | if (entry->bytes < *bytes) | 1487 | if (entry->bytes < *bytes) |
1482 | continue; | 1488 | continue; |
1483 | 1489 | ||
1490 | /* make sure the space returned is big enough | ||
1491 | * to match our requested alignment | ||
1492 | */ | ||
1493 | if (*bytes >= align) { | ||
1494 | ctl_off = entry->offset - ctl->start; | ||
1495 | tmp = ctl_off + align - 1; | ||
1496 | do_div(tmp, align); | ||
1497 | tmp = tmp * align + ctl->start; | ||
1498 | align_off = tmp - entry->offset; | ||
1499 | } else { | ||
1500 | align_off = 0; | ||
1501 | tmp = entry->offset; | ||
1502 | } | ||
1503 | |||
1504 | if (entry->bytes < *bytes + align_off) | ||
1505 | continue; | ||
1506 | |||
1484 | if (entry->bitmap) { | 1507 | if (entry->bitmap) { |
1485 | ret = search_bitmap(ctl, entry, offset, bytes); | 1508 | ret = search_bitmap(ctl, entry, &tmp, bytes); |
1486 | if (!ret) | 1509 | if (!ret) { |
1510 | *offset = tmp; | ||
1487 | return entry; | 1511 | return entry; |
1512 | } | ||
1488 | continue; | 1513 | continue; |
1489 | } | 1514 | } |
1490 | 1515 | ||
1491 | *offset = entry->offset; | 1516 | *offset = tmp; |
1492 | *bytes = entry->bytes; | 1517 | *bytes = entry->bytes - align_off; |
1493 | return entry; | 1518 | return entry; |
1494 | } | 1519 | } |
1495 | 1520 | ||
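The new math in find_free_space() rounds the entry's offset, measured relative to ctl->start, up to the next multiple of the requested alignment; do_div() is used because a raid56 full stripe is generally not a power of two. Equivalent userspace arithmetic (the 384k stripe below assumes a three-data-disk raid5 and is purely illustrative):

    #include <assert.h>
    #include <stdint.h>

    static uint64_t align_up_from(uint64_t offset, uint64_t start,
                                  uint64_t align)
    {
        uint64_t rel = offset - start;

        return (rel + align - 1) / align * align + start;
    }

    int main(void)
    {
        assert(align_up_from(1000, 0, 393216) == 393216);    /* rounded */
        assert(align_up_from(393216, 0, 393216) == 393216);  /* exact */
        return 0;
    }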
@@ -1636,10 +1661,14 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, | |||
1636 | } | 1661 | } |
1637 | 1662 | ||
1638 | /* | 1663 | /* |
1639 | * some block groups are so tiny they can't be enveloped by a bitmap, so | 1664 | * The original block groups from mkfs can be really small, like 8 |
1640 | * don't even bother to create a bitmap for this | 1665 | * megabytes, so don't bother with a bitmap for those entries. However |
1666 | * some block groups can be smaller than what a bitmap would cover but | ||
1667 | * are still large enough that they could overflow the 32k memory limit, | ||
1668 | * so those block groups are still allowed to have a bitmap | ||
1669 | * entry. | ||
1641 | */ | 1670 | */ |
1642 | if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) | 1671 | if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset) |
1643 | return false; | 1672 | return false; |
1644 | 1673 | ||
1645 | return true; | 1674 | return true; |
@@ -2095,9 +2124,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, | |||
2095 | struct btrfs_free_space *entry = NULL; | 2124 | struct btrfs_free_space *entry = NULL; |
2096 | u64 bytes_search = bytes + empty_size; | 2125 | u64 bytes_search = bytes + empty_size; |
2097 | u64 ret = 0; | 2126 | u64 ret = 0; |
2127 | u64 align_gap = 0; | ||
2128 | u64 align_gap_len = 0; | ||
2098 | 2129 | ||
2099 | spin_lock(&ctl->tree_lock); | 2130 | spin_lock(&ctl->tree_lock); |
2100 | entry = find_free_space(ctl, &offset, &bytes_search); | 2131 | entry = find_free_space(ctl, &offset, &bytes_search, |
2132 | block_group->full_stripe_len); | ||
2101 | if (!entry) | 2133 | if (!entry) |
2102 | goto out; | 2134 | goto out; |
2103 | 2135 | ||
@@ -2107,9 +2139,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, | |||
2107 | if (!entry->bytes) | 2139 | if (!entry->bytes) |
2108 | free_bitmap(ctl, entry); | 2140 | free_bitmap(ctl, entry); |
2109 | } else { | 2141 | } else { |
2142 | |||
2110 | unlink_free_space(ctl, entry); | 2143 | unlink_free_space(ctl, entry); |
2111 | entry->offset += bytes; | 2144 | align_gap_len = offset - entry->offset; |
2112 | entry->bytes -= bytes; | 2145 | align_gap = entry->offset; |
2146 | |||
2147 | entry->offset = offset + bytes; | ||
2148 | WARN_ON(entry->bytes < bytes + align_gap_len); | ||
2149 | |||
2150 | entry->bytes -= bytes + align_gap_len; | ||
2113 | if (!entry->bytes) | 2151 | if (!entry->bytes) |
2114 | kmem_cache_free(btrfs_free_space_cachep, entry); | 2152 | kmem_cache_free(btrfs_free_space_cachep, entry); |
2115 | else | 2153 | else |
@@ -2119,6 +2157,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, | |||
2119 | out: | 2157 | out: |
2120 | spin_unlock(&ctl->tree_lock); | 2158 | spin_unlock(&ctl->tree_lock); |
2121 | 2159 | ||
2160 | if (align_gap_len) | ||
2161 | __btrfs_add_free_space(ctl, align_gap, align_gap_len); | ||
2122 | return ret; | 2162 | return ret; |
2123 | } | 2163 | } |
2124 | 2164 | ||
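When the chosen entry starts before the aligned offset, the allocation is carved out of its middle: the bytes skipped for alignment (align_gap/align_gap_len) are handed back to the free-space tree once tree_lock is dropped, so alignment never leaks space. The carve itself, as a small model:

    #include <assert.h>
    #include <stdint.h>

    struct entry { uint64_t offset, bytes; };

    /* Take [aligned, aligned + want) out of an entry; returns the size
     * of the leading gap that must be re-added as free space. */
    static uint64_t carve(struct entry *e, uint64_t aligned, uint64_t want)
    {
        uint64_t gap = aligned - e->offset;

        assert(e->bytes >= want + gap);
        e->offset = aligned + want;
        e->bytes -= want + gap;
        return gap;
    }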
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 55c07b650378..c226daefd65d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -39,12 +39,13 @@ | |||
39 | #include <linux/slab.h> | 39 | #include <linux/slab.h> |
40 | #include <linux/ratelimit.h> | 40 | #include <linux/ratelimit.h> |
41 | #include <linux/mount.h> | 41 | #include <linux/mount.h> |
42 | #include <linux/btrfs.h> | ||
43 | #include <linux/blkdev.h> | ||
42 | #include "compat.h" | 44 | #include "compat.h" |
43 | #include "ctree.h" | 45 | #include "ctree.h" |
44 | #include "disk-io.h" | 46 | #include "disk-io.h" |
45 | #include "transaction.h" | 47 | #include "transaction.h" |
46 | #include "btrfs_inode.h" | 48 | #include "btrfs_inode.h" |
47 | #include "ioctl.h" | ||
48 | #include "print-tree.h" | 49 | #include "print-tree.h" |
49 | #include "ordered-data.h" | 50 | #include "ordered-data.h" |
50 | #include "xattr.h" | 51 | #include "xattr.h" |
@@ -54,6 +55,7 @@ | |||
54 | #include "locking.h" | 55 | #include "locking.h" |
55 | #include "free-space-cache.h" | 56 | #include "free-space-cache.h" |
56 | #include "inode-map.h" | 57 | #include "inode-map.h" |
58 | #include "backref.h" | ||
57 | 59 | ||
58 | struct btrfs_iget_args { | 60 | struct btrfs_iget_args { |
59 | u64 ino; | 61 | u64 ino; |
@@ -231,8 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, | |||
231 | u64 isize = i_size_read(inode); | 233 | u64 isize = i_size_read(inode); |
232 | u64 actual_end = min(end + 1, isize); | 234 | u64 actual_end = min(end + 1, isize); |
233 | u64 inline_len = actual_end - start; | 235 | u64 inline_len = actual_end - start; |
234 | u64 aligned_end = (end + root->sectorsize - 1) & | 236 | u64 aligned_end = ALIGN(end, root->sectorsize); |
235 | ~((u64)root->sectorsize - 1); | ||
236 | u64 data_len = inline_len; | 237 | u64 data_len = inline_len; |
237 | int ret; | 238 | int ret; |
238 | 239 | ||
@@ -265,6 +266,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, | |||
265 | return 1; | 266 | return 1; |
266 | } | 267 | } |
267 | 268 | ||
269 | set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); | ||
268 | btrfs_delalloc_release_metadata(inode, end + 1 - start); | 270 | btrfs_delalloc_release_metadata(inode, end + 1 - start); |
269 | btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); | 271 | btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); |
270 | return 0; | 272 | return 0; |
@@ -389,7 +391,7 @@ again: | |||
389 | * a compressed extent to 128k. | 391 | * a compressed extent to 128k. |
390 | */ | 392 | */ |
391 | total_compressed = min(total_compressed, max_uncompressed); | 393 | total_compressed = min(total_compressed, max_uncompressed); |
392 | num_bytes = (end - start + blocksize) & ~(blocksize - 1); | 394 | num_bytes = ALIGN(end - start + 1, blocksize); |
393 | num_bytes = max(blocksize, num_bytes); | 395 | num_bytes = max(blocksize, num_bytes); |
394 | total_in = 0; | 396 | total_in = 0; |
395 | ret = 0; | 397 | ret = 0; |
@@ -488,15 +490,13 @@ cont: | |||
488 | * up to a block size boundary so the allocator does sane | 490 | * up to a block size boundary so the allocator does sane |
489 | * things | 491 | * things |
490 | */ | 492 | */ |
491 | total_compressed = (total_compressed + blocksize - 1) & | 493 | total_compressed = ALIGN(total_compressed, blocksize); |
492 | ~(blocksize - 1); | ||
493 | 494 | ||
494 | /* | 495 | /* |
495 | * one last check to make sure the compression is really a | 496 | * one last check to make sure the compression is really a |
496 | * win, compare the page count read with the blocks on disk | 497 | * win, compare the page count read with the blocks on disk |
497 | */ | 498 | */ |
498 | total_in = (total_in + PAGE_CACHE_SIZE - 1) & | 499 | total_in = ALIGN(total_in, PAGE_CACHE_SIZE); |
499 | ~(PAGE_CACHE_SIZE - 1); | ||
500 | if (total_compressed >= total_in) { | 500 | if (total_compressed >= total_in) { |
501 | will_compress = 0; | 501 | will_compress = 0; |
502 | } else { | 502 | } else { |
@@ -608,7 +608,7 @@ static noinline int submit_compressed_extents(struct inode *inode, | |||
608 | if (list_empty(&async_cow->extents)) | 608 | if (list_empty(&async_cow->extents)) |
609 | return 0; | 609 | return 0; |
610 | 610 | ||
611 | 611 | again: | |
612 | while (!list_empty(&async_cow->extents)) { | 612 | while (!list_empty(&async_cow->extents)) { |
613 | async_extent = list_entry(async_cow->extents.next, | 613 | async_extent = list_entry(async_cow->extents.next, |
614 | struct async_extent, list); | 614 | struct async_extent, list); |
@@ -648,6 +648,8 @@ retry: | |||
648 | async_extent->ram_size - 1, | 648 | async_extent->ram_size - 1, |
649 | btrfs_get_extent, | 649 | btrfs_get_extent, |
650 | WB_SYNC_ALL); | 650 | WB_SYNC_ALL); |
651 | else if (ret) | ||
652 | unlock_page(async_cow->locked_page); | ||
651 | kfree(async_extent); | 653 | kfree(async_extent); |
652 | cond_resched(); | 654 | cond_resched(); |
653 | continue; | 655 | continue; |
@@ -672,6 +674,7 @@ retry: | |||
672 | 674 | ||
673 | if (ret) { | 675 | if (ret) { |
674 | int i; | 676 | int i; |
677 | |||
675 | for (i = 0; i < async_extent->nr_pages; i++) { | 678 | for (i = 0; i < async_extent->nr_pages; i++) { |
676 | WARN_ON(async_extent->pages[i]->mapping); | 679 | WARN_ON(async_extent->pages[i]->mapping); |
677 | page_cache_release(async_extent->pages[i]); | 680 | page_cache_release(async_extent->pages[i]); |
@@ -679,12 +682,10 @@ retry: | |||
679 | kfree(async_extent->pages); | 682 | kfree(async_extent->pages); |
680 | async_extent->nr_pages = 0; | 683 | async_extent->nr_pages = 0; |
681 | async_extent->pages = NULL; | 684 | async_extent->pages = NULL; |
682 | unlock_extent(io_tree, async_extent->start, | 685 | |
683 | async_extent->start + | ||
684 | async_extent->ram_size - 1); | ||
685 | if (ret == -ENOSPC) | 686 | if (ret == -ENOSPC) |
686 | goto retry; | 687 | goto retry; |
687 | goto out_free; /* JDM: Requeue? */ | 688 | goto out_free; |
688 | } | 689 | } |
689 | 690 | ||
690 | /* | 691 | /* |
@@ -696,10 +697,13 @@ retry: | |||
696 | async_extent->ram_size - 1, 0); | 697 | async_extent->ram_size - 1, 0); |
697 | 698 | ||
698 | em = alloc_extent_map(); | 699 | em = alloc_extent_map(); |
699 | BUG_ON(!em); /* -ENOMEM */ | 700 | if (!em) |
701 | goto out_free_reserve; | ||
700 | em->start = async_extent->start; | 702 | em->start = async_extent->start; |
701 | em->len = async_extent->ram_size; | 703 | em->len = async_extent->ram_size; |
702 | em->orig_start = em->start; | 704 | em->orig_start = em->start; |
705 | em->mod_start = em->start; | ||
706 | em->mod_len = em->len; | ||
703 | 707 | ||
704 | em->block_start = ins.objectid; | 708 | em->block_start = ins.objectid; |
705 | em->block_len = ins.offset; | 709 | em->block_len = ins.offset; |
@@ -726,6 +730,9 @@ retry: | |||
726 | async_extent->ram_size - 1, 0); | 730 | async_extent->ram_size - 1, 0); |
727 | } | 731 | } |
728 | 732 | ||
733 | if (ret) | ||
734 | goto out_free_reserve; | ||
735 | |||
729 | ret = btrfs_add_ordered_extent_compress(inode, | 736 | ret = btrfs_add_ordered_extent_compress(inode, |
730 | async_extent->start, | 737 | async_extent->start, |
731 | ins.objectid, | 738 | ins.objectid, |
@@ -733,7 +740,8 @@ retry: | |||
733 | ins.offset, | 740 | ins.offset, |
734 | BTRFS_ORDERED_COMPRESSED, | 741 | BTRFS_ORDERED_COMPRESSED, |
735 | async_extent->compress_type); | 742 | async_extent->compress_type); |
736 | BUG_ON(ret); /* -ENOMEM */ | 743 | if (ret) |
744 | goto out_free_reserve; | ||
737 | 745 | ||
738 | /* | 746 | /* |
739 | * clear dirty, set writeback and unlock the pages. | 747 | * clear dirty, set writeback and unlock the pages. |
@@ -754,18 +762,30 @@ retry: | |||
754 | ins.objectid, | 762 | ins.objectid, |
755 | ins.offset, async_extent->pages, | 763 | ins.offset, async_extent->pages, |
756 | async_extent->nr_pages); | 764 | async_extent->nr_pages); |
757 | |||
758 | BUG_ON(ret); /* -ENOMEM */ | ||
759 | alloc_hint = ins.objectid + ins.offset; | 765 | alloc_hint = ins.objectid + ins.offset; |
760 | kfree(async_extent); | 766 | kfree(async_extent); |
767 | if (ret) | ||
768 | goto out; | ||
761 | cond_resched(); | 769 | cond_resched(); |
762 | } | 770 | } |
763 | ret = 0; | 771 | ret = 0; |
764 | out: | 772 | out: |
765 | return ret; | 773 | return ret; |
774 | out_free_reserve: | ||
775 | btrfs_free_reserved_extent(root, ins.objectid, ins.offset); | ||
766 | out_free: | 776 | out_free: |
777 | extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, | ||
778 | async_extent->start, | ||
779 | async_extent->start + | ||
780 | async_extent->ram_size - 1, | ||
781 | NULL, EXTENT_CLEAR_UNLOCK_PAGE | | ||
782 | EXTENT_CLEAR_UNLOCK | | ||
783 | EXTENT_CLEAR_DELALLOC | | ||
784 | EXTENT_CLEAR_DIRTY | | ||
785 | EXTENT_SET_WRITEBACK | | ||
786 | EXTENT_END_WRITEBACK); | ||
767 | kfree(async_extent); | 787 | kfree(async_extent); |
768 | goto out; | 788 | goto again; |
769 | } | 789 | } |
770 | 790 | ||
771 | static u64 get_extent_allocation_hint(struct inode *inode, u64 start, | 791 | static u64 get_extent_allocation_hint(struct inode *inode, u64 start, |
@@ -834,7 +854,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, | |||
834 | 854 | ||
835 | BUG_ON(btrfs_is_free_space_inode(inode)); | 855 | BUG_ON(btrfs_is_free_space_inode(inode)); |
836 | 856 | ||
837 | num_bytes = (end - start + blocksize) & ~(blocksize - 1); | 857 | num_bytes = ALIGN(end - start + 1, blocksize); |
838 | num_bytes = max(blocksize, num_bytes); | 858 | num_bytes = max(blocksize, num_bytes); |
839 | disk_num_bytes = num_bytes; | 859 | disk_num_bytes = num_bytes; |
840 | 860 | ||
@@ -892,6 +912,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, | |||
892 | em->orig_start = em->start; | 912 | em->orig_start = em->start; |
893 | ram_size = ins.offset; | 913 | ram_size = ins.offset; |
894 | em->len = ins.offset; | 914 | em->len = ins.offset; |
915 | em->mod_start = em->start; | ||
916 | em->mod_len = em->len; | ||
895 | 917 | ||
896 | em->block_start = ins.objectid; | 918 | em->block_start = ins.objectid; |
897 | em->block_len = ins.offset; | 919 | em->block_len = ins.offset; |
@@ -1338,6 +1360,8 @@ out_check: | |||
1338 | em->block_start = disk_bytenr; | 1360 | em->block_start = disk_bytenr; |
1339 | em->orig_block_len = disk_num_bytes; | 1361 | em->orig_block_len = disk_num_bytes; |
1340 | em->bdev = root->fs_info->fs_devices->latest_bdev; | 1362 | em->bdev = root->fs_info->fs_devices->latest_bdev; |
1363 | em->mod_start = em->start; | ||
1364 | em->mod_len = em->len; | ||
1341 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | 1365 | set_bit(EXTENT_FLAG_PINNED, &em->flags); |
1342 | set_bit(EXTENT_FLAG_FILLING, &em->flags); | 1366 | set_bit(EXTENT_FLAG_FILLING, &em->flags); |
1343 | em->generation = -1; | 1367 | em->generation = -1; |
@@ -1508,14 +1532,22 @@ static void btrfs_set_bit_hook(struct inode *inode, | |||
1508 | spin_unlock(&BTRFS_I(inode)->lock); | 1532 | spin_unlock(&BTRFS_I(inode)->lock); |
1509 | } | 1533 | } |
1510 | 1534 | ||
1511 | spin_lock(&root->fs_info->delalloc_lock); | 1535 | __percpu_counter_add(&root->fs_info->delalloc_bytes, len, |
1536 | root->fs_info->delalloc_batch); | ||
1537 | spin_lock(&BTRFS_I(inode)->lock); | ||
1512 | BTRFS_I(inode)->delalloc_bytes += len; | 1538 | BTRFS_I(inode)->delalloc_bytes += len; |
1513 | root->fs_info->delalloc_bytes += len; | 1539 | if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, |
1514 | if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { | 1540 | &BTRFS_I(inode)->runtime_flags)) { |
1515 | list_add_tail(&BTRFS_I(inode)->delalloc_inodes, | 1541 | spin_lock(&root->fs_info->delalloc_lock); |
1516 | &root->fs_info->delalloc_inodes); | 1542 | if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { |
1543 | list_add_tail(&BTRFS_I(inode)->delalloc_inodes, | ||
1544 | &root->fs_info->delalloc_inodes); | ||
1545 | set_bit(BTRFS_INODE_IN_DELALLOC_LIST, | ||
1546 | &BTRFS_I(inode)->runtime_flags); | ||
1547 | } | ||
1548 | spin_unlock(&root->fs_info->delalloc_lock); | ||
1517 | } | 1549 | } |
1518 | spin_unlock(&root->fs_info->delalloc_lock); | 1550 | spin_unlock(&BTRFS_I(inode)->lock); |
1519 | } | 1551 | } |
1520 | } | 1552 | } |
1521 | 1553 | ||
@@ -1550,15 +1582,22 @@ static void btrfs_clear_bit_hook(struct inode *inode, | |||
1550 | && do_list) | 1582 | && do_list) |
1551 | btrfs_free_reserved_data_space(inode, len); | 1583 | btrfs_free_reserved_data_space(inode, len); |
1552 | 1584 | ||
1553 | spin_lock(&root->fs_info->delalloc_lock); | 1585 | __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, |
1554 | root->fs_info->delalloc_bytes -= len; | 1586 | root->fs_info->delalloc_batch); |
1587 | spin_lock(&BTRFS_I(inode)->lock); | ||
1555 | BTRFS_I(inode)->delalloc_bytes -= len; | 1588 | BTRFS_I(inode)->delalloc_bytes -= len; |
1556 | |||
1557 | if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && | 1589 | if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && |
1558 | !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { | 1590 | test_bit(BTRFS_INODE_IN_DELALLOC_LIST, |
1559 | list_del_init(&BTRFS_I(inode)->delalloc_inodes); | 1591 | &BTRFS_I(inode)->runtime_flags)) { |
1592 | spin_lock(&root->fs_info->delalloc_lock); | ||
1593 | if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { | ||
1594 | list_del_init(&BTRFS_I(inode)->delalloc_inodes); | ||
1595 | clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, | ||
1596 | &BTRFS_I(inode)->runtime_flags); | ||
1597 | } | ||
1598 | spin_unlock(&root->fs_info->delalloc_lock); | ||
1560 | } | 1599 | } |
1561 | spin_unlock(&root->fs_info->delalloc_lock); | 1600 | spin_unlock(&BTRFS_I(inode)->lock); |
1562 | } | 1601 | } |
1563 | } | 1602 | } |
1564 | 1603 | ||
@@ -1566,7 +1605,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, | |||
1566 | * extent_io.c merge_bio_hook, this must check the chunk tree to make sure | 1605 | * extent_io.c merge_bio_hook, this must check the chunk tree to make sure |
1567 | * we don't create bios that span stripes or chunks | 1606 | * we don't create bios that span stripes or chunks |
1568 | */ | 1607 | */ |
1569 | int btrfs_merge_bio_hook(struct page *page, unsigned long offset, | 1608 | int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, |
1570 | size_t size, struct bio *bio, | 1609 | size_t size, struct bio *bio, |
1571 | unsigned long bio_flags) | 1610 | unsigned long bio_flags) |
1572 | { | 1611 | { |
@@ -1581,7 +1620,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, | |||
1581 | 1620 | ||
1582 | length = bio->bi_size; | 1621 | length = bio->bi_size; |
1583 | map_length = length; | 1622 | map_length = length; |
1584 | ret = btrfs_map_block(root->fs_info, READ, logical, | 1623 | ret = btrfs_map_block(root->fs_info, rw, logical, |
1585 | &map_length, NULL, 0); | 1624 | &map_length, NULL, 0); |
1586 | /* Will always return 0 with map_multi == NULL */ | 1625 | /* Will always return 0 with map_multi == NULL */ |
1587 | BUG_ON(ret < 0); | 1626 | BUG_ON(ret < 0); |
@@ -1892,6 +1931,640 @@ out: | |||
1892 | return ret; | 1931 | return ret; |
1893 | } | 1932 | } |
1894 | 1933 | ||
1934 | /* snapshot-aware defrag */ | ||
1935 | struct sa_defrag_extent_backref { | ||
1936 | struct rb_node node; | ||
1937 | struct old_sa_defrag_extent *old; | ||
1938 | u64 root_id; | ||
1939 | u64 inum; | ||
1940 | u64 file_pos; | ||
1941 | u64 extent_offset; | ||
1942 | u64 num_bytes; | ||
1943 | u64 generation; | ||
1944 | }; | ||
1945 | |||
1946 | struct old_sa_defrag_extent { | ||
1947 | struct list_head list; | ||
1948 | struct new_sa_defrag_extent *new; | ||
1949 | |||
1950 | u64 extent_offset; | ||
1951 | u64 bytenr; | ||
1952 | u64 offset; | ||
1953 | u64 len; | ||
1954 | int count; | ||
1955 | }; | ||
1956 | |||
1957 | struct new_sa_defrag_extent { | ||
1958 | struct rb_root root; | ||
1959 | struct list_head head; | ||
1960 | struct btrfs_path *path; | ||
1961 | struct inode *inode; | ||
1962 | u64 file_pos; | ||
1963 | u64 len; | ||
1964 | u64 bytenr; | ||
1965 | u64 disk_len; | ||
1966 | u8 compress_type; | ||
1967 | }; | ||
1968 | |||
1969 | static int backref_comp(struct sa_defrag_extent_backref *b1, | ||
1970 | struct sa_defrag_extent_backref *b2) | ||
1971 | { | ||
1972 | if (b1->root_id < b2->root_id) | ||
1973 | return -1; | ||
1974 | else if (b1->root_id > b2->root_id) | ||
1975 | return 1; | ||
1976 | |||
1977 | if (b1->inum < b2->inum) | ||
1978 | return -1; | ||
1979 | else if (b1->inum > b2->inum) | ||
1980 | return 1; | ||
1981 | |||
1982 | if (b1->file_pos < b2->file_pos) | ||
1983 | return -1; | ||
1984 | else if (b1->file_pos > b2->file_pos) | ||
1985 | return 1; | ||
1986 | |||
1987 | /* | ||
1988 | * [------------------------------] ===> (a range of space) | ||
1989 | * |<--->| |<---->| =============> (fs/file tree A) | ||
1990 | * |<---------------------------->| ===> (fs/file tree B) | ||
1991 | * | ||
1992 | * A range of space can refer to two file extents in one tree while | ||
1993 | * referring to only one file extent in another tree. | ||
1994 | * | ||
1995 | * So we may process a disk offset more than once (two extents in A) | ||
1996 | * and land on the same extent (one extent in B), then insert two | ||
1997 | * identical backrefs (both referring to the extent in B). | ||
1998 | */ | ||
1999 | return 0; | ||
2000 | } | ||
2001 | |||
2002 | static void backref_insert(struct rb_root *root, | ||
2003 | struct sa_defrag_extent_backref *backref) | ||
2004 | { | ||
2005 | struct rb_node **p = &root->rb_node; | ||
2006 | struct rb_node *parent = NULL; | ||
2007 | struct sa_defrag_extent_backref *entry; | ||
2008 | int ret; | ||
2009 | |||
2010 | while (*p) { | ||
2011 | parent = *p; | ||
2012 | entry = rb_entry(parent, struct sa_defrag_extent_backref, node); | ||
2013 | |||
2014 | ret = backref_comp(backref, entry); | ||
2015 | if (ret < 0) | ||
2016 | p = &(*p)->rb_left; | ||
2017 | else | ||
2018 | p = &(*p)->rb_right; | ||
2019 | } | ||
2020 | |||
2021 | rb_link_node(&backref->node, parent, p); | ||
2022 | rb_insert_color(&backref->node, root); | ||
2023 | } | ||
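backref_insert() is the stock rbtree insertion walk: descend by the comparator while remembering the parent, link the new node at the empty slot, then let rb_insert_color() rebalance. The same walk over a plain binary search tree, self-contained (rebalancing omitted):

    #include <stddef.h>

    struct node {
        struct node *left, *right;
        long key;                       /* stands in for the comparator */
    };

    static void bst_insert(struct node **root, struct node *new)
    {
        struct node **p = root;

        while (*p)
            p = (new->key < (*p)->key) ? &(*p)->left : &(*p)->right;
        new->left = new->right = NULL;
        *p = new;                       /* rb_insert_color() would go here */
    }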
2024 | |||
2025 | /* | ||
2026 | * Note that the backref might have changed; in that case we just return 0. | ||
2027 | */ | ||
2028 | static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, | ||
2029 | void *ctx) | ||
2030 | { | ||
2031 | struct btrfs_file_extent_item *extent; | ||
2032 | struct btrfs_fs_info *fs_info; | ||
2033 | struct old_sa_defrag_extent *old = ctx; | ||
2034 | struct new_sa_defrag_extent *new = old->new; | ||
2035 | struct btrfs_path *path = new->path; | ||
2036 | struct btrfs_key key; | ||
2037 | struct btrfs_root *root; | ||
2038 | struct sa_defrag_extent_backref *backref; | ||
2039 | struct extent_buffer *leaf; | ||
2040 | struct inode *inode = new->inode; | ||
2041 | int slot; | ||
2042 | int ret; | ||
2043 | u64 extent_offset; | ||
2044 | u64 num_bytes; | ||
2045 | |||
2046 | if (BTRFS_I(inode)->root->root_key.objectid == root_id && | ||
2047 | inum == btrfs_ino(inode)) | ||
2048 | return 0; | ||
2049 | |||
2050 | key.objectid = root_id; | ||
2051 | key.type = BTRFS_ROOT_ITEM_KEY; | ||
2052 | key.offset = (u64)-1; | ||
2053 | |||
2054 | fs_info = BTRFS_I(inode)->root->fs_info; | ||
2055 | root = btrfs_read_fs_root_no_name(fs_info, &key); | ||
2056 | if (IS_ERR(root)) { | ||
2057 | if (PTR_ERR(root) == -ENOENT) | ||
2058 | return 0; | ||
2059 | WARN_ON(1); | ||
2060 | pr_debug("inum=%llu, offset=%llu, root_id=%llu\n", | ||
2061 | inum, offset, root_id); | ||
2062 | return PTR_ERR(root); | ||
2063 | } | ||
2064 | |||
2065 | key.objectid = inum; | ||
2066 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
2067 | if (offset > (u64)-1 << 32) | ||
2068 | key.offset = 0; | ||
2069 | else | ||
2070 | key.offset = offset; | ||
2071 | |||
2072 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
2073 | if (ret < 0) { | ||
2074 | WARN_ON(1); | ||
2075 | return ret; | ||
2076 | } | ||
2077 | |||
2078 | while (1) { | ||
2079 | cond_resched(); | ||
2080 | |||
2081 | leaf = path->nodes[0]; | ||
2082 | slot = path->slots[0]; | ||
2083 | |||
2084 | if (slot >= btrfs_header_nritems(leaf)) { | ||
2085 | ret = btrfs_next_leaf(root, path); | ||
2086 | if (ret < 0) { | ||
2087 | goto out; | ||
2088 | } else if (ret > 0) { | ||
2089 | ret = 0; | ||
2090 | goto out; | ||
2091 | } | ||
2092 | continue; | ||
2093 | } | ||
2094 | |||
2095 | path->slots[0]++; | ||
2096 | |||
2097 | btrfs_item_key_to_cpu(leaf, &key, slot); | ||
2098 | |||
2099 | if (key.objectid > inum) | ||
2100 | goto out; | ||
2101 | |||
2102 | if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) | ||
2103 | continue; | ||
2104 | |||
2105 | extent = btrfs_item_ptr(leaf, slot, | ||
2106 | struct btrfs_file_extent_item); | ||
2107 | |||
2108 | if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) | ||
2109 | continue; | ||
2110 | |||
2111 | extent_offset = btrfs_file_extent_offset(leaf, extent); | ||
2112 | if (key.offset - extent_offset != offset) | ||
2113 | continue; | ||
2114 | |||
2115 | num_bytes = btrfs_file_extent_num_bytes(leaf, extent); | ||
2116 | if (extent_offset >= old->extent_offset + old->offset + | ||
2117 | old->len || extent_offset + num_bytes <= | ||
2118 | old->extent_offset + old->offset) | ||
2119 | continue; | ||
2120 | |||
2121 | break; | ||
2122 | } | ||
2123 | |||
2124 | backref = kmalloc(sizeof(*backref), GFP_NOFS); | ||
2125 | if (!backref) { | ||
2126 | ret = -ENOMEM; | ||
2127 | goto out; | ||
2128 | } | ||
2129 | |||
2130 | backref->root_id = root_id; | ||
2131 | backref->inum = inum; | ||
2132 | backref->file_pos = offset + extent_offset; | ||
2133 | backref->num_bytes = num_bytes; | ||
2134 | backref->extent_offset = extent_offset; | ||
2135 | backref->generation = btrfs_file_extent_generation(leaf, extent); | ||
2136 | backref->old = old; | ||
2137 | backref_insert(&new->root, backref); | ||
2138 | old->count++; | ||
2139 | out: | ||
2140 | btrfs_release_path(path); | ||
2141 | WARN_ON(ret); | ||
2142 | return ret; | ||
2143 | } | ||
2144 | |||
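[note] The overlap test in record_one_backref() above skips a file extent item unless its [extent_offset, extent_offset + num_bytes) window intersects the defragged sub-range of the old extent. A minimal userspace sketch of the same half-open interval check, with made-up values:

	#include <assert.h>
	typedef unsigned long long u64;

	/* two half-open ranges [a, a+alen) and [b, b+blen) overlap iff
	 * a < b + blen && b < a + alen; the kernel code tests the
	 * negation and continues when the ranges are disjoint */
	static int ranges_overlap(u64 a, u64 alen, u64 b, u64 blen)
	{
		return a < b + blen && b < a + alen;
	}

	int main(void)
	{
		assert(ranges_overlap(4096, 8192, 0, 4096) == 0); /* touch at 4096: disjoint */
		assert(ranges_overlap(4096, 8192, 0, 4097) == 1); /* one byte of overlap */
		return 0;
	}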
2145 | static noinline bool record_extent_backrefs(struct btrfs_path *path, | ||
2146 | struct new_sa_defrag_extent *new) | ||
2147 | { | ||
2148 | struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info; | ||
2149 | struct old_sa_defrag_extent *old, *tmp; | ||
2150 | int ret; | ||
2151 | |||
2152 | new->path = path; | ||
2153 | |||
2154 | list_for_each_entry_safe(old, tmp, &new->head, list) { | ||
2155 | ret = iterate_inodes_from_logical(old->bytenr, fs_info, | ||
2156 | path, record_one_backref, | ||
2157 | old); | ||
2158 | BUG_ON(ret < 0 && ret != -ENOENT); | ||
2159 | |||
2160 | /* no backrefs to process for this extent */ | ||
2161 | if (!old->count) { | ||
2162 | list_del(&old->list); | ||
2163 | kfree(old); | ||
2164 | } | ||
2165 | } | ||
2166 | |||
2167 | if (list_empty(&new->head)) | ||
2168 | return false; | ||
2169 | |||
2170 | return true; | ||
2171 | } | ||
2172 | |||
2173 | static int relink_is_mergable(struct extent_buffer *leaf, | ||
2174 | struct btrfs_file_extent_item *fi, | ||
2175 | u64 disk_bytenr) | ||
2176 | { | ||
2177 | if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr) | ||
2178 | return 0; | ||
2179 | |||
2180 | if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) | ||
2181 | return 0; | ||
2182 | |||
2183 | if (btrfs_file_extent_compression(leaf, fi) || | ||
2184 | btrfs_file_extent_encryption(leaf, fi) || | ||
2185 | btrfs_file_extent_other_encoding(leaf, fi)) | ||
2186 | return 0; | ||
2187 | |||
2188 | return 1; | ||
2189 | } | ||
2190 | |||
2191 | /* | ||
2192 | * Note the backref might have changed, and in this case we just return 0. | ||
2193 | */ | ||
2194 | static noinline int relink_extent_backref(struct btrfs_path *path, | ||
2195 | struct sa_defrag_extent_backref *prev, | ||
2196 | struct sa_defrag_extent_backref *backref) | ||
2197 | { | ||
2198 | struct btrfs_file_extent_item *extent; | ||
2199 | struct btrfs_file_extent_item *item; | ||
2200 | struct btrfs_ordered_extent *ordered; | ||
2201 | struct btrfs_trans_handle *trans; | ||
2202 | struct btrfs_fs_info *fs_info; | ||
2203 | struct btrfs_root *root; | ||
2204 | struct btrfs_key key; | ||
2205 | struct extent_buffer *leaf; | ||
2206 | struct old_sa_defrag_extent *old = backref->old; | ||
2207 | struct new_sa_defrag_extent *new = old->new; | ||
2208 | struct inode *src_inode = new->inode; | ||
2209 | struct inode *inode; | ||
2210 | struct extent_state *cached = NULL; | ||
2211 | int ret = 0; | ||
2212 | u64 start; | ||
2213 | u64 len; | ||
2214 | u64 lock_start; | ||
2215 | u64 lock_end; | ||
2216 | bool merge = false; | ||
2217 | int index; | ||
2218 | |||
2219 | if (prev && prev->root_id == backref->root_id && | ||
2220 | prev->inum == backref->inum && | ||
2221 | prev->file_pos + prev->num_bytes == backref->file_pos) | ||
2222 | merge = true; | ||
2223 | |||
2224 | /* step 1: get root */ | ||
2225 | key.objectid = backref->root_id; | ||
2226 | key.type = BTRFS_ROOT_ITEM_KEY; | ||
2227 | key.offset = (u64)-1; | ||
2228 | |||
2229 | fs_info = BTRFS_I(src_inode)->root->fs_info; | ||
2230 | index = srcu_read_lock(&fs_info->subvol_srcu); | ||
2231 | |||
2232 | root = btrfs_read_fs_root_no_name(fs_info, &key); | ||
2233 | if (IS_ERR(root)) { | ||
2234 | srcu_read_unlock(&fs_info->subvol_srcu, index); | ||
2235 | if (PTR_ERR(root) == -ENOENT) | ||
2236 | return 0; | ||
2237 | return PTR_ERR(root); | ||
2238 | } | ||
2239 | if (btrfs_root_refs(&root->root_item) == 0) { | ||
2240 | srcu_read_unlock(&fs_info->subvol_srcu, index); | ||
2241 | /* translate ENOENT to 0 */ | ||
2242 | return 0; | ||
2243 | } | ||
2244 | |||
2245 | /* step 2: get inode */ | ||
2246 | key.objectid = backref->inum; | ||
2247 | key.type = BTRFS_INODE_ITEM_KEY; | ||
2248 | key.offset = 0; | ||
2249 | |||
2250 | inode = btrfs_iget(fs_info->sb, &key, root, NULL); | ||
2251 | if (IS_ERR(inode)) { | ||
2252 | srcu_read_unlock(&fs_info->subvol_srcu, index); | ||
2253 | return 0; | ||
2254 | } | ||
2255 | |||
2256 | srcu_read_unlock(&fs_info->subvol_srcu, index); | ||
2257 | |||
2258 | /* step 3: relink backref */ | ||
2259 | lock_start = backref->file_pos; | ||
2260 | lock_end = backref->file_pos + backref->num_bytes - 1; | ||
2261 | lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, | ||
2262 | 0, &cached); | ||
2263 | |||
2264 | ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); | ||
2265 | if (ordered) { | ||
2266 | btrfs_put_ordered_extent(ordered); | ||
2267 | goto out_unlock; | ||
2268 | } | ||
2269 | |||
2270 | trans = btrfs_join_transaction(root); | ||
2271 | if (IS_ERR(trans)) { | ||
2272 | ret = PTR_ERR(trans); | ||
2273 | goto out_unlock; | ||
2274 | } | ||
2275 | |||
2276 | key.objectid = backref->inum; | ||
2277 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
2278 | key.offset = backref->file_pos; | ||
2279 | |||
2280 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
2281 | if (ret < 0) { | ||
2282 | goto out_free_path; | ||
2283 | } else if (ret > 0) { | ||
2284 | ret = 0; | ||
2285 | goto out_free_path; | ||
2286 | } | ||
2287 | |||
2288 | extent = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
2289 | struct btrfs_file_extent_item); | ||
2290 | |||
2291 | if (btrfs_file_extent_generation(path->nodes[0], extent) != | ||
2292 | backref->generation) | ||
2293 | goto out_free_path; | ||
2294 | |||
2295 | btrfs_release_path(path); | ||
2296 | |||
2297 | start = backref->file_pos; | ||
2298 | if (backref->extent_offset < old->extent_offset + old->offset) | ||
2299 | start += old->extent_offset + old->offset - | ||
2300 | backref->extent_offset; | ||
2301 | |||
2302 | len = min(backref->extent_offset + backref->num_bytes, | ||
2303 | old->extent_offset + old->offset + old->len); | ||
2304 | len -= max(backref->extent_offset, old->extent_offset + old->offset); | ||
2305 | |||
2306 | ret = btrfs_drop_extents(trans, root, inode, start, | ||
2307 | start + len, 1); | ||
2308 | if (ret) | ||
2309 | goto out_free_path; | ||
2310 | again: | ||
2311 | key.objectid = btrfs_ino(inode); | ||
2312 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
2313 | key.offset = start; | ||
2314 | |||
2315 | if (merge) { | ||
2316 | struct btrfs_file_extent_item *fi; | ||
2317 | u64 extent_len; | ||
2318 | struct btrfs_key found_key; | ||
2319 | |||
2320 | ret = btrfs_search_slot(trans, root, &key, path, 1, 1); | ||
2321 | if (ret < 0) | ||
2322 | goto out_free_path; | ||
2323 | |||
2324 | path->slots[0]--; | ||
2325 | leaf = path->nodes[0]; | ||
2326 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
2327 | |||
2328 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
2329 | struct btrfs_file_extent_item); | ||
2330 | extent_len = btrfs_file_extent_num_bytes(leaf, fi); | ||
2331 | |||
2332 | if (relink_is_mergable(leaf, fi, new->bytenr) && | ||
2333 | extent_len + found_key.offset == start) { | ||
2334 | btrfs_set_file_extent_num_bytes(leaf, fi, | ||
2335 | extent_len + len); | ||
2336 | btrfs_mark_buffer_dirty(leaf); | ||
2337 | inode_add_bytes(inode, len); | ||
2338 | |||
2339 | ret = 1; | ||
2340 | goto out_free_path; | ||
2341 | } else { | ||
2342 | merge = false; | ||
2343 | btrfs_release_path(path); | ||
2344 | goto again; | ||
2345 | } | ||
2346 | } | ||
2347 | |||
2348 | ret = btrfs_insert_empty_item(trans, root, path, &key, | ||
2349 | sizeof(*extent)); | ||
2350 | if (ret) { | ||
2351 | btrfs_abort_transaction(trans, root, ret); | ||
2352 | goto out_free_path; | ||
2353 | } | ||
2354 | |||
2355 | leaf = path->nodes[0]; | ||
2356 | item = btrfs_item_ptr(leaf, path->slots[0], | ||
2357 | struct btrfs_file_extent_item); | ||
2358 | btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); | ||
2359 | btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); | ||
2360 | btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); | ||
2361 | btrfs_set_file_extent_num_bytes(leaf, item, len); | ||
2362 | btrfs_set_file_extent_ram_bytes(leaf, item, new->len); | ||
2363 | btrfs_set_file_extent_generation(leaf, item, trans->transid); | ||
2364 | btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); | ||
2365 | btrfs_set_file_extent_compression(leaf, item, new->compress_type); | ||
2366 | btrfs_set_file_extent_encryption(leaf, item, 0); | ||
2367 | btrfs_set_file_extent_other_encoding(leaf, item, 0); | ||
2368 | |||
2369 | btrfs_mark_buffer_dirty(leaf); | ||
2370 | inode_add_bytes(inode, len); | ||
2371 | |||
2372 | ret = btrfs_inc_extent_ref(trans, root, new->bytenr, | ||
2373 | new->disk_len, 0, | ||
2374 | backref->root_id, backref->inum, | ||
2375 | new->file_pos, 0); /* start - extent_offset */ | ||
2376 | if (ret) { | ||
2377 | btrfs_abort_transaction(trans, root, ret); | ||
2378 | goto out_free_path; | ||
2379 | } | ||
2380 | |||
2381 | ret = 1; | ||
2382 | out_free_path: | ||
2383 | btrfs_release_path(path); | ||
2384 | btrfs_end_transaction(trans, root); | ||
2385 | out_unlock: | ||
2386 | unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, | ||
2387 | &cached, GFP_NOFS); | ||
2388 | iput(inode); | ||
2389 | return ret; | ||
2390 | } | ||
2391 | |||
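[note] The start/len computation in relink_extent_backref() clips the relinked window to the intersection of the backref's extent window with the defragged old-extent window, both measured in extent_offset coordinates. A hedged sketch with hypothetical numbers:

	#include <stdio.h>
	typedef unsigned long long u64;
	#define min(a, b) ((a) < (b) ? (a) : (b))
	#define max(a, b) ((a) > (b) ? (a) : (b))

	int main(void)
	{
		/* illustrative values only */
		u64 file_pos = 1 << 20;                   /* backref->file_pos */
		u64 b_off = 0, b_bytes = 64 << 10;        /* backref extent window */
		u64 o_start = 16 << 10, o_len = 32 << 10; /* old->extent_offset + old->offset, old->len */

		u64 start = file_pos;
		if (b_off < o_start)
			start += o_start - b_off;         /* skip the non-defragged head */

		u64 len = min(b_off + b_bytes, o_start + o_len) -
			  max(b_off, o_start);            /* intersection length */

		printf("relink [%llu, %llu)\n", start, start + len); /* [1M+16K, 1M+48K) */
		return 0;
	}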
2392 | static void relink_file_extents(struct new_sa_defrag_extent *new) | ||
2393 | { | ||
2394 | struct btrfs_path *path; | ||
2395 | struct old_sa_defrag_extent *old, *tmp; | ||
2396 | struct sa_defrag_extent_backref *backref; | ||
2397 | struct sa_defrag_extent_backref *prev = NULL; | ||
2398 | struct inode *inode; | ||
2399 | struct btrfs_root *root; | ||
2400 | struct rb_node *node; | ||
2401 | int ret; | ||
2402 | |||
2403 | inode = new->inode; | ||
2404 | root = BTRFS_I(inode)->root; | ||
2405 | |||
2406 | path = btrfs_alloc_path(); | ||
2407 | if (!path) | ||
2408 | return; | ||
2409 | |||
2410 | if (!record_extent_backrefs(path, new)) { | ||
2411 | btrfs_free_path(path); | ||
2412 | goto out; | ||
2413 | } | ||
2414 | btrfs_release_path(path); | ||
2415 | |||
2416 | while (1) { | ||
2417 | node = rb_first(&new->root); | ||
2418 | if (!node) | ||
2419 | break; | ||
2420 | rb_erase(node, &new->root); | ||
2421 | |||
2422 | backref = rb_entry(node, struct sa_defrag_extent_backref, node); | ||
2423 | |||
2424 | ret = relink_extent_backref(path, prev, backref); | ||
2425 | WARN_ON(ret < 0); | ||
2426 | |||
2427 | kfree(prev); | ||
2428 | |||
2429 | if (ret == 1) | ||
2430 | prev = backref; | ||
2431 | else | ||
2432 | prev = NULL; | ||
2433 | cond_resched(); | ||
2434 | } | ||
2435 | kfree(prev); | ||
2436 | |||
2437 | btrfs_free_path(path); | ||
2438 | |||
2439 | list_for_each_entry_safe(old, tmp, &new->head, list) { | ||
2440 | list_del(&old->list); | ||
2441 | kfree(old); | ||
2442 | } | ||
2443 | out: | ||
2444 | atomic_dec(&root->fs_info->defrag_running); | ||
2445 | wake_up(&root->fs_info->transaction_wait); | ||
2446 | |||
2447 | kfree(new); | ||
2448 | } | ||
2449 | |||
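[note] relink_file_extents() drains the backref rbtree in sorted order and hands each backref the previous one, but only when the previous relink actually succeeded (ret == 1), so failed or skipped backrefs never seed a bogus merge. The carry-over rule in isolation (illustrative types, not the kernel API):

	struct item { int id; };

	/* returns 1 on success, 0 when skipped, <0 on error */
	static int process(struct item *prev, struct item *cur)
	{
		(void)prev;
		return cur->id % 2;   /* stand-in result for the sketch */
	}

	static void drain(struct item *items, int n)
	{
		struct item *prev = NULL;
		for (int i = 0; i < n; i++) {
			int ret = process(prev, &items[i]);
			/* keep cur as the merge candidate only on success */
			prev = (ret == 1) ? &items[i] : NULL;
		}
	}

	int main(void)
	{
		struct item items[] = { {1}, {2}, {3} };
		drain(items, 3);
		return 0;
	}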
2450 | static struct new_sa_defrag_extent * | ||
2451 | record_old_file_extents(struct inode *inode, | ||
2452 | struct btrfs_ordered_extent *ordered) | ||
2453 | { | ||
2454 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
2455 | struct btrfs_path *path; | ||
2456 | struct btrfs_key key; | ||
2457 | struct old_sa_defrag_extent *old, *tmp; | ||
2458 | struct new_sa_defrag_extent *new; | ||
2459 | int ret; | ||
2460 | |||
2461 | new = kmalloc(sizeof(*new), GFP_NOFS); | ||
2462 | if (!new) | ||
2463 | return NULL; | ||
2464 | |||
2465 | new->inode = inode; | ||
2466 | new->file_pos = ordered->file_offset; | ||
2467 | new->len = ordered->len; | ||
2468 | new->bytenr = ordered->start; | ||
2469 | new->disk_len = ordered->disk_len; | ||
2470 | new->compress_type = ordered->compress_type; | ||
2471 | new->root = RB_ROOT; | ||
2472 | INIT_LIST_HEAD(&new->head); | ||
2473 | |||
2474 | path = btrfs_alloc_path(); | ||
2475 | if (!path) | ||
2476 | goto out_kfree; | ||
2477 | |||
2478 | key.objectid = btrfs_ino(inode); | ||
2479 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
2480 | key.offset = new->file_pos; | ||
2481 | |||
2482 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
2483 | if (ret < 0) | ||
2484 | goto out_free_path; | ||
2485 | if (ret > 0 && path->slots[0] > 0) | ||
2486 | path->slots[0]--; | ||
2487 | |||
2488 | /* find out all the old extents for the file range */ | ||
2489 | while (1) { | ||
2490 | struct btrfs_file_extent_item *extent; | ||
2491 | struct extent_buffer *l; | ||
2492 | int slot; | ||
2493 | u64 num_bytes; | ||
2494 | u64 offset; | ||
2495 | u64 end; | ||
2496 | u64 disk_bytenr; | ||
2497 | u64 extent_offset; | ||
2498 | |||
2499 | l = path->nodes[0]; | ||
2500 | slot = path->slots[0]; | ||
2501 | |||
2502 | if (slot >= btrfs_header_nritems(l)) { | ||
2503 | ret = btrfs_next_leaf(root, path); | ||
2504 | if (ret < 0) | ||
2505 | goto out_free_list; | ||
2506 | else if (ret > 0) | ||
2507 | break; | ||
2508 | continue; | ||
2509 | } | ||
2510 | |||
2511 | btrfs_item_key_to_cpu(l, &key, slot); | ||
2512 | |||
2513 | if (key.objectid != btrfs_ino(inode)) | ||
2514 | break; | ||
2515 | if (key.type != BTRFS_EXTENT_DATA_KEY) | ||
2516 | break; | ||
2517 | if (key.offset >= new->file_pos + new->len) | ||
2518 | break; | ||
2519 | |||
2520 | extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); | ||
2521 | |||
2522 | num_bytes = btrfs_file_extent_num_bytes(l, extent); | ||
2523 | if (key.offset + num_bytes < new->file_pos) | ||
2524 | goto next; | ||
2525 | |||
2526 | disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); | ||
2527 | if (!disk_bytenr) | ||
2528 | goto next; | ||
2529 | |||
2530 | extent_offset = btrfs_file_extent_offset(l, extent); | ||
2531 | |||
2532 | old = kmalloc(sizeof(*old), GFP_NOFS); | ||
2533 | if (!old) | ||
2534 | goto out_free_list; | ||
2535 | |||
2536 | offset = max(new->file_pos, key.offset); | ||
2537 | end = min(new->file_pos + new->len, key.offset + num_bytes); | ||
2538 | |||
2539 | old->bytenr = disk_bytenr; | ||
2540 | old->extent_offset = extent_offset; | ||
2541 | old->offset = offset - key.offset; | ||
2542 | old->len = end - offset; | ||
2543 | old->new = new; | ||
2544 | old->count = 0; | ||
2545 | list_add_tail(&old->list, &new->head); | ||
2546 | next: | ||
2547 | path->slots[0]++; | ||
2548 | cond_resched(); | ||
2549 | } | ||
2550 | |||
2551 | btrfs_free_path(path); | ||
2552 | atomic_inc(&root->fs_info->defrag_running); | ||
2553 | |||
2554 | return new; | ||
2555 | |||
2556 | out_free_list: | ||
2557 | list_for_each_entry_safe(old, tmp, &new->head, list) { | ||
2558 | list_del(&old->list); | ||
2559 | kfree(old); | ||
2560 | } | ||
2561 | out_free_path: | ||
2562 | btrfs_free_path(path); | ||
2563 | out_kfree: | ||
2564 | kfree(new); | ||
2565 | return NULL; | ||
2566 | } | ||
2567 | |||
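[note] Each old extent recorded above is clipped to the part that actually overlaps the ordered (newly written) range: offset/end are the clamped bounds, and old->offset is stored relative to the extent item's key offset. A worked numeric sketch (values invented):

	#include <stdio.h>
	typedef unsigned long long u64;
	#define min(a, b) ((a) < (b) ? (a) : (b))
	#define max(a, b) ((a) > (b) ? (a) : (b))

	int main(void)
	{
		u64 file_pos = 8192, new_len = 8192;     /* ordered range [8K, 16K) */
		u64 key_offset = 4096, num_bytes = 8192; /* old extent covers [4K, 12K) */

		u64 offset = max(file_pos, key_offset);                    /* 8192 */
		u64 end = min(file_pos + new_len, key_offset + num_bytes); /* 12288 */

		printf("old->offset = %llu, old->len = %llu\n",
		       offset - key_offset, end - offset);                 /* 4096, 4096 */
		return 0;
	}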
1895 | /* | 2568 | /* |
1896 | * helper function for btrfs_finish_ordered_io, this | 2569 | * helper function for btrfs_finish_ordered_io, this |
1897 | * just reads in some of the csum leaves to prime them into ram | 2570 | * just reads in some of the csum leaves to prime them into ram |
@@ -1909,6 +2582,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) | |||
1909 | struct btrfs_trans_handle *trans = NULL; | 2582 | struct btrfs_trans_handle *trans = NULL; |
1910 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | 2583 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; |
1911 | struct extent_state *cached_state = NULL; | 2584 | struct extent_state *cached_state = NULL; |
2585 | struct new_sa_defrag_extent *new = NULL; | ||
1912 | int compress_type = 0; | 2586 | int compress_type = 0; |
1913 | int ret; | 2587 | int ret; |
1914 | bool nolock; | 2588 | bool nolock; |
@@ -1943,6 +2617,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) | |||
1943 | ordered_extent->file_offset + ordered_extent->len - 1, | 2617 | ordered_extent->file_offset + ordered_extent->len - 1, |
1944 | 0, &cached_state); | 2618 | 0, &cached_state); |
1945 | 2619 | ||
2620 | ret = test_range_bit(io_tree, ordered_extent->file_offset, | ||
2621 | ordered_extent->file_offset + ordered_extent->len - 1, | ||
2622 | EXTENT_DEFRAG, 1, cached_state); | ||
2623 | if (ret) { | ||
2624 | u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); | ||
2625 | if (last_snapshot >= BTRFS_I(inode)->generation) | ||
2626 | /* the inode may share extents with a snapshot */ | ||
2627 | new = record_old_file_extents(inode, ordered_extent); | ||
2628 | |||
2629 | clear_extent_bit(io_tree, ordered_extent->file_offset, | ||
2630 | ordered_extent->file_offset + ordered_extent->len - 1, | ||
2631 | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); | ||
2632 | } | ||
2633 | |||
1946 | if (nolock) | 2634 | if (nolock) |
1947 | trans = btrfs_join_transaction_nolock(root); | 2635 | trans = btrfs_join_transaction_nolock(root); |
1948 | else | 2636 | else |
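[note] The snapshot-aware defrag hook above fires only when the range carried EXTENT_DEFRAG and the root's last snapshot is at least as new as the inode's generation; otherwise no snapshot can still reference the old extents and there is nothing to relink. As a one-line predicate (a sketch, not kernel code):

	typedef unsigned long long u64;

	/* extents may still be shared iff a snapshot was taken at or after
	 * the transaction that last rewrote this inode's data */
	static int may_share_extents(u64 last_snapshot, u64 inode_generation)
	{
		return last_snapshot >= inode_generation;
	}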
@@ -2001,17 +2689,33 @@ out: | |||
2001 | if (trans) | 2689 | if (trans) |
2002 | btrfs_end_transaction(trans, root); | 2690 | btrfs_end_transaction(trans, root); |
2003 | 2691 | ||
2004 | if (ret) | 2692 | if (ret) { |
2005 | clear_extent_uptodate(io_tree, ordered_extent->file_offset, | 2693 | clear_extent_uptodate(io_tree, ordered_extent->file_offset, |
2006 | ordered_extent->file_offset + | 2694 | ordered_extent->file_offset + |
2007 | ordered_extent->len - 1, NULL, GFP_NOFS); | 2695 | ordered_extent->len - 1, NULL, GFP_NOFS); |
2008 | 2696 | ||
2697 | /* | ||
2698 | * If the ordered extent had an IOERR or something else went | ||
2699 | * wrong we need to return the space for this ordered extent | ||
2700 | * back to the allocator. | ||
2701 | */ | ||
2702 | if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && | ||
2703 | !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) | ||
2704 | btrfs_free_reserved_extent(root, ordered_extent->start, | ||
2705 | ordered_extent->disk_len); | ||
2706 | } | ||
2707 | |||
2708 | |||
2009 | /* | 2709 | /* |
2010 | * This needs to be done to make sure anybody waiting knows we are done | 2710 | * This needs to be done to make sure anybody waiting knows we are done |
2011 | * updating everything for this ordered extent. | 2711 | * updating everything for this ordered extent. |
2012 | */ | 2712 | */ |
2013 | btrfs_remove_ordered_extent(inode, ordered_extent); | 2713 | btrfs_remove_ordered_extent(inode, ordered_extent); |
2014 | 2714 | ||
2715 | /* for snapshot-aware defrag */ | ||
2716 | if (new) | ||
2717 | relink_file_extents(new); | ||
2718 | |||
2015 | /* once for us */ | 2719 | /* once for us */ |
2016 | btrfs_put_ordered_extent(ordered_extent); | 2720 | btrfs_put_ordered_extent(ordered_extent); |
2017 | /* once for the tree */ | 2721 | /* once for the tree */ |
@@ -2062,7 +2766,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, | |||
2062 | static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, | 2766 | static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, |
2063 | struct extent_state *state, int mirror) | 2767 | struct extent_state *state, int mirror) |
2064 | { | 2768 | { |
2065 | size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); | 2769 | size_t offset = start - page_offset(page); |
2066 | struct inode *inode = page->mapping->host; | 2770 | struct inode *inode = page->mapping->host; |
2067 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | 2771 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; |
2068 | char *kaddr; | 2772 | char *kaddr; |
@@ -2167,11 +2871,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) | |||
2167 | } | 2871 | } |
2168 | } | 2872 | } |
2169 | 2873 | ||
2170 | enum btrfs_orphan_cleanup_state { | ||
2171 | ORPHAN_CLEANUP_STARTED = 1, | ||
2172 | ORPHAN_CLEANUP_DONE = 2, | ||
2173 | }; | ||
2174 | |||
2175 | /* | 2874 | /* |
2176 | * This is called in transaction commit time. If there are no orphan | 2875 | * This is called in transaction commit time. If there are no orphan |
2177 | * files in the subvolume, it removes orphan item and frees block_rsv | 2876 | * files in the subvolume, it removes orphan item and frees block_rsv |
@@ -2469,6 +3168,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) | |||
2469 | */ | 3168 | */ |
2470 | set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, | 3169 | set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, |
2471 | &BTRFS_I(inode)->runtime_flags); | 3170 | &BTRFS_I(inode)->runtime_flags); |
3171 | atomic_inc(&root->orphan_inodes); | ||
2472 | 3172 | ||
2473 | /* if we have links, this was a truncate, lets do that */ | 3173 | /* if we have links, this was a truncate, lets do that */ |
2474 | if (inode->i_nlink) { | 3174 | if (inode->i_nlink) { |
@@ -2491,6 +3191,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) | |||
2491 | goto out; | 3191 | goto out; |
2492 | 3192 | ||
2493 | ret = btrfs_truncate(inode); | 3193 | ret = btrfs_truncate(inode); |
3194 | if (ret) | ||
3195 | btrfs_orphan_del(NULL, inode); | ||
2494 | } else { | 3196 | } else { |
2495 | nr_unlink++; | 3197 | nr_unlink++; |
2496 | } | 3198 | } |
@@ -2709,34 +3411,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, | |||
2709 | struct btrfs_inode_item *item, | 3411 | struct btrfs_inode_item *item, |
2710 | struct inode *inode) | 3412 | struct inode *inode) |
2711 | { | 3413 | { |
2712 | btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); | 3414 | struct btrfs_map_token token; |
2713 | btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); | 3415 | |
2714 | btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); | 3416 | btrfs_init_map_token(&token); |
2715 | btrfs_set_inode_mode(leaf, item, inode->i_mode); | 3417 | |
2716 | btrfs_set_inode_nlink(leaf, item, inode->i_nlink); | 3418 | btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); |
3419 | btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); | ||
3420 | btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, | ||
3421 | &token); | ||
3422 | btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); | ||
3423 | btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); | ||
2717 | 3424 | ||
2718 | btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), | 3425 | btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), |
2719 | inode->i_atime.tv_sec); | 3426 | inode->i_atime.tv_sec, &token); |
2720 | btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), | 3427 | btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), |
2721 | inode->i_atime.tv_nsec); | 3428 | inode->i_atime.tv_nsec, &token); |
2722 | 3429 | ||
2723 | btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), | 3430 | btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), |
2724 | inode->i_mtime.tv_sec); | 3431 | inode->i_mtime.tv_sec, &token); |
2725 | btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), | 3432 | btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), |
2726 | inode->i_mtime.tv_nsec); | 3433 | inode->i_mtime.tv_nsec, &token); |
2727 | 3434 | ||
2728 | btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), | 3435 | btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), |
2729 | inode->i_ctime.tv_sec); | 3436 | inode->i_ctime.tv_sec, &token); |
2730 | btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), | 3437 | btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), |
2731 | inode->i_ctime.tv_nsec); | 3438 | inode->i_ctime.tv_nsec, &token); |
2732 | 3439 | ||
2733 | btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); | 3440 | btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), |
2734 | btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); | 3441 | &token); |
2735 | btrfs_set_inode_sequence(leaf, item, inode->i_version); | 3442 | btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, |
2736 | btrfs_set_inode_transid(leaf, item, trans->transid); | 3443 | &token); |
2737 | btrfs_set_inode_rdev(leaf, item, inode->i_rdev); | 3444 | btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); |
2738 | btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); | 3445 | btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); |
2739 | btrfs_set_inode_block_group(leaf, item, 0); | 3446 | btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); |
3447 | btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); | ||
3448 | btrfs_set_token_inode_block_group(leaf, item, 0, &token); | ||
2740 | } | 3449 | } |
2741 | 3450 | ||
2742 | /* | 3451 | /* |
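[note] The fill_inode_item() conversion above batches all the setters through a btrfs_map_token so consecutive writes into the same extent_buffer page reuse one cached mapping instead of re-deriving it per field. The caching idea, reduced to a userspace sketch with invented names:

	#include <stdio.h>
	#include <stddef.h>

	/* invented stand-ins for extent_buffer page mapping */
	struct token {
		char *kaddr;       /* cached mapping of the current block */
		size_t block;      /* which block kaddr maps */
		int have;          /* cache valid? */
		int map_calls;     /* how often we had to (re)map */
	};

	static char storage[4][64];   /* four fake 64-byte blocks */

	static char *map_offset(struct token *tok, size_t off)
	{
		size_t block = off / 64;
		if (!tok->have || tok->block != block) {
			tok->kaddr = storage[block];   /* the "expensive" step */
			tok->block = block;
			tok->have = 1;
			tok->map_calls++;
		}
		return tok->kaddr + off % 64;
	}

	int main(void)
	{
		struct token tok = {0};
		for (size_t off = 0; off < 32; off += 4)
			*map_offset(&tok, off) = 1;   /* eight writes, same block */
		printf("map_calls = %d\n", tok.map_calls);  /* 1, not 8 */
		return 0;
	}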
@@ -3304,7 +4013,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | |||
3304 | u64 extent_num_bytes = 0; | 4013 | u64 extent_num_bytes = 0; |
3305 | u64 extent_offset = 0; | 4014 | u64 extent_offset = 0; |
3306 | u64 item_end = 0; | 4015 | u64 item_end = 0; |
3307 | u64 mask = root->sectorsize - 1; | ||
3308 | u32 found_type = (u8)-1; | 4016 | u32 found_type = (u8)-1; |
3309 | int found_extent; | 4017 | int found_extent; |
3310 | int del_item; | 4018 | int del_item; |
@@ -3328,7 +4036,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | |||
3328 | * extent just the way it is. | 4036 | * extent just the way it is. |
3329 | */ | 4037 | */ |
3330 | if (root->ref_cows || root == root->fs_info->tree_root) | 4038 | if (root->ref_cows || root == root->fs_info->tree_root) |
3331 | btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); | 4039 | btrfs_drop_extent_cache(inode, ALIGN(new_size, |
4040 | root->sectorsize), (u64)-1, 0); | ||
3332 | 4041 | ||
3333 | /* | 4042 | /* |
3334 | * This function is also used to drop the items in the log tree before | 4043 | * This function is also used to drop the items in the log tree before |
@@ -3407,10 +4116,9 @@ search_again: | |||
3407 | if (!del_item) { | 4116 | if (!del_item) { |
3408 | u64 orig_num_bytes = | 4117 | u64 orig_num_bytes = |
3409 | btrfs_file_extent_num_bytes(leaf, fi); | 4118 | btrfs_file_extent_num_bytes(leaf, fi); |
3410 | extent_num_bytes = new_size - | 4119 | extent_num_bytes = ALIGN(new_size - |
3411 | found_key.offset + root->sectorsize - 1; | 4120 | found_key.offset, |
3412 | extent_num_bytes = extent_num_bytes & | 4121 | root->sectorsize); |
3413 | ~((u64)root->sectorsize - 1); | ||
3414 | btrfs_set_file_extent_num_bytes(leaf, fi, | 4122 | btrfs_set_file_extent_num_bytes(leaf, fi, |
3415 | extent_num_bytes); | 4123 | extent_num_bytes); |
3416 | num_dec = (orig_num_bytes - | 4124 | num_dec = (orig_num_bytes - |
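[note] Several hunks in this file replace the open-coded round-up ((x + mask) & ~mask with mask = sectorsize - 1) with ALIGN(x, sectorsize). For a power-of-two sector size the two forms are bit-for-bit identical, which a quick userspace check confirms (ALIGN written out here with the usual mask arithmetic):

	#include <assert.h>
	typedef unsigned long long u64;

	#define ALIGN(x, a) (((x) + (a) - 1) & ~((u64)(a) - 1))

	int main(void)
	{
		u64 sectorsize = 4096, mask = sectorsize - 1;

		/* sweep misaligned values and compare both round-up forms */
		for (u64 x = 0; x < 4 * sectorsize; x += 509)
			assert(((x + mask) & ~mask) == ALIGN(x, sectorsize));
		return 0;
	}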
@@ -3646,9 +4354,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
3646 | struct extent_map *em = NULL; | 4354 | struct extent_map *em = NULL; |
3647 | struct extent_state *cached_state = NULL; | 4355 | struct extent_state *cached_state = NULL; |
3648 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | 4356 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; |
3649 | u64 mask = root->sectorsize - 1; | 4357 | u64 hole_start = ALIGN(oldsize, root->sectorsize); |
3650 | u64 hole_start = (oldsize + mask) & ~mask; | 4358 | u64 block_end = ALIGN(size, root->sectorsize); |
3651 | u64 block_end = (size + mask) & ~mask; | ||
3652 | u64 last_byte; | 4359 | u64 last_byte; |
3653 | u64 cur_offset; | 4360 | u64 cur_offset; |
3654 | u64 hole_size; | 4361 | u64 hole_size; |
@@ -3681,7 +4388,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
3681 | break; | 4388 | break; |
3682 | } | 4389 | } |
3683 | last_byte = min(extent_map_end(em), block_end); | 4390 | last_byte = min(extent_map_end(em), block_end); |
3684 | last_byte = (last_byte + mask) & ~mask; | 4391 | last_byte = ALIGN(last_byte, root->sectorsize); |
3685 | if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { | 4392 | if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { |
3686 | struct extent_map *hole_em; | 4393 | struct extent_map *hole_em; |
3687 | hole_size = last_byte - cur_offset; | 4394 | hole_size = last_byte - cur_offset; |
@@ -3832,6 +4539,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) | |||
3832 | 4539 | ||
3833 | /* we don't support swapfiles, so vmtruncate shouldn't fail */ | 4540 | /* we don't support swapfiles, so vmtruncate shouldn't fail */ |
3834 | truncate_setsize(inode, newsize); | 4541 | truncate_setsize(inode, newsize); |
4542 | |||
4543 | /* Disable non-locked read DIO to avoid the endless truncate */ | ||
4544 | btrfs_inode_block_unlocked_dio(inode); | ||
4545 | inode_dio_wait(inode); | ||
4546 | btrfs_inode_resume_unlocked_dio(inode); | ||
4547 | |||
3835 | ret = btrfs_truncate(inode); | 4548 | ret = btrfs_truncate(inode); |
3836 | if (ret && inode->i_nlink) | 4549 | if (ret && inode->i_nlink) |
3837 | btrfs_orphan_del(NULL, inode); | 4550 | btrfs_orphan_del(NULL, inode); |
@@ -3904,6 +4617,12 @@ void btrfs_evict_inode(struct inode *inode) | |||
3904 | goto no_delete; | 4617 | goto no_delete; |
3905 | } | 4618 | } |
3906 | 4619 | ||
4620 | ret = btrfs_commit_inode_delayed_inode(inode); | ||
4621 | if (ret) { | ||
4622 | btrfs_orphan_del(NULL, inode); | ||
4623 | goto no_delete; | ||
4624 | } | ||
4625 | |||
3907 | rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); | 4626 | rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); |
3908 | if (!rsv) { | 4627 | if (!rsv) { |
3909 | btrfs_orphan_del(NULL, inode); | 4628 | btrfs_orphan_del(NULL, inode); |
@@ -3941,7 +4660,7 @@ void btrfs_evict_inode(struct inode *inode) | |||
3941 | goto no_delete; | 4660 | goto no_delete; |
3942 | } | 4661 | } |
3943 | 4662 | ||
3944 | trans = btrfs_start_transaction_lflush(root, 1); | 4663 | trans = btrfs_join_transaction(root); |
3945 | if (IS_ERR(trans)) { | 4664 | if (IS_ERR(trans)) { |
3946 | btrfs_orphan_del(NULL, inode); | 4665 | btrfs_orphan_del(NULL, inode); |
3947 | btrfs_free_block_rsv(root, rsv); | 4666 | btrfs_free_block_rsv(root, rsv); |
@@ -3955,9 +4674,6 @@ void btrfs_evict_inode(struct inode *inode) | |||
3955 | break; | 4674 | break; |
3956 | 4675 | ||
3957 | trans->block_rsv = &root->fs_info->trans_block_rsv; | 4676 | trans->block_rsv = &root->fs_info->trans_block_rsv; |
3958 | ret = btrfs_update_inode(trans, root, inode); | ||
3959 | BUG_ON(ret); | ||
3960 | |||
3961 | btrfs_end_transaction(trans, root); | 4677 | btrfs_end_transaction(trans, root); |
3962 | trans = NULL; | 4678 | trans = NULL; |
3963 | btrfs_btree_balance_dirty(root); | 4679 | btrfs_btree_balance_dirty(root); |
@@ -4854,7 +5570,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | |||
4854 | if (btrfs_test_opt(root, NODATASUM)) | 5570 | if (btrfs_test_opt(root, NODATASUM)) |
4855 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; | 5571 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; |
4856 | if (btrfs_test_opt(root, NODATACOW)) | 5572 | if (btrfs_test_opt(root, NODATACOW)) |
4857 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; | 5573 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | |
5574 | BTRFS_INODE_NODATASUM; | ||
4858 | } | 5575 | } |
4859 | 5576 | ||
4860 | insert_inode_hash(inode); | 5577 | insert_inode_hash(inode); |
@@ -5006,12 +5723,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, | |||
5006 | goto out_unlock; | 5723 | goto out_unlock; |
5007 | } | 5724 | } |
5008 | 5725 | ||
5009 | err = btrfs_update_inode(trans, root, inode); | ||
5010 | if (err) { | ||
5011 | drop_inode = 1; | ||
5012 | goto out_unlock; | ||
5013 | } | ||
5014 | |||
5015 | /* | 5726 | /* |
5016 | * If the active LSM wants to access the inode during | 5727 | * If the active LSM wants to access the inode during |
5017 | * d_instantiate it needs these. Smack checks to see | 5728 | * d_instantiate it needs these. Smack checks to see |
@@ -5396,8 +6107,7 @@ again: | |||
5396 | } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { | 6107 | } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { |
5397 | size_t size; | 6108 | size_t size; |
5398 | size = btrfs_file_extent_inline_len(leaf, item); | 6109 | size = btrfs_file_extent_inline_len(leaf, item); |
5399 | extent_end = (extent_start + size + root->sectorsize - 1) & | 6110 | extent_end = ALIGN(extent_start + size, root->sectorsize); |
5400 | ~((u64)root->sectorsize - 1); | ||
5401 | } | 6111 | } |
5402 | 6112 | ||
5403 | if (start >= extent_end) { | 6113 | if (start >= extent_end) { |
@@ -5469,8 +6179,7 @@ again: | |||
5469 | copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, | 6179 | copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, |
5470 | size - extent_offset); | 6180 | size - extent_offset); |
5471 | em->start = extent_start + extent_offset; | 6181 | em->start = extent_start + extent_offset; |
5472 | em->len = (copy_size + root->sectorsize - 1) & | 6182 | em->len = ALIGN(copy_size, root->sectorsize); |
5473 | ~((u64)root->sectorsize - 1); | ||
5474 | em->orig_block_len = em->len; | 6183 | em->orig_block_len = em->len; |
5475 | em->orig_start = em->start; | 6184 | em->orig_start = em->start; |
5476 | if (compress_type) { | 6185 | if (compress_type) { |
@@ -5949,6 +6658,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, | |||
5949 | 6658 | ||
5950 | em->start = start; | 6659 | em->start = start; |
5951 | em->orig_start = orig_start; | 6660 | em->orig_start = orig_start; |
6661 | em->mod_start = start; | ||
6662 | em->mod_len = len; | ||
5952 | em->len = len; | 6663 | em->len = len; |
5953 | em->block_len = block_len; | 6664 | em->block_len = block_len; |
5954 | em->block_start = block_start; | 6665 | em->block_start = block_start; |
@@ -5990,16 +6701,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, | |||
5990 | u64 len = bh_result->b_size; | 6701 | u64 len = bh_result->b_size; |
5991 | struct btrfs_trans_handle *trans; | 6702 | struct btrfs_trans_handle *trans; |
5992 | int unlock_bits = EXTENT_LOCKED; | 6703 | int unlock_bits = EXTENT_LOCKED; |
5993 | int ret; | 6704 | int ret = 0; |
5994 | 6705 | ||
5995 | if (create) { | 6706 | if (create) |
5996 | ret = btrfs_delalloc_reserve_space(inode, len); | ||
5997 | if (ret) | ||
5998 | return ret; | ||
5999 | unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; | 6707 | unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; |
6000 | } else { | 6708 | else |
6001 | len = min_t(u64, len, root->sectorsize); | 6709 | len = min_t(u64, len, root->sectorsize); |
6002 | } | ||
6003 | 6710 | ||
6004 | lockstart = start; | 6711 | lockstart = start; |
6005 | lockend = start + len - 1; | 6712 | lockend = start + len - 1; |
@@ -6011,14 +6718,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, | |||
6011 | if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) | 6718 | if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) |
6012 | return -ENOTBLK; | 6719 | return -ENOTBLK; |
6013 | 6720 | ||
6014 | if (create) { | ||
6015 | ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, | ||
6016 | lockend, EXTENT_DELALLOC, NULL, | ||
6017 | &cached_state, GFP_NOFS); | ||
6018 | if (ret) | ||
6019 | goto unlock_err; | ||
6020 | } | ||
6021 | |||
6022 | em = btrfs_get_extent(inode, NULL, 0, start, len, 0); | 6721 | em = btrfs_get_extent(inode, NULL, 0, start, len, 0); |
6023 | if (IS_ERR(em)) { | 6722 | if (IS_ERR(em)) { |
6024 | ret = PTR_ERR(em); | 6723 | ret = PTR_ERR(em); |
@@ -6050,7 +6749,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, | |||
6050 | if (!create && (em->block_start == EXTENT_MAP_HOLE || | 6749 | if (!create && (em->block_start == EXTENT_MAP_HOLE || |
6051 | test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { | 6750 | test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { |
6052 | free_extent_map(em); | 6751 | free_extent_map(em); |
6053 | ret = 0; | ||
6054 | goto unlock_err; | 6752 | goto unlock_err; |
6055 | } | 6753 | } |
6056 | 6754 | ||
@@ -6148,6 +6846,15 @@ unlock: | |||
6148 | */ | 6846 | */ |
6149 | if (start + len > i_size_read(inode)) | 6847 | if (start + len > i_size_read(inode)) |
6150 | i_size_write(inode, start + len); | 6848 | i_size_write(inode, start + len); |
6849 | |||
6850 | spin_lock(&BTRFS_I(inode)->lock); | ||
6851 | BTRFS_I(inode)->outstanding_extents++; | ||
6852 | spin_unlock(&BTRFS_I(inode)->lock); | ||
6853 | |||
6854 | ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, | ||
6855 | lockstart + len - 1, EXTENT_DELALLOC, NULL, | ||
6856 | &cached_state, GFP_NOFS); | ||
6857 | BUG_ON(ret); | ||
6151 | } | 6858 | } |
6152 | 6859 | ||
6153 | /* | 6860 | /* |
@@ -6156,24 +6863,9 @@ unlock: | |||
6156 | * aren't using if there is any left over space. | 6863 | * aren't using if there is any left over space. |
6157 | */ | 6864 | */ |
6158 | if (lockstart < lockend) { | 6865 | if (lockstart < lockend) { |
6159 | if (create && len < lockend - lockstart) { | 6866 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, |
6160 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, | 6867 | lockend, unlock_bits, 1, 0, |
6161 | lockstart + len - 1, | 6868 | &cached_state, GFP_NOFS); |
6162 | unlock_bits | EXTENT_DEFRAG, 1, 0, | ||
6163 | &cached_state, GFP_NOFS); | ||
6164 | /* | ||
6165 | * Beside unlock, we also need to cleanup reserved space | ||
6166 | * for the left range by attaching EXTENT_DO_ACCOUNTING. | ||
6167 | */ | ||
6168 | clear_extent_bit(&BTRFS_I(inode)->io_tree, | ||
6169 | lockstart + len, lockend, | ||
6170 | unlock_bits | EXTENT_DO_ACCOUNTING | | ||
6171 | EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS); | ||
6172 | } else { | ||
6173 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, | ||
6174 | lockend, unlock_bits, 1, 0, | ||
6175 | &cached_state, GFP_NOFS); | ||
6176 | } | ||
6177 | } else { | 6869 | } else { |
6178 | free_extent_state(cached_state); | 6870 | free_extent_state(cached_state); |
6179 | } | 6871 | } |
@@ -6183,9 +6875,6 @@ unlock: | |||
6183 | return 0; | 6875 | return 0; |
6184 | 6876 | ||
6185 | unlock_err: | 6877 | unlock_err: |
6186 | if (create) | ||
6187 | unlock_bits |= EXTENT_DO_ACCOUNTING; | ||
6188 | |||
6189 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, | 6878 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, |
6190 | unlock_bits, 1, 0, &cached_state, GFP_NOFS); | 6879 | unlock_bits, 1, 0, &cached_state, GFP_NOFS); |
6191 | return ret; | 6880 | return ret; |
@@ -6426,19 +7115,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, | |||
6426 | int async_submit = 0; | 7115 | int async_submit = 0; |
6427 | 7116 | ||
6428 | map_length = orig_bio->bi_size; | 7117 | map_length = orig_bio->bi_size; |
6429 | ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, | 7118 | ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, |
6430 | &map_length, NULL, 0); | 7119 | &map_length, NULL, 0); |
6431 | if (ret) { | 7120 | if (ret) { |
6432 | bio_put(orig_bio); | 7121 | bio_put(orig_bio); |
6433 | return -EIO; | 7122 | return -EIO; |
6434 | } | 7123 | } |
6435 | |||
6436 | if (map_length >= orig_bio->bi_size) { | 7124 | if (map_length >= orig_bio->bi_size) { |
6437 | bio = orig_bio; | 7125 | bio = orig_bio; |
6438 | goto submit; | 7126 | goto submit; |
6439 | } | 7127 | } |
6440 | 7128 | ||
6441 | async_submit = 1; | 7129 | /* async crcs make it difficult to collect full stripe writes. */ |
7130 | if (btrfs_get_alloc_profile(root, 1) & | ||
7131 | (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) | ||
7132 | async_submit = 0; | ||
7133 | else | ||
7134 | async_submit = 1; | ||
7135 | |||
6442 | bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); | 7136 | bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); |
6443 | if (!bio) | 7137 | if (!bio) |
6444 | return -ENOMEM; | 7138 | return -ENOMEM; |
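[note] The rw-aware btrfs_map_block() calls plus the async_submit change above keep parity RAID writes on the synchronous path: async checksumming reorders bios, which makes it hard for the RAID5/6 code to gather a full stripe before computing parity. The decision, reduced to a sketch with illustrative flag values:

	#define PROFILE_RAID5 (1UL << 0)   /* illustrative bits, not the btrfs flags */
	#define PROFILE_RAID6 (1UL << 1)

	static int use_async_submit(unsigned long data_profile)
	{
		/* stay synchronous when writes must assemble full stripes */
		return !(data_profile & (PROFILE_RAID5 | PROFILE_RAID6));
	}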
@@ -6480,7 +7174,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, | |||
6480 | bio->bi_end_io = btrfs_end_dio_bio; | 7174 | bio->bi_end_io = btrfs_end_dio_bio; |
6481 | 7175 | ||
6482 | map_length = orig_bio->bi_size; | 7176 | map_length = orig_bio->bi_size; |
6483 | ret = btrfs_map_block(root->fs_info, READ, | 7177 | ret = btrfs_map_block(root->fs_info, rw, |
6484 | start_sector << 9, | 7178 | start_sector << 9, |
6485 | &map_length, NULL, 0); | 7179 | &map_length, NULL, 0); |
6486 | if (ret) { | 7180 | if (ret) { |
@@ -6623,15 +7317,60 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, | |||
6623 | { | 7317 | { |
6624 | struct file *file = iocb->ki_filp; | 7318 | struct file *file = iocb->ki_filp; |
6625 | struct inode *inode = file->f_mapping->host; | 7319 | struct inode *inode = file->f_mapping->host; |
7320 | size_t count = 0; | ||
7321 | int flags = 0; | ||
7322 | bool wakeup = true; | ||
7323 | bool relock = false; | ||
7324 | ssize_t ret; | ||
6626 | 7325 | ||
6627 | if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, | 7326 | if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, |
6628 | offset, nr_segs)) | 7327 | offset, nr_segs)) |
6629 | return 0; | 7328 | return 0; |
6630 | 7329 | ||
6631 | return __blockdev_direct_IO(rw, iocb, inode, | 7330 | atomic_inc(&inode->i_dio_count); |
6632 | BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, | 7331 | smp_mb__after_atomic_inc(); |
6633 | iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, | 7332 | |
6634 | btrfs_submit_direct, 0); | 7333 | if (rw & WRITE) { |
7334 | count = iov_length(iov, nr_segs); | ||
7335 | /* | ||
7336 | * If the write DIO is beyond the EOF, we need update | ||
7337 | * the isize, but it is protected by i_mutex. So we can | ||
7338 | * not unlock the i_mutex at this case. | ||
7339 | */ | ||
7340 | if (offset + count <= inode->i_size) { | ||
7341 | mutex_unlock(&inode->i_mutex); | ||
7342 | relock = true; | ||
7343 | } | ||
7344 | ret = btrfs_delalloc_reserve_space(inode, count); | ||
7345 | if (ret) | ||
7346 | goto out; | ||
7347 | } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, | ||
7348 | &BTRFS_I(inode)->runtime_flags))) { | ||
7349 | inode_dio_done(inode); | ||
7350 | flags = DIO_LOCKING | DIO_SKIP_HOLES; | ||
7351 | wakeup = false; | ||
7352 | } | ||
7353 | |||
7354 | ret = __blockdev_direct_IO(rw, iocb, inode, | ||
7355 | BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, | ||
7356 | iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, | ||
7357 | btrfs_submit_direct, flags); | ||
7358 | if (rw & WRITE) { | ||
7359 | if (ret < 0 && ret != -EIOCBQUEUED) | ||
7360 | btrfs_delalloc_release_space(inode, count); | ||
7361 | else if (ret >= 0 && (size_t)ret < count) | ||
7362 | btrfs_delalloc_release_space(inode, | ||
7363 | count - (size_t)ret); | ||
7364 | else | ||
7365 | btrfs_delalloc_release_metadata(inode, 0); | ||
7366 | } | ||
7367 | out: | ||
7368 | if (wakeup) | ||
7369 | inode_dio_done(inode); | ||
7370 | if (relock) | ||
7371 | mutex_lock(&inode->i_mutex); | ||
7372 | |||
7373 | return ret; | ||
6635 | } | 7374 | } |
6636 | 7375 | ||
6637 | #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) | 7376 | #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) |
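[note] The write path above reserves data space for the whole iovec up front and settles afterwards: on hard error everything is released, on a short write only the unwritten tail, and on full success just the metadata part. A hedged sketch of that bookkeeping (helper names invented):

	#include <stdio.h>
	#define EIOCBQUEUED 529   /* kernel-internal errno; value shown for the sketch */

	static void release_data(size_t n)  { printf("release %zu data bytes\n", n); }
	static void release_meta_only(void) { printf("release metadata only\n"); }

	static void settle_dio_write(long ret, size_t count)
	{
		if (ret < 0 && ret != -EIOCBQUEUED)
			release_data(count);               /* nothing was written */
		else if (ret >= 0 && (size_t)ret < count)
			release_data(count - (size_t)ret); /* short write: return the tail */
		else
			release_meta_only();               /* fully written or queued */
	}

	int main(void)
	{
		settle_dio_write(-5, 8192);    /* hard error   */
		settle_dio_write(4096, 8192);  /* short write  */
		settle_dio_write(8192, 8192);  /* complete     */
		return 0;
	}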
@@ -6735,8 +7474,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) | |||
6735 | return; | 7474 | return; |
6736 | } | 7475 | } |
6737 | lock_extent_bits(tree, page_start, page_end, 0, &cached_state); | 7476 | lock_extent_bits(tree, page_start, page_end, 0, &cached_state); |
6738 | ordered = btrfs_lookup_ordered_extent(inode, | 7477 | ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); |
6739 | page_offset(page)); | ||
6740 | if (ordered) { | 7478 | if (ordered) { |
6741 | /* | 7479 | /* |
6742 | * IO on this page will never be started, so we need | 7480 | * IO on this page will never be started, so we need |
@@ -7216,8 +7954,9 @@ int btrfs_drop_inode(struct inode *inode) | |||
7216 | { | 7954 | { |
7217 | struct btrfs_root *root = BTRFS_I(inode)->root; | 7955 | struct btrfs_root *root = BTRFS_I(inode)->root; |
7218 | 7956 | ||
7957 | /* the snap/subvol tree is on deleting */ | ||
7219 | if (btrfs_root_refs(&root->root_item) == 0 && | 7958 | if (btrfs_root_refs(&root->root_item) == 0 && |
7220 | !btrfs_is_free_space_inode(inode)) | 7959 | root != root->fs_info->tree_root) |
7221 | return 1; | 7960 | return 1; |
7222 | else | 7961 | else |
7223 | return generic_drop_inode(inode); | 7962 | return generic_drop_inode(inode); |
@@ -7299,40 +8038,22 @@ fail: | |||
7299 | static int btrfs_getattr(struct vfsmount *mnt, | 8038 | static int btrfs_getattr(struct vfsmount *mnt, |
7300 | struct dentry *dentry, struct kstat *stat) | 8039 | struct dentry *dentry, struct kstat *stat) |
7301 | { | 8040 | { |
8041 | u64 delalloc_bytes; | ||
7302 | struct inode *inode = dentry->d_inode; | 8042 | struct inode *inode = dentry->d_inode; |
7303 | u32 blocksize = inode->i_sb->s_blocksize; | 8043 | u32 blocksize = inode->i_sb->s_blocksize; |
7304 | 8044 | ||
7305 | generic_fillattr(inode, stat); | 8045 | generic_fillattr(inode, stat); |
7306 | stat->dev = BTRFS_I(inode)->root->anon_dev; | 8046 | stat->dev = BTRFS_I(inode)->root->anon_dev; |
7307 | stat->blksize = PAGE_CACHE_SIZE; | 8047 | stat->blksize = PAGE_CACHE_SIZE; |
8048 | |||
8049 | spin_lock(&BTRFS_I(inode)->lock); | ||
8050 | delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; | ||
8051 | spin_unlock(&BTRFS_I(inode)->lock); | ||
7308 | stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + | 8052 | stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + |
7309 | ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; | 8053 | ALIGN(delalloc_bytes, blocksize)) >> 9; |
7310 | return 0; | 8054 | return 0; |
7311 | } | 8055 | } |
7312 | 8056 | ||
7313 | /* | ||
7314 | * If a file is moved, it will inherit the cow and compression flags of the new | ||
7315 | * directory. | ||
7316 | */ | ||
7317 | static void fixup_inode_flags(struct inode *dir, struct inode *inode) | ||
7318 | { | ||
7319 | struct btrfs_inode *b_dir = BTRFS_I(dir); | ||
7320 | struct btrfs_inode *b_inode = BTRFS_I(inode); | ||
7321 | |||
7322 | if (b_dir->flags & BTRFS_INODE_NODATACOW) | ||
7323 | b_inode->flags |= BTRFS_INODE_NODATACOW; | ||
7324 | else | ||
7325 | b_inode->flags &= ~BTRFS_INODE_NODATACOW; | ||
7326 | |||
7327 | if (b_dir->flags & BTRFS_INODE_COMPRESS) { | ||
7328 | b_inode->flags |= BTRFS_INODE_COMPRESS; | ||
7329 | b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS; | ||
7330 | } else { | ||
7331 | b_inode->flags &= ~(BTRFS_INODE_COMPRESS | | ||
7332 | BTRFS_INODE_NOCOMPRESS); | ||
7333 | } | ||
7334 | } | ||
7335 | |||
7336 | static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, | 8057 | static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, |
7337 | struct inode *new_dir, struct dentry *new_dentry) | 8058 | struct inode *new_dir, struct dentry *new_dentry) |
7338 | { | 8059 | { |
@@ -7498,8 +8219,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
7498 | } | 8219 | } |
7499 | } | 8220 | } |
7500 | 8221 | ||
7501 | fixup_inode_flags(new_dir, old_inode); | ||
7502 | |||
7503 | ret = btrfs_add_link(trans, new_dir, old_inode, | 8222 | ret = btrfs_add_link(trans, new_dir, old_inode, |
7504 | new_dentry->d_name.name, | 8223 | new_dentry->d_name.name, |
7505 | new_dentry->d_name.len, 0, index); | 8224 | new_dentry->d_name.len, 0, index); |
@@ -7583,7 +8302,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) | |||
7583 | 8302 | ||
7584 | INIT_LIST_HEAD(&works); | 8303 | INIT_LIST_HEAD(&works); |
7585 | INIT_LIST_HEAD(&splice); | 8304 | INIT_LIST_HEAD(&splice); |
7586 | again: | 8305 | |
7587 | spin_lock(&root->fs_info->delalloc_lock); | 8306 | spin_lock(&root->fs_info->delalloc_lock); |
7588 | list_splice_init(&root->fs_info->delalloc_inodes, &splice); | 8307 | list_splice_init(&root->fs_info->delalloc_inodes, &splice); |
7589 | while (!list_empty(&splice)) { | 8308 | while (!list_empty(&splice)) { |
@@ -7593,8 +8312,11 @@ again: | |||
7593 | list_del_init(&binode->delalloc_inodes); | 8312 | list_del_init(&binode->delalloc_inodes); |
7594 | 8313 | ||
7595 | inode = igrab(&binode->vfs_inode); | 8314 | inode = igrab(&binode->vfs_inode); |
7596 | if (!inode) | 8315 | if (!inode) { |
8316 | clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, | ||
8317 | &binode->runtime_flags); | ||
7597 | continue; | 8318 | continue; |
8319 | } | ||
7598 | 8320 | ||
7599 | list_add_tail(&binode->delalloc_inodes, | 8321 | list_add_tail(&binode->delalloc_inodes, |
7600 | &root->fs_info->delalloc_inodes); | 8322 | &root->fs_info->delalloc_inodes); |
@@ -7619,13 +8341,6 @@ again: | |||
7619 | btrfs_wait_and_free_delalloc_work(work); | 8341 | btrfs_wait_and_free_delalloc_work(work); |
7620 | } | 8342 | } |
7621 | 8343 | ||
7622 | spin_lock(&root->fs_info->delalloc_lock); | ||
7623 | if (!list_empty(&root->fs_info->delalloc_inodes)) { | ||
7624 | spin_unlock(&root->fs_info->delalloc_lock); | ||
7625 | goto again; | ||
7626 | } | ||
7627 | spin_unlock(&root->fs_info->delalloc_lock); | ||
7628 | |||
7629 | /* the filemap_flush will queue IO into the worker threads, but | 8344 | /* the filemap_flush will queue IO into the worker threads, but |
7630 | * we have to make sure the IO is actually started and that | 8345 | * we have to make sure the IO is actually started and that |
7631 | * ordered extents get created before we return | 8346 | * ordered extents get created before we return |
@@ -7801,8 +8516,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, | |||
7801 | } | 8516 | } |
7802 | } | 8517 | } |
7803 | 8518 | ||
7804 | ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, | 8519 | ret = btrfs_reserve_extent(trans, root, |
7805 | 0, *alloc_hint, &ins, 1); | 8520 | min(num_bytes, 256ULL * 1024 * 1024), |
8521 | min_size, 0, *alloc_hint, &ins, 1); | ||
7806 | if (ret) { | 8522 | if (ret) { |
7807 | if (own_trans) | 8523 | if (own_trans) |
7808 | btrfs_end_transaction(trans, root); | 8524 | btrfs_end_transaction(trans, root); |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c3f09f71bedd..c83086fdda05 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
@@ -42,12 +42,12 @@ | |||
42 | #include <linux/slab.h> | 42 | #include <linux/slab.h> |
43 | #include <linux/blkdev.h> | 43 | #include <linux/blkdev.h> |
44 | #include <linux/uuid.h> | 44 | #include <linux/uuid.h> |
45 | #include <linux/btrfs.h> | ||
45 | #include "compat.h" | 46 | #include "compat.h" |
46 | #include "ctree.h" | 47 | #include "ctree.h" |
47 | #include "disk-io.h" | 48 | #include "disk-io.h" |
48 | #include "transaction.h" | 49 | #include "transaction.h" |
49 | #include "btrfs_inode.h" | 50 | #include "btrfs_inode.h" |
50 | #include "ioctl.h" | ||
51 | #include "print-tree.h" | 51 | #include "print-tree.h" |
52 | #include "volumes.h" | 52 | #include "volumes.h" |
53 | #include "locking.h" | 53 | #include "locking.h" |
@@ -363,46 +363,52 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) | |||
363 | return 0; | 363 | return 0; |
364 | } | 364 | } |
365 | 365 | ||
366 | static noinline int create_subvol(struct btrfs_root *root, | 366 | static noinline int create_subvol(struct inode *dir, |
367 | struct dentry *dentry, | 367 | struct dentry *dentry, |
368 | char *name, int namelen, | 368 | char *name, int namelen, |
369 | u64 *async_transid, | 369 | u64 *async_transid, |
370 | struct btrfs_qgroup_inherit **inherit) | 370 | struct btrfs_qgroup_inherit *inherit) |
371 | { | 371 | { |
372 | struct btrfs_trans_handle *trans; | 372 | struct btrfs_trans_handle *trans; |
373 | struct btrfs_key key; | 373 | struct btrfs_key key; |
374 | struct btrfs_root_item root_item; | 374 | struct btrfs_root_item root_item; |
375 | struct btrfs_inode_item *inode_item; | 375 | struct btrfs_inode_item *inode_item; |
376 | struct extent_buffer *leaf; | 376 | struct extent_buffer *leaf; |
377 | struct btrfs_root *root = BTRFS_I(dir)->root; | ||
377 | struct btrfs_root *new_root; | 378 | struct btrfs_root *new_root; |
378 | struct dentry *parent = dentry->d_parent; | 379 | struct btrfs_block_rsv block_rsv; |
379 | struct inode *dir; | ||
380 | struct timespec cur_time = CURRENT_TIME; | 380 | struct timespec cur_time = CURRENT_TIME; |
381 | int ret; | 381 | int ret; |
382 | int err; | 382 | int err; |
383 | u64 objectid; | 383 | u64 objectid; |
384 | u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; | 384 | u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; |
385 | u64 index = 0; | 385 | u64 index = 0; |
386 | u64 qgroup_reserved; | ||
386 | uuid_le new_uuid; | 387 | uuid_le new_uuid; |
387 | 388 | ||
388 | ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); | 389 | ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); |
389 | if (ret) | 390 | if (ret) |
390 | return ret; | 391 | return ret; |
391 | 392 | ||
392 | dir = parent->d_inode; | 393 | btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); |
393 | |||
394 | /* | 394 | /* |
395 | * 1 - inode item | 395 | * The same as the snapshot creation, please see the comment |
396 | * 2 - refs | 396 | * of create_snapshot(). |
397 | * 1 - root item | ||
398 | * 2 - dir items | ||
399 | */ | 397 | */ |
400 | trans = btrfs_start_transaction(root, 6); | 398 | ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, |
401 | if (IS_ERR(trans)) | 399 | 7, &qgroup_reserved); |
402 | return PTR_ERR(trans); | 400 | if (ret) |
401 | return ret; | ||
402 | |||
403 | trans = btrfs_start_transaction(root, 0); | ||
404 | if (IS_ERR(trans)) { | ||
405 | ret = PTR_ERR(trans); | ||
406 | goto out; | ||
407 | } | ||
408 | trans->block_rsv = &block_rsv; | ||
409 | trans->bytes_reserved = block_rsv.size; | ||
403 | 410 | ||
404 | ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, | 411 | ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit); |
405 | inherit ? *inherit : NULL); | ||
406 | if (ret) | 412 | if (ret) |
407 | goto fail; | 413 | goto fail; |
408 | 414 | ||
@@ -516,6 +522,8 @@ static noinline int create_subvol(struct btrfs_root *root, | |||
516 | BUG_ON(ret); | 522 | BUG_ON(ret); |
517 | 523 | ||
518 | fail: | 524 | fail: |
525 | trans->block_rsv = NULL; | ||
526 | trans->bytes_reserved = 0; | ||
519 | if (async_transid) { | 527 | if (async_transid) { |
520 | *async_transid = trans->transid; | 528 | *async_transid = trans->transid; |
521 | err = btrfs_commit_transaction_async(trans, root, 1); | 529 | err = btrfs_commit_transaction_async(trans, root, 1); |
@@ -527,13 +535,15 @@ fail: | |||
527 | 535 | ||
528 | if (!ret) | 536 | if (!ret) |
529 | d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); | 537 | d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); |
530 | 538 | out: | |
539 | btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); | ||
531 | return ret; | 540 | return ret; |
532 | } | 541 | } |
533 | 542 | ||
534 | static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, | 543 | static int create_snapshot(struct btrfs_root *root, struct inode *dir, |
535 | char *name, int namelen, u64 *async_transid, | 544 | struct dentry *dentry, char *name, int namelen, |
536 | bool readonly, struct btrfs_qgroup_inherit **inherit) | 545 | u64 *async_transid, bool readonly, |
546 | struct btrfs_qgroup_inherit *inherit) | ||
537 | { | 547 | { |
538 | struct inode *inode; | 548 | struct inode *inode; |
539 | struct btrfs_pending_snapshot *pending_snapshot; | 549 | struct btrfs_pending_snapshot *pending_snapshot; |
@@ -549,23 +559,31 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, | |||
549 | 559 | ||
550 | btrfs_init_block_rsv(&pending_snapshot->block_rsv, | 560 | btrfs_init_block_rsv(&pending_snapshot->block_rsv, |
551 | BTRFS_BLOCK_RSV_TEMP); | 561 | BTRFS_BLOCK_RSV_TEMP); |
562 | /* | ||
563 | * 1 - parent dir inode | ||
564 | * 2 - dir entries | ||
565 | * 1 - root item | ||
566 | * 2 - root ref/backref | ||
567 | * 1 - root of snapshot | ||
568 | */ | ||
569 | ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, | ||
570 | &pending_snapshot->block_rsv, 7, | ||
571 | &pending_snapshot->qgroup_reserved); | ||
572 | if (ret) | ||
573 | goto out; | ||
574 | |||
552 | pending_snapshot->dentry = dentry; | 575 | pending_snapshot->dentry = dentry; |
553 | pending_snapshot->root = root; | 576 | pending_snapshot->root = root; |
554 | pending_snapshot->readonly = readonly; | 577 | pending_snapshot->readonly = readonly; |
555 | if (inherit) { | 578 | pending_snapshot->dir = dir; |
556 | pending_snapshot->inherit = *inherit; | 579 | pending_snapshot->inherit = inherit; |
557 | *inherit = NULL; /* take responsibility to free it */ | ||
558 | } | ||
559 | 580 | ||
560 | trans = btrfs_start_transaction(root->fs_info->extent_root, 6); | 581 | trans = btrfs_start_transaction(root, 0); |
561 | if (IS_ERR(trans)) { | 582 | if (IS_ERR(trans)) { |
562 | ret = PTR_ERR(trans); | 583 | ret = PTR_ERR(trans); |
563 | goto fail; | 584 | goto fail; |
564 | } | 585 | } |
565 | 586 | ||
566 | ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); | ||
567 | BUG_ON(ret); | ||
568 | |||
569 | spin_lock(&root->fs_info->trans_lock); | 587 | spin_lock(&root->fs_info->trans_lock); |
570 | list_add(&pending_snapshot->list, | 588 | list_add(&pending_snapshot->list, |
571 | &trans->transaction->pending_snapshots); | 589 | &trans->transaction->pending_snapshots); |
@@ -602,6 +620,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, | |||
602 | d_instantiate(dentry, inode); | 620 | d_instantiate(dentry, inode); |
603 | ret = 0; | 621 | ret = 0; |
604 | fail: | 622 | fail: |
623 | btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, | ||
624 | &pending_snapshot->block_rsv, | ||
625 | pending_snapshot->qgroup_reserved); | ||
626 | out: | ||
605 | kfree(pending_snapshot); | 627 | kfree(pending_snapshot); |
606 | return ret; | 628 | return ret; |
607 | } | 629 | } |
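
The block comment in the hunk above enumerates why seven metadata items are reserved per snapshot. A hedged illustration of how that count feeds the reservation call (the per-item byte cost here is an assumption; the real conversion lives in btrfs's reservation helpers):

    /* Tally the seven reserved items named in the comment above. */
    enum {
            SNAP_RSV_PARENT_DIR_INODE = 1,  /* 1 - parent dir inode */
            SNAP_RSV_DIR_ENTRIES      = 2,  /* 2 - dir entries */
            SNAP_RSV_ROOT_ITEM        = 1,  /* 1 - root item */
            SNAP_RSV_ROOT_REFS        = 2,  /* 2 - root ref/backref */
            SNAP_RSV_SNAP_ROOT        = 1,  /* 1 - root of snapshot */
    };

    static unsigned long long snap_reserve_bytes(unsigned long long per_item)
    {
            unsigned int items = SNAP_RSV_PARENT_DIR_INODE + SNAP_RSV_DIR_ENTRIES +
                                 SNAP_RSV_ROOT_ITEM + SNAP_RSV_ROOT_REFS +
                                 SNAP_RSV_SNAP_ROOT;  /* == 7, the count passed above */

            return items * per_item;        /* per-item cost is an assumption */
    }
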
@@ -695,7 +717,7 @@ static noinline int btrfs_mksubvol(struct path *parent, | |||
695 | char *name, int namelen, | 717 | char *name, int namelen, |
696 | struct btrfs_root *snap_src, | 718 | struct btrfs_root *snap_src, |
697 | u64 *async_transid, bool readonly, | 719 | u64 *async_transid, bool readonly, |
698 | struct btrfs_qgroup_inherit **inherit) | 720 | struct btrfs_qgroup_inherit *inherit) |
699 | { | 721 | { |
700 | struct inode *dir = parent->dentry->d_inode; | 722 | struct inode *dir = parent->dentry->d_inode; |
701 | struct dentry *dentry; | 723 | struct dentry *dentry; |
@@ -732,11 +754,11 @@ static noinline int btrfs_mksubvol(struct path *parent, | |||
732 | goto out_up_read; | 754 | goto out_up_read; |
733 | 755 | ||
734 | if (snap_src) { | 756 | if (snap_src) { |
735 | error = create_snapshot(snap_src, dentry, name, namelen, | 757 | error = create_snapshot(snap_src, dir, dentry, name, namelen, |
736 | async_transid, readonly, inherit); | 758 | async_transid, readonly, inherit); |
737 | } else { | 759 | } else { |
738 | error = create_subvol(BTRFS_I(dir)->root, dentry, | 760 | error = create_subvol(dir, dentry, name, namelen, |
739 | name, namelen, async_transid, inherit); | 761 | async_transid, inherit); |
740 | } | 762 | } |
741 | if (!error) | 763 | if (!error) |
742 | fsnotify_mkdir(dir, dentry); | 764 | fsnotify_mkdir(dir, dentry); |
@@ -818,7 +840,7 @@ static int find_new_extents(struct btrfs_root *root, | |||
818 | 840 | ||
819 | while(1) { | 841 | while(1) { |
820 | ret = btrfs_search_forward(root, &min_key, &max_key, | 842 | ret = btrfs_search_forward(root, &min_key, &max_key, |
821 | path, 0, newer_than); | 843 | path, newer_than); |
822 | if (ret != 0) | 844 | if (ret != 0) |
823 | goto none; | 845 | goto none; |
824 | if (min_key.objectid != ino) | 846 | if (min_key.objectid != ino) |
@@ -1206,6 +1228,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, | |||
1206 | if (!(inode->i_sb->s_flags & MS_ACTIVE)) | 1228 | if (!(inode->i_sb->s_flags & MS_ACTIVE)) |
1207 | break; | 1229 | break; |
1208 | 1230 | ||
1231 | if (btrfs_defrag_cancelled(root->fs_info)) { | ||
1232 | printk(KERN_DEBUG "btrfs: defrag_file cancelled\n"); | ||
1233 | ret = -EAGAIN; | ||
1234 | break; | ||
1235 | } | ||
1236 | |||
1209 | if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, | 1237 | if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, |
1210 | extent_thresh, &last_len, &skip, | 1238 | extent_thresh, &last_len, &skip, |
1211 | &defrag_end, range->flags & | 1239 | &defrag_end, range->flags & |
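
btrfs_defrag_file() now polls a cancellation flag once per loop iteration and bails out with -EAGAIN, so a long-running defrag can be stopped promptly (the btrfs_defrag_cancelled() helper is defined elsewhere in this series). The cooperative-cancellation shape, as a standalone sketch:

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool cancel_requested;    /* set elsewhere to stop the loop */

    static int long_running_loop(int nr_ranges)
    {
            for (int i = 0; i < nr_ranges; i++) {
                    /* poll once per unit of work so cancellation stays prompt */
                    if (atomic_load(&cancel_requested))
                            return -1;      /* the kernel code returns -EAGAIN */
                    /* ... defrag one range ... */
            }
            return 0;
    }
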
@@ -1329,9 +1357,6 @@ static noinline int btrfs_ioctl_resize(struct file *file, | |||
1329 | int ret = 0; | 1357 | int ret = 0; |
1330 | int mod = 0; | 1358 | int mod = 0; |
1331 | 1359 | ||
1332 | if (root->fs_info->sb->s_flags & MS_RDONLY) | ||
1333 | return -EROFS; | ||
1334 | |||
1335 | if (!capable(CAP_SYS_ADMIN)) | 1360 | if (!capable(CAP_SYS_ADMIN)) |
1336 | return -EPERM; | 1361 | return -EPERM; |
1337 | 1362 | ||
@@ -1363,6 +1388,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, | |||
1363 | *devstr = '\0'; | 1388 | *devstr = '\0'; |
1364 | devstr = vol_args->name; | 1389 | devstr = vol_args->name; |
1365 | devid = simple_strtoull(devstr, &end, 10); | 1390 | devid = simple_strtoull(devstr, &end, 10); |
1391 | if (!devid) { | ||
1392 | ret = -EINVAL; | ||
1393 | goto out_free; | ||
1394 | } | ||
1366 | printk(KERN_INFO "btrfs: resizing devid %llu\n", | 1395 | printk(KERN_INFO "btrfs: resizing devid %llu\n", |
1367 | (unsigned long long)devid); | 1396 | (unsigned long long)devid); |
1368 | } | 1397 | } |
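
Rejecting a parsed devid of zero closes a hole where simple_strtoull() returns 0 both for the literal string "0" and for unparseable input, and devid 0 is never a valid device id. A userspace-style equivalent, slightly stricter than the kernel check (illustrative):

    #include <errno.h>
    #include <stdlib.h>

    /* Parse a decimal devid; 0 is reserved and therefore rejected. */
    static int parse_devid(const char *s, unsigned long long *devid)
    {
            char *end;

            errno = 0;
            *devid = strtoull(s, &end, 10);
            if (errno || end == s || *devid == 0)
                    return -EINVAL;
            return 0;
    }
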
@@ -1371,7 +1400,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, | |||
1371 | if (!device) { | 1400 | if (!device) { |
1372 | printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", | 1401 | printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", |
1373 | (unsigned long long)devid); | 1402 | (unsigned long long)devid); |
1374 | ret = -EINVAL; | 1403 | ret = -ENODEV; |
1375 | goto out_free; | 1404 | goto out_free; |
1376 | } | 1405 | } |
1377 | 1406 | ||
@@ -1379,7 +1408,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, | |||
1379 | printk(KERN_INFO "btrfs: resizer unable to apply on " | 1408 | printk(KERN_INFO "btrfs: resizer unable to apply on " |
1380 | "readonly device %llu\n", | 1409 | "readonly device %llu\n", |
1381 | (unsigned long long)devid); | 1410 | (unsigned long long)devid); |
1382 | ret = -EINVAL; | 1411 | ret = -EPERM; |
1383 | goto out_free; | 1412 | goto out_free; |
1384 | } | 1413 | } |
1385 | 1414 | ||
@@ -1401,7 +1430,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, | |||
1401 | } | 1430 | } |
1402 | 1431 | ||
1403 | if (device->is_tgtdev_for_dev_replace) { | 1432 | if (device->is_tgtdev_for_dev_replace) { |
1404 | ret = -EINVAL; | 1433 | ret = -EPERM; |
1405 | goto out_free; | 1434 | goto out_free; |
1406 | } | 1435 | } |
1407 | 1436 | ||
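
The three hunks above replace blanket -EINVAL returns with errnos that describe the actual failure: a device that cannot be found is -ENODEV, while a read-only device or a dev-replace target is -EPERM. Condensed into one sketch:

    #include <errno.h>

    /* Sketch of the refined errno choices in the resize path. */
    static int check_resize_target(int found, int writeable, int is_replace_tgt)
    {
            if (!found)
                    return -ENODEV;         /* device missing: not a bad argument */
            if (!writeable)
                    return -EPERM;          /* device exists but is read-only */
            if (is_replace_tgt)
                    return -EPERM;          /* busy as a dev-replace target */
            return 0;
    }
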
@@ -1457,7 +1486,7 @@ out: | |||
1457 | static noinline int btrfs_ioctl_snap_create_transid(struct file *file, | 1486 | static noinline int btrfs_ioctl_snap_create_transid(struct file *file, |
1458 | char *name, unsigned long fd, int subvol, | 1487 | char *name, unsigned long fd, int subvol, |
1459 | u64 *transid, bool readonly, | 1488 | u64 *transid, bool readonly, |
1460 | struct btrfs_qgroup_inherit **inherit) | 1489 | struct btrfs_qgroup_inherit *inherit) |
1461 | { | 1490 | { |
1462 | int namelen; | 1491 | int namelen; |
1463 | int ret = 0; | 1492 | int ret = 0; |
@@ -1566,7 +1595,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, | |||
1566 | 1595 | ||
1567 | ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, | 1596 | ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, |
1568 | vol_args->fd, subvol, ptr, | 1597 | vol_args->fd, subvol, ptr, |
1569 | readonly, &inherit); | 1598 | readonly, inherit); |
1570 | 1599 | ||
1571 | if (ret == 0 && ptr && | 1600 | if (ret == 0 && ptr && |
1572 | copy_to_user(arg + | 1601 | copy_to_user(arg + |
@@ -1863,7 +1892,7 @@ static noinline int search_ioctl(struct inode *inode, | |||
1863 | path->keep_locks = 1; | 1892 | path->keep_locks = 1; |
1864 | 1893 | ||
1865 | while(1) { | 1894 | while(1) { |
1866 | ret = btrfs_search_forward(root, &key, &max_key, path, 0, | 1895 | ret = btrfs_search_forward(root, &key, &max_key, path, |
1867 | sk->min_transid); | 1896 | sk->min_transid); |
1868 | if (ret != 0) { | 1897 | if (ret != 0) { |
1869 | if (ret > 0) | 1898 | if (ret > 0) |
@@ -2035,6 +2064,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, | |||
2035 | struct btrfs_root *dest = NULL; | 2064 | struct btrfs_root *dest = NULL; |
2036 | struct btrfs_ioctl_vol_args *vol_args; | 2065 | struct btrfs_ioctl_vol_args *vol_args; |
2037 | struct btrfs_trans_handle *trans; | 2066 | struct btrfs_trans_handle *trans; |
2067 | struct btrfs_block_rsv block_rsv; | ||
2068 | u64 qgroup_reserved; | ||
2038 | int namelen; | 2069 | int namelen; |
2039 | int ret; | 2070 | int ret; |
2040 | int err = 0; | 2071 | int err = 0; |
@@ -2124,12 +2155,23 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, | |||
2124 | if (err) | 2155 | if (err) |
2125 | goto out_up_write; | 2156 | goto out_up_write; |
2126 | 2157 | ||
2158 | btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); | ||
2159 | /* | ||
2160 | * One for dir inode, two for dir entries, two for root | ||
2161 | * ref/backref. | ||
2162 | */ | ||
2163 | err = btrfs_subvolume_reserve_metadata(root, &block_rsv, | ||
2164 | 5, &qgroup_reserved); | ||
2165 | if (err) | ||
2166 | goto out_up_write; | ||
2167 | |||
2127 | trans = btrfs_start_transaction(root, 0); | 2168 | trans = btrfs_start_transaction(root, 0); |
2128 | if (IS_ERR(trans)) { | 2169 | if (IS_ERR(trans)) { |
2129 | err = PTR_ERR(trans); | 2170 | err = PTR_ERR(trans); |
2130 | goto out_up_write; | 2171 | goto out_release; |
2131 | } | 2172 | } |
2132 | trans->block_rsv = &root->fs_info->global_block_rsv; | 2173 | trans->block_rsv = &block_rsv; |
2174 | trans->bytes_reserved = block_rsv.size; | ||
2133 | 2175 | ||
2134 | ret = btrfs_unlink_subvol(trans, root, dir, | 2176 | ret = btrfs_unlink_subvol(trans, root, dir, |
2135 | dest->root_key.objectid, | 2177 | dest->root_key.objectid, |
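
Because the reservation is now taken before the transaction starts, snap_destroy gains an out_release label so the reservation is returned on every exit path. The staged goto-unwind shape in miniature (all helpers below are stand-ins):

    /* Staged-cleanup sketch: each resource acquired adds one unwind label. */
    static int take_reservation(void)     { return 0; }
    static void release_reservation(void) { }
    static int start_transaction(void)    { return 0; }
    static void end_transaction(void)     { }
    static int do_work(void)              { return 0; }

    static int staged_op(void)
    {
            int err;

            err = take_reservation();
            if (err)
                    return err;

            err = start_transaction();
            if (err)
                    goto out_release;       /* undo only what was taken so far */

            err = do_work();
            end_transaction();
    out_release:
            release_reservation();          /* runs on success and on failure */
            return err;
    }
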
@@ -2159,10 +2201,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, | |||
2159 | } | 2201 | } |
2160 | } | 2202 | } |
2161 | out_end_trans: | 2203 | out_end_trans: |
2204 | trans->block_rsv = NULL; | ||
2205 | trans->bytes_reserved = 0; | ||
2162 | ret = btrfs_end_transaction(trans, root); | 2206 | ret = btrfs_end_transaction(trans, root); |
2163 | if (ret && !err) | 2207 | if (ret && !err) |
2164 | err = ret; | 2208 | err = ret; |
2165 | inode->i_flags |= S_DEAD; | 2209 | inode->i_flags |= S_DEAD; |
2210 | out_release: | ||
2211 | btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); | ||
2166 | out_up_write: | 2212 | out_up_write: |
2167 | up_write(&root->fs_info->subvol_sem); | 2213 | up_write(&root->fs_info->subvol_sem); |
2168 | out_unlock: | 2214 | out_unlock: |
@@ -2171,6 +2217,12 @@ out_unlock: | |||
2171 | shrink_dcache_sb(root->fs_info->sb); | 2217 | shrink_dcache_sb(root->fs_info->sb); |
2172 | btrfs_invalidate_inodes(dest); | 2218 | btrfs_invalidate_inodes(dest); |
2173 | d_delete(dentry); | 2219 | d_delete(dentry); |
2220 | |||
2221 | /* the last ref */ | ||
2222 | if (dest->cache_inode) { | ||
2223 | iput(dest->cache_inode); | ||
2224 | dest->cache_inode = NULL; | ||
2225 | } | ||
2174 | } | 2226 | } |
2175 | out_dput: | 2227 | out_dput: |
2176 | dput(dentry); | 2228 | dput(dentry); |
@@ -2211,10 +2263,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) | |||
2211 | ret = -EPERM; | 2263 | ret = -EPERM; |
2212 | goto out; | 2264 | goto out; |
2213 | } | 2265 | } |
2214 | ret = btrfs_defrag_root(root, 0); | 2266 | ret = btrfs_defrag_root(root); |
2215 | if (ret) | 2267 | if (ret) |
2216 | goto out; | 2268 | goto out; |
2217 | ret = btrfs_defrag_root(root->fs_info->extent_root, 0); | 2269 | ret = btrfs_defrag_root(root->fs_info->extent_root); |
2218 | break; | 2270 | break; |
2219 | case S_IFREG: | 2271 | case S_IFREG: |
2220 | if (!(file->f_mode & FMODE_WRITE)) { | 2272 | if (!(file->f_mode & FMODE_WRITE)) { |
@@ -3111,7 +3163,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, | |||
3111 | u64 transid; | 3163 | u64 transid; |
3112 | int ret; | 3164 | int ret; |
3113 | 3165 | ||
3114 | trans = btrfs_attach_transaction(root); | 3166 | trans = btrfs_attach_transaction_barrier(root); |
3115 | if (IS_ERR(trans)) { | 3167 | if (IS_ERR(trans)) { |
3116 | if (PTR_ERR(trans) != -ENOENT) | 3168 | if (PTR_ERR(trans) != -ENOENT) |
3117 | return PTR_ERR(trans); | 3169 | return PTR_ERR(trans); |
@@ -3289,7 +3341,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) | |||
3289 | struct inode_fs_paths *ipath = NULL; | 3341 | struct inode_fs_paths *ipath = NULL; |
3290 | struct btrfs_path *path; | 3342 | struct btrfs_path *path; |
3291 | 3343 | ||
3292 | if (!capable(CAP_SYS_ADMIN)) | 3344 | if (!capable(CAP_DAC_READ_SEARCH)) |
3293 | return -EPERM; | 3345 | return -EPERM; |
3294 | 3346 | ||
3295 | path = btrfs_alloc_path(); | 3347 | path = btrfs_alloc_path(); |
@@ -3914,6 +3966,65 @@ out: | |||
3914 | return ret; | 3966 | return ret; |
3915 | } | 3967 | } |
3916 | 3968 | ||
3969 | static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) | ||
3970 | { | ||
3971 | struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; | ||
3972 | const char *label = root->fs_info->super_copy->label; | ||
3973 | size_t len = strnlen(label, BTRFS_LABEL_SIZE); | ||
3974 | int ret; | ||
3975 | |||
3976 | if (len == BTRFS_LABEL_SIZE) { | ||
3977 | pr_warn("btrfs: label is too long, return the first %zu bytes\n", | ||
3978 | --len); | ||
3979 | } | ||
3980 | |||
3981 | mutex_lock(&root->fs_info->volume_mutex); | ||
3982 | ret = copy_to_user(arg, label, len); | ||
3983 | mutex_unlock(&root->fs_info->volume_mutex); | ||
3984 | |||
3985 | return ret ? -EFAULT : 0; | ||
3986 | } | ||
3987 | |||
3988 | static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) | ||
3989 | { | ||
3990 | struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; | ||
3991 | struct btrfs_super_block *super_block = root->fs_info->super_copy; | ||
3992 | struct btrfs_trans_handle *trans; | ||
3993 | char label[BTRFS_LABEL_SIZE]; | ||
3994 | int ret; | ||
3995 | |||
3996 | if (!capable(CAP_SYS_ADMIN)) | ||
3997 | return -EPERM; | ||
3998 | |||
3999 | if (copy_from_user(label, arg, sizeof(label))) | ||
4000 | return -EFAULT; | ||
4001 | |||
4002 | if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { | ||
4003 | pr_err("btrfs: unable to set label with more than %d bytes\n", | ||
4004 | BTRFS_LABEL_SIZE - 1); | ||
4005 | return -EINVAL; | ||
4006 | } | ||
4007 | |||
4008 | ret = mnt_want_write_file(file); | ||
4009 | if (ret) | ||
4010 | return ret; | ||
4011 | |||
4012 | mutex_lock(&root->fs_info->volume_mutex); | ||
4013 | trans = btrfs_start_transaction(root, 0); | ||
4014 | if (IS_ERR(trans)) { | ||
4015 | ret = PTR_ERR(trans); | ||
4016 | goto out_unlock; | ||
4017 | } | ||
4018 | |||
4019 | strcpy(super_block->label, label); | ||
4020 | ret = btrfs_end_transaction(trans, root); | ||
4021 | |||
4022 | out_unlock: | ||
4023 | mutex_unlock(&root->fs_info->volume_mutex); | ||
4024 | mnt_drop_write_file(file); | ||
4025 | return ret; | ||
4026 | } | ||
4027 | |||
3917 | long btrfs_ioctl(struct file *file, unsigned int | 4028 | long btrfs_ioctl(struct file *file, unsigned int |
3918 | cmd, unsigned long arg) | 4029 | cmd, unsigned long arg) |
3919 | { | 4030 | { |
@@ -4014,6 +4125,10 @@ long btrfs_ioctl(struct file *file, unsigned int | |||
4014 | return btrfs_ioctl_qgroup_limit(file, argp); | 4125 | return btrfs_ioctl_qgroup_limit(file, argp); |
4015 | case BTRFS_IOC_DEV_REPLACE: | 4126 | case BTRFS_IOC_DEV_REPLACE: |
4016 | return btrfs_ioctl_dev_replace(root, argp); | 4127 | return btrfs_ioctl_dev_replace(root, argp); |
4128 | case BTRFS_IOC_GET_FSLABEL: | ||
4129 | return btrfs_ioctl_get_fslabel(file, argp); | ||
4130 | case BTRFS_IOC_SET_FSLABEL: | ||
4131 | return btrfs_ioctl_set_fslabel(file, argp); | ||
4017 | } | 4132 | } |
4018 | 4133 | ||
4019 | return -ENOTTY; | 4134 | return -ENOTTY; |
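
With the two dispatch entries above, userspace can read and write the filesystem label. A minimal caller, assuming the ioctl numbers and BTRFS_LABEL_SIZE are exported through the shared <linux/btrfs.h> header used elsewhere in this series:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/btrfs.h>        /* ioctl numbers and BTRFS_LABEL_SIZE */

    int main(int argc, char **argv)
    {
            char label[BTRFS_LABEL_SIZE] = { 0 };
            int fd;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);   /* any path on the filesystem */
            if (fd < 0)
                    return 1;
            if (ioctl(fd, BTRFS_IOC_GET_FSLABEL, label) == 0)
                    printf("label: %s\n", label);
            if (argc > 2) {                 /* optionally set a new label */
                    memset(label, 0, sizeof(label));
                    strncpy(label, argv[2], BTRFS_LABEL_SIZE - 1);
                    ioctl(fd, BTRFS_IOC_SET_FSLABEL, label);
            }
            close(fd);
            return 0;
    }
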
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h deleted file mode 100644 index dabca9cc8c2e..000000000000 --- a/fs/btrfs/ioctl.h +++ /dev/null | |||
@@ -1,502 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __IOCTL_ | ||
20 | #define __IOCTL_ | ||
21 | #include <linux/ioctl.h> | ||
22 | |||
23 | #define BTRFS_IOCTL_MAGIC 0x94 | ||
24 | #define BTRFS_VOL_NAME_MAX 255 | ||
25 | |||
26 | /* this should be 4k */ | ||
27 | #define BTRFS_PATH_NAME_MAX 4087 | ||
28 | struct btrfs_ioctl_vol_args { | ||
29 | __s64 fd; | ||
30 | char name[BTRFS_PATH_NAME_MAX + 1]; | ||
31 | }; | ||
32 | |||
33 | #define BTRFS_DEVICE_PATH_NAME_MAX 1024 | ||
34 | |||
35 | #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) | ||
36 | #define BTRFS_SUBVOL_RDONLY (1ULL << 1) | ||
37 | #define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) | ||
38 | #define BTRFS_FSID_SIZE 16 | ||
39 | #define BTRFS_UUID_SIZE 16 | ||
40 | |||
41 | #define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0) | ||
42 | |||
43 | struct btrfs_qgroup_limit { | ||
44 | __u64 flags; | ||
45 | __u64 max_rfer; | ||
46 | __u64 max_excl; | ||
47 | __u64 rsv_rfer; | ||
48 | __u64 rsv_excl; | ||
49 | }; | ||
50 | |||
51 | struct btrfs_qgroup_inherit { | ||
52 | __u64 flags; | ||
53 | __u64 num_qgroups; | ||
54 | __u64 num_ref_copies; | ||
55 | __u64 num_excl_copies; | ||
56 | struct btrfs_qgroup_limit lim; | ||
57 | __u64 qgroups[0]; | ||
58 | }; | ||
59 | |||
60 | struct btrfs_ioctl_qgroup_limit_args { | ||
61 | __u64 qgroupid; | ||
62 | struct btrfs_qgroup_limit lim; | ||
63 | }; | ||
64 | |||
65 | #define BTRFS_SUBVOL_NAME_MAX 4039 | ||
66 | struct btrfs_ioctl_vol_args_v2 { | ||
67 | __s64 fd; | ||
68 | __u64 transid; | ||
69 | __u64 flags; | ||
70 | union { | ||
71 | struct { | ||
72 | __u64 size; | ||
73 | struct btrfs_qgroup_inherit __user *qgroup_inherit; | ||
74 | }; | ||
75 | __u64 unused[4]; | ||
76 | }; | ||
77 | char name[BTRFS_SUBVOL_NAME_MAX + 1]; | ||
78 | }; | ||
79 | |||
80 | /* | ||
81 | * structure to report errors and progress to userspace, either as a | ||
82 | * result of a finished scrub, a canceled scrub or a progress inquiry | ||
83 | */ | ||
84 | struct btrfs_scrub_progress { | ||
85 | __u64 data_extents_scrubbed; /* # of data extents scrubbed */ | ||
86 | __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */ | ||
87 | __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */ | ||
88 | __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */ | ||
89 | __u64 read_errors; /* # of read errors encountered (EIO) */ | ||
90 | __u64 csum_errors; /* # of failed csum checks */ | ||
91 | __u64 verify_errors; /* # of occurrences where the metadata | ||
92 | * of a tree block did not match the | ||
93 | * expected values, like generation or | ||
94 | * logical */ | ||
95 | __u64 no_csum; /* # of 4k data blocks for which no csum | ||
96 | * is present, probably the result of | ||
97 | * data written with nodatasum */ | ||
98 | __u64 csum_discards; /* # of csums for which no data was found | ||
99 | * in the extent tree. */ | ||
100 | __u64 super_errors; /* # of bad super blocks encountered */ | ||
101 | __u64 malloc_errors; /* # of internal kmalloc errors. These | ||
102 | * will likely cause an incomplete | ||
103 | * scrub */ | ||
104 | __u64 uncorrectable_errors; /* # of errors where either no intact | ||
105 | * copy was found or the writeback | ||
106 | * failed */ | ||
107 | __u64 corrected_errors; /* # of errors corrected */ | ||
108 | __u64 last_physical; /* last physical address scrubbed. In | ||
109 | * case a scrub was aborted, this can | ||
110 | * be used to restart the scrub */ | ||
111 | __u64 unverified_errors; /* # of occurrences where a read for a | ||
112 | * full (64k) bio failed, but the re- | ||
113 | * check succeeded for each 4k piece. | ||
114 | * Intermittent error. */ | ||
115 | }; | ||
116 | |||
117 | #define BTRFS_SCRUB_READONLY 1 | ||
118 | struct btrfs_ioctl_scrub_args { | ||
119 | __u64 devid; /* in */ | ||
120 | __u64 start; /* in */ | ||
121 | __u64 end; /* in */ | ||
122 | __u64 flags; /* in */ | ||
123 | struct btrfs_scrub_progress progress; /* out */ | ||
124 | /* pad to 1k */ | ||
125 | __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; | ||
126 | }; | ||
127 | |||
128 | #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 | ||
129 | #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 | ||
130 | struct btrfs_ioctl_dev_replace_start_params { | ||
131 | __u64 srcdevid; /* in, if 0, use srcdev_name instead */ | ||
132 | __u64 cont_reading_from_srcdev_mode; /* in, see #define | ||
133 | * above */ | ||
134 | __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ | ||
135 | __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ | ||
136 | }; | ||
137 | |||
138 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0 | ||
139 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1 | ||
140 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2 | ||
141 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3 | ||
142 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4 | ||
143 | struct btrfs_ioctl_dev_replace_status_params { | ||
144 | __u64 replace_state; /* out, see #define above */ | ||
145 | __u64 progress_1000; /* out, 0 <= x <= 1000 */ | ||
146 | __u64 time_started; /* out, seconds since 1-Jan-1970 */ | ||
147 | __u64 time_stopped; /* out, seconds since 1-Jan-1970 */ | ||
148 | __u64 num_write_errors; /* out */ | ||
149 | __u64 num_uncorrectable_read_errors; /* out */ | ||
150 | }; | ||
151 | |||
152 | #define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0 | ||
153 | #define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1 | ||
154 | #define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2 | ||
155 | #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0 | ||
156 | #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1 | ||
157 | #define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2 | ||
158 | struct btrfs_ioctl_dev_replace_args { | ||
159 | __u64 cmd; /* in */ | ||
160 | __u64 result; /* out */ | ||
161 | |||
162 | union { | ||
163 | struct btrfs_ioctl_dev_replace_start_params start; | ||
164 | struct btrfs_ioctl_dev_replace_status_params status; | ||
165 | }; /* in/out */ | ||
166 | |||
167 | __u64 spare[64]; | ||
168 | }; | ||
169 | |||
170 | struct btrfs_ioctl_dev_info_args { | ||
171 | __u64 devid; /* in/out */ | ||
172 | __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ | ||
173 | __u64 bytes_used; /* out */ | ||
174 | __u64 total_bytes; /* out */ | ||
175 | __u64 unused[379]; /* pad to 4k */ | ||
176 | __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */ | ||
177 | }; | ||
178 | |||
179 | struct btrfs_ioctl_fs_info_args { | ||
180 | __u64 max_id; /* out */ | ||
181 | __u64 num_devices; /* out */ | ||
182 | __u8 fsid[BTRFS_FSID_SIZE]; /* out */ | ||
183 | __u64 reserved[124]; /* pad to 1k */ | ||
184 | }; | ||
185 | |||
186 | /* balance control ioctl modes */ | ||
187 | #define BTRFS_BALANCE_CTL_PAUSE 1 | ||
188 | #define BTRFS_BALANCE_CTL_CANCEL 2 | ||
189 | |||
190 | /* | ||
191 | * this is packed, because it should be exactly the same as its disk | ||
192 | * byte order counterpart (struct btrfs_disk_balance_args) | ||
193 | */ | ||
194 | struct btrfs_balance_args { | ||
195 | __u64 profiles; | ||
196 | __u64 usage; | ||
197 | __u64 devid; | ||
198 | __u64 pstart; | ||
199 | __u64 pend; | ||
200 | __u64 vstart; | ||
201 | __u64 vend; | ||
202 | |||
203 | __u64 target; | ||
204 | |||
205 | __u64 flags; | ||
206 | |||
207 | __u64 unused[8]; | ||
208 | } __attribute__ ((__packed__)); | ||
209 | |||
210 | /* report balance progress to userspace */ | ||
211 | struct btrfs_balance_progress { | ||
212 | __u64 expected; /* estimated # of chunks that will be | ||
213 | * relocated to fulfill the request */ | ||
214 | __u64 considered; /* # of chunks we have considered so far */ | ||
215 | __u64 completed; /* # of chunks relocated so far */ | ||
216 | }; | ||
217 | |||
218 | #define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) | ||
219 | #define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) | ||
220 | #define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) | ||
221 | |||
222 | struct btrfs_ioctl_balance_args { | ||
223 | __u64 flags; /* in/out */ | ||
224 | __u64 state; /* out */ | ||
225 | |||
226 | struct btrfs_balance_args data; /* in/out */ | ||
227 | struct btrfs_balance_args meta; /* in/out */ | ||
228 | struct btrfs_balance_args sys; /* in/out */ | ||
229 | |||
230 | struct btrfs_balance_progress stat; /* out */ | ||
231 | |||
232 | __u64 unused[72]; /* pad to 1k */ | ||
233 | }; | ||
234 | |||
235 | #define BTRFS_INO_LOOKUP_PATH_MAX 4080 | ||
236 | struct btrfs_ioctl_ino_lookup_args { | ||
237 | __u64 treeid; | ||
238 | __u64 objectid; | ||
239 | char name[BTRFS_INO_LOOKUP_PATH_MAX]; | ||
240 | }; | ||
241 | |||
242 | struct btrfs_ioctl_search_key { | ||
243 | /* which root are we searching. 0 is the tree of tree roots */ | ||
244 | __u64 tree_id; | ||
245 | |||
246 | /* keys returned will be >= min and <= max */ | ||
247 | __u64 min_objectid; | ||
248 | __u64 max_objectid; | ||
249 | |||
250 | /* keys returned will be >= min and <= max */ | ||
251 | __u64 min_offset; | ||
252 | __u64 max_offset; | ||
253 | |||
254 | /* max and min transids to search for */ | ||
255 | __u64 min_transid; | ||
256 | __u64 max_transid; | ||
257 | |||
258 | /* keys returned will be >= min and <= max */ | ||
259 | __u32 min_type; | ||
260 | __u32 max_type; | ||
261 | |||
262 | /* | ||
263 | * how many items did userland ask for, and how many are we | ||
264 | * returning | ||
265 | */ | ||
266 | __u32 nr_items; | ||
267 | |||
268 | /* align to 64 bits */ | ||
269 | __u32 unused; | ||
270 | |||
271 | /* some extra for later */ | ||
272 | __u64 unused1; | ||
273 | __u64 unused2; | ||
274 | __u64 unused3; | ||
275 | __u64 unused4; | ||
276 | }; | ||
277 | |||
278 | struct btrfs_ioctl_search_header { | ||
279 | __u64 transid; | ||
280 | __u64 objectid; | ||
281 | __u64 offset; | ||
282 | __u32 type; | ||
283 | __u32 len; | ||
284 | }; | ||
285 | |||
286 | #define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key)) | ||
287 | /* | ||
288 | * the buf is an array of search headers where | ||
289 | * each header is followed by the actual item; | ||
290 | * the type field is expanded to 32 bits for alignment | ||
291 | */ | ||
292 | struct btrfs_ioctl_search_args { | ||
293 | struct btrfs_ioctl_search_key key; | ||
294 | char buf[BTRFS_SEARCH_ARGS_BUFSIZE]; | ||
295 | }; | ||
296 | |||
297 | struct btrfs_ioctl_clone_range_args { | ||
298 | __s64 src_fd; | ||
299 | __u64 src_offset, src_length; | ||
300 | __u64 dest_offset; | ||
301 | }; | ||
302 | |||
303 | /* flags for the defrag range ioctl */ | ||
304 | #define BTRFS_DEFRAG_RANGE_COMPRESS 1 | ||
305 | #define BTRFS_DEFRAG_RANGE_START_IO 2 | ||
306 | |||
307 | struct btrfs_ioctl_space_info { | ||
308 | __u64 flags; | ||
309 | __u64 total_bytes; | ||
310 | __u64 used_bytes; | ||
311 | }; | ||
312 | |||
313 | struct btrfs_ioctl_space_args { | ||
314 | __u64 space_slots; | ||
315 | __u64 total_spaces; | ||
316 | struct btrfs_ioctl_space_info spaces[0]; | ||
317 | }; | ||
318 | |||
319 | struct btrfs_data_container { | ||
320 | __u32 bytes_left; /* out -- bytes not needed to deliver output */ | ||
321 | __u32 bytes_missing; /* out -- additional bytes needed for result */ | ||
322 | __u32 elem_cnt; /* out */ | ||
323 | __u32 elem_missed; /* out */ | ||
324 | __u64 val[0]; /* out */ | ||
325 | }; | ||
326 | |||
327 | struct btrfs_ioctl_ino_path_args { | ||
328 | __u64 inum; /* in */ | ||
329 | __u64 size; /* in */ | ||
330 | __u64 reserved[4]; | ||
331 | /* struct btrfs_data_container *fspath; out */ | ||
332 | __u64 fspath; /* out */ | ||
333 | }; | ||
334 | |||
335 | struct btrfs_ioctl_logical_ino_args { | ||
336 | __u64 logical; /* in */ | ||
337 | __u64 size; /* in */ | ||
338 | __u64 reserved[4]; | ||
339 | /* struct btrfs_data_container *inodes; out */ | ||
340 | __u64 inodes; | ||
341 | }; | ||
342 | |||
343 | enum btrfs_dev_stat_values { | ||
344 | /* disk I/O failure stats */ | ||
345 | BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */ | ||
346 | BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */ | ||
347 | BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */ | ||
348 | |||
349 | /* stats for indirect indications for I/O failures */ | ||
350 | BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or | ||
351 | * contents are illegal: this is an | ||
352 | * indication that the block was damaged | ||
353 | * during read or write, or written to | ||
354 | * wrong location or read from wrong | ||
355 | * location */ | ||
356 | BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not | ||
357 | * been written */ | ||
358 | |||
359 | BTRFS_DEV_STAT_VALUES_MAX | ||
360 | }; | ||
361 | |||
362 | /* Reset statistics after reading; needs SYS_ADMIN capability */ | ||
363 | #define BTRFS_DEV_STATS_RESET (1ULL << 0) | ||
364 | |||
365 | struct btrfs_ioctl_get_dev_stats { | ||
366 | __u64 devid; /* in */ | ||
367 | __u64 nr_items; /* in/out */ | ||
368 | __u64 flags; /* in/out */ | ||
369 | |||
370 | /* out values: */ | ||
371 | __u64 values[BTRFS_DEV_STAT_VALUES_MAX]; | ||
372 | |||
373 | __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ | ||
374 | }; | ||
375 | |||
376 | #define BTRFS_QUOTA_CTL_ENABLE 1 | ||
377 | #define BTRFS_QUOTA_CTL_DISABLE 2 | ||
378 | #define BTRFS_QUOTA_CTL_RESCAN 3 | ||
379 | struct btrfs_ioctl_quota_ctl_args { | ||
380 | __u64 cmd; | ||
381 | __u64 status; | ||
382 | }; | ||
383 | |||
384 | struct btrfs_ioctl_qgroup_assign_args { | ||
385 | __u64 assign; | ||
386 | __u64 src; | ||
387 | __u64 dst; | ||
388 | }; | ||
389 | |||
390 | struct btrfs_ioctl_qgroup_create_args { | ||
391 | __u64 create; | ||
392 | __u64 qgroupid; | ||
393 | }; | ||
394 | struct btrfs_ioctl_timespec { | ||
395 | __u64 sec; | ||
396 | __u32 nsec; | ||
397 | }; | ||
398 | |||
399 | struct btrfs_ioctl_received_subvol_args { | ||
400 | char uuid[BTRFS_UUID_SIZE]; /* in */ | ||
401 | __u64 stransid; /* in */ | ||
402 | __u64 rtransid; /* out */ | ||
403 | struct btrfs_ioctl_timespec stime; /* in */ | ||
404 | struct btrfs_ioctl_timespec rtime; /* out */ | ||
405 | __u64 flags; /* in */ | ||
406 | __u64 reserved[16]; /* in */ | ||
407 | }; | ||
408 | |||
409 | struct btrfs_ioctl_send_args { | ||
410 | __s64 send_fd; /* in */ | ||
411 | __u64 clone_sources_count; /* in */ | ||
412 | __u64 __user *clone_sources; /* in */ | ||
413 | __u64 parent_root; /* in */ | ||
414 | __u64 flags; /* in */ | ||
415 | __u64 reserved[4]; /* in */ | ||
416 | }; | ||
417 | |||
418 | #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ | ||
419 | struct btrfs_ioctl_vol_args) | ||
420 | #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ | ||
421 | struct btrfs_ioctl_vol_args) | ||
422 | #define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ | ||
423 | struct btrfs_ioctl_vol_args) | ||
424 | #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ | ||
425 | struct btrfs_ioctl_vol_args) | ||
426 | /* trans start and trans end are dangerous, and only for | ||
427 | * use by applications that know how to avoid the | ||
428 | * resulting deadlocks | ||
429 | */ | ||
430 | #define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) | ||
431 | #define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) | ||
432 | #define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) | ||
433 | |||
434 | #define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) | ||
435 | #define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ | ||
436 | struct btrfs_ioctl_vol_args) | ||
437 | #define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ | ||
438 | struct btrfs_ioctl_vol_args) | ||
439 | #define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ | ||
440 | struct btrfs_ioctl_vol_args) | ||
441 | |||
442 | #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ | ||
443 | struct btrfs_ioctl_clone_range_args) | ||
444 | |||
445 | #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ | ||
446 | struct btrfs_ioctl_vol_args) | ||
447 | #define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ | ||
448 | struct btrfs_ioctl_vol_args) | ||
449 | #define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \ | ||
450 | struct btrfs_ioctl_defrag_range_args) | ||
451 | #define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \ | ||
452 | struct btrfs_ioctl_search_args) | ||
453 | #define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \ | ||
454 | struct btrfs_ioctl_ino_lookup_args) | ||
455 | #define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) | ||
456 | #define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ | ||
457 | struct btrfs_ioctl_space_args) | ||
458 | #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) | ||
459 | #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) | ||
460 | #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ | ||
461 | struct btrfs_ioctl_vol_args_v2) | ||
462 | #define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \ | ||
463 | struct btrfs_ioctl_vol_args_v2) | ||
464 | #define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64) | ||
465 | #define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) | ||
466 | #define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ | ||
467 | struct btrfs_ioctl_scrub_args) | ||
468 | #define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28) | ||
469 | #define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \ | ||
470 | struct btrfs_ioctl_scrub_args) | ||
471 | #define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \ | ||
472 | struct btrfs_ioctl_dev_info_args) | ||
473 | #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ | ||
474 | struct btrfs_ioctl_fs_info_args) | ||
475 | #define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ | ||
476 | struct btrfs_ioctl_balance_args) | ||
477 | #define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) | ||
478 | #define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ | ||
479 | struct btrfs_ioctl_balance_args) | ||
480 | #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ | ||
481 | struct btrfs_ioctl_ino_path_args) | ||
482 | #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ | ||
483 | struct btrfs_ioctl_ino_path_args) | ||
484 | #define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \ | ||
485 | struct btrfs_ioctl_received_subvol_args) | ||
486 | #define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args) | ||
487 | #define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \ | ||
488 | struct btrfs_ioctl_vol_args) | ||
489 | #define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \ | ||
490 | struct btrfs_ioctl_quota_ctl_args) | ||
491 | #define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \ | ||
492 | struct btrfs_ioctl_qgroup_assign_args) | ||
493 | #define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \ | ||
494 | struct btrfs_ioctl_qgroup_create_args) | ||
495 | #define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \ | ||
496 | struct btrfs_ioctl_qgroup_limit_args) | ||
497 | #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ | ||
498 | struct btrfs_ioctl_get_dev_stats) | ||
499 | #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ | ||
500 | struct btrfs_ioctl_dev_replace_args) | ||
501 | |||
502 | #endif | ||
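
The private ioctl.h is deleted outright; its ABI definitions move to the shared <linux/btrfs.h> header (see the #include switch in the qgroup.c hunk further down). For orientation, a sketch of how userspace consumes those definitions, here driving the TREE_SEARCH ioctl documented in the removed comments (error handling elided):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>        /* the new home of this ioctl ABI */

    /* Ask TREE_SEARCH for everything in the tree of tree roots (sketch). */
    static int search_root_tree(int fd)
    {
            struct btrfs_ioctl_search_args args;

            memset(&args, 0, sizeof(args));
            args.key.tree_id = 1;                   /* tree of tree roots */
            args.key.max_objectid = (__u64)-1;      /* widest possible key range */
            args.key.max_offset = (__u64)-1;
            args.key.max_transid = (__u64)-1;
            args.key.max_type = (__u32)-1;
            args.key.nr_items = 4096;               /* capped by what fits in buf */

            return ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args);
    }
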
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 2a1762c66041..e95df435d897 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c | |||
@@ -113,11 +113,10 @@ again: | |||
113 | read_unlock(&eb->lock); | 113 | read_unlock(&eb->lock); |
114 | return; | 114 | return; |
115 | } | 115 | } |
116 | read_unlock(&eb->lock); | ||
117 | wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); | ||
118 | read_lock(&eb->lock); | ||
119 | if (atomic_read(&eb->blocking_writers)) { | 116 | if (atomic_read(&eb->blocking_writers)) { |
120 | read_unlock(&eb->lock); | 117 | read_unlock(&eb->lock); |
118 | wait_event(eb->write_lock_wq, | ||
119 | atomic_read(&eb->blocking_writers) == 0); | ||
121 | goto again; | 120 | goto again; |
122 | } | 121 | } |
123 | atomic_inc(&eb->read_locks); | 122 | atomic_inc(&eb->read_locks); |
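
The rewritten slow path drops the read side, waits for the blocking writers to drain, and retries from the top, instead of re-taking the lock just to test the condition again before looping. The same wait-then-recheck shape with pthreads (illustrative only, not the btrfs locking code):

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t writers_gone = PTHREAD_COND_INITIALIZER;
    static int blocking_writers;
    static int read_locks;

    static void tree_read_lock(void)
    {
            pthread_mutex_lock(&lock);
            while (blocking_writers) {
                    /* the wait drops the lock; the while re-checks after waking */
                    pthread_cond_wait(&writers_gone, &lock);
            }
            read_locks++;
            pthread_mutex_unlock(&lock);
    }
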
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e5ed56729607..dc08d77b717e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c | |||
@@ -196,6 +196,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, | |||
196 | entry->file_offset = file_offset; | 196 | entry->file_offset = file_offset; |
197 | entry->start = start; | 197 | entry->start = start; |
198 | entry->len = len; | 198 | entry->len = len; |
199 | if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && | ||
200 | !(type == BTRFS_ORDERED_NOCOW)) | ||
201 | entry->csum_bytes_left = disk_len; | ||
199 | entry->disk_len = disk_len; | 202 | entry->disk_len = disk_len; |
200 | entry->bytes_left = len; | 203 | entry->bytes_left = len; |
201 | entry->inode = igrab(inode); | 204 | entry->inode = igrab(inode); |
@@ -213,6 +216,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, | |||
213 | INIT_LIST_HEAD(&entry->root_extent_list); | 216 | INIT_LIST_HEAD(&entry->root_extent_list); |
214 | INIT_LIST_HEAD(&entry->work_list); | 217 | INIT_LIST_HEAD(&entry->work_list); |
215 | init_completion(&entry->completion); | 218 | init_completion(&entry->completion); |
219 | INIT_LIST_HEAD(&entry->log_list); | ||
216 | 220 | ||
217 | trace_btrfs_ordered_extent_add(inode, entry); | 221 | trace_btrfs_ordered_extent_add(inode, entry); |
218 | 222 | ||
@@ -270,6 +274,10 @@ void btrfs_add_ordered_sum(struct inode *inode, | |||
270 | tree = &BTRFS_I(inode)->ordered_tree; | 274 | tree = &BTRFS_I(inode)->ordered_tree; |
271 | spin_lock_irq(&tree->lock); | 275 | spin_lock_irq(&tree->lock); |
272 | list_add_tail(&sum->list, &entry->list); | 276 | list_add_tail(&sum->list, &entry->list); |
277 | WARN_ON(entry->csum_bytes_left < sum->len); | ||
278 | entry->csum_bytes_left -= sum->len; | ||
279 | if (entry->csum_bytes_left == 0) | ||
280 | wake_up(&entry->wait); | ||
273 | spin_unlock_irq(&tree->lock); | 281 | spin_unlock_irq(&tree->lock); |
274 | } | 282 | } |
275 | 283 | ||
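
Each ordered extent now carries csum_bytes_left, initialized to the disk length and decremented as checksums are attached; when the count reaches zero, waiters are woken. The countdown-to-zero completion idiom in standalone form (illustrative):

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
    static unsigned long long csum_bytes_left = 1 << 20;   /* set at creation */

    static void account_csums(unsigned long long len)
    {
            pthread_mutex_lock(&lock);
            csum_bytes_left -= len;         /* the kernel WARNs on underflow */
            if (csum_bytes_left == 0)
                    pthread_cond_broadcast(&done);  /* wake anyone waiting */
            pthread_mutex_unlock(&lock);
    }

    static void wait_for_csums(void)
    {
            pthread_mutex_lock(&lock);
            while (csum_bytes_left)
                    pthread_cond_wait(&done, &lock);
            pthread_mutex_unlock(&lock);
    }
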
@@ -405,6 +413,66 @@ out: | |||
405 | return ret == 0; | 413 | return ret == 0; |
406 | } | 414 | } |
407 | 415 | ||
416 | /* Needs to be called either under a log transaction or under the log_mutex */ | ||
417 | void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) | ||
418 | { | ||
419 | struct btrfs_ordered_inode_tree *tree; | ||
420 | struct btrfs_ordered_extent *ordered; | ||
421 | struct rb_node *n; | ||
422 | int index = log->log_transid % 2; | ||
423 | |||
424 | tree = &BTRFS_I(inode)->ordered_tree; | ||
425 | spin_lock_irq(&tree->lock); | ||
426 | for (n = rb_first(&tree->tree); n; n = rb_next(n)) { | ||
427 | ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); | ||
428 | spin_lock(&log->log_extents_lock[index]); | ||
429 | if (list_empty(&ordered->log_list)) { | ||
430 | list_add_tail(&ordered->log_list, &log->logged_list[index]); | ||
431 | atomic_inc(&ordered->refs); | ||
432 | } | ||
433 | spin_unlock(&log->log_extents_lock[index]); | ||
434 | } | ||
435 | spin_unlock_irq(&tree->lock); | ||
436 | } | ||
437 | |||
438 | void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) | ||
439 | { | ||
440 | struct btrfs_ordered_extent *ordered; | ||
441 | int index = transid % 2; | ||
442 | |||
443 | spin_lock_irq(&log->log_extents_lock[index]); | ||
444 | while (!list_empty(&log->logged_list[index])) { | ||
445 | ordered = list_first_entry(&log->logged_list[index], | ||
446 | struct btrfs_ordered_extent, | ||
447 | log_list); | ||
448 | list_del_init(&ordered->log_list); | ||
449 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
450 | wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, | ||
451 | &ordered->flags)); | ||
452 | btrfs_put_ordered_extent(ordered); | ||
453 | spin_lock_irq(&log->log_extents_lock[index]); | ||
454 | } | ||
455 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
456 | } | ||
457 | |||
458 | void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid) | ||
459 | { | ||
460 | struct btrfs_ordered_extent *ordered; | ||
461 | int index = transid % 2; | ||
462 | |||
463 | spin_lock_irq(&log->log_extents_lock[index]); | ||
464 | while (!list_empty(&log->logged_list[index])) { | ||
465 | ordered = list_first_entry(&log->logged_list[index], | ||
466 | struct btrfs_ordered_extent, | ||
467 | log_list); | ||
468 | list_del_init(&ordered->log_list); | ||
469 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
470 | btrfs_put_ordered_extent(ordered); | ||
471 | spin_lock_irq(&log->log_extents_lock[index]); | ||
472 | } | ||
473 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
474 | } | ||
475 | |||
408 | /* | 476 | /* |
409 | * used to drop a reference on an ordered extent. This will free | 477 | * used to drop a reference on an ordered extent. This will free |
410 | * the extent if the last reference is dropped | 478 | * the extent if the last reference is dropped |
@@ -544,10 +612,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) | |||
544 | * extra check to make sure the ordered operation list really is empty | 612 | * extra check to make sure the ordered operation list really is empty |
545 | * before we return | 613 | * before we return |
546 | */ | 614 | */ |
547 | int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) | 615 | int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, |
616 | struct btrfs_root *root, int wait) | ||
548 | { | 617 | { |
549 | struct btrfs_inode *btrfs_inode; | 618 | struct btrfs_inode *btrfs_inode; |
550 | struct inode *inode; | 619 | struct inode *inode; |
620 | struct btrfs_transaction *cur_trans = trans->transaction; | ||
551 | struct list_head splice; | 621 | struct list_head splice; |
552 | struct list_head works; | 622 | struct list_head works; |
553 | struct btrfs_delalloc_work *work, *next; | 623 | struct btrfs_delalloc_work *work, *next; |
@@ -558,14 +628,10 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) | |||
558 | 628 | ||
559 | mutex_lock(&root->fs_info->ordered_operations_mutex); | 629 | mutex_lock(&root->fs_info->ordered_operations_mutex); |
560 | spin_lock(&root->fs_info->ordered_extent_lock); | 630 | spin_lock(&root->fs_info->ordered_extent_lock); |
561 | again: | 631 | list_splice_init(&cur_trans->ordered_operations, &splice); |
562 | list_splice_init(&root->fs_info->ordered_operations, &splice); | ||
563 | |||
564 | while (!list_empty(&splice)) { | 632 | while (!list_empty(&splice)) { |
565 | |||
566 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, | 633 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, |
567 | ordered_operations); | 634 | ordered_operations); |
568 | |||
569 | inode = &btrfs_inode->vfs_inode; | 635 | inode = &btrfs_inode->vfs_inode; |
570 | 636 | ||
571 | list_del_init(&btrfs_inode->ordered_operations); | 637 | list_del_init(&btrfs_inode->ordered_operations); |
@@ -574,24 +640,22 @@ again: | |||
574 | * the inode may be getting freed (in sys_unlink path). | 640 | * the inode may be getting freed (in sys_unlink path). |
575 | */ | 641 | */ |
576 | inode = igrab(inode); | 642 | inode = igrab(inode); |
577 | |||
578 | if (!wait && inode) { | ||
579 | list_add_tail(&BTRFS_I(inode)->ordered_operations, | ||
580 | &root->fs_info->ordered_operations); | ||
581 | } | ||
582 | |||
583 | if (!inode) | 643 | if (!inode) |
584 | continue; | 644 | continue; |
645 | |||
646 | if (!wait) | ||
647 | list_add_tail(&BTRFS_I(inode)->ordered_operations, | ||
648 | &cur_trans->ordered_operations); | ||
585 | spin_unlock(&root->fs_info->ordered_extent_lock); | 649 | spin_unlock(&root->fs_info->ordered_extent_lock); |
586 | 650 | ||
587 | work = btrfs_alloc_delalloc_work(inode, wait, 1); | 651 | work = btrfs_alloc_delalloc_work(inode, wait, 1); |
588 | if (!work) { | 652 | if (!work) { |
653 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
589 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) | 654 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) |
590 | list_add_tail(&btrfs_inode->ordered_operations, | 655 | list_add_tail(&btrfs_inode->ordered_operations, |
591 | &splice); | 656 | &splice); |
592 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
593 | list_splice_tail(&splice, | 657 | list_splice_tail(&splice, |
594 | &root->fs_info->ordered_operations); | 658 | &cur_trans->ordered_operations); |
595 | spin_unlock(&root->fs_info->ordered_extent_lock); | 659 | spin_unlock(&root->fs_info->ordered_extent_lock); |
596 | ret = -ENOMEM; | 660 | ret = -ENOMEM; |
597 | goto out; | 661 | goto out; |
@@ -603,9 +667,6 @@ again: | |||
603 | cond_resched(); | 667 | cond_resched(); |
604 | spin_lock(&root->fs_info->ordered_extent_lock); | 668 | spin_lock(&root->fs_info->ordered_extent_lock); |
605 | } | 669 | } |
606 | if (wait && !list_empty(&root->fs_info->ordered_operations)) | ||
607 | goto again; | ||
608 | |||
609 | spin_unlock(&root->fs_info->ordered_extent_lock); | 670 | spin_unlock(&root->fs_info->ordered_extent_lock); |
610 | out: | 671 | out: |
611 | list_for_each_entry_safe(work, next, &works, list) { | 672 | list_for_each_entry_safe(work, next, &works, list) { |
@@ -974,6 +1035,7 @@ out: | |||
974 | void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, | 1035 | void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, |
975 | struct btrfs_root *root, struct inode *inode) | 1036 | struct btrfs_root *root, struct inode *inode) |
976 | { | 1037 | { |
1038 | struct btrfs_transaction *cur_trans = trans->transaction; | ||
977 | u64 last_mod; | 1039 | u64 last_mod; |
978 | 1040 | ||
979 | last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); | 1041 | last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); |
@@ -988,7 +1050,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, | |||
988 | spin_lock(&root->fs_info->ordered_extent_lock); | 1050 | spin_lock(&root->fs_info->ordered_extent_lock); |
989 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) { | 1051 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) { |
990 | list_add_tail(&BTRFS_I(inode)->ordered_operations, | 1052 | list_add_tail(&BTRFS_I(inode)->ordered_operations, |
991 | &root->fs_info->ordered_operations); | 1053 | &cur_trans->ordered_operations); |
992 | } | 1054 | } |
993 | spin_unlock(&root->fs_info->ordered_extent_lock); | 1055 | spin_unlock(&root->fs_info->ordered_extent_lock); |
994 | } | 1056 | } |
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f29d4bf5fbe7..8eadfe406cdd 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h | |||
@@ -79,6 +79,8 @@ struct btrfs_ordered_sum { | |||
79 | #define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent | 79 | #define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent |
80 | * has done its due diligence in updating | 80 | * has done its due diligence in updating |
81 | * the isize. */ | 81 | * the isize. */ |
82 | #define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this | ||
83 | ordered extent */ | ||
82 | 84 | ||
83 | struct btrfs_ordered_extent { | 85 | struct btrfs_ordered_extent { |
84 | /* logical offset in the file */ | 86 | /* logical offset in the file */ |
@@ -96,6 +98,9 @@ struct btrfs_ordered_extent { | |||
96 | /* number of bytes that still need writing */ | 98 | /* number of bytes that still need writing */ |
97 | u64 bytes_left; | 99 | u64 bytes_left; |
98 | 100 | ||
101 | /* number of bytes that still need csumming */ | ||
102 | u64 csum_bytes_left; | ||
103 | |||
99 | /* | 104 | /* |
100 | * the end of the ordered extent which is behind it but | 105 | * the end of the ordered extent which is behind it but |
101 | * didn't update disk_i_size. Please see the comment of | 106 | * didn't update disk_i_size. Please see the comment of |
@@ -118,6 +123,9 @@ struct btrfs_ordered_extent { | |||
118 | /* list of checksums for insertion when the extent io is done */ | 123 | /* list of checksums for insertion when the extent io is done */ |
119 | struct list_head list; | 124 | struct list_head list; |
120 | 125 | ||
126 | /* If we need to wait on this to be done */ | ||
127 | struct list_head log_list; | ||
128 | |||
121 | /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ | 129 | /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ |
122 | wait_queue_head_t wait; | 130 | wait_queue_head_t wait; |
123 | 131 | ||
@@ -189,11 +197,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, | |||
189 | int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, | 197 | int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, |
190 | struct btrfs_ordered_extent *ordered); | 198 | struct btrfs_ordered_extent *ordered); |
191 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); | 199 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); |
192 | int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); | 200 | int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, |
201 | struct btrfs_root *root, int wait); | ||
193 | void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, | 202 | void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, |
194 | struct btrfs_root *root, | 203 | struct btrfs_root *root, |
195 | struct inode *inode); | 204 | struct inode *inode); |
196 | void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); | 205 | void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); |
206 | void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); | ||
207 | void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); | ||
208 | void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); | ||
197 | int __init ordered_data_init(void); | 209 | int __init ordered_data_init(void); |
198 | void ordered_data_exit(void); | 210 | void ordered_data_exit(void); |
199 | #endif | 211 | #endif |
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 50d95fd190a5..920957ecb27e 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c | |||
@@ -294,6 +294,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) | |||
294 | btrfs_dev_extent_chunk_offset(l, dev_extent), | 294 | btrfs_dev_extent_chunk_offset(l, dev_extent), |
295 | (unsigned long long) | 295 | (unsigned long long) |
296 | btrfs_dev_extent_length(l, dev_extent)); | 296 | btrfs_dev_extent_length(l, dev_extent)); |
297 | break; | ||
297 | case BTRFS_DEV_STATS_KEY: | 298 | case BTRFS_DEV_STATS_KEY: |
298 | printk(KERN_INFO "\t\tdevice stats\n"); | 299 | printk(KERN_INFO "\t\tdevice stats\n"); |
299 | break; | 300 | break; |
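
The added break stops the dev-extent case from falling through and also printing the device-stats line. The bug class in miniature:

    #include <stdio.h>

    static void print_key(int type)
    {
            switch (type) {
            case 1:
                    printf("dev extent\n");
                    break;          /* without this, case 2's body also runs */
            case 2:
                    printf("device stats\n");
                    break;
            }
    }
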
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a5c856234323..aee4b1cc3d98 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c | |||
@@ -23,13 +23,13 @@ | |||
23 | #include <linux/rbtree.h> | 23 | #include <linux/rbtree.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/workqueue.h> | 25 | #include <linux/workqueue.h> |
26 | #include <linux/btrfs.h> | ||
26 | 27 | ||
27 | #include "ctree.h" | 28 | #include "ctree.h" |
28 | #include "transaction.h" | 29 | #include "transaction.h" |
29 | #include "disk-io.h" | 30 | #include "disk-io.h" |
30 | #include "locking.h" | 31 | #include "locking.h" |
31 | #include "ulist.h" | 32 | #include "ulist.h" |
32 | #include "ioctl.h" | ||
33 | #include "backref.h" | 33 | #include "backref.h" |
34 | 34 | ||
35 | /* TODO XXX FIXME | 35 | /* TODO XXX FIXME |
@@ -620,7 +620,9 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, | |||
620 | key.offset = qgroupid; | 620 | key.offset = qgroupid; |
621 | 621 | ||
622 | path = btrfs_alloc_path(); | 622 | path = btrfs_alloc_path(); |
623 | BUG_ON(!path); | 623 | if (!path) |
624 | return -ENOMEM; | ||
625 | |||
624 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); | 626 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); |
625 | if (ret > 0) | 627 | if (ret > 0) |
626 | ret = -ENOENT; | 628 | ret = -ENOENT; |
@@ -661,7 +663,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, | |||
661 | key.offset = qgroup->qgroupid; | 663 | key.offset = qgroup->qgroupid; |
662 | 664 | ||
663 | path = btrfs_alloc_path(); | 665 | path = btrfs_alloc_path(); |
664 | BUG_ON(!path); | 666 | if (!path) |
667 | return -ENOMEM; | ||
668 | |||
665 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); | 669 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); |
666 | if (ret > 0) | 670 | if (ret > 0) |
667 | ret = -ENOENT; | 671 | ret = -ENOENT; |
@@ -702,7 +706,9 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans, | |||
702 | key.offset = 0; | 706 | key.offset = 0; |
703 | 707 | ||
704 | path = btrfs_alloc_path(); | 708 | path = btrfs_alloc_path(); |
705 | BUG_ON(!path); | 709 | if (!path) |
710 | return -ENOMEM; | ||
711 | |||
706 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); | 712 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); |
707 | if (ret > 0) | 713 | if (ret > 0) |
708 | ret = -ENOENT; | 714 | ret = -ENOENT; |
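
These qgroup helpers now fail gracefully when btrfs_alloc_path() returns NULL, propagating -ENOMEM to the caller instead of crashing the machine with BUG_ON(). The preferred shape:

    #include <errno.h>
    #include <stdlib.h>

    static int update_item(void)
    {
            void *path = malloc(64);        /* stand-in for btrfs_alloc_path() */

            if (!path)
                    return -ENOMEM;         /* propagate; don't BUG_ON() */
            /* ... search and update the item ... */
            free(path);
            return 0;
    }
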
@@ -732,33 +738,38 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, | |||
732 | { | 738 | { |
733 | struct btrfs_path *path; | 739 | struct btrfs_path *path; |
734 | struct btrfs_key key; | 740 | struct btrfs_key key; |
741 | struct extent_buffer *leaf = NULL; | ||
735 | int ret; | 742 | int ret; |
736 | 743 | int nr = 0; | |
737 | if (!root) | ||
738 | return -EINVAL; | ||
739 | 744 | ||
740 | path = btrfs_alloc_path(); | 745 | path = btrfs_alloc_path(); |
741 | if (!path) | 746 | if (!path) |
742 | return -ENOMEM; | 747 | return -ENOMEM; |
743 | 748 | ||
744 | while (1) { | 749 | path->leave_spinning = 1; |
745 | key.objectid = 0; | ||
746 | key.offset = 0; | ||
747 | key.type = 0; | ||
748 | 750 | ||
749 | path->leave_spinning = 1; | 751 | key.objectid = 0; |
752 | key.offset = 0; | ||
753 | key.type = 0; | ||
754 | |||
755 | while (1) { | ||
750 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | 756 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); |
751 | if (ret > 0) { | 757 | if (ret < 0) |
752 | if (path->slots[0] == 0) | 758 | goto out; |
753 | break; | 759 | leaf = path->nodes[0]; |
754 | path->slots[0]--; | 760 | nr = btrfs_header_nritems(leaf); |
755 | } else if (ret < 0) { | 761 | if (!nr) |
756 | break; | 762 | break; |
757 | } | 763 | /* |
758 | 764 | * delete the leaves one by one |
759 | ret = btrfs_del_item(trans, root, path); | 765 | * since the whole tree is going |
766 | * to be deleted. | ||
767 | */ | ||
768 | path->slots[0] = 0; | ||
769 | ret = btrfs_del_items(trans, root, path, 0, nr); | ||
760 | if (ret) | 770 | if (ret) |
761 | goto out; | 771 | goto out; |
772 | |||
762 | btrfs_release_path(path); | 773 | btrfs_release_path(path); |
763 | } | 774 | } |
764 | ret = 0; | 775 | ret = 0; |
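
btrfs_clean_quota_tree() is rewritten to empty the tree one whole leaf per pass: search to the first leaf, count its items, and remove them with a single btrfs_del_items() call, i.e. O(leaves) searches instead of O(items). In outline, with stand-in primitives:

    /* Illustrative stand-ins for the btree primitives used below. */
    static int search_first_leaf(void)       { return 0; }
    static int items_in_current_leaf(void)   { return 0; }
    static int delete_items_at_slot0(int nr) { (void)nr; return 0; }

    /* Drain a tree leaf-by-leaf: one search plus one multi-item delete
     * per leaf, repeating until a search finds an empty leaf. */
    static int drain_tree(void)
    {
            int nr;

            for (;;) {
                    if (search_first_leaf() < 0)    /* re-search from key 0 */
                            return -1;
                    nr = items_in_current_leaf();
                    if (nr == 0)
                            break;                  /* tree is empty */
                    if (delete_items_at_slot0(nr))  /* one multi-item delete */
                            return -1;
            }
            return 0;
    }
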
@@ -847,6 +858,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, | |||
847 | int ret = 0; | 858 | int ret = 0; |
848 | 859 | ||
849 | spin_lock(&fs_info->qgroup_lock); | 860 | spin_lock(&fs_info->qgroup_lock); |
861 | if (!fs_info->quota_root) { | ||
862 | spin_unlock(&fs_info->qgroup_lock); | ||
863 | return 0; | ||
864 | } | ||
850 | fs_info->quota_enabled = 0; | 865 | fs_info->quota_enabled = 0; |
851 | fs_info->pending_quota_state = 0; | 866 | fs_info->pending_quota_state = 0; |
852 | quota_root = fs_info->quota_root; | 867 | quota_root = fs_info->quota_root; |
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c new file mode 100644 index 000000000000..07222053c7d8 --- /dev/null +++ b/fs/btrfs/raid56.c | |||
@@ -0,0 +1,2099 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012 Fusion-io All rights reserved. | ||
3 | * Copyright (C) 2012 Intel Corp. All rights reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public | ||
7 | * License v2 as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
12 | * General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public | ||
15 | * License along with this program; if not, write to the | ||
16 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
17 | * Boston, MA 02111-1307, USA. | ||
18 | */ | ||
19 | #include <linux/sched.h> | ||
20 | #include <linux/wait.h> | ||
21 | #include <linux/bio.h> | ||
22 | #include <linux/slab.h> | ||
23 | #include <linux/buffer_head.h> | ||
24 | #include <linux/blkdev.h> | ||
25 | #include <linux/random.h> | ||
26 | #include <linux/iocontext.h> | ||
27 | #include <linux/capability.h> | ||
28 | #include <linux/ratelimit.h> | ||
29 | #include <linux/kthread.h> | ||
30 | #include <linux/raid/pq.h> | ||
31 | #include <linux/hash.h> | ||
32 | #include <linux/list_sort.h> | ||
33 | #include <linux/raid/xor.h> | ||
34 | #include <asm/div64.h> | ||
35 | #include "compat.h" | ||
36 | #include "ctree.h" | ||
37 | #include "extent_map.h" | ||
38 | #include "disk-io.h" | ||
39 | #include "transaction.h" | ||
40 | #include "print-tree.h" | ||
41 | #include "volumes.h" | ||
42 | #include "raid56.h" | ||
43 | #include "async-thread.h" | ||
44 | #include "check-integrity.h" | ||
45 | #include "rcu-string.h" | ||
46 | |||
47 | /* set when additional merges to this rbio are not allowed */ | ||
48 | #define RBIO_RMW_LOCKED_BIT 1 | ||
49 | |||
50 | /* | ||
51 | * set when this rbio is sitting in the hash, but it is just a cache | ||
52 | * of past RMW | ||
53 | */ | ||
54 | #define RBIO_CACHE_BIT 2 | ||
55 | |||
56 | /* | ||
57 | * set when it is safe to trust the stripe_pages for caching | ||
58 | */ | ||
59 | #define RBIO_CACHE_READY_BIT 3 | ||
60 | |||
61 | |||
62 | #define RBIO_CACHE_SIZE 1024 | ||
63 | |||
64 | struct btrfs_raid_bio { | ||
65 | struct btrfs_fs_info *fs_info; | ||
66 | struct btrfs_bio *bbio; | ||
67 | |||
68 | /* | ||
69 | * logical block numbers for the start of each stripe | ||
70 | * The last one or two are p/q. These are sorted, | ||
71 | * so raid_map[0] is the start of our full stripe | ||
72 | */ | ||
73 | u64 *raid_map; | ||
74 | |||
75 | /* while we're doing rmw on a stripe | ||
76 | * we put it into a hash table so we can | ||
77 | * lock the stripe and merge more rbios | ||
78 | * into it. | ||
79 | */ | ||
80 | struct list_head hash_list; | ||
81 | |||
82 | /* | ||
83 | * LRU list for the stripe cache | ||
84 | */ | ||
85 | struct list_head stripe_cache; | ||
86 | |||
87 | /* | ||
88 | * for scheduling work in the helper threads | ||
89 | */ | ||
90 | struct btrfs_work work; | ||
91 | |||
92 | /* | ||
93 | * bio list and bio_list_lock are used | ||
94 | * to add more bios into the stripe | ||
95 | * in hopes of avoiding the full rmw | ||
96 | */ | ||
97 | struct bio_list bio_list; | ||
98 | spinlock_t bio_list_lock; | ||
99 | |||
100 | /* also protected by the bio_list_lock, the | ||
101 | * plug list is used by the plugging code | ||
102 | * to collect partial bios while plugged. The | ||
103 | * stripe locking code also uses it to hand off | ||
104 | * the stripe lock to the next pending IO | ||
105 | */ | ||
106 | struct list_head plug_list; | ||
107 | |||
108 | /* | ||
109 | * flags that tell us if it is safe to | ||
110 | * merge with this bio | ||
111 | */ | ||
112 | unsigned long flags; | ||
113 | |||
114 | /* size of each individual stripe on disk */ | ||
115 | int stripe_len; | ||
116 | |||
117 | /* number of data stripes (no p/q) */ | ||
118 | int nr_data; | ||
119 | |||
120 | /* | ||
121 | * set if we're doing a parity rebuild | ||
122 | * for a read from higher up, which is handled | ||
123 | * differently from a parity rebuild as part of | ||
124 | * rmw | ||
125 | */ | ||
126 | int read_rebuild; | ||
127 | |||
128 | /* first bad stripe */ | ||
129 | int faila; | ||
130 | |||
131 | /* second bad stripe (for raid6 use) */ | ||
132 | int failb; | ||
133 | |||
134 | /* | ||
135 | * number of pages needed to represent the full | ||
136 | * stripe | ||
137 | */ | ||
138 | int nr_pages; | ||
139 | |||
140 | /* | ||
141 | * size of all the bios in the bio_list. This | ||
142 | * helps us decide if the rbio maps to a full | ||
143 | * stripe or not | ||
144 | */ | ||
145 | int bio_list_bytes; | ||
146 | |||
147 | atomic_t refs; | ||
148 | |||
149 | /* | ||
150 | * these are two arrays of pointers. We allocate the | ||
151 | * rbio big enough to hold them both and setup their | ||
152 | * locations when the rbio is allocated | ||
153 | */ | ||
154 | |||
155 | /* pointers to pages that we allocated for | ||
156 | * reading/writing stripes directly from the disk (including P/Q) | ||
157 | */ | ||
158 | struct page **stripe_pages; | ||
159 | |||
160 | /* | ||
161 | * pointers to the pages in the bio_list. Stored | ||
162 | * here for faster lookup | ||
163 | */ | ||
164 | struct page **bio_pages; | ||
165 | }; | ||
166 | |||
167 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); | ||
168 | static noinline void finish_rmw(struct btrfs_raid_bio *rbio); | ||
169 | static void rmw_work(struct btrfs_work *work); | ||
170 | static void read_rebuild_work(struct btrfs_work *work); | ||
171 | static void async_rmw_stripe(struct btrfs_raid_bio *rbio); | ||
172 | static void async_read_rebuild(struct btrfs_raid_bio *rbio); | ||
173 | static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); | ||
174 | static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); | ||
175 | static void __free_raid_bio(struct btrfs_raid_bio *rbio); | ||
176 | static void index_rbio_pages(struct btrfs_raid_bio *rbio); | ||
177 | static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); | ||
178 | |||
179 | /* | ||
180 | * the stripe hash table is used for locking, and to collect | ||
181 | * bios in hopes of making a full stripe | ||
182 | */ | ||
183 | int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) | ||
184 | { | ||
185 | struct btrfs_stripe_hash_table *table; | ||
186 | struct btrfs_stripe_hash_table *x; | ||
187 | struct btrfs_stripe_hash *cur; | ||
188 | struct btrfs_stripe_hash *h; | ||
189 | int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; | ||
190 | int i; | ||
191 | int table_size; | ||
192 | |||
193 | if (info->stripe_hash_table) | ||
194 | return 0; | ||
195 | |||
196 | /* | ||
197 | * The table is large, starting with order 4 and can go as high as | ||
198 | * order 7 in case lock debugging is turned on. | ||
199 | * | ||
200 | * Try harder to allocate and fallback to vmalloc to lower the chance | ||
201 | * of a failing mount. | ||
202 | */ | ||
203 | table_size = sizeof(*table) + sizeof(*h) * num_entries; | ||
204 | table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); | ||
205 | if (!table) { | ||
206 | table = vzalloc(table_size); | ||
207 | if (!table) | ||
208 | return -ENOMEM; | ||
209 | } | ||
210 | |||
211 | spin_lock_init(&table->cache_lock); | ||
212 | INIT_LIST_HEAD(&table->stripe_cache); | ||
213 | |||
214 | h = table->table; | ||
215 | |||
216 | for (i = 0; i < num_entries; i++) { | ||
217 | cur = h + i; | ||
218 | INIT_LIST_HEAD(&cur->hash_list); | ||
219 | spin_lock_init(&cur->lock); | ||
220 | init_waitqueue_head(&cur->wait); | ||
221 | } | ||
222 | |||
223 | x = cmpxchg(&info->stripe_hash_table, NULL, table); | ||
224 | if (x) { | ||
225 | if (is_vmalloc_addr(table)) | ||
226 | vfree(table); | ||
227 | else | ||
228 | kfree(table); | ||
229 | } | ||
230 | return 0; | ||
231 | } | ||
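The function above follows an allocate-then-publish pattern: build the table privately, install it with a single atomic cmpxchg(), and have the loser of any race free its own unpublished copy. A minimal userspace sketch, using GCC's __sync_val_compare_and_swap as a stand-in for the kernel's cmpxchg(); publish_table and global_table are hypothetical names:

    #include <stdlib.h>

    struct table { int dummy; };
    static struct table *global_table;

    static int publish_table(void)
    {
            struct table *t, *old;

            if (global_table)               /* fast path: already installed */
                    return 0;

            t = calloc(1, sizeof(*t));      /* the kernel tries kzalloc, then vzalloc */
            if (!t)
                    return -1;

            /* install atomically; the builtin returns the previous value */
            old = __sync_val_compare_and_swap(&global_table, NULL, t);
            if (old != NULL)
                    free(t);                /* lost the race: keep the winner's table */
            return 0;
    }

    int main(void) { return publish_table(); }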
232 | |||
233 | /* | ||
234 | * caching an rbio means to copy anything from the | ||
235 | * bio_pages array into the stripe_pages array. We | ||
236 | * use the page uptodate bit in the stripe cache array | ||
237 | * to indicate if it has valid data | ||
238 | * | ||
239 | * once the caching is done, we set the cache ready | ||
240 | * bit. | ||
241 | */ | ||
242 | static void cache_rbio_pages(struct btrfs_raid_bio *rbio) | ||
243 | { | ||
244 | int i; | ||
245 | char *s; | ||
246 | char *d; | ||
247 | int ret; | ||
248 | |||
249 | ret = alloc_rbio_pages(rbio); | ||
250 | if (ret) | ||
251 | return; | ||
252 | |||
253 | for (i = 0; i < rbio->nr_pages; i++) { | ||
254 | if (!rbio->bio_pages[i]) | ||
255 | continue; | ||
256 | |||
257 | s = kmap(rbio->bio_pages[i]); | ||
258 | d = kmap(rbio->stripe_pages[i]); | ||
259 | |||
260 | memcpy(d, s, PAGE_CACHE_SIZE); | ||
261 | |||
262 | kunmap(rbio->bio_pages[i]); | ||
263 | kunmap(rbio->stripe_pages[i]); | ||
264 | SetPageUptodate(rbio->stripe_pages[i]); | ||
265 | } | ||
266 | set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * we hash on the first logical address of the stripe | ||
271 | */ | ||
272 | static int rbio_bucket(struct btrfs_raid_bio *rbio) | ||
273 | { | ||
274 | u64 num = rbio->raid_map[0]; | ||
275 | |||
276 | /* | ||
277 | * we shift down quite a bit. We're using byte | ||
278 | * addressing, and most of the lower bits are zeros. | ||
279 | * This tends to upset hash_64, and it consistently | ||
280 | * returns just one or two different values. | ||
281 | * | ||
282 | * shifting off the lower bits fixes things. | ||
283 | */ | ||
284 | return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); | ||
285 | } | ||
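To see why the shift helps: full stripe starts are byte addresses aligned to large power-of-two boundaries (typically 64K or more), so the low 16 bits carry no information. A userspace sketch, assuming the multiplicative hash_64() used by later kernels and a hypothetical TABLE_BITS; only the >> 16 comes from the code above:

    #include <stdint.h>
    #include <stdio.h>

    #define GOLDEN_RATIO_64 0x61C8864680B583EBull
    #define TABLE_BITS 11                   /* stand-in for BTRFS_STRIPE_HASH_TABLE_BITS */

    static unsigned bucket(uint64_t logical)
    {
            return (unsigned)(((logical >> 16) * GOLDEN_RATIO_64) >> (64 - TABLE_BITS));
    }

    int main(void)
    {
            /* hypothetical 64K-aligned full stripe starts spread across buckets */
            for (uint64_t addr = 1 << 20; addr < (1 << 20) + 4 * 65536; addr += 65536)
                    printf("%llu -> bucket %u\n", (unsigned long long)addr, bucket(addr));
            return 0;
    }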
286 | |||
287 | /* | ||
288 | * stealing an rbio means taking all the uptodate pages from the stripe | ||
289 | * array in the source rbio and putting them into the destination rbio | ||
290 | */ | ||
291 | static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) | ||
292 | { | ||
293 | int i; | ||
294 | struct page *s; | ||
295 | struct page *d; | ||
296 | |||
297 | if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) | ||
298 | return; | ||
299 | |||
300 | for (i = 0; i < dest->nr_pages; i++) { | ||
301 | s = src->stripe_pages[i]; | ||
302 | if (!s || !PageUptodate(s)) { | ||
303 | continue; | ||
304 | } | ||
305 | |||
306 | d = dest->stripe_pages[i]; | ||
307 | if (d) | ||
308 | __free_page(d); | ||
309 | |||
310 | dest->stripe_pages[i] = s; | ||
311 | src->stripe_pages[i] = NULL; | ||
312 | } | ||
313 | } | ||
314 | |||
315 | /* | ||
316 | * merging means we take the bio_list from the victim and | ||
317 | * splice it into the destination. The victim should | ||
318 | * be discarded afterwards. | ||
319 | * | ||
320 | * must be called with dest->bio_list_lock held | ||
321 | */ | ||
322 | static void merge_rbio(struct btrfs_raid_bio *dest, | ||
323 | struct btrfs_raid_bio *victim) | ||
324 | { | ||
325 | bio_list_merge(&dest->bio_list, &victim->bio_list); | ||
326 | dest->bio_list_bytes += victim->bio_list_bytes; | ||
327 | bio_list_init(&victim->bio_list); | ||
328 | } | ||
329 | |||
330 | /* | ||
331 | * used to prune items that are in the cache. The caller | ||
332 | * must hold the hash table lock. | ||
333 | */ | ||
334 | static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) | ||
335 | { | ||
336 | int bucket = rbio_bucket(rbio); | ||
337 | struct btrfs_stripe_hash_table *table; | ||
338 | struct btrfs_stripe_hash *h; | ||
339 | int freeit = 0; | ||
340 | |||
341 | /* | ||
342 | * check the bit again under the hash table lock. | ||
343 | */ | ||
344 | if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) | ||
345 | return; | ||
346 | |||
347 | table = rbio->fs_info->stripe_hash_table; | ||
348 | h = table->table + bucket; | ||
349 | |||
350 | /* hold the lock for the bucket because we may be | ||
351 | * removing it from the hash table | ||
352 | */ | ||
353 | spin_lock(&h->lock); | ||
354 | |||
355 | /* | ||
356 | * hold the lock for the bio list because we need | ||
357 | * to make sure the bio list is empty | ||
358 | */ | ||
359 | spin_lock(&rbio->bio_list_lock); | ||
360 | |||
361 | if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { | ||
362 | list_del_init(&rbio->stripe_cache); | ||
363 | table->cache_size -= 1; | ||
364 | freeit = 1; | ||
365 | |||
366 | /* if the bio list isn't empty, this rbio is | ||
367 | * still involved in an IO. We take it out | ||
368 | * of the cache list, and drop the ref that | ||
369 | * was held for the list. | ||
370 | * | ||
371 | * If the bio_list was empty, we also remove | ||
372 | * the rbio from the hash_table, and drop | ||
373 | * the corresponding ref | ||
374 | */ | ||
375 | if (bio_list_empty(&rbio->bio_list)) { | ||
376 | if (!list_empty(&rbio->hash_list)) { | ||
377 | list_del_init(&rbio->hash_list); | ||
378 | atomic_dec(&rbio->refs); | ||
379 | BUG_ON(!list_empty(&rbio->plug_list)); | ||
380 | } | ||
381 | } | ||
382 | } | ||
383 | |||
384 | spin_unlock(&rbio->bio_list_lock); | ||
385 | spin_unlock(&h->lock); | ||
386 | |||
387 | if (freeit) | ||
388 | __free_raid_bio(rbio); | ||
389 | } | ||
390 | |||
391 | /* | ||
392 | * prune a given rbio from the cache | ||
393 | */ | ||
394 | static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) | ||
395 | { | ||
396 | struct btrfs_stripe_hash_table *table; | ||
397 | unsigned long flags; | ||
398 | |||
399 | if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) | ||
400 | return; | ||
401 | |||
402 | table = rbio->fs_info->stripe_hash_table; | ||
403 | |||
404 | spin_lock_irqsave(&table->cache_lock, flags); | ||
405 | __remove_rbio_from_cache(rbio); | ||
406 | spin_unlock_irqrestore(&table->cache_lock, flags); | ||
407 | } | ||
408 | |||
409 | /* | ||
410 | * remove everything in the cache | ||
411 | */ | ||
412 | void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) | ||
413 | { | ||
414 | struct btrfs_stripe_hash_table *table; | ||
415 | unsigned long flags; | ||
416 | struct btrfs_raid_bio *rbio; | ||
417 | |||
418 | table = info->stripe_hash_table; | ||
419 | |||
420 | spin_lock_irqsave(&table->cache_lock, flags); | ||
421 | while (!list_empty(&table->stripe_cache)) { | ||
422 | rbio = list_entry(table->stripe_cache.next, | ||
423 | struct btrfs_raid_bio, | ||
424 | stripe_cache); | ||
425 | __remove_rbio_from_cache(rbio); | ||
426 | } | ||
427 | spin_unlock_irqrestore(&table->cache_lock, flags); | ||
428 | } | ||
429 | |||
430 | /* | ||
431 | * remove all cached entries and free the hash table | ||
432 | * used by unmount | ||
433 | */ | ||
434 | void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) | ||
435 | { | ||
436 | if (!info->stripe_hash_table) | ||
437 | return; | ||
438 | btrfs_clear_rbio_cache(info); | ||
439 | if (is_vmalloc_addr(info->stripe_hash_table)) | ||
440 | vfree(info->stripe_hash_table); | ||
441 | else | ||
442 | kfree(info->stripe_hash_table); | ||
443 | info->stripe_hash_table = NULL; | ||
444 | } | ||
445 | |||
446 | /* | ||
447 | * insert an rbio into the stripe cache. It | ||
448 | * must have already been prepared by calling | ||
449 | * cache_rbio_pages | ||
450 | * | ||
451 | * If this rbio was already cached, it gets | ||
452 | * moved to the front of the lru. | ||
453 | * | ||
454 | * If the size of the rbio cache is too big, we | ||
455 | * prune an item. | ||
456 | */ | ||
457 | static void cache_rbio(struct btrfs_raid_bio *rbio) | ||
458 | { | ||
459 | struct btrfs_stripe_hash_table *table; | ||
460 | unsigned long flags; | ||
461 | |||
462 | if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) | ||
463 | return; | ||
464 | |||
465 | table = rbio->fs_info->stripe_hash_table; | ||
466 | |||
467 | spin_lock_irqsave(&table->cache_lock, flags); | ||
468 | spin_lock(&rbio->bio_list_lock); | ||
469 | |||
470 | /* bump our ref if we were not in the list before */ | ||
471 | if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) | ||
472 | atomic_inc(&rbio->refs); | ||
473 | |||
474 | if (!list_empty(&rbio->stripe_cache)) { | ||
475 | list_move(&rbio->stripe_cache, &table->stripe_cache); | ||
476 | } else { | ||
477 | list_add(&rbio->stripe_cache, &table->stripe_cache); | ||
478 | table->cache_size += 1; | ||
479 | } | ||
480 | |||
481 | spin_unlock(&rbio->bio_list_lock); | ||
482 | |||
483 | if (table->cache_size > RBIO_CACHE_SIZE) { | ||
484 | struct btrfs_raid_bio *found; | ||
485 | |||
486 | found = list_entry(table->stripe_cache.prev, | ||
487 | struct btrfs_raid_bio, | ||
488 | stripe_cache); | ||
489 | |||
490 | if (found != rbio) | ||
491 | __remove_rbio_from_cache(found); | ||
492 | } | ||
493 | |||
494 | spin_unlock_irqrestore(&table->cache_lock, flags); | ||
495 | return; | ||
496 | } | ||
497 | |||
498 | /* | ||
499 | * helper function to run the xor_blocks api. It is only | ||
500 | * able to do MAX_XOR_BLOCKS at a time, so we need to | ||
501 | * loop through. | ||
502 | */ | ||
503 | static void run_xor(void **pages, int src_cnt, ssize_t len) | ||
504 | { | ||
505 | int src_off = 0; | ||
506 | int xor_src_cnt = 0; | ||
507 | void *dest = pages[src_cnt]; | ||
508 | |||
509 | while (src_cnt > 0) { | ||
510 | xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); | ||
511 | xor_blocks(xor_src_cnt, len, dest, pages + src_off); | ||
512 | |||
513 | src_cnt -= xor_src_cnt; | ||
514 | src_off += xor_src_cnt; | ||
515 | } | ||
516 | } | ||
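xor_blocks() accepts at most MAX_XOR_BLOCKS source buffers per call, which is all the loop above works around. A plain-C sketch of the same chunking, where xor_into and MAX_SRCS are hypothetical stand-ins for the library call and its limit:

    #include <stddef.h>

    #define MAX_SRCS 4                      /* stands in for MAX_XOR_BLOCKS */

    static void xor_into(unsigned char *dest, unsigned char **srcs, int cnt, size_t len)
    {
            for (int i = 0; i < cnt; i++)
                    for (size_t j = 0; j < len; j++)
                            dest[j] ^= srcs[i][j];
    }

    static void run_xor_sketch(unsigned char **pages, int src_cnt, size_t len)
    {
            unsigned char *dest = pages[src_cnt];
            int src_off = 0;

            while (src_cnt > 0) {
                    int n = src_cnt < MAX_SRCS ? src_cnt : MAX_SRCS;
                    xor_into(dest, pages + src_off, n, len);
                    src_cnt -= n;
                    src_off += n;
            }
    }

    int main(void)
    {
            unsigned char a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, d[4] = {0};
            unsigned char *pages[] = { a, b, d };

            run_xor_sketch(pages, 2, 4);    /* accumulates a ^ b into d */
            return d[0] == (1 ^ 5) ? 0 : 1;
    }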
517 | |||
518 | /* | ||
519 | * returns true if the bio list inside this rbio | ||
520 | * covers an entire stripe (no rmw required). | ||
521 | * Must be called with the bio list lock held, or | ||
522 | * at a time when you know it is impossible to add | ||
523 | * new bios into the list | ||
524 | */ | ||
525 | static int __rbio_is_full(struct btrfs_raid_bio *rbio) | ||
526 | { | ||
527 | unsigned long size = rbio->bio_list_bytes; | ||
528 | int ret = 1; | ||
529 | |||
530 | if (size != rbio->nr_data * rbio->stripe_len) | ||
531 | ret = 0; | ||
532 | |||
533 | BUG_ON(size > rbio->nr_data * rbio->stripe_len); | ||
534 | return ret; | ||
535 | } | ||
536 | |||
537 | static int rbio_is_full(struct btrfs_raid_bio *rbio) | ||
538 | { | ||
539 | unsigned long flags; | ||
540 | int ret; | ||
541 | |||
542 | spin_lock_irqsave(&rbio->bio_list_lock, flags); | ||
543 | ret = __rbio_is_full(rbio); | ||
544 | spin_unlock_irqrestore(&rbio->bio_list_lock, flags); | ||
545 | return ret; | ||
546 | } | ||
547 | |||
548 | /* | ||
549 | * returns 1 if it is safe to merge two rbios together. | ||
550 | * The merging is safe if the two rbios correspond to | ||
551 | * the same stripe and if they are both going in the same | ||
552 | * direction (read vs write), and if neither one is | ||
553 | * locked for final IO | ||
554 | * | ||
555 | * The caller is responsible for locking such that | ||
556 | * rmw_locked is safe to test | ||
557 | */ | ||
558 | static int rbio_can_merge(struct btrfs_raid_bio *last, | ||
559 | struct btrfs_raid_bio *cur) | ||
560 | { | ||
561 | if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || | ||
562 | test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) | ||
563 | return 0; | ||
564 | |||
565 | /* | ||
566 | * we can't merge with cached rbios, since the | ||
567 | * idea is that when we merge the destination | ||
568 | * rbio is going to run our IO for us. We can | ||
569 | * steal from cached rbio's though, other functions | ||
570 | * handle that. | ||
571 | */ | ||
572 | if (test_bit(RBIO_CACHE_BIT, &last->flags) || | ||
573 | test_bit(RBIO_CACHE_BIT, &cur->flags)) | ||
574 | return 0; | ||
575 | |||
576 | if (last->raid_map[0] != | ||
577 | cur->raid_map[0]) | ||
578 | return 0; | ||
579 | |||
580 | /* reads can't merge with writes */ | ||
581 | if (last->read_rebuild != | ||
582 | cur->read_rebuild) { | ||
583 | return 0; | ||
584 | } | ||
585 | |||
586 | return 1; | ||
587 | } | ||
588 | |||
589 | /* | ||
590 | * helper to index into the pstripe | ||
591 | */ | ||
592 | static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) | ||
593 | { | ||
594 | index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; | ||
595 | return rbio->stripe_pages[index]; | ||
596 | } | ||
597 | |||
598 | /* | ||
599 | * helper to index into the qstripe, returns null | ||
600 | * if there is no qstripe | ||
601 | */ | ||
602 | static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) | ||
603 | { | ||
604 | if (rbio->nr_data + 1 == rbio->bbio->num_stripes) | ||
605 | return NULL; | ||
606 | |||
607 | index += ((rbio->nr_data + 1) * rbio->stripe_len) >> | ||
608 | PAGE_CACHE_SHIFT; | ||
609 | return rbio->stripe_pages[index]; | ||
610 | } | ||
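Both helpers rely on stripe_pages being laid out flat: all data-stripe pages first, then the P pages, then (for raid6) the Q pages. Worked numbers for a hypothetical 64K stripe_len, 4K pages and two data stripes:

    #include <stdio.h>

    int main(void)
    {
            int stripe_len = 64 * 1024, page_size = 4096, nr_data = 2;
            int pages_per_stripe = stripe_len / page_size;   /* 16 */

            printf("P pages start at index %d\n", nr_data * pages_per_stripe);        /* 32 */
            printf("Q pages start at index %d\n", (nr_data + 1) * pages_per_stripe);  /* 48 */
            return 0;
    }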
611 | |||
612 | /* | ||
613 | * The first stripe in the table for a logical address | ||
614 | * has the lock. rbios are added in one of three ways: | ||
615 | * | ||
616 | * 1) Nobody has the stripe locked yet. The rbio is given | ||
617 | * the lock and 0 is returned. The caller must start the IO | ||
618 | * themselves. | ||
619 | * | ||
620 | * 2) Someone has the stripe locked, but we're able to merge | ||
621 | * with the lock owner. The rbio is freed and the IO will | ||
622 | * start automatically along with the existing rbio. 1 is returned. | ||
623 | * | ||
624 | * 3) Someone has the stripe locked, but we're not able to merge. | ||
625 | * The rbio is added to the lock owner's plug list, or merged into | ||
626 | * an rbio already on the plug list. When the lock owner unlocks, | ||
627 | * the next rbio on the list is run and the IO is started automatically. | ||
628 | * 1 is returned | ||
629 | * | ||
630 | * If we return 0, the caller still owns the rbio and must continue with | ||
631 | * IO submission. If we return 1, the caller must assume the rbio has | ||
632 | * already been freed. | ||
633 | */ | ||
634 | static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) | ||
635 | { | ||
636 | int bucket = rbio_bucket(rbio); | ||
637 | struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; | ||
638 | struct btrfs_raid_bio *cur; | ||
639 | struct btrfs_raid_bio *pending; | ||
640 | unsigned long flags; | ||
641 | DEFINE_WAIT(wait); | ||
642 | struct btrfs_raid_bio *freeit = NULL; | ||
643 | struct btrfs_raid_bio *cache_drop = NULL; | ||
644 | int ret = 0; | ||
645 | int walk = 0; | ||
646 | |||
647 | spin_lock_irqsave(&h->lock, flags); | ||
648 | list_for_each_entry(cur, &h->hash_list, hash_list) { | ||
649 | walk++; | ||
650 | if (cur->raid_map[0] == rbio->raid_map[0]) { | ||
651 | spin_lock(&cur->bio_list_lock); | ||
652 | |||
653 | /* can we steal this cached rbio's pages? */ | ||
654 | if (bio_list_empty(&cur->bio_list) && | ||
655 | list_empty(&cur->plug_list) && | ||
656 | test_bit(RBIO_CACHE_BIT, &cur->flags) && | ||
657 | !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { | ||
658 | list_del_init(&cur->hash_list); | ||
659 | atomic_dec(&cur->refs); | ||
660 | |||
661 | steal_rbio(cur, rbio); | ||
662 | cache_drop = cur; | ||
663 | spin_unlock(&cur->bio_list_lock); | ||
664 | |||
665 | goto lockit; | ||
666 | } | ||
667 | |||
668 | /* can we merge into the lock owner? */ | ||
669 | if (rbio_can_merge(cur, rbio)) { | ||
670 | merge_rbio(cur, rbio); | ||
671 | spin_unlock(&cur->bio_list_lock); | ||
672 | freeit = rbio; | ||
673 | ret = 1; | ||
674 | goto out; | ||
675 | } | ||
676 | |||
677 | |||
678 | /* | ||
679 | * we couldn't merge with the running | ||
680 | * rbio, see if we can merge with the | ||
681 | * pending ones. We don't have to | ||
682 | * check for rmw_locked because there | ||
683 | * is no way they are inside finish_rmw | ||
684 | * right now | ||
685 | */ | ||
686 | list_for_each_entry(pending, &cur->plug_list, | ||
687 | plug_list) { | ||
688 | if (rbio_can_merge(pending, rbio)) { | ||
689 | merge_rbio(pending, rbio); | ||
690 | spin_unlock(&cur->bio_list_lock); | ||
691 | freeit = rbio; | ||
692 | ret = 1; | ||
693 | goto out; | ||
694 | } | ||
695 | } | ||
696 | |||
697 | /* no merging, put us on the tail of the plug list, | ||
698 | * our rbio will be started when the currently | ||
699 | * running rbio unlocks | ||
700 | */ | ||
701 | list_add_tail(&rbio->plug_list, &cur->plug_list); | ||
702 | spin_unlock(&cur->bio_list_lock); | ||
703 | ret = 1; | ||
704 | goto out; | ||
705 | } | ||
706 | } | ||
707 | lockit: | ||
708 | atomic_inc(&rbio->refs); | ||
709 | list_add(&rbio->hash_list, &h->hash_list); | ||
710 | out: | ||
711 | spin_unlock_irqrestore(&h->lock, flags); | ||
712 | if (cache_drop) | ||
713 | remove_rbio_from_cache(cache_drop); | ||
714 | if (freeit) | ||
715 | __free_raid_bio(freeit); | ||
716 | return ret; | ||
717 | } | ||
718 | |||
719 | /* | ||
720 | * called as rmw or parity rebuild is completed. If the plug list has more | ||
721 | * rbios waiting for this stripe, the next one on the list will be started | ||
722 | */ | ||
723 | static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) | ||
724 | { | ||
725 | int bucket; | ||
726 | struct btrfs_stripe_hash *h; | ||
727 | unsigned long flags; | ||
728 | int keep_cache = 0; | ||
729 | |||
730 | bucket = rbio_bucket(rbio); | ||
731 | h = rbio->fs_info->stripe_hash_table->table + bucket; | ||
732 | |||
733 | if (list_empty(&rbio->plug_list)) | ||
734 | cache_rbio(rbio); | ||
735 | |||
736 | spin_lock_irqsave(&h->lock, flags); | ||
737 | spin_lock(&rbio->bio_list_lock); | ||
738 | |||
739 | if (!list_empty(&rbio->hash_list)) { | ||
740 | /* | ||
741 | * if we're still cached and there is no other IO | ||
742 | * to perform, just leave this rbio here for others | ||
743 | * to steal from later | ||
744 | */ | ||
745 | if (list_empty(&rbio->plug_list) && | ||
746 | test_bit(RBIO_CACHE_BIT, &rbio->flags)) { | ||
747 | keep_cache = 1; | ||
748 | clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | ||
749 | BUG_ON(!bio_list_empty(&rbio->bio_list)); | ||
750 | goto done; | ||
751 | } | ||
752 | |||
753 | list_del_init(&rbio->hash_list); | ||
754 | atomic_dec(&rbio->refs); | ||
755 | |||
756 | /* | ||
757 | * we use the plug list to hold all the rbios | ||
758 | * waiting for the chance to lock this stripe. | ||
759 | * hand the lock over to one of them. | ||
760 | */ | ||
761 | if (!list_empty(&rbio->plug_list)) { | ||
762 | struct btrfs_raid_bio *next; | ||
763 | struct list_head *head = rbio->plug_list.next; | ||
764 | |||
765 | next = list_entry(head, struct btrfs_raid_bio, | ||
766 | plug_list); | ||
767 | |||
768 | list_del_init(&rbio->plug_list); | ||
769 | |||
770 | list_add(&next->hash_list, &h->hash_list); | ||
771 | atomic_inc(&next->refs); | ||
772 | spin_unlock(&rbio->bio_list_lock); | ||
773 | spin_unlock_irqrestore(&h->lock, flags); | ||
774 | |||
775 | if (next->read_rebuild) | ||
776 | async_read_rebuild(next); | ||
777 | else { | ||
778 | steal_rbio(rbio, next); | ||
779 | async_rmw_stripe(next); | ||
780 | } | ||
781 | |||
782 | goto done_nolock; | ||
783 | } else if (waitqueue_active(&h->wait)) { | ||
784 | spin_unlock(&rbio->bio_list_lock); | ||
785 | spin_unlock_irqrestore(&h->lock, flags); | ||
786 | wake_up(&h->wait); | ||
787 | goto done_nolock; | ||
788 | } | ||
789 | } | ||
790 | done: | ||
791 | spin_unlock(&rbio->bio_list_lock); | ||
792 | spin_unlock_irqrestore(&h->lock, flags); | ||
793 | |||
794 | done_nolock: | ||
795 | if (!keep_cache) | ||
796 | remove_rbio_from_cache(rbio); | ||
797 | } | ||
798 | |||
799 | static void __free_raid_bio(struct btrfs_raid_bio *rbio) | ||
800 | { | ||
801 | int i; | ||
802 | |||
803 | WARN_ON(atomic_read(&rbio->refs) < 0); | ||
804 | if (!atomic_dec_and_test(&rbio->refs)) | ||
805 | return; | ||
806 | |||
807 | WARN_ON(!list_empty(&rbio->stripe_cache)); | ||
808 | WARN_ON(!list_empty(&rbio->hash_list)); | ||
809 | WARN_ON(!bio_list_empty(&rbio->bio_list)); | ||
810 | |||
811 | for (i = 0; i < rbio->nr_pages; i++) { | ||
812 | if (rbio->stripe_pages[i]) { | ||
813 | __free_page(rbio->stripe_pages[i]); | ||
814 | rbio->stripe_pages[i] = NULL; | ||
815 | } | ||
816 | } | ||
817 | kfree(rbio->raid_map); | ||
818 | kfree(rbio->bbio); | ||
819 | kfree(rbio); | ||
820 | } | ||
821 | |||
822 | static void free_raid_bio(struct btrfs_raid_bio *rbio) | ||
823 | { | ||
824 | unlock_stripe(rbio); | ||
825 | __free_raid_bio(rbio); | ||
826 | } | ||
827 | |||
828 | /* | ||
829 | * this frees the rbio and runs through all the bios in the | ||
830 | * bio_list and calls end_io on them | ||
831 | */ | ||
832 | static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) | ||
833 | { | ||
834 | struct bio *cur = bio_list_get(&rbio->bio_list); | ||
835 | struct bio *next; | ||
836 | free_raid_bio(rbio); | ||
837 | |||
838 | while (cur) { | ||
839 | next = cur->bi_next; | ||
840 | cur->bi_next = NULL; | ||
841 | if (uptodate) | ||
842 | set_bit(BIO_UPTODATE, &cur->bi_flags); | ||
843 | bio_endio(cur, err); | ||
844 | cur = next; | ||
845 | } | ||
846 | } | ||
847 | |||
848 | /* | ||
849 | * end io function used by finish_rmw. When we finally | ||
850 | * get here, we've written a full stripe | ||
851 | */ | ||
852 | static void raid_write_end_io(struct bio *bio, int err) | ||
853 | { | ||
854 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
855 | |||
856 | if (err) | ||
857 | fail_bio_stripe(rbio, bio); | ||
858 | |||
859 | bio_put(bio); | ||
860 | |||
861 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | ||
862 | return; | ||
863 | |||
864 | err = 0; | ||
865 | |||
866 | /* OK, we have written all the stripes we need to. */ | ||
867 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | ||
868 | err = -EIO; | ||
869 | |||
870 | rbio_orig_end_io(rbio, err, 0); | ||
871 | return; | ||
872 | } | ||
873 | |||
874 | /* | ||
875 | * the read/modify/write code wants to use the original bio for | ||
876 | * any pages it included, and then use the rbio for everything | ||
877 | * else. This function decides if a given index (stripe number) | ||
878 | * and page number in that stripe fall inside the original bio | ||
879 | * or the rbio. | ||
880 | * | ||
881 | * if you set bio_list_only, you'll get a NULL back for any ranges | ||
882 | * that are outside the bio_list | ||
883 | * | ||
884 | * This doesn't take any refs on anything, you get a bare page pointer | ||
885 | * and the caller must bump refs as required. | ||
886 | * | ||
887 | * You must call index_rbio_pages once before you can trust | ||
888 | * the answers from this function. | ||
889 | */ | ||
890 | static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, | ||
891 | int index, int pagenr, int bio_list_only) | ||
892 | { | ||
893 | int chunk_page; | ||
894 | struct page *p = NULL; | ||
895 | |||
896 | chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; | ||
897 | |||
898 | spin_lock_irq(&rbio->bio_list_lock); | ||
899 | p = rbio->bio_pages[chunk_page]; | ||
900 | spin_unlock_irq(&rbio->bio_list_lock); | ||
901 | |||
902 | if (p || bio_list_only) | ||
903 | return p; | ||
904 | |||
905 | return rbio->stripe_pages[chunk_page]; | ||
906 | } | ||
907 | |||
908 | /* | ||
909 | * number of pages we need for the entire stripe across all the | ||
910 | * drives | ||
911 | */ | ||
912 | static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) | ||
913 | { | ||
914 | unsigned long nr = stripe_len * nr_stripes; | ||
915 | return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
916 | } | ||
917 | |||
918 | /* | ||
919 | * allocation and initial setup for the btrfs_raid_bio. Not | ||
920 | * this does not allocate any pages for rbio->pages. | ||
921 | */ | ||
922 | static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, | ||
923 | struct btrfs_bio *bbio, u64 *raid_map, | ||
924 | u64 stripe_len) | ||
925 | { | ||
926 | struct btrfs_raid_bio *rbio; | ||
927 | int nr_data = 0; | ||
928 | int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); | ||
929 | void *p; | ||
930 | |||
931 | rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, | ||
932 | GFP_NOFS); | ||
933 | if (!rbio) { | ||
934 | kfree(raid_map); | ||
935 | kfree(bbio); | ||
936 | return ERR_PTR(-ENOMEM); | ||
937 | } | ||
938 | |||
939 | bio_list_init(&rbio->bio_list); | ||
940 | INIT_LIST_HEAD(&rbio->plug_list); | ||
941 | spin_lock_init(&rbio->bio_list_lock); | ||
942 | INIT_LIST_HEAD(&rbio->stripe_cache); | ||
943 | INIT_LIST_HEAD(&rbio->hash_list); | ||
944 | rbio->bbio = bbio; | ||
945 | rbio->raid_map = raid_map; | ||
946 | rbio->fs_info = root->fs_info; | ||
947 | rbio->stripe_len = stripe_len; | ||
948 | rbio->nr_pages = num_pages; | ||
949 | rbio->faila = -1; | ||
950 | rbio->failb = -1; | ||
951 | atomic_set(&rbio->refs, 1); | ||
952 | |||
953 | /* | ||
954 | * the stripe_pages and bio_pages array point to the extra | ||
955 | * memory we allocated past the end of the rbio | ||
956 | */ | ||
957 | p = rbio + 1; | ||
958 | rbio->stripe_pages = p; | ||
959 | rbio->bio_pages = p + sizeof(struct page *) * num_pages; | ||
960 | |||
961 | if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) | ||
962 | nr_data = bbio->num_stripes - 2; | ||
963 | else | ||
964 | nr_data = bbio->num_stripes - 1; | ||
965 | |||
966 | rbio->nr_data = nr_data; | ||
967 | return rbio; | ||
968 | } | ||
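The p = rbio + 1 step above is the classic single-allocation trick: the two pointer arrays live in the same kzalloc as the struct itself. A userspace sketch of that layout (struct and function names hypothetical):

    #include <stdio.h>
    #include <stdlib.h>

    struct rbio_sketch {
            int nr_pages;
            void **stripe_pages;
            void **bio_pages;
    };

    static struct rbio_sketch *alloc_sketch(int num_pages)
    {
            struct rbio_sketch *r;
            void *p;

            /* one allocation: the struct plus two page-pointer arrays */
            r = calloc(1, sizeof(*r) + 2 * num_pages * sizeof(void *));
            if (!r)
                    return NULL;
            p = r + 1;                      /* memory just past the struct */
            r->stripe_pages = p;
            r->bio_pages = (void **)((char *)p + num_pages * sizeof(void *));
            r->nr_pages = num_pages;
            return r;
    }

    int main(void)
    {
            struct rbio_sketch *r = alloc_sketch(16);

            if (!r)
                    return 1;
            printf("arrays at %p and %p\n", (void *)r->stripe_pages, (void *)r->bio_pages);
            free(r);
            return 0;
    }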
969 | |||
970 | /* allocate pages for all the stripes in the bio, including parity */ | ||
971 | static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) | ||
972 | { | ||
973 | int i; | ||
974 | struct page *page; | ||
975 | |||
976 | for (i = 0; i < rbio->nr_pages; i++) { | ||
977 | if (rbio->stripe_pages[i]) | ||
978 | continue; | ||
979 | page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
980 | if (!page) | ||
981 | return -ENOMEM; | ||
982 | rbio->stripe_pages[i] = page; | ||
983 | ClearPageUptodate(page); | ||
984 | } | ||
985 | return 0; | ||
986 | } | ||
987 | |||
988 | /* allocate pages for just the p/q stripes */ | ||
989 | static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) | ||
990 | { | ||
991 | int i; | ||
992 | struct page *page; | ||
993 | |||
994 | i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; | ||
995 | |||
996 | for (; i < rbio->nr_pages; i++) { | ||
997 | if (rbio->stripe_pages[i]) | ||
998 | continue; | ||
999 | page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
1000 | if (!page) | ||
1001 | return -ENOMEM; | ||
1002 | rbio->stripe_pages[i] = page; | ||
1003 | } | ||
1004 | return 0; | ||
1005 | } | ||
1006 | |||
1007 | /* | ||
1008 | * add a single page from a specific stripe into our list of bios for IO | ||
1009 | * this will try to merge into existing bios if possible, and returns | ||
1010 | * zero if all went well. | ||
1011 | */ | ||
1012 | int rbio_add_io_page(struct btrfs_raid_bio *rbio, | ||
1013 | struct bio_list *bio_list, | ||
1014 | struct page *page, | ||
1015 | int stripe_nr, | ||
1016 | unsigned long page_index, | ||
1017 | unsigned long bio_max_len) | ||
1018 | { | ||
1019 | struct bio *last = bio_list->tail; | ||
1020 | u64 last_end = 0; | ||
1021 | int ret; | ||
1022 | struct bio *bio; | ||
1023 | struct btrfs_bio_stripe *stripe; | ||
1024 | u64 disk_start; | ||
1025 | |||
1026 | stripe = &rbio->bbio->stripes[stripe_nr]; | ||
1027 | disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); | ||
1028 | |||
1029 | /* if the device is missing, just fail this stripe */ | ||
1030 | if (!stripe->dev->bdev) | ||
1031 | return fail_rbio_index(rbio, stripe_nr); | ||
1032 | |||
1033 | /* see if we can add this page onto our existing bio */ | ||
1034 | if (last) { | ||
1035 | last_end = (u64)last->bi_sector << 9; | ||
1036 | last_end += last->bi_size; | ||
1037 | |||
1038 | /* | ||
1039 | * we can't merge these if they are from different | ||
1040 | * devices or if they are not contiguous | ||
1041 | */ | ||
1042 | if (last_end == disk_start && stripe->dev->bdev && | ||
1043 | test_bit(BIO_UPTODATE, &last->bi_flags) && | ||
1044 | last->bi_bdev == stripe->dev->bdev) { | ||
1045 | ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); | ||
1046 | if (ret == PAGE_CACHE_SIZE) | ||
1047 | return 0; | ||
1048 | } | ||
1049 | } | ||
1050 | |||
1051 | /* put a new bio on the list */ | ||
1052 | bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1); | ||
1053 | if (!bio) | ||
1054 | return -ENOMEM; | ||
1055 | |||
1056 | bio->bi_size = 0; | ||
1057 | bio->bi_bdev = stripe->dev->bdev; | ||
1058 | bio->bi_sector = disk_start >> 9; | ||
1059 | set_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1060 | |||
1061 | bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); | ||
1062 | bio_list_add(bio_list, bio); | ||
1063 | return 0; | ||
1064 | } | ||
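The merge test above boils down to: the new page must start exactly where the tail bio currently ends, on the same device. The same predicate in isolation (types and names hypothetical):

    #include <stdbool.h>
    #include <stdint.h>

    struct bio_sketch {
            uint64_t sector;                /* 512-byte units */
            uint32_t size;                  /* bytes already in the bio */
            int dev;
    };

    static bool can_append(const struct bio_sketch *last, uint64_t disk_start, int dev)
    {
            return ((last->sector << 9) + last->size) == disk_start && last->dev == dev;
    }

    int main(void)
    {
            struct bio_sketch last = { .sector = 16, .size = 4096, .dev = 0 };

            /* 16 * 512 + 4096 = 12288, so a page at byte 12288 on dev 0 merges */
            return can_append(&last, 12288, 0) ? 0 : 1;
    }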
1065 | |||
1066 | /* | ||
1067 | * while we're doing the read/modify/write cycle, we could | ||
1068 | * have errors in reading pages off the disk. This checks | ||
1069 | * for errors and if we're not able to read the page it'll | ||
1070 | * trigger parity reconstruction. The rmw will be finished | ||
1071 | * after we've reconstructed the failed stripes | ||
1072 | */ | ||
1073 | static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) | ||
1074 | { | ||
1075 | if (rbio->faila >= 0 || rbio->failb >= 0) { | ||
1076 | BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); | ||
1077 | __raid56_parity_recover(rbio); | ||
1078 | } else { | ||
1079 | finish_rmw(rbio); | ||
1080 | } | ||
1081 | } | ||
1082 | |||
1083 | /* | ||
1084 | * these are just the pages from the rbio array, not from anything | ||
1085 | * the FS sent down to us | ||
1086 | */ | ||
1087 | static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) | ||
1088 | { | ||
1089 | int index; | ||
1090 | index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); | ||
1091 | index += page; | ||
1092 | return rbio->stripe_pages[index]; | ||
1093 | } | ||
1094 | |||
1095 | /* | ||
1096 | * helper function to walk our bio list and populate the bio_pages array with | ||
1097 | * the result. This seems expensive, but it is faster than constantly | ||
1098 | * searching through the bio list as we set up the IO in finish_rmw or stripe | ||
1099 | * reconstruction. | ||
1100 | * | ||
1101 | * This must be called before you trust the answers from page_in_rbio | ||
1102 | */ | ||
1103 | static void index_rbio_pages(struct btrfs_raid_bio *rbio) | ||
1104 | { | ||
1105 | struct bio *bio; | ||
1106 | u64 start; | ||
1107 | unsigned long stripe_offset; | ||
1108 | unsigned long page_index; | ||
1109 | struct page *p; | ||
1110 | int i; | ||
1111 | |||
1112 | spin_lock_irq(&rbio->bio_list_lock); | ||
1113 | bio_list_for_each(bio, &rbio->bio_list) { | ||
1114 | start = (u64)bio->bi_sector << 9; | ||
1115 | stripe_offset = start - rbio->raid_map[0]; | ||
1116 | page_index = stripe_offset >> PAGE_CACHE_SHIFT; | ||
1117 | |||
1118 | for (i = 0; i < bio->bi_vcnt; i++) { | ||
1119 | p = bio->bi_io_vec[i].bv_page; | ||
1120 | rbio->bio_pages[page_index + i] = p; | ||
1121 | } | ||
1122 | } | ||
1123 | spin_unlock_irq(&rbio->bio_list_lock); | ||
1124 | } | ||
1125 | |||
1126 | /* | ||
1127 | * this is called from one of two situations. We either | ||
1128 | * have a full stripe from the higher layers, or we've read all | ||
1129 | * the missing bits off disk. | ||
1130 | * | ||
1131 | * This will calculate the parity and then send down any | ||
1132 | * changed blocks. | ||
1133 | */ | ||
1134 | static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | ||
1135 | { | ||
1136 | struct btrfs_bio *bbio = rbio->bbio; | ||
1137 | void *pointers[bbio->num_stripes]; | ||
1138 | int stripe_len = rbio->stripe_len; | ||
1139 | int nr_data = rbio->nr_data; | ||
1140 | int stripe; | ||
1141 | int pagenr; | ||
1142 | int p_stripe = -1; | ||
1143 | int q_stripe = -1; | ||
1144 | struct bio_list bio_list; | ||
1145 | struct bio *bio; | ||
1146 | int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; | ||
1147 | int ret; | ||
1148 | |||
1149 | bio_list_init(&bio_list); | ||
1150 | |||
1151 | if (bbio->num_stripes - rbio->nr_data == 1) { | ||
1152 | p_stripe = bbio->num_stripes - 1; | ||
1153 | } else if (bbio->num_stripes - rbio->nr_data == 2) { | ||
1154 | p_stripe = bbio->num_stripes - 2; | ||
1155 | q_stripe = bbio->num_stripes - 1; | ||
1156 | } else { | ||
1157 | BUG(); | ||
1158 | } | ||
1159 | |||
1160 | /* at this point we either have a full stripe, | ||
1161 | * or we've read the full stripe from the drive. | ||
1162 | * recalculate the parity and write the new results. | ||
1163 | * | ||
1164 | * We're not allowed to add any new bios to the | ||
1165 | * bio list here, anyone else that wants to | ||
1166 | * change this stripe needs to do their own rmw. | ||
1167 | */ | ||
1168 | spin_lock_irq(&rbio->bio_list_lock); | ||
1169 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | ||
1170 | spin_unlock_irq(&rbio->bio_list_lock); | ||
1171 | |||
1172 | atomic_set(&rbio->bbio->error, 0); | ||
1173 | |||
1174 | /* | ||
1175 | * now that we've set rmw_locked, run through the | ||
1176 | * bio list one last time and map the page pointers | ||
1177 | * | ||
1178 | * We don't cache full rbios because we're assuming | ||
1179 | * the higher layers are unlikely to use this area of | ||
1180 | * the disk again soon. If they do use it again, | ||
1181 | * hopefully they will send another full bio. | ||
1182 | */ | ||
1183 | index_rbio_pages(rbio); | ||
1184 | if (!rbio_is_full(rbio)) | ||
1185 | cache_rbio_pages(rbio); | ||
1186 | else | ||
1187 | clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | ||
1188 | |||
1189 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { | ||
1190 | struct page *p; | ||
1191 | /* first collect one page from each data stripe */ | ||
1192 | for (stripe = 0; stripe < nr_data; stripe++) { | ||
1193 | p = page_in_rbio(rbio, stripe, pagenr, 0); | ||
1194 | pointers[stripe] = kmap(p); | ||
1195 | } | ||
1196 | |||
1197 | /* then add the parity stripe */ | ||
1198 | p = rbio_pstripe_page(rbio, pagenr); | ||
1199 | SetPageUptodate(p); | ||
1200 | pointers[stripe++] = kmap(p); | ||
1201 | |||
1202 | if (q_stripe != -1) { | ||
1203 | |||
1204 | /* | ||
1205 | * raid6, add the qstripe and call the | ||
1206 | * library function to fill in our p/q | ||
1207 | */ | ||
1208 | p = rbio_qstripe_page(rbio, pagenr); | ||
1209 | SetPageUptodate(p); | ||
1210 | pointers[stripe++] = kmap(p); | ||
1211 | |||
1212 | raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, | ||
1213 | pointers); | ||
1214 | } else { | ||
1215 | /* raid5 */ | ||
1216 | memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); | ||
1217 | run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); | ||
1218 | } | ||
1219 | |||
1220 | |||
1221 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) | ||
1222 | kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); | ||
1223 | } | ||
1224 | |||
1225 | /* | ||
1226 | * time to start writing. Make bios for everything from the | ||
1227 | * higher layers (the bio_list in our rbio) and our p/q. Ignore | ||
1228 | * everything else. | ||
1229 | */ | ||
1230 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) { | ||
1231 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { | ||
1232 | struct page *page; | ||
1233 | if (stripe < rbio->nr_data) { | ||
1234 | page = page_in_rbio(rbio, stripe, pagenr, 1); | ||
1235 | if (!page) | ||
1236 | continue; | ||
1237 | } else { | ||
1238 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
1239 | } | ||
1240 | |||
1241 | ret = rbio_add_io_page(rbio, &bio_list, | ||
1242 | page, stripe, pagenr, rbio->stripe_len); | ||
1243 | if (ret) | ||
1244 | goto cleanup; | ||
1245 | } | ||
1246 | } | ||
1247 | |||
1248 | atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); | ||
1249 | BUG_ON(atomic_read(&bbio->stripes_pending) == 0); | ||
1250 | |||
1251 | while (1) { | ||
1252 | bio = bio_list_pop(&bio_list); | ||
1253 | if (!bio) | ||
1254 | break; | ||
1255 | |||
1256 | bio->bi_private = rbio; | ||
1257 | bio->bi_end_io = raid_write_end_io; | ||
1258 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
1259 | submit_bio(WRITE, bio); | ||
1260 | } | ||
1261 | return; | ||
1262 | |||
1263 | cleanup: | ||
1264 | rbio_orig_end_io(rbio, -EIO, 0); | ||
1265 | } | ||
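The raid5 branch (memcpy of the first data page followed by run_xor over the rest) computes P = D0 ^ D1 ^ ... ^ Dn-1; the raid6 path instead asks gen_syndrome for the Galois-field Q stripe. XOR parity is what makes a single lost data block recoverable as P xor the survivors, as this tiny self-check shows:

    #include <assert.h>
    #include <string.h>

    int main(void)
    {
            unsigned char d0[8] = "abcdefg", d1[8] = "0123456", p[8], rec[8];
            int i;

            for (i = 0; i < 8; i++)
                    p[i] = d0[i] ^ d1[i];   /* parity generation */
            for (i = 0; i < 8; i++)
                    rec[i] = p[i] ^ d1[i];  /* rebuild a lost d0 from P and d1 */
            assert(memcmp(rec, d0, 8) == 0);
            return 0;
    }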
1266 | |||
1267 | /* | ||
1268 | * helper to find the stripe number for a given bio. Used to figure out which | ||
1269 | * stripe has failed. This expects the bio to correspond to a physical disk, | ||
1270 | * so it looks up based on physical sector numbers. | ||
1271 | */ | ||
1272 | static int find_bio_stripe(struct btrfs_raid_bio *rbio, | ||
1273 | struct bio *bio) | ||
1274 | { | ||
1275 | u64 physical = bio->bi_sector; | ||
1276 | u64 stripe_start; | ||
1277 | int i; | ||
1278 | struct btrfs_bio_stripe *stripe; | ||
1279 | |||
1280 | physical <<= 9; | ||
1281 | |||
1282 | for (i = 0; i < rbio->bbio->num_stripes; i++) { | ||
1283 | stripe = &rbio->bbio->stripes[i]; | ||
1284 | stripe_start = stripe->physical; | ||
1285 | if (physical >= stripe_start && | ||
1286 | physical < stripe_start + rbio->stripe_len) { | ||
1287 | return i; | ||
1288 | } | ||
1289 | } | ||
1290 | return -1; | ||
1291 | } | ||
1292 | |||
1293 | /* | ||
1294 | * helper to find the stripe number for a given | ||
1295 | * bio (before mapping). Used to figure out which stripe has | ||
1296 | * failed. This looks up based on logical block numbers. | ||
1297 | */ | ||
1298 | static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, | ||
1299 | struct bio *bio) | ||
1300 | { | ||
1301 | u64 logical = bio->bi_sector; | ||
1302 | u64 stripe_start; | ||
1303 | int i; | ||
1304 | |||
1305 | logical <<= 9; | ||
1306 | |||
1307 | for (i = 0; i < rbio->nr_data; i++) { | ||
1308 | stripe_start = rbio->raid_map[i]; | ||
1309 | if (logical >= stripe_start && | ||
1310 | logical < stripe_start + rbio->stripe_len) { | ||
1311 | return i; | ||
1312 | } | ||
1313 | } | ||
1314 | return -1; | ||
1315 | } | ||
1316 | |||
1317 | /* | ||
1318 | * returns -EIO if we had too many failures | ||
1319 | */ | ||
1320 | static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) | ||
1321 | { | ||
1322 | unsigned long flags; | ||
1323 | int ret = 0; | ||
1324 | |||
1325 | spin_lock_irqsave(&rbio->bio_list_lock, flags); | ||
1326 | |||
1327 | /* we already know this stripe is bad, move on */ | ||
1328 | if (rbio->faila == failed || rbio->failb == failed) | ||
1329 | goto out; | ||
1330 | |||
1331 | if (rbio->faila == -1) { | ||
1332 | /* first failure on this rbio */ | ||
1333 | rbio->faila = failed; | ||
1334 | atomic_inc(&rbio->bbio->error); | ||
1335 | } else if (rbio->failb == -1) { | ||
1336 | /* second failure on this rbio */ | ||
1337 | rbio->failb = failed; | ||
1338 | atomic_inc(&rbio->bbio->error); | ||
1339 | } else { | ||
1340 | ret = -EIO; | ||
1341 | } | ||
1342 | out: | ||
1343 | spin_unlock_irqrestore(&rbio->bio_list_lock, flags); | ||
1344 | |||
1345 | return ret; | ||
1346 | } | ||
1347 | |||
1348 | /* | ||
1349 | * helper to fail a stripe based on a physical disk | ||
1350 | * bio. | ||
1351 | */ | ||
1352 | static int fail_bio_stripe(struct btrfs_raid_bio *rbio, | ||
1353 | struct bio *bio) | ||
1354 | { | ||
1355 | int failed = find_bio_stripe(rbio, bio); | ||
1356 | |||
1357 | if (failed < 0) | ||
1358 | return -EIO; | ||
1359 | |||
1360 | return fail_rbio_index(rbio, failed); | ||
1361 | } | ||
1362 | |||
1363 | /* | ||
1364 | * this sets each page in the bio uptodate. It should only be used on private | ||
1365 | * rbio pages, nothing that comes in from the higher layers | ||
1366 | */ | ||
1367 | static void set_bio_pages_uptodate(struct bio *bio) | ||
1368 | { | ||
1369 | int i; | ||
1370 | struct page *p; | ||
1371 | |||
1372 | for (i = 0; i < bio->bi_vcnt; i++) { | ||
1373 | p = bio->bi_io_vec[i].bv_page; | ||
1374 | SetPageUptodate(p); | ||
1375 | } | ||
1376 | } | ||
1377 | |||
1378 | /* | ||
1379 | * end io for the read phase of the rmw cycle. All the bios here are physical | ||
1380 | * stripe bios we've read from the disk so we can recalculate the parity of the | ||
1381 | * stripe. | ||
1382 | * | ||
1383 | * This will usually kick off finish_rmw once all the bios are read in, but it | ||
1384 | * may trigger parity reconstruction if we had any errors along the way | ||
1385 | */ | ||
1386 | static void raid_rmw_end_io(struct bio *bio, int err) | ||
1387 | { | ||
1388 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
1389 | |||
1390 | if (err) | ||
1391 | fail_bio_stripe(rbio, bio); | ||
1392 | else | ||
1393 | set_bio_pages_uptodate(bio); | ||
1394 | |||
1395 | bio_put(bio); | ||
1396 | |||
1397 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | ||
1398 | return; | ||
1399 | |||
1400 | err = 0; | ||
1401 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | ||
1402 | goto cleanup; | ||
1403 | |||
1404 | /* | ||
1405 | * this will normally call finish_rmw to start our write | ||
1406 | * but if there are any failed stripes we'll reconstruct | ||
1407 | * from parity first | ||
1408 | */ | ||
1409 | validate_rbio_for_rmw(rbio); | ||
1410 | return; | ||
1411 | |||
1412 | cleanup: | ||
1413 | |||
1414 | rbio_orig_end_io(rbio, -EIO, 0); | ||
1415 | } | ||
1416 | |||
1417 | static void async_rmw_stripe(struct btrfs_raid_bio *rbio) | ||
1418 | { | ||
1419 | rbio->work.flags = 0; | ||
1420 | rbio->work.func = rmw_work; | ||
1421 | |||
1422 | btrfs_queue_worker(&rbio->fs_info->rmw_workers, | ||
1423 | &rbio->work); | ||
1424 | } | ||
1425 | |||
1426 | static void async_read_rebuild(struct btrfs_raid_bio *rbio) | ||
1427 | { | ||
1428 | rbio->work.flags = 0; | ||
1429 | rbio->work.func = read_rebuild_work; | ||
1430 | |||
1431 | btrfs_queue_worker(&rbio->fs_info->rmw_workers, | ||
1432 | &rbio->work); | ||
1433 | } | ||
1434 | |||
1435 | /* | ||
1436 | * the stripe must be locked by the caller. It will | ||
1437 | * unlock after all the writes are done | ||
1438 | */ | ||
1439 | static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) | ||
1440 | { | ||
1441 | int bios_to_read = 0; | ||
1442 | struct btrfs_bio *bbio = rbio->bbio; | ||
1443 | struct bio_list bio_list; | ||
1444 | int ret; | ||
1445 | int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1446 | int pagenr; | ||
1447 | int stripe; | ||
1448 | struct bio *bio; | ||
1449 | |||
1450 | bio_list_init(&bio_list); | ||
1451 | |||
1452 | ret = alloc_rbio_pages(rbio); | ||
1453 | if (ret) | ||
1454 | goto cleanup; | ||
1455 | |||
1456 | index_rbio_pages(rbio); | ||
1457 | |||
1458 | atomic_set(&rbio->bbio->error, 0); | ||
1459 | /* | ||
1460 | * build a list of bios to read all the missing parts of this | ||
1461 | * stripe | ||
1462 | */ | ||
1463 | for (stripe = 0; stripe < rbio->nr_data; stripe++) { | ||
1464 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | ||
1465 | struct page *page; | ||
1466 | /* | ||
1467 | * we want to find all the pages missing from | ||
1468 | * the rbio and read them from the disk. If | ||
1469 | * page_in_rbio finds a page in the bio list | ||
1470 | * we don't need to read it off the stripe. | ||
1471 | */ | ||
1472 | page = page_in_rbio(rbio, stripe, pagenr, 1); | ||
1473 | if (page) | ||
1474 | continue; | ||
1475 | |||
1476 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
1477 | /* | ||
1478 | * the bio cache may have handed us an uptodate | ||
1479 | * page. If so, be happy and use it | ||
1480 | */ | ||
1481 | if (PageUptodate(page)) | ||
1482 | continue; | ||
1483 | |||
1484 | ret = rbio_add_io_page(rbio, &bio_list, page, | ||
1485 | stripe, pagenr, rbio->stripe_len); | ||
1486 | if (ret) | ||
1487 | goto cleanup; | ||
1488 | } | ||
1489 | } | ||
1490 | |||
1491 | bios_to_read = bio_list_size(&bio_list); | ||
1492 | if (!bios_to_read) { | ||
1493 | /* | ||
1494 | * this can happen if others have merged with | ||
1495 | * us; it means there is nothing left to read. | ||
1496 | * But if there are missing devices it may not be | ||
1497 | * safe to do the full stripe write yet. | ||
1498 | */ | ||
1499 | goto finish; | ||
1500 | } | ||
1501 | |||
1502 | /* | ||
1503 | * the bbio may be freed once we submit the last bio. Make sure | ||
1504 | * not to touch it after that | ||
1505 | */ | ||
1506 | atomic_set(&bbio->stripes_pending, bios_to_read); | ||
1507 | while (1) { | ||
1508 | bio = bio_list_pop(&bio_list); | ||
1509 | if (!bio) | ||
1510 | break; | ||
1511 | |||
1512 | bio->bi_private = rbio; | ||
1513 | bio->bi_end_io = raid_rmw_end_io; | ||
1514 | |||
1515 | btrfs_bio_wq_end_io(rbio->fs_info, bio, | ||
1516 | BTRFS_WQ_ENDIO_RAID56); | ||
1517 | |||
1518 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
1519 | submit_bio(READ, bio); | ||
1520 | } | ||
1521 | /* the actual write will happen once the reads are done */ | ||
1522 | return 0; | ||
1523 | |||
1524 | cleanup: | ||
1525 | rbio_orig_end_io(rbio, -EIO, 0); | ||
1526 | return -EIO; | ||
1527 | |||
1528 | finish: | ||
1529 | validate_rbio_for_rmw(rbio); | ||
1530 | return 0; | ||
1531 | } | ||
1532 | |||
1533 | /* | ||
1534 | * if the upper layers pass in a full stripe, we thank them by only allocating | ||
1535 | * enough pages to hold the parity, and sending it all down quickly. | ||
1536 | */ | ||
1537 | static int full_stripe_write(struct btrfs_raid_bio *rbio) | ||
1538 | { | ||
1539 | int ret; | ||
1540 | |||
1541 | ret = alloc_rbio_parity_pages(rbio); | ||
1542 | if (ret) | ||
1543 | return ret; | ||
1544 | |||
1545 | ret = lock_stripe_add(rbio); | ||
1546 | if (ret == 0) | ||
1547 | finish_rmw(rbio); | ||
1548 | return 0; | ||
1549 | } | ||
1550 | |||
1551 | /* | ||
1552 | * partial stripe writes get handed over to async helpers. | ||
1553 | * We're really hoping to merge a few more writes into this | ||
1554 | * rbio before calculating new parity | ||
1555 | */ | ||
1556 | static int partial_stripe_write(struct btrfs_raid_bio *rbio) | ||
1557 | { | ||
1558 | int ret; | ||
1559 | |||
1560 | ret = lock_stripe_add(rbio); | ||
1561 | if (ret == 0) | ||
1562 | async_rmw_stripe(rbio); | ||
1563 | return 0; | ||
1564 | } | ||
1565 | |||
1566 | /* | ||
1567 | * sometimes while we were reading from the drive to | ||
1568 | * recalculate parity, enough new bios come in to create | ||
1569 | * a full stripe. So we do a check here to see if we can | ||
1570 | * go directly to finish_rmw | ||
1571 | */ | ||
1572 | static int __raid56_parity_write(struct btrfs_raid_bio *rbio) | ||
1573 | { | ||
1574 | /* head off into rmw land if we don't have a full stripe */ | ||
1575 | if (!rbio_is_full(rbio)) | ||
1576 | return partial_stripe_write(rbio); | ||
1577 | return full_stripe_write(rbio); | ||
1578 | } | ||
1579 | |||
1580 | /* | ||
1581 | * We use plugging callbacks to collect full stripes. | ||
1582 | * Any time we get a partial stripe write while plugged | ||
1583 | * we collect it into a list. When the unplug comes down, | ||
1584 | * we sort the list by logical block number and merge | ||
1585 | * everything we can into the same rbios | ||
1586 | */ | ||
1587 | struct btrfs_plug_cb { | ||
1588 | struct blk_plug_cb cb; | ||
1589 | struct btrfs_fs_info *info; | ||
1590 | struct list_head rbio_list; | ||
1591 | struct btrfs_work work; | ||
1592 | }; | ||
1593 | |||
1594 | /* | ||
1595 | * rbios on the plug list are sorted for easier merging. | ||
1596 | */ | ||
1597 | static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) | ||
1598 | { | ||
1599 | struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, | ||
1600 | plug_list); | ||
1601 | struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, | ||
1602 | plug_list); | ||
1603 | u64 a_sector = ra->bio_list.head->bi_sector; | ||
1604 | u64 b_sector = rb->bio_list.head->bi_sector; | ||
1605 | |||
1606 | if (a_sector < b_sector) | ||
1607 | return -1; | ||
1608 | if (a_sector > b_sector) | ||
1609 | return 1; | ||
1610 | return 0; | ||
1611 | } | ||
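list_sort() with this comparator puts the plugged rbios in ascending order of their first bio's sector, so stripes that can merge end up adjacent when run_plug walks the list. A userspace analogue with qsort over bare sector numbers:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int cmp_sector(const void *a, const void *b)
    {
            uint64_t sa = *(const uint64_t *)a, sb = *(const uint64_t *)b;

            return (sa > sb) - (sa < sb);   /* -1, 0 or 1, like plug_cmp */
    }

    int main(void)
    {
            uint64_t starts[] = { 4096, 0, 8192, 2048 };
            int i;

            qsort(starts, 4, sizeof(starts[0]), cmp_sector);
            for (i = 0; i < 4; i++)
                    printf("%llu\n", (unsigned long long)starts[i]);
            return 0;
    }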
1612 | |||
1613 | static void run_plug(struct btrfs_plug_cb *plug) | ||
1614 | { | ||
1615 | struct btrfs_raid_bio *cur; | ||
1616 | struct btrfs_raid_bio *last = NULL; | ||
1617 | |||
1618 | /* | ||
1619 | * sort our plug list then try to merge | ||
1620 | * everything we can in hopes of creating full | ||
1621 | * stripes. | ||
1622 | */ | ||
1623 | list_sort(NULL, &plug->rbio_list, plug_cmp); | ||
1624 | while (!list_empty(&plug->rbio_list)) { | ||
1625 | cur = list_entry(plug->rbio_list.next, | ||
1626 | struct btrfs_raid_bio, plug_list); | ||
1627 | list_del_init(&cur->plug_list); | ||
1628 | |||
1629 | if (rbio_is_full(cur)) { | ||
1630 | /* we have a full stripe, send it down */ | ||
1631 | full_stripe_write(cur); | ||
1632 | continue; | ||
1633 | } | ||
1634 | if (last) { | ||
1635 | if (rbio_can_merge(last, cur)) { | ||
1636 | merge_rbio(last, cur); | ||
1637 | __free_raid_bio(cur); | ||
1638 | continue; | ||
1639 | |||
1640 | } | ||
1641 | __raid56_parity_write(last); | ||
1642 | } | ||
1643 | last = cur; | ||
1644 | } | ||
1645 | if (last) { | ||
1646 | __raid56_parity_write(last); | ||
1647 | } | ||
1648 | kfree(plug); | ||
1649 | } | ||
1650 | |||
1651 | /* | ||
1652 | * if the unplug comes from schedule, we have to push the | ||
1653 | * work off to a helper thread | ||
1654 | */ | ||
1655 | static void unplug_work(struct btrfs_work *work) | ||
1656 | { | ||
1657 | struct btrfs_plug_cb *plug; | ||
1658 | plug = container_of(work, struct btrfs_plug_cb, work); | ||
1659 | run_plug(plug); | ||
1660 | } | ||
1661 | |||
1662 | static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) | ||
1663 | { | ||
1664 | struct btrfs_plug_cb *plug; | ||
1665 | plug = container_of(cb, struct btrfs_plug_cb, cb); | ||
1666 | |||
1667 | if (from_schedule) { | ||
1668 | plug->work.flags = 0; | ||
1669 | plug->work.func = unplug_work; | ||
1670 | btrfs_queue_worker(&plug->info->rmw_workers, | ||
1671 | &plug->work); | ||
1672 | return; | ||
1673 | } | ||
1674 | run_plug(plug); | ||
1675 | } | ||
1676 | |||
1677 | /* | ||
1678 | * our main entry point for writes from the rest of the FS. | ||
1679 | */ | ||
1680 | int raid56_parity_write(struct btrfs_root *root, struct bio *bio, | ||
1681 | struct btrfs_bio *bbio, u64 *raid_map, | ||
1682 | u64 stripe_len) | ||
1683 | { | ||
1684 | struct btrfs_raid_bio *rbio; | ||
1685 | struct btrfs_plug_cb *plug = NULL; | ||
1686 | struct blk_plug_cb *cb; | ||
1687 | |||
1688 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | ||
1689 | if (IS_ERR(rbio)) { | ||
1690 | kfree(raid_map); | ||
1691 | kfree(bbio); | ||
1692 | return PTR_ERR(rbio); | ||
1693 | } | ||
1694 | bio_list_add(&rbio->bio_list, bio); | ||
1695 | rbio->bio_list_bytes = bio->bi_size; | ||
1696 | |||
1697 | /* | ||
1698 | * don't plug on full rbios, just get them out the door | ||
1699 | * as quickly as we can | ||
1700 | */ | ||
1701 | if (rbio_is_full(rbio)) | ||
1702 | return full_stripe_write(rbio); | ||
1703 | |||
1704 | cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, | ||
1705 | sizeof(*plug)); | ||
1706 | if (cb) { | ||
1707 | plug = container_of(cb, struct btrfs_plug_cb, cb); | ||
1708 | if (!plug->info) { | ||
1709 | plug->info = root->fs_info; | ||
1710 | INIT_LIST_HEAD(&plug->rbio_list); | ||
1711 | } | ||
1712 | list_add_tail(&rbio->plug_list, &plug->rbio_list); | ||
1713 | } else { | ||
1714 | return __raid56_parity_write(rbio); | ||
1715 | } | ||
1716 | return 0; | ||
1717 | } | ||
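The flow above follows the generic blk_check_plugged() pattern: on a plugged task the first call allocates a zeroed callback struct of the requested size (so plug->info starts NULL, hence the one-time init), later calls return the same struct, and a NULL return means the task isn't plugged and the rbio must be issued directly. A sketch with hypothetical names (my_plug_cb, my_unplug):

	struct my_plug_cb {
		struct blk_plug_cb cb;		/* embedded block-layer callback */
		struct list_head deferred;	/* work batched while plugged */
	};

	static void my_unplug(struct blk_plug_cb *cb, bool from_schedule)
	{
		struct my_plug_cb *plug = container_of(cb, struct my_plug_cb, cb);

		/* drain plug->deferred here, then free: the callee owns it */
		kfree(plug);
	}

	/*
	 * submission path:
	 *	cb = blk_check_plugged(my_unplug, key, sizeof(struct my_plug_cb));
	 *	if (cb)
	 *		defer onto container_of(cb, struct my_plug_cb, cb);
	 *	else
	 *		issue directly;
	 */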
1718 | |||
1719 | /* | ||
1720 | * all parity reconstruction happens here. We've read in everything | ||
1721 | * we can find from the drives and this does the heavy lifting of | ||
1722 | * sorting the good from the bad. | ||
1723 | */ | ||
1724 | static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) | ||
1725 | { | ||
1726 | int pagenr, stripe; | ||
1727 | void **pointers; | ||
1728 | int faila = -1, failb = -1; | ||
1729 | int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1730 | struct page *page; | ||
1731 | int err; | ||
1732 | int i; | ||
1733 | |||
1734 | pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), | ||
1735 | GFP_NOFS); | ||
1736 | if (!pointers) { | ||
1737 | err = -ENOMEM; | ||
1738 | goto cleanup_io; | ||
1739 | } | ||
1740 | |||
1741 | faila = rbio->faila; | ||
1742 | failb = rbio->failb; | ||
1743 | |||
1744 | if (rbio->read_rebuild) { | ||
1745 | spin_lock_irq(&rbio->bio_list_lock); | ||
1746 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | ||
1747 | spin_unlock_irq(&rbio->bio_list_lock); | ||
1748 | } | ||
1749 | |||
1750 | index_rbio_pages(rbio); | ||
1751 | |||
1752 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | ||
1753 | /* setup our array of pointers with pages | ||
1754 | * from each stripe | ||
1755 | */ | ||
1756 | for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { | ||
1757 | /* | ||
1758 | * if we're rebuilding a read, we have to use | ||
1759 | * pages from the bio list | ||
1760 | */ | ||
1761 | if (rbio->read_rebuild && | ||
1762 | (stripe == faila || stripe == failb)) { | ||
1763 | page = page_in_rbio(rbio, stripe, pagenr, 0); | ||
1764 | } else { | ||
1765 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
1766 | } | ||
1767 | pointers[stripe] = kmap(page); | ||
1768 | } | ||
1769 | |||
1770 | /* all raid6 handling here */ | ||
1771 | if (rbio->raid_map[rbio->bbio->num_stripes - 1] == | ||
1772 | RAID6_Q_STRIPE) { | ||
1773 | |||
1774 | /* | ||
1775 | * single failure, rebuild from parity raid5 | ||
1776 | * style | ||
1777 | */ | ||
1778 | if (failb < 0) { | ||
1779 | if (faila == rbio->nr_data) { | ||
1780 | /* | ||
1781 | * Just the P stripe has failed, without | ||
1782 | * a bad data or Q stripe. | ||
1783 | * TODO, we should redo the xor here. | ||
1784 | */ | ||
1785 | err = -EIO; | ||
1786 | goto cleanup; | ||
1787 | } | ||
1788 | /* | ||
1789 | * a single failure in raid6 is rebuilt | ||
1790 | * in the pstripe code below | ||
1791 | */ | ||
1792 | goto pstripe; | ||
1793 | } | ||
1794 | |||
1795 | /* make sure our ps and qs are in order */ | ||
1796 | if (faila > failb) { | ||
1797 | int tmp = failb; | ||
1798 | failb = faila; | ||
1799 | faila = tmp; | ||
1800 | } | ||
1801 | |||
1802 | /* if the q stripe has failed, do a pstripe reconstruction | ||
1803 | * from the xors. | ||
1804 | * If both the q stripe and the P stripe have failed, we're | ||
1805 | * here due to a crc mismatch and we can't give them the | ||
1806 | * data they want. | ||
1807 | */ | ||
1808 | if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { | ||
1809 | if (rbio->raid_map[faila] == RAID5_P_STRIPE) { | ||
1810 | err = -EIO; | ||
1811 | goto cleanup; | ||
1812 | } | ||
1813 | /* | ||
1814 | * otherwise we have one bad data stripe and | ||
1815 | * a good P stripe. raid5! | ||
1816 | */ | ||
1817 | goto pstripe; | ||
1818 | } | ||
1819 | |||
1820 | if (rbio->raid_map[failb] == RAID5_P_STRIPE) { | ||
1821 | raid6_datap_recov(rbio->bbio->num_stripes, | ||
1822 | PAGE_SIZE, faila, pointers); | ||
1823 | } else { | ||
1824 | raid6_2data_recov(rbio->bbio->num_stripes, | ||
1825 | PAGE_SIZE, faila, failb, | ||
1826 | pointers); | ||
1827 | } | ||
1828 | } else { | ||
1829 | void *p; | ||
1830 | |||
1831 | /* rebuild from P stripe here (raid5 or raid6) */ | ||
1832 | BUG_ON(failb != -1); | ||
1833 | pstripe: | ||
1834 | /* Copy parity block into failed block to start with */ | ||
1835 | memcpy(pointers[faila], | ||
1836 | pointers[rbio->nr_data], | ||
1837 | PAGE_CACHE_SIZE); | ||
1838 | |||
1839 | /* rearrange the pointer array */ | ||
1840 | p = pointers[faila]; | ||
1841 | for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) | ||
1842 | pointers[stripe] = pointers[stripe + 1]; | ||
1843 | pointers[rbio->nr_data - 1] = p; | ||
1844 | |||
1845 | /* xor in the rest */ | ||
1846 | run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); | ||
1847 | } | ||
1848 | /* if we're doing this rebuild as part of an rmw, go through | ||
1849 | * and set all of our private rbio pages in the | ||
1850 | * failed stripes as uptodate. This way finish_rmw will | ||
1851 | * know they can be trusted. If this was a read reconstruction, | ||
1852 | * other endio functions will fiddle the uptodate bits | ||
1853 | */ | ||
1854 | if (!rbio->read_rebuild) { | ||
1855 | for (i = 0; i < nr_pages; i++) { | ||
1856 | if (faila != -1) { | ||
1857 | page = rbio_stripe_page(rbio, faila, i); | ||
1858 | SetPageUptodate(page); | ||
1859 | } | ||
1860 | if (failb != -1) { | ||
1861 | page = rbio_stripe_page(rbio, failb, i); | ||
1862 | SetPageUptodate(page); | ||
1863 | } | ||
1864 | } | ||
1865 | } | ||
1866 | for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { | ||
1867 | /* | ||
1868 | * if we're rebuilding a read, we have to use | ||
1869 | * pages from the bio list | ||
1870 | */ | ||
1871 | if (rbio->read_rebuild && | ||
1872 | (stripe == faila || stripe == failb)) { | ||
1873 | page = page_in_rbio(rbio, stripe, pagenr, 0); | ||
1874 | } else { | ||
1875 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
1876 | } | ||
1877 | kunmap(page); | ||
1878 | } | ||
1879 | } | ||
1880 | |||
1881 | err = 0; | ||
1882 | cleanup: | ||
1883 | kfree(pointers); | ||
1884 | |||
1885 | cleanup_io: | ||
1886 | |||
1887 | if (rbio->read_rebuild) { | ||
1888 | if (err == 0) | ||
1889 | cache_rbio_pages(rbio); | ||
1890 | else | ||
1891 | clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | ||
1892 | |||
1893 | rbio_orig_end_io(rbio, err, err == 0); | ||
1894 | } else if (err == 0) { | ||
1895 | rbio->faila = -1; | ||
1896 | rbio->failb = -1; | ||
1897 | finish_rmw(rbio); | ||
1898 | } else { | ||
1899 | rbio_orig_end_io(rbio, err, 0); | ||
1900 | } | ||
1901 | } | ||
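The pstripe branch above is plain RAID5 recovery by XOR: parity is the XOR of all data blocks, so a single lost block is the parity XORed with the surviving blocks. A self-contained userspace toy (ordinary C, not kernel code) exercising the same arithmetic:

	#include <stdio.h>
	#include <string.h>

	#define NR_DATA	3
	#define BLKSZ	8

	int main(void)
	{
		unsigned char data[NR_DATA][BLKSZ] = { "AAAAAAA", "BBBBBBB", "CCCCCCC" };
		unsigned char parity[BLKSZ] = { 0 };
		unsigned char rebuilt[BLKSZ];
		int faila = 1;	/* pretend data stripe 1 was lost */
		int i, j;

		/* compute P = D0 ^ D1 ^ D2, as finish_rmw() would */
		for (i = 0; i < NR_DATA; i++)
			for (j = 0; j < BLKSZ; j++)
				parity[j] ^= data[i][j];

		/* recover: start from parity, XOR in the surviving stripes */
		memcpy(rebuilt, parity, BLKSZ);
		for (i = 0; i < NR_DATA; i++)
			if (i != faila)
				for (j = 0; j < BLKSZ; j++)
					rebuilt[j] ^= data[i][j];

		printf("rebuilt stripe %d: %.7s\n", faila, rebuilt);	/* BBBBBBB */
		return 0;
	}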
1902 | |||
1903 | /* | ||
1904 | * This is called only for stripes we've read from disk to | ||
1905 | * reconstruct the parity. | ||
1906 | */ | ||
1907 | static void raid_recover_end_io(struct bio *bio, int err) | ||
1908 | { | ||
1909 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
1910 | |||
1911 | /* | ||
1912 | * we only read stripe pages off the disk, set them | ||
1913 | * up to date if there were no errors | ||
1914 | */ | ||
1915 | if (err) | ||
1916 | fail_bio_stripe(rbio, bio); | ||
1917 | else | ||
1918 | set_bio_pages_uptodate(bio); | ||
1919 | bio_put(bio); | ||
1920 | |||
1921 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | ||
1922 | return; | ||
1923 | |||
1924 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | ||
1925 | rbio_orig_end_io(rbio, -EIO, 0); | ||
1926 | else | ||
1927 | __raid_recover_end_io(rbio); | ||
1928 | } | ||
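The stripes_pending accounting above is the standard "last completer finalizes" idiom: every submitted bio holds one count, and the completion that drops it to zero continues exactly once, regardless of IO order. A sketch with hypothetical names (demo_ctx, demo_finish):

	static void demo_end_io(struct bio *bio, int err)
	{
		struct demo_ctx *ctx = bio->bi_private;

		if (err)
			atomic_inc(&ctx->errors);
		bio_put(bio);

		/* every bio drops one count; only the last one continues */
		if (!atomic_dec_and_test(&ctx->pending))
			return;
		demo_finish(ctx);	/* runs exactly once, on the final completion */
	}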
1929 | |||
1930 | /* | ||
1931 | * reads everything we need off the disk to reconstruct | ||
1932 | * the parity. endio handlers trigger final reconstruction | ||
1933 | * when the IO is done. | ||
1934 | * | ||
1935 | * This is used both for reads from the higher layers and for | ||
1936 | * parity construction required to finish an rmw cycle. | ||
1937 | */ | ||
1938 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) | ||
1939 | { | ||
1940 | int bios_to_read = 0; | ||
1941 | struct btrfs_bio *bbio = rbio->bbio; | ||
1942 | struct bio_list bio_list; | ||
1943 | int ret; | ||
1944 | int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1945 | int pagenr; | ||
1946 | int stripe; | ||
1947 | struct bio *bio; | ||
1948 | |||
1949 | bio_list_init(&bio_list); | ||
1950 | |||
1951 | ret = alloc_rbio_pages(rbio); | ||
1952 | if (ret) | ||
1953 | goto cleanup; | ||
1954 | |||
1955 | atomic_set(&rbio->bbio->error, 0); | ||
1956 | |||
1957 | /* | ||
1958 | * read everything that hasn't failed. Thanks to the | ||
1959 | * stripe cache, it is possible that some or all of these | ||
1960 | * pages are going to be uptodate. | ||
1961 | */ | ||
1962 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) { | ||
1963 | if (rbio->faila == stripe || | ||
1964 | rbio->failb == stripe) | ||
1965 | continue; | ||
1966 | |||
1967 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | ||
1968 | struct page *p; | ||
1969 | |||
1970 | /* | ||
1971 | * the rmw code may have already read this | ||
1972 | * page in | ||
1973 | */ | ||
1974 | p = rbio_stripe_page(rbio, stripe, pagenr); | ||
1975 | if (PageUptodate(p)) | ||
1976 | continue; | ||
1977 | |||
1978 | ret = rbio_add_io_page(rbio, &bio_list, | ||
1979 | rbio_stripe_page(rbio, stripe, pagenr), | ||
1980 | stripe, pagenr, rbio->stripe_len); | ||
1981 | if (ret < 0) | ||
1982 | goto cleanup; | ||
1983 | } | ||
1984 | } | ||
1985 | |||
1986 | bios_to_read = bio_list_size(&bio_list); | ||
1987 | if (!bios_to_read) { | ||
1988 | /* | ||
1989 | * we might have no bios to read just because the pages | ||
1990 | * were up to date, or we might have no bios to read because | ||
1991 | * the devices were gone. | ||
1992 | */ | ||
1993 | if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { | ||
1994 | __raid_recover_end_io(rbio); | ||
1995 | goto out; | ||
1996 | } else { | ||
1997 | goto cleanup; | ||
1998 | } | ||
1999 | } | ||
2000 | |||
2001 | /* | ||
2002 | * the bbio may be freed once we submit the last bio. Make sure | ||
2003 | * not to touch it after that | ||
2004 | */ | ||
2005 | atomic_set(&bbio->stripes_pending, bios_to_read); | ||
2006 | while (1) { | ||
2007 | bio = bio_list_pop(&bio_list); | ||
2008 | if (!bio) | ||
2009 | break; | ||
2010 | |||
2011 | bio->bi_private = rbio; | ||
2012 | bio->bi_end_io = raid_recover_end_io; | ||
2013 | |||
2014 | btrfs_bio_wq_end_io(rbio->fs_info, bio, | ||
2015 | BTRFS_WQ_ENDIO_RAID56); | ||
2016 | |||
2017 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
2018 | submit_bio(READ, bio); | ||
2019 | } | ||
2020 | out: | ||
2021 | return 0; | ||
2022 | |||
2023 | cleanup: | ||
2024 | if (rbio->read_rebuild) | ||
2025 | rbio_orig_end_io(rbio, -EIO, 0); | ||
2026 | return -EIO; | ||
2027 | } | ||
2028 | |||
2029 | /* | ||
2030 | * the main entry point for reads from the higher layers. This | ||
2031 | * is really only called when the normal read path had a failure, | ||
2032 | * so we assume the bio they send down corresponds to a failed part | ||
2033 | * of the drive. | ||
2034 | */ | ||
2035 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, | ||
2036 | struct btrfs_bio *bbio, u64 *raid_map, | ||
2037 | u64 stripe_len, int mirror_num) | ||
2038 | { | ||
2039 | struct btrfs_raid_bio *rbio; | ||
2040 | int ret; | ||
2041 | |||
2042 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | ||
2043 | if (IS_ERR(rbio)) { | ||
2044 | return PTR_ERR(rbio); | ||
2045 | } | ||
2046 | |||
2047 | rbio->read_rebuild = 1; | ||
2048 | bio_list_add(&rbio->bio_list, bio); | ||
2049 | rbio->bio_list_bytes = bio->bi_size; | ||
2050 | |||
2051 | rbio->faila = find_logical_bio_stripe(rbio, bio); | ||
2052 | if (rbio->faila == -1) { | ||
2053 | BUG(); | ||
2054 | kfree(rbio); | ||
2055 | return -EIO; | ||
2056 | } | ||
2057 | |||
2058 | /* | ||
2059 | * reconstruct from the q stripe if they are | ||
2060 | * asking for mirror 3 | ||
2061 | */ | ||
2062 | if (mirror_num == 3) | ||
2063 | rbio->failb = bbio->num_stripes - 2; | ||
2064 | |||
2065 | ret = lock_stripe_add(rbio); | ||
2066 | |||
2067 | /* | ||
2068 | * __raid56_parity_recover will end the bio with | ||
2069 | * any errors it hits. We don't want to return | ||
2070 | * its error value up the stack because our caller | ||
2071 | * will end up calling bio_endio with any nonzero | ||
2072 | * return | ||
2073 | */ | ||
2074 | if (ret == 0) | ||
2075 | __raid56_parity_recover(rbio); | ||
2076 | /* | ||
2077 | * our rbio has been added to the list of | ||
2078 | * rbios that will be handled after the | ||
2079 | * current lock owner is done | ||
2080 | */ | ||
2081 | return 0; | ||
2082 | |||
2083 | } | ||
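For reference, the mirror_num convention this function implements for RAID6, as inferred from the code above (an illustrative summary, not authoritative documentation):

	/*
	 *   mirror 1: plain read of the data stripe, no recovery
	 *   mirror 2: rebuild the failed data stripe from P (RAID5 style)
	 *   mirror 3: additionally mark P as failed (failb = num_stripes - 2,
	 *             P being second-to-last), forcing reconstruction via Q
	 */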
2084 | |||
2085 | static void rmw_work(struct btrfs_work *work) | ||
2086 | { | ||
2087 | struct btrfs_raid_bio *rbio; | ||
2088 | |||
2089 | rbio = container_of(work, struct btrfs_raid_bio, work); | ||
2090 | raid56_rmw_stripe(rbio); | ||
2091 | } | ||
2092 | |||
2093 | static void read_rebuild_work(struct btrfs_work *work) | ||
2094 | { | ||
2095 | struct btrfs_raid_bio *rbio; | ||
2096 | |||
2097 | rbio = container_of(work, struct btrfs_raid_bio, work); | ||
2098 | __raid56_parity_recover(rbio); | ||
2099 | } | ||
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h new file mode 100644 index 000000000000..ea5d73bfdfbe --- /dev/null +++ b/fs/btrfs/raid56.h | |||
@@ -0,0 +1,51 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012 Fusion-io All rights reserved. | ||
3 | * Copyright (C) 2012 Intel Corp. All rights reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public | ||
7 | * License v2 as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
12 | * General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public | ||
15 | * License along with this program; if not, write to the | ||
16 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
17 | * Boston, MA 021110-1307, USA. | ||
18 | */ | ||
19 | |||
20 | #ifndef __BTRFS_RAID56__ | ||
21 | #define __BTRFS_RAID56__ | ||
22 | static inline int nr_parity_stripes(struct map_lookup *map) | ||
23 | { | ||
24 | if (map->type & BTRFS_BLOCK_GROUP_RAID5) | ||
25 | return 1; | ||
26 | else if (map->type & BTRFS_BLOCK_GROUP_RAID6) | ||
27 | return 2; | ||
28 | else | ||
29 | return 0; | ||
30 | } | ||
31 | |||
32 | static inline int nr_data_stripes(struct map_lookup *map) | ||
33 | { | ||
34 | return map->num_stripes - nr_parity_stripes(map); | ||
35 | } | ||
36 | #define RAID5_P_STRIPE ((u64)-2) | ||
37 | #define RAID6_Q_STRIPE ((u64)-1) | ||
38 | |||
39 | #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ | ||
40 | ((x) == RAID6_Q_STRIPE)) | ||
41 | |||
42 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, | ||
43 | struct btrfs_bio *bbio, u64 *raid_map, | ||
44 | u64 stripe_len, int mirror_num); | ||
45 | int raid56_parity_write(struct btrfs_root *root, struct bio *bio, | ||
46 | struct btrfs_bio *bbio, u64 *raid_map, | ||
47 | u64 stripe_len); | ||
48 | |||
49 | int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); | ||
50 | void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); | ||
51 | #endif | ||
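As an illustration of the helpers above: a full stripe spans nr_data_stripes(map) * map->stripe_len bytes of logical address space, which is the unit the write path tests with rbio_is_full() before skipping read-modify-write. A hypothetical helper (full_stripe_len is not part of this header):

	static inline u64 full_stripe_len(struct map_lookup *map)
	{
		return (u64)nr_data_stripes(map) * map->stripe_len;
	}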
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 17c306bf177a..50695dc5e2ab 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c | |||
@@ -3017,7 +3017,7 @@ static int relocate_file_extent_cluster(struct inode *inode, | |||
3017 | } | 3017 | } |
3018 | } | 3018 | } |
3019 | 3019 | ||
3020 | page_start = (u64)page->index << PAGE_CACHE_SHIFT; | 3020 | page_start = page_offset(page); |
3021 | page_end = page_start + PAGE_CACHE_SIZE - 1; | 3021 | page_end = page_start + PAGE_CACHE_SIZE - 1; |
3022 | 3022 | ||
3023 | lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); | 3023 | lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); |
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 67783e03d121..53c3501fa4ca 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include "dev-replace.h" | 28 | #include "dev-replace.h" |
29 | #include "check-integrity.h" | 29 | #include "check-integrity.h" |
30 | #include "rcu-string.h" | 30 | #include "rcu-string.h" |
31 | #include "raid56.h" | ||
31 | 32 | ||
32 | /* | 33 | /* |
33 | * This is only the first step towards a full-features scrub. It reads all | 34 | * This is only the first step towards a full-features scrub. It reads all |
@@ -2254,6 +2255,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, | |||
2254 | struct btrfs_device *extent_dev; | 2255 | struct btrfs_device *extent_dev; |
2255 | int extent_mirror_num; | 2256 | int extent_mirror_num; |
2256 | 2257 | ||
2258 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
2259 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
2260 | if (num >= nr_data_stripes(map)) { | ||
2261 | return 0; | ||
2262 | } | ||
2263 | } | ||
2264 | |||
2257 | nstripes = length; | 2265 | nstripes = length; |
2258 | offset = 0; | 2266 | offset = 0; |
2259 | do_div(nstripes, map->stripe_len); | 2267 | do_div(nstripes, map->stripe_len); |
@@ -2708,7 +2716,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, | |||
2708 | int ret; | 2716 | int ret; |
2709 | struct btrfs_root *root = sctx->dev_root; | 2717 | struct btrfs_root *root = sctx->dev_root; |
2710 | 2718 | ||
2711 | if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) | 2719 | if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) |
2712 | return -EIO; | 2720 | return -EIO; |
2713 | 2721 | ||
2714 | gen = root->fs_info->last_trans_committed; | 2722 | gen = root->fs_info->last_trans_committed; |
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f4ab7a9260eb..f7a8b861058b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c | |||
@@ -85,6 +85,7 @@ struct send_ctx { | |||
85 | u32 send_max_size; | 85 | u32 send_max_size; |
86 | u64 total_send_size; | 86 | u64 total_send_size; |
87 | u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; | 87 | u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; |
88 | u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ | ||
88 | 89 | ||
89 | struct vfsmount *mnt; | 90 | struct vfsmount *mnt; |
90 | 91 | ||
@@ -3709,6 +3710,39 @@ out: | |||
3709 | return ret; | 3710 | return ret; |
3710 | } | 3711 | } |
3711 | 3712 | ||
3713 | /* | ||
3714 | * Send an update extent command to user space. | ||
3715 | */ | ||
3716 | static int send_update_extent(struct send_ctx *sctx, | ||
3717 | u64 offset, u32 len) | ||
3718 | { | ||
3719 | int ret = 0; | ||
3720 | struct fs_path *p; | ||
3721 | |||
3722 | p = fs_path_alloc(sctx); | ||
3723 | if (!p) | ||
3724 | return -ENOMEM; | ||
3725 | |||
3726 | ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); | ||
3727 | if (ret < 0) | ||
3728 | goto out; | ||
3729 | |||
3730 | ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); | ||
3731 | if (ret < 0) | ||
3732 | goto out; | ||
3733 | |||
3734 | TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); | ||
3735 | TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); | ||
3736 | TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); | ||
3737 | |||
3738 | ret = send_cmd(sctx); | ||
3739 | |||
3740 | tlv_put_failure: | ||
3741 | out: | ||
3742 | fs_path_free(sctx, p); | ||
3743 | return ret; | ||
3744 | } | ||
3745 | |||
3712 | static int send_write_or_clone(struct send_ctx *sctx, | 3746 | static int send_write_or_clone(struct send_ctx *sctx, |
3713 | struct btrfs_path *path, | 3747 | struct btrfs_path *path, |
3714 | struct btrfs_key *key, | 3748 | struct btrfs_key *key, |
@@ -3744,7 +3778,11 @@ static int send_write_or_clone(struct send_ctx *sctx, | |||
3744 | goto out; | 3778 | goto out; |
3745 | } | 3779 | } |
3746 | 3780 | ||
3747 | if (!clone_root) { | 3781 | if (clone_root) { |
3782 | ret = send_clone(sctx, offset, len, clone_root); | ||
3783 | } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { | ||
3784 | ret = send_update_extent(sctx, offset, len); | ||
3785 | } else { | ||
3748 | while (pos < len) { | 3786 | while (pos < len) { |
3749 | l = len - pos; | 3787 | l = len - pos; |
3750 | if (l > BTRFS_SEND_READ_SIZE) | 3788 | if (l > BTRFS_SEND_READ_SIZE) |
@@ -3757,10 +3795,7 @@ static int send_write_or_clone(struct send_ctx *sctx, | |||
3757 | pos += ret; | 3795 | pos += ret; |
3758 | } | 3796 | } |
3759 | ret = 0; | 3797 | ret = 0; |
3760 | } else { | ||
3761 | ret = send_clone(sctx, offset, len, clone_root); | ||
3762 | } | 3798 | } |
3763 | |||
3764 | out: | 3799 | out: |
3765 | return ret; | 3800 | return ret; |
3766 | } | 3801 | } |
@@ -4536,7 +4571,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) | |||
4536 | struct btrfs_fs_info *fs_info; | 4571 | struct btrfs_fs_info *fs_info; |
4537 | struct btrfs_ioctl_send_args *arg = NULL; | 4572 | struct btrfs_ioctl_send_args *arg = NULL; |
4538 | struct btrfs_key key; | 4573 | struct btrfs_key key; |
4539 | struct file *filp = NULL; | ||
4540 | struct send_ctx *sctx = NULL; | 4574 | struct send_ctx *sctx = NULL; |
4541 | u32 i; | 4575 | u32 i; |
4542 | u64 *clone_sources_tmp = NULL; | 4576 | u64 *clone_sources_tmp = NULL; |
@@ -4561,6 +4595,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) | |||
4561 | goto out; | 4595 | goto out; |
4562 | } | 4596 | } |
4563 | 4597 | ||
4598 | if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) { | ||
4599 | ret = -EINVAL; | ||
4600 | goto out; | ||
4601 | } | ||
4602 | |||
4564 | sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); | 4603 | sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); |
4565 | if (!sctx) { | 4604 | if (!sctx) { |
4566 | ret = -ENOMEM; | 4605 | ret = -ENOMEM; |
@@ -4572,6 +4611,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) | |||
4572 | INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); | 4611 | INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); |
4573 | INIT_LIST_HEAD(&sctx->name_cache_list); | 4612 | INIT_LIST_HEAD(&sctx->name_cache_list); |
4574 | 4613 | ||
4614 | sctx->flags = arg->flags; | ||
4615 | |||
4575 | sctx->send_filp = fget(arg->send_fd); | 4616 | sctx->send_filp = fget(arg->send_fd); |
4576 | if (IS_ERR(sctx->send_filp)) { | 4617 | if (IS_ERR(sctx->send_filp)) { |
4577 | ret = PTR_ERR(sctx->send_filp); | 4618 | ret = PTR_ERR(sctx->send_filp); |
@@ -4673,8 +4714,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) | |||
4673 | goto out; | 4714 | goto out; |
4674 | 4715 | ||
4675 | out: | 4716 | out: |
4676 | if (filp) | ||
4677 | fput(filp); | ||
4678 | kfree(arg); | 4717 | kfree(arg); |
4679 | vfree(clone_sources_tmp); | 4718 | vfree(clone_sources_tmp); |
4680 | 4719 | ||
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 1bf4f32fd4ef..8bb18f7ccaa6 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h | |||
@@ -86,6 +86,7 @@ enum btrfs_send_cmd { | |||
86 | BTRFS_SEND_C_UTIMES, | 86 | BTRFS_SEND_C_UTIMES, |
87 | 87 | ||
88 | BTRFS_SEND_C_END, | 88 | BTRFS_SEND_C_END, |
89 | BTRFS_SEND_C_UPDATE_EXTENT, | ||
89 | __BTRFS_SEND_C_MAX, | 90 | __BTRFS_SEND_C_MAX, |
90 | }; | 91 | }; |
91 | #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) | 92 | #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) |
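A userspace sketch of driving the new flag (illustrative: it assumes struct btrfs_ioctl_send_args and BTRFS_IOC_SEND are visible via the uapi header, defines the flag locally in case it is not exported — 0x1 matching send.h in this series — and elides error handling):

	#include <sys/ioctl.h>
	#include <linux/btrfs.h>	/* struct btrfs_ioctl_send_args, BTRFS_IOC_SEND */

	#ifndef BTRFS_SEND_FLAG_NO_FILE_DATA
	#define BTRFS_SEND_FLAG_NO_FILE_DATA	0x1	/* assumption: matches send.h */
	#endif

	/* emit a metadata-only stream: UPDATE_EXTENT records replace file data */
	static int send_no_file_data(int subvol_fd, int out_fd)
	{
		struct btrfs_ioctl_send_args args = {
			.send_fd = out_fd,
			.flags	 = BTRFS_SEND_FLAG_NO_FILE_DATA,
		};

		return ioctl(subvol_fd, BTRFS_IOC_SEND, &args);
	}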
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d8982e9601d3..68a29a1ea068 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c | |||
@@ -41,13 +41,13 @@ | |||
41 | #include <linux/slab.h> | 41 | #include <linux/slab.h> |
42 | #include <linux/cleancache.h> | 42 | #include <linux/cleancache.h> |
43 | #include <linux/ratelimit.h> | 43 | #include <linux/ratelimit.h> |
44 | #include <linux/btrfs.h> | ||
44 | #include "compat.h" | 45 | #include "compat.h" |
45 | #include "delayed-inode.h" | 46 | #include "delayed-inode.h" |
46 | #include "ctree.h" | 47 | #include "ctree.h" |
47 | #include "disk-io.h" | 48 | #include "disk-io.h" |
48 | #include "transaction.h" | 49 | #include "transaction.h" |
49 | #include "btrfs_inode.h" | 50 | #include "btrfs_inode.h" |
50 | #include "ioctl.h" | ||
51 | #include "print-tree.h" | 51 | #include "print-tree.h" |
52 | #include "xattr.h" | 52 | #include "xattr.h" |
53 | #include "volumes.h" | 53 | #include "volumes.h" |
@@ -63,8 +63,7 @@ | |||
63 | static const struct super_operations btrfs_super_ops; | 63 | static const struct super_operations btrfs_super_ops; |
64 | static struct file_system_type btrfs_fs_type; | 64 | static struct file_system_type btrfs_fs_type; |
65 | 65 | ||
66 | static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, | 66 | static const char *btrfs_decode_error(int errno, char nbuf[16]) |
67 | char nbuf[16]) | ||
68 | { | 67 | { |
69 | char *errstr = NULL; | 68 | char *errstr = NULL; |
70 | 69 | ||
@@ -98,7 +97,7 @@ static void __save_error_info(struct btrfs_fs_info *fs_info) | |||
98 | * today we only save the error info into ram. Long term we'll | 97 | * today we only save the error info into ram. Long term we'll |
99 | * also send it down to the disk | 98 | * also send it down to the disk |
100 | */ | 99 | */ |
101 | fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; | 100 | set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); |
102 | } | 101 | } |
103 | 102 | ||
104 | static void save_error_info(struct btrfs_fs_info *fs_info) | 103 | static void save_error_info(struct btrfs_fs_info *fs_info) |
@@ -114,7 +113,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) | |||
114 | if (sb->s_flags & MS_RDONLY) | 113 | if (sb->s_flags & MS_RDONLY) |
115 | return; | 114 | return; |
116 | 115 | ||
117 | if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 116 | if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { |
118 | sb->s_flags |= MS_RDONLY; | 117 | sb->s_flags |= MS_RDONLY; |
119 | printk(KERN_INFO "btrfs is forced readonly\n"); | 118 | printk(KERN_INFO "btrfs is forced readonly\n"); |
120 | /* | 119 | /* |
@@ -142,8 +141,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, | |||
142 | struct super_block *sb = fs_info->sb; | 141 | struct super_block *sb = fs_info->sb; |
143 | char nbuf[16]; | 142 | char nbuf[16]; |
144 | const char *errstr; | 143 | const char *errstr; |
145 | va_list args; | ||
146 | va_start(args, fmt); | ||
147 | 144 | ||
148 | /* | 145 | /* |
149 | * Special case: if the error is EROFS, and we're already | 146 | * Special case: if the error is EROFS, and we're already |
@@ -152,15 +149,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, | |||
152 | if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) | 149 | if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) |
153 | return; | 150 | return; |
154 | 151 | ||
155 | errstr = btrfs_decode_error(fs_info, errno, nbuf); | 152 | errstr = btrfs_decode_error(errno, nbuf); |
156 | if (fmt) { | 153 | if (fmt) { |
157 | struct va_format vaf = { | 154 | struct va_format vaf; |
158 | .fmt = fmt, | 155 | va_list args; |
159 | .va = &args, | 156 | |
160 | }; | 157 | va_start(args, fmt); |
158 | vaf.fmt = fmt; | ||
159 | vaf.va = &args; | ||
161 | 160 | ||
162 | printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", | 161 | printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", |
163 | sb->s_id, function, line, errstr, &vaf); | 162 | sb->s_id, function, line, errstr, &vaf); |
163 | va_end(args); | ||
164 | } else { | 164 | } else { |
165 | printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", | 165 | printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", |
166 | sb->s_id, function, line, errstr); | 166 | sb->s_id, function, line, errstr); |
@@ -171,7 +171,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, | |||
171 | save_error_info(fs_info); | 171 | save_error_info(fs_info); |
172 | btrfs_handle_error(fs_info); | 172 | btrfs_handle_error(fs_info); |
173 | } | 173 | } |
174 | va_end(args); | ||
175 | } | 174 | } |
176 | 175 | ||
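The hunk above fixes unbalanced va_list handling: the old code ran va_start() unconditionally but could return early (the EROFS short-circuit) without va_end(). The rule the new code follows — open the va_list only when it will be consumed, and close it in the same scope right after the %pV printk uses it — sketched with a hypothetical helper:

	static void demo_error(const char *fmt, ...)
	{
		if (fmt) {
			struct va_format vaf;
			va_list args;

			va_start(args, fmt);
			vaf.fmt = fmt;
			vaf.va = &args;
			/* %pV walks the va_list while formatting */
			printk(KERN_CRIT "demo: %pV\n", &vaf);
			va_end(args);	/* after the last use of &vaf */
		}
	}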
177 | static const char * const logtypes[] = { | 176 | static const char * const logtypes[] = { |
@@ -261,7 +260,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, | |||
261 | char nbuf[16]; | 260 | char nbuf[16]; |
262 | const char *errstr; | 261 | const char *errstr; |
263 | 262 | ||
264 | errstr = btrfs_decode_error(root->fs_info, errno, nbuf); | 263 | errstr = btrfs_decode_error(errno, nbuf); |
265 | btrfs_printk(root->fs_info, | 264 | btrfs_printk(root->fs_info, |
266 | "%s:%d: Aborting unused transaction(%s).\n", | 265 | "%s:%d: Aborting unused transaction(%s).\n", |
267 | function, line, errstr); | 266 | function, line, errstr); |
@@ -289,8 +288,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, | |||
289 | va_start(args, fmt); | 288 | va_start(args, fmt); |
290 | vaf.va = &args; | 289 | vaf.va = &args; |
291 | 290 | ||
292 | errstr = btrfs_decode_error(fs_info, errno, nbuf); | 291 | errstr = btrfs_decode_error(errno, nbuf); |
293 | if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR) | 292 | if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) |
294 | panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", | 293 | panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", |
295 | s_id, function, line, &vaf, errstr); | 294 | s_id, function, line, &vaf, errstr); |
296 | 295 | ||
@@ -438,6 +437,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
438 | case Opt_compress_force: | 437 | case Opt_compress_force: |
439 | case Opt_compress_force_type: | 438 | case Opt_compress_force_type: |
440 | compress_force = true; | 439 | compress_force = true; |
440 | /* Fallthrough */ | ||
441 | case Opt_compress: | 441 | case Opt_compress: |
442 | case Opt_compress_type: | 442 | case Opt_compress_type: |
443 | if (token == Opt_compress || | 443 | if (token == Opt_compress || |
@@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
519 | case Opt_alloc_start: | 519 | case Opt_alloc_start: |
520 | num = match_strdup(&args[0]); | 520 | num = match_strdup(&args[0]); |
521 | if (num) { | 521 | if (num) { |
522 | mutex_lock(&info->chunk_mutex); | ||
522 | info->alloc_start = memparse(num, NULL); | 523 | info->alloc_start = memparse(num, NULL); |
524 | mutex_unlock(&info->chunk_mutex); | ||
523 | kfree(num); | 525 | kfree(num); |
524 | printk(KERN_INFO | 526 | printk(KERN_INFO |
525 | "btrfs: allocations start at %llu\n", | 527 | "btrfs: allocations start at %llu\n", |
@@ -876,7 +878,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) | |||
876 | 878 | ||
877 | btrfs_wait_ordered_extents(root, 0); | 879 | btrfs_wait_ordered_extents(root, 0); |
878 | 880 | ||
879 | trans = btrfs_attach_transaction(root); | 881 | trans = btrfs_attach_transaction_barrier(root); |
880 | if (IS_ERR(trans)) { | 882 | if (IS_ERR(trans)) { |
881 | /* no transaction, don't bother */ | 883 | /* no transaction, don't bother */ |
882 | if (PTR_ERR(trans) == -ENOENT) | 884 | if (PTR_ERR(trans) == -ENOENT) |
@@ -1200,6 +1202,38 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, | |||
1200 | new_pool_size); | 1202 | new_pool_size); |
1201 | } | 1203 | } |
1202 | 1204 | ||
1205 | static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info, | ||
1206 | unsigned long old_opts, int flags) | ||
1207 | { | ||
1208 | set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); | ||
1209 | |||
1210 | if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && | ||
1211 | (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || | ||
1212 | (flags & MS_RDONLY))) { | ||
1213 | /* wait for any defraggers to finish */ | ||
1214 | wait_event(fs_info->transaction_wait, | ||
1215 | (atomic_read(&fs_info->defrag_running) == 0)); | ||
1216 | if (flags & MS_RDONLY) | ||
1217 | sync_filesystem(fs_info->sb); | ||
1218 | } | ||
1219 | } | ||
1220 | |||
1221 | static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, | ||
1222 | unsigned long old_opts) | ||
1223 | { | ||
1224 | /* | ||
1225 | * We need to clean up all defraggable inodes if the autodefrag | ||
1226 | * option has been turned off or the fs is R/O. | ||
1227 | */ | ||
1228 | if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && | ||
1229 | (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || | ||
1230 | (fs_info->sb->s_flags & MS_RDONLY))) { | ||
1231 | btrfs_cleanup_defrag_inodes(fs_info); | ||
1232 | } | ||
1233 | |||
1234 | clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); | ||
1235 | } | ||
1236 | |||
1203 | static int btrfs_remount(struct super_block *sb, int *flags, char *data) | 1237 | static int btrfs_remount(struct super_block *sb, int *flags, char *data) |
1204 | { | 1238 | { |
1205 | struct btrfs_fs_info *fs_info = btrfs_sb(sb); | 1239 | struct btrfs_fs_info *fs_info = btrfs_sb(sb); |
@@ -1213,6 +1247,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
1213 | unsigned int old_metadata_ratio = fs_info->metadata_ratio; | 1247 | unsigned int old_metadata_ratio = fs_info->metadata_ratio; |
1214 | int ret; | 1248 | int ret; |
1215 | 1249 | ||
1250 | btrfs_remount_prepare(fs_info, old_opts, *flags); | ||
1251 | |||
1216 | ret = btrfs_parse_options(root, data); | 1252 | ret = btrfs_parse_options(root, data); |
1217 | if (ret) { | 1253 | if (ret) { |
1218 | ret = -EINVAL; | 1254 | ret = -EINVAL; |
@@ -1223,7 +1259,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
1223 | fs_info->thread_pool_size, old_thread_pool_size); | 1259 | fs_info->thread_pool_size, old_thread_pool_size); |
1224 | 1260 | ||
1225 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) | 1261 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) |
1226 | return 0; | 1262 | goto out; |
1227 | 1263 | ||
1228 | if (*flags & MS_RDONLY) { | 1264 | if (*flags & MS_RDONLY) { |
1229 | /* | 1265 | /* |
@@ -1278,7 +1314,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
1278 | } | 1314 | } |
1279 | sb->s_flags &= ~MS_RDONLY; | 1315 | sb->s_flags &= ~MS_RDONLY; |
1280 | } | 1316 | } |
1281 | 1317 | out: | |
1318 | btrfs_remount_cleanup(fs_info, old_opts); | ||
1282 | return 0; | 1319 | return 0; |
1283 | 1320 | ||
1284 | restore: | 1321 | restore: |
@@ -1289,10 +1326,13 @@ restore: | |||
1289 | fs_info->mount_opt = old_opts; | 1326 | fs_info->mount_opt = old_opts; |
1290 | fs_info->compress_type = old_compress_type; | 1327 | fs_info->compress_type = old_compress_type; |
1291 | fs_info->max_inline = old_max_inline; | 1328 | fs_info->max_inline = old_max_inline; |
1329 | mutex_lock(&fs_info->chunk_mutex); | ||
1292 | fs_info->alloc_start = old_alloc_start; | 1330 | fs_info->alloc_start = old_alloc_start; |
1331 | mutex_unlock(&fs_info->chunk_mutex); | ||
1293 | btrfs_resize_thread_pool(fs_info, | 1332 | btrfs_resize_thread_pool(fs_info, |
1294 | old_thread_pool_size, fs_info->thread_pool_size); | 1333 | old_thread_pool_size, fs_info->thread_pool_size); |
1295 | fs_info->metadata_ratio = old_metadata_ratio; | 1334 | fs_info->metadata_ratio = old_metadata_ratio; |
1335 | btrfs_remount_cleanup(fs_info, old_opts); | ||
1296 | return ret; | 1336 | return ret; |
1297 | } | 1337 | } |
1298 | 1338 | ||
@@ -1559,7 +1599,7 @@ static int btrfs_freeze(struct super_block *sb) | |||
1559 | struct btrfs_trans_handle *trans; | 1599 | struct btrfs_trans_handle *trans; |
1560 | struct btrfs_root *root = btrfs_sb(sb)->tree_root; | 1600 | struct btrfs_root *root = btrfs_sb(sb)->tree_root; |
1561 | 1601 | ||
1562 | trans = btrfs_attach_transaction(root); | 1602 | trans = btrfs_attach_transaction_barrier(root); |
1563 | if (IS_ERR(trans)) { | 1603 | if (IS_ERR(trans)) { |
1564 | /* no transaction, don't bother */ | 1604 | /* no transaction, don't bother */ |
1565 | if (PTR_ERR(trans) == -ENOENT) | 1605 | if (PTR_ERR(trans) == -ENOENT) |
@@ -1684,10 +1724,14 @@ static int __init init_btrfs_fs(void) | |||
1684 | if (err) | 1724 | if (err) |
1685 | goto free_delayed_inode; | 1725 | goto free_delayed_inode; |
1686 | 1726 | ||
1687 | err = btrfs_interface_init(); | 1727 | err = btrfs_delayed_ref_init(); |
1688 | if (err) | 1728 | if (err) |
1689 | goto free_auto_defrag; | 1729 | goto free_auto_defrag; |
1690 | 1730 | ||
1731 | err = btrfs_interface_init(); | ||
1732 | if (err) | ||
1733 | goto free_delayed_ref; | ||
1734 | |||
1691 | err = register_filesystem(&btrfs_fs_type); | 1735 | err = register_filesystem(&btrfs_fs_type); |
1692 | if (err) | 1736 | if (err) |
1693 | goto unregister_ioctl; | 1737 | goto unregister_ioctl; |
@@ -1699,6 +1743,8 @@ static int __init init_btrfs_fs(void) | |||
1699 | 1743 | ||
1700 | unregister_ioctl: | 1744 | unregister_ioctl: |
1701 | btrfs_interface_exit(); | 1745 | btrfs_interface_exit(); |
1746 | free_delayed_ref: | ||
1747 | btrfs_delayed_ref_exit(); | ||
1702 | free_auto_defrag: | 1748 | free_auto_defrag: |
1703 | btrfs_auto_defrag_exit(); | 1749 | btrfs_auto_defrag_exit(); |
1704 | free_delayed_inode: | 1750 | free_delayed_inode: |
@@ -1720,6 +1766,7 @@ free_compress: | |||
1720 | static void __exit exit_btrfs_fs(void) | 1766 | static void __exit exit_btrfs_fs(void) |
1721 | { | 1767 | { |
1722 | btrfs_destroy_cachep(); | 1768 | btrfs_destroy_cachep(); |
1769 | btrfs_delayed_ref_exit(); | ||
1723 | btrfs_auto_defrag_exit(); | 1770 | btrfs_auto_defrag_exit(); |
1724 | btrfs_delayed_inode_exit(); | 1771 | btrfs_delayed_inode_exit(); |
1725 | ordered_data_exit(); | 1772 | ordered_data_exit(); |
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index daac9ae6d731..5b326cd60a4a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c | |||
@@ -21,7 +21,6 @@ | |||
21 | #include <linux/spinlock.h> | 21 | #include <linux/spinlock.h> |
22 | #include <linux/completion.h> | 22 | #include <linux/completion.h> |
23 | #include <linux/buffer_head.h> | 23 | #include <linux/buffer_head.h> |
24 | #include <linux/module.h> | ||
25 | #include <linux/kobject.h> | 24 | #include <linux/kobject.h> |
26 | 25 | ||
27 | #include "ctree.h" | 26 | #include "ctree.h" |
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4c0067c4f76d..e52da6fb1165 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -40,7 +40,6 @@ void put_transaction(struct btrfs_transaction *transaction) | |||
40 | if (atomic_dec_and_test(&transaction->use_count)) { | 40 | if (atomic_dec_and_test(&transaction->use_count)) { |
41 | BUG_ON(!list_empty(&transaction->list)); | 41 | BUG_ON(!list_empty(&transaction->list)); |
42 | WARN_ON(transaction->delayed_refs.root.rb_node); | 42 | WARN_ON(transaction->delayed_refs.root.rb_node); |
43 | memset(transaction, 0, sizeof(*transaction)); | ||
44 | kmem_cache_free(btrfs_transaction_cachep, transaction); | 43 | kmem_cache_free(btrfs_transaction_cachep, transaction); |
45 | } | 44 | } |
46 | } | 45 | } |
@@ -51,6 +50,14 @@ static noinline void switch_commit_root(struct btrfs_root *root) | |||
51 | root->commit_root = btrfs_root_node(root); | 50 | root->commit_root = btrfs_root_node(root); |
52 | } | 51 | } |
53 | 52 | ||
53 | static inline int can_join_transaction(struct btrfs_transaction *trans, | ||
54 | int type) | ||
55 | { | ||
56 | return !(trans->in_commit && | ||
57 | type != TRANS_JOIN && | ||
58 | type != TRANS_JOIN_NOLOCK); | ||
59 | } | ||
60 | |||
54 | /* | 61 | /* |
55 | * either allocate a new transaction or hop into the existing one | 62 | * either allocate a new transaction or hop into the existing one |
56 | */ | 63 | */ |
@@ -62,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type) | |||
62 | spin_lock(&fs_info->trans_lock); | 69 | spin_lock(&fs_info->trans_lock); |
63 | loop: | 70 | loop: |
64 | /* The file system has been taken offline. No new transactions. */ | 71 | /* The file system has been taken offline. No new transactions. */ |
65 | if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 72 | if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { |
66 | spin_unlock(&fs_info->trans_lock); | 73 | spin_unlock(&fs_info->trans_lock); |
67 | return -EROFS; | 74 | return -EROFS; |
68 | } | 75 | } |
@@ -86,6 +93,10 @@ loop: | |||
86 | spin_unlock(&fs_info->trans_lock); | 93 | spin_unlock(&fs_info->trans_lock); |
87 | return cur_trans->aborted; | 94 | return cur_trans->aborted; |
88 | } | 95 | } |
96 | if (!can_join_transaction(cur_trans, type)) { | ||
97 | spin_unlock(&fs_info->trans_lock); | ||
98 | return -EBUSY; | ||
99 | } | ||
89 | atomic_inc(&cur_trans->use_count); | 100 | atomic_inc(&cur_trans->use_count); |
90 | atomic_inc(&cur_trans->num_writers); | 101 | atomic_inc(&cur_trans->num_writers); |
91 | cur_trans->num_joined++; | 102 | cur_trans->num_joined++; |
@@ -113,7 +124,7 @@ loop: | |||
113 | */ | 124 | */ |
114 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); | 125 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); |
115 | goto loop; | 126 | goto loop; |
116 | } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 127 | } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { |
117 | spin_unlock(&fs_info->trans_lock); | 128 | spin_unlock(&fs_info->trans_lock); |
118 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); | 129 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); |
119 | return -EROFS; | 130 | return -EROFS; |
@@ -155,8 +166,12 @@ loop: | |||
155 | 166 | ||
156 | spin_lock_init(&cur_trans->commit_lock); | 167 | spin_lock_init(&cur_trans->commit_lock); |
157 | spin_lock_init(&cur_trans->delayed_refs.lock); | 168 | spin_lock_init(&cur_trans->delayed_refs.lock); |
169 | atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); | ||
170 | atomic_set(&cur_trans->delayed_refs.ref_seq, 0); | ||
171 | init_waitqueue_head(&cur_trans->delayed_refs.wait); | ||
158 | 172 | ||
159 | INIT_LIST_HEAD(&cur_trans->pending_snapshots); | 173 | INIT_LIST_HEAD(&cur_trans->pending_snapshots); |
174 | INIT_LIST_HEAD(&cur_trans->ordered_operations); | ||
160 | list_add_tail(&cur_trans->list, &fs_info->trans_list); | 175 | list_add_tail(&cur_trans->list, &fs_info->trans_list); |
161 | extent_io_tree_init(&cur_trans->dirty_pages, | 176 | extent_io_tree_init(&cur_trans->dirty_pages, |
162 | fs_info->btree_inode->i_mapping); | 177 | fs_info->btree_inode->i_mapping); |
@@ -301,7 +316,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, | |||
301 | int ret; | 316 | int ret; |
302 | u64 qgroup_reserved = 0; | 317 | u64 qgroup_reserved = 0; |
303 | 318 | ||
304 | if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) | 319 | if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) |
305 | return ERR_PTR(-EROFS); | 320 | return ERR_PTR(-EROFS); |
306 | 321 | ||
307 | if (current->journal_info) { | 322 | if (current->journal_info) { |
@@ -359,8 +374,11 @@ again: | |||
359 | 374 | ||
360 | do { | 375 | do { |
361 | ret = join_transaction(root, type); | 376 | ret = join_transaction(root, type); |
362 | if (ret == -EBUSY) | 377 | if (ret == -EBUSY) { |
363 | wait_current_trans(root); | 378 | wait_current_trans(root); |
379 | if (unlikely(type == TRANS_ATTACH)) | ||
380 | ret = -ENOENT; | ||
381 | } | ||
364 | } while (ret == -EBUSY); | 382 | } while (ret == -EBUSY); |
365 | 383 | ||
366 | if (ret < 0) { | 384 | if (ret < 0) { |
@@ -382,9 +400,10 @@ again: | |||
382 | h->block_rsv = NULL; | 400 | h->block_rsv = NULL; |
383 | h->orig_rsv = NULL; | 401 | h->orig_rsv = NULL; |
384 | h->aborted = 0; | 402 | h->aborted = 0; |
385 | h->qgroup_reserved = qgroup_reserved; | 403 | h->qgroup_reserved = 0; |
386 | h->delayed_ref_elem.seq = 0; | 404 | h->delayed_ref_elem.seq = 0; |
387 | h->type = type; | 405 | h->type = type; |
406 | h->allocating_chunk = false; | ||
388 | INIT_LIST_HEAD(&h->qgroup_ref_list); | 407 | INIT_LIST_HEAD(&h->qgroup_ref_list); |
389 | INIT_LIST_HEAD(&h->new_bgs); | 408 | INIT_LIST_HEAD(&h->new_bgs); |
390 | 409 | ||
@@ -400,6 +419,7 @@ again: | |||
400 | h->block_rsv = &root->fs_info->trans_block_rsv; | 419 | h->block_rsv = &root->fs_info->trans_block_rsv; |
401 | h->bytes_reserved = num_bytes; | 420 | h->bytes_reserved = num_bytes; |
402 | } | 421 | } |
422 | h->qgroup_reserved = qgroup_reserved; | ||
403 | 423 | ||
404 | got_it: | 424 | got_it: |
405 | btrfs_record_root_in_trans(h, root); | 425 | btrfs_record_root_in_trans(h, root); |
@@ -451,11 +471,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root | |||
451 | return start_transaction(root, 0, TRANS_USERSPACE, 0); | 471 | return start_transaction(root, 0, TRANS_USERSPACE, 0); |
452 | } | 472 | } |
453 | 473 | ||
474 | /* | ||
475 | * btrfs_attach_transaction() - catch the running transaction | ||
476 | * | ||
477 | * It is used when we want to commit the current transaction, but | ||
478 | * don't want to start a new one. | ||
479 | * | ||
480 | * Note: If this function returns -ENOENT, it just means there is no | ||
481 | * running transaction. But it is possible that an inactive transaction | ||
482 | * is still in memory, not fully on disk. If you need to be sure there is | ||
483 | * no inactive transaction in the fs when -ENOENT is returned, you should | ||
484 | * invoke | ||
485 | * btrfs_attach_transaction_barrier() | ||
486 | */ | ||
454 | struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) | 487 | struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) |
455 | { | 488 | { |
456 | return start_transaction(root, 0, TRANS_ATTACH, 0); | 489 | return start_transaction(root, 0, TRANS_ATTACH, 0); |
457 | } | 490 | } |
458 | 491 | ||
492 | /* | ||
493 | * btrfs_attach_transaction_barrier() - catch the running transaction | ||
494 | * | ||
495 | * It is similar to the above function, the difference is that this one | ||
496 | * will wait for all the inactive transactions until they fully | ||
497 | * complete. | ||
498 | */ | ||
499 | struct btrfs_trans_handle * | ||
500 | btrfs_attach_transaction_barrier(struct btrfs_root *root) | ||
501 | { | ||
502 | struct btrfs_trans_handle *trans; | ||
503 | |||
504 | trans = start_transaction(root, 0, TRANS_ATTACH, 0); | ||
505 | if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) | ||
506 | btrfs_wait_for_commit(root, 0); | ||
507 | |||
508 | return trans; | ||
509 | } | ||
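Typical caller pattern for the barrier variant, matching the btrfs_sync_fs() and btrfs_freeze() hunks earlier in this diff (sketch):

	trans = btrfs_attach_transaction_barrier(root);
	if (IS_ERR(trans)) {
		/* -ENOENT: nothing running, and nothing still committing */
		if (PTR_ERR(trans) == -ENOENT)
			return 0;
		return PTR_ERR(trans);
	}
	return btrfs_commit_transaction(trans, root);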
510 | |||
459 | /* wait for a transaction commit to be fully complete */ | 511 | /* wait for a transaction commit to be fully complete */ |
460 | static noinline void wait_for_commit(struct btrfs_root *root, | 512 | static noinline void wait_for_commit(struct btrfs_root *root, |
461 | struct btrfs_transaction *commit) | 513 | struct btrfs_transaction *commit) |
@@ -587,7 +639,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
587 | if (!list_empty(&trans->new_bgs)) | 639 | if (!list_empty(&trans->new_bgs)) |
588 | btrfs_create_pending_block_groups(trans, root); | 640 | btrfs_create_pending_block_groups(trans, root); |
589 | 641 | ||
590 | while (count < 2) { | 642 | while (count < 1) { |
591 | unsigned long cur = trans->delayed_ref_updates; | 643 | unsigned long cur = trans->delayed_ref_updates; |
592 | trans->delayed_ref_updates = 0; | 644 | trans->delayed_ref_updates = 0; |
593 | if (cur && | 645 | if (cur && |
@@ -599,6 +651,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
599 | } | 651 | } |
600 | count++; | 652 | count++; |
601 | } | 653 | } |
654 | |||
602 | btrfs_trans_release_metadata(trans, root); | 655 | btrfs_trans_release_metadata(trans, root); |
603 | trans->block_rsv = NULL; | 656 | trans->block_rsv = NULL; |
604 | 657 | ||
@@ -644,12 +697,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
644 | btrfs_run_delayed_iputs(root); | 697 | btrfs_run_delayed_iputs(root); |
645 | 698 | ||
646 | if (trans->aborted || | 699 | if (trans->aborted || |
647 | root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 700 | test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) |
648 | err = -EIO; | 701 | err = -EIO; |
649 | } | ||
650 | assert_qgroups_uptodate(trans); | 702 | assert_qgroups_uptodate(trans); |
651 | 703 | ||
652 | memset(trans, 0, sizeof(*trans)); | ||
653 | kmem_cache_free(btrfs_trans_handle_cachep, trans); | 704 | kmem_cache_free(btrfs_trans_handle_cachep, trans); |
654 | return err; | 705 | return err; |
655 | } | 706 | } |
@@ -696,7 +747,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root, | |||
696 | struct extent_state *cached_state = NULL; | 747 | struct extent_state *cached_state = NULL; |
697 | u64 start = 0; | 748 | u64 start = 0; |
698 | u64 end; | 749 | u64 end; |
750 | struct blk_plug plug; | ||
699 | 751 | ||
752 | blk_start_plug(&plug); | ||
700 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, | 753 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, |
701 | mark, &cached_state)) { | 754 | mark, &cached_state)) { |
702 | convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, | 755 | convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, |
@@ -710,6 +763,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, | |||
710 | } | 763 | } |
711 | if (err) | 764 | if (err) |
712 | werr = err; | 765 | werr = err; |
766 | blk_finish_plug(&plug); | ||
713 | return werr; | 767 | return werr; |
714 | } | 768 | } |
715 | 769 | ||
@@ -960,10 +1014,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, | |||
960 | } | 1014 | } |
961 | 1015 | ||
962 | /* | 1016 | /* |
963 | * defrag a given btree. If cacheonly == 1, this won't read from the disk, | 1017 | * defrag a given btree. |
964 | * otherwise every leaf in the btree is read and defragged. | 1018 | * Every leaf in the btree is read and defragged. |
965 | */ | 1019 | */ |
966 | int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) | 1020 | int btrfs_defrag_root(struct btrfs_root *root) |
967 | { | 1021 | { |
968 | struct btrfs_fs_info *info = root->fs_info; | 1022 | struct btrfs_fs_info *info = root->fs_info; |
969 | struct btrfs_trans_handle *trans; | 1023 | struct btrfs_trans_handle *trans; |
@@ -977,7 +1031,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) | |||
977 | if (IS_ERR(trans)) | 1031 | if (IS_ERR(trans)) |
978 | return PTR_ERR(trans); | 1032 | return PTR_ERR(trans); |
979 | 1033 | ||
980 | ret = btrfs_defrag_leaves(trans, root, cacheonly); | 1034 | ret = btrfs_defrag_leaves(trans, root); |
981 | 1035 | ||
982 | btrfs_end_transaction(trans, root); | 1036 | btrfs_end_transaction(trans, root); |
983 | btrfs_btree_balance_dirty(info->tree_root); | 1037 | btrfs_btree_balance_dirty(info->tree_root); |
@@ -985,6 +1039,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) | |||
985 | 1039 | ||
986 | if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) | 1040 | if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) |
987 | break; | 1041 | break; |
1042 | |||
1043 | if (btrfs_defrag_cancelled(root->fs_info)) { | ||
1044 | printk(KERN_DEBUG "btrfs: defrag_root cancelled\n"); | ||
1045 | ret = -EAGAIN; | ||
1046 | break; | ||
1047 | } | ||
988 | } | 1048 | } |
989 | root->defrag_running = 0; | 1049 | root->defrag_running = 0; |
990 | return ret; | 1050 | return ret; |
@@ -1007,7 +1067,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
1007 | struct inode *parent_inode; | 1067 | struct inode *parent_inode; |
1008 | struct btrfs_path *path; | 1068 | struct btrfs_path *path; |
1009 | struct btrfs_dir_item *dir_item; | 1069 | struct btrfs_dir_item *dir_item; |
1010 | struct dentry *parent; | ||
1011 | struct dentry *dentry; | 1070 | struct dentry *dentry; |
1012 | struct extent_buffer *tmp; | 1071 | struct extent_buffer *tmp; |
1013 | struct extent_buffer *old; | 1072 | struct extent_buffer *old; |
@@ -1022,7 +1081,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
1022 | path = btrfs_alloc_path(); | 1081 | path = btrfs_alloc_path(); |
1023 | if (!path) { | 1082 | if (!path) { |
1024 | ret = pending->error = -ENOMEM; | 1083 | ret = pending->error = -ENOMEM; |
1025 | goto path_alloc_fail; | 1084 | return ret; |
1026 | } | 1085 | } |
1027 | 1086 | ||
1028 | new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); | 1087 | new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); |
@@ -1062,10 +1121,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
1062 | 1121 | ||
1063 | rsv = trans->block_rsv; | 1122 | rsv = trans->block_rsv; |
1064 | trans->block_rsv = &pending->block_rsv; | 1123 | trans->block_rsv = &pending->block_rsv; |
1124 | trans->bytes_reserved = trans->block_rsv->reserved; | ||
1065 | 1125 | ||
1066 | dentry = pending->dentry; | 1126 | dentry = pending->dentry; |
1067 | parent = dget_parent(dentry); | 1127 | parent_inode = pending->dir; |
1068 | parent_inode = parent->d_inode; | ||
1069 | parent_root = BTRFS_I(parent_inode)->root; | 1128 | parent_root = BTRFS_I(parent_inode)->root; |
1070 | record_root_in_trans(trans, parent_root); | 1129 | record_root_in_trans(trans, parent_root); |
1071 | 1130 | ||
@@ -1213,14 +1272,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
1213 | if (ret) | 1272 | if (ret) |
1214 | btrfs_abort_transaction(trans, root, ret); | 1273 | btrfs_abort_transaction(trans, root, ret); |
1215 | fail: | 1274 | fail: |
1216 | dput(parent); | ||
1217 | trans->block_rsv = rsv; | 1275 | trans->block_rsv = rsv; |
1276 | trans->bytes_reserved = 0; | ||
1218 | no_free_objectid: | 1277 | no_free_objectid: |
1219 | kfree(new_root_item); | 1278 | kfree(new_root_item); |
1220 | root_item_alloc_fail: | 1279 | root_item_alloc_fail: |
1221 | btrfs_free_path(path); | 1280 | btrfs_free_path(path); |
1222 | path_alloc_fail: | ||
1223 | btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); | ||
1224 | return ret; | 1281 | return ret; |
1225 | } | 1282 | } |
1226 | 1283 | ||
@@ -1306,13 +1363,13 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, | |||
1306 | struct btrfs_async_commit { | 1363 | struct btrfs_async_commit { |
1307 | struct btrfs_trans_handle *newtrans; | 1364 | struct btrfs_trans_handle *newtrans; |
1308 | struct btrfs_root *root; | 1365 | struct btrfs_root *root; |
1309 | struct delayed_work work; | 1366 | struct work_struct work; |
1310 | }; | 1367 | }; |
1311 | 1368 | ||
1312 | static void do_async_commit(struct work_struct *work) | 1369 | static void do_async_commit(struct work_struct *work) |
1313 | { | 1370 | { |
1314 | struct btrfs_async_commit *ac = | 1371 | struct btrfs_async_commit *ac = |
1315 | container_of(work, struct btrfs_async_commit, work.work); | 1372 | container_of(work, struct btrfs_async_commit, work); |
1316 | 1373 | ||
1317 | /* | 1374 | /* |
1318 | * We've got freeze protection passed with the transaction. | 1375 | * We've got freeze protection passed with the transaction. |
@@ -1340,7 +1397,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, | |||
1340 | if (!ac) | 1397 | if (!ac) |
1341 | return -ENOMEM; | 1398 | return -ENOMEM; |
1342 | 1399 | ||
1343 | INIT_DELAYED_WORK(&ac->work, do_async_commit); | 1400 | INIT_WORK(&ac->work, do_async_commit); |
1344 | ac->root = root; | 1401 | ac->root = root; |
1345 | ac->newtrans = btrfs_join_transaction(root); | 1402 | ac->newtrans = btrfs_join_transaction(root); |
1346 | if (IS_ERR(ac->newtrans)) { | 1403 | if (IS_ERR(ac->newtrans)) { |
@@ -1364,7 +1421,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, | |||
1364 | &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], | 1421 | &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], |
1365 | 1, _THIS_IP_); | 1422 | 1, _THIS_IP_); |
1366 | 1423 | ||
1367 | schedule_delayed_work(&ac->work, 0); | 1424 | schedule_work(&ac->work); |
1368 | 1425 | ||
1369 | /* wait for transaction to start and unblock */ | 1426 | /* wait for transaction to start and unblock */ |
1370 | if (wait_for_unblock) | 1427 | if (wait_for_unblock) |
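
Note: the three hunks above drop the delayed_work indirection. The work item was only ever scheduled with a zero delay, so a plain work_struct suffices and container_of() no longer needs the awkward work.work member. A minimal sketch of the converted pattern, using a hypothetical my_async context rather than btrfs_async_commit:

#include <linux/workqueue.h>
#include <linux/slab.h>

struct my_async {
        struct work_struct work;        /* plain work item, no timer */
        int payload;
};

static void my_async_fn(struct work_struct *work)
{
        /* resolves against .work directly, not .work.work */
        struct my_async *ac = container_of(work, struct my_async, work);

        pr_info("payload=%d\n", ac->payload);
        kfree(ac);
}

static int kick_async(int payload)
{
        struct my_async *ac = kmalloc(sizeof(*ac), GFP_NOFS);

        if (!ac)
                return -ENOMEM;
        ac->payload = payload;
        INIT_WORK(&ac->work, my_async_fn);      /* was INIT_DELAYED_WORK */
        schedule_work(&ac->work);               /* was schedule_delayed_work(&w, 0) */
        return 0;
}
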
@@ -1384,6 +1441,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, | |||
1384 | struct btrfs_root *root, int err) | 1441 | struct btrfs_root *root, int err) |
1385 | { | 1442 | { |
1386 | struct btrfs_transaction *cur_trans = trans->transaction; | 1443 | struct btrfs_transaction *cur_trans = trans->transaction; |
1444 | DEFINE_WAIT(wait); | ||
1387 | 1445 | ||
1388 | WARN_ON(trans->use_count > 1); | 1446 | WARN_ON(trans->use_count > 1); |
1389 | 1447 | ||
@@ -1392,8 +1450,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, | |||
1392 | spin_lock(&root->fs_info->trans_lock); | 1450 | spin_lock(&root->fs_info->trans_lock); |
1393 | list_del_init(&cur_trans->list); | 1451 | list_del_init(&cur_trans->list); |
1394 | if (cur_trans == root->fs_info->running_transaction) { | 1452 | if (cur_trans == root->fs_info->running_transaction) { |
1453 | root->fs_info->trans_no_join = 1; | ||
1454 | spin_unlock(&root->fs_info->trans_lock); | ||
1455 | wait_event(cur_trans->writer_wait, | ||
1456 | atomic_read(&cur_trans->num_writers) == 1); | ||
1457 | |||
1458 | spin_lock(&root->fs_info->trans_lock); | ||
1395 | root->fs_info->running_transaction = NULL; | 1459 | root->fs_info->running_transaction = NULL; |
1396 | root->fs_info->trans_no_join = 0; | ||
1397 | } | 1460 | } |
1398 | spin_unlock(&root->fs_info->trans_lock); | 1461 | spin_unlock(&root->fs_info->trans_lock); |
1399 | 1462 | ||
@@ -1427,7 +1490,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, | |||
1427 | } | 1490 | } |
1428 | 1491 | ||
1429 | if (flush_on_commit || snap_pending) { | 1492 | if (flush_on_commit || snap_pending) { |
1430 | btrfs_start_delalloc_inodes(root, 1); | 1493 | ret = btrfs_start_delalloc_inodes(root, 1); |
1494 | if (ret) | ||
1495 | return ret; | ||
1431 | btrfs_wait_ordered_extents(root, 1); | 1496 | btrfs_wait_ordered_extents(root, 1); |
1432 | } | 1497 | } |
1433 | 1498 | ||
@@ -1449,9 +1514,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, | |||
1449 | * it here and know for sure that nothing new will be added | 1514 |
1450 | * to the list | 1515 | * to the list |
1451 | */ | 1516 | */ |
1452 | btrfs_run_ordered_operations(root, 1); | 1517 | ret = btrfs_run_ordered_operations(trans, root, 1); |
1453 | 1518 | ||
1454 | return 0; | 1519 | return ret; |
1455 | } | 1520 | } |
1456 | 1521 | ||
1457 | /* | 1522 | /* |
@@ -1472,27 +1537,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1472 | int should_grow = 0; | 1537 | int should_grow = 0; |
1473 | unsigned long now = get_seconds(); | 1538 | unsigned long now = get_seconds(); |
1474 | 1539 | ||
1475 | ret = btrfs_run_ordered_operations(root, 0); | 1540 | ret = btrfs_run_ordered_operations(trans, root, 0); |
1476 | if (ret) { | 1541 | if (ret) { |
1477 | btrfs_abort_transaction(trans, root, ret); | 1542 | btrfs_abort_transaction(trans, root, ret); |
1478 | goto cleanup_transaction; | 1543 | btrfs_end_transaction(trans, root); |
1544 | return ret; | ||
1479 | } | 1545 | } |
1480 | 1546 | ||
1481 | /* Stop the commit early if ->aborted is set */ | 1547 | /* Stop the commit early if ->aborted is set */ |
1482 | if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { | 1548 | if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { |
1483 | ret = cur_trans->aborted; | 1549 | ret = cur_trans->aborted; |
1484 | goto cleanup_transaction; | 1550 | btrfs_end_transaction(trans, root); |
1551 | return ret; | ||
1485 | } | 1552 | } |
1486 | 1553 | ||
1487 | /* make a pass through all the delayed refs we have so far | 1554 | /* make a pass through all the delayed refs we have so far |
1488 | * any running procs may add more while we are here | 1555 |
1489 | */ | 1556 | */ |
1490 | ret = btrfs_run_delayed_refs(trans, root, 0); | 1557 | ret = btrfs_run_delayed_refs(trans, root, 0); |
1491 | if (ret) | 1558 | if (ret) { |
1492 | goto cleanup_transaction; | 1559 | btrfs_end_transaction(trans, root); |
1560 | return ret; | ||
1561 | } | ||
1493 | 1562 | ||
1494 | btrfs_trans_release_metadata(trans, root); | 1563 | btrfs_trans_release_metadata(trans, root); |
1495 | trans->block_rsv = NULL; | 1564 | trans->block_rsv = NULL; |
1565 | if (trans->qgroup_reserved) { | ||
1566 | btrfs_qgroup_free(root, trans->qgroup_reserved); | ||
1567 | trans->qgroup_reserved = 0; | ||
1568 | } | ||
1496 | 1569 | ||
1497 | cur_trans = trans->transaction; | 1570 | cur_trans = trans->transaction; |
1498 | 1571 | ||
@@ -1506,8 +1579,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1506 | btrfs_create_pending_block_groups(trans, root); | 1579 | btrfs_create_pending_block_groups(trans, root); |
1507 | 1580 | ||
1508 | ret = btrfs_run_delayed_refs(trans, root, 0); | 1581 | ret = btrfs_run_delayed_refs(trans, root, 0); |
1509 | if (ret) | 1582 | if (ret) { |
1510 | goto cleanup_transaction; | 1583 | btrfs_end_transaction(trans, root); |
1584 | return ret; | ||
1585 | } | ||
1511 | 1586 | ||
1512 | spin_lock(&cur_trans->commit_lock); | 1587 | spin_lock(&cur_trans->commit_lock); |
1513 | if (cur_trans->in_commit) { | 1588 | if (cur_trans->in_commit) { |
@@ -1771,6 +1846,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1771 | cleanup_transaction: | 1846 | cleanup_transaction: |
1772 | btrfs_trans_release_metadata(trans, root); | 1847 | btrfs_trans_release_metadata(trans, root); |
1773 | trans->block_rsv = NULL; | 1848 | trans->block_rsv = NULL; |
1849 | if (trans->qgroup_reserved) { | ||
1850 | btrfs_qgroup_free(root, trans->qgroup_reserved); | ||
1851 | trans->qgroup_reserved = 0; | ||
1852 | } | ||
1774 | btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); | 1853 | btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); |
1775 | // WARN_ON(1); | 1854 | // WARN_ON(1); |
1776 | if (current->journal_info == trans) | 1855 | if (current->journal_info == trans) |
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0e8aa1e6c287..3c8e0d25c8e4 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h | |||
@@ -43,6 +43,7 @@ struct btrfs_transaction { | |||
43 | wait_queue_head_t writer_wait; | 43 | wait_queue_head_t writer_wait; |
44 | wait_queue_head_t commit_wait; | 44 | wait_queue_head_t commit_wait; |
45 | struct list_head pending_snapshots; | 45 | struct list_head pending_snapshots; |
46 | struct list_head ordered_operations; | ||
46 | struct btrfs_delayed_ref_root delayed_refs; | 47 | struct btrfs_delayed_ref_root delayed_refs; |
47 | int aborted; | 48 | int aborted; |
48 | }; | 49 | }; |
@@ -68,6 +69,7 @@ struct btrfs_trans_handle { | |||
68 | struct btrfs_block_rsv *orig_rsv; | 69 | struct btrfs_block_rsv *orig_rsv; |
69 | short aborted; | 70 | short aborted; |
70 | short adding_csums; | 71 | short adding_csums; |
72 | bool allocating_chunk; | ||
71 | enum btrfs_trans_type type; | 73 | enum btrfs_trans_type type; |
72 | /* | 74 | /* |
73 | * this root is only needed to validate that the root passed to | 75 | * this root is only needed to validate that the root passed to |
@@ -82,11 +84,13 @@ struct btrfs_trans_handle { | |||
82 | 84 | ||
83 | struct btrfs_pending_snapshot { | 85 | struct btrfs_pending_snapshot { |
84 | struct dentry *dentry; | 86 | struct dentry *dentry; |
87 | struct inode *dir; | ||
85 | struct btrfs_root *root; | 88 | struct btrfs_root *root; |
86 | struct btrfs_root *snap; | 89 | struct btrfs_root *snap; |
87 | struct btrfs_qgroup_inherit *inherit; | 90 | struct btrfs_qgroup_inherit *inherit; |
88 | /* block reservation for the operation */ | 91 | /* block reservation for the operation */ |
89 | struct btrfs_block_rsv block_rsv; | 92 | struct btrfs_block_rsv block_rsv; |
93 | u64 qgroup_reserved; | ||
90 | /* extra metadata reservation for relocation */ | 94 |
91 | int error; | 95 | int error; |
92 | bool readonly; | 96 | bool readonly; |
@@ -110,13 +114,15 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush( | |||
110 | struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); | 114 | struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); |
111 | struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); | 115 | struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); |
112 | struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); | 116 | struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); |
117 | struct btrfs_trans_handle *btrfs_attach_transaction_barrier( | ||
118 | struct btrfs_root *root); | ||
113 | struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); | 119 | struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); |
114 | int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); | 120 | int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); |
115 | int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, | 121 | int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, |
116 | struct btrfs_root *root); | 122 | struct btrfs_root *root); |
117 | 123 | ||
118 | int btrfs_add_dead_root(struct btrfs_root *root); | 124 | int btrfs_add_dead_root(struct btrfs_root *root); |
119 | int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); | 125 | int btrfs_defrag_root(struct btrfs_root *root); |
120 | int btrfs_clean_old_snapshots(struct btrfs_root *root); | 126 | int btrfs_clean_old_snapshots(struct btrfs_root *root); |
121 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | 127 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, |
122 | struct btrfs_root *root); | 128 | struct btrfs_root *root); |
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 3b580ee8ab1d..94e05c1f118a 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c | |||
@@ -23,13 +23,14 @@ | |||
23 | #include "transaction.h" | 23 | #include "transaction.h" |
24 | #include "locking.h" | 24 | #include "locking.h" |
25 | 25 | ||
26 | /* defrag all the leaves in a given btree. If cache_only == 1, don't read | 26 | /* |
27 | * things from disk, otherwise read all the leaves and try to get key order to | 27 | * Defrag all the leaves in a given btree. |
28 | * Read all the leaves and try to get key order to | ||
28 | * better reflect disk order | 29 | * better reflect disk order |
29 | */ | 30 | */ |
30 | 31 | ||
31 | int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, | 32 | int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, |
32 | struct btrfs_root *root, int cache_only) | 33 | struct btrfs_root *root) |
33 | { | 34 | { |
34 | struct btrfs_path *path = NULL; | 35 | struct btrfs_path *path = NULL; |
35 | struct btrfs_key key; | 36 | struct btrfs_key key; |
@@ -41,9 +42,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, | |||
41 | u64 last_ret = 0; | 42 | u64 last_ret = 0; |
42 | u64 min_trans = 0; | 43 | u64 min_trans = 0; |
43 | 44 | ||
44 | if (cache_only) | ||
45 | goto out; | ||
46 | |||
47 | if (root->fs_info->extent_root == root) { | 45 | if (root->fs_info->extent_root == root) { |
48 | /* | 46 | /* |
49 | * there's recursion here right now in the tree locking, | 47 | * there's recursion here right now in the tree locking, |
@@ -86,11 +84,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, | |||
86 | } | 84 | } |
87 | 85 | ||
88 | path->keep_locks = 1; | 86 | path->keep_locks = 1; |
89 | if (cache_only) | ||
90 | min_trans = root->defrag_trans_start; | ||
91 | 87 | ||
92 | ret = btrfs_search_forward(root, &key, NULL, path, | 88 | ret = btrfs_search_forward(root, &key, NULL, path, min_trans); |
93 | cache_only, min_trans); | ||
94 | if (ret < 0) | 89 | if (ret < 0) |
95 | goto out; | 90 | goto out; |
96 | if (ret > 0) { | 91 | if (ret > 0) { |
@@ -109,11 +104,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, | |||
109 | goto out; | 104 | goto out; |
110 | } | 105 | } |
111 | path->slots[1] = btrfs_header_nritems(path->nodes[1]); | 106 | path->slots[1] = btrfs_header_nritems(path->nodes[1]); |
112 | next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, | 107 | next_key_ret = btrfs_find_next_key(root, path, &key, 1, |
113 | min_trans); | 108 | min_trans); |
114 | ret = btrfs_realloc_node(trans, root, | 109 | ret = btrfs_realloc_node(trans, root, |
115 | path->nodes[1], 0, | 110 | path->nodes[1], 0, |
116 | cache_only, &last_ret, | 111 | &last_ret, |
117 | &root->defrag_progress); | 112 | &root->defrag_progress); |
118 | if (ret) { | 113 | if (ret) { |
119 | WARN_ON(ret == -EAGAIN); | 114 | WARN_ON(ret == -EAGAIN); |
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9027bb1e7466..c7ef569eb22a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
@@ -278,8 +278,7 @@ static int process_one_buffer(struct btrfs_root *log, | |||
278 | struct walk_control *wc, u64 gen) | 278 | struct walk_control *wc, u64 gen) |
279 | { | 279 | { |
280 | if (wc->pin) | 280 | if (wc->pin) |
281 | btrfs_pin_extent_for_log_replay(wc->trans, | 281 | btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, |
282 | log->fs_info->extent_root, | ||
283 | eb->start, eb->len); | 282 | eb->start, eb->len); |
284 | 283 | ||
285 | if (btrfs_buffer_uptodate(eb, gen, 0)) { | 284 | if (btrfs_buffer_uptodate(eb, gen, 0)) { |
@@ -485,7 +484,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | |||
485 | struct btrfs_key *key) | 484 | struct btrfs_key *key) |
486 | { | 485 | { |
487 | int found_type; | 486 | int found_type; |
488 | u64 mask = root->sectorsize - 1; | ||
489 | u64 extent_end; | 487 | u64 extent_end; |
490 | u64 start = key->offset; | 488 | u64 start = key->offset; |
491 | u64 saved_nbytes; | 489 | u64 saved_nbytes; |
@@ -502,7 +500,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | |||
502 | extent_end = start + btrfs_file_extent_num_bytes(eb, item); | 500 | extent_end = start + btrfs_file_extent_num_bytes(eb, item); |
503 | else if (found_type == BTRFS_FILE_EXTENT_INLINE) { | 501 | else if (found_type == BTRFS_FILE_EXTENT_INLINE) { |
504 | size = btrfs_file_extent_inline_len(eb, item); | 502 | size = btrfs_file_extent_inline_len(eb, item); |
505 | extent_end = (start + size + mask) & ~mask; | 503 | extent_end = ALIGN(start + size, root->sectorsize); |
506 | } else { | 504 | } else { |
507 | ret = 0; | 505 | ret = 0; |
508 | goto out; | 506 | goto out; |
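
Note: swapping the open-coded mask arithmetic for ALIGN() is behavior-preserving for any power-of-two sectorsize: with mask = sectorsize - 1, (start + size + mask) & ~mask and ALIGN(start + size, sectorsize) compute the same round-up. A quick userspace check, assuming a 4096-byte sector size (the real value comes from root->sectorsize):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* same shape as the kernel's ALIGN(), valid for power-of-two a */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
        uint64_t sectorsize = 4096;     /* assumed for illustration */
        uint64_t mask = sectorsize - 1;
        uint64_t start = 8192, size = 100;

        uint64_t old_form = (start + size + mask) & ~mask;
        uint64_t new_form = ALIGN_UP(start + size, sectorsize);

        assert(old_form == new_form);
        printf("extent_end = %llu\n", (unsigned long long)new_form); /* 12288 */
        return 0;
}
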
@@ -2281,6 +2279,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
2281 | unsigned long log_transid = 0; | 2279 | unsigned long log_transid = 0; |
2282 | 2280 | ||
2283 | mutex_lock(&root->log_mutex); | 2281 | mutex_lock(&root->log_mutex); |
2282 | log_transid = root->log_transid; | ||
2284 | index1 = root->log_transid % 2; | 2283 | index1 = root->log_transid % 2; |
2285 | if (atomic_read(&root->log_commit[index1])) { | 2284 | if (atomic_read(&root->log_commit[index1])) { |
2286 | wait_log_commit(trans, root, root->log_transid); | 2285 | wait_log_commit(trans, root, root->log_transid); |
@@ -2308,11 +2307,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
2308 | /* bail out if we need to do a full commit */ | 2307 | /* bail out if we need to do a full commit */ |
2309 | if (root->fs_info->last_trans_log_full_commit == trans->transid) { | 2308 | if (root->fs_info->last_trans_log_full_commit == trans->transid) { |
2310 | ret = -EAGAIN; | 2309 | ret = -EAGAIN; |
2310 | btrfs_free_logged_extents(log, log_transid); | ||
2311 | mutex_unlock(&root->log_mutex); | 2311 | mutex_unlock(&root->log_mutex); |
2312 | goto out; | 2312 | goto out; |
2313 | } | 2313 | } |
2314 | 2314 | ||
2315 | log_transid = root->log_transid; | ||
2316 | if (log_transid % 2 == 0) | 2315 | if (log_transid % 2 == 0) |
2317 | mark = EXTENT_DIRTY; | 2316 | mark = EXTENT_DIRTY; |
2318 | else | 2317 | else |
@@ -2324,6 +2323,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
2324 | ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); | 2323 | ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); |
2325 | if (ret) { | 2324 | if (ret) { |
2326 | btrfs_abort_transaction(trans, root, ret); | 2325 | btrfs_abort_transaction(trans, root, ret); |
2326 | btrfs_free_logged_extents(log, log_transid); | ||
2327 | mutex_unlock(&root->log_mutex); | 2327 | mutex_unlock(&root->log_mutex); |
2328 | goto out; | 2328 | goto out; |
2329 | } | 2329 | } |
@@ -2363,6 +2363,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
2363 | } | 2363 | } |
2364 | root->fs_info->last_trans_log_full_commit = trans->transid; | 2364 | root->fs_info->last_trans_log_full_commit = trans->transid; |
2365 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); | 2365 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); |
2366 | btrfs_free_logged_extents(log, log_transid); | ||
2366 | mutex_unlock(&log_root_tree->log_mutex); | 2367 | mutex_unlock(&log_root_tree->log_mutex); |
2367 | ret = -EAGAIN; | 2368 | ret = -EAGAIN; |
2368 | goto out; | 2369 | goto out; |
@@ -2373,6 +2374,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
2373 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); | 2374 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); |
2374 | wait_log_commit(trans, log_root_tree, | 2375 | wait_log_commit(trans, log_root_tree, |
2375 | log_root_tree->log_transid); | 2376 | log_root_tree->log_transid); |
2377 | btrfs_free_logged_extents(log, log_transid); | ||
2376 | mutex_unlock(&log_root_tree->log_mutex); | 2378 | mutex_unlock(&log_root_tree->log_mutex); |
2377 | ret = 0; | 2379 | ret = 0; |
2378 | goto out; | 2380 | goto out; |
@@ -2392,6 +2394,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
2392 | */ | 2394 | */ |
2393 | if (root->fs_info->last_trans_log_full_commit == trans->transid) { | 2395 | if (root->fs_info->last_trans_log_full_commit == trans->transid) { |
2394 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); | 2396 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); |
2397 | btrfs_free_logged_extents(log, log_transid); | ||
2395 | mutex_unlock(&log_root_tree->log_mutex); | 2398 | mutex_unlock(&log_root_tree->log_mutex); |
2396 | ret = -EAGAIN; | 2399 | ret = -EAGAIN; |
2397 | goto out_wake_log_root; | 2400 | goto out_wake_log_root; |
@@ -2402,10 +2405,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
2402 | EXTENT_DIRTY | EXTENT_NEW); | 2405 | EXTENT_DIRTY | EXTENT_NEW); |
2403 | if (ret) { | 2406 | if (ret) { |
2404 | btrfs_abort_transaction(trans, root, ret); | 2407 | btrfs_abort_transaction(trans, root, ret); |
2408 | btrfs_free_logged_extents(log, log_transid); | ||
2405 | mutex_unlock(&log_root_tree->log_mutex); | 2409 | mutex_unlock(&log_root_tree->log_mutex); |
2406 | goto out_wake_log_root; | 2410 | goto out_wake_log_root; |
2407 | } | 2411 | } |
2408 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); | 2412 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); |
2413 | btrfs_wait_logged_extents(log, log_transid); | ||
2409 | 2414 | ||
2410 | btrfs_set_super_log_root(root->fs_info->super_for_commit, | 2415 | btrfs_set_super_log_root(root->fs_info->super_for_commit, |
2411 | log_root_tree->node->start); | 2416 | log_root_tree->node->start); |
@@ -2461,8 +2466,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans, | |||
2461 | .process_func = process_one_buffer | 2466 | .process_func = process_one_buffer |
2462 | }; | 2467 | }; |
2463 | 2468 | ||
2464 | ret = walk_log_tree(trans, log, &wc); | 2469 | if (trans) { |
2465 | BUG_ON(ret); | 2470 | ret = walk_log_tree(trans, log, &wc); |
2471 | BUG_ON(ret); | ||
2472 | } | ||
2466 | 2473 | ||
2467 | while (1) { | 2474 | while (1) { |
2468 | ret = find_first_extent_bit(&log->dirty_log_pages, | 2475 | ret = find_first_extent_bit(&log->dirty_log_pages, |
@@ -2475,6 +2482,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans, | |||
2475 | EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); | 2482 | EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); |
2476 | } | 2483 | } |
2477 | 2484 | ||
2485 | /* | ||
2486 | * We may have short-circuited the log tree with the full commit logic | ||
2487 | * and left ordered extents on our list, so clear these out to keep us | ||
2488 | * from leaking inodes and memory. | ||
2489 | */ | ||
2490 | btrfs_free_logged_extents(log, 0); | ||
2491 | btrfs_free_logged_extents(log, 1); | ||
2492 | |||
2478 | free_extent_buffer(log->node); | 2493 | free_extent_buffer(log->node); |
2479 | kfree(log); | 2494 | kfree(log); |
2480 | } | 2495 | } |
@@ -2724,7 +2739,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, | |||
2724 | path->keep_locks = 1; | 2739 | path->keep_locks = 1; |
2725 | 2740 | ||
2726 | ret = btrfs_search_forward(root, &min_key, &max_key, | 2741 | ret = btrfs_search_forward(root, &min_key, &max_key, |
2727 | path, 0, trans->transid); | 2742 | path, trans->transid); |
2728 | 2743 | ||
2729 | /* | 2744 | /* |
2730 | * we didn't find anything from this transaction, see if there | 2745 | * we didn't find anything from this transaction, see if there |
@@ -3271,16 +3286,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans, | |||
3271 | struct btrfs_root *log = root->log_root; | 3286 | struct btrfs_root *log = root->log_root; |
3272 | struct btrfs_file_extent_item *fi; | 3287 | struct btrfs_file_extent_item *fi; |
3273 | struct extent_buffer *leaf; | 3288 | struct extent_buffer *leaf; |
3289 | struct btrfs_ordered_extent *ordered; | ||
3274 | struct list_head ordered_sums; | 3290 | struct list_head ordered_sums; |
3275 | struct btrfs_map_token token; | 3291 | struct btrfs_map_token token; |
3276 | struct btrfs_key key; | 3292 | struct btrfs_key key; |
3277 | u64 csum_offset = em->mod_start - em->start; | 3293 | u64 mod_start = em->mod_start; |
3278 | u64 csum_len = em->mod_len; | 3294 | u64 mod_len = em->mod_len; |
3295 | u64 csum_offset; | ||
3296 | u64 csum_len; | ||
3279 | u64 extent_offset = em->start - em->orig_start; | 3297 | u64 extent_offset = em->start - em->orig_start; |
3280 | u64 block_len; | 3298 | u64 block_len; |
3281 | int ret; | 3299 | int ret; |
3300 | int index = log->log_transid % 2; | ||
3282 | bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; | 3301 | bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; |
3283 | 3302 | ||
3303 | insert: | ||
3284 | INIT_LIST_HEAD(&ordered_sums); | 3304 | INIT_LIST_HEAD(&ordered_sums); |
3285 | btrfs_init_map_token(&token); | 3305 | btrfs_init_map_token(&token); |
3286 | key.objectid = btrfs_ino(inode); | 3306 | key.objectid = btrfs_ino(inode); |
@@ -3296,6 +3316,23 @@ static int log_one_extent(struct btrfs_trans_handle *trans, | |||
3296 | leaf = path->nodes[0]; | 3316 | leaf = path->nodes[0]; |
3297 | fi = btrfs_item_ptr(leaf, path->slots[0], | 3317 | fi = btrfs_item_ptr(leaf, path->slots[0], |
3298 | struct btrfs_file_extent_item); | 3318 | struct btrfs_file_extent_item); |
3319 | |||
3320 | /* | ||
3321 | * If we are overwriting an inline extent with a real one then we need | ||
3322 | * to just delete the inline extent as it may not be large enough to | ||
3323 | * have the entire file_extent_item. | ||
3324 | */ | ||
3325 | if (ret && btrfs_token_file_extent_type(leaf, fi, &token) == | ||
3326 | BTRFS_FILE_EXTENT_INLINE) { | ||
3327 | ret = btrfs_del_item(trans, log, path); | ||
3328 | btrfs_release_path(path); | ||
3329 | if (ret) { | ||
3330 | path->really_keep_locks = 0; | ||
3331 | return ret; | ||
3332 | } | ||
3333 | goto insert; | ||
3334 | } | ||
3335 | |||
3299 | btrfs_set_token_file_extent_generation(leaf, fi, em->generation, | 3336 | btrfs_set_token_file_extent_generation(leaf, fi, em->generation, |
3300 | &token); | 3337 | &token); |
3301 | if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { | 3338 | if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { |
@@ -3362,6 +3399,92 @@ static int log_one_extent(struct btrfs_trans_handle *trans, | |||
3362 | csum_len = block_len; | 3399 | csum_len = block_len; |
3363 | } | 3400 | } |
3364 | 3401 | ||
3402 | /* | ||
3403 | * First check and see if our csums are on our outstanding ordered | ||
3404 | * extents. | ||
3405 | */ | ||
3406 | again: | ||
3407 | spin_lock_irq(&log->log_extents_lock[index]); | ||
3408 | list_for_each_entry(ordered, &log->logged_list[index], log_list) { | ||
3409 | struct btrfs_ordered_sum *sum; | ||
3410 | |||
3411 | if (!mod_len) | ||
3412 | break; | ||
3413 | |||
3414 | if (ordered->inode != inode) | ||
3415 | continue; | ||
3416 | |||
3417 | if (ordered->file_offset + ordered->len <= mod_start || | ||
3418 | mod_start + mod_len <= ordered->file_offset) | ||
3419 | continue; | ||
3420 | |||
3421 | /* | ||
3422 | * We are going to copy all the csums on this ordered extent, so | ||
3423 | * go ahead and adjust mod_start and mod_len in case this | ||
3424 | * ordered extent has already been logged. | ||
3425 | */ | ||
3426 | if (ordered->file_offset > mod_start) { | ||
3427 | if (ordered->file_offset + ordered->len >= | ||
3428 | mod_start + mod_len) | ||
3429 | mod_len = ordered->file_offset - mod_start; | ||
3430 | /* | ||
3431 | * If we have this case | ||
3432 | * | ||
3433 | * |--------- logged extent ---------| | ||
3434 | * |----- ordered extent ----| | ||
3435 | * | ||
3436 | * Just don't mess with mod_start and mod_len, we'll | ||
3437 | * just end up logging more csums than we need and it | ||
3438 | * will be ok. | ||
3439 | */ | ||
3440 | } else { | ||
3441 | if (ordered->file_offset + ordered->len < | ||
3442 | mod_start + mod_len) { | ||
3443 | mod_len = (mod_start + mod_len) - | ||
3444 | (ordered->file_offset + ordered->len); | ||
3445 | mod_start = ordered->file_offset + | ||
3446 | ordered->len; | ||
3447 | } else { | ||
3448 | mod_len = 0; | ||
3449 | } | ||
3450 | } | ||
3451 | |||
3452 | /* | ||
3453 | * To keep us from looping for the above case of an ordered | ||
3454 | * extent that falls inside of the logged extent. | ||
3455 | */ | ||
3456 | if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, | ||
3457 | &ordered->flags)) | ||
3458 | continue; | ||
3459 | atomic_inc(&ordered->refs); | ||
3460 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
3461 | /* | ||
3462 | * we've dropped the lock, we must either break or | ||
3463 | * start over after this. | ||
3464 | */ | ||
3465 | |||
3466 | wait_event(ordered->wait, ordered->csum_bytes_left == 0); | ||
3467 | |||
3468 | list_for_each_entry(sum, &ordered->list, list) { | ||
3469 | ret = btrfs_csum_file_blocks(trans, log, sum); | ||
3470 | if (ret) { | ||
3471 | btrfs_put_ordered_extent(ordered); | ||
3472 | goto unlocked; | ||
3473 | } | ||
3474 | } | ||
3475 | btrfs_put_ordered_extent(ordered); | ||
3476 | goto again; | ||
3477 | |||
3478 | } | ||
3479 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
3480 | unlocked: | ||
3481 | |||
3482 | if (!mod_len || ret) | ||
3483 | return ret; | ||
3484 | |||
3485 | csum_offset = mod_start - em->start; | ||
3486 | csum_len = mod_len; | ||
3487 | |||
3365 | /* block start is already adjusted for the file extent offset. */ | 3488 | /* block start is already adjusted for the file extent offset. */ |
3366 | ret = btrfs_lookup_csums_range(log->fs_info->csum_root, | 3489 | ret = btrfs_lookup_csums_range(log->fs_info->csum_root, |
3367 | em->block_start + csum_offset, | 3490 | em->block_start + csum_offset, |
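
Note: the loop added above trims the range [mod_start, mod_start + mod_len) against every overlapping ordered extent, since those csums are copied wholesale from the ordered extent and must not be looked up again; only the still-uncovered remainder falls through to btrfs_lookup_csums_range(). The three overlap cases reduce to the following interval arithmetic (illustrative userspace sketch, not the btrfs code itself):

#include <stdint.h>
#include <stdio.h>

/* Trim the logged range against one ordered extent, mirroring the
 * three cases in the hunk above. Names are illustrative only. */
static void trim(uint64_t *mod_start, uint64_t *mod_len,
                 uint64_t ord_off, uint64_t ord_len)
{
        uint64_t mod_end = *mod_start + *mod_len;
        uint64_t ord_end = ord_off + ord_len;

        if (ord_end <= *mod_start || mod_end <= ord_off)
                return;                         /* no overlap */

        if (ord_off > *mod_start) {
                if (ord_end >= mod_end)
                        *mod_len = ord_off - *mod_start; /* tail covered */
                /* else: ordered extent sits inside the logged range;
                 * leave it, a few extra csums are harmless */
        } else if (ord_end < mod_end) {
                *mod_len = mod_end - ord_end;            /* head covered */
                *mod_start = ord_end;
        } else {
                *mod_len = 0;                            /* fully covered */
        }
}

int main(void)
{
        uint64_t start = 0, len = 16384;

        trim(&start, &len, 12288, 8192);        /* ordered covers the tail */
        printf("%llu +%llu\n", (unsigned long long)start,
               (unsigned long long)len);        /* prints: 0 +12288 */
        return 0;
}
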
@@ -3393,6 +3516,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, | |||
3393 | struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; | 3516 | struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; |
3394 | u64 test_gen; | 3517 | u64 test_gen; |
3395 | int ret = 0; | 3518 | int ret = 0; |
3519 | int num = 0; | ||
3396 | 3520 | ||
3397 | INIT_LIST_HEAD(&extents); | 3521 | INIT_LIST_HEAD(&extents); |
3398 | 3522 | ||
@@ -3401,16 +3525,31 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, | |||
3401 | 3525 | ||
3402 | list_for_each_entry_safe(em, n, &tree->modified_extents, list) { | 3526 | list_for_each_entry_safe(em, n, &tree->modified_extents, list) { |
3403 | list_del_init(&em->list); | 3527 | list_del_init(&em->list); |
3528 | |||
3529 | /* | ||
3530 | * Just an arbitrary number, this can be really CPU intensive | ||
3531 | * once we start getting a lot of extents, and really once we | ||
3532 | * have a bunch of extents we just want to commit since it will | ||
3533 | * be faster. | ||
3534 | */ | ||
3535 | if (++num > 32768) { | ||
3536 | list_del_init(&tree->modified_extents); | ||
3537 | ret = -EFBIG; | ||
3538 | goto process; | ||
3539 | } | ||
3540 | |||
3404 | if (em->generation <= test_gen) | 3541 | if (em->generation <= test_gen) |
3405 | continue; | 3542 | continue; |
3406 | /* Need a ref to keep it from getting evicted from cache */ | 3543 | /* Need a ref to keep it from getting evicted from cache */ |
3407 | atomic_inc(&em->refs); | 3544 | atomic_inc(&em->refs); |
3408 | set_bit(EXTENT_FLAG_LOGGING, &em->flags); | 3545 | set_bit(EXTENT_FLAG_LOGGING, &em->flags); |
3409 | list_add_tail(&em->list, &extents); | 3546 | list_add_tail(&em->list, &extents); |
3547 | num++; | ||
3410 | } | 3548 | } |
3411 | 3549 | ||
3412 | list_sort(NULL, &extents, extent_cmp); | 3550 | list_sort(NULL, &extents, extent_cmp); |
3413 | 3551 | ||
3552 | process: | ||
3414 | while (!list_empty(&extents)) { | 3553 | while (!list_empty(&extents)) { |
3415 | em = list_entry(extents.next, struct extent_map, list); | 3554 | em = list_entry(extents.next, struct extent_map, list); |
3416 | 3555 | ||
@@ -3513,6 +3652,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, | |||
3513 | 3652 | ||
3514 | mutex_lock(&BTRFS_I(inode)->log_mutex); | 3653 | mutex_lock(&BTRFS_I(inode)->log_mutex); |
3515 | 3654 | ||
3655 | btrfs_get_logged_extents(log, inode); | ||
3656 | |||
3516 | /* | 3657 | /* |
3517 | * a brute force approach to making sure we get the most uptodate | 3658 | * a brute force approach to making sure we get the most uptodate |
3518 | * copies of everything. | 3659 | * copies of everything. |
@@ -3558,7 +3699,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, | |||
3558 | while (1) { | 3699 | while (1) { |
3559 | ins_nr = 0; | 3700 | ins_nr = 0; |
3560 | ret = btrfs_search_forward(root, &min_key, &max_key, | 3701 | ret = btrfs_search_forward(root, &min_key, &max_key, |
3561 | path, 0, trans->transid); | 3702 | path, trans->transid); |
3562 | if (ret != 0) | 3703 | if (ret != 0) |
3563 | break; | 3704 | break; |
3564 | again: | 3705 | again: |
@@ -3656,6 +3797,8 @@ log_extents: | |||
3656 | BTRFS_I(inode)->logged_trans = trans->transid; | 3797 | BTRFS_I(inode)->logged_trans = trans->transid; |
3657 | BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; | 3798 | BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; |
3658 | out_unlock: | 3799 | out_unlock: |
3800 | if (err) | ||
3801 | btrfs_free_logged_extents(log, log->log_transid); | ||
3659 | mutex_unlock(&BTRFS_I(inode)->log_mutex); | 3802 | mutex_unlock(&BTRFS_I(inode)->log_mutex); |
3660 | 3803 | ||
3661 | btrfs_free_path(path); | 3804 | btrfs_free_path(path); |
@@ -3822,7 +3965,6 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, | |||
3822 | end_trans: | 3965 | end_trans: |
3823 | dput(old_parent); | 3966 | dput(old_parent); |
3824 | if (ret < 0) { | 3967 | if (ret < 0) { |
3825 | WARN_ON(ret != -ENOSPC); | ||
3826 | root->fs_info->last_trans_log_full_commit = trans->transid; | 3968 | root->fs_info->last_trans_log_full_commit = trans->transid; |
3827 | ret = 1; | 3969 | ret = 1; |
3828 | } | 3970 | } |
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 99be4c138db6..ddc61cad0080 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c | |||
@@ -5,7 +5,7 @@ | |||
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/slab.h> | 7 | #include <linux/slab.h> |
8 | #include <linux/module.h> | 8 | #include <linux/export.h> |
9 | #include "ulist.h" | 9 | #include "ulist.h" |
10 | 10 | ||
11 | /* | 11 | /* |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cbb7f4b1672..35bb2d4ed29f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -25,6 +25,8 @@ | |||
25 | #include <linux/capability.h> | 25 | #include <linux/capability.h> |
26 | #include <linux/ratelimit.h> | 26 | #include <linux/ratelimit.h> |
27 | #include <linux/kthread.h> | 27 | #include <linux/kthread.h> |
28 | #include <linux/raid/pq.h> | ||
29 | #include <asm/div64.h> | ||
28 | #include "compat.h" | 30 | #include "compat.h" |
29 | #include "ctree.h" | 31 | #include "ctree.h" |
30 | #include "extent_map.h" | 32 | #include "extent_map.h" |
@@ -32,6 +34,7 @@ | |||
32 | #include "transaction.h" | 34 | #include "transaction.h" |
33 | #include "print-tree.h" | 35 | #include "print-tree.h" |
34 | #include "volumes.h" | 36 | #include "volumes.h" |
37 | #include "raid56.h" | ||
35 | #include "async-thread.h" | 38 | #include "async-thread.h" |
36 | #include "check-integrity.h" | 39 | #include "check-integrity.h" |
37 | #include "rcu-string.h" | 40 | #include "rcu-string.h" |
@@ -647,6 +650,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) | |||
647 | new_device->writeable = 0; | 650 | new_device->writeable = 0; |
648 | new_device->in_fs_metadata = 0; | 651 | new_device->in_fs_metadata = 0; |
649 | new_device->can_discard = 0; | 652 | new_device->can_discard = 0; |
653 | spin_lock_init(&new_device->io_lock); | ||
650 | list_replace_rcu(&device->dev_list, &new_device->dev_list); | 654 | list_replace_rcu(&device->dev_list, &new_device->dev_list); |
651 | 655 | ||
652 | call_rcu(&device->rcu, free_device); | 656 | call_rcu(&device->rcu, free_device); |
@@ -792,26 +796,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | |||
792 | return ret; | 796 | return ret; |
793 | } | 797 | } |
794 | 798 | ||
799 | /* | ||
800 | * Look for a btrfs signature on a device. This may be called out of the mount path | ||
801 | * and we are not allowed to call set_blocksize during the scan. The superblock | ||
802 | * is read via pagecache | ||
803 | */ | ||
795 | int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | 804 | int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, |
796 | struct btrfs_fs_devices **fs_devices_ret) | 805 | struct btrfs_fs_devices **fs_devices_ret) |
797 | { | 806 | { |
798 | struct btrfs_super_block *disk_super; | 807 | struct btrfs_super_block *disk_super; |
799 | struct block_device *bdev; | 808 | struct block_device *bdev; |
800 | struct buffer_head *bh; | 809 | struct page *page; |
801 | int ret; | 810 | void *p; |
811 | int ret = -EINVAL; | ||
802 | u64 devid; | 812 | u64 devid; |
803 | u64 transid; | 813 | u64 transid; |
804 | u64 total_devices; | 814 | u64 total_devices; |
815 | u64 bytenr; | ||
816 | pgoff_t index; | ||
805 | 817 | ||
818 | /* | ||
819 | * we would like to check all the supers, but that would make | ||
820 | * a btrfs mount succeed after a mkfs from a different FS. | ||
821 | * So, we need to add a special mount option to scan for | ||
822 | * later supers, using BTRFS_SUPER_MIRROR_MAX instead | ||
823 | */ | ||
824 | bytenr = btrfs_sb_offset(0); | ||
806 | flags |= FMODE_EXCL; | 825 | flags |= FMODE_EXCL; |
807 | mutex_lock(&uuid_mutex); | 826 | mutex_lock(&uuid_mutex); |
808 | ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); | 827 | |
809 | if (ret) | 828 | bdev = blkdev_get_by_path(path, flags, holder); |
829 | |||
830 | if (IS_ERR(bdev)) { | ||
831 | ret = PTR_ERR(bdev); | ||
810 | goto error; | 832 | goto error; |
811 | disk_super = (struct btrfs_super_block *)bh->b_data; | 833 | } |
834 | |||
835 | /* make sure our super fits in the device */ | ||
836 | if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) | ||
837 | goto error_bdev_put; | ||
838 | |||
839 | /* make sure our super fits in the page */ | ||
840 | if (sizeof(*disk_super) > PAGE_CACHE_SIZE) | ||
841 | goto error_bdev_put; | ||
842 | |||
843 | /* make sure our super doesn't straddle pages on disk */ | ||
844 | index = bytenr >> PAGE_CACHE_SHIFT; | ||
845 | if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) | ||
846 | goto error_bdev_put; | ||
847 | |||
848 | /* pull in the page with our super */ | ||
849 | page = read_cache_page_gfp(bdev->bd_inode->i_mapping, | ||
850 | index, GFP_NOFS); | ||
851 | |||
852 | if (IS_ERR_OR_NULL(page)) | ||
853 | goto error_bdev_put; | ||
854 | |||
855 | p = kmap(page); | ||
856 | |||
857 | /* align our pointer to the offset of the super block */ | ||
858 | disk_super = p + (bytenr & ~PAGE_CACHE_MASK); | ||
859 | |||
860 | if (btrfs_super_bytenr(disk_super) != bytenr || | ||
861 | disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) | ||
862 | goto error_unmap; | ||
863 | |||
812 | devid = btrfs_stack_device_id(&disk_super->dev_item); | 864 | devid = btrfs_stack_device_id(&disk_super->dev_item); |
813 | transid = btrfs_super_generation(disk_super); | 865 | transid = btrfs_super_generation(disk_super); |
814 | total_devices = btrfs_super_num_devices(disk_super); | 866 | total_devices = btrfs_super_num_devices(disk_super); |
867 | |||
815 | if (disk_super->label[0]) { | 868 | if (disk_super->label[0]) { |
816 | if (disk_super->label[BTRFS_LABEL_SIZE - 1]) | 869 | if (disk_super->label[BTRFS_LABEL_SIZE - 1]) |
817 | disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; | 870 | disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; |
@@ -819,12 +872,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | |||
819 | } else { | 872 | } else { |
820 | printk(KERN_INFO "device fsid %pU ", disk_super->fsid); | 873 | printk(KERN_INFO "device fsid %pU ", disk_super->fsid); |
821 | } | 874 | } |
875 | |||
822 | printk(KERN_CONT "devid %llu transid %llu %s\n", | 876 | printk(KERN_CONT "devid %llu transid %llu %s\n", |
823 | (unsigned long long)devid, (unsigned long long)transid, path); | 877 | (unsigned long long)devid, (unsigned long long)transid, path); |
878 | |||
824 | ret = device_list_add(path, disk_super, devid, fs_devices_ret); | 879 | ret = device_list_add(path, disk_super, devid, fs_devices_ret); |
825 | if (!ret && fs_devices_ret) | 880 | if (!ret && fs_devices_ret) |
826 | (*fs_devices_ret)->total_devices = total_devices; | 881 | (*fs_devices_ret)->total_devices = total_devices; |
827 | brelse(bh); | 882 | |
883 | error_unmap: | ||
884 | kunmap(page); | ||
885 | page_cache_release(page); | ||
886 | |||
887 | error_bdev_put: | ||
828 | blkdev_put(bdev, flags); | 888 | blkdev_put(bdev, flags); |
829 | error: | 889 | error: |
830 | mutex_unlock(&uuid_mutex); | 890 | mutex_unlock(&uuid_mutex); |
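
Note: as the new comment says, the scan path may run against a device that is already mounted, so it can no longer go through a buffer_head (which would require set_blocksize()); the superblock page is pulled straight from the block device's page cache instead. The core of that technique, reduced to a sketch with the same guards:

/*
 * Sketch only: pull one superblock page through the block device's
 * page cache. Caller must kunmap(page) and page_cache_release(page).
 */
static struct btrfs_super_block *read_super_page(struct block_device *bdev,
                                                 u64 bytenr,
                                                 struct page **ret_page)
{
        pgoff_t index = bytenr >> PAGE_CACHE_SHIFT;
        struct page *page;
        void *p;

        /* reject a super that would straddle a page boundary */
        if ((bytenr + sizeof(struct btrfs_super_block) - 1)
            >> PAGE_CACHE_SHIFT != index)
                return NULL;

        page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index,
                                   GFP_NOFS);
        if (IS_ERR_OR_NULL(page))
                return NULL;

        p = kmap(page);
        *ret_page = page;
        /* offset of the super within its page */
        return p + (bytenr & ~PAGE_CACHE_MASK);
}
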
@@ -1372,14 +1432,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1372 | u64 devid; | 1432 | u64 devid; |
1373 | u64 num_devices; | 1433 | u64 num_devices; |
1374 | u8 *dev_uuid; | 1434 | u8 *dev_uuid; |
1435 | unsigned seq; | ||
1375 | int ret = 0; | 1436 | int ret = 0; |
1376 | bool clear_super = false; | 1437 | bool clear_super = false; |
1377 | 1438 | ||
1378 | mutex_lock(&uuid_mutex); | 1439 | mutex_lock(&uuid_mutex); |
1379 | 1440 | ||
1380 | all_avail = root->fs_info->avail_data_alloc_bits | | 1441 | do { |
1381 | root->fs_info->avail_system_alloc_bits | | 1442 | seq = read_seqbegin(&root->fs_info->profiles_lock); |
1382 | root->fs_info->avail_metadata_alloc_bits; | 1443 | |
1444 | all_avail = root->fs_info->avail_data_alloc_bits | | ||
1445 | root->fs_info->avail_system_alloc_bits | | ||
1446 | root->fs_info->avail_metadata_alloc_bits; | ||
1447 | } while (read_seqretry(&root->fs_info->profiles_lock, seq)); | ||
1383 | 1448 | ||
1384 | num_devices = root->fs_info->fs_devices->num_devices; | 1449 | num_devices = root->fs_info->fs_devices->num_devices; |
1385 | btrfs_dev_replace_lock(&root->fs_info->dev_replace); | 1450 | btrfs_dev_replace_lock(&root->fs_info->dev_replace); |
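
Note: sampling the three avail_*_alloc_bits words now goes through a seqlock, which gives readers a mutually consistent snapshot without taking a lock the allocator would contend on: if a writer raced, read_seqretry() reports a stale sequence and the reads are simply retried. The reader idiom in isolation, on a hypothetical pair of fields:

#include <linux/seqlock.h>

struct my_info {
        seqlock_t profiles_lock;
        u64 avail_data_bits;
        u64 avail_meta_bits;
};

static u64 snapshot_profiles(struct my_info *info)
{
        u64 a, b;
        unsigned seq;

        do {
                seq = read_seqbegin(&info->profiles_lock);
                a = info->avail_data_bits;      /* may race with a writer */
                b = info->avail_meta_bits;
        } while (read_seqretry(&info->profiles_lock, seq)); /* retry if torn */

        return a | b;
}

The writer side pairs with write_seqlock()/write_sequnlock() around updates to both fields.
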
@@ -1403,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1403 | goto out; | 1468 | goto out; |
1404 | } | 1469 | } |
1405 | 1470 | ||
1471 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && | ||
1472 | root->fs_info->fs_devices->rw_devices <= 2) { | ||
1473 | printk(KERN_ERR "btrfs: unable to go below two " | ||
1474 | "devices on raid5\n"); | ||
1475 | ret = -EINVAL; | ||
1476 | goto out; | ||
1477 | } | ||
1478 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && | ||
1479 | root->fs_info->fs_devices->rw_devices <= 3) { | ||
1480 | printk(KERN_ERR "btrfs: unable to go below three " | ||
1481 | "devices on raid6\n"); | ||
1482 | ret = -EINVAL; | ||
1483 | goto out; | ||
1484 | } | ||
1485 | |||
1406 | if (strcmp(device_path, "missing") == 0) { | 1486 | if (strcmp(device_path, "missing") == 0) { |
1407 | struct list_head *devices; | 1487 | struct list_head *devices; |
1408 | struct btrfs_device *tmp; | 1488 | struct btrfs_device *tmp; |
@@ -2616,7 +2696,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, | |||
2616 | chunk_used = btrfs_block_group_used(&cache->item); | 2696 | chunk_used = btrfs_block_group_used(&cache->item); |
2617 | 2697 | ||
2618 | if (bargs->usage == 0) | 2698 | if (bargs->usage == 0) |
2619 | user_thresh = 0; | 2699 | user_thresh = 1; |
2620 | else if (bargs->usage > 100) | 2700 | else if (bargs->usage > 100) |
2621 | user_thresh = cache->key.offset; | 2701 | user_thresh = cache->key.offset; |
2622 | else | 2702 | else |
@@ -2664,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf, | |||
2664 | return 0; | 2744 | return 0; |
2665 | 2745 | ||
2666 | if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | | 2746 | if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | |
2667 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) | 2747 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { |
2668 | factor = 2; | 2748 | factor = num_stripes / 2; |
2669 | else | 2749 | } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { |
2670 | factor = 1; | 2750 | factor = num_stripes - 1; |
2671 | factor = num_stripes / factor; | 2751 | } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { |
2752 | factor = num_stripes - 2; | ||
2753 | } else { | ||
2754 | factor = num_stripes; | ||
2755 | } | ||
2672 | 2756 | ||
2673 | for (i = 0; i < num_stripes; i++) { | 2757 | for (i = 0; i < num_stripes; i++) { |
2674 | stripe = btrfs_stripe_nr(chunk, i); | 2758 | stripe = btrfs_stripe_nr(chunk, i); |
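
Note: the divisor now reflects how many stripes actually hold data: mirrored profiles (DUP/RAID1/RAID10) store every byte twice, RAID5 spends one stripe per row on parity, and RAID6 spends two. As a standalone helper (hypothetical, mirroring the branch above):

/* Data-bearing stripes per chunk, by profile; parity stripes and
 * extra mirrors do not count toward logical chunk size. */
static int data_stripe_factor(u64 type, int num_stripes)
{
        if (type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
                    BTRFS_BLOCK_GROUP_RAID10))
                return num_stripes / 2;         /* two copies of everything */
        if (type & BTRFS_BLOCK_GROUP_RAID5)
                return num_stripes - 1;         /* one parity stripe */
        if (type & BTRFS_BLOCK_GROUP_RAID6)
                return num_stripes - 2;         /* P and Q parity stripes */
        return num_stripes;                     /* raid0/single */
}

For example, a 6-device RAID6 chunk has 6 - 2 = 4 data stripes per horizontal stripe.
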
@@ -2985,6 +3069,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
2985 | int mixed = 0; | 3069 | int mixed = 0; |
2986 | int ret; | 3070 | int ret; |
2987 | u64 num_devices; | 3071 | u64 num_devices; |
3072 | unsigned seq; | ||
2988 | 3073 | ||
2989 | if (btrfs_fs_closing(fs_info) || | 3074 | if (btrfs_fs_closing(fs_info) || |
2990 | atomic_read(&fs_info->balance_pause_req) || | 3075 | atomic_read(&fs_info->balance_pause_req) || |
@@ -3027,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
3027 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); | 3112 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); |
3028 | else | 3113 | else |
3029 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | | 3114 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | |
3030 | BTRFS_BLOCK_GROUP_RAID10); | 3115 | BTRFS_BLOCK_GROUP_RAID10 | |
3116 | BTRFS_BLOCK_GROUP_RAID5 | | ||
3117 | BTRFS_BLOCK_GROUP_RAID6); | ||
3031 | 3118 | ||
3032 | if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && | 3119 | if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && |
3033 | (!alloc_profile_is_valid(bctl->data.target, 1) || | 3120 | (!alloc_profile_is_valid(bctl->data.target, 1) || |
@@ -3067,23 +3154,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
3067 | 3154 | ||
3068 | /* allow to reduce meta or sys integrity only if force set */ | 3155 | /* allow to reduce meta or sys integrity only if force set */ |
3069 | allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | | 3156 | allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | |
3070 | BTRFS_BLOCK_GROUP_RAID10; | 3157 | BTRFS_BLOCK_GROUP_RAID10 | |
3071 | if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && | 3158 | BTRFS_BLOCK_GROUP_RAID5 | |
3072 | (fs_info->avail_system_alloc_bits & allowed) && | 3159 | BTRFS_BLOCK_GROUP_RAID6; |
3073 | !(bctl->sys.target & allowed)) || | 3160 | do { |
3074 | ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && | 3161 | seq = read_seqbegin(&fs_info->profiles_lock); |
3075 | (fs_info->avail_metadata_alloc_bits & allowed) && | 3162 | |
3076 | !(bctl->meta.target & allowed))) { | 3163 | if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && |
3077 | if (bctl->flags & BTRFS_BALANCE_FORCE) { | 3164 | (fs_info->avail_system_alloc_bits & allowed) && |
3078 | printk(KERN_INFO "btrfs: force reducing metadata " | 3165 | !(bctl->sys.target & allowed)) || |
3079 | "integrity\n"); | 3166 | ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && |
3080 | } else { | 3167 | (fs_info->avail_metadata_alloc_bits & allowed) && |
3081 | printk(KERN_ERR "btrfs: balance will reduce metadata " | 3168 | !(bctl->meta.target & allowed))) { |
3082 | "integrity, use force if you want this\n"); | 3169 | if (bctl->flags & BTRFS_BALANCE_FORCE) { |
3083 | ret = -EINVAL; | 3170 | printk(KERN_INFO "btrfs: force reducing metadata " |
3084 | goto out; | 3171 | "integrity\n"); |
3172 | } else { | ||
3173 | printk(KERN_ERR "btrfs: balance will reduce metadata " | ||
3174 | "integrity, use force if you want this\n"); | ||
3175 | ret = -EINVAL; | ||
3176 | goto out; | ||
3177 | } | ||
3085 | } | 3178 | } |
3086 | } | 3179 | } while (read_seqretry(&fs_info->profiles_lock, seq)); |
3087 | 3180 | ||
3088 | if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { | 3181 | if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { |
3089 | int num_tolerated_disk_barrier_failures; | 3182 | int num_tolerated_disk_barrier_failures; |
@@ -3127,21 +3220,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
3127 | mutex_lock(&fs_info->balance_mutex); | 3220 | mutex_lock(&fs_info->balance_mutex); |
3128 | atomic_dec(&fs_info->balance_running); | 3221 | atomic_dec(&fs_info->balance_running); |
3129 | 3222 | ||
3130 | if (bargs) { | ||
3131 | memset(bargs, 0, sizeof(*bargs)); | ||
3132 | update_ioctl_balance_args(fs_info, 0, bargs); | ||
3133 | } | ||
3134 | |||
3135 | if ((ret && ret != -ECANCELED && ret != -ENOSPC) || | ||
3136 | balance_need_close(fs_info)) { | ||
3137 | __cancel_balance(fs_info); | ||
3138 | } | ||
3139 | |||
3140 | if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { | 3223 | if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { |
3141 | fs_info->num_tolerated_disk_barrier_failures = | 3224 | fs_info->num_tolerated_disk_barrier_failures = |
3142 | btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); | 3225 | btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); |
3143 | } | 3226 | } |
3144 | 3227 | ||
3228 | if (bargs) { | ||
3229 | memset(bargs, 0, sizeof(*bargs)); | ||
3230 | update_ioctl_balance_args(fs_info, 0, bargs); | ||
3231 | } | ||
3232 | |||
3145 | wake_up(&fs_info->balance_wait_q); | 3233 | wake_up(&fs_info->balance_wait_q); |
3146 | 3234 | ||
3147 | return ret; | 3235 | return ret; |
@@ -3504,13 +3592,86 @@ static int btrfs_cmp_device_info(const void *a, const void *b) | |||
3504 | } | 3592 | } |
3505 | 3593 | ||
3506 | struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { | 3594 | struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { |
3507 | { 2, 1, 0, 4, 2, 2 /* raid10 */ }, | 3595 | [BTRFS_RAID_RAID10] = { |
3508 | { 1, 1, 2, 2, 2, 2 /* raid1 */ }, | 3596 | .sub_stripes = 2, |
3509 | { 1, 2, 1, 1, 1, 2 /* dup */ }, | 3597 | .dev_stripes = 1, |
3510 | { 1, 1, 0, 2, 1, 1 /* raid0 */ }, | 3598 | .devs_max = 0, /* 0 == as many as possible */ |
3511 | { 1, 1, 1, 1, 1, 1 /* single */ }, | 3599 | .devs_min = 4, |
3600 | .devs_increment = 2, | ||
3601 | .ncopies = 2, | ||
3602 | }, | ||
3603 | [BTRFS_RAID_RAID1] = { | ||
3604 | .sub_stripes = 1, | ||
3605 | .dev_stripes = 1, | ||
3606 | .devs_max = 2, | ||
3607 | .devs_min = 2, | ||
3608 | .devs_increment = 2, | ||
3609 | .ncopies = 2, | ||
3610 | }, | ||
3611 | [BTRFS_RAID_DUP] = { | ||
3612 | .sub_stripes = 1, | ||
3613 | .dev_stripes = 2, | ||
3614 | .devs_max = 1, | ||
3615 | .devs_min = 1, | ||
3616 | .devs_increment = 1, | ||
3617 | .ncopies = 2, | ||
3618 | }, | ||
3619 | [BTRFS_RAID_RAID0] = { | ||
3620 | .sub_stripes = 1, | ||
3621 | .dev_stripes = 1, | ||
3622 | .devs_max = 0, | ||
3623 | .devs_min = 2, | ||
3624 | .devs_increment = 1, | ||
3625 | .ncopies = 1, | ||
3626 | }, | ||
3627 | [BTRFS_RAID_SINGLE] = { | ||
3628 | .sub_stripes = 1, | ||
3629 | .dev_stripes = 1, | ||
3630 | .devs_max = 1, | ||
3631 | .devs_min = 1, | ||
3632 | .devs_increment = 1, | ||
3633 | .ncopies = 1, | ||
3634 | }, | ||
3635 | [BTRFS_RAID_RAID5] = { | ||
3636 | .sub_stripes = 1, | ||
3637 | .dev_stripes = 1, | ||
3638 | .devs_max = 0, | ||
3639 | .devs_min = 2, | ||
3640 | .devs_increment = 1, | ||
3641 | .ncopies = 2, | ||
3642 | }, | ||
3643 | [BTRFS_RAID_RAID6] = { | ||
3644 | .sub_stripes = 1, | ||
3645 | .dev_stripes = 1, | ||
3646 | .devs_max = 0, | ||
3647 | .devs_min = 3, | ||
3648 | .devs_increment = 1, | ||
3649 | .ncopies = 3, | ||
3650 | }, | ||
3650 | }, | ||
3512 | }; | 3651 |
3513 | 3652 | ||
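
Note: rewriting btrfs_raid_array with designated initializers keys each row to its BTRFS_RAID_* enum value and names every field, so the table survives enum reordering and reads without the old positional comments. The idiom in miniature, with illustrative types:

/* Designated initializers: rows are keyed by enum value, not position,
 * and any omitted field defaults to zero. Illustrative subset only. */
enum raid_kind { KIND_RAID0, KIND_RAID1, NR_KINDS };

struct raid_attr {
        int devs_min;
        int ncopies;
};

static const struct raid_attr attrs[NR_KINDS] = {
        [KIND_RAID1] = { .devs_min = 2, .ncopies = 2 },
        [KIND_RAID0] = { .devs_min = 2, .ncopies = 1 },
};
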
3653 | static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) | ||
3654 | { | ||
3655 | /* TODO allow them to set a preferred stripe size */ | ||
3656 | return 64 * 1024; | ||
3657 | } | ||
3658 | |||
3659 | static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) | ||
3660 | { | ||
3661 | u64 features; | ||
3662 | |||
3663 | if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) | ||
3664 | return; | ||
3665 | |||
3666 | features = btrfs_super_incompat_flags(info->super_copy); | ||
3667 | if (features & BTRFS_FEATURE_INCOMPAT_RAID56) | ||
3668 | return; | ||
3669 | |||
3670 | features |= BTRFS_FEATURE_INCOMPAT_RAID56; | ||
3671 | btrfs_set_super_incompat_flags(info->super_copy, features); | ||
3672 | printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n"); | ||
3673 | } | ||
3674 | |||
3514 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | 3675 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, |
3515 | struct btrfs_root *extent_root, | 3676 | struct btrfs_root *extent_root, |
3516 | struct map_lookup **map_ret, | 3677 | struct map_lookup **map_ret, |
@@ -3526,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3526 | struct btrfs_device_info *devices_info = NULL; | 3687 | struct btrfs_device_info *devices_info = NULL; |
3527 | u64 total_avail; | 3688 | u64 total_avail; |
3528 | int num_stripes; /* total number of stripes to allocate */ | 3689 | int num_stripes; /* total number of stripes to allocate */ |
3690 | int data_stripes; /* number of stripes that count for | ||
3691 | block group size */ | ||
3529 | int sub_stripes; /* sub_stripes info for map */ | 3692 | int sub_stripes; /* sub_stripes info for map */ |
3530 | int dev_stripes; /* stripes per dev */ | 3693 | int dev_stripes; /* stripes per dev */ |
3531 | int devs_max; /* max devs to use */ | 3694 | int devs_max; /* max devs to use */ |
@@ -3537,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3537 | u64 max_chunk_size; | 3700 | u64 max_chunk_size; |
3538 | u64 stripe_size; | 3701 | u64 stripe_size; |
3539 | u64 num_bytes; | 3702 | u64 num_bytes; |
3703 | u64 raid_stripe_len = BTRFS_STRIPE_LEN; | ||
3540 | int ndevs; | 3704 | int ndevs; |
3541 | int i; | 3705 | int i; |
3542 | int j; | 3706 | int j; |
@@ -3631,12 +3795,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3631 | if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) | 3795 | if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) |
3632 | continue; | 3796 | continue; |
3633 | 3797 | ||
3798 | if (ndevs == fs_devices->rw_devices) { | ||
3799 | WARN(1, "%s: found more than %llu devices\n", | ||
3800 | __func__, fs_devices->rw_devices); | ||
3801 | break; | ||
3802 | } | ||
3634 | devices_info[ndevs].dev_offset = dev_offset; | 3803 | devices_info[ndevs].dev_offset = dev_offset; |
3635 | devices_info[ndevs].max_avail = max_avail; | 3804 | devices_info[ndevs].max_avail = max_avail; |
3636 | devices_info[ndevs].total_avail = total_avail; | 3805 | devices_info[ndevs].total_avail = total_avail; |
3637 | devices_info[ndevs].dev = device; | 3806 | devices_info[ndevs].dev = device; |
3638 | ++ndevs; | 3807 | ++ndevs; |
3639 | WARN_ON(ndevs > fs_devices->rw_devices); | ||
3640 | } | 3808 | } |
3641 | 3809 | ||
3642 | /* | 3810 | /* |
@@ -3662,16 +3830,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3662 | stripe_size = devices_info[ndevs-1].max_avail; | 3830 | stripe_size = devices_info[ndevs-1].max_avail; |
3663 | num_stripes = ndevs * dev_stripes; | 3831 | num_stripes = ndevs * dev_stripes; |
3664 | 3832 | ||
3665 | if (stripe_size * ndevs > max_chunk_size * ncopies) { | 3833 | /* |
3666 | stripe_size = max_chunk_size * ncopies; | 3834 | * this will have to be fixed for RAID1 and RAID10 over |
3667 | do_div(stripe_size, ndevs); | 3835 | * more drives |
3836 | */ | ||
3837 | data_stripes = num_stripes / ncopies; | ||
3838 | |||
3839 | if (type & BTRFS_BLOCK_GROUP_RAID5) { | ||
3840 | raid_stripe_len = find_raid56_stripe_len(ndevs - 1, | ||
3841 | btrfs_super_stripesize(info->super_copy)); | ||
3842 | data_stripes = num_stripes - 1; | ||
3843 | } | ||
3844 | if (type & BTRFS_BLOCK_GROUP_RAID6) { | ||
3845 | raid_stripe_len = find_raid56_stripe_len(ndevs - 2, | ||
3846 | btrfs_super_stripesize(info->super_copy)); | ||
3847 | data_stripes = num_stripes - 2; | ||
3848 | } | ||
3849 | |||
3850 | /* | ||
3851 | * Use the number of data stripes to figure out how big this chunk | ||
3852 | * is really going to be in terms of logical address space, | ||
3853 | * and compare that answer with the max chunk size | ||
3854 | */ | ||
3855 | if (stripe_size * data_stripes > max_chunk_size) { | ||
3856 | u64 mask = (1ULL << 24) - 1; | ||
3857 | stripe_size = max_chunk_size; | ||
3858 | do_div(stripe_size, data_stripes); | ||
3859 | |||
3860 | /* bump the answer up to a 16MB boundary */ | ||
3861 | stripe_size = (stripe_size + mask) & ~mask; | ||
3862 | |||
3863 | /* but don't go higher than the limits we found | ||
3864 | * while searching for free extents | ||
3865 | */ | ||
3866 | if (stripe_size > devices_info[ndevs-1].max_avail) | ||
3867 | stripe_size = devices_info[ndevs-1].max_avail; | ||
3668 | } | 3868 | } |
3669 | 3869 | ||
3670 | do_div(stripe_size, dev_stripes); | 3870 | do_div(stripe_size, dev_stripes); |
3671 | 3871 | ||
3672 | /* align to BTRFS_STRIPE_LEN */ | 3872 | /* align to BTRFS_STRIPE_LEN */ |
3673 | do_div(stripe_size, BTRFS_STRIPE_LEN); | 3873 | do_div(stripe_size, raid_stripe_len); |
3674 | stripe_size *= BTRFS_STRIPE_LEN; | 3874 | stripe_size *= raid_stripe_len; |
3675 | 3875 | ||
3676 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); | 3876 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); |
3677 | if (!map) { | 3877 | if (!map) { |
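
Note: when the chunk would exceed max_chunk_size, stripe_size is recomputed from the data-stripe count and then bumped to a 16MB boundary with the standard power-of-two round-up: for mask = 2^24 - 1, (x + mask) & ~mask is the smallest multiple of 16MB that is >= x. A small numeric check:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t mask = (1ULL << 24) - 1;       /* 16MB - 1 */
        uint64_t x = 50ULL * 1024 * 1024 + 123; /* just over 50MB */
        uint64_t up = (x + mask) & ~mask;

        printf("%llu -> %llu\n", (unsigned long long)x,
               (unsigned long long)up);         /* rounds to 64MB (4 * 16MB) */
        return 0;
}
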
@@ -3689,14 +3889,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3689 | } | 3889 | } |
3690 | } | 3890 | } |
3691 | map->sector_size = extent_root->sectorsize; | 3891 | map->sector_size = extent_root->sectorsize; |
3692 | map->stripe_len = BTRFS_STRIPE_LEN; | 3892 | map->stripe_len = raid_stripe_len; |
3693 | map->io_align = BTRFS_STRIPE_LEN; | 3893 | map->io_align = raid_stripe_len; |
3694 | map->io_width = BTRFS_STRIPE_LEN; | 3894 | map->io_width = raid_stripe_len; |
3695 | map->type = type; | 3895 | map->type = type; |
3696 | map->sub_stripes = sub_stripes; | 3896 | map->sub_stripes = sub_stripes; |
3697 | 3897 | ||
3698 | *map_ret = map; | 3898 | *map_ret = map; |
3699 | num_bytes = stripe_size * (num_stripes / ncopies); | 3899 | num_bytes = stripe_size * data_stripes; |
3700 | 3900 | ||
3701 | *stripe_size_out = stripe_size; | 3901 | *stripe_size_out = stripe_size; |
3702 | *num_bytes_out = num_bytes; | 3902 | *num_bytes_out = num_bytes; |
@@ -3718,15 +3918,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	write_unlock(&em_tree->lock);
-	free_extent_map(em);
-	if (ret)
-		goto error;
-
-	ret = btrfs_make_block_group(trans, extent_root, 0, type,
-				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-				     start, num_bytes);
-	if (ret)
+	if (ret) {
+		free_extent_map(em);
 		goto error;
+	}
 
 	for (i = 0; i < map->num_stripes; ++i) {
 		struct btrfs_device *device;
@@ -3739,15 +3934,44 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 				info->chunk_root->root_key.objectid,
 				BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 				start, dev_offset, stripe_size);
-		if (ret) {
-			btrfs_abort_transaction(trans, extent_root, ret);
-			goto error;
-		}
+		if (ret)
+			goto error_dev_extent;
+	}
+
+	ret = btrfs_make_block_group(trans, extent_root, 0, type,
+				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+				     start, num_bytes);
+	if (ret) {
+		i = map->num_stripes - 1;
+		goto error_dev_extent;
 	}
 
+	free_extent_map(em);
+	check_raid56_incompat_flag(extent_root->fs_info, type);
+
 	kfree(devices_info);
 	return 0;
 
+error_dev_extent:
+	for (; i >= 0; i--) {
+		struct btrfs_device *device;
+		int err;
+
+		device = map->stripes[i].dev;
+		err = btrfs_free_dev_extent(trans, device, start);
+		if (err) {
+			btrfs_abort_transaction(trans, extent_root, err);
+			break;
+		}
+	}
+	write_lock(&em_tree->lock);
+	remove_extent_mapping(em_tree, em);
+	write_unlock(&em_tree->lock);
+
+	/* One for our allocation */
+	free_extent_map(em);
+	/* One for the tree reference */
+	free_extent_map(em);
 error:
 	kfree(map);
 	kfree(devices_info);
@@ -3887,10 +4111,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 	if (ret)
 		return ret;
 
-	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
-			fs_info->avail_metadata_alloc_bits;
-	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+	alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
 	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
 				  &stripe_size, chunk_offset, alloc_profile);
 	if (ret)
@@ -3898,10 +4119,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 
 	sys_chunk_offset = chunk_offset + chunk_size;
 
-	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
-			fs_info->avail_system_alloc_bits;
-	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+	alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
 	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
 				  &sys_chunk_size, &sys_stripe_size,
 				  sys_chunk_offset, alloc_profile);
@@ -4014,6 +4232,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		ret = map->num_stripes;
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		ret = map->sub_stripes;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+		ret = 2;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+		ret = 3;
 	else
 		ret = 1;
 	free_extent_map(em);
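
A parity profile keeps only one physical copy of each block; the extra "copies" reported here count the additional ways a block can be materialized (RAID5: the data itself, or a rebuild from P; RAID6: data, rebuild from P, or rebuild from Q). A toy mapping that mirrors the mirror-number convention this patch introduces further down (illustrative only, not kernel code):

#include <stdio.h>

static const char *raid56_mirror_source(int mirror_num)
{
	switch (mirror_num) {
	case 0:
	case 1:
		return "the data block itself";
	case 2:
		return "rebuilt from the P (parity) stripe";
	case 3:
		return "rebuilt from the Q (RAID6 syndrome) stripe";
	default:
		return "no such mirror";
	}
}

int main(void)
{
	int m;

	for (m = 1; m <= 3; m++)
		printf("mirror %d: %s\n", m, raid56_mirror_source(m));
	return 0;
}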
@@ -4026,6 +4248,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	return ret;
 }
 
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+				    struct btrfs_mapping_tree *map_tree,
+				    u64 logical)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	unsigned long len = root->sectorsize;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6)) {
+		len = map->stripe_len * nr_data_stripes(map);
+	}
+	free_extent_map(em);
+	return len;
+}
+
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+			   u64 logical, u64 len, int mirror_num)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	int ret = 0;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6))
+		ret = 1;
+	free_extent_map(em);
+	return ret;
+}
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
 			    struct map_lookup *map, int first, int num,
 			    int optimal, int dev_replace_is_ongoing)
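
btrfs_full_stripe_len() reports how much logical address space one full stripe covers: stripe_len bytes from each data member, with parity excluded. A quick arithmetic check with invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned long stripe_len = 64 * 1024;	/* BTRFS_STRIPE_LEN */
	int num_stripes = 6;			/* devices in the chunk */
	int nr_data = num_stripes - 2;		/* nr_data_stripes() for RAID6 */

	/* 4 data members * 64K = 256K of logical space per full stripe */
	printf("full stripe = %luK\n", (stripe_len * nr_data) >> 10);
	return 0;
}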
@@ -4063,10 +4331,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	return optimal;
 }
 
+static inline int parity_smaller(u64 a, u64 b)
+{
+	return a > b;
+}
+
+/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+{
+	struct btrfs_bio_stripe s;
+	int i;
+	u64 l;
+	int again = 1;
+
+	while (again) {
+		again = 0;
+		for (i = 0; i < bbio->num_stripes - 1; i++) {
+			if (parity_smaller(raid_map[i], raid_map[i+1])) {
+				s = bbio->stripes[i];
+				l = raid_map[i];
+				bbio->stripes[i] = bbio->stripes[i+1];
+				raid_map[i] = raid_map[i+1];
+				bbio->stripes[i+1] = s;
+				raid_map[i+1] = l;
+				again = 1;
+			}
+		}
+	}
+}
+
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			     u64 logical, u64 *length,
 			     struct btrfs_bio **bbio_ret,
-			     int mirror_num)
+			     int mirror_num, u64 **raid_map_ret)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
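
The sort works because the raid_map entries for data stripes hold real logical addresses while the P and Q slots hold sentinels at the very top of the u64 range (RAID5_P_STRIPE and RAID6_Q_STRIPE, defined elsewhere in this series as (u64)-2 and (u64)-1), so "larger sorts later" pushes parity to the end while preserving the logical order of the data stripes. A standalone demo of that property:

/* Userspace demo; the sentinel values mirror the definitions above. */
#include <stdint.h>
#include <stdio.h>

#define P_STRIPE ((uint64_t)-2)
#define Q_STRIPE ((uint64_t)-1)

int main(void)
{
	uint64_t raid_map[4] = { Q_STRIPE, 1048576, P_STRIPE, 1114112 };
	int i, j;

	/* same bubble sort as sort_parity_stripes(), minus the bbio */
	for (i = 0; i < 4; i++)
		for (j = 0; j < 3; j++)
			if (raid_map[j] > raid_map[j + 1]) {
				uint64_t t = raid_map[j];

				raid_map[j] = raid_map[j + 1];
				raid_map[j + 1] = t;
			}
	for (i = 0; i < 4; i++)
		printf("%llu\n", (unsigned long long)raid_map[i]);
	/* prints the two data addresses, then P, then Q */
	return 0;
}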
@@ -4078,6 +4375,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	u64 stripe_nr;
 	u64 stripe_nr_orig;
 	u64 stripe_nr_end;
+	u64 stripe_len;
+	u64 *raid_map = NULL;
 	int stripe_index;
 	int i;
 	int ret = 0;
@@ -4089,6 +4388,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	int num_alloc_stripes;
 	int patch_the_first_stripe_for_dev_replace = 0;
 	u64 physical_to_patch_in_first_stripe = 0;
+	u64 raid56_full_stripe_start = (u64)-1;
 
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
@@ -4105,29 +4405,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
 
+	if (mirror_num > map->num_stripes)
+		mirror_num = 0;
+
+	stripe_len = map->stripe_len;
 	stripe_nr = offset;
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
 	 * to get to this block
 	 */
-	do_div(stripe_nr, map->stripe_len);
+	do_div(stripe_nr, stripe_len);
 
-	stripe_offset = stripe_nr * map->stripe_len;
+	stripe_offset = stripe_nr * stripe_len;
 	BUG_ON(offset < stripe_offset);
 
 	/* stripe_offset is the offset of this block in its stripe*/
 	stripe_offset = offset - stripe_offset;
 
-	if (rw & REQ_DISCARD)
+	/* if we're here for raid56, we need to know the stripe aligned start */
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+		raid56_full_stripe_start = offset;
+
+		/* allow a write of a full stripe, but make sure we don't
+		 * allow straddling of stripes
+		 */
+		do_div(raid56_full_stripe_start, full_stripe_len);
+		raid56_full_stripe_start *= full_stripe_len;
+	}
+
+	if (rw & REQ_DISCARD) {
+		/* we don't discard raid56 yet */
+		if (map->type &
+		    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
 		*length = min_t(u64, em->len - offset, *length);
-	else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
-		/* we limit the length of each bio to what fits in a stripe */
-		*length = min_t(u64, em->len - offset,
-				map->stripe_len - stripe_offset);
+	} else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+		u64 max_len;
+		/* For writes to RAID[56], allow a full stripeset across all disks.
+		   For other RAID types and for RAID[56] reads, just allow a single
+		   stripe (on a single disk). */
+		if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+		    (rw & REQ_WRITE)) {
+			max_len = stripe_len * nr_data_stripes(map) -
+				(offset - raid56_full_stripe_start);
+		} else {
+			/* we limit the length of each bio to what fits in a stripe */
+			max_len = stripe_len - stripe_offset;
+		}
+		*length = min_t(u64, em->len - offset, max_len);
 	} else {
 		*length = em->len - offset;
 	}
 
+	/* This is for when we're called from btrfs_merge_bio_hook() and all
+	   it cares about is the length */
 	if (!bbio_ret)
 		goto out;
 
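
Everything in the RAID[56] path is computed relative to raid56_full_stripe_start. Since do_div() truncates, dividing by the full stripe length and multiplying back rounds the offset down to a full-stripe boundary; a write may then extend to the end of that full stripe but no further. A userspace sketch with invented numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t stripe_len = 64 * 1024;
	uint64_t nr_data = 3;			/* RAID5 on 4 devices */
	uint64_t full_stripe_len = stripe_len * nr_data;	/* 192K */
	uint64_t offset = 300 * 1024;		/* some offset in the chunk */
	uint64_t full_stripe_start =
		(offset / full_stripe_len) * full_stripe_len;
	/* a write may run to the end of this full stripe, no further */
	uint64_t max_len = full_stripe_len - (offset - full_stripe_start);

	printf("start=%lluK max_len=%lluK\n",
	       (unsigned long long)(full_stripe_start >> 10),
	       (unsigned long long)(max_len >> 10));	/* start=192K max_len=84K */
	return 0;
}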
@@ -4160,7 +4494,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		u64 physical_of_found = 0;
 
 		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
-				logical, &tmp_length, &tmp_bbio, 0);
+				logical, &tmp_length, &tmp_bbio, 0, NULL);
 		if (ret) {
 			WARN_ON(tmp_bbio != NULL);
 			goto out;
@@ -4221,11 +4555,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	num_stripes = 1;
 	stripe_index = 0;
 	stripe_nr_orig = stripe_nr;
-	stripe_nr_end = (offset + *length + map->stripe_len - 1) &
-			(~(map->stripe_len - 1));
+	stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
 	do_div(stripe_nr_end, map->stripe_len);
 	stripe_end_offset = stripe_nr_end * map->stripe_len -
 			    (offset + *length);
+
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 		if (rw & REQ_DISCARD)
 			num_stripes = min_t(u64, map->num_stripes,
@@ -4276,6 +4610,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 					    dev_replace_is_ongoing);
 			mirror_num = stripe_index - old_stripe_index + 1;
 		}
+
+	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6)) {
+		u64 tmp;
+
+		if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
+		    && raid_map_ret) {
+			int i, rot;
+
+			/* push stripe_nr back to the start of the full stripe */
+			stripe_nr = raid56_full_stripe_start;
+			do_div(stripe_nr, stripe_len);
+
+			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+
+			/* RAID[56] write or recovery. Return all stripes */
+			num_stripes = map->num_stripes;
+			max_errors = nr_parity_stripes(map);
+
+			raid_map = kmalloc(sizeof(u64) * num_stripes,
+					   GFP_NOFS);
+			if (!raid_map) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			/* Work out the disk rotation on this stripe-set */
+			tmp = stripe_nr;
+			rot = do_div(tmp, num_stripes);
+
+			/* Fill in the logical address of each stripe */
+			tmp = stripe_nr * nr_data_stripes(map);
+			for (i = 0; i < nr_data_stripes(map); i++)
+				raid_map[(i+rot) % num_stripes] =
+					em->start + (tmp + i) * map->stripe_len;
+
+			raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+			if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+				raid_map[(i+rot+1) % num_stripes] =
+					RAID6_Q_STRIPE;
+
+			*length = map->stripe_len;
+			stripe_index = 0;
+			stripe_offset = 0;
+		} else {
+			/*
+			 * Mirror #0 or #1 means the original data block.
+			 * Mirror #2 is RAID5 parity block.
+			 * Mirror #3 is RAID6 Q block.
+			 */
+			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+			if (mirror_num > 1)
+				stripe_index = nr_data_stripes(map) +
+					       mirror_num - 2;
+
+			/* We distribute the parity blocks across stripes */
+			tmp = stripe_nr + stripe_index;
+			stripe_index = do_div(tmp, map->num_stripes);
+		}
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
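
A worked example of the rotation above, for RAID5 over four devices (three data stripes plus P): rot cycles with the full-stripe number, data stripe i lands in slot (i + rot) % num_stripes, and P takes the slot after the last data stripe. Illustrative only; it reproduces the raid_map fill on paper rather than in the kernel:

#include <stdio.h>

int main(void)
{
	int num_stripes = 4, nr_data = 3;
	int n, i, rot;

	for (n = 0; n < 4; n++) {	/* full-stripe number */
		char slot[4];

		rot = n % num_stripes;
		for (i = 0; i < nr_data; i++)
			slot[(i + rot) % num_stripes] = '0' + i;
		slot[(nr_data + rot) % num_stripes] = 'P';
		printf("full stripe %d: %c %c %c %c\n",
		       n, slot[0], slot[1], slot[2], slot[3]);
	}
	/* prints:
	 * full stripe 0: 0 1 2 P
	 * full stripe 1: P 0 1 2
	 * full stripe 2: 2 P 0 1
	 * full stripe 3: 1 2 P 0
	 */
	return 0;
}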
@@ -4384,8 +4777,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 				 BTRFS_BLOCK_GROUP_RAID10 |
+				 BTRFS_BLOCK_GROUP_RAID5 |
 				 BTRFS_BLOCK_GROUP_DUP)) {
 			max_errors = 1;
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+			max_errors = 2;
 		}
 	}
 
@@ -4486,6 +4882,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
 		bbio->mirror_num = map->num_stripes + 1;
 	}
+	if (raid_map) {
+		sort_parity_stripes(bbio, raid_map);
+		*raid_map_ret = raid_map;
+	}
 out:
 	if (dev_replace_is_ongoing)
 		btrfs_dev_replace_unlock(dev_replace);
@@ -4498,7 +4898,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		      struct btrfs_bio **bbio_ret, int mirror_num)
 {
 	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
-				 mirror_num);
+				 mirror_num, NULL);
 }
 
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -4512,6 +4912,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	u64 bytenr;
 	u64 length;
 	u64 stripe_nr;
+	u64 rmap_len;
 	int i, j, nr = 0;
 
 	read_lock(&em_tree->lock);
@@ -4522,10 +4923,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	map = (struct map_lookup *)em->bdev;
 
 	length = em->len;
+	rmap_len = map->stripe_len;
+
 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		do_div(length, map->num_stripes / map->sub_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
 		do_div(length, map->num_stripes);
+	else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			      BTRFS_BLOCK_GROUP_RAID6)) {
+		do_div(length, nr_data_stripes(map));
+		rmap_len = map->stripe_len * nr_data_stripes(map);
+	}
 
 	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
 	BUG_ON(!buf); /* -ENOMEM */
@@ -4545,8 +4953,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 			do_div(stripe_nr, map->sub_stripes);
 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 			stripe_nr = stripe_nr * map->num_stripes + i;
-		}
-		bytenr = chunk_start + stripe_nr * map->stripe_len;
+		} /* else if RAID[56], multiply by nr_data_stripes().
+		   * Alternatively, just use rmap_len below instead of
+		   * map->stripe_len */
+
+		bytenr = chunk_start + stripe_nr * rmap_len;
 		WARN_ON(nr >= map->num_stripes);
 		for (j = 0; j < nr; j++) {
 			if (buf[j] == bytenr)
@@ -4560,7 +4971,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 
 	*logical = buf;
 	*naddrs = nr;
-	*stripe_len = map->stripe_len;
+	*stripe_len = rmap_len;
 
 	free_extent_map(em);
 	return 0;
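
For the parity profiles the reverse map is coarser: consecutive stripes on one device sit a full data stripe apart in logical space, which is why bytenr now steps by rmap_len instead of map->stripe_len. A sketch with invented numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t chunk_start = 0;
	uint64_t stripe_len = 64 * 1024;
	uint64_t nr_data = 3;			/* RAID5 on 4 devices */
	uint64_t rmap_len = stripe_len * nr_data;	/* 192K */
	uint64_t stripe_nr;

	for (stripe_nr = 0; stripe_nr < 3; stripe_nr++)
		printf("stripe %llu -> logical %lluK\n",
		       (unsigned long long)stripe_nr,
		       (unsigned long long)
		       ((chunk_start + stripe_nr * rmap_len) >> 10));
	/* prints 0K, 192K, 384K */
	return 0;
}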
@@ -4634,7 +5045,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
 			bio->bi_bdev = (struct block_device *)
 					(unsigned long)bbio->mirror_num;
 			/* only send an error to the higher layers if it is
-			 * beyond the tolerance of the multi-bio
+			 * beyond the tolerance of the btrfs bio
 			 */
 			if (atomic_read(&bbio->error) > bbio->max_errors) {
 				err = -EIO;
@@ -4668,13 +5079,18 @@ struct async_sched {
  * This will add one bio to the pending list for a device and make sure
  * the work struct is scheduled.
  */
-static noinline void schedule_bio(struct btrfs_root *root,
-				  struct btrfs_device *device,
-				  int rw, struct bio *bio)
+noinline void btrfs_schedule_bio(struct btrfs_root *root,
+				 struct btrfs_device *device,
+				 int rw, struct bio *bio)
 {
 	int should_queue = 1;
 	struct btrfs_pending_bios *pending_bios;
 
+	if (device->missing || !device->bdev) {
+		bio_endio(bio, -EIO);
+		return;
+	}
+
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & REQ_WRITE)) {
 		bio_get(bio);
@@ -4772,7 +5188,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
 #endif
 	bio->bi_bdev = dev->bdev;
 	if (async)
-		schedule_bio(root, dev, rw, bio);
+		btrfs_schedule_bio(root, dev, rw, bio);
 	else
 		btrfsic_submit_bio(rw, bio);
 }
@@ -4831,6 +5247,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
+	u64 *raid_map = NULL;
 	int ret;
 	int dev_nr = 0;
 	int total_devs = 1;
@@ -4839,12 +5256,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	length = bio->bi_size;
 	map_length = length;
 
-	ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
-			      mirror_num);
-	if (ret)
+	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
+			      mirror_num, &raid_map);
+	if (ret) /* -ENOMEM */
 		return ret;
 
 	total_devs = bbio->num_stripes;
+	bbio->orig_bio = first_bio;
+	bbio->private = first_bio->bi_private;
+	bbio->end_io = first_bio->bi_end_io;
+	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+
+	if (raid_map) {
+		/* In this case, map_length has been set to the length of
+		   a single stripe; not the whole write */
+		if (rw & WRITE) {
+			return raid56_parity_write(root, bio, bbio,
+						   raid_map, map_length);
+		} else {
+			return raid56_parity_recover(root, bio, bbio,
+						     raid_map, map_length,
+						     mirror_num);
+		}
+	}
+
 	if (map_length < length) {
 		printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
 		       "len %llu\n", (unsigned long long)logical,
@@ -4853,11 +5288,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		BUG();
 	}
 
-	bbio->orig_bio = first_bio;
-	bbio->private = first_bio->bi_private;
-	bbio->end_io = first_bio->bi_end_io;
-	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
-
 	while (dev_nr < total_devs) {
 		dev = bbio->stripes[dev_nr].dev;
 		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d3c3939ac751..062d8604d35b 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -21,8 +21,8 @@
 
 #include <linux/bio.h>
 #include <linux/sort.h>
+#include <linux/btrfs.h>
 #include "async-thread.h"
-#include "ioctl.h"
 
 #define BTRFS_STRIPE_LEN (64 * 1024)
 
@@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
 					      struct btrfs_device *tgtdev);
 int btrfs_scratch_superblock(struct btrfs_device *device);
-
+void btrfs_schedule_bio(struct btrfs_root *root,
+			struct btrfs_device *device,
+			int rw, struct bio *bio);
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+			   u64 logical, u64 len, int mirror_num);
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+				    struct btrfs_mapping_tree *map_tree,
+				    u64 logical);
 static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
 				      int index)
 {